1 /*
   2  * Copyright (c) 2003, 2018, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "gc/shared/barrierSet.hpp"
  30 #include "gc/shared/barrierSetAssembler.hpp"
  31 #include "interpreter/interpreter.hpp"
  32 #include "nativeInst_aarch64.hpp"
  33 #include "oops/instanceOop.hpp"
  34 #include "oops/method.hpp"
  35 #include "oops/objArrayKlass.hpp"
  36 #include "oops/oop.inline.hpp"
  37 #include "prims/methodHandles.hpp"
  38 #include "runtime/frame.inline.hpp"
  39 #include "runtime/handles.inline.hpp"
  40 #include "runtime/sharedRuntime.hpp"
  41 #include "runtime/stubCodeGenerator.hpp"
  42 #include "runtime/stubRoutines.hpp"
  43 #include "runtime/thread.inline.hpp"
  44 #include "utilities/align.hpp"
  45 #ifdef COMPILER2
  46 #include "opto/runtime.hpp"
  47 #endif
  48 
  49 #ifdef BUILTIN_SIM
  50 #include "../../../../../../simulator/simulator.hpp"
  51 #endif
  52 
  53 // Declaration and definition of StubGenerator (no .hpp file).
  54 // For a more detailed description of the stub routine structure
  55 // see the comment in stubRoutines.hpp
  56 
  57 #undef __
  58 #define __ _masm->
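// TIMES_OOP scales a sign-extended 32-bit array index by the in-heap oop
// size: shift by 2 (4-byte narrow oops) when UseCompressedOops is set,
// otherwise by 3 (8-byte oops).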
  59 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
  60 
  61 #ifdef PRODUCT
  62 #define BLOCK_COMMENT(str) /* nothing */
  63 #else
  64 #define BLOCK_COMMENT(str) __ block_comment(str)
  65 #endif
  66 
  67 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  68 
  69 // Stub Code definitions
  70 
  71 class StubGenerator: public StubCodeGenerator {
  72  private:
  73 
  74 #ifdef PRODUCT
  75 #define inc_counter_np(counter) ((void)0)
  76 #else
  77   void inc_counter_np_(int& counter) {
  78     __ lea(rscratch2, ExternalAddress((address)&counter));
  79     __ ldrw(rscratch1, Address(rscratch2));
  80     __ addw(rscratch1, rscratch1, 1);
  81     __ strw(rscratch1, Address(rscratch2));
  82   }
  83 #define inc_counter_np(counter) \
  84   BLOCK_COMMENT("inc_counter " #counter); \
  85   inc_counter_np_(counter);
  86 #endif
  87 
  88   // Call stubs are used to call Java from C
  89   //
  90   // Arguments:
  91   //    c_rarg0:   call wrapper address                   address
  92   //    c_rarg1:   result                                 address
  93   //    c_rarg2:   result type                            BasicType
  94   //    c_rarg3:   method                                 Method*
  95   //    c_rarg4:   (interpreter) entry point              address
  96   //    c_rarg5:   parameters                             intptr_t*
  97   //    c_rarg6:   parameter size (in words)              int
  98   //    c_rarg7:   thread                                 Thread*
  99   //
 100   // There is no return from the stub itself as any Java result
 101   // is written to result
 102   //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer, then install sp (r31)
  // into fp.
 106   //
 107   // we save r0-r7, which accounts for all the c arguments.
 108   //
 109   // TODO: strictly do we need to save them all? they are treated as
 110   // volatile by C so could we omit saving the ones we are going to
 111   // place in global registers (thread? method?) or those we only use
 112   // during setup of the Java call?
 113   //
 114   // we don't need to save r8 which C uses as an indirect result location
 115   // return register.
 116   //
 117   // we don't need to save r9-r15 which both C and Java treat as
 118   // volatile
 119   //
 120   // we don't need to save r16-18 because Java does not use them
 121   //
 122   // we save r19-r28 which Java uses as scratch registers and C
 123   // expects to be callee-save
 124   //
 125   // we save the bottom 64 bits of each value stored in v8-v15; it is
 126   // the responsibility of the caller to preserve larger values.
 127   //
 128   // so the stub frame looks like this when we enter Java code
 129   //
 130   //     [ return_from_Java     ] <--- sp
 131   //     [ argument word n      ]
 132   //      ...
 133   // -27 [ argument word 1      ]
 134   // -26 [ saved v15            ] <--- sp_after_call
 135   // -25 [ saved v14            ]
 136   // -24 [ saved v13            ]
 137   // -23 [ saved v12            ]
 138   // -22 [ saved v11            ]
 139   // -21 [ saved v10            ]
 140   // -20 [ saved v9             ]
 141   // -19 [ saved v8             ]
 142   // -18 [ saved r28            ]
 143   // -17 [ saved r27            ]
 144   // -16 [ saved r26            ]
 145   // -15 [ saved r25            ]
 146   // -14 [ saved r24            ]
 147   // -13 [ saved r23            ]
 148   // -12 [ saved r22            ]
 149   // -11 [ saved r21            ]
 150   // -10 [ saved r20            ]
 151   //  -9 [ saved r19            ]
 152   //  -8 [ call wrapper    (r0) ]
 153   //  -7 [ result          (r1) ]
 154   //  -6 [ result type     (r2) ]
 155   //  -5 [ method          (r3) ]
 156   //  -4 [ entry point     (r4) ]
 157   //  -3 [ parameters      (r5) ]
 158   //  -2 [ parameter size  (r6) ]
 159   //  -1 [ thread (r7)          ]
 160   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 161   //   1 [ saved lr       (r30) ]
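  //
  // For reference, the C++ side reaches this stub through the CallStub
  // function pointer type declared in stubRoutines.hpp, passing the
  // eight arguments listed above roughly as JavaCalls::call_helper
  // does (a sketch, not code from this file):
  //
  //   StubRoutines::call_stub()(
  //     (address)&link,             // call wrapper     -> c_rarg0
  //     result_val_address,         // result           -> c_rarg1
  //     result_type,                // result BasicType -> c_rarg2
  //     method(),                   // Method*          -> c_rarg3
  //     entry_point,                // entry point      -> c_rarg4
  //     args->parameters(),         // parameters       -> c_rarg5
  //     args->size_of_parameters(), // parameter size   -> c_rarg6
  //     CHECK);                     // thread (via TRAPS) -> c_rarg7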
 162 
 163   // Call stub stack layout word offsets from fp
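  //
  // Only the lower word offset of each saved pair is named below: the
  // registers are saved two at a time with stp, so the partner of each
  // named register (e.g. r19 for r20_off, v14 for d15_off) sits one
  // word above the named slot.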
 164   enum call_stub_layout {
 165     sp_after_call_off = -26,
 166 
 167     d15_off            = -26,
 168     d13_off            = -24,
 169     d11_off            = -22,
 170     d9_off             = -20,
 171 
 172     r28_off            = -18,
 173     r26_off            = -16,
 174     r24_off            = -14,
 175     r22_off            = -12,
 176     r20_off            = -10,
 177     call_wrapper_off   =  -8,
 178     result_off         =  -7,
 179     result_type_off    =  -6,
 180     method_off         =  -5,
 181     entry_point_off    =  -4,
 182     parameter_size_off =  -2,
 183     thread_off         =  -1,
 184     fp_f               =   0,
 185     retaddr_off        =   1,
 186   };
 187 
 188   address generate_call_stub(address& return_address) {
 189     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 190            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 191            "adjust this code");
 192 
 193     StubCodeMark mark(this, "StubRoutines", "call_stub");
 194     address start = __ pc();
 195 
 196     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 197 
 198     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 199     const Address result        (rfp, result_off         * wordSize);
 200     const Address result_type   (rfp, result_type_off    * wordSize);
 201     const Address method        (rfp, method_off         * wordSize);
 202     const Address entry_point   (rfp, entry_point_off    * wordSize);
 203     const Address parameter_size(rfp, parameter_size_off * wordSize);
 204 
 205     const Address thread        (rfp, thread_off         * wordSize);
 206 
 207     const Address d15_save      (rfp, d15_off * wordSize);
 208     const Address d13_save      (rfp, d13_off * wordSize);
 209     const Address d11_save      (rfp, d11_off * wordSize);
 210     const Address d9_save       (rfp, d9_off * wordSize);
 211 
 212     const Address r28_save      (rfp, r28_off * wordSize);
 213     const Address r26_save      (rfp, r26_off * wordSize);
 214     const Address r24_save      (rfp, r24_off * wordSize);
 215     const Address r22_save      (rfp, r22_off * wordSize);
 216     const Address r20_save      (rfp, r20_off * wordSize);
 217 
 218     // stub code
 219 
 220     // we need a C prolog to bootstrap the x86 caller into the sim
 221     __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);
 222 
 223     address aarch64_entry = __ pc();
 224 
 225 #ifdef BUILTIN_SIM
 226     // Save sender's SP for stack traces.
 227     __ mov(rscratch1, sp);
 228     __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
 229 #endif
 230     // set up frame and move sp to end of save area
 231     __ enter();
 232     __ sub(sp, rfp, -sp_after_call_off * wordSize);
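    // sp now points at the v15 save slot (sp_after_call), so the whole
    // save area between sp and fp is addressable at the negative
    // offsets defined above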
 233 
 234     // save register parameters and Java scratch/global registers
 235     // n.b. we save thread even though it gets installed in
 236     // rthread because we want to sanity check rthread later
 237     __ str(c_rarg7,  thread);
 238     __ strw(c_rarg6, parameter_size);
 239     __ stp(c_rarg4, c_rarg5,  entry_point);
 240     __ stp(c_rarg2, c_rarg3,  result_type);
 241     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 242 
 243     __ stp(r20, r19,   r20_save);
 244     __ stp(r22, r21,   r22_save);
 245     __ stp(r24, r23,   r24_save);
 246     __ stp(r26, r25,   r26_save);
 247     __ stp(r28, r27,   r28_save);
 248 
 249     __ stpd(v9,  v8,   d9_save);
 250     __ stpd(v11, v10,  d11_save);
 251     __ stpd(v13, v12,  d13_save);
 252     __ stpd(v15, v14,  d15_save);
 253 
 254     // install Java thread in global register now we have saved
 255     // whatever value it held
 256     __ mov(rthread, c_rarg7);
 257     // And method
 258     __ mov(rmethod, c_rarg3);
 259 
 260     // set up the heapbase register
 261     __ reinit_heapbase();
 262 
 263 #ifdef ASSERT
 264     // make sure we have no pending exceptions
 265     {
 266       Label L;
 267       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 268       __ cmp(rscratch1, (unsigned)NULL_WORD);
 269       __ br(Assembler::EQ, L);
 270       __ stop("StubRoutines::call_stub: entered with pending exception");
 271       __ BIND(L);
 272     }
 273 #endif
 274     // pass parameters if any
 275     __ mov(esp, sp);
 276     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 277     __ andr(sp, rscratch1, -2 * wordSize);
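    // round sp down to a 16-byte boundary after making room for the
    // incoming Java arguments; AArch64 requires sp to stay 16-byte
    // aligned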
 278 
 279     BLOCK_COMMENT("pass parameters if any");
 280     Label parameters_done;
 281     // parameter count is still in c_rarg6
 282     // and parameter pointer identifying param 1 is in c_rarg5
 283     __ cbzw(c_rarg6, parameters_done);
 284 
 285     address loop = __ pc();
 286     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 287     __ subsw(c_rarg6, c_rarg6, 1);
 288     __ push(rscratch1);
 289     __ br(Assembler::GT, loop);
 290 
 291     __ BIND(parameters_done);
 292 
    // call Java entry -- passing Method* and current sp
 294     //      rmethod: Method*
 295     //      r13: sender sp
 296     BLOCK_COMMENT("call Java function");
 297     __ mov(r13, sp);
 298     __ blr(c_rarg4);
 299 
 300     // tell the simulator we have returned to the stub
 301 
 302     // we do this here because the notify will already have been done
 303     // if we get to the next instruction via an exception
 304     //
 305     // n.b. adding this instruction here affects the calculation of
 306     // whether or not a routine returns to the call stub (used when
 307     // doing stack walks) since the normal test is to check the return
 308     // pc against the address saved below. so we may need to allow for
 309     // this extra instruction in the check.
 310 
 311     if (NotifySimulator) {
 312       __ notify(Assembler::method_reentry);
 313     }
 314     // save current address for use by exception handling code
 315 
 316     return_address = __ pc();
 317 
 318     // store result depending on type (everything that is not
 319     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 320     // n.b. this assumes Java returns an integral result in r0
 321     // and a floating result in j_farg0
 322     __ ldr(j_rarg2, result);
 323     Label is_long, is_float, is_double, exit;
 324     __ ldr(j_rarg1, result_type);
 325     __ cmp(j_rarg1, T_OBJECT);
 326     __ br(Assembler::EQ, is_long);
 327     __ cmp(j_rarg1, T_LONG);
 328     __ br(Assembler::EQ, is_long);
 329     __ cmp(j_rarg1, T_FLOAT);
 330     __ br(Assembler::EQ, is_float);
 331     __ cmp(j_rarg1, T_DOUBLE);
 332     __ br(Assembler::EQ, is_double);
 333 
 334     // handle T_INT case
 335     __ strw(r0, Address(j_rarg2));
 336 
 337     __ BIND(exit);
 338 
 339     // pop parameters
 340     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 341 
 342 #ifdef ASSERT
 343     // verify that threads correspond
 344     {
 345       Label L, S;
 346       __ ldr(rscratch1, thread);
 347       __ cmp(rthread, rscratch1);
 348       __ br(Assembler::NE, S);
 349       __ get_thread(rscratch1);
 350       __ cmp(rthread, rscratch1);
 351       __ br(Assembler::EQ, L);
 352       __ BIND(S);
 353       __ stop("StubRoutines::call_stub: threads must correspond");
 354       __ BIND(L);
 355     }
 356 #endif
 357 
 358     // restore callee-save registers
 359     __ ldpd(v15, v14,  d15_save);
 360     __ ldpd(v13, v12,  d13_save);
 361     __ ldpd(v11, v10,  d11_save);
 362     __ ldpd(v9,  v8,   d9_save);
 363 
 364     __ ldp(r28, r27,   r28_save);
 365     __ ldp(r26, r25,   r26_save);
 366     __ ldp(r24, r23,   r24_save);
 367     __ ldp(r22, r21,   r22_save);
 368     __ ldp(r20, r19,   r20_save);
 369 
 370     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 371     __ ldrw(c_rarg2, result_type);
 372     __ ldr(c_rarg3,  method);
 373     __ ldp(c_rarg4, c_rarg5,  entry_point);
 374     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 375 
 376 #ifndef PRODUCT
 377     // tell the simulator we are about to end Java execution
 378     if (NotifySimulator) {
 379       __ notify(Assembler::method_exit);
 380     }
 381 #endif
 382     // leave frame and return to caller
 383     __ leave();
 384     __ ret(lr);
 385 
 386     // handle return types different from T_INT
 387 
 388     __ BIND(is_long);
 389     __ str(r0, Address(j_rarg2, 0));
 390     __ br(Assembler::AL, exit);
 391 
 392     __ BIND(is_float);
 393     __ strs(j_farg0, Address(j_rarg2, 0));
 394     __ br(Assembler::AL, exit);
 395 
 396     __ BIND(is_double);
 397     __ strd(j_farg0, Address(j_rarg2, 0));
 398     __ br(Assembler::AL, exit);
 399 
 400     return start;
 401   }
 402 
 403   // Return point for a Java call if there's an exception thrown in
 404   // Java code.  The exception is caught and transformed into a
 405   // pending exception stored in JavaThread that can be tested from
 406   // within the VM.
 407   //
 408   // Note: Usually the parameters are removed by the callee. In case
 409   // of an exception crossing an activation frame boundary, that is
 410   // not the case if the callee is compiled code => need to setup the
 411   // rsp.
 412   //
 413   // r0: exception oop
 414 
 415   // NOTE: this is used as a target from the signal handler so it
 416   // needs an x86 prolog which returns into the current simulator
 417   // executing the generated catch_exception code. so the prolog
 418   // needs to install rax in a sim register and adjust the sim's
 419   // restart pc to enter the generated code at the start position
 420   // then return from native to simulated execution.
 421 
 422   address generate_catch_exception() {
 423     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 424     address start = __ pc();
 425 
 426     // same as in generate_call_stub():
 427     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 428     const Address thread        (rfp, thread_off         * wordSize);
 429 
 430 #ifdef ASSERT
 431     // verify that threads correspond
 432     {
 433       Label L, S;
 434       __ ldr(rscratch1, thread);
 435       __ cmp(rthread, rscratch1);
 436       __ br(Assembler::NE, S);
 437       __ get_thread(rscratch1);
 438       __ cmp(rthread, rscratch1);
 439       __ br(Assembler::EQ, L);
 440       __ bind(S);
 441       __ stop("StubRoutines::catch_exception: threads must correspond");
 442       __ bind(L);
 443     }
 444 #endif
 445 
 446     // set pending exception
 447     __ verify_oop(r0);
 448 
 449     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 450     __ mov(rscratch1, (address)__FILE__);
 451     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 452     __ movw(rscratch1, (int)__LINE__);
 453     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 454 
 455     // complete return to VM
 456     assert(StubRoutines::_call_stub_return_address != NULL,
 457            "_call_stub_return_address must have been generated before");
 458     __ b(StubRoutines::_call_stub_return_address);
 459 
 460     return start;
 461   }
 462 
 463   // Continuation point for runtime calls returning with a pending
 464   // exception.  The pending exception check happened in the runtime
 465   // or native call stub.  The pending exception in Thread is
 466   // converted into a Java-level exception.
 467   //
 468   // Contract with Java-level exception handlers:
 469   // r0: exception
 470   // r3: throwing pc
 471   //
 472   // NOTE: At entry of this stub, exception-pc must be in LR !!
 473 
 474   // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog
 476 
 477   address generate_forward_exception() {
 478     StubCodeMark mark(this, "StubRoutines", "forward exception");
 479     address start = __ pc();
 480 
 481     // Upon entry, LR points to the return address returning into
 482     // Java (interpreted or compiled) code; i.e., the return address
 483     // becomes the throwing pc.
 484     //
 485     // Arguments pushed before the runtime call are still on the stack
 486     // but the exception handler will reset the stack pointer ->
 487     // ignore them.  A potential result in registers can be ignored as
 488     // well.
 489 
 490 #ifdef ASSERT
 491     // make sure this code is only executed if there is a pending exception
 492     {
 493       Label L;
 494       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 495       __ cbnz(rscratch1, L);
 496       __ stop("StubRoutines::forward exception: no pending exception (1)");
 497       __ bind(L);
 498     }
 499 #endif
 500 
 501     // compute exception handler into r19
 502 
 503     // call the VM to find the handler address associated with the
 504     // caller address. pass thread in r0 and caller pc (ret address)
 505     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 506     // the stack.
 507     __ mov(c_rarg1, lr);
 508     // lr will be trashed by the VM call so we move it to R19
 509     // (callee-saved) because we also need to pass it to the handler
 510     // returned by this call.
 511     __ mov(r19, lr);
 512     BLOCK_COMMENT("call exception_handler_for_return_address");
 513     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 514                          SharedRuntime::exception_handler_for_return_address),
 515                     rthread, c_rarg1);
 516     // we should not really care that lr is no longer the callee
 517     // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM, and the VM code asserts that
 520     // the PC for the frame above the handler belongs to a compiled
 521     // Java method. So, we restore lr here to satisfy that assert.
 522     __ mov(lr, r19);
 523     // setup r0 & r3 & clear pending exception
 524     __ mov(r3, r19);
 525     __ mov(r19, r0);
 526     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 527     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 528 
 529 #ifdef ASSERT
 530     // make sure exception is set
 531     {
 532       Label L;
 533       __ cbnz(r0, L);
 534       __ stop("StubRoutines::forward exception: no pending exception (2)");
 535       __ bind(L);
 536     }
 537 #endif
 538 
 539     // continue at exception handler
 540     // r0: exception
 541     // r3: throwing pc
 542     // r19: exception handler
 543     __ verify_oop(r0);
 544     __ br(r19);
 545 
 546     return start;
 547   }
 548 
 549   // Non-destructive plausibility checks for oops
 550   //
 551   // Arguments:
 552   //    r0: oop to verify
 553   //    rscratch1: error message
 554   //
 555   // Stack after saving c_rarg3:
 556   //    [tos + 0]: saved c_rarg3
 557   //    [tos + 1]: saved c_rarg2
 558   //    [tos + 2]: saved lr
 559   //    [tos + 3]: saved rscratch2
 560   //    [tos + 4]: saved r0
 561   //    [tos + 5]: saved rscratch1
 562   address generate_verify_oop() {
 563 
 564     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 565     address start = __ pc();
 566 
 567     Label exit, error;
 568 
 569     // save c_rarg2 and c_rarg3
 570     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 571 
 572     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 573     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 574     __ ldr(c_rarg3, Address(c_rarg2));
 575     __ add(c_rarg3, c_rarg3, 1);
 576     __ str(c_rarg3, Address(c_rarg2));
 577 
 578     // object is in r0
 579     // make sure object is 'reasonable'
 580     __ cbz(r0, exit); // if obj is NULL it is OK
 581 
 582     // Check if the oop is in the right area of memory
 583     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 584     __ andr(c_rarg2, r0, c_rarg3);
 585     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 586 
 587     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 588     // instruction here because the flags register is live.
 589     __ eor(c_rarg2, c_rarg2, c_rarg3);
 590     __ cbnz(c_rarg2, error);
 591 
    // make sure klass is 'reasonable', i.e. not null.
 593     __ load_klass(r0, r0);  // get klass
 594     __ cbz(r0, error);      // if klass is NULL it is broken
 595 
 596     // return if everything seems ok
 597     __ bind(exit);
 598 
 599     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 600     __ ret(lr);
 601 
 602     // handle errors
 603     __ bind(error);
 604     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 605 
 606     __ push(RegSet::range(r0, r29), sp);
 607     // debug(char* msg, int64_t pc, int64_t regs[])
 608     __ mov(c_rarg0, rscratch1);      // pass address of error message
 609     __ mov(c_rarg1, lr);             // pass return address
 610     __ mov(c_rarg2, sp);             // pass address of regs on stack
 611 #ifndef PRODUCT
 612     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 613 #endif
 614     BLOCK_COMMENT("call MacroAssembler::debug");
 615     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 616     __ blrt(rscratch1, 3, 0, 1);
 617 
 618     return start;
 619   }
 620 
 621   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 622 
 623   // The inner part of zero_words().  This is the bulk operation,
 624   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 625   // caller is responsible for zeroing the last few words.
 626   //
 627   // Inputs:
 628   // r10: the HeapWord-aligned base address of an array to zero.
 629   // r11: the count in HeapWords, r11 > 0.
 630   //
 631   // Returns r10 and r11, adjusted for the caller to clear.
 632   // r10: the base address of the tail of words left to clear.
 633   // r11: the number of words in the tail.
 634   //      r11 < MacroAssembler::zero_words_block_size.
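  //
  // The caller is normally MacroAssembler::zero_words, which handles
  // short fills itself and zeroes the tail words that this stub leaves
  // behind.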
 635 
 636   address generate_zero_blocks() {
 637     Label store_pair, loop_store_pair, done;
 638     Label base_aligned;
 639 
 640     Register base = r10, cnt = r11;
 641 
 642     __ align(CodeEntryAlignment);
 643     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 644     address start = __ pc();
 645 
 646     if (UseBlockZeroing) {
 647       int zva_length = VM_Version::zva_length();
 648 
 649       // Ensure ZVA length can be divided by 16. This is required by
 650       // the subsequent operations.
 651       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 652 
 653       __ tbz(base, 3, base_aligned);
 654       __ str(zr, Address(__ post(base, 8)));
 655       __ sub(cnt, cnt, 1);
 656       __ bind(base_aligned);
 657 
 658       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 659       // alignment.
 660       Label small;
 661       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
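      // low_limit is in bytes while cnt is in words, hence the shift
      // right by 3 in the comparison below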
 662       __ subs(rscratch1, cnt, low_limit >> 3);
 663       __ br(Assembler::LT, small);
 664       __ zero_dcache_blocks(base, cnt);
 665       __ bind(small);
 666     }
 667 
 668     {
 669       // Number of stp instructions we'll unroll
 670       const int unroll =
 671         MacroAssembler::zero_words_block_size / 2;
 672       // Clear the remaining blocks.
 673       Label loop;
 674       __ subs(cnt, cnt, unroll * 2);
 675       __ br(Assembler::LT, done);
 676       __ bind(loop);
 677       for (int i = 0; i < unroll; i++)
 678         __ stp(zr, zr, __ post(base, 16));
 679       __ subs(cnt, cnt, unroll * 2);
 680       __ br(Assembler::GE, loop);
 681       __ bind(done);
 682       __ add(cnt, cnt, unroll * 2);
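      // cnt now holds the number of words still to be cleared, which is
      // less than zero_words_block_size as promised to the caller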
 683     }
 684 
 685     __ ret(lr);
 686 
 687     return start;
 688   }
 689 
 690 
 691   typedef enum {
 692     copy_forwards = 1,
 693     copy_backwards = -1
 694   } copy_direction;
 695 
 696   // Bulk copy of blocks of 8 words.
 697   //
 698   // count is a count of words.
 699   //
 700   // Precondition: count >= 8
 701   //
 702   // Postconditions:
 703   //
 704   // The least significant bit of count contains the remaining count
 705   // of words to copy.  The rest of count is trash.
 706   //
 707   // s and d are adjusted to point to the remaining words to copy
 708   //
 709   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 710                            copy_direction direction) {
 711     int unit = wordSize * direction;
 712     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
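    // bias shifts s and d down in the forwards case so that the positive
    // offsets used below address the block starting at the original s
    // and d; the backwards copy needs no bias because its negative
    // offsets already address the preceding block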
 713 
 714     int offset;
 715     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 716       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 717     const Register stride = r13;
 718 
 719     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 720     assert_different_registers(s, d, count, rscratch1);
 721 
 722     Label again, drain;
 723     const char *stub_name;
 724     if (direction == copy_forwards)
 725       stub_name = "forward_copy_longs";
 726     else
 727       stub_name = "backward_copy_longs";
 728     StubCodeMark mark(this, "StubRoutines", stub_name);
 729     __ align(CodeEntryAlignment);
 730     __ bind(start);
 731 
 732     Label unaligned_copy_long;
 733     if (AvoidUnalignedAccesses) {
 734       __ tbnz(d, 3, unaligned_copy_long);
 735     }
 736 
 737     if (direction == copy_forwards) {
 738       __ sub(s, s, bias);
 739       __ sub(d, d, bias);
 740     }
 741 
 742 #ifdef ASSERT
 743     // Make sure we are never given < 8 words
 744     {
 745       Label L;
 746       __ cmp(count, 8);
 747       __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
 749       __ bind(L);
 750     }
 751 #endif
 752 
 753     // Fill 8 registers
 754     if (UseSIMDForMemoryOps) {
 755       __ ldpq(v0, v1, Address(s, 4 * unit));
 756       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 757     } else {
 758       __ ldp(t0, t1, Address(s, 2 * unit));
 759       __ ldp(t2, t3, Address(s, 4 * unit));
 760       __ ldp(t4, t5, Address(s, 6 * unit));
 761       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 762     }
 763 
 764     __ subs(count, count, 16);
 765     __ br(Assembler::LO, drain);
 766 
 767     int prefetch = PrefetchCopyIntervalInBytes;
 768     bool use_stride = false;
 769     if (direction == copy_backwards) {
 770        use_stride = prefetch > 256;
 771        prefetch = -prefetch;
 772        if (use_stride) __ mov(stride, prefetch);
 773     }
 774 
 775     __ bind(again);
 776 
 777     if (PrefetchCopyIntervalInBytes > 0)
 778       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 779 
 780     if (UseSIMDForMemoryOps) {
 781       __ stpq(v0, v1, Address(d, 4 * unit));
 782       __ ldpq(v0, v1, Address(s, 4 * unit));
 783       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 784       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 785     } else {
 786       __ stp(t0, t1, Address(d, 2 * unit));
 787       __ ldp(t0, t1, Address(s, 2 * unit));
 788       __ stp(t2, t3, Address(d, 4 * unit));
 789       __ ldp(t2, t3, Address(s, 4 * unit));
 790       __ stp(t4, t5, Address(d, 6 * unit));
 791       __ ldp(t4, t5, Address(s, 6 * unit));
 792       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 793       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 794     }
 795 
 796     __ subs(count, count, 8);
 797     __ br(Assembler::HS, again);
 798 
 799     // Drain
 800     __ bind(drain);
 801     if (UseSIMDForMemoryOps) {
 802       __ stpq(v0, v1, Address(d, 4 * unit));
 803       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 804     } else {
 805       __ stp(t0, t1, Address(d, 2 * unit));
 806       __ stp(t2, t3, Address(d, 4 * unit));
 807       __ stp(t4, t5, Address(d, 6 * unit));
 808       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 809     }
 810 
 811     {
 812       Label L1, L2;
 813       __ tbz(count, exact_log2(4), L1);
 814       if (UseSIMDForMemoryOps) {
 815         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 816         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 817       } else {
 818         __ ldp(t0, t1, Address(s, 2 * unit));
 819         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 820         __ stp(t0, t1, Address(d, 2 * unit));
 821         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 822       }
 823       __ bind(L1);
 824 
 825       if (direction == copy_forwards) {
 826         __ add(s, s, bias);
 827         __ add(d, d, bias);
 828       }
 829 
 830       __ tbz(count, 1, L2);
 831       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 832       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 833       __ bind(L2);
 834     }
 835 
 836     __ ret(lr);
 837 
 838     if (AvoidUnalignedAccesses) {
 839       Label drain, again;
 840       // Register order for storing. Order is different for backward copy.
 841 
 842       __ bind(unaligned_copy_long);
 843 
      // source address is even-word (16-byte) aligned, target is odd-word aligned
 845       //
 846       // when forward copying word pairs we read long pairs at offsets
 847       // {0, 2, 4, 6} (in long words). when backwards copying we read
 848       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 849       // address by -2 in the forwards case so we can compute the
 850       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 851       // or -1.
 852       //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1, which means we
      // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 860       // offsets {1, 3, 5, 7, 8} * unit.
 861 
 862       if (direction == copy_forwards) {
 863         __ sub(s, s, 16);
 864         __ sub(d, d, 8);
 865       }
 866 
 867       // Fill 8 registers
 868       //
 869       // for forwards copy s was offset by -16 from the original input
 870       // value of s so the register contents are at these offsets
 871       // relative to the 64 bit block addressed by that original input
 872       // and so on for each successive 64 byte block when s is updated
 873       //
 874       // t0 at offset 0,  t1 at offset 8
 875       // t2 at offset 16, t3 at offset 24
 876       // t4 at offset 32, t5 at offset 40
 877       // t6 at offset 48, t7 at offset 56
 878 
 879       // for backwards copy s was not offset so the register contents
 880       // are at these offsets into the preceding 64 byte block
 881       // relative to that original input and so on for each successive
 882       // preceding 64 byte block when s is updated. this explains the
 883       // slightly counter-intuitive looking pattern of register usage
 884       // in the stp instructions for backwards copy.
 885       //
 886       // t0 at offset -16, t1 at offset -8
 887       // t2 at offset -32, t3 at offset -24
 888       // t4 at offset -48, t5 at offset -40
 889       // t6 at offset -64, t7 at offset -56
 890 
 891       __ ldp(t0, t1, Address(s, 2 * unit));
 892       __ ldp(t2, t3, Address(s, 4 * unit));
 893       __ ldp(t4, t5, Address(s, 6 * unit));
 894       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 895 
 896       __ subs(count, count, 16);
 897       __ br(Assembler::LO, drain);
 898 
 899       int prefetch = PrefetchCopyIntervalInBytes;
 900       bool use_stride = false;
 901       if (direction == copy_backwards) {
 902          use_stride = prefetch > 256;
 903          prefetch = -prefetch;
 904          if (use_stride) __ mov(stride, prefetch);
 905       }
 906 
 907       __ bind(again);
 908 
 909       if (PrefetchCopyIntervalInBytes > 0)
 910         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 911 
 912       if (direction == copy_forwards) {
 913        // allowing for the offset of -8 the store instructions place
 914        // registers into the target 64 bit block at the following
 915        // offsets
 916        //
 917        // t0 at offset 0
 918        // t1 at offset 8,  t2 at offset 16
 919        // t3 at offset 24, t4 at offset 32
 920        // t5 at offset 40, t6 at offset 48
 921        // t7 at offset 56
 922 
 923         __ str(t0, Address(d, 1 * unit));
 924         __ stp(t1, t2, Address(d, 2 * unit));
 925         __ ldp(t0, t1, Address(s, 2 * unit));
 926         __ stp(t3, t4, Address(d, 4 * unit));
 927         __ ldp(t2, t3, Address(s, 4 * unit));
 928         __ stp(t5, t6, Address(d, 6 * unit));
 929         __ ldp(t4, t5, Address(s, 6 * unit));
 930         __ str(t7, Address(__ pre(d, 8 * unit)));
 931         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 932       } else {
 933        // d was not offset when we started so the registers are
 934        // written into the 64 bit block preceding d with the following
 935        // offsets
 936        //
 937        // t1 at offset -8
 938        // t3 at offset -24, t0 at offset -16
       // t5 at offset -40, t2 at offset -32
 940        // t7 at offset -56, t4 at offset -48
 941        //                   t6 at offset -64
 942        //
 943        // note that this matches the offsets previously noted for the
 944        // loads
 945 
 946         __ str(t1, Address(d, 1 * unit));
 947         __ stp(t3, t0, Address(d, 3 * unit));
 948         __ ldp(t0, t1, Address(s, 2 * unit));
 949         __ stp(t5, t2, Address(d, 5 * unit));
 950         __ ldp(t2, t3, Address(s, 4 * unit));
 951         __ stp(t7, t4, Address(d, 7 * unit));
 952         __ ldp(t4, t5, Address(s, 6 * unit));
 953         __ str(t6, Address(__ pre(d, 8 * unit)));
 954         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 955       }
 956 
 957       __ subs(count, count, 8);
 958       __ br(Assembler::HS, again);
 959 
 960       // Drain
 961       //
 962       // this uses the same pattern of offsets and register arguments
 963       // as above
 964       __ bind(drain);
 965       if (direction == copy_forwards) {
 966         __ str(t0, Address(d, 1 * unit));
 967         __ stp(t1, t2, Address(d, 2 * unit));
 968         __ stp(t3, t4, Address(d, 4 * unit));
 969         __ stp(t5, t6, Address(d, 6 * unit));
 970         __ str(t7, Address(__ pre(d, 8 * unit)));
 971       } else {
 972         __ str(t1, Address(d, 1 * unit));
 973         __ stp(t3, t0, Address(d, 3 * unit));
 974         __ stp(t5, t2, Address(d, 5 * unit));
 975         __ stp(t7, t4, Address(d, 7 * unit));
 976         __ str(t6, Address(__ pre(d, 8 * unit)));
 977       }
      // now we need to copy any remaining part block which may
      // include a 4 word subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
 982       {
 983         Label L1, L2;
 984         __ tbz(count, exact_log2(4), L1);
 985        // this is the same as above but copying only 4 longs hence
       // with only one intervening stp between the str instructions
 987        // but note that the offsets and registers still follow the
 988        // same pattern
 989         __ ldp(t0, t1, Address(s, 2 * unit));
 990         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 991         if (direction == copy_forwards) {
 992           __ str(t0, Address(d, 1 * unit));
 993           __ stp(t1, t2, Address(d, 2 * unit));
 994           __ str(t3, Address(__ pre(d, 4 * unit)));
 995         } else {
 996           __ str(t1, Address(d, 1 * unit));
 997           __ stp(t3, t0, Address(d, 3 * unit));
 998           __ str(t2, Address(__ pre(d, 4 * unit)));
 999         }
1000         __ bind(L1);
1001 
1002         __ tbz(count, 1, L2);
1003        // this is the same as above but copying only 2 longs hence
1004        // there is no intervening stp between the str instructions
1005        // but note that the offset and register patterns are still
1006        // the same
1007         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
1008         if (direction == copy_forwards) {
1009           __ str(t0, Address(d, 1 * unit));
1010           __ str(t1, Address(__ pre(d, 2 * unit)));
1011         } else {
1012           __ str(t1, Address(d, 1 * unit));
1013           __ str(t0, Address(__ pre(d, 2 * unit)));
1014         }
1015         __ bind(L2);
1016 
1017        // for forwards copy we need to re-adjust the offsets we
       // applied so that s and d follow the last words written
1019 
1020        if (direction == copy_forwards) {
1021          __ add(s, s, 16);
1022          __ add(d, d, 8);
1023        }
1024 
1025       }
1026 
1027       __ ret(lr);
    }
1029   }
1030 
1031   // Small copy: less than 16 bytes.
1032   //
1033   // NB: Ignores all of the bits of count which represent more than 15
1034   // bytes, so a caller doesn't have to mask them.
1035 
1036   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1037     bool is_backwards = step < 0;
1038     size_t granularity = uabs(step);
1039     int direction = is_backwards ? -1 : 1;
1040     int unit = wordSize * direction;
1041 
1042     Label Lpair, Lword, Lint, Lshort, Lbyte;
1043 
1044     assert(granularity
1045            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1046 
1047     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1048 
1049     // ??? I don't know if this bit-test-and-branch is the right thing
1050     // to do.  It does a lot of jumping, resulting in several
1051     // mispredicted branches.  It might make more sense to do this
1052     // with something like Duff's device with a single computed branch.
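    //
    // The tests below peel off one 8-byte, 4-byte, 2-byte and 1-byte
    // move, each guarded by the bit of count that corresponds to that
    // size, working down from the word size to the copy granularity.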
1053 
1054     __ tbz(count, 3 - exact_log2(granularity), Lword);
1055     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1056     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1057     __ bind(Lword);
1058 
1059     if (granularity <= sizeof (jint)) {
1060       __ tbz(count, 2 - exact_log2(granularity), Lint);
1061       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1062       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1063       __ bind(Lint);
1064     }
1065 
1066     if (granularity <= sizeof (jshort)) {
1067       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1068       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1069       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1070       __ bind(Lshort);
1071     }
1072 
1073     if (granularity <= sizeof (jbyte)) {
1074       __ tbz(count, 0, Lbyte);
1075       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1076       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1077       __ bind(Lbyte);
1078     }
1079   }
1080 
1081   Label copy_f, copy_b;
1082 
1083   // All-singing all-dancing memory copy.
1084   //
1085   // Copy count units of memory from s to d.  The size of a unit is
1086   // step, which can be positive or negative depending on the direction
1087   // of copy.  If is_aligned is false, we align the source address.
1088   //
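  // Copies of at most 80 bytes (96 with SIMD) are done with straight-line
  // code using overlapping loads and stores; anything larger aligns the
  // source to a 2-word boundary, bulk-copies whole words via the
  // copy_f/copy_b stubs and finishes the remaining tail with
  // copy_memory_small.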
1089 
1090   void copy_memory(bool is_aligned, Register s, Register d,
1091                    Register count, Register tmp, int step) {
1092     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1093     bool is_backwards = step < 0;
1094     int granularity = uabs(step);
1095     const Register t0 = r3, t1 = r4;
1096 
    // Copies of <= 96 bytes are done inline. Direction doesn't matter
    // because we always load all the data before writing anything.
1099     Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
1100     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1101     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1102     const Register send = r17, dend = r18;
1103 
1104     if (PrefetchCopyIntervalInBytes > 0)
1105       __ prfm(Address(s, 0), PLDL1KEEP);
1106     __ cmp(count, (UseSIMDForMemoryOps ? 96:80)/granularity);
1107     __ br(Assembler::HI, copy_big);
1108 
1109     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1110     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1111 
1112     __ cmp(count, 16/granularity);
1113     __ br(Assembler::LS, copy16);
1114 
1115     __ cmp(count, 64/granularity);
1116     __ br(Assembler::HI, copy80);
1117 
1118     __ cmp(count, 32/granularity);
1119     __ br(Assembler::LS, copy32);
1120 
1121     // 33..64 bytes
1122     if (UseSIMDForMemoryOps) {
1123       __ ldpq(v0, v1, Address(s, 0));
1124       __ ldpq(v2, v3, Address(send, -32));
1125       __ stpq(v0, v1, Address(d, 0));
1126       __ stpq(v2, v3, Address(dend, -32));
1127     } else {
1128       __ ldp(t0, t1, Address(s, 0));
1129       __ ldp(t2, t3, Address(s, 16));
1130       __ ldp(t4, t5, Address(send, -32));
1131       __ ldp(t6, t7, Address(send, -16));
1132 
1133       __ stp(t0, t1, Address(d, 0));
1134       __ stp(t2, t3, Address(d, 16));
1135       __ stp(t4, t5, Address(dend, -32));
1136       __ stp(t6, t7, Address(dend, -16));
1137     }
1138     __ b(finish);
1139 
1140     // 17..32 bytes
1141     __ bind(copy32);
1142     __ ldp(t0, t1, Address(s, 0));
1143     __ ldp(t2, t3, Address(send, -16));
1144     __ stp(t0, t1, Address(d, 0));
1145     __ stp(t2, t3, Address(dend, -16));
1146     __ b(finish);
1147 
1148     // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
1150     __ bind(copy80);
1151     if (UseSIMDForMemoryOps) {
1152       __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
1153       __ ldpq(v4, v5, Address(send, -32));
1154       __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
1155       __ stpq(v4, v5, Address(dend, -32));
1156     } else {
1157       __ ldp(t0, t1, Address(s, 0));
1158       __ ldp(t2, t3, Address(s, 16));
1159       __ ldp(t4, t5, Address(s, 32));
1160       __ ldp(t6, t7, Address(s, 48));
1161       __ ldp(t8, t9, Address(send, -16));
1162 
1163       __ stp(t0, t1, Address(d, 0));
1164       __ stp(t2, t3, Address(d, 16));
1165       __ stp(t4, t5, Address(d, 32));
1166       __ stp(t6, t7, Address(d, 48));
1167       __ stp(t8, t9, Address(dend, -16));
1168     }
1169     __ b(finish);
1170 
1171     // 0..16 bytes
1172     __ bind(copy16);
1173     __ cmp(count, 8/granularity);
1174     __ br(Assembler::LO, copy8);
1175 
1176     // 8..16 bytes
1177     __ ldr(t0, Address(s, 0));
1178     __ ldr(t1, Address(send, -8));
1179     __ str(t0, Address(d, 0));
1180     __ str(t1, Address(dend, -8));
1181     __ b(finish);
1182 
1183     if (granularity < 8) {
1184       // 4..7 bytes
1185       __ bind(copy8);
1186       __ tbz(count, 2 - exact_log2(granularity), copy4);
1187       __ ldrw(t0, Address(s, 0));
1188       __ ldrw(t1, Address(send, -4));
1189       __ strw(t0, Address(d, 0));
1190       __ strw(t1, Address(dend, -4));
1191       __ b(finish);
1192       if (granularity < 4) {
1193         // 0..3 bytes
1194         __ bind(copy4);
1195         __ cbz(count, finish); // get rid of 0 case
1196         if (granularity == 2) {
1197           __ ldrh(t0, Address(s, 0));
1198           __ strh(t0, Address(d, 0));
1199         } else { // granularity == 1
1200           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1201           // the first and last byte.
1202           // Handle the 3 byte case by loading and storing base + count/2
1203           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean that in the 1 byte case we load/store the same
1205           // byte 3 times.
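          // e.g. for count == 3 this copies s[0]->d[0], s[2]->d[2] and
          // s[1]->d[1]; for count == 1 all three moves hit byte 0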
1206           __ lsr(count, count, 1);
1207           __ ldrb(t0, Address(s, 0));
1208           __ ldrb(t1, Address(send, -1));
1209           __ ldrb(t2, Address(s, count));
1210           __ strb(t0, Address(d, 0));
1211           __ strb(t1, Address(dend, -1));
1212           __ strb(t2, Address(d, count));
1213         }
1214         __ b(finish);
1215       }
1216     }
1217 
1218     __ bind(copy_big);
1219     if (is_backwards) {
1220       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1221       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1222     }
1223 
    // Now that the small case is out of the way we can align the
    // source address on a 2-word boundary.
1226 
1227     Label aligned;
1228 
1229     if (is_aligned) {
1230       // We may have to adjust by 1 word to get s 2-word-aligned.
1231       __ tbz(s, exact_log2(wordSize), aligned);
1232       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1233       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1234       __ sub(count, count, wordSize/granularity);
1235     } else {
1236       if (is_backwards) {
1237         __ andr(rscratch2, s, 2 * wordSize - 1);
1238       } else {
1239         __ neg(rscratch2, s);
1240         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1241       }
1242       // rscratch2 is the byte adjustment needed to align s.
1243       __ cbz(rscratch2, aligned);
1244       int shift = exact_log2(granularity);
1245       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1246       __ sub(count, count, rscratch2);
1247 
1248 #if 0
1249       // ?? This code is only correct for a disjoint copy.  It may or
1250       // may not make sense to use it in that case.
1251 
1252       // Copy the first pair; s and d may not be aligned.
1253       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1254       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1255 
1256       // Align s and d, adjust count
1257       if (is_backwards) {
1258         __ sub(s, s, rscratch2);
1259         __ sub(d, d, rscratch2);
1260       } else {
1261         __ add(s, s, rscratch2);
1262         __ add(d, d, rscratch2);
1263       }
1264 #else
1265       copy_memory_small(s, d, rscratch2, rscratch1, step);
1266 #endif
1267     }
1268 
1269     __ bind(aligned);
1270 
1271     // s is now 2-word-aligned.
1272 
1273     // We have a count of units and some trailing bytes.  Adjust the
1274     // count and do a bulk copy of words.
1275     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1276     if (direction == copy_forwards)
1277       __ bl(copy_f);
1278     else
1279       __ bl(copy_b);
1280 
1281     // And the tail.
1282     copy_memory_small(s, d, count, tmp, step);
1283 
1284     if (granularity >= 8) __ bind(copy8);
1285     if (granularity >= 4) __ bind(copy4);
1286     __ bind(finish);
1287   }
1288 
1289 
1290   void clobber_registers() {
1291 #ifdef ASSERT
1292     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1293     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1294     for (Register r = r3; r <= r18; r++)
1295       if (r != rscratch1) __ mov(r, rscratch1);
1296 #endif
1297   }
1298 
1299   // Scan over array at a for count oops, verifying each one.
1300   // Preserves a and count, clobbers rscratch1 and rscratch2.
1301   void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1302     Label loop, end;
1303     __ mov(rscratch1, a);
1304     __ mov(rscratch2, zr);
1305     __ bind(loop);
1306     __ cmp(rscratch2, count);
1307     __ br(Assembler::HS, end);
1308     if (size == (size_t)wordSize) {
1309       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1310       __ verify_oop(temp);
1311     } else {
1312       __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1313       __ decode_heap_oop(temp); // calls verify_oop
1314     }
1315     __ add(rscratch2, rscratch2, size);
1316     __ b(loop);
1317     __ bind(end);
1318   }
1319 
1320   // Arguments:
1321   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1322   //             ignored
1323   //   is_oop  - true => oop array, so generate store check code
1324   //   name    - stub name string
1325   //
1326   // Inputs:
1327   //   c_rarg0   - source array address
1328   //   c_rarg1   - destination array address
1329   //   c_rarg2   - element count, treated as ssize_t, can be zero
1330   //
1331   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1332   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1334   //
1335   // Side Effects:
1336   //   disjoint_int_copy_entry is set to the no-overlap entry point
1337   //   used by generate_conjoint_int_oop_copy().
1338   //
1339   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
1340                                   const char *name, bool dest_uninitialized = false) {
1341     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1342     RegSet saved_reg = RegSet::of(s, d, count);
1343     __ align(CodeEntryAlignment);
1344     StubCodeMark mark(this, "StubRoutines", name);
1345     address start = __ pc();
1346     __ enter();
1347 
1348     if (entry != NULL) {
1349       *entry = __ pc();
1350       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1351       BLOCK_COMMENT("Entry:");
1352     }
1353 
1354     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
1355     if (dest_uninitialized) {
1356       decorators |= IS_DEST_UNINITIALIZED;
1357     }
1358     if (aligned) {
1359       decorators |= ARRAYCOPY_ALIGNED;
1360     }
1361 
1362     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1363     bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_reg);
1364 
1365     if (is_oop) {
1366       // save regs before copy_memory
1367       __ push(RegSet::of(d, count), sp);
1368     }
1369     copy_memory(aligned, s, d, count, rscratch1, size);
1370 
1371     if (is_oop) {
1372       __ pop(RegSet::of(d, count), sp);
1373       if (VerifyOops)
1374         verify_oop_array(size, d, count, r16);
1375       __ sub(count, count, 1); // make an inclusive end pointer
1376       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1377     }
1378 
1379     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1380 
1381     __ leave();
1382     __ mov(r0, zr); // return 0
1383     __ ret(lr);
1384 #ifdef BUILTIN_SIM
1385     {
1386       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1387       sim->notifyCompile(const_cast<char*>(name), start);
1388     }
1389 #endif
1390     return start;
1391   }
1392 
1393   // Arguments:
1394   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1395   //             ignored
1396   //   is_oop  - true => oop array, so generate store check code
1397   //   name    - stub name string
1398   //
1399   // Inputs:
1400   //   c_rarg0   - source array address
1401   //   c_rarg1   - destination array address
1402   //   c_rarg2   - element count, treated as ssize_t, can be zero
1403   //
1404   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1405   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1407   //
1408   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1409                                  address *entry, const char *name,
1410                                  bool dest_uninitialized = false) {
1411     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1412     RegSet saved_regs = RegSet::of(s, d, count);
1413     StubCodeMark mark(this, "StubRoutines", name);
1414     address start = __ pc();
1415     __ enter();
1416 
1417     if (entry != NULL) {
1418       *entry = __ pc();
1419       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1420       BLOCK_COMMENT("Entry:");
1421     }
1422 
1423     // use fwd copy when (d-s) above_equal (count*size)
1424     __ sub(rscratch1, d, s);
1425     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1426     __ br(Assembler::HS, nooverlap_target);
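    // n.b. the unsigned (HS) test also routes the d < s case to the
    // forward copy: the difference wraps to a large unsigned value, and
    // copying forwards is always safe when the destination starts below
    // the source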
1427 
1428     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
1429     if (dest_uninitialized) {
1430       decorators |= IS_DEST_UNINITIALIZED;
1431     }
1432     if (aligned) {
1433       decorators |= ARRAYCOPY_ALIGNED;
1434     }
1435 
1436     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1437     bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_regs);
1438 
1439     if (is_oop) {
1440       // save regs before copy_memory
1441       __ push(RegSet::of(d, count), sp);
1442     }
1443     copy_memory(aligned, s, d, count, rscratch1, -size);
1444     if (is_oop) {
1445       __ pop(RegSet::of(d, count), sp);
1446       if (VerifyOops)
1447         verify_oop_array(size, d, count, r16);
1448       __ sub(count, count, 1); // make an inclusive end pointer
1449       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1450     }
1451     bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
1452     __ leave();
1453     __ mov(r0, zr); // return 0
1454     __ ret(lr);
1455 #ifdef BUILTIN_SIM
1456     {
1457       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1458       sim->notifyCompile(const_cast<char*>(name), start);
1459     }
1460 #endif
1461     return start;
  }
1463 
1464   // Arguments:
1465   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1466   //             ignored
1467   //   name    - stub name string
1468   //
1469   // Inputs:
1470   //   c_rarg0   - source array address
1471   //   c_rarg1   - destination array address
1472   //   c_rarg2   - element count, treated as ssize_t, can be zero
1473   //
1474   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1475   // we let the hardware handle it.  The one to eight bytes within words,
1476   // dwords or qwords that span cache line boundaries will still be loaded
1477   // and stored atomically.
1478   //
1486   // Side Effects:
1487   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1488   //   used by generate_conjoint_byte_copy().
1489   //
1490   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1491     const bool not_oop = false;
1492     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1493   }
1494 
1495   // Arguments:
1496   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1497   //             ignored
1498   //   name    - stub name string
1499   //
1500   // Inputs:
1501   //   c_rarg0   - source array address
1502   //   c_rarg1   - destination array address
1503   //   c_rarg2   - element count, treated as ssize_t, can be zero
1504   //
1505   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1506   // we let the hardware handle it.  The one to eight bytes within words,
1507   // dwords or qwords that span cache line boundaries will still be loaded
1508   // and stored atomically.
1509   //
1510   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1511                                       address* entry, const char *name) {
1512     const bool not_oop = false;
1513     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1514   }
1515 
1516   // Arguments:
1517   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1518   //             ignored
1519   //   name    - stub name string
1520   //
1521   // Inputs:
1522   //   c_rarg0   - source array address
1523   //   c_rarg1   - destination array address
1524   //   c_rarg2   - element count, treated as ssize_t, can be zero
1525   //
1526   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1527   // let the hardware handle it.  The two or four words within dwords
1528   // or qwords that span cache line boundaries will still be loaded
1529   // and stored atomically.
1530   //
1531   // Side Effects:
1532   //   disjoint_short_copy_entry is set to the no-overlap entry point
1533   //   used by generate_conjoint_short_copy().
1534   //
1535   address generate_disjoint_short_copy(bool aligned,
1536                                        address* entry, const char *name) {
1537     const bool not_oop = false;
1538     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1539   }
1540 
1541   // Arguments:
1542   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1543   //             ignored
1544   //   name    - stub name string
1545   //
1546   // Inputs:
1547   //   c_rarg0   - source array address
1548   //   c_rarg1   - destination array address
1549   //   c_rarg2   - element count, treated as ssize_t, can be zero
1550   //
1551   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1552   // let the hardware handle it.  The two or four words within dwords
1553   // or qwords that span cache line boundaries will still be loaded
1554   // and stored atomically.
1555   //
1556   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1557                                        address *entry, const char *name) {
1558     const bool not_oop = false;
1559     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1560 
1561   }
1562   // Arguments:
1563   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1564   //             ignored
1565   //   name    - stub name string
1566   //
1567   // Inputs:
1568   //   c_rarg0   - source array address
1569   //   c_rarg1   - destination array address
1570   //   c_rarg2   - element count, treated as ssize_t, can be zero
1571   //
1572   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1573   // the hardware handle it.  The two dwords within qwords that span
1574   // cache line boundaries will still be loaded and stored atomically.
1575   //
1576   // Side Effects:
1577   //   disjoint_int_copy_entry is set to the no-overlap entry point
1578   //   used by generate_conjoint_int_copy().
1579   //
1580   address generate_disjoint_int_copy(bool aligned, address *entry,
1581                                          const char *name, bool dest_uninitialized = false) {
1582     const bool not_oop = false;
1583     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1584   }
1585 
1586   // Arguments:
1587   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1588   //             ignored
1589   //   name    - stub name string
1590   //
1591   // Inputs:
1592   //   c_rarg0   - source array address
1593   //   c_rarg1   - destination array address
1594   //   c_rarg2   - element count, treated as ssize_t, can be zero
1595   //
1596   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1597   // the hardware handle it.  The two dwords within qwords that span
1598   // cache line boundaries will still be loaded and stored atomically.
1599   //
1600   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1601                                      address *entry, const char *name,
1602                                      bool dest_uninitialized = false) {
1603     const bool not_oop = false;
1604     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1605   }
1606 
1607 
1608   // Arguments:
1609   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1610   //             ignored
1611   //   name    - stub name string
1612   //
1613   // Inputs:
1614   //   c_rarg0   - source array address
1615   //   c_rarg1   - destination array address
1616   //   c_rarg2   - element count, treated as size_t, can be zero
1617   //
1618   // Side Effects:
1619   //   disjoint_long_copy_entry is set to the no-overlap entry point
1620   //   used by generate_conjoint_long_copy().
1621   //
1622   address generate_disjoint_long_copy(bool aligned, address *entry,
1623                                           const char *name, bool dest_uninitialized = false) {
1624     const bool not_oop = false;
1625     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1626   }
1627 
1628   // Arguments:
1629   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1630   //             ignored
1631   //   name    - stub name string
1632   //
1633   // Inputs:
1634   //   c_rarg0   - source array address
1635   //   c_rarg1   - destination array address
1636   //   c_rarg2   - element count, treated as size_t, can be zero
1637   //
1638   address generate_conjoint_long_copy(bool aligned,
1639                                       address nooverlap_target, address *entry,
1640                                       const char *name, bool dest_uninitialized = false) {
1641     const bool not_oop = false;
1642     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1643   }
1644 
1645   // Arguments:
1646   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1647   //             ignored
1648   //   name    - stub name string
1649   //
1650   // Inputs:
1651   //   c_rarg0   - source array address
1652   //   c_rarg1   - destination array address
1653   //   c_rarg2   - element count, treated as size_t, can be zero
1654   //
1655   // Side Effects:
1656   //   disjoint_oop_copy_entry is set to the no-overlap entry point
1657   //   used by generate_conjoint_oop_copy().
1658   //
1659   address generate_disjoint_oop_copy(bool aligned, address *entry,
1660                                      const char *name, bool dest_uninitialized) {
1661     const bool is_oop = true;
1662     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1663     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1664   }
1665 
1666   // Arguments:
1667   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1668   //             ignored
1669   //   name    - stub name string
1670   //
1671   // Inputs:
1672   //   c_rarg0   - source array address
1673   //   c_rarg1   - destination array address
1674   //   c_rarg2   - element count, treated as size_t, can be zero
1675   //
1676   address generate_conjoint_oop_copy(bool aligned,
1677                                      address nooverlap_target, address *entry,
1678                                      const char *name, bool dest_uninitialized) {
1679     const bool is_oop = true;
1680     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1681     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1682                                   name, dest_uninitialized);
1683   }
1684 
1685 
1686   // Helper for generating a dynamic type check.
1687   // Smashes rscratch1.
1688   void generate_type_check(Register sub_klass,
1689                            Register super_check_offset,
1690                            Register super_klass,
1691                            Label& L_success) {
1692     assert_different_registers(sub_klass, super_check_offset, super_klass);
1693 
1694     BLOCK_COMMENT("type_check:");
1695 
1696     Label L_miss;
1697 
1698     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1699                                      super_check_offset);
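         // The fast path decides exact matches, primary supertypes and hits in
         // the secondary-super cache; anything inconclusive falls into the
         // slow-path scan of the secondary supers list below.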
1700     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1701 
1702     // Fall through on failure!
1703     __ BIND(L_miss);
1704   }
1705 
1706   //
1707   //  Generate checkcasting array copy stub
1708   //
1709   //  Input:
1710   //    c_rarg0   - source array address
1711   //    c_rarg1   - destination array address
1712   //    c_rarg2   - element count, treated as ssize_t, can be zero
1713   //    c_rarg3   - size_t ckoff (super_check_offset)
1714   //    c_rarg4   - oop ckval (super_klass)
1715   //
1716   //  Output:
1717   //    r0 ==  0  -  success
1718   //    r0 == -1^K - failure, where K is partial transfer count
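       //    (-1^K == ~K == -1 - K, so the caller can recover K as ~r0)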
1719   //
1720   address generate_checkcast_copy(const char *name, address *entry,
1721                                   bool dest_uninitialized = false) {
1722 
1723     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1724 
1725     // Input registers (after setup_arg_regs)
1726     const Register from        = c_rarg0;   // source array address
1727     const Register to          = c_rarg1;   // destination array address
1728     const Register count       = c_rarg2;   // elements count
1729     const Register ckoff       = c_rarg3;   // super_check_offset
1730     const Register ckval       = c_rarg4;   // super_klass
1731 
1732     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1733     RegSet wb_post_saved_regs = RegSet::of(count);
1734 
1735     // Registers used as temps (r18, r19, r20 are save-on-entry)
1736     const Register count_save  = r21;       // original elements count
1737     const Register start_to    = r20;       // destination array start address
1738     const Register copied_oop  = r18;       // actual oop copied
1739     const Register r19_klass   = r19;       // oop._klass
1740 
1741     //---------------------------------------------------------------
1742     // Assembler stub will be used for this call to arraycopy
1743     // if the two arrays are subtypes of Object[] but the
1744     // destination array type is not equal to or a supertype
1745     // of the source type.  Each element must be separately
1746     // checked.
1747 
1748     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1749                                copied_oop, r19_klass, count_save);
1750 
1751     __ align(CodeEntryAlignment);
1752     StubCodeMark mark(this, "StubRoutines", name);
1753     address start = __ pc();
1754 
1755     __ enter(); // required for proper stackwalking of RuntimeStub frame
1756 
1757 #ifdef ASSERT
1758     // caller guarantees that the arrays really are different
1759     // otherwise, we would have to make conjoint checks
1760     { Label L;
1761       array_overlap_test(L, TIMES_OOP);
1762       __ stop("checkcast_copy within a single array");
1763       __ bind(L);
1764     }
1765 #endif //ASSERT
1766 
1767     // Caller of this entry point must set up the argument registers.
1768     if (entry != NULL) {
1769       *entry = __ pc();
1770       BLOCK_COMMENT("Entry:");
1771     }
1772 
1773     // Empty array:  Nothing to do.
1774     __ cbz(count, L_done);
1775 
1776     __ push(RegSet::of(r18, r19, r20, r21), sp);
1777 
1778 #ifdef ASSERT
1779     BLOCK_COMMENT("assert consistent ckoff/ckval");
1780     // The ckoff and ckval must be mutually consistent,
1781     // even though caller generates both.
1782     { Label L;
1783       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1784       __ ldrw(start_to, Address(ckval, sco_offset));
1785       __ cmpw(ckoff, start_to);
1786       __ br(Assembler::EQ, L);
1787       __ stop("super_check_offset inconsistent");
1788       __ bind(L);
1789     }
1790 #endif //ASSERT
1791 
1792     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST;
1793     bool is_oop = true;
1794     if (dest_uninitialized) {
1795       decorators |= IS_DEST_UNINITIALIZED;
1796     }
1797 
1798     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
1799     bs->arraycopy_prologue(_masm, decorators, is_oop, to, count, wb_pre_saved_regs);
1800 
1801     // save the original count
1802     __ mov(count_save, count);
1803 
1804     // Copy from low to high addresses
1805     __ mov(start_to, to);              // Save destination array start address
1806     __ b(L_load_element);
1807 
1808     // ======== begin loop ========
1809     // (Loop is rotated; its entry is L_load_element.)
1810     // Loop control:
1811     //   for (; count != 0; count--) {
1812     //     copied_oop = load_heap_oop(from++);
1813     //     ... generate_type_check ...;
1814     //     store_heap_oop(to++, copied_oop);
1815     //   }
1816     __ align(OptoLoopAlignment);
1817 
1818     __ BIND(L_store_element);
1819     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW);  // store the oop
1820     __ sub(count, count, 1);
1821     __ cbz(count, L_do_card_marks);
1822 
1823     // ======== loop entry is here ========
1824     __ BIND(L_load_element);
1825     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop
1826     __ cbz(copied_oop, L_store_element);
1827 
1828     __ load_klass(r19_klass, copied_oop);// query the object klass
1829     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1830     // ======== end loop ========
1831 
1832     // It was a real error; we must depend on the caller to finish the job.
1833     // Register count = remaining oops, count_orig = total oops.
1834     // Emit GC store barriers for the oops we have copied and report
1835     // their number to the caller.
1836 
1837     __ subs(count, count_save, count);     // K = partially copied oop count
1838     __ eon(count, count, zr);                   // report (-1^K) to caller
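         // (eon with zr computes count ^ ~0 == ~K; the EQ branch below still
         //  tests the flags set by the subs above, i.e. it is taken when K == 0)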
1839     __ br(Assembler::EQ, L_done_pop);
1840 
1841     __ BIND(L_do_card_marks);
1842     __ add(to, to, -heapOopSize);         // make an inclusive end pointer
1843     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, to, rscratch1, wb_post_saved_regs);
1844 
1845     __ bind(L_done_pop);
1846     __ pop(RegSet::of(r18, r19, r20, r21), sp);
1847     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1848 
1849     __ bind(L_done);
1850     __ mov(r0, count);
1851     __ leave();
1852     __ ret(lr);
1853 
1854     return start;
1855   }
1856 
1857   // Perform range checks on the proposed arraycopy.
1858   // Kills temp, but nothing else.
1859   // Also, clean the sign bits of src_pos and dst_pos.
1860   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1861                               Register src_pos, // source position (c_rarg1)
1862                               Register dst,     // destination array oop (c_rarg2)
1863                               Register dst_pos, // destination position (c_rarg3)
1864                               Register length,
1865                               Register temp,
1866                               Label& L_failed) {
1867     BLOCK_COMMENT("arraycopy_range_checks:");
1868 
1869     assert_different_registers(rscratch1, temp);
1870 
1871     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1872     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1873     __ addw(temp, length, src_pos);
1874     __ cmpw(temp, rscratch1);
1875     __ br(Assembler::HI, L_failed);
1876 
1877     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1878     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1879     __ addw(temp, length, dst_pos);
1880     __ cmpw(temp, rscratch1);
1881     __ br(Assembler::HI, L_failed);
1882 
1883     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1884     __ movw(src_pos, src_pos);
1885     __ movw(dst_pos, dst_pos);
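         // (a 32-bit register write zero-extends, so movw wN, wN clears bits 63:32)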
1886 
1887     BLOCK_COMMENT("arraycopy_range_checks done");
1888   }
1889 
1890   // These stubs get called from some dumb test routine.
1891   // I'll write them properly when they're called from
1892   // something that's actually doing something.
1893   static void fake_arraycopy_stub(address src, address dst, int count) {
1894     assert(count == 0, "huh?");
1895   }
1896 
1897 
1898   //
1899   //  Generate 'unsafe' array copy stub
1900   //  Though just as safe as the other stubs, it takes an unscaled
1901   //  size_t argument instead of an element count.
1902   //
1903   //  Input:
1904   //    c_rarg0   - source array address
1905   //    c_rarg1   - destination array address
1906   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1907   //
1908   // Examines the alignment of the operands and dispatches
1909   // to a long, int, short, or byte copy loop.
1910   //
1911   address generate_unsafe_copy(const char *name,
1912                                address byte_copy_entry,
1913                                address short_copy_entry,
1914                                address int_copy_entry,
1915                                address long_copy_entry) {
1916     Label L_long_aligned, L_int_aligned, L_short_aligned;
1917     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1918 
1919     __ align(CodeEntryAlignment);
1920     StubCodeMark mark(this, "StubRoutines", name);
1921     address start = __ pc();
1922     __ enter(); // required for proper stackwalking of RuntimeStub frame
1923 
1924     // bump this on entry, not on exit:
1925     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1926 
1927     __ orr(rscratch1, s, d);
1928     __ orr(rscratch1, rscratch1, count);
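         // rscratch1 = s | d | count: its low-order bits give the common
         // alignment of source, destination and byte count, which selects the
         // widest element width that can be copied safely below.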
1929 
1930     __ andr(rscratch1, rscratch1, BytesPerLong-1);
1931     __ cbz(rscratch1, L_long_aligned);
1932     __ andr(rscratch1, rscratch1, BytesPerInt-1);
1933     __ cbz(rscratch1, L_int_aligned);
1934     __ tbz(rscratch1, 0, L_short_aligned);
1935     __ b(RuntimeAddress(byte_copy_entry));
1936 
1937     __ BIND(L_short_aligned);
1938     __ lsr(count, count, LogBytesPerShort);  // size => short_count
1939     __ b(RuntimeAddress(short_copy_entry));
1940     __ BIND(L_int_aligned);
1941     __ lsr(count, count, LogBytesPerInt);    // size => int_count
1942     __ b(RuntimeAddress(int_copy_entry));
1943     __ BIND(L_long_aligned);
1944     __ lsr(count, count, LogBytesPerLong);   // size => long_count
1945     __ b(RuntimeAddress(long_copy_entry));
1946 
1947     return start;
1948   }
1949 
1950   //
1951   //  Generate generic array copy stubs
1952   //
1953   //  Input:
1954   //    c_rarg0    -  src oop
1955   //    c_rarg1    -  src_pos (32-bits)
1956   //    c_rarg2    -  dst oop
1957   //    c_rarg3    -  dst_pos (32-bits)
1958   //    c_rarg4    -  element count (32-bits)
1959   //
1960   //  Output:
1961   //    r0 ==  0  -  success
1962   //    r0 == -1^K - failure, where K is partial transfer count
1963   //
1964   address generate_generic_copy(const char *name,
1965                                 address byte_copy_entry, address short_copy_entry,
1966                                 address int_copy_entry, address oop_copy_entry,
1967                                 address long_copy_entry, address checkcast_copy_entry) {
1968 
1969     Label L_failed, L_failed_0, L_objArray;
1970     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1971 
1972     // Input registers
1973     const Register src        = c_rarg0;  // source array oop
1974     const Register src_pos    = c_rarg1;  // source position
1975     const Register dst        = c_rarg2;  // destination array oop
1976     const Register dst_pos    = c_rarg3;  // destination position
1977     const Register length     = c_rarg4;
1978 
1979     StubCodeMark mark(this, "StubRoutines", name);
1980 
1981     __ align(CodeEntryAlignment);
1982     address start = __ pc();
1983 
1984     __ enter(); // required for proper stackwalking of RuntimeStub frame
1985 
1986     // bump this on entry, not on exit:
1987     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1988 
1989     //-----------------------------------------------------------------------
1990     // Assembler stub will be used for this call to arraycopy
1991     // if the following conditions are met:
1992     //
1993     // (1) src and dst must not be null.
1994     // (2) src_pos must not be negative.
1995     // (3) dst_pos must not be negative.
1996     // (4) length  must not be negative.
1997     // (5) src klass and dst klass should be the same and not NULL.
1998     // (6) src and dst should be arrays.
1999     // (7) src_pos + length must not exceed length of src.
2000     // (8) dst_pos + length must not exceed length of dst.
2001     //
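         // If any of these checks fails, the stub returns -1 (L_failed) and
         // the caller falls back to a slower path.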
2002 
2003     //  if (src == NULL) return -1;
2004     __ cbz(src, L_failed);
2005 
2006     //  if (src_pos < 0) return -1;
2007     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2008 
2009     //  if (dst == NULL) return -1;
2010     __ cbz(dst, L_failed);
2011 
2012     //  if (dst_pos < 0) return -1;
2013     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2014 
2015     // registers used as temp
2016     const Register scratch_length    = r16; // elements count to copy
2017     const Register scratch_src_klass = r17; // array klass
2018     const Register lh                = r18; // layout helper
2019 
2020     //  if (length < 0) return -1;
2021     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2022     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2023 
2024     __ load_klass(scratch_src_klass, src);
2025 #ifdef ASSERT
2026     //  assert(src->klass() != NULL);
2027     {
2028       BLOCK_COMMENT("assert klasses not null {");
2029       Label L1, L2;
2030       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2031       __ bind(L1);
2032       __ stop("broken null klass");
2033       __ bind(L2);
2034       __ load_klass(rscratch1, dst);
2035       __ cbz(rscratch1, L1);     // this would be broken also
2036       BLOCK_COMMENT("} assert klasses not null done");
2037     }
2038 #endif
2039 
2040     // Load layout helper (32-bits)
2041     //
2042     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2043     // 32        30    24            16              8     2                 0
2044     //
2045     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2046     //
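         // Comparing the full 32-bit layout helper against the canonical
         // objArray value below checks the tag, header size and element size
         // in a single compare.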
2047 
2048     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2049 
2050     // Handle objArrays completely differently...
2051     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2052     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2053     __ movw(rscratch1, objArray_lh);
2054     __ eorw(rscratch2, lh, rscratch1);
2055     __ cbzw(rscratch2, L_objArray);
2056 
2057     //  if (src->klass() != dst->klass()) return -1;
2058     __ load_klass(rscratch2, dst);
2059     __ eor(rscratch2, rscratch2, scratch_src_klass);
2060     __ cbnz(rscratch2, L_failed);
2061 
2062     //  if (!src->is_Array()) return -1;
2063     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2064 
2065     // At this point, it is known to be a typeArray (array_tag 0x3).
2066 #ifdef ASSERT
2067     {
2068       BLOCK_COMMENT("assert primitive array {");
2069       Label L;
2070       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2071       __ cmpw(lh, rscratch2);
2072       __ br(Assembler::GE, L);
2073       __ stop("must be a primitive array");
2074       __ bind(L);
2075       BLOCK_COMMENT("} assert primitive array done");
2076     }
2077 #endif
2078 
2079     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2080                            rscratch2, L_failed);
2081 
2082     // TypeArrayKlass
2083     //
2084     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2085     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2086     //
2087 
2088     const Register rscratch1_offset = rscratch1;    // array offset
2089     const Register r18_elsize = lh; // element size
2090 
2091     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2092            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2093     __ add(src, src, rscratch1_offset);           // src array offset
2094     __ add(dst, dst, rscratch1_offset);           // dst array offset
2095     BLOCK_COMMENT("choose copy loop based on element size");
2096 
2097     // next registers should be set before the jump to corresponding stub
2098     const Register from     = c_rarg0;  // source array address
2099     const Register to       = c_rarg1;  // destination array address
2100     const Register count    = c_rarg2;  // elements count
2101 
2102     // 'from', 'to' and 'count' must be set up in this order, because
2103     // they occupy the same registers as 'src', 'src_pos' and 'dst'.
2104 
2105     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2106 
2107     // The possible values of elsize are 0-3, i.e. exact_log2(element
2108     // size in bytes).  We do a simple bitwise binary search.
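         //   log2(elsize): 00 -> byte, 01 -> short, 10 -> int, 11 -> long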
2109   __ BIND(L_copy_bytes);
2110     __ tbnz(r18_elsize, 1, L_copy_ints);
2111     __ tbnz(r18_elsize, 0, L_copy_shorts);
2112     __ lea(from, Address(src, src_pos));// src_addr
2113     __ lea(to,   Address(dst, dst_pos));// dst_addr
2114     __ movw(count, scratch_length); // length
2115     __ b(RuntimeAddress(byte_copy_entry));
2116 
2117   __ BIND(L_copy_shorts);
2118     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2119     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2120     __ movw(count, scratch_length); // length
2121     __ b(RuntimeAddress(short_copy_entry));
2122 
2123   __ BIND(L_copy_ints);
2124     __ tbnz(r18_elsize, 0, L_copy_longs);
2125     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2126     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2127     __ movw(count, scratch_length); // length
2128     __ b(RuntimeAddress(int_copy_entry));
2129 
2130   __ BIND(L_copy_longs);
2131 #ifdef ASSERT
2132     {
2133       BLOCK_COMMENT("assert long copy {");
2134       Label L;
2135       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
2136       __ cmpw(r18_elsize, LogBytesPerLong);
2137       __ br(Assembler::EQ, L);
2138       __ stop("must be long copy, but elsize is wrong");
2139       __ bind(L);
2140       BLOCK_COMMENT("} assert long copy done");
2141     }
2142 #endif
2143     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2144     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2145     __ movw(count, scratch_length); // length
2146     __ b(RuntimeAddress(long_copy_entry));
2147 
2148     // ObjArrayKlass
2149   __ BIND(L_objArray);
2150     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2151 
2152     Label L_plain_copy, L_checkcast_copy;
2153     //  test array classes for subtyping
2154     __ load_klass(r18, dst);
2155     __ cmp(scratch_src_klass, r18); // usual case is exact equality
2156     __ br(Assembler::NE, L_checkcast_copy);
2157 
2158     // Identically typed arrays can be copied without element-wise checks.
2159     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2160                            rscratch2, L_failed);
2161 
2162     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2163     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2164     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2165     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2166     __ movw(count, scratch_length); // length
2167   __ BIND(L_plain_copy);
2168     __ b(RuntimeAddress(oop_copy_entry));
2169 
2170   __ BIND(L_checkcast_copy);
2171     // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
2172     {
2173       // Before looking at dst.length, make sure dst is also an objArray.
2174       __ ldrw(rscratch1, Address(r18, lh_offset));
2175       __ movw(rscratch2, objArray_lh);
2176       __ eorw(rscratch1, rscratch1, rscratch2);
2177       __ cbnzw(rscratch1, L_failed);
2178 
2179       // It is safe to examine both src.length and dst.length.
2180       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2181                              r18, L_failed);
2182 
2183       const Register rscratch2_dst_klass = rscratch2;
2184       __ load_klass(rscratch2_dst_klass, dst); // reload
2185 
2186       // Marshal the base address arguments now, freeing registers.
2187       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2188       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2189       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2190       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2191       __ movw(count, length);           // length (reloaded)
2192       Register sco_temp = c_rarg3;      // this register is free now
2193       assert_different_registers(from, to, count, sco_temp,
2194                                  rscratch2_dst_klass, scratch_src_klass);
2195       // assert_clean_int(count, sco_temp);
2196 
2197       // Generate the type check.
2198       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2199       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2200       // assert_clean_int(sco_temp, r18);
2201       generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy);
2202 
2203       // Fetch destination element klass from the ObjArrayKlass header.
2204       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2205       __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset));
2206       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2207 
2208       // the checkcast_copy loop needs two extra arguments:
2209       assert(c_rarg3 == sco_temp, "#3 already in place");
2210       // Set up arguments for checkcast_copy_entry.
2211       __ mov(c_rarg4, rscratch2_dst_klass);  // dst.klass.element_klass
2212       __ b(RuntimeAddress(checkcast_copy_entry));
2213     }
2214 
2215   __ BIND(L_failed);
2216     __ mov(r0, -1);
2217     __ leave();   // required for proper stackwalking of RuntimeStub frame
2218     __ ret(lr);
2219 
2220     return start;
2221   }
2222 
2223   //
2224   // Generate stub for array fill. If "aligned" is true, the
2225   // "to" address is assumed to be heapword aligned.
2226   //
2227   // Arguments for generated stub:
2228   //   to:    c_rarg0
2229   //   value: c_rarg1
2230   //   count: c_rarg2 treated as signed
2231   //
2232   address generate_fill(BasicType t, bool aligned, const char *name) {
2233     __ align(CodeEntryAlignment);
2234     StubCodeMark mark(this, "StubRoutines", name);
2235     address start = __ pc();
2236 
2237     BLOCK_COMMENT("Entry:");
2238 
2239     const Register to        = c_rarg0;  // source array address
2240     const Register value     = c_rarg1;  // value
2241     const Register count     = c_rarg2;  // elements count
2242 
2243     const Register bz_base = r10;        // base for block_zero routine
2244     const Register cnt_words = r11;      // temp register
2245 
2246     __ enter();
2247 
2248     Label L_fill_elements, L_exit1;
2249 
2250     int shift = -1;
2251     switch (t) {
2252       case T_BYTE:
2253         shift = 0;
2254         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2255         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2256         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2257         __ br(Assembler::LO, L_fill_elements);
2258         break;
2259       case T_SHORT:
2260         shift = 1;
2261         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2262         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2263         __ br(Assembler::LO, L_fill_elements);
2264         break;
2265       case T_INT:
2266         shift = 2;
2267         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2268         __ br(Assembler::LO, L_fill_elements);
2269         break;
2270       default: ShouldNotReachHere();
2271     }
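         // For T_BYTE and T_SHORT the fill value has now been replicated to
         // fill the low 32 bits (e.g. byte 0xAB becomes 0xABABABAB); it is
         // widened to 64 bits just before the bulk word fill below.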
2272 
2273     // Align source address at 8 bytes address boundary.
2274     Label L_skip_align1, L_skip_align2, L_skip_align4;
2275     if (!aligned) {
2276       switch (t) {
2277         case T_BYTE:
2278           // One byte misalignment happens only for byte arrays.
2279           __ tbz(to, 0, L_skip_align1);
2280           __ strb(value, Address(__ post(to, 1)));
2281           __ subw(count, count, 1);
2282           __ bind(L_skip_align1);
2283           // Fallthrough
2284         case T_SHORT:
2285           // Two bytes misalignment happens only for byte and short (char) arrays.
2286           __ tbz(to, 1, L_skip_align2);
2287           __ strh(value, Address(__ post(to, 2)));
2288           __ subw(count, count, 2 >> shift);
2289           __ bind(L_skip_align2);
2290           // Fallthrough
2291         case T_INT:
2292           // Align to 8 bytes, we know we are 4 byte aligned to start.
2293           __ tbz(to, 2, L_skip_align4);
2294           __ strw(value, Address(__ post(to, 4)));
2295           __ subw(count, count, 4 >> shift);
2296           __ bind(L_skip_align4);
2297           break;
2298         default: ShouldNotReachHere();
2299       }
2300     }
2301 
2302     //
2303     //  Fill large chunks
2304     //
2305     __ lsrw(cnt_words, count, 3 - shift); // number of words
2306     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2307     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
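         // cnt_words = number of whole 8-byte words to fill; 'count' now holds
         // the leftover element count (less than one word's worth).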
2308     if (UseBlockZeroing) {
2309       Label non_block_zeroing, rest;
2310       // If the fill value is zero we can use the fast zero_words().
2311       __ cbnz(value, non_block_zeroing);
2312       __ mov(bz_base, to);
2313       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2314       __ zero_words(bz_base, cnt_words);
2315       __ b(rest);
2316       __ bind(non_block_zeroing);
2317       __ fill_words(to, cnt_words, value);
2318       __ bind(rest);
2319     } else {
2320       __ fill_words(to, cnt_words, value);
2321     }
2322 
2323     // Remaining count is less than 8 bytes. Fill it by a single store.
2324     // Note that the total length is no less than 8 bytes.
2325     if (t == T_BYTE || t == T_SHORT) {
2326       Label L_exit1;
2327       __ cbzw(count, L_exit1);
2328       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2329       __ str(value, Address(to, -8));    // overwrite some elements
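           // The 8-byte store at (end - 8) may rewrite a few already-filled
           // elements with the same value, which is harmless and avoids a tail loop.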
2330       __ bind(L_exit1);
2331       __ leave();
2332       __ ret(lr);
2333     }
2334 
2335     // Handle copies less than 8 bytes.
2336     Label L_fill_2, L_fill_4, L_exit2;
2337     __ bind(L_fill_elements);
2338     switch (t) {
2339       case T_BYTE:
2340         __ tbz(count, 0, L_fill_2);
2341         __ strb(value, Address(__ post(to, 1)));
2342         __ bind(L_fill_2);
2343         __ tbz(count, 1, L_fill_4);
2344         __ strh(value, Address(__ post(to, 2)));
2345         __ bind(L_fill_4);
2346         __ tbz(count, 2, L_exit2);
2347         __ strw(value, Address(to));
2348         break;
2349       case T_SHORT:
2350         __ tbz(count, 0, L_fill_4);
2351         __ strh(value, Address(__ post(to, 2)));
2352         __ bind(L_fill_4);
2353         __ tbz(count, 1, L_exit2);
2354         __ strw(value, Address(to));
2355         break;
2356       case T_INT:
2357         __ cbzw(count, L_exit2);
2358         __ strw(value, Address(to));
2359         break;
2360       default: ShouldNotReachHere();
2361     }
2362     __ bind(L_exit2);
2363     __ leave();
2364     __ ret(lr);
2365     return start;
2366   }
2367 
2368   address generate_data_cache_writeback() {
2369     const Register line        = c_rarg0;  // address of line to write back
2370 
2371     __ align(CodeEntryAlignment);
2372 
2373     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");
2374 
2375     address start = __ pc();
2376     __ enter();
2377     __ cache_wb(Address(line, 0));
2378     __ leave();
2379     __ ret(lr);
2380 
2381     return start;
2382   }
2383 
2384   address generate_data_cache_writeback_sync() {
2385     const Register is_pre     = c_rarg0;  // pre or post sync
2386 
2387     __ align(CodeEntryAlignment);
2388 
2389     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");
2390 
2391     // pre wbsync is a no-op; post wbsync must order the preceding cache
2392     // write-backs, so it emits a memory barrier (the AArch64 counterpart of x86's sfence)
2393 
2394     Label skip;
2395     address start = __ pc();
2396     __ enter();
2397     __ cbnz(is_pre, skip);
2398     __ cache_wbsync(false);
2399     __ bind(skip);
2400     __ leave();
2401     __ ret(lr);
2402 
2403     return start;
2404   }
2405 
2406   void generate_arraycopy_stubs() {
2407     address entry;
2408     address entry_jbyte_arraycopy;
2409     address entry_jshort_arraycopy;
2410     address entry_jint_arraycopy;
2411     address entry_oop_arraycopy;
2412     address entry_jlong_arraycopy;
2413     address entry_checkcast_arraycopy;
2414 
2415     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2416     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2417 
2418     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2419 
2420     //*** jbyte
2421     // Always need aligned and unaligned versions
2422     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2423                                                                                   "jbyte_disjoint_arraycopy");
2424     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2425                                                                                   &entry_jbyte_arraycopy,
2426                                                                                   "jbyte_arraycopy");
2427     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2428                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2429     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2430                                                                                   "arrayof_jbyte_arraycopy");
2431 
2432     //*** jshort
2433     // Always need aligned and unaligned versions
2434     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2435                                                                                     "jshort_disjoint_arraycopy");
2436     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2437                                                                                     &entry_jshort_arraycopy,
2438                                                                                     "jshort_arraycopy");
2439     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2440                                                                                     "arrayof_jshort_disjoint_arraycopy");
2441     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2442                                                                                     "arrayof_jshort_arraycopy");
2443 
2444     //*** jint
2445     // Aligned versions
2446     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2447                                                                                 "arrayof_jint_disjoint_arraycopy");
2448     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2449                                                                                 "arrayof_jint_arraycopy");
2450     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2451     // entry_jint_arraycopy always points to the unaligned version
2452     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2453                                                                                 "jint_disjoint_arraycopy");
2454     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2455                                                                                 &entry_jint_arraycopy,
2456                                                                                 "jint_arraycopy");
2457 
2458     //*** jlong
2459     // It is always aligned
2460     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2461                                                                                   "arrayof_jlong_disjoint_arraycopy");
2462     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2463                                                                                   "arrayof_jlong_arraycopy");
2464     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2465     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2466 
2467     //*** oops
2468     {
2469       // With compressed oops we need unaligned versions; notice that
2470       // we overwrite entry_oop_arraycopy.
2471       bool aligned = !UseCompressedOops;
2472 
2473       StubRoutines::_arrayof_oop_disjoint_arraycopy
2474         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2475                                      /*dest_uninitialized*/false);
2476       StubRoutines::_arrayof_oop_arraycopy
2477         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2478                                      /*dest_uninitialized*/false);
2479       // Aligned versions without pre-barriers
2480       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2481         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2482                                      /*dest_uninitialized*/true);
2483       StubRoutines::_arrayof_oop_arraycopy_uninit
2484         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2485                                      /*dest_uninitialized*/true);
2486     }
2487 
2488     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2489     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2490     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2491     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2492 
2493     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2494     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2495                                                                         /*dest_uninitialized*/true);
2496 
2497     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2498                                                               entry_jbyte_arraycopy,
2499                                                               entry_jshort_arraycopy,
2500                                                               entry_jint_arraycopy,
2501                                                               entry_jlong_arraycopy);
2502 
2503     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2504                                                                entry_jbyte_arraycopy,
2505                                                                entry_jshort_arraycopy,
2506                                                                entry_jint_arraycopy,
2507                                                                entry_oop_arraycopy,
2508                                                                entry_jlong_arraycopy,
2509                                                                entry_checkcast_arraycopy);
2510 
2511     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2512     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2513     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2514     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2515     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2516     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2517   }
2518 
2519   void generate_math_stubs() { Unimplemented(); }
2520 
2521   // Arguments:
2522   //
2523   // Inputs:
2524   //   c_rarg0   - source byte array address
2525   //   c_rarg1   - destination byte array address
2526   //   c_rarg2   - K (key) in little endian int array
2527   //
2528   address generate_aescrypt_encryptBlock() {
2529     __ align(CodeEntryAlignment);
2530     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2531 
2532     Label L_doLast;
2533 
2534     const Register from        = c_rarg0;  // source array address
2535     const Register to          = c_rarg1;  // destination array address
2536     const Register key         = c_rarg2;  // key array address
2537     const Register keylen      = rscratch1;
2538 
2539     address start = __ pc();
2540     __ enter();
2541 
2542     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
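         // keylen is the expanded key length in ints: 44, 52 or 60 for
         // AES-128, AES-192 and AES-256 respectively.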
2543 
2544     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2545 
2546     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2547     __ rev32(v1, __ T16B, v1);
2548     __ rev32(v2, __ T16B, v2);
2549     __ rev32(v3, __ T16B, v3);
2550     __ rev32(v4, __ T16B, v4);
2551     __ aese(v0, v1);
2552     __ aesmc(v0, v0);
2553     __ aese(v0, v2);
2554     __ aesmc(v0, v0);
2555     __ aese(v0, v3);
2556     __ aesmc(v0, v0);
2557     __ aese(v0, v4);
2558     __ aesmc(v0, v0);
2559 
2560     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2561     __ rev32(v1, __ T16B, v1);
2562     __ rev32(v2, __ T16B, v2);
2563     __ rev32(v3, __ T16B, v3);
2564     __ rev32(v4, __ T16B, v4);
2565     __ aese(v0, v1);
2566     __ aesmc(v0, v0);
2567     __ aese(v0, v2);
2568     __ aesmc(v0, v0);
2569     __ aese(v0, v3);
2570     __ aesmc(v0, v0);
2571     __ aese(v0, v4);
2572     __ aesmc(v0, v0);
2573 
2574     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2575     __ rev32(v1, __ T16B, v1);
2576     __ rev32(v2, __ T16B, v2);
2577 
2578     __ cmpw(keylen, 44);
2579     __ br(Assembler::EQ, L_doLast);
2580 
2581     __ aese(v0, v1);
2582     __ aesmc(v0, v0);
2583     __ aese(v0, v2);
2584     __ aesmc(v0, v0);
2585 
2586     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2587     __ rev32(v1, __ T16B, v1);
2588     __ rev32(v2, __ T16B, v2);
2589 
2590     __ cmpw(keylen, 52);
2591     __ br(Assembler::EQ, L_doLast);
2592 
2593     __ aese(v0, v1);
2594     __ aesmc(v0, v0);
2595     __ aese(v0, v2);
2596     __ aesmc(v0, v0);
2597 
2598     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2599     __ rev32(v1, __ T16B, v1);
2600     __ rev32(v2, __ T16B, v2);
2601 
2602     __ BIND(L_doLast);
2603 
2604     __ aese(v0, v1);
2605     __ aesmc(v0, v0);
2606     __ aese(v0, v2);
2607 
2608     __ ld1(v1, __ T16B, key);
2609     __ rev32(v1, __ T16B, v1);
2610     __ eor(v0, __ T16B, v0, v1);
2611 
2612     __ st1(v0, __ T16B, to);
2613 
2614     __ mov(r0, 0);
2615 
2616     __ leave();
2617     __ ret(lr);
2618 
2619     return start;
2620   }
2621 
2622   // Arguments:
2623   //
2624   // Inputs:
2625   //   c_rarg0   - source byte array address
2626   //   c_rarg1   - destination byte array address
2627   //   c_rarg2   - K (key) in little endian int array
2628   //
2629   address generate_aescrypt_decryptBlock() {
2630     assert(UseAES, "need AES cryptographic extension support");
2631     __ align(CodeEntryAlignment);
2632     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2633     Label L_doLast;
2634 
2635     const Register from        = c_rarg0;  // source array address
2636     const Register to          = c_rarg1;  // destination array address
2637     const Register key         = c_rarg2;  // key array address
2638     const Register keylen      = rscratch1;
2639 
2640     address start = __ pc();
2641     __ enter(); // required for proper stackwalking of RuntimeStub frame
2642 
2643     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2644 
2645     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2646 
2647     __ ld1(v5, __ T16B, __ post(key, 16));
2648     __ rev32(v5, __ T16B, v5);
2649 
2650     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2651     __ rev32(v1, __ T16B, v1);
2652     __ rev32(v2, __ T16B, v2);
2653     __ rev32(v3, __ T16B, v3);
2654     __ rev32(v4, __ T16B, v4);
2655     __ aesd(v0, v1);
2656     __ aesimc(v0, v0);
2657     __ aesd(v0, v2);
2658     __ aesimc(v0, v0);
2659     __ aesd(v0, v3);
2660     __ aesimc(v0, v0);
2661     __ aesd(v0, v4);
2662     __ aesimc(v0, v0);
2663 
2664     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2665     __ rev32(v1, __ T16B, v1);
2666     __ rev32(v2, __ T16B, v2);
2667     __ rev32(v3, __ T16B, v3);
2668     __ rev32(v4, __ T16B, v4);
2669     __ aesd(v0, v1);
2670     __ aesimc(v0, v0);
2671     __ aesd(v0, v2);
2672     __ aesimc(v0, v0);
2673     __ aesd(v0, v3);
2674     __ aesimc(v0, v0);
2675     __ aesd(v0, v4);
2676     __ aesimc(v0, v0);
2677 
2678     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2679     __ rev32(v1, __ T16B, v1);
2680     __ rev32(v2, __ T16B, v2);
2681 
2682     __ cmpw(keylen, 44);
2683     __ br(Assembler::EQ, L_doLast);
2684 
2685     __ aesd(v0, v1);
2686     __ aesimc(v0, v0);
2687     __ aesd(v0, v2);
2688     __ aesimc(v0, v0);
2689 
2690     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2691     __ rev32(v1, __ T16B, v1);
2692     __ rev32(v2, __ T16B, v2);
2693 
2694     __ cmpw(keylen, 52);
2695     __ br(Assembler::EQ, L_doLast);
2696 
2697     __ aesd(v0, v1);
2698     __ aesimc(v0, v0);
2699     __ aesd(v0, v2);
2700     __ aesimc(v0, v0);
2701 
2702     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2703     __ rev32(v1, __ T16B, v1);
2704     __ rev32(v2, __ T16B, v2);
2705 
2706     __ BIND(L_doLast);
2707 
2708     __ aesd(v0, v1);
2709     __ aesimc(v0, v0);
2710     __ aesd(v0, v2);
2711 
2712     __ eor(v0, __ T16B, v0, v5);
2713 
2714     __ st1(v0, __ T16B, to);
2715 
2716     __ mov(r0, 0);
2717 
2718     __ leave();
2719     __ ret(lr);
2720 
2721     return start;
2722   }
2723 
2724   // Arguments:
2725   //
2726   // Inputs:
2727   //   c_rarg0   - source byte array address
2728   //   c_rarg1   - destination byte array address
2729   //   c_rarg2   - K (key) in little endian int array
2730   //   c_rarg3   - r vector byte array address
2731   //   c_rarg4   - input length
2732   //
2733   // Output:
2734   //   r0        - input length
2735   //
2736   address generate_cipherBlockChaining_encryptAESCrypt() {
2737     assert(UseAES, "need AES cryptographic extension support");
2738     __ align(CodeEntryAlignment);
2739     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2740 
2741     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2742 
2743     const Register from        = c_rarg0;  // source array address
2744     const Register to          = c_rarg1;  // destination array address
2745     const Register key         = c_rarg2;  // key array address
2746     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
2747                                            // and left with the results of the last encryption block
2748     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2749     const Register keylen      = rscratch1;
2750 
2751     address start = __ pc();
2752 
2753       __ enter();
2754 
2755       __ movw(rscratch2, len_reg);
2756 
2757       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2758 
2759       __ ld1(v0, __ T16B, rvec);
2760 
2761       __ cmpw(keylen, 52);
2762       __ br(Assembler::CC, L_loadkeys_44);
2763       __ br(Assembler::EQ, L_loadkeys_52);
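           // keylen < 52  -> AES-128: jump to the 44-int schedule;
           // keylen == 52 -> AES-192: skip the first two round keys;
           // otherwise    -> AES-256: fall through and load all round keys.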
2764 
2765       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2766       __ rev32(v17, __ T16B, v17);
2767       __ rev32(v18, __ T16B, v18);
2768     __ BIND(L_loadkeys_52);
2769       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2770       __ rev32(v19, __ T16B, v19);
2771       __ rev32(v20, __ T16B, v20);
2772     __ BIND(L_loadkeys_44);
2773       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2774       __ rev32(v21, __ T16B, v21);
2775       __ rev32(v22, __ T16B, v22);
2776       __ rev32(v23, __ T16B, v23);
2777       __ rev32(v24, __ T16B, v24);
2778       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2779       __ rev32(v25, __ T16B, v25);
2780       __ rev32(v26, __ T16B, v26);
2781       __ rev32(v27, __ T16B, v27);
2782       __ rev32(v28, __ T16B, v28);
2783       __ ld1(v29, v30, v31, __ T16B, key);
2784       __ rev32(v29, __ T16B, v29);
2785       __ rev32(v30, __ T16B, v30);
2786       __ rev32(v31, __ T16B, v31);
2787 
2788     __ BIND(L_aes_loop);
2789       __ ld1(v1, __ T16B, __ post(from, 16));
2790       __ eor(v0, __ T16B, v0, v1);
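           // CBC chaining: XOR the plaintext block with the previous
           // ciphertext block (the IV on the first iteration) before encrypting.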
2791 
2792       __ br(Assembler::CC, L_rounds_44);
2793       __ br(Assembler::EQ, L_rounds_52);
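           // The flags set by cmpw(keylen, 52) above are still live here:
           // nothing in this loop writes the condition flags, so each
           // iteration re-dispatches on the key length without another compare.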
2794 
2795       __ aese(v0, v17); __ aesmc(v0, v0);
2796       __ aese(v0, v18); __ aesmc(v0, v0);
2797     __ BIND(L_rounds_52);
2798       __ aese(v0, v19); __ aesmc(v0, v0);
2799       __ aese(v0, v20); __ aesmc(v0, v0);
2800     __ BIND(L_rounds_44);
2801       __ aese(v0, v21); __ aesmc(v0, v0);
2802       __ aese(v0, v22); __ aesmc(v0, v0);
2803       __ aese(v0, v23); __ aesmc(v0, v0);
2804       __ aese(v0, v24); __ aesmc(v0, v0);
2805       __ aese(v0, v25); __ aesmc(v0, v0);
2806       __ aese(v0, v26); __ aesmc(v0, v0);
2807       __ aese(v0, v27); __ aesmc(v0, v0);
2808       __ aese(v0, v28); __ aesmc(v0, v0);
2809       __ aese(v0, v29); __ aesmc(v0, v0);
2810       __ aese(v0, v30);
2811       __ eor(v0, __ T16B, v0, v31);
2812 
2813       __ st1(v0, __ T16B, __ post(to, 16));
2814 
2815       __ subw(len_reg, len_reg, 16);
2816       __ cbnzw(len_reg, L_aes_loop);
2817 
2818       __ st1(v0, __ T16B, rvec);
2819 
2820       __ mov(r0, rscratch2);
2821 
2822       __ leave();
2823       __ ret(lr);
2824 
2825       return start;
2826   }
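
       // For reference, the CBC chaining generated above corresponds roughly to the
       // following C sketch (illustrative only; aes_block_encrypt is a made-up name
       // standing in for the aese/aesmc round sequence driven by the expanded key):
       //
       //   // rvec holds the IV on entry and the last ciphertext block on exit.
       //   void cbc_encrypt_ref(const unsigned char* from, unsigned char* to, size_t len,
       //                        unsigned char rvec[16] /*, expanded key ... */) {
       //     unsigned char chain[16];
       //     memcpy(chain, rvec, 16);
       //     for (size_t off = 0; off < len; off += 16) {
       //       for (int i = 0; i < 16; i++)
       //         chain[i] ^= from[off + i];   // XOR plaintext into the previous block
       //       aes_block_encrypt(chain);      // encrypt in place with the round keys
       //       memcpy(to + off, chain, 16);   // ciphertext becomes the new chain value
       //     }
       //     memcpy(rvec, chain, 16);         // leave the last ciphertext block in rvec
       //   }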
2827 
2828   // Arguments:
2829   //
2830   // Inputs:
2831   //   c_rarg0   - source byte array address
2832   //   c_rarg1   - destination byte array address
2833   //   c_rarg2   - K (key) in little endian int array
2834   //   c_rarg3   - r vector byte array address
2835   //   c_rarg4   - input length
2836   //
2837   // Output:
2838   //   r0        - input length
2839   //
2840   address generate_cipherBlockChaining_decryptAESCrypt() {
2841     assert(UseAES, "need AES instruction support");
2842     __ align(CodeEntryAlignment);
2843     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2844 
2845     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2846 
2847     const Register from        = c_rarg0;  // source array address
2848     const Register to          = c_rarg1;  // destination array address
2849     const Register key         = c_rarg2;  // key array address
2850     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector address
2851                                            // and left holding the last input (ciphertext) block on exit
2852     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2853     const Register keylen      = rscratch1;
2854 
2855     address start = __ pc();
2856 
2857       __ enter();
2858 
2859       __ movw(rscratch2, len_reg);
2860 
2861       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2862 
2863       __ ld1(v2, __ T16B, rvec);
2864 
2865       __ ld1(v31, __ T16B, __ post(key, 16));
2866       __ rev32(v31, __ T16B, v31);
2867 
2868       __ cmpw(keylen, 52);
2869       __ br(Assembler::CC, L_loadkeys_44);
2870       __ br(Assembler::EQ, L_loadkeys_52);
2871 
2872       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2873       __ rev32(v17, __ T16B, v17);
2874       __ rev32(v18, __ T16B, v18);
2875     __ BIND(L_loadkeys_52);
2876       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2877       __ rev32(v19, __ T16B, v19);
2878       __ rev32(v20, __ T16B, v20);
2879     __ BIND(L_loadkeys_44);
2880       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2881       __ rev32(v21, __ T16B, v21);
2882       __ rev32(v22, __ T16B, v22);
2883       __ rev32(v23, __ T16B, v23);
2884       __ rev32(v24, __ T16B, v24);
2885       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2886       __ rev32(v25, __ T16B, v25);
2887       __ rev32(v26, __ T16B, v26);
2888       __ rev32(v27, __ T16B, v27);
2889       __ rev32(v28, __ T16B, v28);
2890       __ ld1(v29, v30, __ T16B, key);
2891       __ rev32(v29, __ T16B, v29);
2892       __ rev32(v30, __ T16B, v30);
2893 
2894     __ BIND(L_aes_loop);
2895       __ ld1(v0, __ T16B, __ post(from, 16));
2896       __ orr(v1, __ T16B, v0, v0);
2897 
2898       __ br(Assembler::CC, L_rounds_44);
2899       __ br(Assembler::EQ, L_rounds_52);
2900 
2901       __ aesd(v0, v17); __ aesimc(v0, v0);
2902       __ aesd(v0, v18); __ aesimc(v0, v0);
2903     __ BIND(L_rounds_52);
2904       __ aesd(v0, v19); __ aesimc(v0, v0);
2905       __ aesd(v0, v20); __ aesimc(v0, v0);
2906     __ BIND(L_rounds_44);
2907       __ aesd(v0, v21); __ aesimc(v0, v0);
2908       __ aesd(v0, v22); __ aesimc(v0, v0);
2909       __ aesd(v0, v23); __ aesimc(v0, v0);
2910       __ aesd(v0, v24); __ aesimc(v0, v0);
2911       __ aesd(v0, v25); __ aesimc(v0, v0);
2912       __ aesd(v0, v26); __ aesimc(v0, v0);
2913       __ aesd(v0, v27); __ aesimc(v0, v0);
2914       __ aesd(v0, v28); __ aesimc(v0, v0);
2915       __ aesd(v0, v29); __ aesimc(v0, v0);
2916       __ aesd(v0, v30);
2917       __ eor(v0, __ T16B, v0, v31);
2918       __ eor(v0, __ T16B, v0, v2);
2919 
2920       __ st1(v0, __ T16B, __ post(to, 16));
2921       __ orr(v2, __ T16B, v1, v1);
2922 
2923       __ subw(len_reg, len_reg, 16);
2924       __ cbnzw(len_reg, L_aes_loop);
2925 
2926       __ st1(v2, __ T16B, rvec);
2927 
2928       __ mov(r0, rscratch2);
2929 
2930       __ leave();
2931       __ ret(lr);
2932 
2933     return start;
2934   }
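
       // The decryption loop above follows the usual CBC recurrence: each ciphertext
       // block is saved (v1/v2) before being decrypted, the decrypted block is XORed
       // with the previous ciphertext (the IV for the first block), and the saved
       // ciphertext becomes the chaining value for the next iteration. A hedged C
       // sketch, with aes_block_decrypt as a made-up stand-in for the aesd/aesimc rounds:
       //
       //   void cbc_decrypt_ref(const unsigned char* from, unsigned char* to, size_t len,
       //                        unsigned char rvec[16] /*, expanded key ... */) {
       //     unsigned char chain[16], saved[16], blk[16];
       //     memcpy(chain, rvec, 16);
       //     for (size_t off = 0; off < len; off += 16) {
       //       memcpy(saved, from + off, 16);      // keep ciphertext for the next block
       //       memcpy(blk, saved, 16);
       //       aes_block_decrypt(blk);
       //       for (int i = 0; i < 16; i++)
       //         to[off + i] = blk[i] ^ chain[i];  // undo the chaining XOR
       //       memcpy(chain, saved, 16);
       //     }
       //     memcpy(rvec, chain, 16);              // last ciphertext block becomes the new rvec
       //   }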
2935 
2936   // Arguments:
2937   //
2938   // Inputs:
2939   //   c_rarg0   - byte[]  source+offset
2940   //   c_rarg1   - int[]   SHA.state
2941   //   c_rarg2   - int     offset
2942   //   c_rarg3   - int     limit
2943   //
2944   address generate_sha1_implCompress(bool multi_block, const char *name) {
2945     __ align(CodeEntryAlignment);
2946     StubCodeMark mark(this, "StubRoutines", name);
2947     address start = __ pc();
2948 
2949     Register buf   = c_rarg0;
2950     Register state = c_rarg1;
2951     Register ofs   = c_rarg2;
2952     Register limit = c_rarg3;
2953 
2954     Label keys;
2955     Label sha1_loop;
2956 
2957     // load the keys into v0..v3
2958     __ adr(rscratch1, keys);
2959     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
2960     // load 5 words state into v6, v7
2961     __ ldrq(v6, Address(state, 0));
2962     __ ldrs(v7, Address(state, 16));
2963 
2964 
2965     __ BIND(sha1_loop);
2966     // load 64 bytes of data into v16..v19
2967     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
2968     __ rev32(v16, __ T16B, v16);
2969     __ rev32(v17, __ T16B, v17);
2970     __ rev32(v18, __ T16B, v18);
2971     __ rev32(v19, __ T16B, v19);
2972 
2973     // do the sha1
2974     __ addv(v4, __ T4S, v16, v0);
2975     __ orr(v20, __ T16B, v6, v6);
2976 
2977     FloatRegister d0 = v16;
2978     FloatRegister d1 = v17;
2979     FloatRegister d2 = v18;
2980     FloatRegister d3 = v19;
2981 
2982     for (int round = 0; round < 20; round++) {
2983       FloatRegister tmp1 = (round & 1) ? v4 : v5;
2984       FloatRegister tmp2 = (round & 1) ? v21 : v22;
2985       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
2986       FloatRegister tmp4 = (round & 1) ? v5 : v4;
2987       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
2988 
2989       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
2990       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
2991       __ sha1h(tmp2, __ T4S, v20);
2992       if (round < 5)
2993         __ sha1c(v20, __ T4S, tmp3, tmp4);
2994       else if (round < 10 || round >= 15)
2995         __ sha1p(v20, __ T4S, tmp3, tmp4);
2996       else
2997         __ sha1m(v20, __ T4S, tmp3, tmp4);
2998       if (round < 16) __ sha1su1(d0, __ T4S, d3);
2999 
3000       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3001     }
3002 
3003     __ addv(v7, __ T2S, v7, v21);
3004     __ addv(v6, __ T4S, v6, v20);
3005 
3006     if (multi_block) {
3007       __ add(ofs, ofs, 64);
3008       __ cmp(ofs, limit);
3009       __ br(Assembler::LE, sha1_loop);
3010       __ mov(c_rarg0, ofs); // return ofs
3011     }
3012 
3013     __ strq(v6, Address(state, 0));
3014     __ strs(v7, Address(state, 16));
3015 
3016     __ ret(lr);
3017 
3018     __ bind(keys);
3019     __ emit_int32(0x5a827999);
3020     __ emit_int32(0x6ed9eba1);
3021     __ emit_int32(0x8f1bbcdc);
3022     __ emit_int32(0xca62c1d6);
3023 
3024     return start;
3025   }
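
       // The 20 iterations of the round loop above cover the 80 SHA-1 rounds four at a
       // time; the choice between sha1c/sha1p/sha1m mirrors the standard per-round
       // function selection from FIPS 180-4 (a minimal scalar sketch, for reference only):
       //
       //   static uint32_t sha1_f(int t, uint32_t b, uint32_t c, uint32_t d) {
       //     if (t < 20) return (b & c) | (~b & d);           // Ch,     K = 0x5a827999
       //     if (t < 40) return b ^ c ^ d;                    // Parity, K = 0x6ed9eba1
       //     if (t < 60) return (b & c) | (b & d) | (c & d);  // Maj,    K = 0x8f1bbcdc
       //     return b ^ c ^ d;                                // Parity, K = 0xca62c1d6
       //   }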
3026 
3027 
3028   // Arguments:
3029   //
3030   // Inputs:
3031   //   c_rarg0   - byte[]  source+offset
3032   //   c_rarg1   - int[]   SHA.state
3033   //   c_rarg2   - int     offset
3034   //   c_rarg3   - int     limit
3035   //
3036   address generate_sha256_implCompress(bool multi_block, const char *name) {
3037     static const uint32_t round_consts[64] = {
3038       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3039       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3040       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3041       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3042       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3043       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3044       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3045       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3046       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3047       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3048       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3049       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3050       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3051       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3052       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3053       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3054     };
3055     __ align(CodeEntryAlignment);
3056     StubCodeMark mark(this, "StubRoutines", name);
3057     address start = __ pc();
3058 
3059     Register buf   = c_rarg0;
3060     Register state = c_rarg1;
3061     Register ofs   = c_rarg2;
3062     Register limit = c_rarg3;
3063 
3064     Label sha1_loop;
3065 
3066     __ stpd(v8, v9, __ pre(sp, -32));
3067     __ stpd(v10, v11, Address(sp, 16));
3068 
3069 // dga == v0
3070 // dgb == v1
3071 // dg0 == v2
3072 // dg1 == v3
3073 // dg2 == v4
3074 // t0 == v6
3075 // t1 == v7
3076 
3077     // load 16 keys to v16..v31
3078     __ lea(rscratch1, ExternalAddress((address)round_consts));
3079     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3080     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3081     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3082     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3083 
3084     // load 8 words (256 bits) state
3085     __ ldpq(v0, v1, state);
3086 
3087     __ BIND(sha1_loop);
3088     // load 64 bytes of data into v8..v11
3089     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3090     __ rev32(v8, __ T16B, v8);
3091     __ rev32(v9, __ T16B, v9);
3092     __ rev32(v10, __ T16B, v10);
3093     __ rev32(v11, __ T16B, v11);
3094 
3095     __ addv(v6, __ T4S, v8, v16);
3096     __ orr(v2, __ T16B, v0, v0);
3097     __ orr(v3, __ T16B, v1, v1);
3098 
3099     FloatRegister d0 = v8;
3100     FloatRegister d1 = v9;
3101     FloatRegister d2 = v10;
3102     FloatRegister d3 = v11;
3103 
3104 
3105     for (int round = 0; round < 16; round++) {
3106       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3107       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3108       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3109       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3110 
3111       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3112        __ orr(v4, __ T16B, v2, v2);
3113       if (round < 15)
3114         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3115       __ sha256h(v2, __ T4S, v3, tmp2);
3116       __ sha256h2(v3, __ T4S, v4, tmp2);
3117       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3118 
3119       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3120     }
3121 
3122     __ addv(v0, __ T4S, v0, v2);
3123     __ addv(v1, __ T4S, v1, v3);
3124 
3125     if (multi_block) {
3126       __ add(ofs, ofs, 64);
3127       __ cmp(ofs, limit);
3128       __ br(Assembler::LE, sha1_loop);
3129       __ mov(c_rarg0, ofs); // return ofs
3130     }
3131 
3132     __ ldpd(v10, v11, Address(sp, 16));
3133     __ ldpd(v8, v9, __ post(sp, 32));
3134 
3135     __ stpq(v0, v1, state);
3136 
3137     __ ret(lr);
3138 
3139     return start;
3140   }
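
       // For reference, sha256su0/sha256su1 above implement the standard SHA-256
       // message schedule and sha256h/sha256h2 the compression step. A scalar sketch
       // of the schedule update that the su instructions correspond to (FIPS 180-4):
       //
       //   #define ROTR32(x, n) (((x) >> (n)) | ((x) << (32 - (n))))
       //   static uint32_t sigma0(uint32_t x) { return ROTR32(x,  7) ^ ROTR32(x, 18) ^ (x >>  3); }
       //   static uint32_t sigma1(uint32_t x) { return ROTR32(x, 17) ^ ROTR32(x, 19) ^ (x >> 10); }
       //   // W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16]   for t = 16..63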
3141 
3142 #ifndef BUILTIN_SIM
3143   // Safefetch stubs.
3144   void generate_safefetch(const char* name, int size, address* entry,
3145                           address* fault_pc, address* continuation_pc) {
3146     // safefetch signatures:
3147     //   int      SafeFetch32(int*      adr, int      errValue);
3148     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3149     //
3150     // arguments:
3151     //   c_rarg0 = adr
3152     //   c_rarg1 = errValue
3153     //
3154     // result:
3155     //   r0       = *adr or errValue
3156 
3157     StubCodeMark mark(this, "StubRoutines", name);
3158 
3159     // Entry point, pc or function descriptor.
3160     *entry = __ pc();
3161 
3162     // Load *adr into c_rarg1, may fault.
3163     *fault_pc = __ pc();
3164     switch (size) {
3165       case 4:
3166         // int32_t
3167         __ ldrw(c_rarg1, Address(c_rarg0, 0));
3168         break;
3169       case 8:
3170         // int64_t
3171         __ ldr(c_rarg1, Address(c_rarg0, 0));
3172         break;
3173       default:
3174         ShouldNotReachHere();
3175     }
3176 
3177     // return errValue or *adr
3178     *continuation_pc = __ pc();
3179     __ mov(r0, c_rarg1);
3180     __ ret(lr);
3181   }
3182 #endif
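
       // Typical use of the SafeFetch stubs generated above (a hedged sketch; callers go
       // through the SafeFetch32/SafeFetchN wrappers): probe an address that may be
       // unmapped and fall back to a sentinel instead of crashing.
       //
       //   int v = SafeFetch32((int*) maybe_bad_ptr, -1);
       //   // v is either the loaded word or -1 if the load faulted; callers must pick
       //   // an errValue whose ambiguity they can tolerate.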
3183 
3184   /**
3185    *  Arguments:
3186    *
3187    * Inputs:
3188    *   c_rarg0   - int crc
3189    *   c_rarg1   - byte* buf
3190    *   c_rarg2   - int length
3191    *
3192    * Output:
3193    *       r0    - int crc result
3194    */
3195   address generate_updateBytesCRC32() {
3196     assert(UseCRC32Intrinsics, "what are we doing here?");
3197 
3198     __ align(CodeEntryAlignment);
3199     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3200 
3201     address start = __ pc();
3202 
3203     const Register crc   = c_rarg0;  // crc
3204     const Register buf   = c_rarg1;  // source java byte array address
3205     const Register len   = c_rarg2;  // length
3206     const Register table0 = c_rarg3; // crc_table address
3207     const Register table1 = c_rarg4;
3208     const Register table2 = c_rarg5;
3209     const Register table3 = c_rarg6;
3210     const Register tmp3 = c_rarg7;
3211 
3212     BLOCK_COMMENT("Entry:");
3213     __ enter(); // required for proper stackwalking of RuntimeStub frame
3214 
3215     __ kernel_crc32(crc, buf, len,
3216               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3217 
3218     __ leave(); // required for proper stackwalking of RuntimeStub frame
3219     __ ret(lr);
3220 
3221     return start;
3222   }
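
       // For reference, kernel_crc32 above accelerates the usual zlib CRC-32 (reflected
       // polynomial 0xEDB88320). A minimal bitwise sketch of the same function
       // (illustrative only; the stub's table/instruction paths and inversion handling
       // differ in detail):
       //
       //   uint32_t crc32_ref(uint32_t crc, const unsigned char* buf, size_t len) {
       //     crc = ~crc;
       //     while (len--) {
       //       crc ^= *buf++;
       //       for (int k = 0; k < 8; k++)
       //         crc = (crc >> 1) ^ (0xEDB88320u & (0u - (crc & 1)));
       //     }
       //     return ~crc;
       //   }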
3223 
3224   /**
3225    *  Arguments:
3226    *
3227    * Inputs:
3228    *   c_rarg0   - int crc
3229    *   c_rarg1   - byte* buf
3230    *   c_rarg2   - int length
3231    *   c_rarg3   - int* table
3232    *
3233    * Output:
3234    *       r0   - int crc result
3235    */
3236   address generate_updateBytesCRC32C() {
3237     assert(UseCRC32CIntrinsics, "what are we doing here?");
3238 
3239     __ align(CodeEntryAlignment);
3240     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3241 
3242     address start = __ pc();
3243 
3244     const Register crc   = c_rarg0;  // crc
3245     const Register buf   = c_rarg1;  // source java byte array address
3246     const Register len   = c_rarg2;  // length
3247     const Register table0 = c_rarg3; // crc_table address
3248     const Register table1 = c_rarg4;
3249     const Register table2 = c_rarg5;
3250     const Register table3 = c_rarg6;
3251     const Register tmp3 = c_rarg7;
3252 
3253     BLOCK_COMMENT("Entry:");
3254     __ enter(); // required for proper stackwalking of RuntimeStub frame
3255 
3256     __ kernel_crc32c(crc, buf, len,
3257               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3258 
3259     __ leave(); // required for proper stackwalking of RuntimeStub frame
3260     __ ret(lr);
3261 
3262     return start;
3263   }
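
       // CRC-32C differs from the CRC-32 stub above only in the generator polynomial:
       // kernel_crc32c uses the Castagnoli polynomial (reflected form 0x82F63B78), so
       // the same bitwise reference loop applies with that constant substituted.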
3264 
3265   /***
3266    *  Arguments:
3267    *
3268    *  Inputs:
3269    *   c_rarg0   - int   adler
3270    *   c_rarg1   - byte* buff
3271    *   c_rarg2   - int   len
3272    *
3273    * Output:
3274    *   c_rarg0   - int adler result
3275    */
3276   address generate_updateBytesAdler32() {
3277     __ align(CodeEntryAlignment);
3278     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3279     address start = __ pc();
3280 
3281     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3282 
3283     // Aliases
3284     Register adler  = c_rarg0;
3285     Register s1     = c_rarg0;
3286     Register s2     = c_rarg3;
3287     Register buff   = c_rarg1;
3288     Register len    = c_rarg2;
3289     Register nmax  = r4;
3290     Register base = r5;
3291     Register count = r6;
3292     Register temp0 = rscratch1;
3293     Register temp1 = rscratch2;
3294     Register temp2 = r7;
3295 
3296     // Max number of bytes we can process before having to take the mod
3297     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3298     unsigned long BASE = 0xfff1;
3299     unsigned long NMAX = 0x15B0;
3300 
3301     __ mov(base, BASE);
3302     __ mov(nmax, NMAX);
3303 
3304     // s1 is initialized to the lower 16 bits of adler
3305     // s2 is initialized to the upper 16 bits of adler
3306     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
3307     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
3308 
3309     // The pipelined loop needs at least 16 elements for one iteration.
3310     // It would check this itself, but it is more efficient to skip straight to the cleanup loop.
3311     __ cmp(len, 16);
3312     __ br(Assembler::HS, L_nmax);
3313     __ cbz(len, L_combine);
3314 
3315     __ bind(L_simple_by1_loop);
3316     __ ldrb(temp0, Address(__ post(buff, 1)));
3317     __ add(s1, s1, temp0);
3318     __ add(s2, s2, s1);
3319     __ subs(len, len, 1);
3320     __ br(Assembler::HI, L_simple_by1_loop);
3321 
3322     // s1 = s1 % BASE
3323     __ subs(temp0, s1, base);
3324     __ csel(s1, temp0, s1, Assembler::HS);
3325 
3326     // s2 = s2 % BASE
3327     __ lsr(temp0, s2, 16);
3328     __ lsl(temp1, temp0, 4);
3329     __ sub(temp1, temp1, temp0);
3330     __ add(s2, temp1, s2, ext::uxth);
3331 
3332     __ subs(temp0, s2, base);
3333     __ csel(s2, temp0, s2, Assembler::HS);
3334 
3335     __ b(L_combine);
3336 
3337     __ bind(L_nmax);
3338     __ subs(len, len, nmax);
3339     __ sub(count, nmax, 16);
3340     __ br(Assembler::LO, L_by16);
3341 
3342     __ bind(L_nmax_loop);
3343 
3344     __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3345 
3346     __ add(s1, s1, temp0, ext::uxtb);
3347     __ ubfx(temp2, temp0, 8, 8);
3348     __ add(s2, s2, s1);
3349     __ add(s1, s1, temp2);
3350     __ ubfx(temp2, temp0, 16, 8);
3351     __ add(s2, s2, s1);
3352     __ add(s1, s1, temp2);
3353     __ ubfx(temp2, temp0, 24, 8);
3354     __ add(s2, s2, s1);
3355     __ add(s1, s1, temp2);
3356     __ ubfx(temp2, temp0, 32, 8);
3357     __ add(s2, s2, s1);
3358     __ add(s1, s1, temp2);
3359     __ ubfx(temp2, temp0, 40, 8);
3360     __ add(s2, s2, s1);
3361     __ add(s1, s1, temp2);
3362     __ ubfx(temp2, temp0, 48, 8);
3363     __ add(s2, s2, s1);
3364     __ add(s1, s1, temp2);
3365     __ add(s2, s2, s1);
3366     __ add(s1, s1, temp0, Assembler::LSR, 56);
3367     __ add(s2, s2, s1);
3368 
3369     __ add(s1, s1, temp1, ext::uxtb);
3370     __ ubfx(temp2, temp1, 8, 8);
3371     __ add(s2, s2, s1);
3372     __ add(s1, s1, temp2);
3373     __ ubfx(temp2, temp1, 16, 8);
3374     __ add(s2, s2, s1);
3375     __ add(s1, s1, temp2);
3376     __ ubfx(temp2, temp1, 24, 8);
3377     __ add(s2, s2, s1);
3378     __ add(s1, s1, temp2);
3379     __ ubfx(temp2, temp1, 32, 8);
3380     __ add(s2, s2, s1);
3381     __ add(s1, s1, temp2);
3382     __ ubfx(temp2, temp1, 40, 8);
3383     __ add(s2, s2, s1);
3384     __ add(s1, s1, temp2);
3385     __ ubfx(temp2, temp1, 48, 8);
3386     __ add(s2, s2, s1);
3387     __ add(s1, s1, temp2);
3388     __ add(s2, s2, s1);
3389     __ add(s1, s1, temp1, Assembler::LSR, 56);
3390     __ add(s2, s2, s1);
3391 
3392     __ subs(count, count, 16);
3393     __ br(Assembler::HS, L_nmax_loop);
3394 
3395     // s1 = s1 % BASE
3396     __ lsr(temp0, s1, 16);
3397     __ lsl(temp1, temp0, 4);
3398     __ sub(temp1, temp1, temp0);
3399     __ add(temp1, temp1, s1, ext::uxth);
3400 
3401     __ lsr(temp0, temp1, 16);
3402     __ lsl(s1, temp0, 4);
3403     __ sub(s1, s1, temp0);
3404     __ add(s1, s1, temp1, ext::uxth);
3405 
3406     __ subs(temp0, s1, base);
3407     __ csel(s1, temp0, s1, Assembler::HS);
3408 
3409     // s2 = s2 % BASE
3410     __ lsr(temp0, s2, 16);
3411     __ lsl(temp1, temp0, 4);
3412     __ sub(temp1, temp1, temp0);
3413     __ add(temp1, temp1, s2, ext::uxth);
3414 
3415     __ lsr(temp0, temp1, 16);
3416     __ lsl(s2, temp0, 4);
3417     __ sub(s2, s2, temp0);
3418     __ add(s2, s2, temp1, ext::uxth);
3419 
3420     __ subs(temp0, s2, base);
3421     __ csel(s2, temp0, s2, Assembler::HS);
3422 
3423     __ subs(len, len, nmax);
3424     __ sub(count, nmax, 16);
3425     __ br(Assembler::HS, L_nmax_loop);
3426 
3427     __ bind(L_by16);
3428     __ adds(len, len, count);
3429     __ br(Assembler::LO, L_by1);
3430 
3431     __ bind(L_by16_loop);
3432 
3433     __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3434 
3435     __ add(s1, s1, temp0, ext::uxtb);
3436     __ ubfx(temp2, temp0, 8, 8);
3437     __ add(s2, s2, s1);
3438     __ add(s1, s1, temp2);
3439     __ ubfx(temp2, temp0, 16, 8);
3440     __ add(s2, s2, s1);
3441     __ add(s1, s1, temp2);
3442     __ ubfx(temp2, temp0, 24, 8);
3443     __ add(s2, s2, s1);
3444     __ add(s1, s1, temp2);
3445     __ ubfx(temp2, temp0, 32, 8);
3446     __ add(s2, s2, s1);
3447     __ add(s1, s1, temp2);
3448     __ ubfx(temp2, temp0, 40, 8);
3449     __ add(s2, s2, s1);
3450     __ add(s1, s1, temp2);
3451     __ ubfx(temp2, temp0, 48, 8);
3452     __ add(s2, s2, s1);
3453     __ add(s1, s1, temp2);
3454     __ add(s2, s2, s1);
3455     __ add(s1, s1, temp0, Assembler::LSR, 56);
3456     __ add(s2, s2, s1);
3457 
3458     __ add(s1, s1, temp1, ext::uxtb);
3459     __ ubfx(temp2, temp1, 8, 8);
3460     __ add(s2, s2, s1);
3461     __ add(s1, s1, temp2);
3462     __ ubfx(temp2, temp1, 16, 8);
3463     __ add(s2, s2, s1);
3464     __ add(s1, s1, temp2);
3465     __ ubfx(temp2, temp1, 24, 8);
3466     __ add(s2, s2, s1);
3467     __ add(s1, s1, temp2);
3468     __ ubfx(temp2, temp1, 32, 8);
3469     __ add(s2, s2, s1);
3470     __ add(s1, s1, temp2);
3471     __ ubfx(temp2, temp1, 40, 8);
3472     __ add(s2, s2, s1);
3473     __ add(s1, s1, temp2);
3474     __ ubfx(temp2, temp1, 48, 8);
3475     __ add(s2, s2, s1);
3476     __ add(s1, s1, temp2);
3477     __ add(s2, s2, s1);
3478     __ add(s1, s1, temp1, Assembler::LSR, 56);
3479     __ add(s2, s2, s1);
3480 
3481     __ subs(len, len, 16);
3482     __ br(Assembler::HS, L_by16_loop);
3483 
3484     __ bind(L_by1);
3485     __ adds(len, len, 15);
3486     __ br(Assembler::LO, L_do_mod);
3487 
3488     __ bind(L_by1_loop);
3489     __ ldrb(temp0, Address(__ post(buff, 1)));
3490     __ add(s1, temp0, s1);
3491     __ add(s2, s2, s1);
3492     __ subs(len, len, 1);
3493     __ br(Assembler::HS, L_by1_loop);
3494 
3495     __ bind(L_do_mod);
3496     // s1 = s1 % BASE
3497     __ lsr(temp0, s1, 16);
3498     __ lsl(temp1, temp0, 4);
3499     __ sub(temp1, temp1, temp0);
3500     __ add(temp1, temp1, s1, ext::uxth);
3501 
3502     __ lsr(temp0, temp1, 16);
3503     __ lsl(s1, temp0, 4);
3504     __ sub(s1, s1, temp0);
3505     __ add(s1, s1, temp1, ext::uxth);
3506 
3507     __ subs(temp0, s1, base);
3508     __ csel(s1, temp0, s1, Assembler::HS);
3509 
3510     // s2 = s2 % BASE
3511     __ lsr(temp0, s2, 16);
3512     __ lsl(temp1, temp0, 4);
3513     __ sub(temp1, temp1, temp0);
3514     __ add(temp1, temp1, s2, ext::uxth);
3515 
3516     __ lsr(temp0, temp1, 16);
3517     __ lsl(s2, temp0, 4);
3518     __ sub(s2, s2, temp0);
3519     __ add(s2, s2, temp1, ext::uxth);
3520 
3521     __ subs(temp0, s2, base);
3522     __ csel(s2, temp0, s2, Assembler::HS);
3523 
3524     // Combine lower bits and higher bits
3525     __ bind(L_combine);
3526     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3527 
3528     __ ret(lr);
3529 
3530     return start;
3531   }
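
       // A hedged scalar sketch of what the stub above computes: the stub defers the
       // modulo using NMAX and replaces the division by the shift/subtract trick, but
       // the result is the ordinary Adler-32.
       //
       //   uint32_t adler32_ref(uint32_t adler, const unsigned char* buf, size_t len) {
       //     uint32_t s1 = adler & 0xffff, s2 = (adler >> 16) & 0xffff;
       //     for (size_t i = 0; i < len; i++) {
       //       s1 = (s1 + buf[i]) % 65521;   // 65521 == BASE
       //       s2 = (s2 + s1)     % 65521;
       //     }
       //     return (s2 << 16) | s1;
       //   }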
3532 
3533   /**
3534    *  Arguments:
3535    *
3536    *  Input:
3537    *    c_rarg0   - x address
3538    *    c_rarg1   - x length
3539    *    c_rarg2   - y address
3540    *    c_rarg3   - y length
3541    *    c_rarg4   - z address
3542    *    c_rarg5   - z length
3543    */
3544   address generate_multiplyToLen() {
3545     __ align(CodeEntryAlignment);
3546     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3547 
3548     address start = __ pc();
3549     const Register x     = r0;
3550     const Register xlen  = r1;
3551     const Register y     = r2;
3552     const Register ylen  = r3;
3553     const Register z     = r4;
3554     const Register zlen  = r5;
3555 
3556     const Register tmp1  = r10;
3557     const Register tmp2  = r11;
3558     const Register tmp3  = r12;
3559     const Register tmp4  = r13;
3560     const Register tmp5  = r14;
3561     const Register tmp6  = r15;
3562     const Register tmp7  = r16;
3563 
3564     BLOCK_COMMENT("Entry:");
3565     __ enter(); // required for proper stackwalking of RuntimeStub frame
3566     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3567     __ leave(); // required for proper stackwalking of RuntimeStub frame
3568     __ ret(lr);
3569 
3570     return start;
3571   }
3572 
3573   address generate_squareToLen() {
3574     // The squareToLen algorithm for sizes 1..127 described in the Java code runs
3575     // faster than multiply_to_len on some CPUs and slower on others, but
3576     // multiply_to_len gives slightly better results overall.
3577     __ align(CodeEntryAlignment);
3578     StubCodeMark mark(this, "StubRoutines", "squareToLen");
3579     address start = __ pc();
3580 
3581     const Register x     = r0;
3582     const Register xlen  = r1;
3583     const Register z     = r2;
3584     const Register zlen  = r3;
3585     const Register y     = r4; // == x
3586     const Register ylen  = r5; // == xlen
3587 
3588     const Register tmp1  = r10;
3589     const Register tmp2  = r11;
3590     const Register tmp3  = r12;
3591     const Register tmp4  = r13;
3592     const Register tmp5  = r14;
3593     const Register tmp6  = r15;
3594     const Register tmp7  = r16;
3595 
3596     RegSet spilled_regs = RegSet::of(y, ylen);
3597     BLOCK_COMMENT("Entry:");
3598     __ enter();
3599     __ push(spilled_regs, sp);
3600     __ mov(y, x);
3601     __ mov(ylen, xlen);
3602     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3603     __ pop(spilled_regs, sp);
3604     __ leave();
3605     __ ret(lr);
3606     return start;
3607   }
3608 
3609   address generate_mulAdd() {
3610     __ align(CodeEntryAlignment);
3611     StubCodeMark mark(this, "StubRoutines", "mulAdd");
3612 
3613     address start = __ pc();
3614 
3615     const Register out     = r0;
3616     const Register in      = r1;
3617     const Register offset  = r2;
3618     const Register len     = r3;
3619     const Register k       = r4;
3620 
3621     BLOCK_COMMENT("Entry:");
3622     __ enter();
3623     __ mul_add(out, in, offset, len, k);
3624     __ leave();
3625     __ ret(lr);
3626 
3627     return start;
3628   }
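
       // The three stubs above back the BigInteger intrinsics. Conceptually (a hedged
       // sketch that ignores the exact limb order and in-place conventions used by the
       // VM), multiply_to_len is the schoolbook product of two magnitudes stored as
       // arrays of 32-bit limbs; squareToLen reuses it with y == x, and mulAdd is the
       // single-row "out += in * k with carry" primitive.
       //
       //   void multiply_ref(const uint32_t* x, int xlen,
       //                     const uint32_t* y, int ylen, uint32_t* z) {
       //     memset(z, 0, (size_t)(xlen + ylen) * sizeof(uint32_t));
       //     for (int i = 0; i < xlen; i++) {
       //       uint64_t carry = 0;
       //       for (int j = 0; j < ylen; j++) {
       //         uint64_t t = (uint64_t)x[i] * y[j] + z[i + j] + carry;
       //         z[i + j] = (uint32_t)t;
       //         carry    = t >> 32;
       //       }
       //       z[i + ylen] = (uint32_t)carry;
       //     }
       //   }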
3629 
3630   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3631                       FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3632                       FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3633     // Karatsuba multiplication performs a 128*128 -> 256-bit
3634     // multiplication in three 128-bit multiplications and a few
3635     // additions.
3636     //
3637     // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3638     // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3639     //
3640     // Inputs:
3641     //
3642     // A0 in a.d[0]     (subkey)
3643     // A1 in a.d[1]
3644     // (A1+A0) in a1_xor_a0.d[0]
3645     //
3646     // B0 in b.d[0]     (state)
3647     // B1 in b.d[1]
3648 
3649     __ ext(tmp1, __ T16B, b, b, 0x08);
3650     __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
3651     __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
3652     __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
3653     __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3654 
3655     __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3656     __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3657     __ eor(tmp2, __ T16B, tmp2, tmp4);
3658     __ eor(tmp2, __ T16B, tmp2, tmp3);
3659 
3660     // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3661     __ ins(result_hi, __ D, tmp2, 0, 1);
3662     __ ins(result_lo, __ D, tmp2, 1, 0);
3663   }
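
       // In carry-less (GF(2)[z]) arithmetic "addition" is XOR, so the identity in the
       // comment above can be checked against a plain bitwise model of pmull (a hedged
       // sketch; clmul64 is a made-up reference name, not a VM helper):
       //
       //   // 64x64 -> 128-bit carry-less multiply, one bit of b at a time
       //   static void clmul64(uint64_t a, uint64_t b, uint64_t* lo, uint64_t* hi) {
       //     uint64_t l = 0, h = 0;
       //     for (int i = 0; i < 64; i++) {
       //       if ((b >> i) & 1) {
       //         l ^= a << i;
       //         if (i != 0) h ^= a >> (64 - i);
       //       }
       //     }
       //     *lo = l; *hi = h;
       //   }
       //
       //   // (A1:A0)*(B1:B0) then takes three clmul64 calls (C, D, E as above) and
       //   // XORs the middle term E^C^D into the two middle 64-bit words of C:D.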
3664 
3665   void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3666                     FloatRegister p, FloatRegister z, FloatRegister t1) {
3667     const FloatRegister t0 = result;
3668 
3669     // The GCM field polynomial f is z^128 + p(z), where p =
3670     // z^7+z^2+z+1.
3671     //
3672     //    z^128 === -p(z)  (mod (z^128 + p(z)))
3673     //
3674     // so, given that the product we're reducing is
3675     //    a == lo + hi * z^128
3676     // substituting,
3677     //      === lo - hi * p(z)  (mod (z^128 + p(z)))
3678     //
3679     // we reduce by multiplying hi by p(z) and subtracting the result
3680     // from (i.e. XORing it with) lo.  Because p has no nonzero high
3681     // bits we can do this with two 64-bit multiplications, lo*p and
3682     // hi*p.
3683 
3684     __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3685     __ ext(t1, __ T16B, t0, z, 8);
3686     __ eor(hi, __ T16B, hi, t1);
3687     __ ext(t1, __ T16B, z, t0, 8);
3688     __ eor(lo, __ T16B, lo, t1);
3689     __ pmull(t0, __ T1Q, hi, p, __ T1D);
3690     __ eor(result, __ T16B, lo, t0);
3691   }
3692 
3693   address generate_has_negatives(address &has_negatives_long) {
3694     StubCodeMark mark(this, "StubRoutines", "has_negatives");
3695     const int large_loop_size = 64;
3696     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
3697     int dcache_line = VM_Version::dcache_line_size();
3698 
3699     Register ary1 = r1, len = r2, result = r0;
3700 
3701     __ align(CodeEntryAlignment);
3702     address entry = __ pc();
3703 
3704     __ enter();
3705 
3706   Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
3707         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
3708 
3709   __ cmp(len, 15);
3710   __ br(Assembler::GT, LEN_OVER_15);
3711   // Execution only falls into this code when the pointer is near the end of a
3712   // memory page and we have to avoid reading past it into the next page.
3713   __ add(ary1, ary1, len);
3714   __ subs(len, len, 8);
3715   __ br(Assembler::GT, LEN_OVER_8);
3716   __ ldr(rscratch2, Address(ary1, -8));
3717   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
3718   __ lsrv(rscratch2, rscratch2, rscratch1);
3719   __ tst(rscratch2, UPPER_BIT_MASK);
3720   __ cset(result, Assembler::NE);
3721   __ leave();
3722   __ ret(lr);
3723   __ bind(LEN_OVER_8);
3724   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
3725   __ sub(len, len, 8); // no data dep., then sub can be executed while loading
3726   __ tst(rscratch2, UPPER_BIT_MASK);
3727   __ br(Assembler::NE, RET_TRUE_NO_POP);
3728   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
3729   __ lsrv(rscratch1, rscratch1, rscratch2);
3730   __ tst(rscratch1, UPPER_BIT_MASK);
3731   __ cset(result, Assembler::NE);
3732   __ leave();
3733   __ ret(lr);
3734 
3735   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
3736   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
3737 
3738   has_negatives_long = __ pc(); // 2nd entry point
3739 
3740   __ enter();
3741 
3742   __ bind(LEN_OVER_15);
3743     __ push(spilled_regs, sp);
3744     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
3745     __ cbz(rscratch2, ALIGNED);
3746     __ ldp(tmp6, tmp1, Address(ary1));
3747     __ mov(tmp5, 16);
3748     __ sub(rscratch1, tmp5, rscratch2); // number of bytes until the aligned address
3749     __ add(ary1, ary1, rscratch1);
3750     __ sub(len, len, rscratch1);
3751     __ orr(tmp6, tmp6, tmp1);
3752     __ tst(tmp6, UPPER_BIT_MASK);
3753     __ br(Assembler::NE, RET_TRUE);
3754 
3755   __ bind(ALIGNED);
3756     __ cmp(len, large_loop_size);
3757     __ br(Assembler::LT, CHECK_16);
3758     // Perform a 16-byte load as an early-return check in the pre-loop, to handle
3759     // the case where an initially aligned large array has negative values in its
3760     // starting bytes; otherwise LARGE_LOOP would do up to 4 reads instead of 1
3761     // (in the worst case), which is slower. Cases with negative bytes further
3762     // ahead are not affected much; in fact they get faster thanks to the early
3763     // loads and the fewer instructions and branches in LARGE_LOOP.
3764     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
3765     __ sub(len, len, 16);
3766     __ orr(tmp6, tmp6, tmp1);
3767     __ tst(tmp6, UPPER_BIT_MASK);
3768     __ br(Assembler::NE, RET_TRUE);
3769     __ cmp(len, large_loop_size);
3770     __ br(Assembler::LT, CHECK_16);
3771 
3772     if (SoftwarePrefetchHintDistance >= 0
3773         && SoftwarePrefetchHintDistance >= dcache_line) {
3774       // initial prefetch
3775       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
3776     }
3777   __ bind(LARGE_LOOP);
3778     if (SoftwarePrefetchHintDistance >= 0) {
3779       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
3780     }
3781     // Issue the load instructions first, since that can save a few CPU/memory
3782     // cycles. Also, instead of 4 triples of "orr(...); addr(...); cbnz(...)" (one
3783     // per ldp), it is better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...),
3784     // which saves instructions and has fewer branches. This approach gives up the
3785     // early return, though, so all 64 bytes are loaded and checked every time.
3786     __ ldp(tmp2, tmp3, Address(ary1));
3787     __ ldp(tmp4, tmp5, Address(ary1, 16));
3788     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
3789     __ ldp(tmp6, tmp1, Address(ary1, 48));
3790     __ add(ary1, ary1, large_loop_size);
3791     __ sub(len, len, large_loop_size);
3792     __ orr(tmp2, tmp2, tmp3);
3793     __ orr(tmp4, tmp4, tmp5);
3794     __ orr(rscratch1, rscratch1, rscratch2);
3795     __ orr(tmp6, tmp6, tmp1);
3796     __ orr(tmp2, tmp2, tmp4);
3797     __ orr(rscratch1, rscratch1, tmp6);
3798     __ orr(tmp2, tmp2, rscratch1);
3799     __ tst(tmp2, UPPER_BIT_MASK);
3800     __ br(Assembler::NE, RET_TRUE);
3801     __ cmp(len, large_loop_size);
3802     __ br(Assembler::GE, LARGE_LOOP);
3803 
3804   __ bind(CHECK_16); // small 16-byte load pre-loop
3805     __ cmp(len, 16);
3806     __ br(Assembler::LT, POST_LOOP16);
3807 
3808   __ bind(LOOP16); // small 16-byte load loop
3809     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
3810     __ sub(len, len, 16);
3811     __ orr(tmp2, tmp2, tmp3);
3812     __ tst(tmp2, UPPER_BIT_MASK);
3813     __ br(Assembler::NE, RET_TRUE);
3814     __ cmp(len, 16);
3815     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
3816 
3817   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
3818     __ cmp(len, 8);
3819     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
3820     __ ldr(tmp3, Address(__ post(ary1, 8)));
3821     __ sub(len, len, 8);
3822     __ tst(tmp3, UPPER_BIT_MASK);
3823     __ br(Assembler::NE, RET_TRUE);
3824 
3825   __ bind(POST_LOOP16_LOAD_TAIL);
3826     __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
3827     __ ldr(tmp1, Address(ary1));
3828     __ mov(tmp2, 64);
3829     __ sub(tmp4, tmp2, len, __ LSL, 3);
3830     __ lslv(tmp1, tmp1, tmp4);
3831     __ tst(tmp1, UPPER_BIT_MASK);
3832     __ br(Assembler::NE, RET_TRUE);
3833     // Fallthrough
3834 
3835   __ bind(RET_FALSE);
3836     __ pop(spilled_regs, sp);
3837     __ leave();
3838     __ mov(result, zr);
3839     __ ret(lr);
3840 
3841   __ bind(RET_TRUE);
3842     __ pop(spilled_regs, sp);
3843   __ bind(RET_TRUE_NO_POP);
3844     __ leave();
3845     __ mov(result, 1);
3846     __ ret(lr);
3847 
3848   __ bind(DONE);
3849     __ pop(spilled_regs, sp);
3850     __ leave();
3851     __ ret(lr);
3852     return entry;
3853   }
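
       // The core test in the stub above is a SWAR check: OR several 8-byte chunks
       // together and test against 0x8080808080808080, which is non-zero iff some byte
       // has its sign bit set. A hedged scalar sketch (the stub's tail handling is more
       // involved because it must never read past the array or the page):
       //
       //   bool has_negatives_ref(const signed char* a, size_t len) {
       //     size_t i = 0;
       //     for (; i + 8 <= len; i += 8) {
       //       uint64_t v;
       //       memcpy(&v, a + i, 8);
       //       if (v & 0x8080808080808080ull) return true;
       //     }
       //     for (; i < len; i++)
       //       if (a[i] < 0) return true;
       //     return false;
       //   }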
3854 
3855   void generate_large_array_equals_loop_nonsimd(int loopThreshold,
3856         bool usePrefetch, Label &NOT_EQUAL) {
3857     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3858         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3859         tmp7 = r12, tmp8 = r13;
3860     Label LOOP;
3861 
3862     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3863     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3864     __ bind(LOOP);
3865     if (usePrefetch) {
3866       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3867       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3868     }
3869     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3870     __ eor(tmp1, tmp1, tmp2);
3871     __ eor(tmp3, tmp3, tmp4);
3872     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3873     __ orr(tmp1, tmp1, tmp3);
3874     __ cbnz(tmp1, NOT_EQUAL);
3875     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3876     __ eor(tmp5, tmp5, tmp6);
3877     __ eor(tmp7, tmp7, tmp8);
3878     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3879     __ orr(tmp5, tmp5, tmp7);
3880     __ cbnz(tmp5, NOT_EQUAL);
3881     __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize)));
3882     __ eor(tmp1, tmp1, tmp2);
3883     __ eor(tmp3, tmp3, tmp4);
3884     __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize)));
3885     __ orr(tmp1, tmp1, tmp3);
3886     __ cbnz(tmp1, NOT_EQUAL);
3887     __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
3888     __ eor(tmp5, tmp5, tmp6);
3889     __ sub(cnt1, cnt1, 8 * wordSize);
3890     __ eor(tmp7, tmp7, tmp8);
3891     __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3892     // tmp6 is not used. MacroAssembler::subs is used here (rather than
3893     // cmp) because subs allows an unlimited range of immediate operands.
3894     __ subs(tmp6, cnt1, loopThreshold);
3895     __ orr(tmp5, tmp5, tmp7);
3896     __ cbnz(tmp5, NOT_EQUAL);
3897     __ br(__ GE, LOOP);
3898     // post-loop
3899     __ eor(tmp1, tmp1, tmp2);
3900     __ eor(tmp3, tmp3, tmp4);
3901     __ orr(tmp1, tmp1, tmp3);
3902     __ sub(cnt1, cnt1, 2 * wordSize);
3903     __ cbnz(tmp1, NOT_EQUAL);
3904   }
3905 
3906   void generate_large_array_equals_loop_simd(int loopThreshold,
3907         bool usePrefetch, Label &NOT_EQUAL) {
3908     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3909         tmp2 = rscratch2;
3910     Label LOOP;
3911 
3912     __ bind(LOOP);
3913     if (usePrefetch) {
3914       __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3915       __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3916     }
3917     __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
3918     __ sub(cnt1, cnt1, 8 * wordSize);
3919     __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
3920     __ subs(tmp1, cnt1, loopThreshold);
3921     __ eor(v0, __ T16B, v0, v4);
3922     __ eor(v1, __ T16B, v1, v5);
3923     __ eor(v2, __ T16B, v2, v6);
3924     __ eor(v3, __ T16B, v3, v7);
3925     __ orr(v0, __ T16B, v0, v1);
3926     __ orr(v1, __ T16B, v2, v3);
3927     __ orr(v0, __ T16B, v0, v1);
3928     __ umov(tmp1, v0, __ D, 0);
3929     __ umov(tmp2, v0, __ D, 1);
3930     __ orr(tmp1, tmp1, tmp2);
3931     __ cbnz(tmp1, NOT_EQUAL);
3932     __ br(__ GE, LOOP);
3933   }
3934 
3935   // a1 = r1 - array1 address
3936   // a2 = r2 - array2 address
3937   // result = r0 - return value. Already contains "false"
3938   // cnt1 = r10 - number of elements left to check, reduced by wordSize
3939   // r3-r5 are reserved temporary registers
3940   address generate_large_array_equals() {
3941     StubCodeMark mark(this, "StubRoutines", "large_array_equals");
3942     Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3943         tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3944         tmp7 = r12, tmp8 = r13;
3945     Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
3946         SMALL_LOOP, POST_LOOP;
3947     const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
3948     // calculate if at least 32 prefetched bytes are used
3949     int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
3950     int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
3951     RegSet spilled_regs = RegSet::range(tmp6, tmp8);
3952     assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
3953         tmp5, tmp6, tmp7, tmp8);
3954 
3955     __ align(CodeEntryAlignment);
3956     address entry = __ pc();
3957     __ enter();
3958     __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
3959     // also advance pointers to use post-increment instead of pre-increment
3960     __ add(a1, a1, wordSize);
3961     __ add(a2, a2, wordSize);
3962     if (AvoidUnalignedAccesses) {
3963       // Both implementations (SIMD and non-SIMD) use relatively wide load
3964       // instructions (ld1/ldp), which carry a large penalty (up to 2x execution
3965       // time) on some CPUs when the address is not at least 16-byte aligned.
3966       // Arrays are currently 8-byte aligned, so if needed we do an additional
3967       // 8-byte load for the 1st address to make it 16-byte aligned.
3968       Label ALIGNED16;
3969       __ tbz(a1, 3, ALIGNED16);
3970       __ ldr(tmp1, Address(__ post(a1, wordSize)));
3971       __ ldr(tmp2, Address(__ post(a2, wordSize)));
3972       __ sub(cnt1, cnt1, wordSize);
3973       __ eor(tmp1, tmp1, tmp2);
3974       __ cbnz(tmp1, NOT_EQUAL_NO_POP);
3975       __ bind(ALIGNED16);
3976     }
3977     if (UseSIMDForArrayEquals) {
3978       if (SoftwarePrefetchHintDistance >= 0) {
3979         __ subs(tmp1, cnt1, prefetchLoopThreshold);
3980         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3981         generate_large_array_equals_loop_simd(prefetchLoopThreshold,
3982             /* prfm = */ true, NOT_EQUAL);
3983         __ cmp(cnt1, nonPrefetchLoopThreshold);
3984         __ br(__ LT, TAIL);
3985       }
3986       __ bind(NO_PREFETCH_LARGE_LOOP);
3987       generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
3988           /* prfm = */ false, NOT_EQUAL);
3989     } else {
3990       __ push(spilled_regs, sp);
3991       if (SoftwarePrefetchHintDistance >= 0) {
3992         __ subs(tmp1, cnt1, prefetchLoopThreshold);
3993         __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3994         generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
3995             /* prfm = */ true, NOT_EQUAL);
3996         __ cmp(cnt1, nonPrefetchLoopThreshold);
3997         __ br(__ LT, TAIL);
3998       }
3999       __ bind(NO_PREFETCH_LARGE_LOOP);
4000       generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
4001           /* prfm = */ false, NOT_EQUAL);
4002     }
4003     __ bind(TAIL);
4004       __ cbz(cnt1, EQUAL);
4005       __ subs(cnt1, cnt1, wordSize);
4006       __ br(__ LE, POST_LOOP);
4007     __ bind(SMALL_LOOP);
4008       __ ldr(tmp1, Address(__ post(a1, wordSize)));
4009       __ ldr(tmp2, Address(__ post(a2, wordSize)));
4010       __ subs(cnt1, cnt1, wordSize);
4011       __ eor(tmp1, tmp1, tmp2);
4012       __ cbnz(tmp1, NOT_EQUAL);
4013       __ br(__ GT, SMALL_LOOP);
4014     __ bind(POST_LOOP);
4015       __ ldr(tmp1, Address(a1, cnt1));
4016       __ ldr(tmp2, Address(a2, cnt1));
4017       __ eor(tmp1, tmp1, tmp2);
4018       __ cbnz(tmp1, NOT_EQUAL);
4019     __ bind(EQUAL);
4020       __ mov(result, true);
4021     __ bind(NOT_EQUAL);
4022       if (!UseSIMDForArrayEquals) {
4023         __ pop(spilled_regs, sp);
4024       }
4025     __ bind(NOT_EQUAL_NO_POP);
4026     __ leave();
4027     __ ret(lr);
4028     return entry;
4029   }
4030 
4031   address generate_dsin_dcos(bool isCos) {
4032     __ align(CodeEntryAlignment);
4033     StubCodeMark mark(this, "StubRoutines", isCos ? "libmDcos" : "libmDsin");
4034     address start = __ pc();
4035     __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw,
4036         (address)StubRoutines::aarch64::_two_over_pi,
4037         (address)StubRoutines::aarch64::_pio2,
4038         (address)StubRoutines::aarch64::_dsin_coef,
4039         (address)StubRoutines::aarch64::_dcos_coef);
4040     return start;
4041   }
4042 
4043   address generate_dlog() {
4044     __ align(CodeEntryAlignment);
4045     StubCodeMark mark(this, "StubRoutines", "dlog");
4046     address entry = __ pc();
4047     FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4,
4048         vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19;
4049     Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4;
4050     __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3,
4051         tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5);
4052     return entry;
4053   }
4054 
4055   // code for comparing 16 bytes of strings with the same encoding
4056   void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) {
4057     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11;
4058     __ ldr(rscratch1, Address(__ post(str1, 8)));
4059     __ eor(rscratch2, tmp1, tmp2);
4060     __ ldr(cnt1, Address(__ post(str2, 8)));
4061     __ cbnz(rscratch2, DIFF1);
4062     __ ldr(tmp1, Address(__ post(str1, 8)));
4063     __ eor(rscratch2, rscratch1, cnt1);
4064     __ ldr(tmp2, Address(__ post(str2, 8)));
4065     __ cbnz(rscratch2, DIFF2);
4066   }
4067 
4068   // code for comparing 16 characters of strings where one is Latin1- and the other UTF-16-encoded
4069   void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1,
4070       Label &DIFF2) {
4071     Register cnt1 = r2, tmp1 = r10, tmp2 = r11, tmp3 = r12;
4072     FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2;
4073 
4074     __ ldrq(vtmp, Address(__ post(tmp2, 16)));
4075     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4076     __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
4077     // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3
4078 
4079     __ fmovd(tmpL, vtmp3);
4080     __ eor(rscratch2, tmp3, tmpL);
4081     __ cbnz(rscratch2, DIFF2);
4082 
4083     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4084     __ umov(tmpL, vtmp3, __ D, 1);
4085     __ eor(rscratch2, tmpU, tmpL);
4086     __ cbnz(rscratch2, DIFF1);
4087 
4088     __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
4089     __ ldr(tmpU, Address(__ post(cnt1, 8)));
4090     __ fmovd(tmpL, vtmp);
4091     __ eor(rscratch2, tmp3, tmpL);
4092     __ cbnz(rscratch2, DIFF2);
4093 
4094     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4095     __ umov(tmpL, vtmp, __ D, 1);
4096     __ eor(rscratch2, tmpU, tmpL);
4097     __ cbnz(rscratch2, DIFF1);
4098   }
4099 
4100   // r0  = result
4101   // r1  = str1
4102   // r2  = cnt1
4103   // r3  = str2
4104   // r4  = cnt2
4105   // r10 = tmp1
4106   // r11 = tmp2
4107   address generate_compare_long_string_different_encoding(bool isLU) {
4108     __ align(CodeEntryAlignment);
4109     StubCodeMark mark(this, "StubRoutines", isLU
4110         ? "compare_long_string_different_encoding LU"
4111         : "compare_long_string_different_encoding UL");
4112     address entry = __ pc();
4113     Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2,
4114         DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, SMALL_LOOP_ENTER,
4115         LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2;
4116     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4117         tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14;
4118     FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2;
4119     RegSet spilled_regs = RegSet::of(tmp3, tmp4);
4120 
4121     int prefetchLoopExitCondition = MAX(32, SoftwarePrefetchHintDistance/2);
4122 
4123     __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ);
4124     // cnt2 == number of characters left to compare
4125     // Check the already loaded first 4 symbols (vtmp and tmp2 (LU) / tmp1 (UL))
4126     __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4127     __ add(str1, str1, isLU ? wordSize/2 : wordSize);
4128     __ add(str2, str2, isLU ? wordSize : wordSize/2);
4129     __ fmovd(isLU ? tmp1 : tmp2, vtmp);
4130     __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case.
4131     __ add(str1, str1, cnt2, __ LSL, isLU ? 0 : 1);
4132     __ eor(rscratch2, tmp1, tmp2);
4133     __ add(str2, str2, cnt2, __ LSL, isLU ? 1 : 0);
4134     __ mov(rscratch1, tmp2);
4135     __ cbnz(rscratch2, CALCULATE_DIFFERENCE);
4136     Register strU = isLU ? str2 : str1,
4137              strL = isLU ? str1 : str2,
4138              tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison
4139              tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison
4140     __ push(spilled_regs, sp);
4141     __ sub(tmp2, strL, cnt2); // strL pointer to load from
4142     __ sub(cnt1, strU, cnt2, __ LSL, 1); // strU pointer to load from
4143 
4144     __ ldr(tmp3, Address(__ post(cnt1, 8)));
4145 
4146     if (SoftwarePrefetchHintDistance >= 0) {
4147       __ cmp(cnt2, prefetchLoopExitCondition);
4148       __ br(__ LT, SMALL_LOOP);
4149       __ bind(LARGE_LOOP_PREFETCH);
4150         __ prfm(Address(tmp2, SoftwarePrefetchHintDistance));
4151         __ mov(tmp4, 2);
4152         __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4153         __ bind(LARGE_LOOP_PREFETCH_REPEAT1);
4154           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4155           __ subs(tmp4, tmp4, 1);
4156           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1);
4157           __ prfm(Address(cnt1, SoftwarePrefetchHintDistance));
4158           __ mov(tmp4, 2);
4159         __ bind(LARGE_LOOP_PREFETCH_REPEAT2);
4160           compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4161           __ subs(tmp4, tmp4, 1);
4162           __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2);
4163           __ sub(cnt2, cnt2, 64);
4164           __ cmp(cnt2, prefetchLoopExitCondition);
4165           __ br(__ GE, LARGE_LOOP_PREFETCH);
4166     }
4167     __ cbz(cnt2, LOAD_LAST); // no characters left except last load
4168     __ subs(cnt2, cnt2, 16);
4169     __ br(__ LT, TAIL);
4170     __ b(SMALL_LOOP_ENTER);
4171     __ bind(SMALL_LOOP); // smaller loop
4172       __ subs(cnt2, cnt2, 16);
4173     __ bind(SMALL_LOOP_ENTER);
4174       compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2);
4175       __ br(__ GE, SMALL_LOOP);
4176       __ cbz(cnt2, LOAD_LAST);
4177     __ bind(TAIL); // 1..15 characters left
4178       __ cmp(cnt2, -8);
4179       __ br(__ GT, TAIL_LOAD_16);
4180       __ ldrd(vtmp, Address(tmp2));
4181       __ zip1(vtmp3, __ T8B, vtmp, vtmpZ);
4182 
4183       __ ldr(tmpU, Address(__ post(cnt1, 8)));
4184       __ fmovd(tmpL, vtmp3);
4185       __ eor(rscratch2, tmp3, tmpL);
4186       __ cbnz(rscratch2, DIFF2);
4187       __ umov(tmpL, vtmp3, __ D, 1);
4188       __ eor(rscratch2, tmpU, tmpL);
4189       __ cbnz(rscratch2, DIFF1);
4190       __ b(LOAD_LAST);
4191     __ bind(TAIL_LOAD_16);
4192       __ ldrq(vtmp, Address(tmp2));
4193       __ ldr(tmpU, Address(__ post(cnt1, 8)));
4194       __ zip1(vtmp3, __ T16B, vtmp, vtmpZ);
4195       __ zip2(vtmp, __ T16B, vtmp, vtmpZ);
4196       __ fmovd(tmpL, vtmp3);
4197       __ eor(rscratch2, tmp3, tmpL);
4198       __ cbnz(rscratch2, DIFF2);
4199 
4200       __ ldr(tmp3, Address(__ post(cnt1, 8)));
4201       __ umov(tmpL, vtmp3, __ D, 1);
4202       __ eor(rscratch2, tmpU, tmpL);
4203       __ cbnz(rscratch2, DIFF1);
4204 
4205       __ ldr(tmpU, Address(__ post(cnt1, 8)));
4206       __ fmovd(tmpL, vtmp);
4207       __ eor(rscratch2, tmp3, tmpL);
4208       __ cbnz(rscratch2, DIFF2);
4209 
4210       __ umov(tmpL, vtmp, __ D, 1);
4211       __ eor(rscratch2, tmpU, tmpL);
4212       __ cbnz(rscratch2, DIFF1);
4213       __ b(LOAD_LAST);
4214     __ bind(DIFF2);
4215       __ mov(tmpU, tmp3);
4216     __ bind(DIFF1);
4217       __ pop(spilled_regs, sp);
4218       __ b(CALCULATE_DIFFERENCE);
4219     __ bind(LOAD_LAST);
4220       __ pop(spilled_regs, sp);
4221 
4222       __ ldrs(vtmp, Address(strL));
4223       __ ldr(tmpU, Address(strU));
4224       __ zip1(vtmp, __ T8B, vtmp, vtmpZ);
4225       __ fmovd(tmpL, vtmp);
4226 
4227       __ eor(rscratch2, tmpU, tmpL);
4228       __ cbz(rscratch2, DONE);
4229 
4230     // Find the first different characters in the longwords and
4231     // compute their difference.
4232     __ bind(CALCULATE_DIFFERENCE);
4233       __ rev(rscratch2, rscratch2);
4234       __ clz(rscratch2, rscratch2);
4235       __ andr(rscratch2, rscratch2, -16);
4236       __ lsrv(tmp1, tmp1, rscratch2);
4237       __ uxthw(tmp1, tmp1);
4238       __ lsrv(rscratch1, rscratch1, rscratch2);
4239       __ uxthw(rscratch1, rscratch1);
4240       __ subw(result, tmp1, rscratch1);
4241     __ bind(DONE);
4242       __ ret(lr);
4243     return entry;
4244   }
4245 
4246   // r0  = result
4247   // r1  = str1
4248   // r2  = cnt1
4249   // r3  = str2
4250   // r4  = cnt2
4251   // r10 = tmp1
4252   // r11 = tmp2
4253   address generate_compare_long_string_same_encoding(bool isLL) {
4254     __ align(CodeEntryAlignment);
4255     StubCodeMark mark(this, "StubRoutines", isLL
4256         ? "compare_long_string_same_encoding LL"
4257         : "compare_long_string_same_encoding UU");
4258     address entry = __ pc();
4259     Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4,
4260         tmp1 = r10, tmp2 = r11;
4261     Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL,
4262         LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF,
4263         DIFF_LAST_POSITION, DIFF_LAST_POSITION2;
4264     // exit the large loop when fewer than 64 bytes are left to read or we're
4265     // about to prefetch memory beyond the array bounds
4266     int largeLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2);
4267     // cnt1/cnt2 contain the number of characters to compare; cnt1 can be re-used.
4268     // Update the cnt2 counter to account for the 8 bytes already loaded.
4269     __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2));
4270     // update pointers, because of previous read
4271     __ add(str1, str1, wordSize);
4272     __ add(str2, str2, wordSize);
4273     if (SoftwarePrefetchHintDistance >= 0) {
4274       __ bind(LARGE_LOOP_PREFETCH);
4275         __ prfm(Address(str1, SoftwarePrefetchHintDistance));
4276         __ prfm(Address(str2, SoftwarePrefetchHintDistance));
4277         compare_string_16_bytes_same(DIFF, DIFF2);
4278         compare_string_16_bytes_same(DIFF, DIFF2);
4279         __ sub(cnt2, cnt2, isLL ? 64 : 32);
4280         compare_string_16_bytes_same(DIFF, DIFF2);
4281         __ cmp(cnt2, largeLoopExitCondition);
4282         compare_string_16_bytes_same(DIFF, DIFF2);
4283         __ br(__ GT, LARGE_LOOP_PREFETCH);
4284         __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left?
4285         // less than 16 bytes left?
4286         __ subs(cnt2, cnt2, isLL ? 16 : 8);
4287         __ br(__ LT, TAIL);
4288     }
4289     __ bind(SMALL_LOOP);
4290       compare_string_16_bytes_same(DIFF, DIFF2);
4291       __ subs(cnt2, cnt2, isLL ? 16 : 8);
4292       __ br(__ GE, SMALL_LOOP);
4293     __ bind(TAIL);
4294       __ adds(cnt2, cnt2, isLL ? 16 : 8);
4295       __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF);
4296       __ subs(cnt2, cnt2, isLL ? 8 : 4);
4297       __ br(__ LE, CHECK_LAST);
4298       __ eor(rscratch2, tmp1, tmp2);
4299       __ cbnz(rscratch2, DIFF);
4300       __ ldr(tmp1, Address(__ post(str1, 8)));
4301       __ ldr(tmp2, Address(__ post(str2, 8)));
4302       __ sub(cnt2, cnt2, isLL ? 8 : 4);
4303     __ bind(CHECK_LAST);
4304       if (!isLL) {
4305         __ add(cnt2, cnt2, cnt2); // now in bytes
4306       }
4307       __ eor(rscratch2, tmp1, tmp2);
4308       __ cbnz(rscratch2, DIFF);
4309       __ ldr(rscratch1, Address(str1, cnt2));
4310       __ ldr(cnt1, Address(str2, cnt2));
4311       __ eor(rscratch2, rscratch1, cnt1);
4312       __ cbz(rscratch2, LENGTH_DIFF);
4313       // Find the first different characters in the longwords and
4314       // compute their difference.
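      // In C, approximately (a sketch using pseudo-helpers clz/reverse_bytes;
      // chr_bits is 8 for LL, 16 for UU, and chr_mask the matching 0xff/0xffff):
      //
      //   uint64_t x = word1 ^ word2;                        // != 0 here
      //   int shift  = clz(reverse_bytes(x)) & -chr_bits;    // first differing char
      //   result     = (int)((word1 >> shift) & chr_mask)
      //              - (int)((word2 >> shift) & chr_mask);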
4315     __ bind(DIFF2);
4316       __ rev(rscratch2, rscratch2);
4317       __ clz(rscratch2, rscratch2);
4318       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4319       __ lsrv(rscratch1, rscratch1, rscratch2);
4320       if (isLL) {
4321         __ lsrv(cnt1, cnt1, rscratch2);
4322         __ uxtbw(rscratch1, rscratch1);
4323         __ uxtbw(cnt1, cnt1);
4324       } else {
4325         __ lsrv(cnt1, cnt1, rscratch2);
4326         __ uxthw(rscratch1, rscratch1);
4327         __ uxthw(cnt1, cnt1);
4328       }
4329       __ subw(result, rscratch1, cnt1);
4330       __ b(LENGTH_DIFF);
4331     __ bind(DIFF);
4332       __ rev(rscratch2, rscratch2);
4333       __ clz(rscratch2, rscratch2);
4334       __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4335       __ lsrv(tmp1, tmp1, rscratch2);
4336       if (isLL) {
4337         __ lsrv(tmp2, tmp2, rscratch2);
4338         __ uxtbw(tmp1, tmp1);
4339         __ uxtbw(tmp2, tmp2);
4340       } else {
4341         __ lsrv(tmp2, tmp2, rscratch2);
4342         __ uxthw(tmp1, tmp1);
4343         __ uxthw(tmp2, tmp2);
4344       }
4345       __ subw(result, tmp1, tmp2);
4346       __ b(LENGTH_DIFF);
4347     __ bind(LAST_CHECK_AND_LENGTH_DIFF);
4348       __ eor(rscratch2, tmp1, tmp2);
4349       __ cbnz(rscratch2, DIFF);
4350     __ bind(LENGTH_DIFF);
4351       __ ret(lr);
4352     return entry;
4353   }
4354 
4355   void generate_compare_long_strings() {
4356       StubRoutines::aarch64::_compare_long_string_LL
4357           = generate_compare_long_string_same_encoding(true);
4358       StubRoutines::aarch64::_compare_long_string_UU
4359           = generate_compare_long_string_same_encoding(false);
4360       StubRoutines::aarch64::_compare_long_string_LU
4361           = generate_compare_long_string_different_encoding(true);
4362       StubRoutines::aarch64::_compare_long_string_UL
4363           = generate_compare_long_string_different_encoding(false);
4364   }
4365 
4366   // R0 = result
4367   // R1 = str2
4368   // R2 = cnt1
4369   // R3 = str1
4370   // R4 = cnt2
  // This generic linear code uses a few additional ideas which make it faster:
  // 1) we can safely keep at least the 1st register of the pattern (since its
  // length is >= 8) and so skip reloading it (helps on systems with one load
  // pipeline)
  // 2) we can use the "fast" algorithm for finding the first occurrence of a
  // single character, which needs fewer branches (one branch per loaded
  // register instead of one per character); this is where constants like
  // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
  // (see the sketch right after this comment)
  // 3) after loading and analyzing the 1st register of the source string, it
  // can be re-used to search for every occurrence of the 1st pattern
  // character, saving a few loads compared with a simpler-but-slower
  // implementation
  // 4) to avoid lots of push/pop operations the code below heavily re-uses,
  // re-initializes and compresses register values, which makes the code
  // larger and a bit less readable; however, most of the extra operations are
  // issued during loads or branches, so the penalty is minimal
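  //
  // A rough C sketch of the zero-lane trick in (2), for the LL case, with
  // `first` holding the first pattern byte replicated into every byte of a
  // 64-bit word (UU uses the same idea per 16-bit lane with the 0x0001... and
  // 0x7fff... constants); `str2_word` is just a name for the 8 bytes loaded
  // from str2:
  //
  //   uint64_t x = str2_word ^ first;      // 0x00 in each matching byte
  //   uint64_t t = x - 0x0101010101010101ULL;
  //   uint64_t y = x | 0x7f7f7f7f7f7f7f7fULL;
  //   bool has_match = (t & ~y) != 0;      // some byte of str2_word equals
  //                                        // the first pattern byte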
4385   address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
4386     const char* stubName = str1_isL
4387         ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
4388         : "indexof_linear_uu";
4389     __ align(CodeEntryAlignment);
4390     StubCodeMark mark(this, "StubRoutines", stubName);
4391     address entry = __ pc();
4392 
4393     int str1_chr_size = str1_isL ? 1 : 2;
4394     int str2_chr_size = str2_isL ? 1 : 2;
4395     int str1_chr_shift = str1_isL ? 0 : 1;
4396     int str2_chr_shift = str2_isL ? 0 : 1;
4397     bool isL = str1_isL && str2_isL;
    // parameters
4399     Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
4400     // temporary registers
4401     Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
4402     RegSet spilled_regs = RegSet::range(tmp1, tmp4);
4403     // redefinitions
4404     Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
4405 
4406     __ push(spilled_regs, sp);
4407     Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, L_SMALL_MATCH_LOOP,
4408         L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
4409         L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
4410         L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
4411         L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
4412         L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
    // Read whole register from str1. It is safe because the length is >= 8 here
4414     __ ldr(ch1, Address(str1));
    // Read whole register from str2. It is safe because the length is >= 8 here
4416     __ ldr(ch2, Address(str2));
4417     __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
4418     if (str1_isL != str2_isL) {
4419       __ eor(v0, __ T16B, v0, v0);
4420     }
4421     __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4422     __ mul(first, first, tmp1);
    // check whether fewer than one register's worth of characters remains
4424     __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
4425     if (str1_isL != str2_isL) {
4426       __ fmovd(v1, ch1);
4427     }
4428     __ br(__ LE, L_SMALL);
4429     __ eor(ch2, first, ch2);
4430     if (str1_isL != str2_isL) {
4431       __ zip1(v1, __ T16B, v1, v0);
4432     }
4433     __ sub(tmp2, ch2, tmp1);
4434     __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4435     __ bics(tmp2, tmp2, ch2);
4436     if (str1_isL != str2_isL) {
4437       __ fmovd(ch1, v1);
4438     }
4439     __ br(__ NE, L_HAS_ZERO);
4440     __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4441     __ add(result, result, wordSize/str2_chr_size);
4442     __ add(str2, str2, wordSize);
4443     __ br(__ LT, L_POST_LOOP);
4444     __ BIND(L_LOOP);
4445       __ ldr(ch2, Address(str2));
4446       __ eor(ch2, first, ch2);
4447       __ sub(tmp2, ch2, tmp1);
4448       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4449       __ bics(tmp2, tmp2, ch2);
4450       __ br(__ NE, L_HAS_ZERO);
4451     __ BIND(L_LOOP_PROCEED);
4452       __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4453       __ add(str2, str2, wordSize);
4454       __ add(result, result, wordSize/str2_chr_size);
4455       __ br(__ GE, L_LOOP);
4456     __ BIND(L_POST_LOOP);
4457       __ cmp(cnt2, -wordSize/str2_chr_size); // no extra characters to check
4458       __ br(__ LE, NOMATCH);
4459       __ ldr(ch2, Address(str2));
4460       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4461       __ eor(ch2, first, ch2);
4462       __ sub(tmp2, ch2, tmp1);
4463       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4464       __ mov(tmp4, -1); // all bits set
4465       __ b(L_SMALL_PROCEED);
4466     __ align(OptoLoopAlignment);
4467     __ BIND(L_SMALL);
4468       __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4469       __ eor(ch2, first, ch2);
4470       if (str1_isL != str2_isL) {
4471         __ zip1(v1, __ T16B, v1, v0);
4472       }
4473       __ sub(tmp2, ch2, tmp1);
4474       __ mov(tmp4, -1); // all bits set
4475       __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4476       if (str1_isL != str2_isL) {
        __ fmovd(ch1, v1); // move the 4 converted characters
4478       }
4479     __ BIND(L_SMALL_PROCEED);
      __ lsrv(tmp4, tmp4, cnt2); // mask: zeroes in the unused bit positions
4481       __ bic(tmp2, tmp2, ch2);
4482       __ ands(tmp2, tmp2, tmp4); // clear useless bits and check
4483       __ rbit(tmp2, tmp2);
4484       __ br(__ EQ, NOMATCH);
4485     __ BIND(L_SMALL_HAS_ZERO_LOOP);
      __ clz(tmp4, tmp2); // potentially long: up to 4 cycles on some CPUs
4487       __ cmp(cnt1, wordSize/str2_chr_size);
4488       __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2);
4489       if (str2_isL) { // LL
4490         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4491         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4492         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4493         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4494         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4495       } else {
        __ mov(ch2, 0xE); // mask to round the byte index down to a char boundary
4497         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4498         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4499         __ lslv(tmp2, tmp2, tmp4);
4500         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4501         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4502         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4503         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4504       }
4505       __ cmp(ch1, ch2);
4506       __ mov(tmp4, wordSize/str2_chr_size);
4507       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4508     __ BIND(L_SMALL_CMP_LOOP);
4509       str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4510                : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4511       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4512                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4513       __ add(tmp4, tmp4, 1);
4514       __ cmp(tmp4, cnt1);
4515       __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP);
4516       __ cmp(first, ch2);
4517       __ br(__ EQ, L_SMALL_CMP_LOOP);
4518     __ BIND(L_SMALL_CMP_LOOP_NOMATCH);
4519       __ cbz(tmp2, NOMATCH); // no more matches. exit
4520       __ clz(tmp4, tmp2);
4521       __ add(result, result, 1); // advance index
4522       __ add(str2, str2, str2_chr_size); // advance pointer
4523       __ b(L_SMALL_HAS_ZERO_LOOP);
4524     __ align(OptoLoopAlignment);
4525     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP);
4526       __ cmp(first, ch2);
4527       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4528       __ b(DONE);
4529     __ align(OptoLoopAlignment);
4530     __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2);
4531       if (str2_isL) { // LL
4532         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index"
4533         __ ldr(ch2, Address(str2)); // read whole register of str2. Safe.
4534         __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info
4535         __ add(result, result, tmp4, __ LSR, LogBitsPerByte);
4536         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4537       } else {
        __ mov(ch2, 0xE); // mask to round the byte index down to a char boundary
4539         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4540         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4541         __ lslv(tmp2, tmp2, tmp4);
4542         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4543         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4544         __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info
4545         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4546       }
4547       __ cmp(ch1, ch2);
4548       __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH);
4549       __ b(DONE);
4550     __ align(OptoLoopAlignment);
4551     __ BIND(L_HAS_ZERO);
4552       __ rbit(tmp2, tmp2);
      __ clz(tmp4, tmp2); // potentially long: up to 4 cycles on some CPUs
      // Now compress the counters (cnt2 and cnt1) into one register:
      // cnt2 := (cnt1 << 32) | cnt2. This is fine because both counters are
      // 32-bit and are not changed in this loop; they are restored on exit.
      // So cnt1 can be re-used in this loop.
4557       __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2);
4558       __ sub(result, result, 1);
4559     __ BIND(L_HAS_ZERO_LOOP);
4560       __ mov(cnt1, wordSize/str2_chr_size);
4561       __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4562       __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare
4563       if (str2_isL) {
4564         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4565         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4566         __ lslv(tmp2, tmp2, tmp4);
4567         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4568         __ add(tmp4, tmp4, 1);
4569         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4570         __ lsl(tmp2, tmp2, 1);
4571         __ mov(tmp4, wordSize/str2_chr_size);
4572       } else {
4573         __ mov(ch2, 0xE);
4574         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4575         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4576         __ lslv(tmp2, tmp2, tmp4);
4577         __ add(tmp4, tmp4, 1);
4578         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4579         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4580         __ lsl(tmp2, tmp2, 1);
4581         __ mov(tmp4, wordSize/str2_chr_size);
4582         __ sub(str2, str2, str2_chr_size);
4583       }
4584       __ cmp(ch1, ch2);
4585       __ mov(tmp4, wordSize/str2_chr_size);
4586       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4587     __ BIND(L_CMP_LOOP);
4588       str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)))
4589                : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift)));
4590       str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)))
4591                : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift)));
4592       __ add(tmp4, tmp4, 1);
4593       __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2);
4594       __ br(__ GE, L_CMP_LOOP_LAST_CMP);
4595       __ cmp(cnt1, ch2);
4596       __ br(__ EQ, L_CMP_LOOP);
4597     __ BIND(L_CMP_LOOP_NOMATCH);
      // we did not match here
4599       __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop
4600       __ clz(tmp4, tmp2);
4601       __ add(str2, str2, str2_chr_size); // advance pointer
4602       __ b(L_HAS_ZERO_LOOP);
4603     __ align(OptoLoopAlignment);
4604     __ BIND(L_CMP_LOOP_LAST_CMP);
4605       __ cmp(cnt1, ch2);
4606       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4607       __ b(DONE);
4608     __ align(OptoLoopAlignment);
4609     __ BIND(L_CMP_LOOP_LAST_CMP2);
4610       if (str2_isL) {
4611         __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4612         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4613         __ lslv(tmp2, tmp2, tmp4);
4614         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4615         __ add(tmp4, tmp4, 1);
4616         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4617         __ lsl(tmp2, tmp2, 1);
4618       } else {
4619         __ mov(ch2, 0xE);
4620         __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4621         __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4622         __ lslv(tmp2, tmp2, tmp4);
4623         __ add(tmp4, tmp4, 1);
4624         __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4625         __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4626         __ lsl(tmp2, tmp2, 1);
4627         __ sub(str2, str2, str2_chr_size);
4628       }
4629       __ cmp(ch1, ch2);
4630       __ br(__ NE, L_CMP_LOOP_NOMATCH);
4631       __ b(DONE);
4632     __ align(OptoLoopAlignment);
4633     __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
      // 1) Restore the "result" index. The index was wordSize/str2_chr_size * N
      // until the L_HAS_ZERO block. A byte octet was analyzed in
      // L_HAS_ZERO_LOOP, so result was increased by at most
      // wordSize/str2_chr_size - 1, and the respective higher bits were not
      // changed. L_LOOP_PROCEED will increase result by the number of analyzed
      // characters, so we can just reset the lower bits of result here: clear
      // the 2 lower bits for UU/UL and the 3 lower bits for LL.
      // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
      // 3) Advance str2 to the next str2 octet. result & 7 (LL) or result & 3
      // (UU/UL) is the index of the last analyzed substring inside the current
      // octet, so str2 is at the respective start address; we need to advance
      // it to the next octet.
4644       __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed
4645       __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2);
4646       __ bfm(result, zr, 0, 2 - str2_chr_shift);
4647       __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2
4648       __ movw(cnt2, cnt2);
4649       __ b(L_LOOP_PROCEED);
4650     __ align(OptoLoopAlignment);
4651     __ BIND(NOMATCH);
4652       __ mov(result, -1);
4653     __ BIND(DONE);
4654       __ pop(spilled_regs, sp);
4655       __ ret(lr);
4656     return entry;
4657   }
4658 
4659   void generate_string_indexof_stubs() {
4660     StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true);
4661     StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false);
4662     StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false);
4663   }
4664 
4665   void inflate_and_store_2_fp_registers(bool generatePrfm,
4666       FloatRegister src1, FloatRegister src2) {
4667     Register dst = r1;
4668     __ zip1(v1, __ T16B, src1, v0);
4669     __ zip2(v2, __ T16B, src1, v0);
4670     if (generatePrfm) {
4671       __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM);
4672     }
4673     __ zip1(v3, __ T16B, src2, v0);
4674     __ zip2(v4, __ T16B, src2, v0);
4675     __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64)));
4676   }
4677 
4678   // R0 = src
4679   // R1 = dst
4680   // R2 = len
4681   // R3 = len >> 3
4682   // V0 = 0
4683   // v1 = loaded 8 bytes
4684   address generate_large_byte_array_inflate() {
4685     __ align(CodeEntryAlignment);
4686     StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate");
4687     address entry = __ pc();
4688     Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE;
4689     Register src = r0, dst = r1, len = r2, octetCounter = r3;
4690     const int large_loop_threshold = MAX(64, SoftwarePrefetchHintDistance)/8 + 4;
4691 
    // Do one more 8-byte read so that the address is 16-byte aligned in most
    // cases; this also lets us use a single store instruction.
4694     __ ldrd(v2, __ post(src, 8));
4695     __ sub(octetCounter, octetCounter, 2);
4696     __ zip1(v1, __ T16B, v1, v0);
4697     __ zip1(v2, __ T16B, v2, v0);
4698     __ st1(v1, v2, __ T16B, __ post(dst, 32));
4699     __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4700     __ cmp(octetCounter, large_loop_threshold);
4701     __ br(__ LE, LOOP_START);
4702     __ b(LOOP_PRFM_START);
4703     __ bind(LOOP_PRFM);
4704       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4705     __ bind(LOOP_PRFM_START);
4706       __ prfm(Address(src, SoftwarePrefetchHintDistance));
4707       __ sub(octetCounter, octetCounter, 8);
4708       __ cmp(octetCounter, large_loop_threshold);
4709       inflate_and_store_2_fp_registers(true, v3, v4);
4710       inflate_and_store_2_fp_registers(true, v5, v6);
4711       __ br(__ GT, LOOP_PRFM);
4712       __ cmp(octetCounter, 8);
4713       __ br(__ LT, DONE);
4714     __ bind(LOOP);
4715       __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4716       __ bind(LOOP_START);
4717       __ sub(octetCounter, octetCounter, 8);
4718       __ cmp(octetCounter, 8);
4719       inflate_and_store_2_fp_registers(false, v3, v4);
4720       inflate_and_store_2_fp_registers(false, v5, v6);
4721       __ br(__ GE, LOOP);
4722     __ bind(DONE);
4723       __ ret(lr);
4724     return entry;
4725   }
4726 
4727   /**
4728    *  Arguments:
4729    *
4730    *  Input:
4731    *  c_rarg0   - current state address
4732    *  c_rarg1   - H key address
4733    *  c_rarg2   - data address
4734    *  c_rarg3   - number of blocks
4735    *
4736    *  Output:
4737    *  Updated state at c_rarg0
4738    */
4739   address generate_ghash_processBlocks() {
4740     // Bafflingly, GCM uses little-endian for the byte order, but
4741     // big-endian for the bit order.  For example, the polynomial 1 is
4742     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
4743     //
4744     // So, we must either reverse the bytes in each word and do
4745     // everything big-endian or reverse the bits in each byte and do
4746     // it little-endian.  On AArch64 it's more idiomatic to reverse
4747     // the bits in each byte (we have an instruction, RBIT, to do
    // that) and keep the data in little-endian bit order throughout the
4749     // calculation, bit-reversing the inputs and outputs.
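    //
    // For example, after RBIT the GCM encoding of the polynomial 1 (the byte
    // 0x80 followed by fifteen 0x00 bytes) becomes the little-endian integer
    // 1, so bit i of the 128-bit value holds the coefficient of z^i.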
4750 
4751     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4752     __ align(wordSize * 2);
4753     address p = __ pc();
4754     __ emit_int64(0x87);  // The low-order bits of the field
4755                           // polynomial (i.e. p = z^7+z^2+z+1)
4756                           // repeated in the low and high parts of a
4757                           // 128-bit vector
4758     __ emit_int64(0x87);
4759 
4760     __ align(CodeEntryAlignment);
4761     address start = __ pc();
4762 
4763     Register state   = c_rarg0;
4764     Register subkeyH = c_rarg1;
4765     Register data    = c_rarg2;
4766     Register blocks  = c_rarg3;
4767 
4768     FloatRegister vzr = v30;
4769     __ eor(vzr, __ T16B, vzr, vzr); // zero register
4770 
4771     __ ldrq(v0, Address(state));
4772     __ ldrq(v1, Address(subkeyH));
4773 
4774     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
4775     __ rbit(v0, __ T16B, v0);
4776     __ rev64(v1, __ T16B, v1);
4777     __ rbit(v1, __ T16B, v1);
4778 
4779     __ ldrq(v26, p);
4780 
4781     __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
4782     __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
4783 
4784     {
4785       Label L_ghash_loop;
4786       __ bind(L_ghash_loop);
4787 
4788       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
4789                                                  // reversing each byte
4790       __ rbit(v2, __ T16B, v2);
4791       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
4792 
4793       // Multiply state in v2 by subkey in v1
4794       ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
4795                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
4796                      /*temps*/v6, v20, v18, v21);
4797       // Reduce v7:v5 by the field polynomial
4798       ghash_reduce(v0, v5, v7, v26, vzr, v20);
4799 
4800       __ sub(blocks, blocks, 1);
4801       __ cbnz(blocks, L_ghash_loop);
4802     }
4803 
4804     // The bit-reversed result is at this point in v0
4805     __ rev64(v1, __ T16B, v0);
4806     __ rbit(v1, __ T16B, v1);
4807 
4808     __ st1(v1, __ T16B, state);
4809     __ ret(lr);
4810 
4811     return start;
4812   }
4813 
4814   // Continuation point for throwing of implicit exceptions that are
4815   // not handled in the current activation. Fabricates an exception
4816   // oop and initiates normal exception dispatching in this
4817   // frame. Since we need to preserve callee-saved values (currently
4818   // only for C2, but done for C1 as well) we need a callee-saved oop
4819   // map and therefore have to make these stubs into RuntimeStubs
4820   // rather than BufferBlobs.  If the compiler needs all registers to
4821   // be preserved between the fault point and the exception handler
4822   // then it must assume responsibility for that in
4823   // AbstractCompiler::continuation_for_implicit_null_exception or
4824   // continuation_for_implicit_division_by_zero_exception. All other
4825   // implicit exceptions (e.g., NullPointerException or
4826   // AbstractMethodError on entry) are either at call sites or
4827   // otherwise assume that stack unwinding will be initiated, so
4828   // caller saved registers were assumed volatile in the compiler.
4829 
4830 #undef __
4831 #define __ masm->
4832 
4833   address generate_throw_exception(const char* name,
4834                                    address runtime_entry,
4835                                    Register arg1 = noreg,
4836                                    Register arg2 = noreg) {
4837     // Information about frame layout at time of blocking runtime call.
4838     // Note that we only have to preserve callee-saved registers since
4839     // the compilers are responsible for supplying a continuation point
4840     // if they expect all registers to be preserved.
4841     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
4842     enum layout {
4843       rfp_off = 0,
4844       rfp_off2,
4845       return_off,
4846       return_off2,
4847       framesize // inclusive of return address
4848     };
4849 
4850     int insts_size = 512;
4851     int locs_size  = 64;
4852 
4853     CodeBuffer code(name, insts_size, locs_size);
4854     OopMapSet* oop_maps  = new OopMapSet();
4855     MacroAssembler* masm = new MacroAssembler(&code);
4856 
4857     address start = __ pc();
4858 
4859     // This is an inlined and slightly modified version of call_VM
4860     // which has the ability to fetch the return PC out of
4861     // thread-local storage and also sets up last_Java_sp slightly
4862     // differently than the real call_VM
4863 
4864     __ enter(); // Save FP and LR before call
4865 
4866     assert(is_even(framesize/2), "sp not 16-byte aligned");
4867 
4868     // lr and fp are already in place
4869     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
4870 
4871     int frame_complete = __ pc() - start;
4872 
4873     // Set up last_Java_sp and last_Java_fp
4874     address the_pc = __ pc();
4875     __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1);
4876 
4877     // Call runtime
4878     if (arg1 != noreg) {
4879       assert(arg2 != c_rarg1, "clobbered");
4880       __ mov(c_rarg1, arg1);
4881     }
4882     if (arg2 != noreg) {
4883       __ mov(c_rarg2, arg2);
4884     }
4885     __ mov(c_rarg0, rthread);
4886     BLOCK_COMMENT("call runtime_entry");
4887     __ mov(rscratch1, runtime_entry);
4888     __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);
4889 
4890     // Generate oop map
4891     OopMap* map = new OopMap(framesize, 0);
4892 
4893     oop_maps->add_gc_map(the_pc - start, map);
4894 
4895     __ reset_last_Java_frame(true);
4896     __ maybe_isb();
4897 
4898     __ leave();
4899 
4900     // check for pending exceptions
4901 #ifdef ASSERT
4902     Label L;
4903     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
4904     __ cbnz(rscratch1, L);
4905     __ should_not_reach_here();
4906     __ bind(L);
4907 #endif // ASSERT
4908     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
4909 
4910 
4911     // codeBlob framesize is in words (not VMRegImpl::slot_size)
4912     RuntimeStub* stub =
4913       RuntimeStub::new_runtime_stub(name,
4914                                     &code,
4915                                     frame_complete,
4916                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
4917                                     oop_maps, false);
4918     return stub->entry_point();
4919   }
4920 
4921   class MontgomeryMultiplyGenerator : public MacroAssembler {
4922 
4923     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
4924       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
4925 
4926     RegSet _toSave;
4927     bool _squaring;
4928 
4929   public:
4930     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
4931       : MacroAssembler(as->code()), _squaring(squaring) {
4932 
4933       // Register allocation
4934 
4935       Register reg = c_rarg0;
4936       Pa_base = reg;       // Argument registers
4937       if (squaring)
4938         Pb_base = Pa_base;
4939       else
4940         Pb_base = ++reg;
4941       Pn_base = ++reg;
4942       Rlen= ++reg;
4943       inv = ++reg;
4944       Pm_base = ++reg;
4945 
4946                           // Working registers:
4947       Ra =  ++reg;        // The current digit of a, b, n, and m.
4948       Rb =  ++reg;
4949       Rm =  ++reg;
4950       Rn =  ++reg;
4951 
4952       Pa =  ++reg;        // Pointers to the current/next digit of a, b, n, and m.
4953       Pb =  ++reg;
4954       Pm =  ++reg;
4955       Pn =  ++reg;
4956 
4957       t0 =  ++reg;        // Three registers which form a
      t1 =  ++reg;        // triple-precision accumulator.
4959       t2 =  ++reg;
4960 
4961       Ri =  ++reg;        // Inner and outer loop indexes.
4962       Rj =  ++reg;
4963 
4964       Rhi_ab = ++reg;     // Product registers: low and high parts
4965       Rlo_ab = ++reg;     // of a*b and m*n.
4966       Rhi_mn = ++reg;
4967       Rlo_mn = ++reg;
4968 
4969       // r19 and up are callee-saved.
4970       _toSave = RegSet::range(r19, reg) + Pm_base;
4971     }
4972 
4973   private:
4974     void save_regs() {
4975       push(_toSave, sp);
4976     }
4977 
4978     void restore_regs() {
4979       pop(_toSave, sp);
4980     }
4981 
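    // Call the member function `block` exactly `count` times (both overloads):
    // two calls per unrolled loop iteration; an odd count enters the loop at
    // its second call, and a zero count skips the loop entirely.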
4982     template <typename T>
4983     void unroll_2(Register count, T block) {
4984       Label loop, end, odd;
4985       tbnz(count, 0, odd);
4986       cbz(count, end);
4987       align(16);
4988       bind(loop);
4989       (this->*block)();
4990       bind(odd);
4991       (this->*block)();
4992       subs(count, count, 2);
4993       br(Assembler::GT, loop);
4994       bind(end);
4995     }
4996 
4997     template <typename T>
4998     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
4999       Label loop, end, odd;
5000       tbnz(count, 0, odd);
5001       cbz(count, end);
5002       align(16);
5003       bind(loop);
5004       (this->*block)(d, s, tmp);
5005       bind(odd);
5006       (this->*block)(d, s, tmp);
5007       subs(count, count, 2);
5008       br(Assembler::GT, loop);
5009       bind(end);
5010     }
5011 
5012     void pre1(RegisterOrConstant i) {
5013       block_comment("pre1");
5014       // Pa = Pa_base;
5015       // Pb = Pb_base + i;
5016       // Pm = Pm_base;
5017       // Pn = Pn_base + i;
5018       // Ra = *Pa;
5019       // Rb = *Pb;
5020       // Rm = *Pm;
5021       // Rn = *Pn;
5022       ldr(Ra, Address(Pa_base));
5023       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
5024       ldr(Rm, Address(Pm_base));
5025       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5026       lea(Pa, Address(Pa_base));
5027       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
5028       lea(Pm, Address(Pm_base));
5029       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5030 
5031       // Zero the m*n result.
5032       mov(Rhi_mn, zr);
5033       mov(Rlo_mn, zr);
5034     }
5035 
5036     // The core multiply-accumulate step of a Montgomery
5037     // multiplication.  The idea is to schedule operations as a
5038     // pipeline so that instructions with long latencies (loads and
5039     // multiplies) have time to complete before their results are
5040     // used.  This most benefits in-order implementations of the
5041     // architecture but out-of-order ones also benefit.
5042     void step() {
5043       block_comment("step");
5044       // MACC(Ra, Rb, t0, t1, t2);
5045       // Ra = *++Pa;
5046       // Rb = *--Pb;
5047       umulh(Rhi_ab, Ra, Rb);
5048       mul(Rlo_ab, Ra, Rb);
5049       ldr(Ra, pre(Pa, wordSize));
5050       ldr(Rb, pre(Pb, -wordSize));
5051       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
5052                                        // previous iteration.
5053       // MACC(Rm, Rn, t0, t1, t2);
5054       // Rm = *++Pm;
5055       // Rn = *--Pn;
5056       umulh(Rhi_mn, Rm, Rn);
5057       mul(Rlo_mn, Rm, Rn);
5058       ldr(Rm, pre(Pm, wordSize));
5059       ldr(Rn, pre(Pn, -wordSize));
5060       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5061     }
5062 
5063     void post1() {
5064       block_comment("post1");
5065 
5066       // MACC(Ra, Rb, t0, t1, t2);
5067       // Ra = *++Pa;
5068       // Rb = *--Pb;
5069       umulh(Rhi_ab, Ra, Rb);
5070       mul(Rlo_ab, Ra, Rb);
5071       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5072       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5073 
5074       // *Pm = Rm = t0 * inv;
5075       mul(Rm, t0, inv);
5076       str(Rm, Address(Pm));
5077 
5078       // MACC(Rm, Rn, t0, t1, t2);
5079       // t0 = t1; t1 = t2; t2 = 0;
5080       umulh(Rhi_mn, Rm, Rn);
5081 
5082 #ifndef PRODUCT
5083       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
5084       {
5085         mul(Rlo_mn, Rm, Rn);
5086         add(Rlo_mn, t0, Rlo_mn);
5087         Label ok;
5088         cbz(Rlo_mn, ok); {
5089           stop("broken Montgomery multiply");
5090         } bind(ok);
5091       }
5092 #endif
5093       // We have very carefully set things up so that
5094       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
5095       // the lower half of Rm * Rn because we know the result already:
5096       // it must be -t0.  t0 + (-t0) must generate a carry iff
5097       // t0 != 0.  So, rather than do a mul and an adds we just set
5098       // the carry flag iff t0 is nonzero.
5099       //
5100       // mul(Rlo_mn, Rm, Rn);
5101       // adds(zr, t0, Rlo_mn);
5102       subs(zr, t0, 1); // Set carry iff t0 is nonzero
5103       adcs(t0, t1, Rhi_mn);
5104       adc(t1, t2, zr);
5105       mov(t2, zr);
5106     }
5107 
5108     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
5109       block_comment("pre2");
5110       // Pa = Pa_base + i-len;
5111       // Pb = Pb_base + len;
5112       // Pm = Pm_base + i-len;
5113       // Pn = Pn_base + len;
5114 
5115       if (i.is_register()) {
5116         sub(Rj, i.as_register(), len);
5117       } else {
5118         mov(Rj, i.as_constant());
5119         sub(Rj, Rj, len);
5120       }
5121       // Rj == i-len
5122 
5123       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
5124       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
5125       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5126       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
5127 
5128       // Ra = *++Pa;
5129       // Rb = *--Pb;
5130       // Rm = *++Pm;
5131       // Rn = *--Pn;
5132       ldr(Ra, pre(Pa, wordSize));
5133       ldr(Rb, pre(Pb, -wordSize));
5134       ldr(Rm, pre(Pm, wordSize));
5135       ldr(Rn, pre(Pn, -wordSize));
5136 
5137       mov(Rhi_mn, zr);
5138       mov(Rlo_mn, zr);
5139     }
5140 
5141     void post2(RegisterOrConstant i, RegisterOrConstant len) {
5142       block_comment("post2");
5143       if (i.is_constant()) {
5144         mov(Rj, i.as_constant()-len.as_constant());
5145       } else {
5146         sub(Rj, i.as_register(), len);
5147       }
5148 
5149       adds(t0, t0, Rlo_mn); // The pending m*n, low part
5150 
5151       // As soon as we know the least significant digit of our result,
5152       // store it.
5153       // Pm_base[i-len] = t0;
5154       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
5155 
5156       // t0 = t1; t1 = t2; t2 = 0;
5157       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
5158       adc(t1, t2, zr);
5159       mov(t2, zr);
5160     }
5161 
5162     // A carry in t0 after Montgomery multiplication means that we
5163     // should subtract multiples of n from our result in m.  We'll
5164     // keep doing that until there is no carry.
5165     void normalize(RegisterOrConstant len) {
5166       block_comment("normalize");
5167       // while (t0)
5168       //   t0 = sub(Pm_base, Pn_base, t0, len);
5169       Label loop, post, again;
5170       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
5171       cbz(t0, post); {
5172         bind(again); {
5173           mov(i, zr);
5174           mov(cnt, len);
5175           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5176           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5177           subs(zr, zr, zr); // set carry flag, i.e. no borrow
5178           align(16);
5179           bind(loop); {
5180             sbcs(Rm, Rm, Rn);
5181             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5182             add(i, i, 1);
5183             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
5184             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
5185             sub(cnt, cnt, 1);
5186           } cbnz(cnt, loop);
5187           sbc(t0, t0, zr);
5188         } cbnz(t0, again);
5189       } bind(post);
5190     }
5191 
5192     // Move memory at s to d, reversing words.
5193     //    Increments d to end of copied memory
5194     //    Destroys tmp1, tmp2
5195     //    Preserves len
5196     //    Leaves s pointing to the address which was in d at start
5197     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
5198       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
5199 
5200       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
5201       mov(tmp1, len);
5202       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
5203       sub(s, d, len, ext::uxtw, LogBytesPerWord);
5204     }
5205     // where
5206     void reverse1(Register d, Register s, Register tmp) {
5207       ldr(tmp, pre(s, -wordSize));
5208       ror(tmp, tmp, 32);
5209       str(tmp, post(d, wordSize));
5210     }
5211 
5212     void step_squaring() {
5213       // An extra ACC
5214       step();
5215       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5216     }
5217 
5218     void last_squaring(RegisterOrConstant i) {
5219       Label dont;
5220       // if ((i & 1) == 0) {
5221       tbnz(i.as_register(), 0, dont); {
5222         // MACC(Ra, Rb, t0, t1, t2);
5223         // Ra = *++Pa;
5224         // Rb = *--Pb;
5225         umulh(Rhi_ab, Ra, Rb);
5226         mul(Rlo_ab, Ra, Rb);
5227         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
5228       } bind(dont);
5229     }
5230 
5231     void extra_step_squaring() {
5232       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5233 
5234       // MACC(Rm, Rn, t0, t1, t2);
5235       // Rm = *++Pm;
5236       // Rn = *--Pn;
5237       umulh(Rhi_mn, Rm, Rn);
5238       mul(Rlo_mn, Rm, Rn);
5239       ldr(Rm, pre(Pm, wordSize));
5240       ldr(Rn, pre(Pn, -wordSize));
5241     }
5242 
5243     void post1_squaring() {
5244       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
5245 
5246       // *Pm = Rm = t0 * inv;
5247       mul(Rm, t0, inv);
5248       str(Rm, Address(Pm));
5249 
5250       // MACC(Rm, Rn, t0, t1, t2);
5251       // t0 = t1; t1 = t2; t2 = 0;
5252       umulh(Rhi_mn, Rm, Rn);
5253 
5254 #ifndef PRODUCT
5255       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
5256       {
5257         mul(Rlo_mn, Rm, Rn);
5258         add(Rlo_mn, t0, Rlo_mn);
5259         Label ok;
5260         cbz(Rlo_mn, ok); {
5261           stop("broken Montgomery multiply");
5262         } bind(ok);
5263       }
5264 #endif
5265       // We have very carefully set things up so that
5266       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
5267       // the lower half of Rm * Rn because we know the result already:
5268       // it must be -t0.  t0 + (-t0) must generate a carry iff
5269       // t0 != 0.  So, rather than do a mul and an adds we just set
5270       // the carry flag iff t0 is nonzero.
5271       //
5272       // mul(Rlo_mn, Rm, Rn);
5273       // adds(zr, t0, Rlo_mn);
5274       subs(zr, t0, 1); // Set carry iff t0 is nonzero
5275       adcs(t0, t1, Rhi_mn);
5276       adc(t1, t2, zr);
5277       mov(t2, zr);
5278     }
5279 
5280     void acc(Register Rhi, Register Rlo,
5281              Register t0, Register t1, Register t2) {
5282       adds(t0, t0, Rlo);
5283       adcs(t1, t1, Rhi);
5284       adc(t2, t2, zr);
5285     }
5286 
5287   public:
5288     /**
5289      * Fast Montgomery multiplication.  The derivation of the
5290      * algorithm is in A Cryptographic Library for the Motorola
5291      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
5292      *
5293      * Arguments:
5294      *
5295      * Inputs for multiplication:
5296      *   c_rarg0   - int array elements a
5297      *   c_rarg1   - int array elements b
5298      *   c_rarg2   - int array elements n (the modulus)
5299      *   c_rarg3   - int length
5300      *   c_rarg4   - int inv
5301      *   c_rarg5   - int array elements m (the result)
5302      *
5303      * Inputs for squaring:
5304      *   c_rarg0   - int array elements a
5305      *   c_rarg1   - int array elements n (the modulus)
5306      *   c_rarg2   - int length
5307      *   c_rarg3   - int inv
5308      *   c_rarg4   - int array elements m (the result)
5309      *
5310      */
5311     address generate_multiply() {
5312       Label argh, nothing;
5313       bind(argh);
5314       stop("MontgomeryMultiply total_allocation must be <= 8192");
5315 
5316       align(CodeEntryAlignment);
5317       address entry = pc();
5318 
5319       cbzw(Rlen, nothing);
5320 
5321       enter();
5322 
5323       // Make room.
5324       cmpw(Rlen, 512);
5325       br(Assembler::HI, argh);
5326       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
5327       andr(sp, Ra, -2 * wordSize);
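      // i.e. reserve roughly 4 * len * sizeof (jint) bytes (at most 8192, see
      // the length guard above) of 16-byte-aligned scratch below sp: reversed
      // copies of the inputs plus room for the result.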
5328 
5329       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
5330 
5331       {
5332         // Copy input args, reversing as we go.  We use Ra as a
5333         // temporary variable.
5334         reverse(Ra, Pa_base, Rlen, t0, t1);
5335         if (!_squaring)
5336           reverse(Ra, Pb_base, Rlen, t0, t1);
5337         reverse(Ra, Pn_base, Rlen, t0, t1);
5338       }
5339 
5340       // Push all call-saved registers and also Pm_base which we'll need
5341       // at the end.
5342       save_regs();
5343 
5344 #ifndef PRODUCT
5345       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
5346       {
5347         ldr(Rn, Address(Pn_base, 0));
5348         mul(Rlo_mn, Rn, inv);
5349         cmp(Rlo_mn, -1);
5350         Label ok;
5351         br(EQ, ok); {
5352           stop("broken inverse in Montgomery multiply");
5353         } bind(ok);
5354       }
5355 #endif
5356 
5357       mov(Pm_base, Ra);
5358 
5359       mov(t0, zr);
5360       mov(t1, zr);
5361       mov(t2, zr);
5362 
5363       block_comment("for (int i = 0; i < len; i++) {");
5364       mov(Ri, zr); {
5365         Label loop, end;
5366         cmpw(Ri, Rlen);
5367         br(Assembler::GE, end);
5368 
5369         bind(loop);
5370         pre1(Ri);
5371 
5372         block_comment("  for (j = i; j; j--) {"); {
5373           movw(Rj, Ri);
5374           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
5375         } block_comment("  } // j");
5376 
5377         post1();
5378         addw(Ri, Ri, 1);
5379         cmpw(Ri, Rlen);
5380         br(Assembler::LT, loop);
5381         bind(end);
5382         block_comment("} // i");
5383       }
5384 
5385       block_comment("for (int i = len; i < 2*len; i++) {");
5386       mov(Ri, Rlen); {
5387         Label loop, end;
5388         cmpw(Ri, Rlen, Assembler::LSL, 1);
5389         br(Assembler::GE, end);
5390 
5391         bind(loop);
5392         pre2(Ri, Rlen);
5393 
5394         block_comment("  for (j = len*2-i-1; j; j--) {"); {
5395           lslw(Rj, Rlen, 1);
5396           subw(Rj, Rj, Ri);
5397           subw(Rj, Rj, 1);
5398           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
5399         } block_comment("  } // j");
5400 
5401         post2(Ri, Rlen);
5402         addw(Ri, Ri, 1);
5403         cmpw(Ri, Rlen, Assembler::LSL, 1);
5404         br(Assembler::LT, loop);
5405         bind(end);
5406       }
5407       block_comment("} // i");
5408 
5409       normalize(Rlen);
5410 
5411       mov(Ra, Pm_base);  // Save Pm_base in Ra
5412       restore_regs();  // Restore caller's Pm_base
5413 
5414       // Copy our result into caller's Pm_base
5415       reverse(Pm_base, Ra, Rlen, t0, t1);
5416 
5417       leave();
5418       bind(nothing);
5419       ret(lr);
5420 
5421       return entry;
5422     }
5423     // In C, approximately:
5424 
5425     // void
5426     // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
5427     //                     unsigned long Pn_base[], unsigned long Pm_base[],
5428     //                     unsigned long inv, int len) {
5429     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5430     //   unsigned long *Pa, *Pb, *Pn, *Pm;
5431     //   unsigned long Ra, Rb, Rn, Rm;
5432 
5433     //   int i;
5434 
5435     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5436 
5437     //   for (i = 0; i < len; i++) {
5438     //     int j;
5439 
5440     //     Pa = Pa_base;
5441     //     Pb = Pb_base + i;
5442     //     Pm = Pm_base;
5443     //     Pn = Pn_base + i;
5444 
5445     //     Ra = *Pa;
5446     //     Rb = *Pb;
5447     //     Rm = *Pm;
5448     //     Rn = *Pn;
5449 
5450     //     int iters = i;
5451     //     for (j = 0; iters--; j++) {
5452     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
5453     //       MACC(Ra, Rb, t0, t1, t2);
5454     //       Ra = *++Pa;
5455     //       Rb = *--Pb;
5456     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5457     //       MACC(Rm, Rn, t0, t1, t2);
5458     //       Rm = *++Pm;
5459     //       Rn = *--Pn;
5460     //     }
5461 
5462     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
5463     //     MACC(Ra, Rb, t0, t1, t2);
5464     //     *Pm = Rm = t0 * inv;
5465     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5466     //     MACC(Rm, Rn, t0, t1, t2);
5467 
5468     //     assert(t0 == 0, "broken Montgomery multiply");
5469 
5470     //     t0 = t1; t1 = t2; t2 = 0;
5471     //   }
5472 
5473     //   for (i = len; i < 2*len; i++) {
5474     //     int j;
5475 
5476     //     Pa = Pa_base + i-len;
5477     //     Pb = Pb_base + len;
5478     //     Pm = Pm_base + i-len;
5479     //     Pn = Pn_base + len;
5480 
5481     //     Ra = *++Pa;
5482     //     Rb = *--Pb;
5483     //     Rm = *++Pm;
5484     //     Rn = *--Pn;
5485 
5486     //     int iters = len*2-i-1;
5487     //     for (j = i-len+1; iters--; j++) {
5488     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
5489     //       MACC(Ra, Rb, t0, t1, t2);
5490     //       Ra = *++Pa;
5491     //       Rb = *--Pb;
5492     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5493     //       MACC(Rm, Rn, t0, t1, t2);
5494     //       Rm = *++Pm;
5495     //       Rn = *--Pn;
5496     //     }
5497 
5498     //     Pm_base[i-len] = t0;
5499     //     t0 = t1; t1 = t2; t2 = 0;
5500     //   }
5501 
5502     //   while (t0)
5503     //     t0 = sub(Pm_base, Pn_base, t0, len);
5504     // }
5505 
5506     /**
5507      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
5508      * multiplies than Montgomery multiplication so it should be up to
5509      * 25% faster.  However, its loop control is more complex and it
5510      * may actually run slower on some machines.
5511      *
5512      * Arguments:
5513      *
5514      * Inputs:
5515      *   c_rarg0   - int array elements a
5516      *   c_rarg1   - int array elements n (the modulus)
5517      *   c_rarg2   - int length
5518      *   c_rarg3   - int inv
5519      *   c_rarg4   - int array elements m (the result)
5520      *
5521      */
5522     address generate_square() {
5523       Label argh;
5524       bind(argh);
5525       stop("MontgomeryMultiply total_allocation must be <= 8192");
5526 
5527       align(CodeEntryAlignment);
5528       address entry = pc();
5529 
5530       enter();
5531 
5532       // Make room.
5533       cmpw(Rlen, 512);
5534       br(Assembler::HI, argh);
5535       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
5536       andr(sp, Ra, -2 * wordSize);
5537 
5538       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
5539 
5540       {
5541         // Copy input args, reversing as we go.  We use Ra as a
5542         // temporary variable.
5543         reverse(Ra, Pa_base, Rlen, t0, t1);
5544         reverse(Ra, Pn_base, Rlen, t0, t1);
5545       }
5546 
5547       // Push all call-saved registers and also Pm_base which we'll need
5548       // at the end.
5549       save_regs();
5550 
5551       mov(Pm_base, Ra);
5552 
5553       mov(t0, zr);
5554       mov(t1, zr);
5555       mov(t2, zr);
5556 
5557       block_comment("for (int i = 0; i < len; i++) {");
5558       mov(Ri, zr); {
5559         Label loop, end;
5560         bind(loop);
5561         cmp(Ri, Rlen);
5562         br(Assembler::GE, end);
5563 
5564         pre1(Ri);
5565 
5566         block_comment("for (j = (i+1)/2; j; j--) {"); {
5567           add(Rj, Ri, 1);
5568           lsr(Rj, Rj, 1);
5569           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
5570         } block_comment("  } // j");
5571 
5572         last_squaring(Ri);
5573 
5574         block_comment("  for (j = i/2; j; j--) {"); {
5575           lsr(Rj, Ri, 1);
5576           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
5577         } block_comment("  } // j");
5578 
5579         post1_squaring();
5580         add(Ri, Ri, 1);
5581         cmp(Ri, Rlen);
5582         br(Assembler::LT, loop);
5583 
5584         bind(end);
5585         block_comment("} // i");
5586       }
5587 
5588       block_comment("for (int i = len; i < 2*len; i++) {");
5589       mov(Ri, Rlen); {
5590         Label loop, end;
5591         bind(loop);
5592         cmp(Ri, Rlen, Assembler::LSL, 1);
5593         br(Assembler::GE, end);
5594 
5595         pre2(Ri, Rlen);
5596 
5597         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
5598           lsl(Rj, Rlen, 1);
5599           sub(Rj, Rj, Ri);
5600           sub(Rj, Rj, 1);
5601           lsr(Rj, Rj, 1);
5602           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
5603         } block_comment("  } // j");
5604 
5605         last_squaring(Ri);
5606 
5607         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
5608           lsl(Rj, Rlen, 1);
5609           sub(Rj, Rj, Ri);
5610           lsr(Rj, Rj, 1);
5611           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
5612         } block_comment("  } // j");
5613 
5614         post2(Ri, Rlen);
5615         add(Ri, Ri, 1);
5616         cmp(Ri, Rlen, Assembler::LSL, 1);
5617 
5618         br(Assembler::LT, loop);
5619         bind(end);
5620         block_comment("} // i");
5621       }
5622 
5623       normalize(Rlen);
5624 
5625       mov(Ra, Pm_base);  // Save Pm_base in Ra
5626       restore_regs();  // Restore caller's Pm_base
5627 
5628       // Copy our result into caller's Pm_base
5629       reverse(Pm_base, Ra, Rlen, t0, t1);
5630 
5631       leave();
5632       ret(lr);
5633 
5634       return entry;
5635     }
5636     // In C, approximately:
5637 
5638     // void
5639     // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
5640     //                   unsigned long Pm_base[], unsigned long inv, int len) {
5641     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5642     //   unsigned long *Pa, *Pb, *Pn, *Pm;
5643     //   unsigned long Ra, Rb, Rn, Rm;
5644 
5645     //   int i;
5646 
5647     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5648 
5649     //   for (i = 0; i < len; i++) {
5650     //     int j;
5651 
5652     //     Pa = Pa_base;
5653     //     Pb = Pa_base + i;
5654     //     Pm = Pm_base;
5655     //     Pn = Pn_base + i;
5656 
5657     //     Ra = *Pa;
5658     //     Rb = *Pb;
5659     //     Rm = *Pm;
5660     //     Rn = *Pn;
5661 
5662     //     int iters = (i+1)/2;
5663     //     for (j = 0; iters--; j++) {
5664     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5665     //       MACC2(Ra, Rb, t0, t1, t2);
5666     //       Ra = *++Pa;
5667     //       Rb = *--Pb;
5668     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5669     //       MACC(Rm, Rn, t0, t1, t2);
5670     //       Rm = *++Pm;
5671     //       Rn = *--Pn;
5672     //     }
5673     //     if ((i & 1) == 0) {
5674     //       assert(Ra == Pa_base[j], "must be");
5675     //       MACC(Ra, Ra, t0, t1, t2);
5676     //     }
5677     //     iters = i/2;
5678     //     assert(iters == i-j, "must be");
5679     //     for (; iters--; j++) {
5680     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5681     //       MACC(Rm, Rn, t0, t1, t2);
5682     //       Rm = *++Pm;
5683     //       Rn = *--Pn;
5684     //     }
5685 
5686     //     *Pm = Rm = t0 * inv;
5687     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5688     //     MACC(Rm, Rn, t0, t1, t2);
5689 
5690     //     assert(t0 == 0, "broken Montgomery multiply");
5691 
5692     //     t0 = t1; t1 = t2; t2 = 0;
5693     //   }
5694 
5695     //   for (i = len; i < 2*len; i++) {
5696     //     int start = i-len+1;
5697     //     int end = start + (len - start)/2;
5698     //     int j;
5699 
5700     //     Pa = Pa_base + i-len;
5701     //     Pb = Pa_base + len;
5702     //     Pm = Pm_base + i-len;
5703     //     Pn = Pn_base + len;
5704 
5705     //     Ra = *++Pa;
5706     //     Rb = *--Pb;
5707     //     Rm = *++Pm;
5708     //     Rn = *--Pn;
5709 
5710     //     int iters = (2*len-i-1)/2;
5711     //     assert(iters == end-start, "must be");
5712     //     for (j = start; iters--; j++) {
5713     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5714     //       MACC2(Ra, Rb, t0, t1, t2);
5715     //       Ra = *++Pa;
5716     //       Rb = *--Pb;
5717     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5718     //       MACC(Rm, Rn, t0, t1, t2);
5719     //       Rm = *++Pm;
5720     //       Rn = *--Pn;
5721     //     }
5722     //     if ((i & 1) == 0) {
5723     //       assert(Ra == Pa_base[j], "must be");
5724     //       MACC(Ra, Ra, t0, t1, t2);
5725     //     }
5726     //     iters =  (2*len-i)/2;
5727     //     assert(iters == len-j, "must be");
5728     //     for (; iters--; j++) {
5729     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5730     //       MACC(Rm, Rn, t0, t1, t2);
5731     //       Rm = *++Pm;
5732     //       Rn = *--Pn;
5733     //     }
5734     //     Pm_base[i-len] = t0;
5735     //     t0 = t1; t1 = t2; t2 = 0;
5736     //   }
5737 
5738     //   while (t0)
5739     //     t0 = sub(Pm_base, Pn_base, t0, len);
5740     // }
5741   };
5742 
5743 
5744   // Initialization
5745   void generate_initial() {
    // Generate the initial stubs and initialize the entry points
5747 
    // Entry points that exist on all platforms. Note: this is code
    // that could be shared among different platforms; however, the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also the comment
    // in stubRoutines.hpp.
5753 
5754     StubRoutines::_forward_exception_entry = generate_forward_exception();
5755 
5756     StubRoutines::_call_stub_entry =
5757       generate_call_stub(StubRoutines::_call_stub_return_address);
5758 
5759     // is referenced by megamorphic call
5760     StubRoutines::_catch_exception_entry = generate_catch_exception();
5761 
5762     // Build this early so it's available for the interpreter.
5763     StubRoutines::_throw_StackOverflowError_entry =
5764       generate_throw_exception("StackOverflowError throw_exception",
5765                                CAST_FROM_FN_PTR(address,
5766                                                 SharedRuntime::throw_StackOverflowError));
5767     StubRoutines::_throw_delayed_StackOverflowError_entry =
5768       generate_throw_exception("delayed StackOverflowError throw_exception",
5769                                CAST_FROM_FN_PTR(address,
5770                                                 SharedRuntime::throw_delayed_StackOverflowError));
5771     if (UseCRC32Intrinsics) {
5772       // set the table address before generating the stub, which uses it
5773       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
5774       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
5775     }
5776 
5777     if (UseCRC32CIntrinsics) {
5778       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
5779     }
5780 
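    // Math intrinsic stubs: targets for the _dlog, _dsin and _dcos
    // intrinsics (java.lang.Math.log/sin/cos), generated only when the
    // corresponding intrinsic is available in this VM configuration.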
5781     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
5782       StubRoutines::_dlog = generate_dlog();
5783     }
5784 
5785     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
5786       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
5787     }
5788 
5789     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
5790       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
5791     }
5792   }
5793 
5794   void generate_all() {
5795     // support for verify_oop (must happen after universe_init)
5796     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
5797     StubRoutines::_throw_AbstractMethodError_entry =
5798       generate_throw_exception("AbstractMethodError throw_exception",
5799                                CAST_FROM_FN_PTR(address,
5800                                                 SharedRuntime::
5801                                                 throw_AbstractMethodError));
5802 
5803     StubRoutines::_throw_IncompatibleClassChangeError_entry =
5804       generate_throw_exception("IncompatibleClassChangeError throw_exception",
5805                                CAST_FROM_FN_PTR(address,
5806                                                 SharedRuntime::
5807                                                 throw_IncompatibleClassChangeError));
5808 
5809     StubRoutines::_throw_NullPointerException_at_call_entry =
5810       generate_throw_exception("NullPointerException at call throw_exception",
5811                                CAST_FROM_FN_PTR(address,
5812                                                 SharedRuntime::
5813                                                 throw_NullPointerException_at_call));
5814 
5815     // arraycopy stubs used by compilers
5816     generate_arraycopy_stubs();
5817 
5818     // has_negatives stub for large arrays.
5819     StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);
5820 
5821     // array equals stub for large arrays.
5822     if (!UseSimpleArrayEquals) {
5823       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
5824     }
5825 
5826     generate_compare_long_strings();
5827 
5828     generate_string_indexof_stubs();
5829 
5830     // byte_array_inflate stub for large arrays.
5831     StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();
5832 
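    // BigInteger helper stubs: multiplyToLen, squareToLen, mulAdd and the
    // Montgomery routines below provide the targets for the corresponding
    // java.math.BigInteger intrinsics when the matching flags are enabled.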
5833     if (UseMultiplyToLenIntrinsic) {
5834       StubRoutines::_multiplyToLen = generate_multiplyToLen();
5835     }
5836 
5837     if (UseSquareToLenIntrinsic) {
5838       StubRoutines::_squareToLen = generate_squareToLen();
5839     }
5840 
5841     if (UseMulAddIntrinsic) {
5842       StubRoutines::_mulAdd = generate_mulAdd();
5843     }
5844 
5845     if (UseMontgomeryMultiplyIntrinsic) {
5846       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
5847       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
5848       StubRoutines::_montgomeryMultiply = g.generate_multiply();
5849     }
5850 
5851     if (UseMontgomerySquareIntrinsic) {
5852       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
5853       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
5854       // We use generate_multiply() rather than generate_square()
5855       // because it's faster for the sizes of modulus we care about.
5856       StubRoutines::_montgomerySquare = g.generate_multiply();
5857     }
5858 
5859 #ifndef BUILTIN_SIM
5860     // generate GHASH intrinsics code
5861     if (UseGHASHIntrinsics) {
5862       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
5863     }
5864 
5865     // data cache line writeback
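    // These are generated unconditionally; they presumably back the
    // Unsafe::writebackMemory intrinsic (e.g. MappedByteBuffer::force).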
5866     StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
5867     StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
5868
5869     if (UseAESIntrinsics) {
5870       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
5871       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
5872       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
5873       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
5874     }
5875 
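    // SHA stubs: single-block (implCompress) and multi-block (implCompressMB)
    // variants backing the sun.security.provider digest intrinsics.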
5876     if (UseSHA1Intrinsics) {
5877       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
5878       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
5879     }
5880     if (UseSHA256Intrinsics) {
5881       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
5882       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
5883     }
5884 
5885     // generate Adler32 intrinsics code
5886     if (UseAdler32Intrinsics) {
5887       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
5888     }
5889 
5890     // Safefetch stubs.
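    // Each stub performs a single load that is allowed to fault: if it does,
    // the signal handler resumes execution at the recorded continuation pc
    // rather than crashing, so the VM can probe possibly-unmapped memory.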
5891     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
5892                                                        &StubRoutines::_safefetch32_fault_pc,
5893                                                        &StubRoutines::_safefetch32_continuation_pc);
5894     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
5895                                                        &StubRoutines::_safefetchN_fault_pc,
5896                                                        &StubRoutines::_safefetchN_continuation_pc);
5897 #endif
5898     StubRoutines::aarch64::set_completed();
5899   }
5900 
5901  public:
5902   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
5903     if (all) {
5904       generate_all();
5905     } else {
5906       generate_initial();
5907     }
5908   }
5909 }; // end class StubGenerator
5910 
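// Entry point for stub generation, called during VM initialization
// (see stubRoutines.cpp): first with all == false for the initial stubs
// needed by the interpreter, then with all == true, after universe init,
// for the remaining stubs.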
5911 void StubGenerator_generate(CodeBuffer* code, bool all) {
5912   StubGenerator g(code, all);
5913 }