1 /*
   2  * Copyright (c) 2003, 2018, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "gc/shared/cardTable.hpp"
  30 #include "gc/shared/cardTableModRefBS.hpp"
  31 #include "interpreter/interpreter.hpp"
  32 #include "nativeInst_aarch64.hpp"
  33 #include "oops/instanceOop.hpp"
  34 #include "oops/method.hpp"
  35 #include "oops/objArrayKlass.hpp"
  36 #include "oops/oop.inline.hpp"
  37 #include "prims/methodHandles.hpp"
  38 #include "runtime/frame.inline.hpp"
  39 #include "runtime/handles.inline.hpp"
  40 #include "runtime/sharedRuntime.hpp"
  41 #include "runtime/stubCodeGenerator.hpp"
  42 #include "runtime/stubRoutines.hpp"
  43 #include "runtime/thread.inline.hpp"
  44 #include "utilities/align.hpp"
  45 #ifdef COMPILER2
  46 #include "opto/runtime.hpp"
  47 #endif
  48 
  49 #ifdef BUILTIN_SIM
  50 #include "../../../../../../simulator/simulator.hpp"
  51 #endif
  52 
  53 // Declaration and definition of StubGenerator (no .hpp file).
  54 // For a more detailed description of the stub routine structure
  55 // see the comment in stubRoutines.hpp
  56 
  57 #undef __
  58 #define __ _masm->
  59 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
  60 
  61 #ifdef PRODUCT
  62 #define BLOCK_COMMENT(str) /* nothing */
  63 #else
  64 #define BLOCK_COMMENT(str) __ block_comment(str)
  65 #endif
  66 
  67 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  68 
  69 // Stub Code definitions
  70 
  71 class StubGenerator: public StubCodeGenerator {
  72  private:
  73 
  74 #ifdef PRODUCT
  75 #define inc_counter_np(counter) ((void)0)
  76 #else
  77   void inc_counter_np_(int& counter) {
  78     __ lea(rscratch2, ExternalAddress((address)&counter));
  79     __ ldrw(rscratch1, Address(rscratch2));
  80     __ addw(rscratch1, rscratch1, 1);
  81     __ strw(rscratch1, Address(rscratch2));
  82   }
  83 #define inc_counter_np(counter) \
  84   BLOCK_COMMENT("inc_counter " #counter); \
  85   inc_counter_np_(counter);
  86 #endif
  87 
  88   // Call stubs are used to call Java from C
  89   //
  90   // Arguments:
  91   //    c_rarg0:   call wrapper address                   address
  92   //    c_rarg1:   result                                 address
  93   //    c_rarg2:   result type                            BasicType
  94   //    c_rarg3:   method                                 Method*
  95   //    c_rarg4:   (interpreter) entry point              address
  96   //    c_rarg5:   parameters                             intptr_t*
  97   //    c_rarg6:   parameter size (in words)              int
  98   //    c_rarg7:   thread                                 Thread*
  99   //
 100   // There is no return from the stub itself as any Java result
 101   // is written to result
 102   //
 103   // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer, copying sp (r31)
 105   // into fp.
 106   //
 107   // we save r0-r7, which accounts for all the c arguments.
 108   //
 109   // TODO: strictly do we need to save them all? they are treated as
 110   // volatile by C so could we omit saving the ones we are going to
 111   // place in global registers (thread? method?) or those we only use
 112   // during setup of the Java call?
 113   //
 114   // we don't need to save r8 which C uses as an indirect result location
 115   // return register.
 116   //
 117   // we don't need to save r9-r15 which both C and Java treat as
 118   // volatile
 119   //
 120   // we don't need to save r16-18 because Java does not use them
 121   //
 122   // we save r19-r28 which Java uses as scratch registers and C
 123   // expects to be callee-save
 124   //
 125   // we save the bottom 64 bits of each value stored in v8-v15; it is
 126   // the responsibility of the caller to preserve larger values.
 127   //
 128   // so the stub frame looks like this when we enter Java code
 129   //
 130   //     [ return_from_Java     ] <--- sp
 131   //     [ argument word n      ]
 132   //      ...
 133   // -27 [ argument word 1      ]
 134   // -26 [ saved v15            ] <--- sp_after_call
 135   // -25 [ saved v14            ]
 136   // -24 [ saved v13            ]
 137   // -23 [ saved v12            ]
 138   // -22 [ saved v11            ]
 139   // -21 [ saved v10            ]
 140   // -20 [ saved v9             ]
 141   // -19 [ saved v8             ]
 142   // -18 [ saved r28            ]
 143   // -17 [ saved r27            ]
 144   // -16 [ saved r26            ]
 145   // -15 [ saved r25            ]
 146   // -14 [ saved r24            ]
 147   // -13 [ saved r23            ]
 148   // -12 [ saved r22            ]
 149   // -11 [ saved r21            ]
 150   // -10 [ saved r20            ]
 151   //  -9 [ saved r19            ]
 152   //  -8 [ call wrapper    (r0) ]
 153   //  -7 [ result          (r1) ]
 154   //  -6 [ result type     (r2) ]
 155   //  -5 [ method          (r3) ]
 156   //  -4 [ entry point     (r4) ]
 157   //  -3 [ parameters      (r5) ]
 158   //  -2 [ parameter size  (r6) ]
 159   //  -1 [ thread (r7)          ]
 160   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 161   //   1 [ saved lr       (r30) ]
 162 
 163   // Call stub stack layout word offsets from fp
 164   enum call_stub_layout {
 165     sp_after_call_off = -26,
 166 
 167     d15_off            = -26,
 168     d13_off            = -24,
 169     d11_off            = -22,
 170     d9_off             = -20,
 171 
 172     r28_off            = -18,
 173     r26_off            = -16,
 174     r24_off            = -14,
 175     r22_off            = -12,
 176     r20_off            = -10,
 177     call_wrapper_off   =  -8,
 178     result_off         =  -7,
 179     result_type_off    =  -6,
 180     method_off         =  -5,
 181     entry_point_off    =  -4,
 182     parameter_size_off =  -2,
 183     thread_off         =  -1,
 184     fp_f               =   0,
 185     retaddr_off        =   1,
 186   };
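
  // Each *_off value above is a word offset from fp; generate_call_stub turns
  // it into an Address as (rfp, off * wordSize).  For example, thread_off == -1
  // places the saved thread pointer at rfp - 8 and retaddr_off == 1 puts the
  // saved lr at rfp + 8, matching the frame diagram above.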
 187 
 188   address generate_call_stub(address& return_address) {
 189     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 190            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 191            "adjust this code");
 192 
 193     StubCodeMark mark(this, "StubRoutines", "call_stub");
 194     address start = __ pc();
 195 
 196     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 197 
 198     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 199     const Address result        (rfp, result_off         * wordSize);
 200     const Address result_type   (rfp, result_type_off    * wordSize);
 201     const Address method        (rfp, method_off         * wordSize);
 202     const Address entry_point   (rfp, entry_point_off    * wordSize);
 203     const Address parameter_size(rfp, parameter_size_off * wordSize);
 204 
 205     const Address thread        (rfp, thread_off         * wordSize);
 206 
 207     const Address d15_save      (rfp, d15_off * wordSize);
 208     const Address d13_save      (rfp, d13_off * wordSize);
 209     const Address d11_save      (rfp, d11_off * wordSize);
 210     const Address d9_save       (rfp, d9_off * wordSize);
 211 
 212     const Address r28_save      (rfp, r28_off * wordSize);
 213     const Address r26_save      (rfp, r26_off * wordSize);
 214     const Address r24_save      (rfp, r24_off * wordSize);
 215     const Address r22_save      (rfp, r22_off * wordSize);
 216     const Address r20_save      (rfp, r20_off * wordSize);
 217 
 218     // stub code
 219 
 220     // we need a C prolog to bootstrap the x86 caller into the sim
 221     __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);
 222 
 223     address aarch64_entry = __ pc();
 224 
 225 #ifdef BUILTIN_SIM
 226     // Save sender's SP for stack traces.
 227     __ mov(rscratch1, sp);
 228     __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
 229 #endif
 230     // set up frame and move sp to end of save area
 231     __ enter();
 232     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 233 
 234     // save register parameters and Java scratch/global registers
 235     // n.b. we save thread even though it gets installed in
 236     // rthread because we want to sanity check rthread later
 237     __ str(c_rarg7,  thread);
 238     __ strw(c_rarg6, parameter_size);
 239     __ stp(c_rarg4, c_rarg5,  entry_point);
 240     __ stp(c_rarg2, c_rarg3,  result_type);
 241     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 242 
 243     __ stp(r20, r19,   r20_save);
 244     __ stp(r22, r21,   r22_save);
 245     __ stp(r24, r23,   r24_save);
 246     __ stp(r26, r25,   r26_save);
 247     __ stp(r28, r27,   r28_save);
 248 
 249     __ stpd(v9,  v8,   d9_save);
 250     __ stpd(v11, v10,  d11_save);
 251     __ stpd(v13, v12,  d13_save);
 252     __ stpd(v15, v14,  d15_save);
 253 
    // install Java thread in global register now that we have saved
    // whatever value it held
 256     __ mov(rthread, c_rarg7);
 257     // And method
 258     __ mov(rmethod, c_rarg3);
 259 
 260     // set up the heapbase register
 261     __ reinit_heapbase();
 262 
 263 #ifdef ASSERT
 264     // make sure we have no pending exceptions
 265     {
 266       Label L;
 267       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 268       __ cmp(rscratch1, (unsigned)NULL_WORD);
 269       __ br(Assembler::EQ, L);
 270       __ stop("StubRoutines::call_stub: entered with pending exception");
 271       __ BIND(L);
 272     }
 273 #endif
 274     // pass parameters if any
 275     __ mov(esp, sp);
 276     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 277     __ andr(sp, rscratch1, -2 * wordSize);
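    // n.b. the andr with -2 * wordSize (-16) rounds sp down to a 16-byte
    // boundary, which the AArch64 ABI requires of the stack pointer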
 278 
 279     BLOCK_COMMENT("pass parameters if any");
 280     Label parameters_done;
 281     // parameter count is still in c_rarg6
 282     // and parameter pointer identifying param 1 is in c_rarg5
 283     __ cbzw(c_rarg6, parameters_done);
 284 
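    // the loop below loads one word at a time from the parameter area
    // (post-incrementing c_rarg5) and pushes it; since pushes lower sp,
    // parameter 0 ends up at the highest address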
 285     address loop = __ pc();
 286     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 287     __ subsw(c_rarg6, c_rarg6, 1);
 288     __ push(rscratch1);
 289     __ br(Assembler::GT, loop);
 290 
 291     __ BIND(parameters_done);
 292 
    // call Java entry -- passing Method* and current sp
 294     //      rmethod: Method*
 295     //      r13: sender sp
 296     BLOCK_COMMENT("call Java function");
 297     __ mov(r13, sp);
 298     __ blr(c_rarg4);
 299 
 300     // tell the simulator we have returned to the stub
 301 
 302     // we do this here because the notify will already have been done
 303     // if we get to the next instruction via an exception
 304     //
 305     // n.b. adding this instruction here affects the calculation of
 306     // whether or not a routine returns to the call stub (used when
 307     // doing stack walks) since the normal test is to check the return
 308     // pc against the address saved below. so we may need to allow for
 309     // this extra instruction in the check.
 310 
 311     if (NotifySimulator) {
 312       __ notify(Assembler::method_reentry);
 313     }
 314     // save current address for use by exception handling code
 315 
 316     return_address = __ pc();
 317 
 318     // store result depending on type (everything that is not
 319     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 320     // n.b. this assumes Java returns an integral result in r0
 321     // and a floating result in j_farg0
 322     __ ldr(j_rarg2, result);
 323     Label is_long, is_float, is_double, exit;
 324     __ ldr(j_rarg1, result_type);
 325     __ cmp(j_rarg1, T_OBJECT);
 326     __ br(Assembler::EQ, is_long);
 327     __ cmp(j_rarg1, T_LONG);
 328     __ br(Assembler::EQ, is_long);
 329     __ cmp(j_rarg1, T_FLOAT);
 330     __ br(Assembler::EQ, is_float);
 331     __ cmp(j_rarg1, T_DOUBLE);
 332     __ br(Assembler::EQ, is_double);
 333 
 334     // handle T_INT case
 335     __ strw(r0, Address(j_rarg2));
 336 
 337     __ BIND(exit);
 338 
 339     // pop parameters
 340     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 341 
 342 #ifdef ASSERT
 343     // verify that threads correspond
 344     {
 345       Label L, S;
 346       __ ldr(rscratch1, thread);
 347       __ cmp(rthread, rscratch1);
 348       __ br(Assembler::NE, S);
 349       __ get_thread(rscratch1);
 350       __ cmp(rthread, rscratch1);
 351       __ br(Assembler::EQ, L);
 352       __ BIND(S);
 353       __ stop("StubRoutines::call_stub: threads must correspond");
 354       __ BIND(L);
 355     }
 356 #endif
 357 
 358     // restore callee-save registers
 359     __ ldpd(v15, v14,  d15_save);
 360     __ ldpd(v13, v12,  d13_save);
 361     __ ldpd(v11, v10,  d11_save);
 362     __ ldpd(v9,  v8,   d9_save);
 363 
 364     __ ldp(r28, r27,   r28_save);
 365     __ ldp(r26, r25,   r26_save);
 366     __ ldp(r24, r23,   r24_save);
 367     __ ldp(r22, r21,   r22_save);
 368     __ ldp(r20, r19,   r20_save);
 369 
 370     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 371     __ ldrw(c_rarg2, result_type);
 372     __ ldr(c_rarg3,  method);
 373     __ ldp(c_rarg4, c_rarg5,  entry_point);
 374     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 375 
 376 #ifndef PRODUCT
 377     // tell the simulator we are about to end Java execution
 378     if (NotifySimulator) {
 379       __ notify(Assembler::method_exit);
 380     }
 381 #endif
 382     // leave frame and return to caller
 383     __ leave();
 384     __ ret(lr);
 385 
 386     // handle return types different from T_INT
 387 
 388     __ BIND(is_long);
 389     __ str(r0, Address(j_rarg2, 0));
 390     __ br(Assembler::AL, exit);
 391 
 392     __ BIND(is_float);
 393     __ strs(j_farg0, Address(j_rarg2, 0));
 394     __ br(Assembler::AL, exit);
 395 
 396     __ BIND(is_double);
 397     __ strd(j_farg0, Address(j_rarg2, 0));
 398     __ br(Assembler::AL, exit);
 399 
 400     return start;
 401   }
 402 
 403   // Return point for a Java call if there's an exception thrown in
 404   // Java code.  The exception is caught and transformed into a
 405   // pending exception stored in JavaThread that can be tested from
 406   // within the VM.
 407   //
 408   // Note: Usually the parameters are removed by the callee. In case
 409   // of an exception crossing an activation frame boundary, that is
 410   // not the case if the callee is compiled code => need to setup the
 411   // rsp.
 412   //
 413   // r0: exception oop
 414 
 415   // NOTE: this is used as a target from the signal handler so it
 416   // needs an x86 prolog which returns into the current simulator
 417   // executing the generated catch_exception code. so the prolog
 418   // needs to install rax in a sim register and adjust the sim's
 419   // restart pc to enter the generated code at the start position
 420   // then return from native to simulated execution.
 421 
 422   address generate_catch_exception() {
 423     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 424     address start = __ pc();
 425 
 426     // same as in generate_call_stub():
 427     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 428     const Address thread        (rfp, thread_off         * wordSize);
 429 
 430 #ifdef ASSERT
 431     // verify that threads correspond
 432     {
 433       Label L, S;
 434       __ ldr(rscratch1, thread);
 435       __ cmp(rthread, rscratch1);
 436       __ br(Assembler::NE, S);
 437       __ get_thread(rscratch1);
 438       __ cmp(rthread, rscratch1);
 439       __ br(Assembler::EQ, L);
 440       __ bind(S);
 441       __ stop("StubRoutines::catch_exception: threads must correspond");
 442       __ bind(L);
 443     }
 444 #endif
 445 
 446     // set pending exception
 447     __ verify_oop(r0);
 448 
 449     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 450     __ mov(rscratch1, (address)__FILE__);
 451     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 452     __ movw(rscratch1, (int)__LINE__);
 453     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 454 
 455     // complete return to VM
 456     assert(StubRoutines::_call_stub_return_address != NULL,
 457            "_call_stub_return_address must have been generated before");
 458     __ b(StubRoutines::_call_stub_return_address);
 459 
 460     return start;
 461   }
 462 
 463   // Continuation point for runtime calls returning with a pending
 464   // exception.  The pending exception check happened in the runtime
 465   // or native call stub.  The pending exception in Thread is
 466   // converted into a Java-level exception.
 467   //
 468   // Contract with Java-level exception handlers:
 469   // r0: exception
 470   // r3: throwing pc
 471   //
 472   // NOTE: At entry of this stub, exception-pc must be in LR !!
 473 
 474   // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog
 476 
 477   address generate_forward_exception() {
 478     StubCodeMark mark(this, "StubRoutines", "forward exception");
 479     address start = __ pc();
 480 
 481     // Upon entry, LR points to the return address returning into
 482     // Java (interpreted or compiled) code; i.e., the return address
 483     // becomes the throwing pc.
 484     //
 485     // Arguments pushed before the runtime call are still on the stack
 486     // but the exception handler will reset the stack pointer ->
 487     // ignore them.  A potential result in registers can be ignored as
 488     // well.
 489 
 490 #ifdef ASSERT
 491     // make sure this code is only executed if there is a pending exception
 492     {
 493       Label L;
 494       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 495       __ cbnz(rscratch1, L);
 496       __ stop("StubRoutines::forward exception: no pending exception (1)");
 497       __ bind(L);
 498     }
 499 #endif
 500 
 501     // compute exception handler into r19
 502 
 503     // call the VM to find the handler address associated with the
 504     // caller address. pass thread in r0 and caller pc (ret address)
 505     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 506     // the stack.
 507     __ mov(c_rarg1, lr);
 508     // lr will be trashed by the VM call so we move it to R19
 509     // (callee-saved) because we also need to pass it to the handler
 510     // returned by this call.
 511     __ mov(r19, lr);
 512     BLOCK_COMMENT("call exception_handler_for_return_address");
 513     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 514                          SharedRuntime::exception_handler_for_return_address),
 515                     rthread, c_rarg1);
 516     // we should not really care that lr is no longer the callee
 517     // address. we saved the value the handler needs in r19 so we can
 518     // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM, and the VM code asserts that
 520     // the PC for the frame above the handler belongs to a compiled
 521     // Java method. So, we restore lr here to satisfy that assert.
 522     __ mov(lr, r19);
 523     // setup r0 & r3 & clear pending exception
 524     __ mov(r3, r19);
 525     __ mov(r19, r0);
 526     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 527     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 528 
 529 #ifdef ASSERT
 530     // make sure exception is set
 531     {
 532       Label L;
 533       __ cbnz(r0, L);
 534       __ stop("StubRoutines::forward exception: no pending exception (2)");
 535       __ bind(L);
 536     }
 537 #endif
 538 
 539     // continue at exception handler
 540     // r0: exception
 541     // r3: throwing pc
 542     // r19: exception handler
 543     __ verify_oop(r0);
 544     __ br(r19);
 545 
 546     return start;
 547   }
 548 
 549   // Non-destructive plausibility checks for oops
 550   //
 551   // Arguments:
 552   //    r0: oop to verify
 553   //    rscratch1: error message
 554   //
 555   // Stack after saving c_rarg3:
 556   //    [tos + 0]: saved c_rarg3
 557   //    [tos + 1]: saved c_rarg2
 558   //    [tos + 2]: saved lr
 559   //    [tos + 3]: saved rscratch2
 560   //    [tos + 4]: saved r0
 561   //    [tos + 5]: saved rscratch1
 562   address generate_verify_oop() {
 563 
 564     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 565     address start = __ pc();
 566 
 567     Label exit, error;
 568 
 569     // save c_rarg2 and c_rarg3
 570     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 571 
 572     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 573     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 574     __ ldr(c_rarg3, Address(c_rarg2));
 575     __ add(c_rarg3, c_rarg3, 1);
 576     __ str(c_rarg3, Address(c_rarg2));
 577 
 578     // object is in r0
 579     // make sure object is 'reasonable'
 580     __ cbz(r0, exit); // if obj is NULL it is OK
 581 
 582     // Check if the oop is in the right area of memory
 583     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 584     __ andr(c_rarg2, r0, c_rarg3);
 585     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 586 
 587     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 588     // instruction here because the flags register is live.
 589     __ eor(c_rarg2, c_rarg2, c_rarg3);
 590     __ cbnz(c_rarg2, error);
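    // the eor leaves zero in c_rarg2 exactly when the masked bits equal
    // verify_oop_bits, so cbnz branches to error on a mismatch without
    // disturbing the flags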
 591 
    // make sure klass is 'reasonable', i.e. not zero.
 593     __ load_klass(r0, r0);  // get klass
 594     __ cbz(r0, error);      // if klass is NULL it is broken
 595 
 596     // return if everything seems ok
 597     __ bind(exit);
 598 
 599     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 600     __ ret(lr);
 601 
 602     // handle errors
 603     __ bind(error);
 604     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 605 
 606     __ push(RegSet::range(r0, r29), sp);
 607     // debug(char* msg, int64_t pc, int64_t regs[])
 608     __ mov(c_rarg0, rscratch1);      // pass address of error message
 609     __ mov(c_rarg1, lr);             // pass return address
 610     __ mov(c_rarg2, sp);             // pass address of regs on stack
 611 #ifndef PRODUCT
 612     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 613 #endif
 614     BLOCK_COMMENT("call MacroAssembler::debug");
 615     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 616     __ blrt(rscratch1, 3, 0, 1);
 617 
 618     return start;
 619   }
 620 
 621   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 622 
 623   // Generate code for an array write pre barrier
 624   //
 625   //     addr       - starting address
 626   //     count      - element count
 627   //     tmp        - scratch register
 628   //     saved_regs - registers to be saved before calling static_write_ref_array_pre
 629   //
 630   //     Callers must specify which registers to preserve in saved_regs.
 631   //     Clobbers: r0-r18, v0-v7, v16-v31, except saved_regs.
 632   //
 633   void gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized, RegSet saved_regs) {
 634     BarrierSet* bs = Universe::heap()->barrier_set();
 635     switch (bs->kind()) {
 636     case BarrierSet::G1BarrierSet:
      // With G1, don't generate the call if we statically know that the target is uninitialized
 638       if (!dest_uninitialized) {
 639         __ push(saved_regs, sp);
 640         if (count == c_rarg0) {
 641           if (addr == c_rarg1) {
 642             // exactly backwards!!
 643             __ mov(rscratch1, c_rarg0);
 644             __ mov(c_rarg0, c_rarg1);
 645             __ mov(c_rarg1, rscratch1);
 646           } else {
 647             __ mov(c_rarg1, count);
 648             __ mov(c_rarg0, addr);
 649           }
 650         } else {
 651           __ mov(c_rarg0, addr);
 652           __ mov(c_rarg1, count);
 653         }
 654         __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2);
 655         __ pop(saved_regs, sp);
      }
      break;
    case BarrierSet::CardTableModRef:
      break;
    default:
      ShouldNotReachHere();

    }
 664   }
 665 
 666   //
 667   // Generate code for an array write post barrier
 668   //
 669   //  Input:
 670   //     start      - register containing starting address of destination array
 671   //     end        - register containing ending address of destination array
 672   //     scratch    - scratch register
 673   //     saved_regs - registers to be saved before calling static_write_ref_array_post
 674   //
 675   //  The input registers are overwritten.
 676   //  The ending address is inclusive.
 677   //  Callers must specify which registers to preserve in saved_regs.
 678   //  Clobbers: r0-r18, v0-v7, v16-v31, except saved_regs.
 679   void gen_write_ref_array_post_barrier(Register start, Register end, Register scratch, RegSet saved_regs) {
 680     assert_different_registers(start, end, scratch);
 681     BarrierSet* bs = Universe::heap()->barrier_set();
 682     switch (bs->kind()) {
 683       case BarrierSet::G1BarrierSet:
 684 
 685         {
 686           __ push(saved_regs, sp);
 687           // must compute element count unless barrier set interface is changed (other platforms supply count)
 688           assert_different_registers(start, end, scratch);
 689           __ lea(scratch, Address(end, BytesPerHeapOop));
 690           __ sub(scratch, scratch, start);               // subtract start to get #bytes
 691           __ lsr(scratch, scratch, LogBytesPerHeapOop);  // convert to element count
 692           __ mov(c_rarg0, start);
 693           __ mov(c_rarg1, scratch);
 694           __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
 695           __ pop(saved_regs, sp);
 696         }
 697         break;
 698       case BarrierSet::CardTableModRef:
 699         {
 700           CardTableModRefBS* ctbs = barrier_set_cast<CardTableModRefBS>(bs);
 701           CardTable* ct = ctbs->card_table();
 702           assert(sizeof(*ct->byte_map_base()) == sizeof(jbyte), "adjust this code");
 703 
 704           Label L_loop;
 705 
          __ lsr(start, start, CardTable::card_shift);
          __ lsr(end, end, CardTable::card_shift);
          __ sub(end, end, start); // offset of the last card byte to dirty, relative to start

          const Register count = end; // 'end' register now holds that card byte offset
 711           __ load_byte_map_base(scratch);
 712           __ add(start, start, scratch);
 713           if (UseConcMarkSweepGC) {
 714             __ membar(__ StoreStore);
 715           }
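          // the loop below stores a zero (the dirty card value) at
          // start + count, start + count - 1, ..., start + 0, i.e. it dirties
          // count + 1 card bytes in total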
 716           __ BIND(L_loop);
 717           __ strb(zr, Address(start, count));
 718           __ subs(count, count, 1);
 719           __ br(Assembler::GE, L_loop);
 720         }
 721         break;
 722       default:
 723         ShouldNotReachHere();
 724 
 725     }
 726   }
 727 
 728   // The inner part of zero_words().  This is the bulk operation,
 729   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 730   // caller is responsible for zeroing the last few words.
 731   //
 732   // Inputs:
 733   // r10: the HeapWord-aligned base address of an array to zero.
 734   // r11: the count in HeapWords, r11 > 0.
 735   //
 736   // Returns r10 and r11, adjusted for the caller to clear.
 737   // r10: the base address of the tail of words left to clear.
 738   // r11: the number of words in the tail.
 739   //      r11 < MacroAssembler::zero_words_block_size.
 740 
 741   address generate_zero_blocks() {
 742     Label store_pair, loop_store_pair, done;
 743     Label base_aligned;
 744 
 745     Register base = r10, cnt = r11;
 746 
 747     __ align(CodeEntryAlignment);
 748     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 749     address start = __ pc();
 750 
 751     if (UseBlockZeroing) {
 752       int zva_length = VM_Version::zva_length();
 753 
 754       // Ensure ZVA length can be divided by 16. This is required by
 755       // the subsequent operations.
 756       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 757 
 758       __ tbz(base, 3, base_aligned);
 759       __ str(zr, Address(__ post(base, 8)));
 760       __ sub(cnt, cnt, 1);
 761       __ bind(base_aligned);
 762 
 763       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 764       // alignment.
 765       Label small;
 766       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
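      // low_limit is in bytes while cnt is in words, hence the >> 3 in the
      // comparison below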
 767       __ subs(rscratch1, cnt, low_limit >> 3);
 768       __ br(Assembler::LT, small);
 769       __ zero_dcache_blocks(base, cnt);
 770       __ bind(small);
 771     }
 772 
 773     {
 774       // Number of stp instructions we'll unroll
 775       const int unroll =
 776         MacroAssembler::zero_words_block_size / 2;
 777       // Clear the remaining blocks.
 778       Label loop;
 779       __ subs(cnt, cnt, unroll * 2);
 780       __ br(Assembler::LT, done);
 781       __ bind(loop);
 782       for (int i = 0; i < unroll; i++)
 783         __ stp(zr, zr, __ post(base, 16));
 784       __ subs(cnt, cnt, unroll * 2);
 785       __ br(Assembler::GE, loop);
 786       __ bind(done);
 787       __ add(cnt, cnt, unroll * 2);
 788     }
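    // n.b. assuming MacroAssembler::zero_words_block_size is 8, the block
    // above unrolls to four stp instructions and clears 64 bytes per
    // iteration of the loop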
 789 
 790     __ ret(lr);
 791 
 792     return start;
 793   }
 794 
 795 
 796   typedef enum {
 797     copy_forwards = 1,
 798     copy_backwards = -1
 799   } copy_direction;
 800 
 801   // Bulk copy of blocks of 8 words.
 802   //
 803   // count is a count of words.
 804   //
 805   // Precondition: count >= 8
 806   //
 807   // Postconditions:
 808   //
 809   // The least significant bit of count contains the remaining count
 810   // of words to copy.  The rest of count is trash.
 811   //
 812   // s and d are adjusted to point to the remaining words to copy
 813   //
 814   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 815                            copy_direction direction) {
 816     int unit = wordSize * direction;
 817     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 818 
 819     int offset;
 820     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 821       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 822     const Register stride = r13;
 823 
 824     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 825     assert_different_registers(s, d, count, rscratch1);
 826 
 827     Label again, drain;
 828     const char *stub_name;
 829     if (direction == copy_forwards)
 830       stub_name = "forward_copy_longs";
 831     else
 832       stub_name = "backward_copy_longs";
 833     StubCodeMark mark(this, "StubRoutines", stub_name);
 834     __ align(CodeEntryAlignment);
 835     __ bind(start);
 836 
 837     Label unaligned_copy_long;
 838     if (AvoidUnalignedAccesses) {
 839       __ tbnz(d, 3, unaligned_copy_long);
 840     }
 841 
 842     if (direction == copy_forwards) {
 843       __ sub(s, s, bias);
 844       __ sub(d, d, bias);
 845     }
 846 
 847 #ifdef ASSERT
 848     // Make sure we are never given < 8 words
 849     {
 850       Label L;
 851       __ cmp(count, 8);
 852       __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
 854       __ bind(L);
 855     }
 856 #endif
 857 
 858     // Fill 8 registers
 859     if (UseSIMDForMemoryOps) {
 860       __ ldpq(v0, v1, Address(s, 4 * unit));
 861       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 862     } else {
 863       __ ldp(t0, t1, Address(s, 2 * unit));
 864       __ ldp(t2, t3, Address(s, 4 * unit));
 865       __ ldp(t4, t5, Address(s, 6 * unit));
 866       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 867     }
 868 
 869     __ subs(count, count, 16);
 870     __ br(Assembler::LO, drain);
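    // at this point 8 words are already buffered in registers; subtracting 16
    // above means we only stay in the main loop while at least another 8
    // words remain to be loaded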
 871 
 872     int prefetch = PrefetchCopyIntervalInBytes;
 873     bool use_stride = false;
 874     if (direction == copy_backwards) {
 875        use_stride = prefetch > 256;
 876        prefetch = -prefetch;
 877        if (use_stride) __ mov(stride, prefetch);
 878     }
 879 
 880     __ bind(again);
 881 
 882     if (PrefetchCopyIntervalInBytes > 0)
 883       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 884 
 885     if (UseSIMDForMemoryOps) {
 886       __ stpq(v0, v1, Address(d, 4 * unit));
 887       __ ldpq(v0, v1, Address(s, 4 * unit));
 888       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 889       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 890     } else {
 891       __ stp(t0, t1, Address(d, 2 * unit));
 892       __ ldp(t0, t1, Address(s, 2 * unit));
 893       __ stp(t2, t3, Address(d, 4 * unit));
 894       __ ldp(t2, t3, Address(s, 4 * unit));
 895       __ stp(t4, t5, Address(d, 6 * unit));
 896       __ ldp(t4, t5, Address(s, 6 * unit));
 897       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 898       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 899     }
 900 
 901     __ subs(count, count, 8);
 902     __ br(Assembler::HS, again);
 903 
 904     // Drain
 905     __ bind(drain);
 906     if (UseSIMDForMemoryOps) {
 907       __ stpq(v0, v1, Address(d, 4 * unit));
 908       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 909     } else {
 910       __ stp(t0, t1, Address(d, 2 * unit));
 911       __ stp(t2, t3, Address(d, 4 * unit));
 912       __ stp(t4, t5, Address(d, 6 * unit));
 913       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 914     }
 915 
 916     {
 917       Label L1, L2;
 918       __ tbz(count, exact_log2(4), L1);
 919       if (UseSIMDForMemoryOps) {
 920         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 921         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 922       } else {
 923         __ ldp(t0, t1, Address(s, 2 * unit));
 924         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 925         __ stp(t0, t1, Address(d, 2 * unit));
 926         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 927       }
 928       __ bind(L1);
 929 
 930       if (direction == copy_forwards) {
 931         __ add(s, s, bias);
 932         __ add(d, d, bias);
 933       }
 934 
 935       __ tbz(count, 1, L2);
 936       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 937       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 938       __ bind(L2);
 939     }
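    // n.b. only 16 and multiples of 8 have been subtracted from count, so
    // bits 1 and 2 still identify the 2 and 4 word tail blocks handled above;
    // bit 0, the final odd word, is left for the caller as promised in the
    // preamble comment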
 940 
 941     __ ret(lr);
 942 
 943     if (AvoidUnalignedAccesses) {
 944       Label drain, again;
 945       // Register order for storing. Order is different for backward copy.
 946 
 947       __ bind(unaligned_copy_long);
 948 
      // source address is even word aligned, target odd word aligned
 950       //
 951       // when forward copying word pairs we read long pairs at offsets
 952       // {0, 2, 4, 6} (in long words). when backwards copying we read
 953       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 954       // address by -2 in the forwards case so we can compute the
 955       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 956       // or -1.
 957       //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
      //
      // when backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.
 966 
 967       if (direction == copy_forwards) {
 968         __ sub(s, s, 16);
 969         __ sub(d, d, 8);
 970       }
 971 
 972       // Fill 8 registers
 973       //
 974       // for forwards copy s was offset by -16 from the original input
 975       // value of s so the register contents are at these offsets
 976       // relative to the 64 bit block addressed by that original input
 977       // and so on for each successive 64 byte block when s is updated
 978       //
 979       // t0 at offset 0,  t1 at offset 8
 980       // t2 at offset 16, t3 at offset 24
 981       // t4 at offset 32, t5 at offset 40
 982       // t6 at offset 48, t7 at offset 56
 983 
 984       // for backwards copy s was not offset so the register contents
 985       // are at these offsets into the preceding 64 byte block
 986       // relative to that original input and so on for each successive
 987       // preceding 64 byte block when s is updated. this explains the
 988       // slightly counter-intuitive looking pattern of register usage
 989       // in the stp instructions for backwards copy.
 990       //
 991       // t0 at offset -16, t1 at offset -8
 992       // t2 at offset -32, t3 at offset -24
 993       // t4 at offset -48, t5 at offset -40
 994       // t6 at offset -64, t7 at offset -56
 995 
 996       __ ldp(t0, t1, Address(s, 2 * unit));
 997       __ ldp(t2, t3, Address(s, 4 * unit));
 998       __ ldp(t4, t5, Address(s, 6 * unit));
 999       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
1000 
1001       __ subs(count, count, 16);
1002       __ br(Assembler::LO, drain);
1003 
1004       int prefetch = PrefetchCopyIntervalInBytes;
1005       bool use_stride = false;
1006       if (direction == copy_backwards) {
1007          use_stride = prefetch > 256;
1008          prefetch = -prefetch;
1009          if (use_stride) __ mov(stride, prefetch);
1010       }
1011 
1012       __ bind(again);
1013 
1014       if (PrefetchCopyIntervalInBytes > 0)
1015         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
1016 
1017       if (direction == copy_forwards) {
1018        // allowing for the offset of -8 the store instructions place
1019        // registers into the target 64 bit block at the following
1020        // offsets
1021        //
1022        // t0 at offset 0
1023        // t1 at offset 8,  t2 at offset 16
1024        // t3 at offset 24, t4 at offset 32
1025        // t5 at offset 40, t6 at offset 48
1026        // t7 at offset 56
1027 
1028         __ str(t0, Address(d, 1 * unit));
1029         __ stp(t1, t2, Address(d, 2 * unit));
1030         __ ldp(t0, t1, Address(s, 2 * unit));
1031         __ stp(t3, t4, Address(d, 4 * unit));
1032         __ ldp(t2, t3, Address(s, 4 * unit));
1033         __ stp(t5, t6, Address(d, 6 * unit));
1034         __ ldp(t4, t5, Address(s, 6 * unit));
1035         __ str(t7, Address(__ pre(d, 8 * unit)));
1036         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
1037       } else {
1038        // d was not offset when we started so the registers are
1039        // written into the 64 bit block preceding d with the following
1040        // offsets
1041        //
1042        // t1 at offset -8
1043        // t3 at offset -24, t0 at offset -16
       // t5 at offset -40, t2 at offset -32
1045        // t7 at offset -56, t4 at offset -48
1046        //                   t6 at offset -64
1047        //
1048        // note that this matches the offsets previously noted for the
1049        // loads
1050 
1051         __ str(t1, Address(d, 1 * unit));
1052         __ stp(t3, t0, Address(d, 3 * unit));
1053         __ ldp(t0, t1, Address(s, 2 * unit));
1054         __ stp(t5, t2, Address(d, 5 * unit));
1055         __ ldp(t2, t3, Address(s, 4 * unit));
1056         __ stp(t7, t4, Address(d, 7 * unit));
1057         __ ldp(t4, t5, Address(s, 6 * unit));
1058         __ str(t6, Address(__ pre(d, 8 * unit)));
1059         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
1060       }
1061 
1062       __ subs(count, count, 8);
1063       __ br(Assembler::HS, again);
1064 
1065       // Drain
1066       //
1067       // this uses the same pattern of offsets and register arguments
1068       // as above
1069       __ bind(drain);
1070       if (direction == copy_forwards) {
1071         __ str(t0, Address(d, 1 * unit));
1072         __ stp(t1, t2, Address(d, 2 * unit));
1073         __ stp(t3, t4, Address(d, 4 * unit));
1074         __ stp(t5, t6, Address(d, 6 * unit));
1075         __ str(t7, Address(__ pre(d, 8 * unit)));
1076       } else {
1077         __ str(t1, Address(d, 1 * unit));
1078         __ stp(t3, t0, Address(d, 3 * unit));
1079         __ stp(t5, t2, Address(d, 5 * unit));
1080         __ stp(t7, t4, Address(d, 7 * unit));
1081         __ str(t6, Address(__ pre(d, 8 * unit)));
1082       }
      // now we need to copy any remaining part block which may
      // include a 4 word subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
1087       {
1088         Label L1, L2;
1089         __ tbz(count, exact_log2(4), L1);
1090        // this is the same as above but copying only 4 longs hence
       // with only one intervening stp between the str instructions
1092        // but note that the offsets and registers still follow the
1093        // same pattern
1094         __ ldp(t0, t1, Address(s, 2 * unit));
1095         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
1096         if (direction == copy_forwards) {
1097           __ str(t0, Address(d, 1 * unit));
1098           __ stp(t1, t2, Address(d, 2 * unit));
1099           __ str(t3, Address(__ pre(d, 4 * unit)));
1100         } else {
1101           __ str(t1, Address(d, 1 * unit));
1102           __ stp(t3, t0, Address(d, 3 * unit));
1103           __ str(t2, Address(__ pre(d, 4 * unit)));
1104         }
1105         __ bind(L1);
1106 
1107         __ tbz(count, 1, L2);
1108        // this is the same as above but copying only 2 longs hence
1109        // there is no intervening stp between the str instructions
1110        // but note that the offset and register patterns are still
1111        // the same
1112         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
1113         if (direction == copy_forwards) {
1114           __ str(t0, Address(d, 1 * unit));
1115           __ str(t1, Address(__ pre(d, 2 * unit)));
1116         } else {
1117           __ str(t1, Address(d, 1 * unit));
1118           __ str(t0, Address(__ pre(d, 2 * unit)));
1119         }
1120         __ bind(L2);
1121 
       // for forwards copy we need to re-adjust the offsets we
       // applied so that s and d follow the last words written
1124 
1125        if (direction == copy_forwards) {
1126          __ add(s, s, 16);
1127          __ add(d, d, 8);
1128        }
1129 
1130       }
1131 
1132       __ ret(lr);
1133       }
1134   }
1135 
1136   // Small copy: less than 16 bytes.
1137   //
1138   // NB: Ignores all of the bits of count which represent more than 15
1139   // bytes, so a caller doesn't have to mask them.
1140 
1141   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1142     bool is_backwards = step < 0;
1143     size_t granularity = uabs(step);
1144     int direction = is_backwards ? -1 : 1;
1145     int unit = wordSize * direction;
1146 
1147     Label Lpair, Lword, Lint, Lshort, Lbyte;
1148 
1149     assert(granularity
1150            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1151 
1152     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1153 
1154     // ??? I don't know if this bit-test-and-branch is the right thing
1155     // to do.  It does a lot of jumping, resulting in several
1156     // mispredicted branches.  It might make more sense to do this
1157     // with something like Duff's device with a single computed branch.
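    // n.b. count is in elements, so the bit selecting an 8 byte chunk is
    // bit (3 - log2(granularity)): bit 3 for bytes, bit 2 for shorts, bit 1
    // for ints and bit 0 for longs, and correspondingly for the smaller
    // chunks tested below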
1158 
1159     __ tbz(count, 3 - exact_log2(granularity), Lword);
1160     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1161     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1162     __ bind(Lword);
1163 
1164     if (granularity <= sizeof (jint)) {
1165       __ tbz(count, 2 - exact_log2(granularity), Lint);
1166       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1167       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1168       __ bind(Lint);
1169     }
1170 
1171     if (granularity <= sizeof (jshort)) {
1172       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1173       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1174       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1175       __ bind(Lshort);
1176     }
1177 
1178     if (granularity <= sizeof (jbyte)) {
1179       __ tbz(count, 0, Lbyte);
1180       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1181       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1182       __ bind(Lbyte);
1183     }
1184   }
1185 
1186   Label copy_f, copy_b;
1187 
1188   // All-singing all-dancing memory copy.
1189   //
1190   // Copy count units of memory from s to d.  The size of a unit is
1191   // step, which can be positive or negative depending on the direction
1192   // of copy.  If is_aligned is false, we align the source address.
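  // For example, generate_disjoint_copy() calls
  // copy_memory(aligned, s, d, count, rscratch1, size) to copy forwards
  // while generate_conjoint_copy() passes -size to copy backwards.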
1193   //
1194 
1195   void copy_memory(bool is_aligned, Register s, Register d,
1196                    Register count, Register tmp, int step) {
1197     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1198     bool is_backwards = step < 0;
1199     int granularity = uabs(step);
1200     const Register t0 = r3, t1 = r4;
1201 
1202     // <= 96 bytes do inline. Direction doesn't matter because we always
1203     // load all the data before writing anything
1204     Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
1205     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1206     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1207     const Register send = r17, dend = r18;
1208 
1209     if (PrefetchCopyIntervalInBytes > 0)
1210       __ prfm(Address(s, 0), PLDL1KEEP);
1211     __ cmp(count, (UseSIMDForMemoryOps ? 96:80)/granularity);
1212     __ br(Assembler::HI, copy_big);
1213 
1214     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1215     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1216 
1217     __ cmp(count, 16/granularity);
1218     __ br(Assembler::LS, copy16);
1219 
1220     __ cmp(count, 64/granularity);
1221     __ br(Assembler::HI, copy80);
1222 
1223     __ cmp(count, 32/granularity);
1224     __ br(Assembler::LS, copy32);
1225 
1226     // 33..64 bytes
1227     if (UseSIMDForMemoryOps) {
1228       __ ldpq(v0, v1, Address(s, 0));
1229       __ ldpq(v2, v3, Address(send, -32));
1230       __ stpq(v0, v1, Address(d, 0));
1231       __ stpq(v2, v3, Address(dend, -32));
1232     } else {
1233       __ ldp(t0, t1, Address(s, 0));
1234       __ ldp(t2, t3, Address(s, 16));
1235       __ ldp(t4, t5, Address(send, -32));
1236       __ ldp(t6, t7, Address(send, -16));
1237 
1238       __ stp(t0, t1, Address(d, 0));
1239       __ stp(t2, t3, Address(d, 16));
1240       __ stp(t4, t5, Address(dend, -32));
1241       __ stp(t6, t7, Address(dend, -16));
1242     }
1243     __ b(finish);
1244 
1245     // 17..32 bytes
1246     __ bind(copy32);
1247     __ ldp(t0, t1, Address(s, 0));
1248     __ ldp(t2, t3, Address(send, -16));
1249     __ stp(t0, t1, Address(d, 0));
1250     __ stp(t2, t3, Address(dend, -16));
1251     __ b(finish);
1252 
1253     // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
1255     __ bind(copy80);
1256     if (UseSIMDForMemoryOps) {
1257       __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
1258       __ ldpq(v4, v5, Address(send, -32));
1259       __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
1260       __ stpq(v4, v5, Address(dend, -32));
1261     } else {
1262       __ ldp(t0, t1, Address(s, 0));
1263       __ ldp(t2, t3, Address(s, 16));
1264       __ ldp(t4, t5, Address(s, 32));
1265       __ ldp(t6, t7, Address(s, 48));
1266       __ ldp(t8, t9, Address(send, -16));
1267 
1268       __ stp(t0, t1, Address(d, 0));
1269       __ stp(t2, t3, Address(d, 16));
1270       __ stp(t4, t5, Address(d, 32));
1271       __ stp(t6, t7, Address(d, 48));
1272       __ stp(t8, t9, Address(dend, -16));
1273     }
1274     __ b(finish);
1275 
1276     // 0..16 bytes
1277     __ bind(copy16);
1278     __ cmp(count, 8/granularity);
1279     __ br(Assembler::LO, copy8);
1280 
1281     // 8..16 bytes
1282     __ ldr(t0, Address(s, 0));
1283     __ ldr(t1, Address(send, -8));
1284     __ str(t0, Address(d, 0));
1285     __ str(t1, Address(dend, -8));
1286     __ b(finish);
1287 
1288     if (granularity < 8) {
1289       // 4..7 bytes
1290       __ bind(copy8);
1291       __ tbz(count, 2 - exact_log2(granularity), copy4);
1292       __ ldrw(t0, Address(s, 0));
1293       __ ldrw(t1, Address(send, -4));
1294       __ strw(t0, Address(d, 0));
1295       __ strw(t1, Address(dend, -4));
1296       __ b(finish);
1297       if (granularity < 4) {
1298         // 0..3 bytes
1299         __ bind(copy4);
1300         __ cbz(count, finish); // get rid of 0 case
1301         if (granularity == 2) {
1302           __ ldrh(t0, Address(s, 0));
1303           __ strh(t0, Address(d, 0));
1304         } else { // granularity == 1
1305           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1306           // the first and last byte.
1307           // Handle the 3 byte case by loading and storing base + count/2
1308           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean that in the 1 byte case we load/store the same
1310           // byte 3 times.
1311           __ lsr(count, count, 1);
1312           __ ldrb(t0, Address(s, 0));
1313           __ ldrb(t1, Address(send, -1));
1314           __ ldrb(t2, Address(s, count));
1315           __ strb(t0, Address(d, 0));
1316           __ strb(t1, Address(dend, -1));
1317           __ strb(t2, Address(d, count));
1318         }
1319         __ b(finish);
1320       }
1321     }
1322 
1323     __ bind(copy_big);
1324     if (is_backwards) {
1325       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1326       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1327     }
1328 
    // Now that we've got the small case out of the way we can align the
    // source address on a 2-word boundary.
1331 
1332     Label aligned;
1333 
1334     if (is_aligned) {
1335       // We may have to adjust by 1 word to get s 2-word-aligned.
1336       __ tbz(s, exact_log2(wordSize), aligned);
1337       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1338       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1339       __ sub(count, count, wordSize/granularity);
1340     } else {
1341       if (is_backwards) {
1342         __ andr(rscratch2, s, 2 * wordSize - 1);
1343       } else {
1344         __ neg(rscratch2, s);
1345         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1346       }
1347       // rscratch2 is the byte adjustment needed to align s.
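      // (for a forward copy the neg + andr computes (-s) & 15, the distance up
      // to the next 16-byte boundary; for a backward copy s & 15 is the
      // distance down to the previous one)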
1348       __ cbz(rscratch2, aligned);
1349       int shift = exact_log2(granularity);
1350       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1351       __ sub(count, count, rscratch2);
1352 
1353 #if 0
1354       // ?? This code is only correct for a disjoint copy.  It may or
1355       // may not make sense to use it in that case.
1356 
1357       // Copy the first pair; s and d may not be aligned.
1358       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1359       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1360 
1361       // Align s and d, adjust count
1362       if (is_backwards) {
1363         __ sub(s, s, rscratch2);
1364         __ sub(d, d, rscratch2);
1365       } else {
1366         __ add(s, s, rscratch2);
1367         __ add(d, d, rscratch2);
1368       }
1369 #else
1370       copy_memory_small(s, d, rscratch2, rscratch1, step);
1371 #endif
1372     }
1373 
1374     __ bind(aligned);
1375 
1376     // s is now 2-word-aligned.
1377 
1378     // We have a count of units and some trailing bytes.  Adjust the
1379     // count and do a bulk copy of words.
1380     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1381     if (direction == copy_forwards)
1382       __ bl(copy_f);
1383     else
1384       __ bl(copy_b);
1385 
1386     // And the tail.
1387     copy_memory_small(s, d, count, tmp, step);
1388 
1389     if (granularity >= 8) __ bind(copy8);
1390     if (granularity >= 4) __ bind(copy4);
1391     __ bind(finish);
1392   }
1393 
1394 
1395   void clobber_registers() {
1396 #ifdef ASSERT
1397     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1398     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1399     for (Register r = r3; r <= r18; r++)
1400       if (r != rscratch1) __ mov(r, rscratch1);
1401 #endif
1402   }
1403 
1404   // Scan over array at a for count oops, verifying each one.
1405   // Preserves a and count, clobbers rscratch1 and rscratch2.
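  // n.b. both call sites pass r16 as temp, which is why the narrow oop
  // branch below can load into r16 and then decode temp.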
1406   void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1407     Label loop, end;
1408     __ mov(rscratch1, a);
1409     __ mov(rscratch2, zr);
1410     __ bind(loop);
1411     __ cmp(rscratch2, count);
1412     __ br(Assembler::HS, end);
1413     if (size == (size_t)wordSize) {
1414       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1415       __ verify_oop(temp);
1416     } else {
1417       __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1418       __ decode_heap_oop(temp); // calls verify_oop
1419     }
1420     __ add(rscratch2, rscratch2, size);
1421     __ b(loop);
1422     __ bind(end);
1423   }
1424 
1425   // Arguments:
1426   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1427   //             ignored
1428   //   is_oop  - true => oop array, so generate store check code
1429   //   name    - stub name string
1430   //
1431   // Inputs:
1432   //   c_rarg0   - source array address
1433   //   c_rarg1   - destination array address
1434   //   c_rarg2   - element count, treated as ssize_t, can be zero
1435   //
1436   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1437   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1439   //
1440   // Side Effects:
1441   //   disjoint_int_copy_entry is set to the no-overlap entry point
1442   //   used by generate_conjoint_int_oop_copy().
1443   //
1444   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
1445                                   const char *name, bool dest_uninitialized = false) {
1446     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1447     RegSet saved_reg = RegSet::of(s, d, count);
1448     __ align(CodeEntryAlignment);
1449     StubCodeMark mark(this, "StubRoutines", name);
1450     address start = __ pc();
1451     __ enter();
1452 
1453     if (entry != NULL) {
1454       *entry = __ pc();
1455       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1456       BLOCK_COMMENT("Entry:");
1457     }
1458 
1459     if (is_oop) {
1460       gen_write_ref_array_pre_barrier(d, count, dest_uninitialized, saved_reg);
1461       // save regs before copy_memory
1462       __ push(RegSet::of(d, count), sp);
1463     }
1464     copy_memory(aligned, s, d, count, rscratch1, size);
1465     if (is_oop) {
1466       __ pop(RegSet::of(d, count), sp);
1467       if (VerifyOops)
1468         verify_oop_array(size, d, count, r16);
1469       __ sub(count, count, 1); // make an inclusive end pointer
1470       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1471       gen_write_ref_array_post_barrier(d, count, rscratch1, RegSet());
1472     }
1473     __ leave();
1474     __ mov(r0, zr); // return 0
1475     __ ret(lr);
1476 #ifdef BUILTIN_SIM
1477     {
1478       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1479       sim->notifyCompile(const_cast<char*>(name), start);
1480     }
1481 #endif
1482     return start;
1483   }
1484 
1485   // Arguments:
1486   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1487   //             ignored
1488   //   is_oop  - true => oop array, so generate store check code
1489   //   name    - stub name string
1490   //
1491   // Inputs:
1492   //   c_rarg0   - source array address
1493   //   c_rarg1   - destination array address
1494   //   c_rarg2   - element count, treated as ssize_t, can be zero
1495   //
1496   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1497   // the hardware handle it.  The two dwords within qwords that span
1498   // cache line boundaries will still be loaded and stored atomically.
1499   //
1500   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1501                                  address *entry, const char *name,
1502                                  bool dest_uninitialized = false) {
1503     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1504     RegSet saved_regs = RegSet::of(s, d, count);
1505     StubCodeMark mark(this, "StubRoutines", name);
1506     address start = __ pc();
1507     __ enter();
1508 
1509     if (entry != NULL) {
1510       *entry = __ pc();
1511       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1512       BLOCK_COMMENT("Entry:");
1513     }
1514 
1515     // use fwd copy when (d-s) above_equal (count*size)
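         // The unsigned (HS) comparison also covers d < s: there d - s wraps
         // to a very large unsigned value, so the branch to the no-overlap
         // (forward) stub is taken.  E.g. s = 0x1000, d = 0x0ff8,
         // count*size = 0x10: d - s = 0xfffffffffffffff8, which is >= 0x10.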
1516     __ sub(rscratch1, d, s);
1517     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1518     __ br(Assembler::HS, nooverlap_target);
1519 
1520     if (is_oop) {
1521       gen_write_ref_array_pre_barrier(d, count, dest_uninitialized, saved_regs);
1522       // save regs before copy_memory
1523       __ push(RegSet::of(d, count), sp);
1524     }
1525     copy_memory(aligned, s, d, count, rscratch1, -size);
1526     if (is_oop) {
1527       __ pop(RegSet::of(d, count), sp);
1528       if (VerifyOops)
1529         verify_oop_array(size, d, count, r16);
1530       __ sub(count, count, 1); // make an inclusive end pointer
1531       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1532       gen_write_ref_array_post_barrier(d, count, rscratch1, RegSet());
1533     }
1534     __ leave();
1535     __ mov(r0, zr); // return 0
1536     __ ret(lr);
1537 #ifdef BUILTIN_SIM
1538     {
1539       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1540       sim->notifyCompile(const_cast<char*>(name), start);
1541     }
1542 #endif
1543     return start;
1544   }
1545 
1546   // Arguments:
1547   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1548   //             ignored
1549   //   name    - stub name string
1550   //
1551   // Inputs:
1552   //   c_rarg0   - source array address
1553   //   c_rarg1   - destination array address
1554   //   c_rarg2   - element count, treated as ssize_t, can be zero
1555   //
1556   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1557   // we let the hardware handle it.  The one to eight bytes within words,
1558   // dwords or qwords that span cache line boundaries will still be loaded
1559   // and stored atomically.
1560   //
1561   // Side Effects:
1569   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1570   //   used by generate_conjoint_byte_copy().
1571   //
1572   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1573     const bool not_oop = false;
1574     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1575   }
1576 
1577   // Arguments:
1578   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1579   //             ignored
1580   //   name    - stub name string
1581   //
1582   // Inputs:
1583   //   c_rarg0   - source array address
1584   //   c_rarg1   - destination array address
1585   //   c_rarg2   - element count, treated as ssize_t, can be zero
1586   //
1587   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1588   // we let the hardware handle it.  The one to eight bytes within words,
1589   // dwords or qwords that span cache line boundaries will still be loaded
1590   // and stored atomically.
1591   //
1592   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1593                                       address* entry, const char *name) {
1594     const bool not_oop = false;
1595     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1596   }
1597 
1598   // Arguments:
1599   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1600   //             ignored
1601   //   name    - stub name string
1602   //
1603   // Inputs:
1604   //   c_rarg0   - source array address
1605   //   c_rarg1   - destination array address
1606   //   c_rarg2   - element count, treated as ssize_t, can be zero
1607   //
1608   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1609   // let the hardware handle it.  The two or four words within dwords
1610   // or qwords that span cache line boundaries will still be loaded
1611   // and stored atomically.
1612   //
1613   // Side Effects:
1614   //   disjoint_short_copy_entry is set to the no-overlap entry point
1615   //   used by generate_conjoint_short_copy().
1616   //
1617   address generate_disjoint_short_copy(bool aligned,
1618                                        address* entry, const char *name) {
1619     const bool not_oop = false;
1620     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1621   }
1622 
1623   // Arguments:
1624   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1625   //             ignored
1626   //   name    - stub name string
1627   //
1628   // Inputs:
1629   //   c_rarg0   - source array address
1630   //   c_rarg1   - destination array address
1631   //   c_rarg2   - element count, treated as ssize_t, can be zero
1632   //
1633   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1634   // let the hardware handle it.  The two or four words within dwords
1635   // or qwords that span cache line boundaries will still be loaded
1636   // and stored atomically.
1637   //
1638   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1639                                        address *entry, const char *name) {
1640     const bool not_oop = false;
1641     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1642   }
1643 
1644   // Arguments:
1645   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1646   //             ignored
1647   //   name    - stub name string
1648   //
1649   // Inputs:
1650   //   c_rarg0   - source array address
1651   //   c_rarg1   - destination array address
1652   //   c_rarg2   - element count, treated as ssize_t, can be zero
1653   //
1654   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1655   // the hardware handle it.  The two dwords within qwords that span
1656   // cache line boundaries will still be loaded and stored atomically.
1657   //
1658   // Side Effects:
1659   //   disjoint_int_copy_entry is set to the no-overlap entry point
1660   //   used by generate_conjoint_int_oop_copy().
1661   //
1662   address generate_disjoint_int_copy(bool aligned, address *entry,
1663                                          const char *name, bool dest_uninitialized = false) {
1664     const bool not_oop = false;
1665     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1666   }
1667 
1668   // Arguments:
1669   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1670   //             ignored
1671   //   name    - stub name string
1672   //
1673   // Inputs:
1674   //   c_rarg0   - source array address
1675   //   c_rarg1   - destination array address
1676   //   c_rarg2   - element count, treated as ssize_t, can be zero
1677   //
1678   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1679   // the hardware handle it.  The two dwords within qwords that span
1680   // cache line boundaries will still be loaded and stored atomically.
1681   //
1682   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1683                                      address *entry, const char *name,
1684                                      bool dest_uninitialized = false) {
1685     const bool not_oop = false;
1686     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1687   }
1688 
1689 
1690   // Arguments:
1691   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1692   //             ignored
1693   //   name    - stub name string
1694   //
1695   // Inputs:
1696   //   c_rarg0   - source array address
1697   //   c_rarg1   - destination array address
1698   //   c_rarg2   - element count, treated as size_t, can be zero
1699   //
1700   // Side Effects:
1701   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1702   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1703   //
1704   address generate_disjoint_long_copy(bool aligned, address *entry,
1705                                           const char *name, bool dest_uninitialized = false) {
1706     const bool not_oop = false;
1707     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1708   }
1709 
1710   // Arguments:
1711   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1712   //             ignored
1713   //   name    - stub name string
1714   //
1715   // Inputs:
1716   //   c_rarg0   - source array address
1717   //   c_rarg1   - destination array address
1718   //   c_rarg2   - element count, treated as size_t, can be zero
1719   //
1720   address generate_conjoint_long_copy(bool aligned,
1721                                       address nooverlap_target, address *entry,
1722                                       const char *name, bool dest_uninitialized = false) {
1723     const bool not_oop = false;
1724     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1725   }
1726 
1727   // Arguments:
1728   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1729   //             ignored
1730   //   name    - stub name string
1731   //
1732   // Inputs:
1733   //   c_rarg0   - source array address
1734   //   c_rarg1   - destination array address
1735   //   c_rarg2   - element count, treated as size_t, can be zero
1736   //
1737   // Side Effects:
1738   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1739   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1740   //
1741   address generate_disjoint_oop_copy(bool aligned, address *entry,
1742                                      const char *name, bool dest_uninitialized) {
1743     const bool is_oop = true;
1744     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1745     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1746   }
1747 
1748   // Arguments:
1749   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1750   //             ignored
1751   //   name    - stub name string
1752   //
1753   // Inputs:
1754   //   c_rarg0   - source array address
1755   //   c_rarg1   - destination array address
1756   //   c_rarg2   - element count, treated as size_t, can be zero
1757   //
1758   address generate_conjoint_oop_copy(bool aligned,
1759                                      address nooverlap_target, address *entry,
1760                                      const char *name, bool dest_uninitialized) {
1761     const bool is_oop = true;
1762     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1763     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1764                                   name, dest_uninitialized);
1765   }
1766 
1767 
1768   // Helper for generating a dynamic type check.
1769   // Smashes rscratch1.
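       // Conceptually (illustrative sketch):
       //   if (sub_klass is a subtype of super_klass)   // fast path uses
       //     goto L_success;                            // super_check_offset
       //   // otherwise fall through to L_miss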
1770   void generate_type_check(Register sub_klass,
1771                            Register super_check_offset,
1772                            Register super_klass,
1773                            Label& L_success) {
1774     assert_different_registers(sub_klass, super_check_offset, super_klass);
1775 
1776     BLOCK_COMMENT("type_check:");
1777 
1778     Label L_miss;
1779 
1780     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1781                                      super_check_offset);
1782     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1783 
1784     // Fall through on failure!
1785     __ BIND(L_miss);
1786   }
1787 
1788   //
1789   //  Generate checkcasting array copy stub
1790   //
1791   //  Input:
1792   //    c_rarg0   - source array address
1793   //    c_rarg1   - destination array address
1794   //    c_rarg2   - element count, treated as ssize_t, can be zero
1795   //    c_rarg3   - size_t ckoff (super_check_offset)
1796   //    c_rarg4   - oop ckval (super_klass)
1797   //
1798   //  Output:
1799   //    r0 ==  0  -  success
1800   //    r0 == -1^K - failure, where K is partial transfer count
1801   //
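       //  Conceptually (illustrative sketch only):
       //    for (size_t i = 0; i < count; i++) {
       //      oop o = from[i];
       //      if (o != NULL && !o->klass()->is_subtype_of(ckval)) {
       //        // card-mark the i elements already copied, then report
       //        return ~i;                  // i.e. -1 ^ K with K == i
       //      }
       //      to[i] = o;
       //    }
       //    return 0;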
1802   address generate_checkcast_copy(const char *name, address *entry,
1803                                   bool dest_uninitialized = false) {
1804 
1805     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1806 
1807     // Input registers (after setup_arg_regs)
1808     const Register from        = c_rarg0;   // source array address
1809     const Register to          = c_rarg1;   // destination array address
1810     const Register count       = c_rarg2;   // elements count
1811     const Register ckoff       = c_rarg3;   // super_check_offset
1812     const Register ckval       = c_rarg4;   // super_klass
1813 
1814     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1815     RegSet wb_post_saved_regs = RegSet::of(count);
1816 
1817     // Registers used as temps (r18, r19, r20 are save-on-entry)
1818     const Register count_save  = r21;       // orig elements count
1819     const Register start_to    = r20;       // destination array start address
1820     const Register copied_oop  = r18;       // actual oop copied
1821     const Register r19_klass   = r19;       // oop._klass
1822 
1823     //---------------------------------------------------------------
1824     // Assembler stub will be used for this call to arraycopy
1825     // if the two arrays are subtypes of Object[] but the
1826     // destination array type is not equal to or a supertype
1827     // of the source type.  Each element must be separately
1828     // checked.
1829 
1830     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1831                                copied_oop, r19_klass, count_save);
1832 
1833     __ align(CodeEntryAlignment);
1834     StubCodeMark mark(this, "StubRoutines", name);
1835     address start = __ pc();
1836 
1837     __ enter(); // required for proper stackwalking of RuntimeStub frame
1838 
1839 #ifdef ASSERT
1840     // caller guarantees that the arrays really are different
1841     // otherwise, we would have to make conjoint checks
1842     { Label L;
1843       array_overlap_test(L, TIMES_OOP);
1844       __ stop("checkcast_copy within a single array");
1845       __ bind(L);
1846     }
1847 #endif //ASSERT
1848 
1849     // Caller of this entry point must set up the argument registers.
1850     if (entry != NULL) {
1851       *entry = __ pc();
1852       BLOCK_COMMENT("Entry:");
1853     }
1854 
1855      // Empty array:  Nothing to do.
1856     __ cbz(count, L_done);
1857 
1858     __ push(RegSet::of(r18, r19, r20, r21), sp);
1859 
1860 #ifdef ASSERT
1861     BLOCK_COMMENT("assert consistent ckoff/ckval");
1862     // The ckoff and ckval must be mutually consistent,
1863     // even though caller generates both.
1864     { Label L;
1865       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1866       __ ldrw(start_to, Address(ckval, sco_offset));
1867       __ cmpw(ckoff, start_to);
1868       __ br(Assembler::EQ, L);
1869       __ stop("super_check_offset inconsistent");
1870       __ bind(L);
1871     }
1872 #endif //ASSERT
1873 
1874     gen_write_ref_array_pre_barrier(to, count, dest_uninitialized, wb_pre_saved_regs);
1875 
1876     // save the original count
1877     __ mov(count_save, count);
1878 
1879     // Copy from low to high addresses
1880     __ mov(start_to, to);              // Save destination array start address
1881     __ b(L_load_element);
1882 
1883     // ======== begin loop ========
1884     // (Loop is rotated; its entry is L_load_element.)
1885     // Loop control:
1886     //   for (; count != 0; count--) {
1887     //     copied_oop = load_heap_oop(from++);
1888     //     ... generate_type_check ...;
1889     //     store_heap_oop(to++, copied_oop);
1890     //   }
1891     __ align(OptoLoopAlignment);
1892 
1893     __ BIND(L_store_element);
1894     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop);  // store the oop
1895     __ sub(count, count, 1);
1896     __ cbz(count, L_do_card_marks);
1897 
1898     // ======== loop entry is here ========
1899     __ BIND(L_load_element);
1900     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop
1901     __ cbz(copied_oop, L_store_element);
1902 
1903     __ load_klass(r19_klass, copied_oop);// query the object klass
1904     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1905     // ======== end loop ========
1906 
1907     // It was a real error; we must depend on the caller to finish the job.
1908     // Register count = remaining oops, count_orig = total oops.
1909     // Emit GC store barriers for the oops we have copied and report
1910     // their number to the caller.
1911 
1912     __ subs(count, count_save, count);     // K = partially copied oop count
1913     __ eon(count, count, zr);                   // report (-1^K) to caller
1914     __ br(Assembler::EQ, L_done_pop);
1915 
1916     __ BIND(L_do_card_marks);
1917     __ add(to, to, -heapOopSize);         // make an inclusive end pointer
1918     gen_write_ref_array_post_barrier(start_to, to, rscratch1, wb_post_saved_regs);
1919 
1920     __ bind(L_done_pop);
1921     __ pop(RegSet::of(r18, r19, r20, r21), sp);
1922     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1923 
1924     __ bind(L_done);
1925     __ mov(r0, count);
1926     __ leave();
1927     __ ret(lr);
1928 
1929     return start;
1930   }
1931 
1932   // Perform range checks on the proposed arraycopy.
1933   // Kills temp, but nothing else.
1934   // Also, clean the sign bits of src_pos and dst_pos.
1935   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1936                               Register src_pos, // source position (c_rarg1)
1937                               Register dst,     // destination array oop (c_rarg2)
1938                               Register dst_pos, // destination position (c_rarg3)
1939                               Register length,
1940                               Register temp,
1941                               Label& L_failed) {
1942     BLOCK_COMMENT("arraycopy_range_checks:");
1943 
1944     assert_different_registers(rscratch1, temp);
1945 
1946     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1947     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1948     __ addw(temp, length, src_pos);
1949     __ cmpw(temp, rscratch1);
1950     __ br(Assembler::HI, L_failed);
1951 
1952     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1953     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1954     __ addw(temp, length, dst_pos);
1955     __ cmpw(temp, rscratch1);
1956     __ br(Assembler::HI, L_failed);
1957 
1958     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1959     __ movw(src_pos, src_pos);
1960     __ movw(dst_pos, dst_pos);
1961 
1962     BLOCK_COMMENT("arraycopy_range_checks done");
1963   }
1964 
1965   // These stubs get called from some dumb test routine.
1966   // I'll write them properly when they're called from
1967   // something that's actually doing something.
1968   static void fake_arraycopy_stub(address src, address dst, int count) {
1969     assert(count == 0, "huh?");
1970   }
1971 
1972 
1973   //
1974   //  Generate 'unsafe' array copy stub
1975   //  Though just as safe as the other stubs, it takes an unscaled
1976   //  size_t argument instead of an element count.
1977   //
1978   //  Input:
1979   //    c_rarg0   - source array address
1980   //    c_rarg1   - destination array address
1981   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1982   //
1983   // Examines the alignment of the operands and dispatches
1984   // to a long, int, short, or byte copy loop.
1985   //
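       // Dispatch logic, roughly (illustrative only):
       //   if (((s | d | count) & 7) == 0)      long_copy (count >> 3);
       //   else if (((s | d | count) & 3) == 0) int_copy  (count >> 2);
       //   else if (((s | d | count) & 1) == 0) short_copy(count >> 1);
       //   else                                 byte_copy (count);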
1986   address generate_unsafe_copy(const char *name,
1987                                address byte_copy_entry,
1988                                address short_copy_entry,
1989                                address int_copy_entry,
1990                                address long_copy_entry) {
1991     Label L_long_aligned, L_int_aligned, L_short_aligned;
1992     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1993 
1994     __ align(CodeEntryAlignment);
1995     StubCodeMark mark(this, "StubRoutines", name);
1996     address start = __ pc();
1997     __ enter(); // required for proper stackwalking of RuntimeStub frame
1998 
1999     // bump this on entry, not on exit:
2000     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2001 
2002     __ orr(rscratch1, s, d);
2003     __ orr(rscratch1, rscratch1, count);
2004 
2005     __ andr(rscratch1, rscratch1, BytesPerLong-1);
2006     __ cbz(rscratch1, L_long_aligned);
2007     __ andr(rscratch1, rscratch1, BytesPerInt-1);
2008     __ cbz(rscratch1, L_int_aligned);
2009     __ tbz(rscratch1, 0, L_short_aligned);
2010     __ b(RuntimeAddress(byte_copy_entry));
2011 
2012     __ BIND(L_short_aligned);
2013     __ lsr(count, count, LogBytesPerShort);  // size => short_count
2014     __ b(RuntimeAddress(short_copy_entry));
2015     __ BIND(L_int_aligned);
2016     __ lsr(count, count, LogBytesPerInt);    // size => int_count
2017     __ b(RuntimeAddress(int_copy_entry));
2018     __ BIND(L_long_aligned);
2019     __ lsr(count, count, LogBytesPerLong);   // size => long_count
2020     __ b(RuntimeAddress(long_copy_entry));
2021 
2022     return start;
2023   }
2024 
2025   //
2026   //  Generate generic array copy stubs
2027   //
2028   //  Input:
2029   //    c_rarg0    -  src oop
2030   //    c_rarg1    -  src_pos (32-bits)
2031   //    c_rarg2    -  dst oop
2032   //    c_rarg3    -  dst_pos (32-bits)
2033   //    c_rarg4    -  element count (32-bits)
2034   //
2035   //  Output:
2036   //    r0 ==  0  -  success
2037   //    r0 == -1^K - failure, where K is partial transfer count
2038   //
2039   address generate_generic_copy(const char *name,
2040                                 address byte_copy_entry, address short_copy_entry,
2041                                 address int_copy_entry, address oop_copy_entry,
2042                                 address long_copy_entry, address checkcast_copy_entry) {
2043 
2044     Label L_failed, L_failed_0, L_objArray;
2045     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2046 
2047     // Input registers
2048     const Register src        = c_rarg0;  // source array oop
2049     const Register src_pos    = c_rarg1;  // source position
2050     const Register dst        = c_rarg2;  // destination array oop
2051     const Register dst_pos    = c_rarg3;  // destination position
2052     const Register length     = c_rarg4;
2053 
2054     StubCodeMark mark(this, "StubRoutines", name);
2055 
2056     __ align(CodeEntryAlignment);
2057     address start = __ pc();
2058 
2059     __ enter(); // required for proper stackwalking of RuntimeStub frame
2060 
2061     // bump this on entry, not on exit:
2062     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2063 
2064     //-----------------------------------------------------------------------
2065     // Assembler stub will be used for this call to arraycopy
2066     // if the following conditions are met:
2067     //
2068     // (1) src and dst must not be null.
2069     // (2) src_pos must not be negative.
2070     // (3) dst_pos must not be negative.
2071     // (4) length  must not be negative.
2072     // (5) src klass and dst klass should be the same and not NULL.
2073     // (6) src and dst should be arrays.
2074     // (7) src_pos + length must not exceed length of src.
2075     // (8) dst_pos + length must not exceed length of dst.
2076     //
2077 
2078     //  if (src == NULL) return -1;
2079     __ cbz(src, L_failed);
2080 
2081     //  if (src_pos < 0) return -1;
2082     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2083 
2084     //  if (dst == NULL) return -1;
2085     __ cbz(dst, L_failed);
2086 
2087     //  if (dst_pos < 0) return -1;
2088     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2089 
2090     // registers used as temp
2091     const Register scratch_length    = r16; // elements count to copy
2092     const Register scratch_src_klass = r17; // array klass
2093     const Register lh                = r18; // layout helper
2094 
2095     //  if (length < 0) return -1;
2096     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2097     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2098 
2099     __ load_klass(scratch_src_klass, src);
2100 #ifdef ASSERT
2101     //  assert(src->klass() != NULL);
2102     {
2103       BLOCK_COMMENT("assert klasses not null {");
2104       Label L1, L2;
2105       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2106       __ bind(L1);
2107       __ stop("broken null klass");
2108       __ bind(L2);
2109       __ load_klass(rscratch1, dst);
2110       __ cbz(rscratch1, L1);     // this would be broken also
2111       BLOCK_COMMENT("} assert klasses not null done");
2112     }
2113 #endif
2114 
2115     // Load layout helper (32-bits)
2116     //
2117     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2118     // 32        30    24            16              8     2                 0
2119     //
2120     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2121     //
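         //   e.g. (illustrative) for a jint[]: array_tag = 0x3 (typeArray),
         //   element_type = T_INT, log2_element_size = 2, and header_size is
         //   the int-array header size in bytes.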
2122 
2123     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2124 
2125     // Handle objArrays completely differently...
2126     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2127     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2128     __ movw(rscratch1, objArray_lh);
2129     __ eorw(rscratch2, lh, rscratch1);
2130     __ cbzw(rscratch2, L_objArray);
2131 
2132     //  if (src->klass() != dst->klass()) return -1;
2133     __ load_klass(rscratch2, dst);
2134     __ eor(rscratch2, rscratch2, scratch_src_klass);
2135     __ cbnz(rscratch2, L_failed);
2136 
2137     //  if (!src->is_Array()) return -1;
2138     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2139 
2140     // At this point, it is known to be a typeArray (array_tag 0x3).
2141 #ifdef ASSERT
2142     {
2143       BLOCK_COMMENT("assert primitive array {");
2144       Label L;
2145       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2146       __ cmpw(lh, rscratch2);
2147       __ br(Assembler::GE, L);
2148       __ stop("must be a primitive array");
2149       __ bind(L);
2150       BLOCK_COMMENT("} assert primitive array done");
2151     }
2152 #endif
2153 
2154     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2155                            rscratch2, L_failed);
2156 
2157     // TypeArrayKlass
2158     //
2159     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2160     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2161     //
2162 
2163     const Register rscratch1_offset = rscratch1;    // array offset
2164     const Register r18_elsize = lh; // element size
2165 
2166     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2167            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2168     __ add(src, src, rscratch1_offset);           // src array offset
2169     __ add(dst, dst, rscratch1_offset);           // dst array offset
2170     BLOCK_COMMENT("choose copy loop based on element size");
2171 
2172     // next registers should be set before the jump to corresponding stub
2173     const Register from     = c_rarg0;  // source array address
2174     const Register to       = c_rarg1;  // destination array address
2175     const Register count    = c_rarg2;  // elements count
2176 
2177     // 'from', 'to' and 'count' must be set in this order, since they
2178     // are the same registers as 'src', 'src_pos' and 'dst'.
2179 
2180     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2181 
2182     // The possible values of elsize are 0-3, i.e. exact_log2(element
2183     // size in bytes).  We do a simple bitwise binary search.
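         // i.e. (illustrative):
         //   if (elsize & 2) { (elsize & 1) ? copy longs  : copy ints;  }
         //   else            { (elsize & 1) ? copy shorts : copy bytes; }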
2184   __ BIND(L_copy_bytes);
2185     __ tbnz(r18_elsize, 1, L_copy_ints);
2186     __ tbnz(r18_elsize, 0, L_copy_shorts);
2187     __ lea(from, Address(src, src_pos));// src_addr
2188     __ lea(to,   Address(dst, dst_pos));// dst_addr
2189     __ movw(count, scratch_length); // length
2190     __ b(RuntimeAddress(byte_copy_entry));
2191 
2192   __ BIND(L_copy_shorts);
2193     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2194     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2195     __ movw(count, scratch_length); // length
2196     __ b(RuntimeAddress(short_copy_entry));
2197 
2198   __ BIND(L_copy_ints);
2199     __ tbnz(r18_elsize, 0, L_copy_longs);
2200     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2201     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2202     __ movw(count, scratch_length); // length
2203     __ b(RuntimeAddress(int_copy_entry));
2204 
2205   __ BIND(L_copy_longs);
2206 #ifdef ASSERT
2207     {
2208       BLOCK_COMMENT("assert long copy {");
2209       Label L;
2210       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
2211       __ cmpw(r18_elsize, LogBytesPerLong);
2212       __ br(Assembler::EQ, L);
2213       __ stop("must be long copy, but elsize is wrong");
2214       __ bind(L);
2215       BLOCK_COMMENT("} assert long copy done");
2216     }
2217 #endif
2218     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2219     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2220     __ movw(count, scratch_length); // length
2221     __ b(RuntimeAddress(long_copy_entry));
2222 
2223     // ObjArrayKlass
2224   __ BIND(L_objArray);
2225     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2226 
2227     Label L_plain_copy, L_checkcast_copy;
2228     //  test array classes for subtyping
2229     __ load_klass(r18, dst);
2230     __ cmp(scratch_src_klass, r18); // usual case is exact equality
2231     __ br(Assembler::NE, L_checkcast_copy);
2232 
2233     // Identically typed arrays can be copied without element-wise checks.
2234     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2235                            rscratch2, L_failed);
2236 
2237     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2238     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2239     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2240     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2241     __ movw(count, scratch_length); // length
2242   __ BIND(L_plain_copy);
2243     __ b(RuntimeAddress(oop_copy_entry));
2244 
2245   __ BIND(L_checkcast_copy);
2246     // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
2247     {
2248       // Before looking at dst.length, make sure dst is also an objArray.
2249       __ ldrw(rscratch1, Address(r18, lh_offset));
2250       __ movw(rscratch2, objArray_lh);
2251       __ eorw(rscratch1, rscratch1, rscratch2);
2252       __ cbnzw(rscratch1, L_failed);
2253 
2254       // It is safe to examine both src.length and dst.length.
2255       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2256                              r18, L_failed);
2257 
2258       const Register rscratch2_dst_klass = rscratch2;
2259       __ load_klass(rscratch2_dst_klass, dst); // reload
2260 
2261       // Marshal the base address arguments now, freeing registers.
2262       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2263       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2264       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2265       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2266       __ movw(count, length);           // length (reloaded)
2267       Register sco_temp = c_rarg3;      // this register is free now
2268       assert_different_registers(from, to, count, sco_temp,
2269                                  rscratch2_dst_klass, scratch_src_klass);
2270       // assert_clean_int(count, sco_temp);
2271 
2272       // Generate the type check.
2273       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2274       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2275       // assert_clean_int(sco_temp, r18);
2276       generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy);
2277 
2278       // Fetch destination element klass from the ObjArrayKlass header.
2279       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2280       __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset));
2281       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2282 
2283       // the checkcast_copy loop needs two extra arguments:
2284       assert(c_rarg3 == sco_temp, "#3 already in place");
2285       // Set up arguments for checkcast_copy_entry.
2286       __ mov(c_rarg4, rscratch2_dst_klass);  // dst.klass.element_klass
2287       __ b(RuntimeAddress(checkcast_copy_entry));
2288     }
2289 
2290   __ BIND(L_failed);
2291     __ mov(r0, -1);
2292     __ leave();   // required for proper stackwalking of RuntimeStub frame
2293     __ ret(lr);
2294 
2295     return start;
2296   }
2297 
2298   //
2299   // Generate stub for array fill. If "aligned" is true, the
2300   // "to" address is assumed to be heapword aligned.
2301   //
2302   // Arguments for generated stub:
2303   //   to:    c_rarg0
2304   //   value: c_rarg1
2305   //   count: c_rarg2 treated as signed
2306   //
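       // Rough shape of the generated code (illustrative sketch):
       //   1. replicate 'value' up to 64 bits (8 -> 16 -> 32 -> 64 via bfi)
       //   2. short arrays (< 8 bytes) are filled element by element
       //   3. otherwise align 'to' to 8 bytes, storing the leading elements
       //   4. fill whole 8-byte words (zero_words fast path when the value
       //      is zero and UseBlockZeroing), then finish the remaining tail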
2307   address generate_fill(BasicType t, bool aligned, const char *name) {
2308     __ align(CodeEntryAlignment);
2309     StubCodeMark mark(this, "StubRoutines", name);
2310     address start = __ pc();
2311 
2312     BLOCK_COMMENT("Entry:");
2313 
2314     const Register to        = c_rarg0;  // source array address
2315     const Register value     = c_rarg1;  // value
2316     const Register count     = c_rarg2;  // elements count
2317 
2318     const Register bz_base = r10;        // base for block_zero routine
2319     const Register cnt_words = r11;      // temp register
2320 
2321     __ enter();
2322 
2323     Label L_fill_elements, L_exit1;
2324 
2325     int shift = -1;
2326     switch (t) {
2327       case T_BYTE:
2328         shift = 0;
2329         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2330         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2331         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2332         __ br(Assembler::LO, L_fill_elements);
2333         break;
2334       case T_SHORT:
2335         shift = 1;
2336         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2337         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2338         __ br(Assembler::LO, L_fill_elements);
2339         break;
2340       case T_INT:
2341         shift = 2;
2342         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2343         __ br(Assembler::LO, L_fill_elements);
2344         break;
2345       default: ShouldNotReachHere();
2346     }
2347 
2348     // Align source address at 8 bytes address boundary.
2349     Label L_skip_align1, L_skip_align2, L_skip_align4;
2350     if (!aligned) {
2351       switch (t) {
2352         case T_BYTE:
2353           // One byte misalignment happens only for byte arrays.
2354           __ tbz(to, 0, L_skip_align1);
2355           __ strb(value, Address(__ post(to, 1)));
2356           __ subw(count, count, 1);
2357           __ bind(L_skip_align1);
2358           // Fallthrough
2359         case T_SHORT:
2360           // Two bytes misalignment happens only for byte and short (char) arrays.
2361           __ tbz(to, 1, L_skip_align2);
2362           __ strh(value, Address(__ post(to, 2)));
2363           __ subw(count, count, 2 >> shift);
2364           __ bind(L_skip_align2);
2365           // Fallthrough
2366         case T_INT:
2367           // Align to 8 bytes, we know we are 4 byte aligned to start.
2368           __ tbz(to, 2, L_skip_align4);
2369           __ strw(value, Address(__ post(to, 4)));
2370           __ subw(count, count, 4 >> shift);
2371           __ bind(L_skip_align4);
2372           break;
2373         default: ShouldNotReachHere();
2374       }
2375     }
2376 
2377     //
2378     //  Fill large chunks
2379     //
2380     __ lsrw(cnt_words, count, 3 - shift); // number of words
2381     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2382     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2383     if (UseBlockZeroing) {
2384       Label non_block_zeroing, rest;
2385       // If the fill value is zero we can use the fast zero_words().
2386       __ cbnz(value, non_block_zeroing);
2387       __ mov(bz_base, to);
2388       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2389       __ zero_words(bz_base, cnt_words);
2390       __ b(rest);
2391       __ bind(non_block_zeroing);
2392       __ fill_words(to, cnt_words, value);
2393       __ bind(rest);
2394     } else {
2395       __ fill_words(to, cnt_words, value);
2396     }
2397 
2398     // Remaining count is less than 8 bytes. Fill it by a single store.
2399     // Note that the total length is no less than 8 bytes.
2400     if (t == T_BYTE || t == T_SHORT) {
2401       Label L_exit1;
2402       __ cbzw(count, L_exit1);
2403       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2404       __ str(value, Address(to, -8));    // overwrite some elements
2405       __ bind(L_exit1);
2406       __ leave();
2407       __ ret(lr);
2408     }
2409 
2410     // Handle copies less than 8 bytes.
2411     Label L_fill_2, L_fill_4, L_exit2;
2412     __ bind(L_fill_elements);
2413     switch (t) {
2414       case T_BYTE:
2415         __ tbz(count, 0, L_fill_2);
2416         __ strb(value, Address(__ post(to, 1)));
2417         __ bind(L_fill_2);
2418         __ tbz(count, 1, L_fill_4);
2419         __ strh(value, Address(__ post(to, 2)));
2420         __ bind(L_fill_4);
2421         __ tbz(count, 2, L_exit2);
2422         __ strw(value, Address(to));
2423         break;
2424       case T_SHORT:
2425         __ tbz(count, 0, L_fill_4);
2426         __ strh(value, Address(__ post(to, 2)));
2427         __ bind(L_fill_4);
2428         __ tbz(count, 1, L_exit2);
2429         __ strw(value, Address(to));
2430         break;
2431       case T_INT:
2432         __ cbzw(count, L_exit2);
2433         __ strw(value, Address(to));
2434         break;
2435       default: ShouldNotReachHere();
2436     }
2437     __ bind(L_exit2);
2438     __ leave();
2439     __ ret(lr);
2440     return start;
2441   }
2442 
2443   void generate_arraycopy_stubs() {
2444     address entry;
2445     address entry_jbyte_arraycopy;
2446     address entry_jshort_arraycopy;
2447     address entry_jint_arraycopy;
2448     address entry_oop_arraycopy;
2449     address entry_jlong_arraycopy;
2450     address entry_checkcast_arraycopy;
2451 
2452     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2453     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2454 
2455     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2456 
2457     //*** jbyte
2458     // Always need aligned and unaligned versions
2459     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2460                                                                                   "jbyte_disjoint_arraycopy");
2461     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2462                                                                                   &entry_jbyte_arraycopy,
2463                                                                                   "jbyte_arraycopy");
2464     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2465                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2466     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2467                                                                                   "arrayof_jbyte_arraycopy");
2468 
2469     //*** jshort
2470     // Always need aligned and unaligned versions
2471     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2472                                                                                     "jshort_disjoint_arraycopy");
2473     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2474                                                                                     &entry_jshort_arraycopy,
2475                                                                                     "jshort_arraycopy");
2476     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2477                                                                                     "arrayof_jshort_disjoint_arraycopy");
2478     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2479                                                                                     "arrayof_jshort_arraycopy");
2480 
2481     //*** jint
2482     // Aligned versions
2483     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2484                                                                                 "arrayof_jint_disjoint_arraycopy");
2485     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2486                                                                                 "arrayof_jint_arraycopy");
2487     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2488     // entry_jint_arraycopy always points to the unaligned version
2489     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2490                                                                                 "jint_disjoint_arraycopy");
2491     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2492                                                                                 &entry_jint_arraycopy,
2493                                                                                 "jint_arraycopy");
2494 
2495     //*** jlong
2496     // It is always aligned
2497     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2498                                                                                   "arrayof_jlong_disjoint_arraycopy");
2499     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2500                                                                                   "arrayof_jlong_arraycopy");
2501     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2502     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2503 
2504     //*** oops
2505     {
2506       // With compressed oops we need unaligned versions; notice that
2507       // we overwrite entry_oop_arraycopy.
2508       bool aligned = !UseCompressedOops;
2509 
2510       StubRoutines::_arrayof_oop_disjoint_arraycopy
2511         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2512                                      /*dest_uninitialized*/false);
2513       StubRoutines::_arrayof_oop_arraycopy
2514         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2515                                      /*dest_uninitialized*/false);
2516       // Aligned versions without pre-barriers
2517       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2518         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2519                                      /*dest_uninitialized*/true);
2520       StubRoutines::_arrayof_oop_arraycopy_uninit
2521         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2522                                      /*dest_uninitialized*/true);
2523     }
2524 
2525     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2526     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2527     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2528     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2529 
2530     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2531     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2532                                                                         /*dest_uninitialized*/true);
2533 
2534     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2535                                                               entry_jbyte_arraycopy,
2536                                                               entry_jshort_arraycopy,
2537                                                               entry_jint_arraycopy,
2538                                                               entry_jlong_arraycopy);
2539 
2540     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2541                                                                entry_jbyte_arraycopy,
2542                                                                entry_jshort_arraycopy,
2543                                                                entry_jint_arraycopy,
2544                                                                entry_oop_arraycopy,
2545                                                                entry_jlong_arraycopy,
2546                                                                entry_checkcast_arraycopy);
2547 
2548     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2549     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2550     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2551     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2552     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2553     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2554   }
2555 
2556   void generate_math_stubs() { Unimplemented(); }
2557 
2558   // Arguments:
2559   //
2560   // Inputs:
2561   //   c_rarg0   - source byte array address
2562   //   c_rarg1   - destination byte array address
2563   //   c_rarg2   - K (key) in little endian int array
2564   //
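       // Note (illustrative): the key array length in ints selects the
       // number of rounds below: 44 ints -> AES-128 (10 rounds),
       // 52 -> AES-192 (12 rounds), 60 -> AES-256 (14 rounds).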
2565   address generate_aescrypt_encryptBlock() {
2566     __ align(CodeEntryAlignment);
2567     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2568 
2569     Label L_doLast;
2570 
2571     const Register from        = c_rarg0;  // source array address
2572     const Register to          = c_rarg1;  // destination array address
2573     const Register key         = c_rarg2;  // key array address
2574     const Register keylen      = rscratch1;
2575 
2576     address start = __ pc();
2577     __ enter();
2578 
2579     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2580 
2581     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2582 
2583     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2584     __ rev32(v1, __ T16B, v1);
2585     __ rev32(v2, __ T16B, v2);
2586     __ rev32(v3, __ T16B, v3);
2587     __ rev32(v4, __ T16B, v4);
2588     __ aese(v0, v1);
2589     __ aesmc(v0, v0);
2590     __ aese(v0, v2);
2591     __ aesmc(v0, v0);
2592     __ aese(v0, v3);
2593     __ aesmc(v0, v0);
2594     __ aese(v0, v4);
2595     __ aesmc(v0, v0);
2596 
2597     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2598     __ rev32(v1, __ T16B, v1);
2599     __ rev32(v2, __ T16B, v2);
2600     __ rev32(v3, __ T16B, v3);
2601     __ rev32(v4, __ T16B, v4);
2602     __ aese(v0, v1);
2603     __ aesmc(v0, v0);
2604     __ aese(v0, v2);
2605     __ aesmc(v0, v0);
2606     __ aese(v0, v3);
2607     __ aesmc(v0, v0);
2608     __ aese(v0, v4);
2609     __ aesmc(v0, v0);
2610 
2611     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2612     __ rev32(v1, __ T16B, v1);
2613     __ rev32(v2, __ T16B, v2);
2614 
2615     __ cmpw(keylen, 44);
2616     __ br(Assembler::EQ, L_doLast);
2617 
2618     __ aese(v0, v1);
2619     __ aesmc(v0, v0);
2620     __ aese(v0, v2);
2621     __ aesmc(v0, v0);
2622 
2623     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2624     __ rev32(v1, __ T16B, v1);
2625     __ rev32(v2, __ T16B, v2);
2626 
2627     __ cmpw(keylen, 52);
2628     __ br(Assembler::EQ, L_doLast);
2629 
2630     __ aese(v0, v1);
2631     __ aesmc(v0, v0);
2632     __ aese(v0, v2);
2633     __ aesmc(v0, v0);
2634 
2635     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2636     __ rev32(v1, __ T16B, v1);
2637     __ rev32(v2, __ T16B, v2);
2638 
2639     __ BIND(L_doLast);
2640 
2641     __ aese(v0, v1);
2642     __ aesmc(v0, v0);
2643     __ aese(v0, v2);
2644 
2645     __ ld1(v1, __ T16B, key);
2646     __ rev32(v1, __ T16B, v1);
2647     __ eor(v0, __ T16B, v0, v1);
2648 
2649     __ st1(v0, __ T16B, to);
2650 
2651     __ mov(r0, 0);
2652 
2653     __ leave();
2654     __ ret(lr);
2655 
2656     return start;
2657   }
2658 
2659   // Arguments:
2660   //
2661   // Inputs:
2662   //   c_rarg0   - source byte array address
2663   //   c_rarg1   - destination byte array address
2664   //   c_rarg2   - K (key) in little endian int array
2665   //
2666   address generate_aescrypt_decryptBlock() {
2667     assert(UseAES, "need AES cryptographic extension support");
2668     __ align(CodeEntryAlignment);
2669     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2670     Label L_doLast;
2671 
2672     const Register from        = c_rarg0;  // source array address
2673     const Register to          = c_rarg1;  // destination array address
2674     const Register key         = c_rarg2;  // key array address
2675     const Register keylen      = rscratch1;
2676 
2677     address start = __ pc();
2678     __ enter(); // required for proper stackwalking of RuntimeStub frame
2679 
2680     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2681 
2682     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2683 
2684     __ ld1(v5, __ T16B, __ post(key, 16));
2685     __ rev32(v5, __ T16B, v5);
2686 
2687     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2688     __ rev32(v1, __ T16B, v1);
2689     __ rev32(v2, __ T16B, v2);
2690     __ rev32(v3, __ T16B, v3);
2691     __ rev32(v4, __ T16B, v4);
2692     __ aesd(v0, v1);
2693     __ aesimc(v0, v0);
2694     __ aesd(v0, v2);
2695     __ aesimc(v0, v0);
2696     __ aesd(v0, v3);
2697     __ aesimc(v0, v0);
2698     __ aesd(v0, v4);
2699     __ aesimc(v0, v0);
2700 
2701     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2702     __ rev32(v1, __ T16B, v1);
2703     __ rev32(v2, __ T16B, v2);
2704     __ rev32(v3, __ T16B, v3);
2705     __ rev32(v4, __ T16B, v4);
2706     __ aesd(v0, v1);
2707     __ aesimc(v0, v0);
2708     __ aesd(v0, v2);
2709     __ aesimc(v0, v0);
2710     __ aesd(v0, v3);
2711     __ aesimc(v0, v0);
2712     __ aesd(v0, v4);
2713     __ aesimc(v0, v0);
2714 
2715     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2716     __ rev32(v1, __ T16B, v1);
2717     __ rev32(v2, __ T16B, v2);
2718 
2719     __ cmpw(keylen, 44);
2720     __ br(Assembler::EQ, L_doLast);
2721 
2722     __ aesd(v0, v1);
2723     __ aesimc(v0, v0);
2724     __ aesd(v0, v2);
2725     __ aesimc(v0, v0);
2726 
2727     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2728     __ rev32(v1, __ T16B, v1);
2729     __ rev32(v2, __ T16B, v2);
2730 
2731     __ cmpw(keylen, 52);
2732     __ br(Assembler::EQ, L_doLast);
2733 
2734     __ aesd(v0, v1);
2735     __ aesimc(v0, v0);
2736     __ aesd(v0, v2);
2737     __ aesimc(v0, v0);
2738 
2739     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2740     __ rev32(v1, __ T16B, v1);
2741     __ rev32(v2, __ T16B, v2);
2742 
2743     __ BIND(L_doLast);
2744 
2745     __ aesd(v0, v1);
2746     __ aesimc(v0, v0);
2747     __ aesd(v0, v2);
2748 
2749     __ eor(v0, __ T16B, v0, v5);
2750 
2751     __ st1(v0, __ T16B, to);
2752 
2753     __ mov(r0, 0);
2754 
2755     __ leave();
2756     __ ret(lr);
2757 
2758     return start;
2759   }
2760 
2761   // Arguments:
2762   //
2763   // Inputs:
2764   //   c_rarg0   - source byte array address
2765   //   c_rarg1   - destination byte array address
2766   //   c_rarg2   - K (key) in little endian int array
2767   //   c_rarg3   - r vector byte array address
2768   //   c_rarg4   - input length
2769   //
2770   // Output:
2771   //   x0        - input length
2772   //
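       // CBC encryption, conceptually (illustrative sketch):
       //   for each 16-byte block P[i] of the input:
       //     C[i] = AES_encrypt(P[i] ^ C[i-1], key)    // C[-1] = rvec
       //   rvec = C[last];
       //   return the original input length;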
2773   address generate_cipherBlockChaining_encryptAESCrypt() {
2774     assert(UseAES, "need AES cryptographic extension support");
2775     __ align(CodeEntryAlignment);
2776     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2777 
2778     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2779 
2780     const Register from        = c_rarg0;  // source array address
2781     const Register to          = c_rarg1;  // destination array address
2782     const Register key         = c_rarg2;  // key array address
2783     const Register rvec        = c_rarg3;  // r byte array, initialized from the initvector (IV)
2784                                            // array address; on exit it holds the last encrypted block
2785     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2786     const Register keylen      = rscratch1;
2787 
2788     address start = __ pc();
2789 
2790       __ enter();
2791 
2792       __ movw(rscratch2, len_reg);
2793 
2794       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2795 
2796       __ ld1(v0, __ T16B, rvec);
2797 
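           // Load the expanded key, branching on the key length: a 44-int
           // schedule (AES-128) needs only v21..v31, a 52-int schedule
           // (AES-192) additionally needs v19/v20, and a 60-int schedule
           // (AES-256) needs all of v17..v31.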
2798       __ cmpw(keylen, 52);
2799       __ br(Assembler::CC, L_loadkeys_44);
2800       __ br(Assembler::EQ, L_loadkeys_52);
2801 
2802       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2803       __ rev32(v17, __ T16B, v17);
2804       __ rev32(v18, __ T16B, v18);
2805     __ BIND(L_loadkeys_52);
2806       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2807       __ rev32(v19, __ T16B, v19);
2808       __ rev32(v20, __ T16B, v20);
2809     __ BIND(L_loadkeys_44);
2810       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2811       __ rev32(v21, __ T16B, v21);
2812       __ rev32(v22, __ T16B, v22);
2813       __ rev32(v23, __ T16B, v23);
2814       __ rev32(v24, __ T16B, v24);
2815       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2816       __ rev32(v25, __ T16B, v25);
2817       __ rev32(v26, __ T16B, v26);
2818       __ rev32(v27, __ T16B, v27);
2819       __ rev32(v28, __ T16B, v28);
2820       __ ld1(v29, v30, v31, __ T16B, key);
2821       __ rev32(v29, __ T16B, v29);
2822       __ rev32(v30, __ T16B, v30);
2823       __ rev32(v31, __ T16B, v31);
2824 
2825     __ BIND(L_aes_loop);
2826       __ ld1(v1, __ T16B, __ post(from, 16));
2827       __ eor(v0, __ T16B, v0, v1);
2828 
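           // The flags set by cmpw(keylen, 52) above are still valid here:
           // nothing in the loop body modifies NZCV, so the two branches
           // below re-select the number of rounds on every iteration.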
2829       __ br(Assembler::CC, L_rounds_44);
2830       __ br(Assembler::EQ, L_rounds_52);
2831 
2832       __ aese(v0, v17); __ aesmc(v0, v0);
2833       __ aese(v0, v18); __ aesmc(v0, v0);
2834     __ BIND(L_rounds_52);
2835       __ aese(v0, v19); __ aesmc(v0, v0);
2836       __ aese(v0, v20); __ aesmc(v0, v0);
2837     __ BIND(L_rounds_44);
2838       __ aese(v0, v21); __ aesmc(v0, v0);
2839       __ aese(v0, v22); __ aesmc(v0, v0);
2840       __ aese(v0, v23); __ aesmc(v0, v0);
2841       __ aese(v0, v24); __ aesmc(v0, v0);
2842       __ aese(v0, v25); __ aesmc(v0, v0);
2843       __ aese(v0, v26); __ aesmc(v0, v0);
2844       __ aese(v0, v27); __ aesmc(v0, v0);
2845       __ aese(v0, v28); __ aesmc(v0, v0);
2846       __ aese(v0, v29); __ aesmc(v0, v0);
2847       __ aese(v0, v30);
2848       __ eor(v0, __ T16B, v0, v31);
2849 
2850       __ st1(v0, __ T16B, __ post(to, 16));
2851 
2852       __ subw(len_reg, len_reg, 16);
2853       __ cbnzw(len_reg, L_aes_loop);
2854 
2855       __ st1(v0, __ T16B, rvec);
2856 
2857       __ mov(r0, rscratch2);
2858 
2859       __ leave();
2860       __ ret(lr);
2861 
2862       return start;
2863   }
2864 
2865   // Arguments:
2866   //
2867   // Inputs:
2868   //   c_rarg0   - source byte array address
2869   //   c_rarg1   - destination byte array address
2870   //   c_rarg2   - K (key) in little endian int array
2871   //   c_rarg3   - r vector byte array address
2872   //   c_rarg4   - input length
2873   //
2874   // Output:
2875   //   r0        - input length
2876   //
2877   address generate_cipherBlockChaining_decryptAESCrypt() {
2878     assert(UseAES, "need AES instructions");
2879     __ align(CodeEntryAlignment);
2880     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2881 
2882     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2883 
2884     const Register from        = c_rarg0;  // source array address
2885     const Register to          = c_rarg1;  // destination array address
2886     const Register key         = c_rarg2;  // key array address
2887     const Register rvec        = c_rarg3;  // r byte array, initialized from the initvector (IV)
2888                                            // array address; on exit it holds the last ciphertext block
2889     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2890     const Register keylen      = rscratch1;
2891 
2892     address start = __ pc();
2893 
2894       __ enter();
2895 
2896       __ movw(rscratch2, len_reg);
2897 
2898       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2899 
2900       __ ld1(v2, __ T16B, rvec);
2901 
2902       __ ld1(v31, __ T16B, __ post(key, 16));
2903       __ rev32(v31, __ T16B, v31);
2904 
2905       __ cmpw(keylen, 52);
2906       __ br(Assembler::CC, L_loadkeys_44);
2907       __ br(Assembler::EQ, L_loadkeys_52);
2908 
2909       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2910       __ rev32(v17, __ T16B, v17);
2911       __ rev32(v18, __ T16B, v18);
2912     __ BIND(L_loadkeys_52);
2913       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2914       __ rev32(v19, __ T16B, v19);
2915       __ rev32(v20, __ T16B, v20);
2916     __ BIND(L_loadkeys_44);
2917       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2918       __ rev32(v21, __ T16B, v21);
2919       __ rev32(v22, __ T16B, v22);
2920       __ rev32(v23, __ T16B, v23);
2921       __ rev32(v24, __ T16B, v24);
2922       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2923       __ rev32(v25, __ T16B, v25);
2924       __ rev32(v26, __ T16B, v26);
2925       __ rev32(v27, __ T16B, v27);
2926       __ rev32(v28, __ T16B, v28);
2927       __ ld1(v29, v30, __ T16B, key);
2928       __ rev32(v29, __ T16B, v29);
2929       __ rev32(v30, __ T16B, v30);
2930 
2931     __ BIND(L_aes_loop);
2932       __ ld1(v0, __ T16B, __ post(from, 16));
2933       __ orr(v1, __ T16B, v0, v0);
2934 
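           // v1 keeps an unmodified copy of the ciphertext block; it becomes
           // the chaining value for the next block (copied into v2 below).
           // As in the encrypt stub, the flags from cmpw(keylen, 52) survive
           // the loop body, so the round-count dispatch below stays valid.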
2935       __ br(Assembler::CC, L_rounds_44);
2936       __ br(Assembler::EQ, L_rounds_52);
2937 
2938       __ aesd(v0, v17); __ aesimc(v0, v0);
2939       __ aesd(v0, v18); __ aesimc(v0, v0);
2940     __ BIND(L_rounds_52);
2941       __ aesd(v0, v19); __ aesimc(v0, v0);
2942       __ aesd(v0, v20); __ aesimc(v0, v0);
2943     __ BIND(L_rounds_44);
2944       __ aesd(v0, v21); __ aesimc(v0, v0);
2945       __ aesd(v0, v22); __ aesimc(v0, v0);
2946       __ aesd(v0, v23); __ aesimc(v0, v0);
2947       __ aesd(v0, v24); __ aesimc(v0, v0);
2948       __ aesd(v0, v25); __ aesimc(v0, v0);
2949       __ aesd(v0, v26); __ aesimc(v0, v0);
2950       __ aesd(v0, v27); __ aesimc(v0, v0);
2951       __ aesd(v0, v28); __ aesimc(v0, v0);
2952       __ aesd(v0, v29); __ aesimc(v0, v0);
2953       __ aesd(v0, v30);
2954       __ eor(v0, __ T16B, v0, v31);
2955       __ eor(v0, __ T16B, v0, v2);
2956 
2957       __ st1(v0, __ T16B, __ post(to, 16));
2958       __ orr(v2, __ T16B, v1, v1);
2959 
2960       __ subw(len_reg, len_reg, 16);
2961       __ cbnzw(len_reg, L_aes_loop);
2962 
2963       __ st1(v2, __ T16B, rvec);
2964 
2965       __ mov(r0, rscratch2);
2966 
2967       __ leave();
2968       __ ret(lr);
2969 
2970     return start;
2971   }
2972 
2973   // Arguments:
2974   //
2975   // Inputs:
2976   //   c_rarg0   - byte[]  source+offset
2977   //   c_rarg1   - int[]   SHA.state
2978   //   c_rarg2   - int     offset
2979   //   c_rarg3   - int     limit
2980   //
2981   address generate_sha1_implCompress(bool multi_block, const char *name) {
2982     __ align(CodeEntryAlignment);
2983     StubCodeMark mark(this, "StubRoutines", name);
2984     address start = __ pc();
2985 
2986     Register buf   = c_rarg0;
2987     Register state = c_rarg1;
2988     Register ofs   = c_rarg2;
2989     Register limit = c_rarg3;
2990 
2991     Label keys;
2992     Label sha1_loop;
2993 
2994     // load the keys into v0..v3
2995     __ adr(rscratch1, keys);
2996     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
2997     // load 5 words state into v6, v7
2998     __ ldrq(v6, Address(state, 0));
2999     __ ldrs(v7, Address(state, 16));
3000 
3001 
3002     __ BIND(sha1_loop);
3003     // load 64 bytes of data into v16..v19
3004     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
3005     __ rev32(v16, __ T16B, v16);
3006     __ rev32(v17, __ T16B, v17);
3007     __ rev32(v18, __ T16B, v18);
3008     __ rev32(v19, __ T16B, v19);
3009 
3010     // do the sha1
3011     __ addv(v4, __ T4S, v16, v0);
3012     __ orr(v20, __ T16B, v6, v6);
3013 
3014     FloatRegister d0 = v16;
3015     FloatRegister d1 = v17;
3016     FloatRegister d2 = v18;
3017     FloatRegister d3 = v19;
3018 
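         // Each iteration of this loop performs four SHA-1 rounds: sha1c,
         // sha1p and sha1m supply the round function for rounds 0-19,
         // 20-39/60-79 and 40-59 respectively, while sha1su0/sha1su1 extend
         // the message schedule.  The d0..d3 aliases rotate so the schedule
         // always lives in v16..v19.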
3019     for (int round = 0; round < 20; round++) {
3020       FloatRegister tmp1 = (round & 1) ? v4 : v5;
3021       FloatRegister tmp2 = (round & 1) ? v21 : v22;
3022       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
3023       FloatRegister tmp4 = (round & 1) ? v5 : v4;
3024       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
3025 
3026       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
3027       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
3028       __ sha1h(tmp2, __ T4S, v20);
3029       if (round < 5)
3030         __ sha1c(v20, __ T4S, tmp3, tmp4);
3031       else if (round < 10 || round >= 15)
3032         __ sha1p(v20, __ T4S, tmp3, tmp4);
3033       else
3034         __ sha1m(v20, __ T4S, tmp3, tmp4);
3035       if (round < 16) __ sha1su1(d0, __ T4S, d3);
3036 
3037       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3038     }
3039 
3040     __ addv(v7, __ T2S, v7, v21);
3041     __ addv(v6, __ T4S, v6, v20);
3042 
3043     if (multi_block) {
3044       __ add(ofs, ofs, 64);
3045       __ cmp(ofs, limit);
3046       __ br(Assembler::LE, sha1_loop);
3047       __ mov(c_rarg0, ofs); // return ofs
3048     }
3049 
3050     __ strq(v6, Address(state, 0));
3051     __ strs(v7, Address(state, 16));
3052 
3053     __ ret(lr);
3054 
3055     __ bind(keys);
3056     __ emit_int32(0x5a827999);
3057     __ emit_int32(0x6ed9eba1);
3058     __ emit_int32(0x8f1bbcdc);
3059     __ emit_int32(0xca62c1d6);
3060 
3061     return start;
3062   }
3063 
3064 
3065   // Arguments:
3066   //
3067   // Inputs:
3068   //   c_rarg0   - byte[]  source+offset
3069   //   c_rarg1   - int[]   SHA.state
3070   //   c_rarg2   - int     offset
3071   //   c_rarg3   - int     limit
3072   //
3073   address generate_sha256_implCompress(bool multi_block, const char *name) {
3074     static const uint32_t round_consts[64] = {
3075       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3076       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3077       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3078       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3079       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3080       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3081       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3082       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3083       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3084       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3085       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3086       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3087       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3088       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3089       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3090       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3091     };
3092     __ align(CodeEntryAlignment);
3093     StubCodeMark mark(this, "StubRoutines", name);
3094     address start = __ pc();
3095 
3096     Register buf   = c_rarg0;
3097     Register state = c_rarg1;
3098     Register ofs   = c_rarg2;
3099     Register limit = c_rarg3;
3100 
3101     Label sha256_loop;
3102 
3103     __ stpd(v8, v9, __ pre(sp, -32));
3104     __ stpd(v10, v11, Address(sp, 16));
3105 
3106 // dga == v0
3107 // dgb == v1
3108 // dg0 == v2
3109 // dg1 == v3
3110 // dg2 == v4
3111 // t0 == v6
3112 // t1 == v7
3113 
3114     // load 16 keys to v16..v31
3115     __ lea(rscratch1, ExternalAddress((address)round_consts));
3116     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3117     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3118     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3119     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3120 
3121     // load 8 words (256 bits) state
3122     __ ldpq(v0, v1, state);
3123 
3124     __ BIND(sha256_loop);
3125     // load 64 bytes of data into v8..v11
3126     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3127     __ rev32(v8, __ T16B, v8);
3128     __ rev32(v9, __ T16B, v9);
3129     __ rev32(v10, __ T16B, v10);
3130     __ rev32(v11, __ T16B, v11);
3131 
3132     __ addv(v6, __ T4S, v8, v16);
3133     __ orr(v2, __ T16B, v0, v0);
3134     __ orr(v3, __ T16B, v1, v1);
3135 
3136     FloatRegister d0 = v8;
3137     FloatRegister d1 = v9;
3138     FloatRegister d2 = v10;
3139     FloatRegister d3 = v11;
3140 
3141 
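         // Each iteration performs four SHA-256 rounds with sha256h/sha256h2,
         // while sha256su0/sha256su1 extend the message schedule held in
         // v8..v11 (the d0..d3 aliases rotate).  The round constants sit in
         // v16..v31; as_FloatRegister(round + 17) adds the next block of
         // constants one iteration ahead of its use.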
3142     for (int round = 0; round < 16; round++) {
3143       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3144       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3145       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3146       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3147 
3148       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3149        __ orr(v4, __ T16B, v2, v2);
3150       if (round < 15)
3151         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3152       __ sha256h(v2, __ T4S, v3, tmp2);
3153       __ sha256h2(v3, __ T4S, v4, tmp2);
3154       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3155 
3156       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3157     }
3158 
3159     __ addv(v0, __ T4S, v0, v2);
3160     __ addv(v1, __ T4S, v1, v3);
3161 
3162     if (multi_block) {
3163       __ add(ofs, ofs, 64);
3164       __ cmp(ofs, limit);
3165       __ br(Assembler::LE, sha256_loop);
3166       __ mov(c_rarg0, ofs); // return ofs
3167     }
3168 
3169     __ ldpd(v10, v11, Address(sp, 16));
3170     __ ldpd(v8, v9, __ post(sp, 32));
3171 
3172     __ stpq(v0, v1, state);
3173 
3174     __ ret(lr);
3175 
3176     return start;
3177   }
3178 
3179 #ifndef BUILTIN_SIM
3180   // Safefetch stubs.
3181   void generate_safefetch(const char* name, int size, address* entry,
3182                           address* fault_pc, address* continuation_pc) {
3183     // safefetch signatures:
3184     //   int      SafeFetch32(int*      adr, int      errValue);
3185     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3186     //
3187     // arguments:
3188     //   c_rarg0 = adr
3189     //   c_rarg1 = errValue
3190     //
3191     // result:
3192     //   r0       = *adr or errValue
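         //
         // If the load at *fault_pc faults, the signal handler resumes
         // execution at *continuation_pc; c_rarg1 then still holds errValue,
         // which the continuation code copies to r0.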
3193 
3194     StubCodeMark mark(this, "StubRoutines", name);
3195 
3196     // Entry point, pc or function descriptor.
3197     *entry = __ pc();
3198 
3199     // Load *adr into c_rarg1, may fault.
3200     *fault_pc = __ pc();
3201     switch (size) {
3202       case 4:
3203         // int32_t
3204         __ ldrw(c_rarg1, Address(c_rarg0, 0));
3205         break;
3206       case 8:
3207         // int64_t
3208         __ ldr(c_rarg1, Address(c_rarg0, 0));
3209         break;
3210       default:
3211         ShouldNotReachHere();
3212     }
3213 
3214     // return errValue or *adr
3215     *continuation_pc = __ pc();
3216     __ mov(r0, c_rarg1);
3217     __ ret(lr);
3218   }
3219 #endif
3220 
3221   /**
3222    *  Arguments:
3223    *
3224    * Inputs:
3225    *   c_rarg0   - int crc
3226    *   c_rarg1   - byte* buf
3227    *   c_rarg2   - int length
3228    *
3229    * Output:
3230    *       r0    - int crc result
3231    */
3232   address generate_updateBytesCRC32() {
3233     assert(UseCRC32Intrinsics, "what are we doing here?");
3234 
3235     __ align(CodeEntryAlignment);
3236     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3237 
3238     address start = __ pc();
3239 
3240     const Register crc   = c_rarg0;  // crc
3241     const Register buf   = c_rarg1;  // source java byte array address
3242     const Register len   = c_rarg2;  // length
3243     const Register table0 = c_rarg3; // crc_table address
3244     const Register table1 = c_rarg4;
3245     const Register table2 = c_rarg5;
3246     const Register table3 = c_rarg6;
3247     const Register tmp3 = c_rarg7;
3248 
3249     BLOCK_COMMENT("Entry:");
3250     __ enter(); // required for proper stackwalking of RuntimeStub frame
3251 
3252     __ kernel_crc32(crc, buf, len,
3253               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3254 
3255     __ leave(); // required for proper stackwalking of RuntimeStub frame
3256     __ ret(lr);
3257 
3258     return start;
3259   }
3260 
3261   /**
3262    *  Arguments:
3263    *
3264    * Inputs:
3265    *   c_rarg0   - int crc
3266    *   c_rarg1   - byte* buf
3267    *   c_rarg2   - int length
3268    *   c_rarg3   - int* table
3269    *
3270    * Output:
3271    *       r0   - int crc result
3272    */
3273   address generate_updateBytesCRC32C() {
3274     assert(UseCRC32CIntrinsics, "what are we doing here?");
3275 
3276     __ align(CodeEntryAlignment);
3277     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3278 
3279     address start = __ pc();
3280 
3281     const Register crc   = c_rarg0;  // crc
3282     const Register buf   = c_rarg1;  // source java byte array address
3283     const Register len   = c_rarg2;  // length
3284     const Register table0 = c_rarg3; // crc_table address
3285     const Register table1 = c_rarg4;
3286     const Register table2 = c_rarg5;
3287     const Register table3 = c_rarg6;
3288     const Register tmp3 = c_rarg7;
3289 
3290     BLOCK_COMMENT("Entry:");
3291     __ enter(); // required for proper stackwalking of RuntimeStub frame
3292 
3293     __ kernel_crc32c(crc, buf, len,
3294               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3295 
3296     __ leave(); // required for proper stackwalking of RuntimeStub frame
3297     __ ret(lr);
3298 
3299     return start;
3300   }
3301 
3302   /***
3303    *  Arguments:
3304    *
3305    *  Inputs:
3306    *   c_rarg0   - int   adler
3307    *   c_rarg1   - byte* buff
3308    *   c_rarg2   - int   len
3309    *
3310    * Output:
3311    *   c_rarg0   - int adler result
3312    */
3313   address generate_updateBytesAdler32() {
3314     __ align(CodeEntryAlignment);
3315     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3316     address start = __ pc();
3317 
3318     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3319 
3320     // Aliases
3321     Register adler  = c_rarg0;
3322     Register s1     = c_rarg0;
3323     Register s2     = c_rarg3;
3324     Register buff   = c_rarg1;
3325     Register len    = c_rarg2;
3326     Register nmax  = r4;
3327     Register base = r5;
3328     Register count = r6;
3329     Register temp0 = rscratch1;
3330     Register temp1 = rscratch2;
3331     Register temp2 = r7;
3332 
3333     // Max number of bytes we can process before having to take the mod
3334     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3335     unsigned long BASE = 0xfff1;
3336     unsigned long NMAX = 0x15B0;
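         // Reference (scalar) form of the update computed below:
         //   for (i = 0; i < len; i++) { s1 += buff[i]; s2 += s1; }
         //   s1 %= BASE; s2 %= BASE;
         // The modulo is deferred: at most NMAX bytes are accumulated before
         // a reduction, so the sums stay in range (see the comment above).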
3337 
3338     __ mov(base, BASE);
3339     __ mov(nmax, NMAX);
3340 
3341     // s1 is initialized to the lower 16 bits of adler
3342     // s2 is initialized to the upper 16 bits of adler
3343     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
3344     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
3345 
3346     // The pipelined loop needs at least 16 elements for one iteration.  It
3347     // checks this itself, but it is cheaper to branch straight to the cleanup loop.
3348     __ cmp(len, 16);
3349     __ br(Assembler::HS, L_nmax);
3350     __ cbz(len, L_combine);
3351 
3352     __ bind(L_simple_by1_loop);
3353     __ ldrb(temp0, Address(__ post(buff, 1)));
3354     __ add(s1, s1, temp0);
3355     __ add(s2, s2, s1);
3356     __ subs(len, len, 1);
3357     __ br(Assembler::HI, L_simple_by1_loop);
3358 
3359     // s1 = s1 % BASE
3360     __ subs(temp0, s1, base);
3361     __ csel(s1, temp0, s1, Assembler::HS);
3362 
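         // The reductions here and below use the identity 2^16 == 15 (mod BASE):
         // x mod BASE folds to (x >> 16) * 15 + (x & 0xffff), with
         // (x >> 16) * 15 computed as ((x >> 16) << 4) - (x >> 16); a final
         // conditional subtract then brings the result below BASE.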
3363     // s2 = s2 % BASE
3364     __ lsr(temp0, s2, 16);
3365     __ lsl(temp1, temp0, 4);
3366     __ sub(temp1, temp1, temp0);
3367     __ add(s2, temp1, s2, ext::uxth);
3368 
3369     __ subs(temp0, s2, base);
3370     __ csel(s2, temp0, s2, Assembler::HS);
3371 
3372     __ b(L_combine);
3373 
3374     __ bind(L_nmax);
3375     __ subs(len, len, nmax);
3376     __ sub(count, nmax, 16);
3377     __ br(Assembler::LO, L_by16);
3378 
3379     __ bind(L_nmax_loop);
3380 
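         // Process 16 bytes per iteration: load two 64-bit words and apply the
         // per-byte recurrence s1 += b; s2 += s1, extracting each byte with
         // ubfx (and a final LSR #56 for the top byte).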
3381     __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3382 
3383     __ add(s1, s1, temp0, ext::uxtb);
3384     __ ubfx(temp2, temp0, 8, 8);
3385     __ add(s2, s2, s1);
3386     __ add(s1, s1, temp2);
3387     __ ubfx(temp2, temp0, 16, 8);
3388     __ add(s2, s2, s1);
3389     __ add(s1, s1, temp2);
3390     __ ubfx(temp2, temp0, 24, 8);
3391     __ add(s2, s2, s1);
3392     __ add(s1, s1, temp2);
3393     __ ubfx(temp2, temp0, 32, 8);
3394     __ add(s2, s2, s1);
3395     __ add(s1, s1, temp2);
3396     __ ubfx(temp2, temp0, 40, 8);
3397     __ add(s2, s2, s1);
3398     __ add(s1, s1, temp2);
3399     __ ubfx(temp2, temp0, 48, 8);
3400     __ add(s2, s2, s1);
3401     __ add(s1, s1, temp2);
3402     __ add(s2, s2, s1);
3403     __ add(s1, s1, temp0, Assembler::LSR, 56);
3404     __ add(s2, s2, s1);
3405 
3406     __ add(s1, s1, temp1, ext::uxtb);
3407     __ ubfx(temp2, temp1, 8, 8);
3408     __ add(s2, s2, s1);
3409     __ add(s1, s1, temp2);
3410     __ ubfx(temp2, temp1, 16, 8);
3411     __ add(s2, s2, s1);
3412     __ add(s1, s1, temp2);
3413     __ ubfx(temp2, temp1, 24, 8);
3414     __ add(s2, s2, s1);
3415     __ add(s1, s1, temp2);
3416     __ ubfx(temp2, temp1, 32, 8);
3417     __ add(s2, s2, s1);
3418     __ add(s1, s1, temp2);
3419     __ ubfx(temp2, temp1, 40, 8);
3420     __ add(s2, s2, s1);
3421     __ add(s1, s1, temp2);
3422     __ ubfx(temp2, temp1, 48, 8);
3423     __ add(s2, s2, s1);
3424     __ add(s1, s1, temp2);
3425     __ add(s2, s2, s1);
3426     __ add(s1, s1, temp1, Assembler::LSR, 56);
3427     __ add(s2, s2, s1);
3428 
3429     __ subs(count, count, 16);
3430     __ br(Assembler::HS, L_nmax_loop);
3431 
3432     // s1 = s1 % BASE
3433     __ lsr(temp0, s1, 16);
3434     __ lsl(temp1, temp0, 4);
3435     __ sub(temp1, temp1, temp0);
3436     __ add(temp1, temp1, s1, ext::uxth);
3437 
3438     __ lsr(temp0, temp1, 16);
3439     __ lsl(s1, temp0, 4);
3440     __ sub(s1, s1, temp0);
3441     __ add(s1, s1, temp1, ext::uxth);
3442 
3443     __ subs(temp0, s1, base);
3444     __ csel(s1, temp0, s1, Assembler::HS);
3445 
3446     // s2 = s2 % BASE
3447     __ lsr(temp0, s2, 16);
3448     __ lsl(temp1, temp0, 4);
3449     __ sub(temp1, temp1, temp0);
3450     __ add(temp1, temp1, s2, ext::uxth);
3451 
3452     __ lsr(temp0, temp1, 16);
3453     __ lsl(s2, temp0, 4);
3454     __ sub(s2, s2, temp0);
3455     __ add(s2, s2, temp1, ext::uxth);
3456 
3457     __ subs(temp0, s2, base);
3458     __ csel(s2, temp0, s2, Assembler::HS);
3459 
3460     __ subs(len, len, nmax);
3461     __ sub(count, nmax, 16);
3462     __ br(Assembler::HS, L_nmax_loop);
3463 
3464     __ bind(L_by16);
3465     __ adds(len, len, count);
3466     __ br(Assembler::LO, L_by1);
3467 
3468     __ bind(L_by16_loop);
3469 
3470     __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3471 
3472     __ add(s1, s1, temp0, ext::uxtb);
3473     __ ubfx(temp2, temp0, 8, 8);
3474     __ add(s2, s2, s1);
3475     __ add(s1, s1, temp2);
3476     __ ubfx(temp2, temp0, 16, 8);
3477     __ add(s2, s2, s1);
3478     __ add(s1, s1, temp2);
3479     __ ubfx(temp2, temp0, 24, 8);
3480     __ add(s2, s2, s1);
3481     __ add(s1, s1, temp2);
3482     __ ubfx(temp2, temp0, 32, 8);
3483     __ add(s2, s2, s1);
3484     __ add(s1, s1, temp2);
3485     __ ubfx(temp2, temp0, 40, 8);
3486     __ add(s2, s2, s1);
3487     __ add(s1, s1, temp2);
3488     __ ubfx(temp2, temp0, 48, 8);
3489     __ add(s2, s2, s1);
3490     __ add(s1, s1, temp2);
3491     __ add(s2, s2, s1);
3492     __ add(s1, s1, temp0, Assembler::LSR, 56);
3493     __ add(s2, s2, s1);
3494 
3495     __ add(s1, s1, temp1, ext::uxtb);
3496     __ ubfx(temp2, temp1, 8, 8);
3497     __ add(s2, s2, s1);
3498     __ add(s1, s1, temp2);
3499     __ ubfx(temp2, temp1, 16, 8);
3500     __ add(s2, s2, s1);
3501     __ add(s1, s1, temp2);
3502     __ ubfx(temp2, temp1, 24, 8);
3503     __ add(s2, s2, s1);
3504     __ add(s1, s1, temp2);
3505     __ ubfx(temp2, temp1, 32, 8);
3506     __ add(s2, s2, s1);
3507     __ add(s1, s1, temp2);
3508     __ ubfx(temp2, temp1, 40, 8);
3509     __ add(s2, s2, s1);
3510     __ add(s1, s1, temp2);
3511     __ ubfx(temp2, temp1, 48, 8);
3512     __ add(s2, s2, s1);
3513     __ add(s1, s1, temp2);
3514     __ add(s2, s2, s1);
3515     __ add(s1, s1, temp1, Assembler::LSR, 56);
3516     __ add(s2, s2, s1);
3517 
3518     __ subs(len, len, 16);
3519     __ br(Assembler::HS, L_by16_loop);
3520 
3521     __ bind(L_by1);
3522     __ adds(len, len, 15);
3523     __ br(Assembler::LO, L_do_mod);
3524 
3525     __ bind(L_by1_loop);
3526     __ ldrb(temp0, Address(__ post(buff, 1)));
3527     __ add(s1, temp0, s1);
3528     __ add(s2, s2, s1);
3529     __ subs(len, len, 1);
3530     __ br(Assembler::HS, L_by1_loop);
3531 
3532     __ bind(L_do_mod);
3533     // s1 = s1 % BASE
3534     __ lsr(temp0, s1, 16);
3535     __ lsl(temp1, temp0, 4);
3536     __ sub(temp1, temp1, temp0);
3537     __ add(temp1, temp1, s1, ext::uxth);
3538 
3539     __ lsr(temp0, temp1, 16);
3540     __ lsl(s1, temp0, 4);
3541     __ sub(s1, s1, temp0);
3542     __ add(s1, s1, temp1, ext::uxth);
3543 
3544     __ subs(temp0, s1, base);
3545     __ csel(s1, temp0, s1, Assembler::HS);
3546 
3547     // s2 = s2 % BASE
3548     __ lsr(temp0, s2, 16);
3549     __ lsl(temp1, temp0, 4);
3550     __ sub(temp1, temp1, temp0);
3551     __ add(temp1, temp1, s2, ext::uxth);
3552 
3553     __ lsr(temp0, temp1, 16);
3554     __ lsl(s2, temp0, 4);
3555     __ sub(s2, s2, temp0);
3556     __ add(s2, s2, temp1, ext::uxth);
3557 
3558     __ subs(temp0, s2, base);
3559     __ csel(s2, temp0, s2, Assembler::HS);
3560 
3561     // Combine lower bits and higher bits
3562     __ bind(L_combine);
3563     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3564 
3565     __ ret(lr);
3566 
3567     return start;
3568   }
3569 
3570   /**
3571    *  Arguments:
3572    *
3573    *  Input:
3574    *    c_rarg0   - x address
3575    *    c_rarg1   - x length
3576    *    c_rarg2   - y address
3577    *    c_rarg3   - y length
3578    *    c_rarg4   - z address
3579    *    c_rarg5   - z length
3580    */
3581   address generate_multiplyToLen() {
3582     __ align(CodeEntryAlignment);
3583     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3584 
3585     address start = __ pc();
3586     const Register x     = r0;
3587     const Register xlen  = r1;
3588     const Register y     = r2;
3589     const Register ylen  = r3;
3590     const Register z     = r4;
3591     const Register zlen  = r5;
3592 
3593     const Register tmp1  = r10;
3594     const Register tmp2  = r11;
3595     const Register tmp3  = r12;
3596     const Register tmp4  = r13;
3597     const Register tmp5  = r14;
3598     const Register tmp6  = r15;
3599     const Register tmp7  = r16;
3600 
3601     BLOCK_COMMENT("Entry:");
3602     __ enter(); // required for proper stackwalking of RuntimeStub frame
3603     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3604     __ leave(); // required for proper stackwalking of RuntimeStub frame
3605     __ ret(lr);
3606 
3607     return start;
3608   }
3609 
3610   address generate_squareToLen() {
3611     // The squareToLen algorithm for sizes 1..127 described in the Java code
3612     // works faster than multiply_to_len on some CPUs and slower on others,
3613     // but multiply_to_len shows slightly better results overall.
3614     __ align(CodeEntryAlignment);
3615     StubCodeMark mark(this, "StubRoutines", "squareToLen");
3616     address start = __ pc();
3617 
3618     const Register x     = r0;
3619     const Register xlen  = r1;
3620     const Register z     = r2;
3621     const Register zlen  = r3;
3622     const Register y     = r4; // == x
3623     const Register ylen  = r5; // == xlen
3624 
3625     const Register tmp1  = r10;
3626     const Register tmp2  = r11;
3627     const Register tmp3  = r12;
3628     const Register tmp4  = r13;
3629     const Register tmp5  = r14;
3630     const Register tmp6  = r15;
3631     const Register tmp7  = r16;
3632 
3633     RegSet spilled_regs = RegSet::of(y, ylen);
3634     BLOCK_COMMENT("Entry:");
3635     __ enter();
3636     __ push(spilled_regs, sp);
3637     __ mov(y, x);
3638     __ mov(ylen, xlen);
3639     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3640     __ pop(spilled_regs, sp);
3641     __ leave();
3642     __ ret(lr);
3643     return start;
3644   }
3645 
3646   address generate_mulAdd() {
3647     __ align(CodeEntryAlignment);
3648     StubCodeMark mark(this, "StubRoutines", "mulAdd");
3649 
3650     address start = __ pc();
3651 
3652     const Register out     = r0;
3653     const Register in      = r1;
3654     const Register offset  = r2;
3655     const Register len     = r3;
3656     const Register k       = r4;
3657 
3658     BLOCK_COMMENT("Entry:");
3659     __ enter();
3660     __ mul_add(out, in, offset, len, k);
3661     __ leave();
3662     __ ret(lr);
3663 
3664     return start;
3665   }
3666 
3667   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3668                       FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3669                       FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3670     // Karatsuba multiplication performs a 128*128 -> 256-bit
3671     // multiplication in three 128-bit multiplications and a few
3672     // additions.
3673     //
3674     // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3675     // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3676     //
3677     // Inputs:
3678     //
3679     // A0 in a.d[0]     (subkey)
3680     // A1 in a.d[1]
3681     // (A1+A0) in a1_xor_a0.d[0]
3682     //
3683     // B0 in b.d[0]     (state)
3684     // B1 in b.d[1]
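         //
         // Because the arithmetic is carry-less, the "+" above is XOR, so the
         // middle 128-bit term is E ^ C ^ D and the 256-bit product is
         //   (C1:C0) << 128  ^  (E^C^D) << 64  ^  (D1:D0)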
3685 
3686     __ ext(tmp1, __ T16B, b, b, 0x08);
3687     __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
3688     __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
3689     __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
3690     __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3691 
3692     __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3693     __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3694     __ eor(tmp2, __ T16B, tmp2, tmp4);
3695     __ eor(tmp2, __ T16B, tmp2, tmp3);
3696 
3697     // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3698     __ ins(result_hi, __ D, tmp2, 0, 1);
3699     __ ins(result_lo, __ D, tmp2, 1, 0);
3700   }
3701 
3702   void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3703                     FloatRegister p, FloatRegister z, FloatRegister t1) {
3704     const FloatRegister t0 = result;
3705 
3706     // The GCM field polynomial f is z^128 + p(z), where p =
3707     // z^7+z^2+z+1.
3708     //
3709     //    z^128 === -p(z)  (mod (z^128 + p(z)))
3710     //
3711     // so, given that the product we're reducing is
3712     //    a == lo + hi * z^128
3713     // substituting,
3714     //      === lo - hi * p(z)  (mod (z^128 + p(z)))
3715     //
3716     // we reduce by multiplying hi by p(z) and subtracting the result
3717     // from (i.e. XORing it with) lo.  Because p has no nonzero high
3718     // bits we can do this with two 64-bit multiplications, lo*p and
3719     // hi*p.
3720 
3721     __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3722     __ ext(t1, __ T16B, t0, z, 8);
3723     __ eor(hi, __ T16B, hi, t1);
3724     __ ext(t1, __ T16B, z, t0, 8);
3725     __ eor(lo, __ T16B, lo, t1);
3726     __ pmull(t0, __ T1Q, hi, p, __ T1D);
3727     __ eor(result, __ T16B, lo, t0);
3728   }
3729 
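       // Arguments:
       //
       // Input:
       //   r1       - byte array address
       //   r2       - length (in bytes)
       //
       // Output:
       //   r0       - 1 if any byte in the array has its sign bit set
       //              (i.e. is negative), 0 otherwise
       //
       // has_negatives_long is set to a second entry point, used for long arrays.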
3730   address generate_has_negatives(address &has_negatives_long) {
3731     StubCodeMark mark(this, "StubRoutines", "has_negatives");
3732     const int large_loop_size = 64;
3733     const uint64_t UPPER_BIT_MASK=0x8080808080808080;
3734     int dcache_line = VM_Version::dcache_line_size();
3735 
3736     Register ary1 = r1, len = r2, result = r0;
3737 
3738     __ align(CodeEntryAlignment);
3739     address entry = __ pc();
3740 
3741     __ enter();
3742 
3743   Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
3744         LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
3745 
3746   __ cmp(len, 15);
3747   __ br(Assembler::GT, LEN_OVER_15);
3748   // The only case when execution falls into this code is when the pointer is
3749   // near the end of a memory page and we have to avoid reading the next page
3750   __ add(ary1, ary1, len);
3751   __ subs(len, len, 8);
3752   __ br(Assembler::GT, LEN_OVER_8);
3753   __ ldr(rscratch2, Address(ary1, -8));
3754   __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
3755   __ lsrv(rscratch2, rscratch2, rscratch1);
3756   __ tst(rscratch2, UPPER_BIT_MASK);
3757   __ cset(result, Assembler::NE);
3758   __ leave();
3759   __ ret(lr);
3760   __ bind(LEN_OVER_8);
3761   __ ldp(rscratch1, rscratch2, Address(ary1, -16));
3762   __ sub(len, len, 8); // no data dep., then sub can be executed while loading
3763   __ tst(rscratch2, UPPER_BIT_MASK);
3764   __ br(Assembler::NE, RET_TRUE_NO_POP);
3765   __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
3766   __ lsrv(rscratch1, rscratch1, rscratch2);
3767   __ tst(rscratch1, UPPER_BIT_MASK);
3768   __ cset(result, Assembler::NE);
3769   __ leave();
3770   __ ret(lr);
3771 
3772   Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
3773   const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
3774 
3775   has_negatives_long = __ pc(); // 2nd entry point
3776 
3777   __ enter();
3778 
3779   __ bind(LEN_OVER_15);
3780     __ push(spilled_regs, sp);
3781     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
3782     __ cbz(rscratch2, ALIGNED);
3783     __ ldp(tmp6, tmp1, Address(ary1));
3784     __ mov(tmp5, 16);
3785     __ sub(rscratch1, tmp5, rscratch2); // number of bytes until the aligned address
3786     __ add(ary1, ary1, rscratch1);
3787     __ sub(len, len, rscratch1);
3788     __ orr(tmp6, tmp6, tmp1);
3789     __ tst(tmp6, UPPER_BIT_MASK);
3790     __ br(Assembler::NE, RET_TRUE);
3791 
3792   __ bind(ALIGNED);
3793     __ cmp(len, large_loop_size);
3794     __ br(Assembler::LT, CHECK_16);
3795     // Perform a 16-byte load as an early return in the pre-loop to handle the
3796     // case where an initially aligned large array has negative values in its
3797     // starting bytes; otherwise LARGE_LOOP would do 4 reads instead of 1 in the
3798     // worst case, which is slower. Cases with negative bytes further ahead are
3799     // not affected much; in fact they get faster due to the early loads and the
3800     // fewer instructions and branches in LARGE_LOOP.
3801     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
3802     __ sub(len, len, 16);
3803     __ orr(tmp6, tmp6, tmp1);
3804     __ tst(tmp6, UPPER_BIT_MASK);
3805     __ br(Assembler::NE, RET_TRUE);
3806     __ cmp(len, large_loop_size);
3807     __ br(Assembler::LT, CHECK_16);
3808 
3809     if (SoftwarePrefetchHintDistance >= 0
3810         && SoftwarePrefetchHintDistance >= dcache_line) {
3811       // initial prefetch
3812       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
3813     }
3814   __ bind(LARGE_LOOP);
3815     if (SoftwarePrefetchHintDistance >= 0) {
3816       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
3817     }
3818     // Issue the load instructions first, since that can save a few CPU/MEM cycles.
3819     // Also, instead of 4 triples of "orr(...); andr(...); cbnz(...)" (one per ldp),
3820     // it is better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which
3821     // saves 3 instructions per iteration and has fewer branches, but disables
3822     // early return: all 64 bytes are loaded and checked every time.
3823     __ ldp(tmp2, tmp3, Address(ary1));
3824     __ ldp(tmp4, tmp5, Address(ary1, 16));
3825     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
3826     __ ldp(tmp6, tmp1, Address(ary1, 48));
3827     __ add(ary1, ary1, large_loop_size);
3828     __ sub(len, len, large_loop_size);
3829     __ orr(tmp2, tmp2, tmp3);
3830     __ orr(tmp4, tmp4, tmp5);
3831     __ orr(rscratch1, rscratch1, rscratch2);
3832     __ orr(tmp6, tmp6, tmp1);
3833     __ orr(tmp2, tmp2, tmp4);
3834     __ orr(rscratch1, rscratch1, tmp6);
3835     __ orr(tmp2, tmp2, rscratch1);
3836     __ tst(tmp2, UPPER_BIT_MASK);
3837     __ br(Assembler::NE, RET_TRUE);
3838     __ cmp(len, large_loop_size);
3839     __ br(Assembler::GE, LARGE_LOOP);
3840 
3841   __ bind(CHECK_16); // small 16-byte load pre-loop
3842     __ cmp(len, 16);
3843     __ br(Assembler::LT, POST_LOOP16);
3844 
3845   __ bind(LOOP16); // small 16-byte load loop
3846     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
3847     __ sub(len, len, 16);
3848     __ orr(tmp2, tmp2, tmp3);
3849     __ tst(tmp2, UPPER_BIT_MASK);
3850     __ br(Assembler::NE, RET_TRUE);
3851     __ cmp(len, 16);
3852     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
3853 
3854   __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
3855     __ cmp(len, 8);
3856     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
3857     __ ldr(tmp3, Address(__ post(ary1, 8)));
3858     __ sub(len, len, 8);
3859     __ tst(tmp3, UPPER_BIT_MASK);
3860     __ br(Assembler::NE, RET_TRUE);
3861 
3862   __ bind(POST_LOOP16_LOAD_TAIL);
3863     __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
3864     __ ldr(tmp1, Address(ary1));
3865     __ mov(tmp2, 64);
3866     __ sub(tmp4, tmp2, len, __ LSL, 3);
3867     __ lslv(tmp1, tmp1, tmp4);
3868     __ tst(tmp1, UPPER_BIT_MASK);
3869     __ br(Assembler::NE, RET_TRUE);
3870     // Fallthrough
3871 
3872   __ bind(RET_FALSE);
3873     __ pop(spilled_regs, sp);
3874     __ leave();
3875     __ mov(result, zr);
3876     __ ret(lr);
3877 
3878   __ bind(RET_TRUE);
3879     __ pop(spilled_regs, sp);
3880   __ bind(RET_TRUE_NO_POP);
3881     __ leave();
3882     __ mov(result, 1);
3883     __ ret(lr);
3884 
3885   __ bind(DONE);
3886     __ pop(spilled_regs, sp);
3887     __ leave();
3888     __ ret(lr);
3889     return entry;
3890   }
3891   /**
3892    *  Arguments:
3893    *
3894    *  Input:
3895    *  c_rarg0   - current state address
3896    *  c_rarg1   - H key address
3897    *  c_rarg2   - data address
3898    *  c_rarg3   - number of blocks
3899    *
3900    *  Output:
3901    *  Updated state at c_rarg0
3902    */
3903   address generate_ghash_processBlocks() {
3904     // Bafflingly, GCM uses little-endian for the byte order, but
3905     // big-endian for the bit order.  For example, the polynomial 1 is
3906     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
3907     //
3908     // So, we must either reverse the bytes in each word and do
3909     // everything big-endian or reverse the bits in each byte and do
3910     // it little-endian.  On AArch64 it's more idiomatic to reverse
3911     // the bits in each byte (we have an instruction, RBIT, to do
3912     // that) and keep the data in little-endian bit order throughout the
3913     // calculation, bit-reversing the inputs and outputs.
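         //
         // Per block, the stub computes state = (state ^ data[i]) * H in
         // GF(2^128), reduced by the field polynomial
         // f = z^128 + z^7 + z^2 + z + 1.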
3914 
3915     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
3916     __ align(wordSize * 2);
3917     address p = __ pc();
3918     __ emit_int64(0x87);  // The low-order bits of the field
3919                           // polynomial (i.e. p = z^7+z^2+z+1)
3920                           // repeated in the low and high parts of a
3921                           // 128-bit vector
3922     __ emit_int64(0x87);
3923 
3924     __ align(CodeEntryAlignment);
3925     address start = __ pc();
3926 
3927     Register state   = c_rarg0;
3928     Register subkeyH = c_rarg1;
3929     Register data    = c_rarg2;
3930     Register blocks  = c_rarg3;
3931 
3932     FloatRegister vzr = v30;
3933     __ eor(vzr, __ T16B, vzr, vzr); // zero register
3934 
3935     __ ldrq(v0, Address(state));
3936     __ ldrq(v1, Address(subkeyH));
3937 
3938     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
3939     __ rbit(v0, __ T16B, v0);
3940     __ rev64(v1, __ T16B, v1);
3941     __ rbit(v1, __ T16B, v1);
3942 
3943     __ ldrq(v26, p);
3944 
3945     __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
3946     __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
3947 
3948     {
3949       Label L_ghash_loop;
3950       __ bind(L_ghash_loop);
3951 
3952       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
3953                                                  // reversing each byte
3954       __ rbit(v2, __ T16B, v2);
3955       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
3956 
3957       // Multiply state in v2 by subkey in v1
3958       ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
3959                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
3960                      /*temps*/v6, v20, v18, v21);
3961       // Reduce v7:v5 by the field polynomial
3962       ghash_reduce(v0, v5, v7, v26, vzr, v20);
3963 
3964       __ sub(blocks, blocks, 1);
3965       __ cbnz(blocks, L_ghash_loop);
3966     }
3967 
3968     // The bit-reversed result is at this point in v0
3969     __ rev64(v1, __ T16B, v0);
3970     __ rbit(v1, __ T16B, v1);
3971 
3972     __ st1(v1, __ T16B, state);
3973     __ ret(lr);
3974 
3975     return start;
3976   }
3977 
3978   // Continuation point for throwing of implicit exceptions that are
3979   // not handled in the current activation. Fabricates an exception
3980   // oop and initiates normal exception dispatching in this
3981   // frame. Since we need to preserve callee-saved values (currently
3982   // only for C2, but done for C1 as well) we need a callee-saved oop
3983   // map and therefore have to make these stubs into RuntimeStubs
3984   // rather than BufferBlobs.  If the compiler needs all registers to
3985   // be preserved between the fault point and the exception handler
3986   // then it must assume responsibility for that in
3987   // AbstractCompiler::continuation_for_implicit_null_exception or
3988   // continuation_for_implicit_division_by_zero_exception. All other
3989   // implicit exceptions (e.g., NullPointerException or
3990   // AbstractMethodError on entry) are either at call sites or
3991   // otherwise assume that stack unwinding will be initiated, so
3992   // caller saved registers were assumed volatile in the compiler.
3993 
3994 #undef __
3995 #define __ masm->
3996 
3997   address generate_throw_exception(const char* name,
3998                                    address runtime_entry,
3999                                    Register arg1 = noreg,
4000                                    Register arg2 = noreg) {
4001     // Information about frame layout at time of blocking runtime call.
4002     // Note that we only have to preserve callee-saved registers since
4003     // the compilers are responsible for supplying a continuation point
4004     // if they expect all registers to be preserved.
4005     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
4006     enum layout {
4007       rfp_off = 0,
4008       rfp_off2,
4009       return_off,
4010       return_off2,
4011       framesize // inclusive of return address
4012     };
4013 
4014     int insts_size = 512;
4015     int locs_size  = 64;
4016 
4017     CodeBuffer code(name, insts_size, locs_size);
4018     OopMapSet* oop_maps  = new OopMapSet();
4019     MacroAssembler* masm = new MacroAssembler(&code);
4020 
4021     address start = __ pc();
4022 
4023     // This is an inlined and slightly modified version of call_VM
4024     // which has the ability to fetch the return PC out of
4025     // thread-local storage and also sets up last_Java_sp slightly
4026     // differently than the real call_VM
4027 
4028     __ enter(); // Save FP and LR before call
4029 
4030     assert(is_even(framesize/2), "sp not 16-byte aligned");
4031 
4032     // lr and fp are already in place
4033     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
4034 
4035     int frame_complete = __ pc() - start;
4036 
4037     // Set up last_Java_sp and last_Java_fp
4038     address the_pc = __ pc();
4039     __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1);
4040 
4041     // Call runtime
4042     if (arg1 != noreg) {
4043       assert(arg2 != c_rarg1, "clobbered");
4044       __ mov(c_rarg1, arg1);
4045     }
4046     if (arg2 != noreg) {
4047       __ mov(c_rarg2, arg2);
4048     }
4049     __ mov(c_rarg0, rthread);
4050     BLOCK_COMMENT("call runtime_entry");
4051     __ mov(rscratch1, runtime_entry);
4052     __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);
4053 
4054     // Generate oop map
4055     OopMap* map = new OopMap(framesize, 0);
4056 
4057     oop_maps->add_gc_map(the_pc - start, map);
4058 
4059     __ reset_last_Java_frame(true);
4060     __ maybe_isb();
4061 
4062     __ leave();
4063 
4064     // check for pending exceptions
4065 #ifdef ASSERT
4066     Label L;
4067     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
4068     __ cbnz(rscratch1, L);
4069     __ should_not_reach_here();
4070     __ bind(L);
4071 #endif // ASSERT
4072     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
4073 
4074 
4075     // codeBlob framesize is in words (not VMRegImpl::slot_size)
4076     RuntimeStub* stub =
4077       RuntimeStub::new_runtime_stub(name,
4078                                     &code,
4079                                     frame_complete,
4080                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
4081                                     oop_maps, false);
4082     return stub->entry_point();
4083   }
4084 
4085   class MontgomeryMultiplyGenerator : public MacroAssembler {
4086 
4087     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
4088       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
4089 
4090     RegSet _toSave;
4091     bool _squaring;
4092 
4093   public:
4094     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
4095       : MacroAssembler(as->code()), _squaring(squaring) {
4096 
4097       // Register allocation
4098 
4099       Register reg = c_rarg0;
4100       Pa_base = reg;       // Argument registers
4101       if (squaring)
4102         Pb_base = Pa_base;
4103       else
4104         Pb_base = ++reg;
4105       Pn_base = ++reg;
4106       Rlen = ++reg;
4107       inv = ++reg;
4108       Pm_base = ++reg;
4109 
4110                           // Working registers:
4111       Ra =  ++reg;        // The current digit of a, b, n, and m.
4112       Rb =  ++reg;
4113       Rm =  ++reg;
4114       Rn =  ++reg;
4115 
4116       Pa =  ++reg;        // Pointers to the current/next digit of a, b, n, and m.
4117       Pb =  ++reg;
4118       Pm =  ++reg;
4119       Pn =  ++reg;
4120 
4121       t0 =  ++reg;        // Three registers which form a
4122       t1 =  ++reg;        // triple-precision accumulator.
4123       t2 =  ++reg;
4124 
4125       Ri =  ++reg;        // Inner and outer loop indexes.
4126       Rj =  ++reg;
4127 
4128       Rhi_ab = ++reg;     // Product registers: low and high parts
4129       Rlo_ab = ++reg;     // of a*b and m*n.
4130       Rhi_mn = ++reg;
4131       Rlo_mn = ++reg;
4132 
4133       // r19 and up are callee-saved.
4134       _toSave = RegSet::range(r19, reg) + Pm_base;
4135     }
4136 
4137   private:
4138     void save_regs() {
4139       push(_toSave, sp);
4140     }
4141 
4142     void restore_regs() {
4143       pop(_toSave, sp);
4144     }
4145 
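         // Emit 'block' count times, unrolled by a factor of two.  An odd
         // count enters at the second copy (the odd label), so the body is
         // always executed exactly 'count' times.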
4146     template <typename T>
4147     void unroll_2(Register count, T block) {
4148       Label loop, end, odd;
4149       tbnz(count, 0, odd);
4150       cbz(count, end);
4151       align(16);
4152       bind(loop);
4153       (this->*block)();
4154       bind(odd);
4155       (this->*block)();
4156       subs(count, count, 2);
4157       br(Assembler::GT, loop);
4158       bind(end);
4159     }
4160 
4161     template <typename T>
4162     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
4163       Label loop, end, odd;
4164       tbnz(count, 0, odd);
4165       cbz(count, end);
4166       align(16);
4167       bind(loop);
4168       (this->*block)(d, s, tmp);
4169       bind(odd);
4170       (this->*block)(d, s, tmp);
4171       subs(count, count, 2);
4172       br(Assembler::GT, loop);
4173       bind(end);
4174     }
4175 
4176     void pre1(RegisterOrConstant i) {
4177       block_comment("pre1");
4178       // Pa = Pa_base;
4179       // Pb = Pb_base + i;
4180       // Pm = Pm_base;
4181       // Pn = Pn_base + i;
4182       // Ra = *Pa;
4183       // Rb = *Pb;
4184       // Rm = *Pm;
4185       // Rn = *Pn;
4186       ldr(Ra, Address(Pa_base));
4187       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4188       ldr(Rm, Address(Pm_base));
4189       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4190       lea(Pa, Address(Pa_base));
4191       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4192       lea(Pm, Address(Pm_base));
4193       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4194 
4195       // Zero the m*n result.
4196       mov(Rhi_mn, zr);
4197       mov(Rlo_mn, zr);
4198     }
4199 
4200     // The core multiply-accumulate step of a Montgomery
4201     // multiplication.  The idea is to schedule operations as a
4202     // pipeline so that instructions with long latencies (loads and
4203     // multiplies) have time to complete before their results are
4204     // used.  In-order implementations of the architecture benefit the
4205     // most from this, but out-of-order ones also benefit.
4206     void step() {
4207       block_comment("step");
4208       // MACC(Ra, Rb, t0, t1, t2);
4209       // Ra = *++Pa;
4210       // Rb = *--Pb;
4211       umulh(Rhi_ab, Ra, Rb);
4212       mul(Rlo_ab, Ra, Rb);
4213       ldr(Ra, pre(Pa, wordSize));
4214       ldr(Rb, pre(Pb, -wordSize));
4215       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
4216                                        // previous iteration.
4217       // MACC(Rm, Rn, t0, t1, t2);
4218       // Rm = *++Pm;
4219       // Rn = *--Pn;
4220       umulh(Rhi_mn, Rm, Rn);
4221       mul(Rlo_mn, Rm, Rn);
4222       ldr(Rm, pre(Pm, wordSize));
4223       ldr(Rn, pre(Pn, -wordSize));
4224       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4225     }
4226 
4227     void post1() {
4228       block_comment("post1");
4229 
4230       // MACC(Ra, Rb, t0, t1, t2);
4231       // Ra = *++Pa;
4232       // Rb = *--Pb;
4233       umulh(Rhi_ab, Ra, Rb);
4234       mul(Rlo_ab, Ra, Rb);
4235       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
4236       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4237 
4238       // *Pm = Rm = t0 * inv;
4239       mul(Rm, t0, inv);
4240       str(Rm, Address(Pm));
4241 
4242       // MACC(Rm, Rn, t0, t1, t2);
4243       // t0 = t1; t1 = t2; t2 = 0;
4244       umulh(Rhi_mn, Rm, Rn);
4245 
4246 #ifndef PRODUCT
4247       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
4248       {
4249         mul(Rlo_mn, Rm, Rn);
4250         add(Rlo_mn, t0, Rlo_mn);
4251         Label ok;
4252         cbz(Rlo_mn, ok); {
4253           stop("broken Montgomery multiply");
4254         } bind(ok);
4255       }
4256 #endif
4257       // We have very carefully set things up so that
4258       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
4259       // the lower half of Rm * Rn because we know the result already:
4260       // it must be -t0.  t0 + (-t0) must generate a carry iff
4261       // t0 != 0.  So, rather than do a mul and an adds we just set
4262       // the carry flag iff t0 is nonzero.
4263       //
4264       // mul(Rlo_mn, Rm, Rn);
4265       // adds(zr, t0, Rlo_mn);
4266       subs(zr, t0, 1); // Set carry iff t0 is nonzero
4267       adcs(t0, t1, Rhi_mn);
4268       adc(t1, t2, zr);
4269       mov(t2, zr);
4270     }
4271 
4272     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
4273       block_comment("pre2");
4274       // Pa = Pa_base + i-len;
4275       // Pb = Pb_base + len;
4276       // Pm = Pm_base + i-len;
4277       // Pn = Pn_base + len;
4278 
4279       if (i.is_register()) {
4280         sub(Rj, i.as_register(), len);
4281       } else {
4282         mov(Rj, i.as_constant());
4283         sub(Rj, Rj, len);
4284       }
4285       // Rj == i-len
4286 
4287       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
4288       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
4289       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
4290       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
4291 
4292       // Ra = *++Pa;
4293       // Rb = *--Pb;
4294       // Rm = *++Pm;
4295       // Rn = *--Pn;
4296       ldr(Ra, pre(Pa, wordSize));
4297       ldr(Rb, pre(Pb, -wordSize));
4298       ldr(Rm, pre(Pm, wordSize));
4299       ldr(Rn, pre(Pn, -wordSize));
4300 
4301       mov(Rhi_mn, zr);
4302       mov(Rlo_mn, zr);
4303     }
4304 
4305     void post2(RegisterOrConstant i, RegisterOrConstant len) {
4306       block_comment("post2");
4307       if (i.is_constant()) {
4308         mov(Rj, i.as_constant()-len.as_constant());
4309       } else {
4310         sub(Rj, i.as_register(), len);
4311       }
4312 
4313       adds(t0, t0, Rlo_mn); // The pending m*n, low part
4314 
4315       // As soon as we know the least significant digit of our result,
4316       // store it.
4317       // Pm_base[i-len] = t0;
4318       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
4319 
4320       // t0 = t1; t1 = t2; t2 = 0;
4321       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
4322       adc(t1, t2, zr);
4323       mov(t2, zr);
4324     }
4325 
4326     // A carry in t0 after Montgomery multiplication means that we
4327     // should subtract multiples of n from our result in m.  We'll
4328     // keep doing that until there is no carry.
4329     void normalize(RegisterOrConstant len) {
4330       block_comment("normalize");
4331       // while (t0)
4332       //   t0 = sub(Pm_base, Pn_base, t0, len);
4333       Label loop, post, again;
4334       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
4335       cbz(t0, post); {
4336         bind(again); {
4337           mov(i, zr);
4338           mov(cnt, len);
4339           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
4340           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4341           subs(zr, zr, zr); // set carry flag, i.e. no borrow
4342           align(16);
4343           bind(loop); {
4344             sbcs(Rm, Rm, Rn);
4345             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
4346             add(i, i, 1);
4347             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
4348             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4349             sub(cnt, cnt, 1);
4350           } cbnz(cnt, loop);
4351           sbc(t0, t0, zr);
4352         } cbnz(t0, again);
4353       } bind(post);
4354     }
4355 
4356     // Move memory at s to d, reversing words.
4357     //    Increments d to end of copied memory
4358     //    Destroys tmp1, tmp2
4359     //    Preserves len
4360     //    Leaves s pointing to the address which was in d at start
4361     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
4362       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
4363 
4364       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
4365       mov(tmp1, len);
4366       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
4367       sub(s, d, len, ext::uxtw, LogBytesPerWord);
4368     }
4369     // where
4370     void reverse1(Register d, Register s, Register tmp) {
4371       ldr(tmp, pre(s, -wordSize));
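           // Swap the two 32-bit halves of each 64-bit word.  The incoming
           // int[] digits are (apparently) most-significant first, so
           // reversing the word order alone would leave each pair of 32-bit
           // digits transposed within its 64-bit word; the ror fixes that.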
4372       ror(tmp, tmp, 32);
4373       str(tmp, post(d, wordSize));
4374     }
4375 
4376     void step_squaring() {
4377       // An extra ACC
4378       step();
4379       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4380     }
4381 
4382     void last_squaring(RegisterOrConstant i) {
4383       Label dont;
4384       // if ((i & 1) == 0) {
4385       tbnz(i.as_register(), 0, dont); {
4386         // MACC(Ra, Rb, t0, t1, t2);
4387         // Ra = *++Pa;
4388         // Rb = *--Pb;
4389         umulh(Rhi_ab, Ra, Rb);
4390         mul(Rlo_ab, Ra, Rb);
4391         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4392       } bind(dont);
4393     }
4394 
4395     void extra_step_squaring() {
4396       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
4397 
4398       // MACC(Rm, Rn, t0, t1, t2);
4399       // Rm = *++Pm;
4400       // Rn = *--Pn;
4401       umulh(Rhi_mn, Rm, Rn);
4402       mul(Rlo_mn, Rm, Rn);
4403       ldr(Rm, pre(Pm, wordSize));
4404       ldr(Rn, pre(Pn, -wordSize));
4405     }
4406 
4407     void post1_squaring() {
4408       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
4409 
4410       // *Pm = Rm = t0 * inv;
4411       mul(Rm, t0, inv);
4412       str(Rm, Address(Pm));
4413 
4414       // MACC(Rm, Rn, t0, t1, t2);
4415       // t0 = t1; t1 = t2; t2 = 0;
4416       umulh(Rhi_mn, Rm, Rn);
4417 
4418 #ifndef PRODUCT
4419       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
4420       {
4421         mul(Rlo_mn, Rm, Rn);
4422         add(Rlo_mn, t0, Rlo_mn);
4423         Label ok;
4424         cbz(Rlo_mn, ok); {
4425           stop("broken Montgomery multiply");
4426         } bind(ok);
4427       }
4428 #endif
4429       // We have very carefully set things up so that
4430       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
4431       // the lower half of Rm * Rn because we know the result already:
4432       // it must be -t0.  t0 + (-t0) must generate a carry iff
4433       // t0 != 0.  So, rather than do a mul and an adds we just set
4434       // the carry flag iff t0 is nonzero.
4435       //
4436       // mul(Rlo_mn, Rm, Rn);
4437       // adds(zr, t0, Rlo_mn);
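           // (On AArch64 subs sets C to "no borrow": t0 - 1 borrows only
           //  when t0 == 0, so C ends up set exactly when t0 is nonzero.)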
4438       subs(zr, t0, 1); // Set carry iff t0 is nonzero
4439       adcs(t0, t1, Rhi_mn);
4440       adc(t1, t2, zr);
4441       mov(t2, zr);
4442     }
4443 
4444     void acc(Register Rhi, Register Rlo,
4445              Register t0, Register t1, Register t2) {
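           // Triple-precision accumulate: (t2:t1:t0) += (Rhi:Rlo).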
4446       adds(t0, t0, Rlo);
4447       adcs(t1, t1, Rhi);
4448       adc(t2, t2, zr);
4449     }
4450 
4451   public:
4452     /**
4453      * Fast Montgomery multiplication.  The derivation of the
4454      * algorithm is in A Cryptographic Library for the Motorola
4455      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
4456      *
4457      * Arguments:
4458      *
4459      * Inputs for multiplication:
4460      *   c_rarg0   - int array elements a
4461      *   c_rarg1   - int array elements b
4462      *   c_rarg2   - int array elements n (the modulus)
4463      *   c_rarg3   - int length
4464      *   c_rarg4   - int inv
4465      *   c_rarg5   - int array elements m (the result)
4466      *
4467      * Inputs for squaring:
4468      *   c_rarg0   - int array elements a
4469      *   c_rarg1   - int array elements n (the modulus)
4470      *   c_rarg2   - int length
4471      *   c_rarg3   - int inv
4472      *   c_rarg4   - int array elements m (the result)
4473      *
4474      */
4475     address generate_multiply() {
4476       Label argh, nothing;
4477       bind(argh);
4478       stop("MontgomeryMultiply total_allocation must be <= 8192");
4479 
4480       align(CodeEntryAlignment);
4481       address entry = pc();
4482 
4483       cbzw(Rlen, nothing);
4484 
4485       enter();
4486 
4487       // Make room.
4488       cmpw(Rlen, 512);
4489       br(Assembler::HI, argh);
4490       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
4491       andr(sp, Ra, -2 * wordSize);
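           // Rlen is still a count of ints here.  The scratch area holds up
           // to four arrays of Rlen ints (the reversed copies of a, b and n
           // plus the result m), i.e. at most 4 * 512 * 4 = 8192 bytes --
           // the limit checked above.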
4492 
4493       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
4494 
4495       {
4496         // Copy input args, reversing as we go.  We use Ra as a
4497         // temporary variable.
4498         reverse(Ra, Pa_base, Rlen, t0, t1);
4499         if (!_squaring)
4500           reverse(Ra, Pb_base, Rlen, t0, t1);
4501         reverse(Ra, Pn_base, Rlen, t0, t1);
4502       }
4503 
4504       // Push all call-saved registers and also Pm_base which we'll need
4505       // at the end.
4506       save_regs();
4507 
4508 #ifndef PRODUCT
4509       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
4510       {
4511         ldr(Rn, Address(Pn_base, 0));
4512         mul(Rlo_mn, Rn, inv);
4513         cmp(Rlo_mn, -1);
4514         Label ok;
4515         br(EQ, ok); {
4516           stop("broken inverse in Montgomery multiply");
4517         } bind(ok);
4518       }
4519 #endif
4520 
4521       mov(Pm_base, Ra);
4522 
4523       mov(t0, zr);
4524       mov(t1, zr);
4525       mov(t2, zr);
4526 
4527       block_comment("for (int i = 0; i < len; i++) {");
4528       mov(Ri, zr); {
4529         Label loop, end;
4530         cmpw(Ri, Rlen);
4531         br(Assembler::GE, end);
4532 
4533         bind(loop);
4534         pre1(Ri);
4535 
4536         block_comment("  for (j = i; j; j--) {"); {
4537           movw(Rj, Ri);
4538           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
4539         } block_comment("  } // j");
4540 
4541         post1();
4542         addw(Ri, Ri, 1);
4543         cmpw(Ri, Rlen);
4544         br(Assembler::LT, loop);
4545         bind(end);
4546         block_comment("} // i");
4547       }
4548 
4549       block_comment("for (int i = len; i < 2*len; i++) {");
4550       mov(Ri, Rlen); {
4551         Label loop, end;
4552         cmpw(Ri, Rlen, Assembler::LSL, 1);
4553         br(Assembler::GE, end);
4554 
4555         bind(loop);
4556         pre2(Ri, Rlen);
4557 
4558         block_comment("  for (j = len*2-i-1; j; j--) {"); {
4559           lslw(Rj, Rlen, 1);
4560           subw(Rj, Rj, Ri);
4561           subw(Rj, Rj, 1);
4562           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
4563         } block_comment("  } // j");
4564 
4565         post2(Ri, Rlen);
4566         addw(Ri, Ri, 1);
4567         cmpw(Ri, Rlen, Assembler::LSL, 1);
4568         br(Assembler::LT, loop);
4569         bind(end);
4570       }
4571       block_comment("} // i");
4572 
4573       normalize(Rlen);
4574 
4575       mov(Ra, Pm_base);  // Save Pm_base in Ra
4576       restore_regs();  // Restore caller's Pm_base
4577 
4578       // Copy our result into caller's Pm_base
4579       reverse(Pm_base, Ra, Rlen, t0, t1);
4580 
4581       leave();
4582       bind(nothing);
4583       ret(lr);
4584 
4585       return entry;
4586     }
4587     // In C, approximately:
4588 
4589     // void
4590     // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
4591     //                     unsigned long Pn_base[], unsigned long Pm_base[],
4592     //                     unsigned long inv, int len) {
4593     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4594     //   unsigned long *Pa, *Pb, *Pn, *Pm;
4595     //   unsigned long Ra, Rb, Rn, Rm;
4596 
4597     //   int i;
4598 
4599     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
4600 
4601     //   for (i = 0; i < len; i++) {
4602     //     int j;
4603 
4604     //     Pa = Pa_base;
4605     //     Pb = Pb_base + i;
4606     //     Pm = Pm_base;
4607     //     Pn = Pn_base + i;
4608 
4609     //     Ra = *Pa;
4610     //     Rb = *Pb;
4611     //     Rm = *Pm;
4612     //     Rn = *Pn;
4613 
4614     //     int iters = i;
4615     //     for (j = 0; iters--; j++) {
4616     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
4617     //       MACC(Ra, Rb, t0, t1, t2);
4618     //       Ra = *++Pa;
4619     //       Rb = *--Pb;
4620     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4621     //       MACC(Rm, Rn, t0, t1, t2);
4622     //       Rm = *++Pm;
4623     //       Rn = *--Pn;
4624     //     }
4625 
4626     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
4627     //     MACC(Ra, Rb, t0, t1, t2);
4628     //     *Pm = Rm = t0 * inv;
4629     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
4630     //     MACC(Rm, Rn, t0, t1, t2);
4631 
4632     //     assert(t0 == 0, "broken Montgomery multiply");
4633 
4634     //     t0 = t1; t1 = t2; t2 = 0;
4635     //   }
4636 
4637     //   for (i = len; i < 2*len; i++) {
4638     //     int j;
4639 
4640     //     Pa = Pa_base + i-len;
4641     //     Pb = Pb_base + len;
4642     //     Pm = Pm_base + i-len;
4643     //     Pn = Pn_base + len;
4644 
4645     //     Ra = *++Pa;
4646     //     Rb = *--Pb;
4647     //     Rm = *++Pm;
4648     //     Rn = *--Pn;
4649 
4650     //     int iters = len*2-i-1;
4651     //     for (j = i-len+1; iters--; j++) {
4652     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
4653     //       MACC(Ra, Rb, t0, t1, t2);
4654     //       Ra = *++Pa;
4655     //       Rb = *--Pb;
4656     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4657     //       MACC(Rm, Rn, t0, t1, t2);
4658     //       Rm = *++Pm;
4659     //       Rn = *--Pn;
4660     //     }
4661 
4662     //     Pm_base[i-len] = t0;
4663     //     t0 = t1; t1 = t2; t2 = 0;
4664     //   }
4665 
4666     //   while (t0)
4667     //     t0 = sub(Pm_base, Pn_base, t0, len);
4668     // }
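         // The sub() above is only pseudocode; normalize() emits the
         // equivalent loop directly.  A plausible C rendering, for
         // reference only:
         //
         // static unsigned long
         // sub(unsigned long Pm_base[], unsigned long Pn_base[],
         //     unsigned long t0, int len) {
         //   unsigned long borrow = 0;
         //   for (int i = 0; i < len; i++) {
         //     unsigned long m = Pm_base[i], n = Pn_base[i];
         //     Pm_base[i] = m - n - borrow;
         //     borrow = (m < n) || (borrow && m == n);   // borrow out
         //   }
         //   return t0 - borrow;  // peel the borrow off the pending carry
         // }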
4669 
4670     /**
4671      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
4672      * multiplies than Montgomery multiplication so it should be up to
4673      * 25% faster.  However, its loop control is more complex and it
4674      * may actually run slower on some machines.
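          *
          * (Roughly: a Montgomery multiply of len-word operands costs about
          * 2*len*len word multiplies -- len*len for a*b plus len*len for
          * the m*n reduction -- while squaring can form each off-diagonal
          * a[i]*a[j] product once and double it, so the a*a part needs only
          * about len*len/2 multiplies.  The reduction is unchanged, giving
          * roughly 1.5*len*len in total, i.e. 25% fewer.)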
4675      *
4676      * Arguments:
4677      *
4678      * Inputs:
4679      *   c_rarg0   - int array elements a
4680      *   c_rarg1   - int array elements n (the modulus)
4681      *   c_rarg2   - int length
4682      *   c_rarg3   - int inv
4683      *   c_rarg4   - int array elements m (the result)
4684      *
4685      */
4686     address generate_square() {
4687       Label argh;
4688       bind(argh);
4689       stop("MontgomeryMultiply total_allocation must be <= 8192");
4690 
4691       align(CodeEntryAlignment);
4692       address entry = pc();
4693 
4694       enter();
4695 
4696       // Make room.
4697       cmpw(Rlen, 512);
4698       br(Assembler::HI, argh);
4699       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
4700       andr(sp, Ra, -2 * wordSize);
4701 
4702       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
4703 
4704       {
4705         // Copy input args, reversing as we go.  We use Ra as a
4706         // temporary variable.
4707         reverse(Ra, Pa_base, Rlen, t0, t1);
4708         reverse(Ra, Pn_base, Rlen, t0, t1);
4709       }
4710 
4711       // Push all call-saved registers and also Pm_base which we'll need
4712       // at the end.
4713       save_regs();
4714 
4715       mov(Pm_base, Ra);
4716 
4717       mov(t0, zr);
4718       mov(t1, zr);
4719       mov(t2, zr);
4720 
4721       block_comment("for (int i = 0; i < len; i++) {");
4722       mov(Ri, zr); {
4723         Label loop, end;
4724         bind(loop);
4725         cmp(Ri, Rlen);
4726         br(Assembler::GE, end);
4727 
4728         pre1(Ri);
4729 
4730         block_comment("for (j = (i+1)/2; j; j--) {"); {
4731           add(Rj, Ri, 1);
4732           lsr(Rj, Rj, 1);
4733           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4734         } block_comment("  } // j");
4735 
4736         last_squaring(Ri);
4737 
4738         block_comment("  for (j = i/2; j; j--) {"); {
4739           lsr(Rj, Ri, 1);
4740           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4741         } block_comment("  } // j");
4742 
4743         post1_squaring();
4744         add(Ri, Ri, 1);
4745         cmp(Ri, Rlen);
4746         br(Assembler::LT, loop);
4747 
4748         bind(end);
4749         block_comment("} // i");
4750       }
4751 
4752       block_comment("for (int i = len; i < 2*len; i++) {");
4753       mov(Ri, Rlen); {
4754         Label loop, end;
4755         bind(loop);
4756         cmp(Ri, Rlen, Assembler::LSL, 1);
4757         br(Assembler::GE, end);
4758 
4759         pre2(Ri, Rlen);
4760 
4761         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
4762           lsl(Rj, Rlen, 1);
4763           sub(Rj, Rj, Ri);
4764           sub(Rj, Rj, 1);
4765           lsr(Rj, Rj, 1);
4766           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4767         } block_comment("  } // j");
4768 
4769         last_squaring(Ri);
4770 
4771         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
4772           lsl(Rj, Rlen, 1);
4773           sub(Rj, Rj, Ri);
4774           lsr(Rj, Rj, 1);
4775           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4776         } block_comment("  } // j");
4777 
4778         post2(Ri, Rlen);
4779         add(Ri, Ri, 1);
4780         cmp(Ri, Rlen, Assembler::LSL, 1);
4781 
4782         br(Assembler::LT, loop);
4783         bind(end);
4784         block_comment("} // i");
4785       }
4786 
4787       normalize(Rlen);
4788 
4789       mov(Ra, Pm_base);  // Save Pm_base in Ra
4790       restore_regs();  // Restore caller's Pm_base
4791 
4792       // Copy our result into caller's Pm_base
4793       reverse(Pm_base, Ra, Rlen, t0, t1);
4794 
4795       leave();
4796       ret(lr);
4797 
4798       return entry;
4799     }
4800     // In C, approximately:
4801 
4802     // void
4803     // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
4804     //                   unsigned long Pm_base[], unsigned long inv, int len) {
4805     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4806     //   unsigned long *Pa, *Pb, *Pn, *Pm;
4807     //   unsigned long Ra, Rb, Rn, Rm;
4808 
4809     //   int i;
4810 
4811     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
4812 
4813     //   for (i = 0; i < len; i++) {
4814     //     int j;
4815 
4816     //     Pa = Pa_base;
4817     //     Pb = Pa_base + i;
4818     //     Pm = Pm_base;
4819     //     Pn = Pn_base + i;
4820 
4821     //     Ra = *Pa;
4822     //     Rb = *Pb;
4823     //     Rm = *Pm;
4824     //     Rn = *Pn;
4825 
4826     //     int iters = (i+1)/2;
4827     //     for (j = 0; iters--; j++) {
4828     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4829     //       MACC2(Ra, Rb, t0, t1, t2);
4830     //       Ra = *++Pa;
4831     //       Rb = *--Pb;
4832     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4833     //       MACC(Rm, Rn, t0, t1, t2);
4834     //       Rm = *++Pm;
4835     //       Rn = *--Pn;
4836     //     }
4837     //     if ((i & 1) == 0) {
4838     //       assert(Ra == Pa_base[j], "must be");
4839     //       MACC(Ra, Ra, t0, t1, t2);
4840     //     }
4841     //     iters = i/2;
4842     //     assert(iters == i-j, "must be");
4843     //     for (; iters--; j++) {
4844     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4845     //       MACC(Rm, Rn, t0, t1, t2);
4846     //       Rm = *++Pm;
4847     //       Rn = *--Pn;
4848     //     }
4849 
4850     //     *Pm = Rm = t0 * inv;
4851     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
4852     //     MACC(Rm, Rn, t0, t1, t2);
4853 
4854     //     assert(t0 == 0, "broken Montgomery multiply");
4855 
4856     //     t0 = t1; t1 = t2; t2 = 0;
4857     //   }
4858 
4859     //   for (i = len; i < 2*len; i++) {
4860     //     int start = i-len+1;
4861     //     int end = start + (len - start)/2;
4862     //     int j;
4863 
4864     //     Pa = Pa_base + i-len;
4865     //     Pb = Pa_base + len;
4866     //     Pm = Pm_base + i-len;
4867     //     Pn = Pn_base + len;
4868 
4869     //     Ra = *++Pa;
4870     //     Rb = *--Pb;
4871     //     Rm = *++Pm;
4872     //     Rn = *--Pn;
4873 
4874     //     int iters = (2*len-i-1)/2;
4875     //     assert(iters == end-start, "must be");
4876     //     for (j = start; iters--; j++) {
4877     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4878     //       MACC2(Ra, Rb, t0, t1, t2);
4879     //       Ra = *++Pa;
4880     //       Rb = *--Pb;
4881     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4882     //       MACC(Rm, Rn, t0, t1, t2);
4883     //       Rm = *++Pm;
4884     //       Rn = *--Pn;
4885     //     }
4886     //     if ((i & 1) == 0) {
4887     //       assert(Ra == Pa_base[j], "must be");
4888     //       MACC(Ra, Ra, t0, t1, t2);
4889     //     }
4890     //     iters =  (2*len-i)/2;
4891     //     assert(iters == len-j, "must be");
4892     //     for (; iters--; j++) {
4893     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4894     //       MACC(Rm, Rn, t0, t1, t2);
4895     //       Rm = *++Pm;
4896     //       Rn = *--Pn;
4897     //     }
4898     //     Pm_base[i-len] = t0;
4899     //     t0 = t1; t1 = t2; t2 = 0;
4900     //   }
4901 
4902     //   while (t0)
4903     //     t0 = sub(Pm_base, Pn_base, t0, len);
4904     // }
4905   };
4906 
4907 
4908   // Initialization
4909   void generate_initial() {
4910     // Generate initial stubs and initialize the entry points
4911 
4912     // Entry points that exist on all platforms.  Note: This is code
4913     // that could be shared among different platforms - however the
4914     // benefit seems to be smaller than the disadvantage of having a
4915     // much more complicated generator structure. See also comment in
4916     // stubRoutines.hpp.
4917 
4918     StubRoutines::_forward_exception_entry = generate_forward_exception();
4919 
4920     StubRoutines::_call_stub_entry =
4921       generate_call_stub(StubRoutines::_call_stub_return_address);
4922 
4923     // This entry is referenced by megamorphic calls.
4924     StubRoutines::_catch_exception_entry = generate_catch_exception();
4925 
4926     // Build this early so it's available for the interpreter.
4927     StubRoutines::_throw_StackOverflowError_entry =
4928       generate_throw_exception("StackOverflowError throw_exception",
4929                                CAST_FROM_FN_PTR(address,
4930                                                 SharedRuntime::throw_StackOverflowError));
4931     StubRoutines::_throw_delayed_StackOverflowError_entry =
4932       generate_throw_exception("delayed StackOverflowError throw_exception",
4933                                CAST_FROM_FN_PTR(address,
4934                                                 SharedRuntime::throw_delayed_StackOverflowError));
4935     if (UseCRC32Intrinsics) {
4936       // Set the table address before generating the stub, which uses it
4937       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
4938       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
4939     }
4940 
4941     if (UseCRC32CIntrinsics) {
4942       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
4943     }
4944   }
4945 
4946   void generate_all() {
4947     // support for verify_oop (must happen after universe_init)
4948     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
4949     StubRoutines::_throw_AbstractMethodError_entry =
4950       generate_throw_exception("AbstractMethodError throw_exception",
4951                                CAST_FROM_FN_PTR(address,
4952                                                 SharedRuntime::
4953                                                 throw_AbstractMethodError));
4954 
4955     StubRoutines::_throw_IncompatibleClassChangeError_entry =
4956       generate_throw_exception("IncompatibleClassChangeError throw_exception",
4957                                CAST_FROM_FN_PTR(address,
4958                                                 SharedRuntime::
4959                                                 throw_IncompatibleClassChangeError));
4960 
4961     StubRoutines::_throw_NullPointerException_at_call_entry =
4962       generate_throw_exception("NullPointerException at call throw_exception",
4963                                CAST_FROM_FN_PTR(address,
4964                                                 SharedRuntime::
4965                                                 throw_NullPointerException_at_call));
4966 
4967     // arraycopy stubs used by compilers
4968     generate_arraycopy_stubs();
4969 
4970     // has_negatives stub for large arrays.
4971     StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);
4972 
4973     if (UseMultiplyToLenIntrinsic) {
4974       StubRoutines::_multiplyToLen = generate_multiplyToLen();
4975     }
4976 
4977     if (UseSquareToLenIntrinsic) {
4978       StubRoutines::_squareToLen = generate_squareToLen();
4979     }
4980 
4981     if (UseMulAddIntrinsic) {
4982       StubRoutines::_mulAdd = generate_mulAdd();
4983     }
4984 
4985     if (UseMontgomeryMultiplyIntrinsic) {
4986       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
4987       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
4988       StubRoutines::_montgomeryMultiply = g.generate_multiply();
4989     }
4990 
4991     if (UseMontgomerySquareIntrinsic) {
4992       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
4993       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
4994       // We use generate_multiply() rather than generate_square()
4995       // because it's faster for the sizes of modulus we care about.
4996       StubRoutines::_montgomerySquare = g.generate_multiply();
4997     }
4998 
4999 #ifndef BUILTIN_SIM
5000     // generate GHASH intrinsics code
5001     if (UseGHASHIntrinsics) {
5002       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
5003     }
5004 
5005     if (UseAESIntrinsics) {
5006       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
5007       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
5008       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
5009       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
5010     }
5011 
5012     if (UseSHA1Intrinsics) {
5013       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
5014       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
5015     }
5016     if (UseSHA256Intrinsics) {
5017       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
5018       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
5019     }
5020 
5021     // generate Adler32 intrinsics code
5022     if (UseAdler32Intrinsics) {
5023       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
5024     }
5025 
5026     // Safefetch stubs.
5027     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
5028                                                        &StubRoutines::_safefetch32_fault_pc,
5029                                                        &StubRoutines::_safefetch32_continuation_pc);
5030     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
5031                                                        &StubRoutines::_safefetchN_fault_pc,
5032                                                        &StubRoutines::_safefetchN_continuation_pc);
5033 #endif
5034     StubRoutines::aarch64::set_completed();
5035   }
5036 
5037  public:
5038   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
5039     if (all) {
5040       generate_all();
5041     } else {
5042       generate_initial();
5043     }
5044   }
5045 }; // end class declaration
5046 
5047 void StubGenerator_generate(CodeBuffer* code, bool all) {
5048   StubGenerator g(code, all);
5049 }