1 /*
   2  * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "interpreter/interpreter.hpp"
  30 #include "nativeInst_aarch64.hpp"
  31 #include "oops/instanceOop.hpp"
  32 #include "oops/method.hpp"
  33 #include "oops/objArrayKlass.hpp"
  34 #include "oops/oop.inline.hpp"
  35 #include "prims/methodHandles.hpp"
  36 #include "runtime/frame.inline.hpp"
  37 #include "runtime/handles.inline.hpp"
  38 #include "runtime/sharedRuntime.hpp"
  39 #include "runtime/stubCodeGenerator.hpp"
  40 #include "runtime/stubRoutines.hpp"
  41 #include "runtime/thread.inline.hpp"
  42 #include "utilities/align.hpp"
  43 #ifdef COMPILER2
  44 #include "opto/runtime.hpp"
  45 #endif
  46 
  47 #ifdef BUILTIN_SIM
  48 #include "../../../../../../simulator/simulator.hpp"
  49 #endif
  50 
  51 // Declaration and definition of StubGenerator (no .hpp file).
  52 // For a more detailed description of the stub routine structure
  53 // see the comment in stubRoutines.hpp
  54 
  55 #undef __
  56 #define __ _masm->
  57 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
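     // n.b. TIMES_OOP scales an index by the heap-oop size: a shift of 2
     // (4-byte narrow oops) when UseCompressedOops is set, otherwise 3, so
     // for example oop element i lives at base + (i << 2) with compressed oops.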
  58 
  59 #ifdef PRODUCT
  60 #define BLOCK_COMMENT(str) /* nothing */
  61 #else
  62 #define BLOCK_COMMENT(str) __ block_comment(str)
  63 #endif
  64 
  65 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  66 
  67 // Stub Code definitions
  68 
  69 class StubGenerator: public StubCodeGenerator {
  70  private:
  71 
  72 #ifdef PRODUCT
  73 #define inc_counter_np(counter) ((void)0)
  74 #else
  75   void inc_counter_np_(int& counter) {
  76     __ lea(rscratch2, ExternalAddress((address)&counter));
  77     __ ldrw(rscratch1, Address(rscratch2));
  78     __ addw(rscratch1, rscratch1, 1);
  79     __ strw(rscratch1, Address(rscratch2));
  80   }
  81 #define inc_counter_np(counter) \
  82   BLOCK_COMMENT("inc_counter " #counter); \
  83   inc_counter_np_(counter);
  84 #endif
  85 
  86   // Call stubs are used to call Java from C
  87   //
  88   // Arguments:
  89   //    c_rarg0:   call wrapper address                   address
  90   //    c_rarg1:   result                                 address
  91   //    c_rarg2:   result type                            BasicType
  92   //    c_rarg3:   method                                 Method*
  93   //    c_rarg4:   (interpreter) entry point              address
  94   //    c_rarg5:   parameters                             intptr_t*
  95   //    c_rarg6:   parameter size (in words)              int
  96   //    c_rarg7:   thread                                 Thread*
  97   //
  98   // There is no return from the stub itself as any Java result
  99   // is written to result
 100   //
 101   // we save r30 (lr) as the return PC at the base of the frame and
 102   // link r29 (fp) below it as the frame pointer installing sp (r31)
 103   // into fp.
 104   //
 105   // we save r0-r7, which accounts for all the C arguments.
 106   //
 107   // TODO: strictly do we need to save them all? they are treated as
 108   // volatile by C so could we omit saving the ones we are going to
 109   // place in global registers (thread? method?) or those we only use
 110   // during setup of the Java call?
 111   //
 112   // we don't need to save r8 which C uses as an indirect result location
 113   // return register.
 114   //
 115   // we don't need to save r9-r15 which both C and Java treat as
 116   // volatile
 117   //
 118   // we don't need to save r16-18 because Java does not use them
 119   //
 120   // we save r19-r28 which Java uses as scratch registers and C
 121   // expects to be callee-save
 122   //
 123   // we save the bottom 64 bits of each value stored in v8-v15; it is
 124   // the responsibility of the caller to preserve larger values.
 125   //
 126   // so the stub frame looks like this when we enter Java code
 127   //
 128   //     [ return_from_Java     ] <--- sp
 129   //     [ argument word n      ]
 130   //      ...
 131   // -27 [ argument word 1      ]
 132   // -26 [ saved v15            ] <--- sp_after_call
 133   // -25 [ saved v14            ]
 134   // -24 [ saved v13            ]
 135   // -23 [ saved v12            ]
 136   // -22 [ saved v11            ]
 137   // -21 [ saved v10            ]
 138   // -20 [ saved v9             ]
 139   // -19 [ saved v8             ]
 140   // -18 [ saved r28            ]
 141   // -17 [ saved r27            ]
 142   // -16 [ saved r26            ]
 143   // -15 [ saved r25            ]
 144   // -14 [ saved r24            ]
 145   // -13 [ saved r23            ]
 146   // -12 [ saved r22            ]
 147   // -11 [ saved r21            ]
 148   // -10 [ saved r20            ]
 149   //  -9 [ saved r19            ]
 150   //  -8 [ call wrapper    (r0) ]
 151   //  -7 [ result          (r1) ]
 152   //  -6 [ result type     (r2) ]
 153   //  -5 [ method          (r3) ]
 154   //  -4 [ entry point     (r4) ]
 155   //  -3 [ parameters      (r5) ]
 156   //  -2 [ parameter size  (r6) ]
 157   //  -1 [ thread (r7)          ]
 158   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 159   //   1 [ saved lr       (r30) ]
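       //
       // For reference, the C++ caller reaches this stub through a function
       // pointer whose signature is roughly the following (the CallStub
       // typedef lives in stubRoutines.hpp; parameter names here are
       // paraphrased):
       //
       //   typedef void (*CallStub)(address   call_wrapper,
       //                            intptr_t* result, BasicType result_type,
       //                            Method*   method, address entry_point,
       //                            intptr_t* parameters, int parameter_words,
       //                            Thread*   thread);
       //
       // which is why c_rarg0..c_rarg7 map one-to-one onto the eight slots
       // saved at offsets -8..-1 above.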
 160 
 161   // Call stub stack layout word offsets from fp
 162   enum call_stub_layout {
 163     sp_after_call_off = -26,
 164 
 165     d15_off            = -26,
 166     d13_off            = -24,
 167     d11_off            = -22,
 168     d9_off             = -20,
 169 
 170     r28_off            = -18,
 171     r26_off            = -16,
 172     r24_off            = -14,
 173     r22_off            = -12,
 174     r20_off            = -10,
 175     call_wrapper_off   =  -8,
 176     result_off         =  -7,
 177     result_type_off    =  -6,
 178     method_off         =  -5,
 179     entry_point_off    =  -4,
 180     parameter_size_off =  -2,
 181     thread_off         =  -1,
 182     fp_f               =   0,
 183     retaddr_off        =   1,
 184   };
 185 
 186   address generate_call_stub(address& return_address) {
 187     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 188            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 189            "adjust this code");
 190 
 191     StubCodeMark mark(this, "StubRoutines", "call_stub");
 192     address start = __ pc();
 193 
 194     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 195 
 196     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 197     const Address result        (rfp, result_off         * wordSize);
 198     const Address result_type   (rfp, result_type_off    * wordSize);
 199     const Address method        (rfp, method_off         * wordSize);
 200     const Address entry_point   (rfp, entry_point_off    * wordSize);
 201     const Address parameter_size(rfp, parameter_size_off * wordSize);
 202 
 203     const Address thread        (rfp, thread_off         * wordSize);
 204 
 205     const Address d15_save      (rfp, d15_off * wordSize);
 206     const Address d13_save      (rfp, d13_off * wordSize);
 207     const Address d11_save      (rfp, d11_off * wordSize);
 208     const Address d9_save       (rfp, d9_off * wordSize);
 209 
 210     const Address r28_save      (rfp, r28_off * wordSize);
 211     const Address r26_save      (rfp, r26_off * wordSize);
 212     const Address r24_save      (rfp, r24_off * wordSize);
 213     const Address r22_save      (rfp, r22_off * wordSize);
 214     const Address r20_save      (rfp, r20_off * wordSize);
 215 
 216     // stub code
 217 
 218     // we need a C prolog to bootstrap the x86 caller into the sim
 219     __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);
 220 
 221     address aarch64_entry = __ pc();
 222 
 223 #ifdef BUILTIN_SIM
 224     // Save sender's SP for stack traces.
 225     __ mov(rscratch1, sp);
 226     __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
 227 #endif
 228     // set up frame and move sp to end of save area
 229     __ enter();
 230     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 231 
 232     // save register parameters and Java scratch/global registers
 233     // n.b. we save thread even though it gets installed in
 234     // rthread because we want to sanity check rthread later
 235     __ str(c_rarg7,  thread);
 236     __ strw(c_rarg6, parameter_size);
 237     __ stp(c_rarg4, c_rarg5,  entry_point);
 238     __ stp(c_rarg2, c_rarg3,  result_type);
 239     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 240 
 241     __ stp(r20, r19,   r20_save);
 242     __ stp(r22, r21,   r22_save);
 243     __ stp(r24, r23,   r24_save);
 244     __ stp(r26, r25,   r26_save);
 245     __ stp(r28, r27,   r28_save);
 246 
 247     __ stpd(v9,  v8,   d9_save);
 248     __ stpd(v11, v10,  d11_save);
 249     __ stpd(v13, v12,  d13_save);
 250     __ stpd(v15, v14,  d15_save);
 251 
 252     // install Java thread in global register now we have saved
 253     // whatever value it held
 254     __ mov(rthread, c_rarg7);
 255     // And method
 256     __ mov(rmethod, c_rarg3);
 257 
 258     // set up the heapbase register
 259     __ reinit_heapbase();
 260 
 261 #ifdef ASSERT
 262     // make sure we have no pending exceptions
 263     {
 264       Label L;
 265       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 266       __ cmp(rscratch1, (unsigned)NULL_WORD);
 267       __ br(Assembler::EQ, L);
 268       __ stop("StubRoutines::call_stub: entered with pending exception");
 269       __ BIND(L);
 270     }
 271 #endif
 272     // pass parameters if any
 273     __ mov(esp, sp);
 274     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 275     __ andr(sp, rscratch1, -2 * wordSize);
 276 
 277     BLOCK_COMMENT("pass parameters if any");
 278     Label parameters_done;
 279     // parameter count is still in c_rarg6
 280     // and parameter pointer identifying param 1 is in c_rarg5
 281     __ cbzw(c_rarg6, parameters_done);
 282 
 283     address loop = __ pc();
 284     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 285     __ subsw(c_rarg6, c_rarg6, 1);
 286     __ push(rscratch1);
 287     __ br(Assembler::GT, loop);
 288 
 289     __ BIND(parameters_done);
 290 
 291     // call Java entry -- passing Method* and current sp
 292     //      rmethod: Method*
 293     //      r13: sender sp
 294     BLOCK_COMMENT("call Java function");
 295     __ mov(r13, sp);
 296     __ blr(c_rarg4);
 297 
 298     // tell the simulator we have returned to the stub
 299 
 300     // we do this here because the notify will already have been done
 301     // if we get to the next instruction via an exception
 302     //
 303     // n.b. adding this instruction here affects the calculation of
 304     // whether or not a routine returns to the call stub (used when
 305     // doing stack walks) since the normal test is to check the return
 306     // pc against the address saved below. so we may need to allow for
 307     // this extra instruction in the check.
 308 
 309     if (NotifySimulator) {
 310       __ notify(Assembler::method_reentry);
 311     }
 312     // save current address for use by exception handling code
 313 
 314     return_address = __ pc();
 315 
 316     // store result depending on type (everything that is not
 317     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 318     // n.b. this assumes Java returns an integral result in r0
 319     // and a floating result in j_farg0
 320     __ ldr(j_rarg2, result);
 321     Label is_long, is_float, is_double, exit;
 322     __ ldr(j_rarg1, result_type);
 323     __ cmp(j_rarg1, T_OBJECT);
 324     __ br(Assembler::EQ, is_long);
 325     __ cmp(j_rarg1, T_LONG);
 326     __ br(Assembler::EQ, is_long);
 327     __ cmp(j_rarg1, T_FLOAT);
 328     __ br(Assembler::EQ, is_float);
 329     __ cmp(j_rarg1, T_DOUBLE);
 330     __ br(Assembler::EQ, is_double);
 331 
 332     // handle T_INT case
 333     __ strw(r0, Address(j_rarg2));
 334 
 335     __ BIND(exit);
 336 
 337     // pop parameters
 338     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 339 
 340 #ifdef ASSERT
 341     // verify that threads correspond
 342     {
 343       Label L, S;
 344       __ ldr(rscratch1, thread);
 345       __ cmp(rthread, rscratch1);
 346       __ br(Assembler::NE, S);
 347       __ get_thread(rscratch1);
 348       __ cmp(rthread, rscratch1);
 349       __ br(Assembler::EQ, L);
 350       __ BIND(S);
 351       __ stop("StubRoutines::call_stub: threads must correspond");
 352       __ BIND(L);
 353     }
 354 #endif
 355 
 356     // restore callee-save registers
 357     __ ldpd(v15, v14,  d15_save);
 358     __ ldpd(v13, v12,  d13_save);
 359     __ ldpd(v11, v10,  d11_save);
 360     __ ldpd(v9,  v8,   d9_save);
 361 
 362     __ ldp(r28, r27,   r28_save);
 363     __ ldp(r26, r25,   r26_save);
 364     __ ldp(r24, r23,   r24_save);
 365     __ ldp(r22, r21,   r22_save);
 366     __ ldp(r20, r19,   r20_save);
 367 
 368     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 369     __ ldrw(c_rarg2, result_type);
 370     __ ldr(c_rarg3,  method);
 371     __ ldp(c_rarg4, c_rarg5,  entry_point);
 372     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 373 
 374 #ifndef PRODUCT
 375     // tell the simulator we are about to end Java execution
 376     if (NotifySimulator) {
 377       __ notify(Assembler::method_exit);
 378     }
 379 #endif
 380     // leave frame and return to caller
 381     __ leave();
 382     __ ret(lr);
 383 
 384     // handle return types different from T_INT
 385 
 386     __ BIND(is_long);
 387     __ str(r0, Address(j_rarg2, 0));
 388     __ br(Assembler::AL, exit);
 389 
 390     __ BIND(is_float);
 391     __ strs(j_farg0, Address(j_rarg2, 0));
 392     __ br(Assembler::AL, exit);
 393 
 394     __ BIND(is_double);
 395     __ strd(j_farg0, Address(j_rarg2, 0));
 396     __ br(Assembler::AL, exit);
 397 
 398     return start;
 399   }
 400 
 401   // Return point for a Java call if there's an exception thrown in
 402   // Java code.  The exception is caught and transformed into a
 403   // pending exception stored in JavaThread that can be tested from
 404   // within the VM.
 405   //
 406   // Note: Usually the parameters are removed by the callee. In case
 407   // of an exception crossing an activation frame boundary, that is
 408   // not the case if the callee is compiled code => need to setup the
 409   // rsp.
 410   //
 411   // r0: exception oop
 412 
 413   // NOTE: this is used as a target from the signal handler so it
 414   // needs an x86 prolog which returns into the current simulator
 415   // executing the generated catch_exception code. so the prolog
 416   // needs to install rax in a sim register and adjust the sim's
 417   // restart pc to enter the generated code at the start position
 418   // then return from native to simulated execution.
 419 
 420   address generate_catch_exception() {
 421     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 422     address start = __ pc();
 423 
 424     // same as in generate_call_stub():
 425     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 426     const Address thread        (rfp, thread_off         * wordSize);
 427 
 428 #ifdef ASSERT
 429     // verify that threads correspond
 430     {
 431       Label L, S;
 432       __ ldr(rscratch1, thread);
 433       __ cmp(rthread, rscratch1);
 434       __ br(Assembler::NE, S);
 435       __ get_thread(rscratch1);
 436       __ cmp(rthread, rscratch1);
 437       __ br(Assembler::EQ, L);
 438       __ bind(S);
 439       __ stop("StubRoutines::catch_exception: threads must correspond");
 440       __ bind(L);
 441     }
 442 #endif
 443 
 444     // set pending exception
 445     __ verify_oop(r0);
 446 
 447     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 448     __ mov(rscratch1, (address)__FILE__);
 449     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 450     __ movw(rscratch1, (int)__LINE__);
 451     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 452 
 453     // complete return to VM
 454     assert(StubRoutines::_call_stub_return_address != NULL,
 455            "_call_stub_return_address must have been generated before");
 456     __ b(StubRoutines::_call_stub_return_address);
 457 
 458     return start;
 459   }
 460 
 461   // Continuation point for runtime calls returning with a pending
 462   // exception.  The pending exception check happened in the runtime
 463   // or native call stub.  The pending exception in Thread is
 464   // converted into a Java-level exception.
 465   //
 466   // Contract with Java-level exception handlers:
 467   // r0: exception
 468   // r3: throwing pc
 469   //
 470   // NOTE: At entry of this stub, exception-pc must be in LR !!
 471 
 472   // NOTE: this is always used as a jump target within generated code
 473   // so it just needs to be generated code with no x86 prolog
 474 
 475   address generate_forward_exception() {
 476     StubCodeMark mark(this, "StubRoutines", "forward exception");
 477     address start = __ pc();
 478 
 479     // Upon entry, LR points to the return address returning into
 480     // Java (interpreted or compiled) code; i.e., the return address
 481     // becomes the throwing pc.
 482     //
 483     // Arguments pushed before the runtime call are still on the stack
 484     // but the exception handler will reset the stack pointer ->
 485     // ignore them.  A potential result in registers can be ignored as
 486     // well.
 487 
 488 #ifdef ASSERT
 489     // make sure this code is only executed if there is a pending exception
 490     {
 491       Label L;
 492       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 493       __ cbnz(rscratch1, L);
 494       __ stop("StubRoutines::forward exception: no pending exception (1)");
 495       __ bind(L);
 496     }
 497 #endif
 498 
 499     // compute exception handler into r19
 500 
 501     // call the VM to find the handler address associated with the
 502     // caller address. pass thread in r0 and caller pc (ret address)
 503     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 504     // the stack.
 505     __ mov(c_rarg1, lr);
 506     // lr will be trashed by the VM call so we move it to R19
 507     // (callee-saved) because we also need to pass it to the handler
 508     // returned by this call.
 509     __ mov(r19, lr);
 510     BLOCK_COMMENT("call exception_handler_for_return_address");
 511     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 512                          SharedRuntime::exception_handler_for_return_address),
 513                     rthread, c_rarg1);
 514     // we should not really care that lr is no longer the callee
 515     // address. we saved the value the handler needs in r19 so we can
 516     // just copy it to r3. however, the C2 handler will push its own
 517     // frame and then call into the VM, and the VM code asserts that
 518     // the PC for the frame above the handler belongs to a compiled
 519     // Java method. So, we restore lr here to satisfy that assert.
 520     __ mov(lr, r19);
 521     // setup r0 & r3 & clear pending exception
 522     __ mov(r3, r19);
 523     __ mov(r19, r0);
 524     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 525     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 526 
 527 #ifdef ASSERT
 528     // make sure exception is set
 529     {
 530       Label L;
 531       __ cbnz(r0, L);
 532       __ stop("StubRoutines::forward exception: no pending exception (2)");
 533       __ bind(L);
 534     }
 535 #endif
 536 
 537     // continue at exception handler
 538     // r0: exception
 539     // r3: throwing pc
 540     // r19: exception handler
 541     __ verify_oop(r0);
 542     __ br(r19);
 543 
 544     return start;
 545   }
 546 
 547   // Non-destructive plausibility checks for oops
 548   //
 549   // Arguments:
 550   //    r0: oop to verify
 551   //    rscratch1: error message
 552   //
 553   // Stack after saving c_rarg3:
 554   //    [tos + 0]: saved c_rarg3
 555   //    [tos + 1]: saved c_rarg2
 556   //    [tos + 2]: saved lr
 557   //    [tos + 3]: saved rscratch2
 558   //    [tos + 4]: saved r0
 559   //    [tos + 5]: saved rscratch1
 560   address generate_verify_oop() {
 561 
 562     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 563     address start = __ pc();
 564 
 565     Label exit, error;
 566 
 567     // save c_rarg2 and c_rarg3
 568     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 569 
 570     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 571     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 572     __ ldr(c_rarg3, Address(c_rarg2));
 573     __ add(c_rarg3, c_rarg3, 1);
 574     __ str(c_rarg3, Address(c_rarg2));
 575 
 576     // object is in r0
 577     // make sure object is 'reasonable'
 578     __ cbz(r0, exit); // if obj is NULL it is OK
 579 
 580     // Check if the oop is in the right area of memory
 581     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 582     __ andr(c_rarg2, r0, c_rarg3);
 583     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 584 
 585     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 586     // instruction here because the flags register is live.
 587     __ eor(c_rarg2, c_rarg2, c_rarg3);
 588     __ cbnz(c_rarg2, error);
 589 
 590     // make sure klass is 'reasonable', i.e. not zero.
 591     __ load_klass(r0, r0);  // get klass
 592     __ cbz(r0, error);      // if klass is NULL it is broken
 593 
 594     // return if everything seems ok
 595     __ bind(exit);
 596 
 597     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 598     __ ret(lr);
 599 
 600     // handle errors
 601     __ bind(error);
 602     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 603 
 604     __ push(RegSet::range(r0, r29), sp);
 605     // debug(char* msg, int64_t pc, int64_t regs[])
 606     __ mov(c_rarg0, rscratch1);      // pass address of error message
 607     __ mov(c_rarg1, lr);             // pass return address
 608     __ mov(c_rarg2, sp);             // pass address of regs on stack
 609 #ifndef PRODUCT
 610     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 611 #endif
 612     BLOCK_COMMENT("call MacroAssembler::debug");
 613     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 614     __ blrt(rscratch1, 3, 0, 1);
 615 
 616     return start;
 617   }
 618 
 619   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 620 
 621   // Generate code for an array write pre barrier
 622   //
 623   //     addr    -  starting address
 624   //     count   -  element count
 625   //     tmp     - scratch register
 626   //
 627   //     Destroy no registers except rscratch1 and rscratch2
 628   //
 629   void  gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
 630     BarrierSet* bs = Universe::heap()->barrier_set();
 631     switch (bs->kind()) {
 632     case BarrierSet::G1SATBCTLogging:
 633       // With G1, don't generate the call if we statically know that the target is uninitialized
 634       if (!dest_uninitialized) {
 635         __ push_call_clobbered_registers();
 636         if (count == c_rarg0) {
 637           if (addr == c_rarg1) {
 638             // exactly backwards!!
 639             __ mov(rscratch1, c_rarg0);
 640             __ mov(c_rarg0, c_rarg1);
 641             __ mov(c_rarg1, rscratch1);
 642           } else {
 643             __ mov(c_rarg1, count);
 644             __ mov(c_rarg0, addr);
 645           }
 646         } else {
 647           __ mov(c_rarg0, addr);
 648           __ mov(c_rarg1, count);
 649         }
 650         __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2);
 651         __ pop_call_clobbered_registers();
 652       }
 653       break;
 654     case BarrierSet::CardTableForRS:
 655     case BarrierSet::CardTableExtension:
 656     case BarrierSet::ModRef:
 657       break;
 658     default:
 659       ShouldNotReachHere();
 660     }
 662   }
 663 
 664   //
 665   // Generate code for an array write post barrier
 666   //
 667   //  Input:
 668   //     start    - register containing starting address of destination array
 669   //     end      - register containing ending address of destination array
 670   //     scratch  - scratch register
 671   //
 672   //  The input registers are overwritten.
 673   //  The ending address is inclusive.
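       //
       //  For example (hypothetical values): for an array of 4 compressed oops
       //  at 0x1000, start == 0x1000 and the inclusive end == 0x100c, and the
       //  G1 case below recovers the element count as
       //  (0x100c + BytesPerHeapOop - 0x1000) >> LogBytesPerHeapOop == 4.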
 674   void gen_write_ref_array_post_barrier(Register start, Register end, Register scratch) {
 675     assert_different_registers(start, end, scratch);
 676     BarrierSet* bs = Universe::heap()->barrier_set();
 677     switch (bs->kind()) {
 678       case BarrierSet::G1SATBCTLogging:
 679 
 680         {
 681           __ push_call_clobbered_registers();
 682           // must compute element count unless barrier set interface is changed (other platforms supply count)
 683           assert_different_registers(start, end, scratch);
 684           __ lea(scratch, Address(end, BytesPerHeapOop));
 685           __ sub(scratch, scratch, start);               // subtract start to get #bytes
 686           __ lsr(scratch, scratch, LogBytesPerHeapOop);  // convert to element count
 687           __ mov(c_rarg0, start);
 688           __ mov(c_rarg1, scratch);
 689           __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
 690           __ pop_call_clobbered_registers();
 691         }
 692         break;
 693       case BarrierSet::CardTableForRS:
 694       case BarrierSet::CardTableExtension:
 695         {
 696           CardTableModRefBS* ct = (CardTableModRefBS*)bs;
 697           assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
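               // For example (hypothetical addresses, with the usual 512-byte
               // cards, i.e. card_shift == 9): oops spanning [0x1000, 0x1238]
               // give start >> 9 == 8 and end >> 9 == 9, so 'end' below becomes
               // 1 and the loop stores to byte_map_base[8] and byte_map_base[9];
               // storing zr marks a card dirty since dirty_card_val() is 0.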
 698 
 699           Label L_loop;
 700 
 701           __ lsr(start, start, CardTableModRefBS::card_shift);
 702           __ lsr(end, end, CardTableModRefBS::card_shift);
 703           __ sub(end, end, start); // number of bytes to copy
 704 
 705           const Register count = end; // 'end' register contains bytes count now
 706           __ load_byte_map_base(scratch);
 707           __ add(start, start, scratch);
 708           if (UseConcMarkSweepGC) {
 709             __ membar(__ StoreStore);
 710           }
 711           __ BIND(L_loop);
 712           __ strb(zr, Address(start, count));
 713           __ subs(count, count, 1);
 714           __ br(Assembler::GE, L_loop);
 715         }
 716         break;
 717       default:
 718         ShouldNotReachHere();
 719 
 720     }
 721   }
 722 
 723   // The inner part of zero_words().  This is the bulk operation,
 724   // zeroing words in blocks, possibly using DC ZVA to do it.  The
 725   // caller is responsible for zeroing the last few words.
 726   //
 727   // Inputs:
 728   // r10: the HeapWord-aligned base address of an array to zero.
 729   // r11: the count in HeapWords, r11 > 0.
 730   //
 731   // Returns r10 and r11, adjusted for the caller to clear.
 732   // r10: the base address of the tail of words left to clear.
 733   // r11: the number of words in the tail.
 734   //      r11 < MacroAssembler::zero_words_block_size.
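       //
       // For example, the expected caller (MacroAssembler::zero_words) entering
       // with r11 == 19 and taking the plain stp loop below gets back r10
       // advanced by 16 words and r11 == 3, and then stores the last 3 words
       // itself.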
 735 
 736   address generate_zero_blocks() {
 737     Label store_pair, loop_store_pair, done;
 738     Label base_aligned;
 739 
 740     Register base = r10, cnt = r11;
 741 
 742     __ align(CodeEntryAlignment);
 743     StubCodeMark mark(this, "StubRoutines", "zero_blocks");
 744     address start = __ pc();
 745 
 746     if (UseBlockZeroing) {
 747       int zva_length = VM_Version::zva_length();
 748 
 749       // Ensure ZVA length can be divided by 16. This is required by
 750       // the subsequent operations.
 751       assert (zva_length % 16 == 0, "Unexpected ZVA Length");
 752 
 753       __ tbz(base, 3, base_aligned);
 754       __ str(zr, Address(__ post(base, 8)));
 755       __ sub(cnt, cnt, 1);
 756       __ bind(base_aligned);
 757 
 758       // Ensure count >= zva_length * 2 so that it still deserves a zva after
 759       // alignment.
 760       Label small;
 761       int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
 762       __ cmp(cnt, low_limit >> 3);
 763       __ br(Assembler::LT, small);
 764       __ zero_dcache_blocks(base, cnt);
 765       __ bind(small);
 766     }
 767 
 768     {
 769       // Number of stp instructions we'll unroll
 770       const int unroll =
 771         MacroAssembler::zero_words_block_size / 2;
 772       // Clear the remaining blocks.
 773       Label loop;
 774       __ subs(cnt, cnt, unroll * 2);
 775       __ br(Assembler::LT, done);
 776       __ bind(loop);
 777       for (int i = 0; i < unroll; i++)
 778         __ stp(zr, zr, __ post(base, 16));
 779       __ subs(cnt, cnt, unroll * 2);
 780       __ br(Assembler::GE, loop);
 781       __ bind(done);
 782       __ add(cnt, cnt, unroll * 2);
 783     }
 784 
 785     __ ret(lr);
 786 
 787     return start;
 788   }
 789 
 790 
 791   typedef enum {
 792     copy_forwards = 1,
 793     copy_backwards = -1
 794   } copy_direction;
 795 
 796   // Bulk copy of blocks of 8 words.
 797   //
 798   // count is a count of words.
 799   //
 800   // Precondition: count >= 8
 801   //
 802   // Postconditions:
 803   //
 804   // The least significant bit of count contains the remaining count
 805   // of words to copy.  The rest of count is trash.
 806   //
 807   // s and d are adjusted to point to the remaining words to copy
 808   //
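       // For example, count == 23 copies 16 words in blocks of 8, then a
       // 4-word and a 2-word tail, and returns with bit 0 of count set so the
       // caller moves the final odd word.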
 809   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 810                            copy_direction direction) {
 811     int unit = wordSize * direction;
 812     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 813 
 814     int offset;
 815     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 816       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 817     const Register stride = r13;
 818 
 819     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 820     assert_different_registers(s, d, count, rscratch1);
 821 
 822     Label again, drain;
 823     const char *stub_name;
 824     if (direction == copy_forwards)
 825       stub_name = "forward_copy_longs";
 826     else
 827       stub_name = "backward_copy_longs";
 828     StubCodeMark mark(this, "StubRoutines", stub_name);
 829     __ align(CodeEntryAlignment);
 830     __ bind(start);
 831 
 832     Label unaligned_copy_long;
 833     if (AvoidUnalignedAccesses) {
 834       __ tbnz(d, 3, unaligned_copy_long);
 835     }
 836 
 837     if (direction == copy_forwards) {
 838       __ sub(s, s, bias);
 839       __ sub(d, d, bias);
 840     }
 841 
 842 #ifdef ASSERT
 843     // Make sure we are never given < 8 words
 844     {
 845       Label L;
 846       __ cmp(count, 8);
 847       __ br(Assembler::GE, L);
 848       __ stop("genrate_copy_longs called with < 8 words");
 849       __ bind(L);
 850     }
 851 #endif
 852 
 853     // Fill 8 registers
 854     if (UseSIMDForMemoryOps) {
 855       __ ldpq(v0, v1, Address(s, 4 * unit));
 856       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 857     } else {
 858       __ ldp(t0, t1, Address(s, 2 * unit));
 859       __ ldp(t2, t3, Address(s, 4 * unit));
 860       __ ldp(t4, t5, Address(s, 6 * unit));
 861       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 862     }
 863 
 864     __ subs(count, count, 16);
 865     __ br(Assembler::LO, drain);
 866 
 867     int prefetch = PrefetchCopyIntervalInBytes;
 868     bool use_stride = false;
 869     if (direction == copy_backwards) {
 870        use_stride = prefetch > 256;
 871        prefetch = -prefetch;
 872        if (use_stride) __ mov(stride, prefetch);
 873     }
 874 
 875     __ bind(again);
 876 
 877     if (PrefetchCopyIntervalInBytes > 0)
 878       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 879 
 880     if (UseSIMDForMemoryOps) {
 881       __ stpq(v0, v1, Address(d, 4 * unit));
 882       __ ldpq(v0, v1, Address(s, 4 * unit));
 883       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 884       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 885     } else {
 886       __ stp(t0, t1, Address(d, 2 * unit));
 887       __ ldp(t0, t1, Address(s, 2 * unit));
 888       __ stp(t2, t3, Address(d, 4 * unit));
 889       __ ldp(t2, t3, Address(s, 4 * unit));
 890       __ stp(t4, t5, Address(d, 6 * unit));
 891       __ ldp(t4, t5, Address(s, 6 * unit));
 892       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 893       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 894     }
 895 
 896     __ subs(count, count, 8);
 897     __ br(Assembler::HS, again);
 898 
 899     // Drain
 900     __ bind(drain);
 901     if (UseSIMDForMemoryOps) {
 902       __ stpq(v0, v1, Address(d, 4 * unit));
 903       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 904     } else {
 905       __ stp(t0, t1, Address(d, 2 * unit));
 906       __ stp(t2, t3, Address(d, 4 * unit));
 907       __ stp(t4, t5, Address(d, 6 * unit));
 908       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 909     }
 910 
 911     {
 912       Label L1, L2;
 913       __ tbz(count, exact_log2(4), L1);
 914       if (UseSIMDForMemoryOps) {
 915         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 916         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 917       } else {
 918         __ ldp(t0, t1, Address(s, 2 * unit));
 919         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 920         __ stp(t0, t1, Address(d, 2 * unit));
 921         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 922       }
 923       __ bind(L1);
 924 
 925       if (direction == copy_forwards) {
 926         __ add(s, s, bias);
 927         __ add(d, d, bias);
 928       }
 929 
 930       __ tbz(count, 1, L2);
 931       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 932       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 933       __ bind(L2);
 934     }
 935 
 936     __ ret(lr);
 937 
 938     if (AvoidUnalignedAccesses) {
 939       Label drain, again;
 940       // Register order for storing. Order is different for backward copy.
 941 
 942       __ bind(unaligned_copy_long);
 943 
 944       // source address is even (16-byte) aligned, target is odd word aligned
 945       //
 946       // when forward copying word pairs we read long pairs at offsets
 947       // {0, 2, 4, 6} (in long words). when backwards copying we read
 948       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
 949       // address by -2 in the forwards case so we can compute the
 950       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
 951       // or -1.
 952       //
 953       // when forward copying we need to store 1 word, 3 pairs and
 954       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
 955       // zero offset we adjust the destination by -1 which means we
 956       // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
 957       //
 958       // When backwards copying we need to store 1 word, 3 pairs and
 959       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
 960       // offsets {1, 3, 5, 7, 8} * unit.
 961 
 962       if (direction == copy_forwards) {
 963         __ sub(s, s, 16);
 964         __ sub(d, d, 8);
 965       }
 966 
 967       // Fill 8 registers
 968       //
 969       // for forwards copy s was offset by -16 from the original input
 970       // value of s so the register contents are at these offsets
 971       // relative to the 64 byte block addressed by that original input
 972       // and so on for each successive 64 byte block when s is updated
 973       //
 974       // t0 at offset 0,  t1 at offset 8
 975       // t2 at offset 16, t3 at offset 24
 976       // t4 at offset 32, t5 at offset 40
 977       // t6 at offset 48, t7 at offset 56
 978 
 979       // for backwards copy s was not offset so the register contents
 980       // are at these offsets into the preceding 64 byte block
 981       // relative to that original input and so on for each successive
 982       // preceding 64 byte block when s is updated. this explains the
 983       // slightly counter-intuitive looking pattern of register usage
 984       // in the stp instructions for backwards copy.
 985       //
 986       // t0 at offset -16, t1 at offset -8
 987       // t2 at offset -32, t3 at offset -24
 988       // t4 at offset -48, t5 at offset -40
 989       // t6 at offset -64, t7 at offset -56
 990 
 991       __ ldp(t0, t1, Address(s, 2 * unit));
 992       __ ldp(t2, t3, Address(s, 4 * unit));
 993       __ ldp(t4, t5, Address(s, 6 * unit));
 994       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 995 
 996       __ subs(count, count, 16);
 997       __ br(Assembler::LO, drain);
 998 
 999       int prefetch = PrefetchCopyIntervalInBytes;
1000       bool use_stride = false;
1001       if (direction == copy_backwards) {
1002          use_stride = prefetch > 256;
1003          prefetch = -prefetch;
1004          if (use_stride) __ mov(stride, prefetch);
1005       }
1006 
1007       __ bind(again);
1008 
1009       if (PrefetchCopyIntervalInBytes > 0)
1010         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
1011 
1012       if (direction == copy_forwards) {
1013        // allowing for the offset of -8 the store instructions place
1014        // registers into the target 64 byte block at the following
1015        // offsets
1016        //
1017        // t0 at offset 0
1018        // t1 at offset 8,  t2 at offset 16
1019        // t3 at offset 24, t4 at offset 32
1020        // t5 at offset 40, t6 at offset 48
1021        // t7 at offset 56
1022 
1023         __ str(t0, Address(d, 1 * unit));
1024         __ stp(t1, t2, Address(d, 2 * unit));
1025         __ ldp(t0, t1, Address(s, 2 * unit));
1026         __ stp(t3, t4, Address(d, 4 * unit));
1027         __ ldp(t2, t3, Address(s, 4 * unit));
1028         __ stp(t5, t6, Address(d, 6 * unit));
1029         __ ldp(t4, t5, Address(s, 6 * unit));
1030         __ str(t7, Address(__ pre(d, 8 * unit)));
1031         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
1032       } else {
1033        // d was not offset when we started so the registers are
1034        // written into the 64 byte block preceding d with the following
1035        // offsets
1036        //
1037        // t1 at offset -8
1038        // t3 at offset -24, t0 at offset -16
1039        // t5 at offset -40, t2 at offset -32
1040        // t7 at offset -56, t4 at offset -48
1041        //                   t6 at offset -64
1042        //
1043        // note that this matches the offsets previously noted for the
1044        // loads
1045 
1046         __ str(t1, Address(d, 1 * unit));
1047         __ stp(t3, t0, Address(d, 3 * unit));
1048         __ ldp(t0, t1, Address(s, 2 * unit));
1049         __ stp(t5, t2, Address(d, 5 * unit));
1050         __ ldp(t2, t3, Address(s, 4 * unit));
1051         __ stp(t7, t4, Address(d, 7 * unit));
1052         __ ldp(t4, t5, Address(s, 6 * unit));
1053         __ str(t6, Address(__ pre(d, 8 * unit)));
1054         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
1055       }
1056 
1057       __ subs(count, count, 8);
1058       __ br(Assembler::HS, again);
1059 
1060       // Drain
1061       //
1062       // this uses the same pattern of offsets and register arguments
1063       // as above
1064       __ bind(drain);
1065       if (direction == copy_forwards) {
1066         __ str(t0, Address(d, 1 * unit));
1067         __ stp(t1, t2, Address(d, 2 * unit));
1068         __ stp(t3, t4, Address(d, 4 * unit));
1069         __ stp(t5, t6, Address(d, 6 * unit));
1070         __ str(t7, Address(__ pre(d, 8 * unit)));
1071       } else {
1072         __ str(t1, Address(d, 1 * unit));
1073         __ stp(t3, t0, Address(d, 3 * unit));
1074         __ stp(t5, t2, Address(d, 5 * unit));
1075         __ stp(t7, t4, Address(d, 7 * unit));
1076         __ str(t6, Address(__ pre(d, 8 * unit)));
1077       }
1078       // now we need to copy any remaining part block which may
1079       // include a 4 word subblock and/or a 2 word subblock.
1080       // bits 2 and 1 in the count are the tell-tale for whether we
1081       // have each such subblock
1082       {
1083         Label L1, L2;
1084         __ tbz(count, exact_log2(4), L1);
1085        // this is the same as above but copying only 4 longs hence
1086        // with only one intervening stp between the str instructions
1087        // but note that the offsets and registers still follow the
1088        // same pattern
1089         __ ldp(t0, t1, Address(s, 2 * unit));
1090         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
1091         if (direction == copy_forwards) {
1092           __ str(t0, Address(d, 1 * unit));
1093           __ stp(t1, t2, Address(d, 2 * unit));
1094           __ str(t3, Address(__ pre(d, 4 * unit)));
1095         } else {
1096           __ str(t1, Address(d, 1 * unit));
1097           __ stp(t3, t0, Address(d, 3 * unit));
1098           __ str(t2, Address(__ pre(d, 4 * unit)));
1099         }
1100         __ bind(L1);
1101 
1102         __ tbz(count, 1, L2);
1103        // this is the same as above but copying only 2 longs hence
1104        // there is no intervening stp between the str instructions
1105        // but note that the offset and register patterns are still
1106        // the same
1107         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
1108         if (direction == copy_forwards) {
1109           __ str(t0, Address(d, 1 * unit));
1110           __ str(t1, Address(__ pre(d, 2 * unit)));
1111         } else {
1112           __ str(t1, Address(d, 1 * unit));
1113           __ str(t0, Address(__ pre(d, 2 * unit)));
1114         }
1115         __ bind(L2);
1116 
1117        // for forwards copy we need to re-adjust the offsets we
1118        // applied so that s and d follow the last words written
1119 
1120        if (direction == copy_forwards) {
1121          __ add(s, s, 16);
1122          __ add(d, d, 8);
1123        }
1124 
1125       }
1126 
1127       __ ret(lr);
1128       }
1129   }
1130 
1131   // Small copy: less than 16 bytes.
1132   //
1133   // NB: Ignores all of the bits of count which represent more than 15
1134   // bytes, so a caller doesn't have to mask them.
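       //
       // For example, a byte copy (step == +/-1) tests bits 3..0 of count and
       // emits an 8-, 4-, 2- and 1-byte move for each set bit, so count == 13
       // (0b1101) moves 8 + 4 + 1 bytes; bits 4 and above are never examined.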
1135 
1136   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1137     bool is_backwards = step < 0;
1138     size_t granularity = uabs(step);
1139     int direction = is_backwards ? -1 : 1;
1140     int unit = wordSize * direction;
1141 
1142     Label Lpair, Lword, Lint, Lshort, Lbyte;
1143 
1144     assert(granularity
1145            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1146 
1147     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1148 
1149     // ??? I don't know if this bit-test-and-branch is the right thing
1150     // to do.  It does a lot of jumping, resulting in several
1151     // mispredicted branches.  It might make more sense to do this
1152     // with something like Duff's device with a single computed branch.
1153 
1154     __ tbz(count, 3 - exact_log2(granularity), Lword);
1155     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1156     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1157     __ bind(Lword);
1158 
1159     if (granularity <= sizeof (jint)) {
1160       __ tbz(count, 2 - exact_log2(granularity), Lint);
1161       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1162       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1163       __ bind(Lint);
1164     }
1165 
1166     if (granularity <= sizeof (jshort)) {
1167       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1168       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1169       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1170       __ bind(Lshort);
1171     }
1172 
1173     if (granularity <= sizeof (jbyte)) {
1174       __ tbz(count, 0, Lbyte);
1175       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1176       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1177       __ bind(Lbyte);
1178     }
1179   }
1180 
1181   Label copy_f, copy_b;
1182 
1183   // All-singing all-dancing memory copy.
1184   //
1185   // Copy count units of memory from s to d.  The size of a unit is
1186   // step, which can be positive or negative depending on the direction
1187   // of copy.  If is_aligned is false, we align the source address.
1188   //
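       // For example, a forward byte copy (step == 1) of 200 bytes skips the
       // inline cases below (more than 96/80 bytes), aligns s to a 16-byte
       // boundary via copy_memory_small, does the bulk 8-word blocks with a
       // call to copy_f, and finishes the remaining tail bytes with
       // copy_memory_small again.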
1189 
1190   void copy_memory(bool is_aligned, Register s, Register d,
1191                    Register count, Register tmp, int step) {
1192     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1193     bool is_backwards = step < 0;
1194     int granularity = uabs(step);
1195     const Register t0 = r3, t1 = r4;
1196 
1197     // <= 96 bytes do inline. Direction doesn't matter because we always
1198     // load all the data before writing anything
1199     Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
1200     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1201     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1202     const Register send = r17, dend = r18;
1203 
1204     if (PrefetchCopyIntervalInBytes > 0)
1205       __ prfm(Address(s, 0), PLDL1KEEP);
1206     __ cmp(count, (UseSIMDForMemoryOps ? 96:80)/granularity);
1207     __ br(Assembler::HI, copy_big);
1208 
1209     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1210     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1211 
1212     __ cmp(count, 16/granularity);
1213     __ br(Assembler::LS, copy16);
1214 
1215     __ cmp(count, 64/granularity);
1216     __ br(Assembler::HI, copy80);
1217 
1218     __ cmp(count, 32/granularity);
1219     __ br(Assembler::LS, copy32);
1220 
1221     // 33..64 bytes
1222     if (UseSIMDForMemoryOps) {
1223       __ ldpq(v0, v1, Address(s, 0));
1224       __ ldpq(v2, v3, Address(send, -32));
1225       __ stpq(v0, v1, Address(d, 0));
1226       __ stpq(v2, v3, Address(dend, -32));
1227     } else {
1228       __ ldp(t0, t1, Address(s, 0));
1229       __ ldp(t2, t3, Address(s, 16));
1230       __ ldp(t4, t5, Address(send, -32));
1231       __ ldp(t6, t7, Address(send, -16));
1232 
1233       __ stp(t0, t1, Address(d, 0));
1234       __ stp(t2, t3, Address(d, 16));
1235       __ stp(t4, t5, Address(dend, -32));
1236       __ stp(t6, t7, Address(dend, -16));
1237     }
1238     __ b(finish);
1239 
1240     // 17..32 bytes
1241     __ bind(copy32);
1242     __ ldp(t0, t1, Address(s, 0));
1243     __ ldp(t2, t3, Address(send, -16));
1244     __ stp(t0, t1, Address(d, 0));
1245     __ stp(t2, t3, Address(dend, -16));
1246     __ b(finish);
1247 
1248     // 65..80/96 bytes
1249     // (96 bytes if SIMD because we do 32 bytes per instruction)
1250     __ bind(copy80);
1251     if (UseSIMDForMemoryOps) {
1252       __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
1253       __ ldpq(v4, v5, Address(send, -32));
1254       __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
1255       __ stpq(v4, v5, Address(dend, -32));
1256     } else {
1257       __ ldp(t0, t1, Address(s, 0));
1258       __ ldp(t2, t3, Address(s, 16));
1259       __ ldp(t4, t5, Address(s, 32));
1260       __ ldp(t6, t7, Address(s, 48));
1261       __ ldp(t8, t9, Address(send, -16));
1262 
1263       __ stp(t0, t1, Address(d, 0));
1264       __ stp(t2, t3, Address(d, 16));
1265       __ stp(t4, t5, Address(d, 32));
1266       __ stp(t6, t7, Address(d, 48));
1267       __ stp(t8, t9, Address(dend, -16));
1268     }
1269     __ b(finish);
1270 
1271     // 0..16 bytes
1272     __ bind(copy16);
1273     __ cmp(count, 8/granularity);
1274     __ br(Assembler::LO, copy8);
1275 
1276     // 8..16 bytes
1277     __ ldr(t0, Address(s, 0));
1278     __ ldr(t1, Address(send, -8));
1279     __ str(t0, Address(d, 0));
1280     __ str(t1, Address(dend, -8));
1281     __ b(finish);
1282 
1283     if (granularity < 8) {
1284       // 4..7 bytes
1285       __ bind(copy8);
1286       __ tbz(count, 2 - exact_log2(granularity), copy4);
1287       __ ldrw(t0, Address(s, 0));
1288       __ ldrw(t1, Address(send, -4));
1289       __ strw(t0, Address(d, 0));
1290       __ strw(t1, Address(dend, -4));
1291       __ b(finish);
1292       if (granularity < 4) {
1293         // 0..3 bytes
1294         __ bind(copy4);
1295         __ cbz(count, finish); // get rid of 0 case
1296         if (granularity == 2) {
1297           __ ldrh(t0, Address(s, 0));
1298           __ strh(t0, Address(d, 0));
1299         } else { // granularity == 1
1300           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1301           // the first and last byte.
1302           // Handle the 3 byte case by loading and storing base + count/2
1303           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1304           // This does mean in the 1 byte case we load/store the same
1305           // byte 3 times.
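               // For example, count == 3: count >> 1 == 1, so the stores below
               // move s[0] -> d[0], s[2] -> d[2] (via send/dend - 1) and
               // s[1] -> d[1]; with count == 1 all three loads and stores hit
               // the same byte.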
1306           __ lsr(count, count, 1);
1307           __ ldrb(t0, Address(s, 0));
1308           __ ldrb(t1, Address(send, -1));
1309           __ ldrb(t2, Address(s, count));
1310           __ strb(t0, Address(d, 0));
1311           __ strb(t1, Address(dend, -1));
1312           __ strb(t2, Address(d, count));
1313         }
1314         __ b(finish);
1315       }
1316     }
1317 
1318     __ bind(copy_big);
1319     if (is_backwards) {
1320       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1321       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1322     }
1323 
1324     // Now we've got the small case out of the way we can align the
1325     // source address on a 2-word boundary.
1326 
1327     Label aligned;
1328 
1329     if (is_aligned) {
1330       // We may have to adjust by 1 word to get s 2-word-aligned.
1331       __ tbz(s, exact_log2(wordSize), aligned);
1332       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1333       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1334       __ sub(count, count, wordSize/granularity);
1335     } else {
1336       if (is_backwards) {
1337         __ andr(rscratch2, s, 2 * wordSize - 1);
1338       } else {
1339         __ neg(rscratch2, s);
1340         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1341       }
1342       // rscratch2 is the byte adjustment needed to align s.
1343       __ cbz(rscratch2, aligned);
1344       int shift = exact_log2(granularity);
1345       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1346       __ sub(count, count, rscratch2);
1347 
1348 #if 0
1349       // ?? This code is only correct for a disjoint copy.  It may or
1350       // may not make sense to use it in that case.
1351 
1352       // Copy the first pair; s and d may not be aligned.
1353       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1354       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1355 
1356       // Align s and d, adjust count
1357       if (is_backwards) {
1358         __ sub(s, s, rscratch2);
1359         __ sub(d, d, rscratch2);
1360       } else {
1361         __ add(s, s, rscratch2);
1362         __ add(d, d, rscratch2);
1363       }
1364 #else
1365       copy_memory_small(s, d, rscratch2, rscratch1, step);
1366 #endif
1367     }
1368 
1369     __ bind(aligned);
1370 
1371     // s is now 2-word-aligned.
1372 
1373     // We have a count of units and some trailing bytes.  Adjust the
1374     // count and do a bulk copy of words.
1375     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1376     if (direction == copy_forwards)
1377       __ bl(copy_f);
1378     else
1379       __ bl(copy_b);
1380 
1381     // And the tail.
1382     copy_memory_small(s, d, count, tmp, step);
1383 
1384     if (granularity >= 8) __ bind(copy8);
1385     if (granularity >= 4) __ bind(copy4);
1386     __ bind(finish);
1387   }
1388 
1389 
1390   void clobber_registers() {
1391 #ifdef ASSERT
1392     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1393     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1394     for (Register r = r3; r <= r18; r++)
1395       if (r != rscratch1) __ mov(r, rscratch1);
1396 #endif
1397   }
1398 
1399   // Scan over array at a for count oops, verifying each one.
1400   // Preserves a and count, clobbers rscratch1 and rscratch2.
1401   void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1402     Label loop, end;
1403     __ mov(rscratch1, a);
1404     __ mov(rscratch2, zr);
1405     __ bind(loop);
1406     __ cmp(rscratch2, count);
1407     __ br(Assembler::HS, end);
1408     if (size == (size_t)wordSize) {
1409       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1410       __ verify_oop(temp);
1411     } else {
1412       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1413       __ decode_heap_oop(temp); // calls verify_oop
1414     }
1415     __ add(rscratch2, rscratch2, size);
1416     __ b(loop);
1417     __ bind(end);
1418   }
1419 
1420   // Arguments:
1421   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1422   //             ignored
1423   //   is_oop  - true => oop array, so generate store check code
1424   //   name    - stub name string
1425   //
1426   // Inputs:
1427   //   c_rarg0   - source array address
1428   //   c_rarg1   - destination array address
1429   //   c_rarg2   - element count, treated as ssize_t, can be zero
1430   //
1431   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1432   // the hardware handle it.  The two dwords within qwords that span
1433   // cache line boundaries will still be loaded and stored atomically.
1434   //
1435   // Side Effects:
1436   //   disjoint_int_copy_entry is set to the no-overlap entry point
1437   //   used by generate_conjoint_int_oop_copy().
1438   //
1439   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
1440                                   const char *name, bool dest_uninitialized = false) {
1441     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1442     __ align(CodeEntryAlignment);
1443     StubCodeMark mark(this, "StubRoutines", name);
1444     address start = __ pc();
1445     __ enter();
1446 
1447     if (entry != NULL) {
1448       *entry = __ pc();
1449       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1450       BLOCK_COMMENT("Entry:");
1451     }
1452 
1453     if (is_oop) {
1454       __ push(RegSet::of(d, count), sp);
1455       // no registers are destroyed by this call
1456       gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
1457     }
1458     copy_memory(aligned, s, d, count, rscratch1, size);
1459     if (is_oop) {
1460       __ pop(RegSet::of(d, count), sp);
1461       if (VerifyOops)
1462         verify_oop_array(size, d, count, r16);
1463       __ sub(count, count, 1); // make an inclusive end pointer
1464       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1465       gen_write_ref_array_post_barrier(d, count, rscratch1);
1466     }
1467     __ leave();
1468     __ mov(r0, zr); // return 0
1469     __ ret(lr);
1470 #ifdef BUILTIN_SIM
1471     {
1472       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1473       sim->notifyCompile(const_cast<char*>(name), start);
1474     }
1475 #endif
1476     return start;
1477   }
1478 
1479   // Arguments:
1480   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1481   //             ignored
1482   //   is_oop  - true => oop array, so generate store check code
1483   //   name    - stub name string
1484   //
1485   // Inputs:
1486   //   c_rarg0   - source array address
1487   //   c_rarg1   - destination array address
1488   //   c_rarg2   - element count, treated as ssize_t, can be zero
1489   //
1490   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1491   // the hardware handle it.  The two dwords within qwords that span
1492   // cache line boundaries will still be loaded and stored atomically.
1493   //
1494   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1495                                  address *entry, const char *name,
1496                                  bool dest_uninitialized = false) {
1497     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1498 
1499     StubCodeMark mark(this, "StubRoutines", name);
1500     address start = __ pc();
1501     __ enter();
1502 
1503     if (entry != NULL) {
1504       *entry = __ pc();
1505       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1506       BLOCK_COMMENT("Entry:");
1507     }
1508 
1509     // use fwd copy when (d-s) above_equal (count*size)
1510     __ sub(rscratch1, d, s);
1511     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1512     __ br(Assembler::HS, nooverlap_target);
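         // In C terms, the test above is roughly (a sketch, not generated code):
         //   if ((uint64_t)(d - s) >= ((uint64_t)count << exact_log2(size)))
         //     goto nooverlap_target;  // no destructive overlap, forward copy is safe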
1513 
1514     if (is_oop) {
1515       __ push(RegSet::of(d, count), sp);
1516       // no registers are destroyed by this call
1517       gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
1518     }
1519     copy_memory(aligned, s, d, count, rscratch1, -size);
1520     if (is_oop) {
1521       __ pop(RegSet::of(d, count), sp);
1522       if (VerifyOops)
1523         verify_oop_array(size, d, count, r16);
1524       __ sub(count, count, 1); // make an inclusive end pointer
1525       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1526       gen_write_ref_array_post_barrier(d, count, rscratch1);
1527     }
1528     __ leave();
1529     __ mov(r0, zr); // return 0
1530     __ ret(lr);
1531 #ifdef BUILTIN_SIM
1532     {
1533       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1534       sim->notifyCompile(const_cast<char*>(name), start);
1535     }
1536 #endif
1537     return start;
1538   }
1539 
1540   // Arguments:
1541   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1542   //             ignored
1543   //   name    - stub name string
1544   //
1545   // Inputs:
1546   //   c_rarg0   - source array address
1547   //   c_rarg1   - destination array address
1548   //   c_rarg2   - element count, treated as ssize_t, can be zero
1549   //
1550   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1551   // we let the hardware handle it.  The one to eight bytes within words,
1552   // dwords or qwords that span cache line boundaries will still be loaded
1553   // and stored atomically.
1554   //
1555   // Side Effects:
1563   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1564   //   used by generate_conjoint_byte_copy().
1565   //
1566   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1567     const bool not_oop = false;
1568     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1569   }
1570 
1571   // Arguments:
1572   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1573   //             ignored
1574   //   name    - stub name string
1575   //
1576   // Inputs:
1577   //   c_rarg0   - source array address
1578   //   c_rarg1   - destination array address
1579   //   c_rarg2   - element count, treated as ssize_t, can be zero
1580   //
1581   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1582   // we let the hardware handle it.  The one to eight bytes within words,
1583   // dwords or qwords that span cache line boundaries will still be loaded
1584   // and stored atomically.
1585   //
1586   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1587                                       address* entry, const char *name) {
1588     const bool not_oop = false;
1589     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1590   }
1591 
1592   // Arguments:
1593   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1594   //             ignored
1595   //   name    - stub name string
1596   //
1597   // Inputs:
1598   //   c_rarg0   - source array address
1599   //   c_rarg1   - destination array address
1600   //   c_rarg2   - element count, treated as ssize_t, can be zero
1601   //
1602   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1603   // let the hardware handle it.  The two or four words within dwords
1604   // or qwords that span cache line boundaries will still be loaded
1605   // and stored atomically.
1606   //
1607   // Side Effects:
1608   //   disjoint_short_copy_entry is set to the no-overlap entry point
1609   //   used by generate_conjoint_short_copy().
1610   //
1611   address generate_disjoint_short_copy(bool aligned,
1612                                        address* entry, const char *name) {
1613     const bool not_oop = false;
1614     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1615   }
1616 
1617   // Arguments:
1618   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1619   //             ignored
1620   //   name    - stub name string
1621   //
1622   // Inputs:
1623   //   c_rarg0   - source array address
1624   //   c_rarg1   - destination array address
1625   //   c_rarg2   - element count, treated as ssize_t, can be zero
1626   //
1627   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1628   // let the hardware handle it.  The two or four words within dwords
1629   // or qwords that span cache line boundaries will still be loaded
1630   // and stored atomically.
1631   //
1632   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1633                                        address *entry, const char *name) {
1634     const bool not_oop = false;
1635     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1636   }
1637 
1638   // Arguments:
1639   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1640   //             ignored
1641   //   name    - stub name string
1642   //
1643   // Inputs:
1644   //   c_rarg0   - source array address
1645   //   c_rarg1   - destination array address
1646   //   c_rarg2   - element count, treated as ssize_t, can be zero
1647   //
1648   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1649   // the hardware handle it.  The two dwords within qwords that span
1650   // cache line boundaries will still be loaded and stored atomically.
1651   //
1652   // Side Effects:
1653   //   disjoint_int_copy_entry is set to the no-overlap entry point
1654   //   used by generate_conjoint_int_copy().
1655   //
1656   address generate_disjoint_int_copy(bool aligned, address *entry,
1657                                          const char *name, bool dest_uninitialized = false) {
1658     const bool not_oop = false;
1659     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1660   }
1661 
1662   // Arguments:
1663   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1664   //             ignored
1665   //   name    - stub name string
1666   //
1667   // Inputs:
1668   //   c_rarg0   - source array address
1669   //   c_rarg1   - destination array address
1670   //   c_rarg2   - element count, treated as ssize_t, can be zero
1671   //
1672   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1673   // the hardware handle it.  The two dwords within qwords that span
1674   // cache line boundaries will still be loaded and stored atomically.
1675   //
1676   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1677                                      address *entry, const char *name,
1678                                      bool dest_uninitialized = false) {
1679     const bool not_oop = false;
1680     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1681   }
1682 
1683 
1684   // Arguments:
1685   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1686   //             ignored
1687   //   name    - stub name string
1688   //
1689   // Inputs:
1690   //   c_rarg0   - source array address
1691   //   c_rarg1   - destination array address
1692   //   c_rarg2   - element count, treated as size_t, can be zero
1693   //
1694   // Side Effects:
1695   //   *entry is set to the no-overlap entry point used by
1696   //   generate_conjoint_long_copy().
1697   //
1698   address generate_disjoint_long_copy(bool aligned, address *entry,
1699                                           const char *name, bool dest_uninitialized = false) {
1700     const bool not_oop = false;
1701     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1702   }
1703 
1704   // Arguments:
1705   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1706   //             ignored
1707   //   name    - stub name string
1708   //
1709   // Inputs:
1710   //   c_rarg0   - source array address
1711   //   c_rarg1   - destination array address
1712   //   c_rarg2   - element count, treated as size_t, can be zero
1713   //
1714   address generate_conjoint_long_copy(bool aligned,
1715                                       address nooverlap_target, address *entry,
1716                                       const char *name, bool dest_uninitialized = false) {
1717     const bool not_oop = false;
1718     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1719   }
1720 
1721   // Arguments:
1722   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1723   //             ignored
1724   //   name    - stub name string
1725   //
1726   // Inputs:
1727   //   c_rarg0   - source array address
1728   //   c_rarg1   - destination array address
1729   //   c_rarg2   - element count, treated as size_t, can be zero
1730   //
1731   // Side Effects:
1732   //   *entry is set to the no-overlap entry point used by
1733   //   generate_conjoint_oop_copy().
1734   //
1735   address generate_disjoint_oop_copy(bool aligned, address *entry,
1736                                      const char *name, bool dest_uninitialized) {
1737     const bool is_oop = true;
1738     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1739     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1740   }
1741 
1742   // Arguments:
1743   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1744   //             ignored
1745   //   name    - stub name string
1746   //
1747   // Inputs:
1748   //   c_rarg0   - source array address
1749   //   c_rarg1   - destination array address
1750   //   c_rarg2   - element count, treated as size_t, can be zero
1751   //
1752   address generate_conjoint_oop_copy(bool aligned,
1753                                      address nooverlap_target, address *entry,
1754                                      const char *name, bool dest_uninitialized) {
1755     const bool is_oop = true;
1756     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1757     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1758                                   name, dest_uninitialized);
1759   }
1760 
1761 
1762   // Helper for generating a dynamic type check.
1763   // Smashes rscratch1.
1764   void generate_type_check(Register sub_klass,
1765                            Register super_check_offset,
1766                            Register super_klass,
1767                            Label& L_success) {
1768     assert_different_registers(sub_klass, super_check_offset, super_klass);
1769 
1770     BLOCK_COMMENT("type_check:");
1771 
1772     Label L_miss;
1773 
1774     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1775                                      super_check_offset);
1776     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1777 
1778     // Fall through on failure!
1779     __ BIND(L_miss);
1780   }
1781 
1782   //
1783   //  Generate checkcasting array copy stub
1784   //
1785   //  Input:
1786   //    c_rarg0   - source array address
1787   //    c_rarg1   - destination array address
1788   //    c_rarg2   - element count, treated as ssize_t, can be zero
1789   //    c_rarg3   - size_t ckoff (super_check_offset)
1790   //    c_rarg4   - oop ckval (super_klass)
1791   //
1792   //  Output:
1793   //    r0 ==  0  -  success
1794   //    r0 == -1^K - failure, where K is partial transfer count
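       //  (-1^K denotes -1 XOR K, i.e. the bitwise complement ~K, so the
       //   caller recovers the partial transfer count as ~r0; see the 'eon'
       //   instruction below.)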
1795   //
1796   address generate_checkcast_copy(const char *name, address *entry,
1797                                   bool dest_uninitialized = false) {
1798 
1799     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1800 
1801     // Input registers (after setup_arg_regs)
1802     const Register from        = c_rarg0;   // source array address
1803     const Register to          = c_rarg1;   // destination array address
1804     const Register count       = c_rarg2;   // elements count
1805     const Register ckoff       = c_rarg3;   // super_check_offset
1806     const Register ckval       = c_rarg4;   // super_klass
1807 
1808     // Registers used as temps (r18, r19, r20 are save-on-entry)
1809     const Register count_save  = r21;       // original elements count
1810     const Register start_to    = r20;       // destination array start address
1811     const Register copied_oop  = r18;       // actual oop copied
1812     const Register r19_klass   = r19;       // oop._klass
1813 
1814     //---------------------------------------------------------------
1815     // Assembler stub will be used for this call to arraycopy
1816     // if the two arrays are subtypes of Object[] but the
1817     // destination array type is not equal to or a supertype
1818     // of the source type.  Each element must be separately
1819     // checked.
1820 
1821     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1822                                copied_oop, r19_klass, count_save);
1823 
1824     __ align(CodeEntryAlignment);
1825     StubCodeMark mark(this, "StubRoutines", name);
1826     address start = __ pc();
1827 
1828     __ enter(); // required for proper stackwalking of RuntimeStub frame
1829 
1830 #ifdef ASSERT
1831     // caller guarantees that the arrays really are different
1832     // otherwise, we would have to make conjoint checks
1833     { Label L;
1834       array_overlap_test(L, TIMES_OOP);
1835       __ stop("checkcast_copy within a single array");
1836       __ bind(L);
1837     }
1838 #endif //ASSERT
1839 
1840     // Caller of this entry point must set up the argument registers.
1841     if (entry != NULL) {
1842       *entry = __ pc();
1843       BLOCK_COMMENT("Entry:");
1844     }
1845 
1846     // Empty array:  Nothing to do.
1847     __ cbz(count, L_done);
1848 
1849     __ push(RegSet::of(r18, r19, r20, r21), sp);
1850 
1851 #ifdef ASSERT
1852     BLOCK_COMMENT("assert consistent ckoff/ckval");
1853     // The ckoff and ckval must be mutually consistent,
1854     // even though caller generates both.
1855     { Label L;
1856       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1857       __ ldrw(start_to, Address(ckval, sco_offset));
1858       __ cmpw(ckoff, start_to);
1859       __ br(Assembler::EQ, L);
1860       __ stop("super_check_offset inconsistent");
1861       __ bind(L);
1862     }
1863 #endif //ASSERT
1864 
1865     gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);
1866 
1867     // save the original count
1868     __ mov(count_save, count);
1869 
1870     // Copy from low to high addresses
1871     __ mov(start_to, to);              // Save destination array start address
1872     __ b(L_load_element);
1873 
1874     // ======== begin loop ========
1875     // (Loop is rotated; its entry is L_load_element.)
1876     // Loop control:
1877     //   for (; count != 0; count--) {
1878     //     copied_oop = load_heap_oop(from++);
1879     //     ... generate_type_check ...;
1880     //     store_heap_oop(to++, copied_oop);
1881     //   }
1882     __ align(OptoLoopAlignment);
1883 
1884     __ BIND(L_store_element);
1885     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop);  // store the oop
1886     __ sub(count, count, 1);
1887     __ cbz(count, L_do_card_marks);
1888 
1889     // ======== loop entry is here ========
1890     __ BIND(L_load_element);
1891     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop
1892     __ cbz(copied_oop, L_store_element);
1893 
1894     __ load_klass(r19_klass, copied_oop);// query the object klass
1895     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1896     // ======== end loop ========
1897 
1898     // It was a real error; we must depend on the caller to finish the job.
1899     // Register count = remaining oops, count_orig = total oops.
1900     // Emit GC store barriers for the oops we have copied and report
1901     // their number to the caller.
1902 
1903     __ subs(count, count_save, count);     // K = partially copied oop count
1904     __ eon(count, count, zr);                   // report (-1^K) to caller
1905     __ br(Assembler::EQ, L_done_pop);
1906 
1907     __ BIND(L_do_card_marks);
1908     __ add(to, to, -heapOopSize);         // make an inclusive end pointer
1909     gen_write_ref_array_post_barrier(start_to, to, rscratch1);
1910 
1911     __ bind(L_done_pop);
1912     __ pop(RegSet::of(r18, r19, r20, r21), sp);
1913     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1914 
1915     __ bind(L_done);
1916     __ mov(r0, count);
1917     __ leave();
1918     __ ret(lr);
1919 
1920     return start;
1921   }
1922 
1923   // Perform range checks on the proposed arraycopy.
1924   // Kills temp, but nothing else.
1925   // Also, clean the sign bits of src_pos and dst_pos.
1926   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1927                               Register src_pos, // source position (c_rarg1)
1928                               Register dst,     // destination array oop (c_rarg2)
1929                               Register dst_pos, // destination position (c_rarg3)
1930                               Register length,
1931                               Register temp,
1932                               Label& L_failed) {
1933     BLOCK_COMMENT("arraycopy_range_checks:");
1934 
1935     assert_different_registers(rscratch1, temp);
1936 
1937     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1938     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1939     __ addw(temp, length, src_pos);
1940     __ cmpw(temp, rscratch1);
1941     __ br(Assembler::HI, L_failed);
1942 
1943     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1944     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1945     __ addw(temp, length, dst_pos);
1946     __ cmpw(temp, rscratch1);
1947     __ br(Assembler::HI, L_failed);
1948 
1949     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1950     __ movw(src_pos, src_pos);
1951     __ movw(dst_pos, dst_pos);
1952 
1953     BLOCK_COMMENT("arraycopy_range_checks done");
1954   }
1955 
1956   // This stub is currently only called from a trivial test routine;
1957   // it will be implemented properly once a real caller actually
1958   // needs it.
1959   static void fake_arraycopy_stub(address src, address dst, int count) {
1960     assert(count == 0, "huh?");
1961   }
1962 
1963 
1964   //
1965   //  Generate 'unsafe' array copy stub
1966   //  Though just as safe as the other stubs, it takes an unscaled
1967   //  size_t argument instead of an element count.
1968   //
1969   //  Input:
1970   //    c_rarg0   - source array address
1971   //    c_rarg1   - destination array address
1972   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1973   //
1974   // Examines the alignment of the operands and dispatches
1975   // to a long, int, short, or byte copy loop.
1976   //
1977   address generate_unsafe_copy(const char *name,
1978                                address byte_copy_entry,
1979                                address short_copy_entry,
1980                                address int_copy_entry,
1981                                address long_copy_entry) {
1982     Label L_long_aligned, L_int_aligned, L_short_aligned;
1983     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1984 
1985     __ align(CodeEntryAlignment);
1986     StubCodeMark mark(this, "StubRoutines", name);
1987     address start = __ pc();
1988     __ enter(); // required for proper stackwalking of RuntimeStub frame
1989 
1990     // bump this on entry, not on exit:
1991     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
1992 
1993     __ orr(rscratch1, s, d);
1994     __ orr(rscratch1, rscratch1, count);
1995 
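         // The low bits of rscratch1 = s | d | count give the largest common
         // alignment of the three values.  The dispatch below is, in effect:
         //   if ((x & 7) == 0) long copy; else if ((x & 3) == 0) int copy;
         //   else if ((x & 1) == 0) short copy; else byte copy.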
1996     __ andr(rscratch1, rscratch1, BytesPerLong-1);
1997     __ cbz(rscratch1, L_long_aligned);
1998     __ andr(rscratch1, rscratch1, BytesPerInt-1);
1999     __ cbz(rscratch1, L_int_aligned);
2000     __ tbz(rscratch1, 0, L_short_aligned);
2001     __ b(RuntimeAddress(byte_copy_entry));
2002 
2003     __ BIND(L_short_aligned);
2004     __ lsr(count, count, LogBytesPerShort);  // size => short_count
2005     __ b(RuntimeAddress(short_copy_entry));
2006     __ BIND(L_int_aligned);
2007     __ lsr(count, count, LogBytesPerInt);    // size => int_count
2008     __ b(RuntimeAddress(int_copy_entry));
2009     __ BIND(L_long_aligned);
2010     __ lsr(count, count, LogBytesPerLong);   // size => long_count
2011     __ b(RuntimeAddress(long_copy_entry));
2012 
2013     return start;
2014   }
2015 
2016   //
2017   //  Generate generic array copy stubs
2018   //
2019   //  Input:
2020   //    c_rarg0    -  src oop
2021   //    c_rarg1    -  src_pos (32-bits)
2022   //    c_rarg2    -  dst oop
2023   //    c_rarg3    -  dst_pos (32-bits)
2024   //    c_rarg4    -  element count (32-bits)
2025   //
2026   //  Output:
2027   //    r0 ==  0  -  success
2028   //    r0 == -1^K - failure, where K is partial transfer count
2029   //
2030   address generate_generic_copy(const char *name,
2031                                 address byte_copy_entry, address short_copy_entry,
2032                                 address int_copy_entry, address oop_copy_entry,
2033                                 address long_copy_entry, address checkcast_copy_entry) {
2034 
2035     Label L_failed, L_failed_0, L_objArray;
2036     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2037 
2038     // Input registers
2039     const Register src        = c_rarg0;  // source array oop
2040     const Register src_pos    = c_rarg1;  // source position
2041     const Register dst        = c_rarg2;  // destination array oop
2042     const Register dst_pos    = c_rarg3;  // destination position
2043     const Register length     = c_rarg4;
2044 
2045     StubCodeMark mark(this, "StubRoutines", name);
2046 
2047     __ align(CodeEntryAlignment);
2048     address start = __ pc();
2049 
2050     __ enter(); // required for proper stackwalking of RuntimeStub frame
2051 
2052     // bump this on entry, not on exit:
2053     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2054 
2055     //-----------------------------------------------------------------------
2056     // Assembler stub will be used for this call to arraycopy
2057     // if the following conditions are met:
2058     //
2059     // (1) src and dst must not be null.
2060     // (2) src_pos must not be negative.
2061     // (3) dst_pos must not be negative.
2062     // (4) length  must not be negative.
2063     // (5) src klass and dst klass should be the same and not NULL.
2064     // (6) src and dst should be arrays.
2065     // (7) src_pos + length must not exceed length of src.
2066     // (8) dst_pos + length must not exceed length of dst.
2067     //
2068 
2069     //  if (src == NULL) return -1;
2070     __ cbz(src, L_failed);
2071 
2072     //  if (src_pos < 0) return -1;
2073     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2074 
2075     //  if (dst == NULL) return -1;
2076     __ cbz(dst, L_failed);
2077 
2078     //  if (dst_pos < 0) return -1;
2079     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2080 
2081     // registers used as temp
2082     const Register scratch_length    = r16; // elements count to copy
2083     const Register scratch_src_klass = r17; // array klass
2084     const Register lh                = r18; // layout helper
2085 
2086     //  if (length < 0) return -1;
2087     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2088     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2089 
2090     __ load_klass(scratch_src_klass, src);
2091 #ifdef ASSERT
2092     //  assert(src->klass() != NULL);
2093     {
2094       BLOCK_COMMENT("assert klasses not null {");
2095       Label L1, L2;
2096       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2097       __ bind(L1);
2098       __ stop("broken null klass");
2099       __ bind(L2);
2100       __ load_klass(rscratch1, dst);
2101       __ cbz(rscratch1, L1);     // this would be broken also
2102       BLOCK_COMMENT("} assert klasses not null done");
2103     }
2104 #endif
2105 
2106     // Load layout helper (32-bits)
2107     //
2108     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2109     // 32        30    24            16              8     2                 0
2110     //
2111     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2112     //
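         // (Decoding sketch, mirroring the extraction code below:
         //    log2_elsize  = lh & Klass::_lh_log2_element_size_mask;
         //    array_offset = (lh >> Klass::_lh_header_size_shift)
         //                     & Klass::_lh_header_size_mask;   // header size in bytes
         //  The array_tag in the top bits makes lh negative for arrays, which is
         //  why the 'tbz(lh, 31, L_failed)' test below rejects non-arrays.)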
2113 
2114     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2115 
2116     // Handle objArrays completely differently...
2117     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2118     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2119     __ movw(rscratch1, objArray_lh);
2120     __ eorw(rscratch2, lh, rscratch1);
2121     __ cbzw(rscratch2, L_objArray);
2122 
2123     //  if (src->klass() != dst->klass()) return -1;
2124     __ load_klass(rscratch2, dst);
2125     __ eor(rscratch2, rscratch2, scratch_src_klass);
2126     __ cbnz(rscratch2, L_failed);
2127 
2128     //  if (!src->is_Array()) return -1;
2129     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2130 
2131     // At this point, it is known to be a typeArray (array_tag 0x3).
2132 #ifdef ASSERT
2133     {
2134       BLOCK_COMMENT("assert primitive array {");
2135       Label L;
2136       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2137       __ cmpw(lh, rscratch2);
2138       __ br(Assembler::GE, L);
2139       __ stop("must be a primitive array");
2140       __ bind(L);
2141       BLOCK_COMMENT("} assert primitive array done");
2142     }
2143 #endif
2144 
2145     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2146                            rscratch2, L_failed);
2147 
2148     // TypeArrayKlass
2149     //
2150     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2151     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2152     //
2153 
2154     const Register rscratch1_offset = rscratch1;    // array offset
2155     const Register r18_elsize = lh; // element size
2156 
2157     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2158            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2159     __ add(src, src, rscratch1_offset);           // src array offset
2160     __ add(dst, dst, rscratch1_offset);           // dst array offset
2161     BLOCK_COMMENT("choose copy loop based on element size");
2162 
2163     // next registers should be set before the jump to corresponding stub
2164     const Register from     = c_rarg0;  // source array address
2165     const Register to       = c_rarg1;  // destination array address
2166     const Register count    = c_rarg2;  // elements count
2167 
2168     // 'from', 'to' and 'count' must be set in this order, since they alias
2169     // 'src', 'src_pos' and 'dst': each input is read before its register is overwritten.
2170 
2171     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2172 
2173     // The possible values of elsize are 0-3, i.e. exact_log2(element
2174     // size in bytes).  We do a simple bitwise binary search.
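         // Dispatch table for the branches below:
         //   bit1 bit0 | elsize | element | stub
         //    0    0   |   0    | byte    | byte_copy_entry
         //    0    1   |   1    | short   | short_copy_entry
         //    1    0   |   2    | int     | int_copy_entry
         //    1    1   |   3    | long    | long_copy_entry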
2175   __ BIND(L_copy_bytes);
2176     __ tbnz(r18_elsize, 1, L_copy_ints);
2177     __ tbnz(r18_elsize, 0, L_copy_shorts);
2178     __ lea(from, Address(src, src_pos));// src_addr
2179     __ lea(to,   Address(dst, dst_pos));// dst_addr
2180     __ movw(count, scratch_length); // length
2181     __ b(RuntimeAddress(byte_copy_entry));
2182 
2183   __ BIND(L_copy_shorts);
2184     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2185     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2186     __ movw(count, scratch_length); // length
2187     __ b(RuntimeAddress(short_copy_entry));
2188 
2189   __ BIND(L_copy_ints);
2190     __ tbnz(r18_elsize, 0, L_copy_longs);
2191     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2192     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2193     __ movw(count, scratch_length); // length
2194     __ b(RuntimeAddress(int_copy_entry));
2195 
2196   __ BIND(L_copy_longs);
2197 #ifdef ASSERT
2198     {
2199       BLOCK_COMMENT("assert long copy {");
2200       Label L;
2201       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
2202       __ cmpw(r18_elsize, LogBytesPerLong);
2203       __ br(Assembler::EQ, L);
2204       __ stop("must be long copy, but elsize is wrong");
2205       __ bind(L);
2206       BLOCK_COMMENT("} assert long copy done");
2207     }
2208 #endif
2209     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2210     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2211     __ movw(count, scratch_length); // length
2212     __ b(RuntimeAddress(long_copy_entry));
2213 
2214     // ObjArrayKlass
2215   __ BIND(L_objArray);
2216     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2217 
2218     Label L_plain_copy, L_checkcast_copy;
2219     //  test array classes for subtyping
2220     __ load_klass(r18, dst);
2221     __ cmp(scratch_src_klass, r18); // usual case is exact equality
2222     __ br(Assembler::NE, L_checkcast_copy);
2223 
2224     // Identically typed arrays can be copied without element-wise checks.
2225     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2226                            rscratch2, L_failed);
2227 
2228     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2229     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2230     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2231     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2232     __ movw(count, scratch_length); // length
2233   __ BIND(L_plain_copy);
2234     __ b(RuntimeAddress(oop_copy_entry));
2235 
2236   __ BIND(L_checkcast_copy);
2237     // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
2238     {
2239       // Before looking at dst.length, make sure dst is also an objArray.
2240       __ ldrw(rscratch1, Address(r18, lh_offset));
2241       __ movw(rscratch2, objArray_lh);
2242       __ eorw(rscratch1, rscratch1, rscratch2);
2243       __ cbnzw(rscratch1, L_failed);
2244 
2245       // It is safe to examine both src.length and dst.length.
2246       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2247                              r18, L_failed);
2248 
2249       const Register rscratch2_dst_klass = rscratch2;
2250       __ load_klass(rscratch2_dst_klass, dst); // reload
2251 
2252       // Marshal the base address arguments now, freeing registers.
2253       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2254       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2255       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2256       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2257       __ movw(count, length);           // length (reloaded)
2258       Register sco_temp = c_rarg3;      // this register is free now
2259       assert_different_registers(from, to, count, sco_temp,
2260                                  rscratch2_dst_klass, scratch_src_klass);
2261       // assert_clean_int(count, sco_temp);
2262 
2263       // Generate the type check.
2264       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2265       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2266       // assert_clean_int(sco_temp, r18);
2267       generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy);
2268 
2269       // Fetch destination element klass from the ObjArrayKlass header.
2270       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2271       __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset));
2272       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2273 
2274       // the checkcast_copy loop needs two extra arguments:
2275       assert(c_rarg3 == sco_temp, "#3 already in place");
2276       // Set up arguments for checkcast_copy_entry.
2277       __ mov(c_rarg4, rscratch2_dst_klass);  // dst.klass.element_klass
2278       __ b(RuntimeAddress(checkcast_copy_entry));
2279     }
2280 
2281   __ BIND(L_failed);
2282     __ mov(r0, -1);
2283     __ leave();   // required for proper stackwalking of RuntimeStub frame
2284     __ ret(lr);
2285 
2286     return start;
2287   }
2288 
2289   //
2290   // Generate stub for array fill. If "aligned" is true, the
2291   // "to" address is assumed to be heapword aligned.
2292   //
2293   // Arguments for generated stub:
2294   //   to:    c_rarg0
2295   //   value: c_rarg1
2296   //   count: c_rarg2 treated as signed
2297   //
2298   address generate_fill(BasicType t, bool aligned, const char *name) {
2299     __ align(CodeEntryAlignment);
2300     StubCodeMark mark(this, "StubRoutines", name);
2301     address start = __ pc();
2302 
2303     BLOCK_COMMENT("Entry:");
2304 
2305     const Register to        = c_rarg0;  // destination array address
2306     const Register value     = c_rarg1;  // value
2307     const Register count     = c_rarg2;  // elements count
2308 
2309     const Register bz_base = r10;        // base for block_zero routine
2310     const Register cnt_words = r11;      // temp register
2311 
2312     __ enter();
2313 
2314     Label L_fill_elements, L_exit1;
2315 
2316     int shift = -1;
2317     switch (t) {
2318       case T_BYTE:
2319         shift = 0;
2320         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2321         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2322         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2323         __ br(Assembler::LO, L_fill_elements);
2324         break;
2325       case T_SHORT:
2326         shift = 1;
2327         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2328         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2329         __ br(Assembler::LO, L_fill_elements);
2330         break;
2331       case T_INT:
2332         shift = 2;
2333         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2334         __ br(Assembler::LO, L_fill_elements);
2335         break;
2336       default: ShouldNotReachHere();
2337     }
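         // At this point 'value' holds the fill pattern replicated to 32 bits
         // (e.g. a byte value of 0xAB has become 0xABABABAB); it is widened to
         // 64 bits with another bfi before the bulk word fill below.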
2338 
2339     // Align source address at 8 bytes address boundary.
2340     Label L_skip_align1, L_skip_align2, L_skip_align4;
2341     if (!aligned) {
2342       switch (t) {
2343         case T_BYTE:
2344           // One byte misalignment happens only for byte arrays.
2345           __ tbz(to, 0, L_skip_align1);
2346           __ strb(value, Address(__ post(to, 1)));
2347           __ subw(count, count, 1);
2348           __ bind(L_skip_align1);
2349           // Fallthrough
2350         case T_SHORT:
2351           // Two bytes misalignment happens only for byte and short (char) arrays.
2352           __ tbz(to, 1, L_skip_align2);
2353           __ strh(value, Address(__ post(to, 2)));
2354           __ subw(count, count, 2 >> shift);
2355           __ bind(L_skip_align2);
2356           // Fallthrough
2357         case T_INT:
2358           // Align to 8 bytes, we know we are 4 byte aligned to start.
2359           __ tbz(to, 2, L_skip_align4);
2360           __ strw(value, Address(__ post(to, 4)));
2361           __ subw(count, count, 4 >> shift);
2362           __ bind(L_skip_align4);
2363           break;
2364         default: ShouldNotReachHere();
2365       }
2366     }
2367 
2368     //
2369     //  Fill large chunks
2370     //
2371     __ lsrw(cnt_words, count, 3 - shift); // number of words
2372     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2373     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
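         // (Worked example: a T_SHORT fill of count == 13 elements has shift == 1,
         //  so cnt_words = 13 >> 2 = 3 eight-byte words (12 elements) are filled
         //  in bulk and count becomes 13 - (3 << 2) = 1 element for the tail.)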
2374     if (UseBlockZeroing) {
2375       Label non_block_zeroing, rest;
2376       // If the fill value is zero we can use the fast zero_words().
2377       __ cbnz(value, non_block_zeroing);
2378       __ mov(bz_base, to);
2379       __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2380       __ zero_words(bz_base, cnt_words);
2381       __ b(rest);
2382       __ bind(non_block_zeroing);
2383       __ fill_words(to, cnt_words, value);
2384       __ bind(rest);
2385     } else {
2386       __ fill_words(to, cnt_words, value);
2387     }
2388 
2389     // Remaining count is less than 8 bytes. Fill it by a single store.
2390     // Note that the total length is no less than 8 bytes.
2391     if (t == T_BYTE || t == T_SHORT) {
2392       Label L_exit1;
2393       __ cbzw(count, L_exit1);
2394       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2395       __ str(value, Address(to, -8));    // overwrite some elements
2396       __ bind(L_exit1);
2397       __ leave();
2398       __ ret(lr);
2399     }
2400 
2401     // Handle copies less than 8 bytes.
2402     Label L_fill_2, L_fill_4, L_exit2;
2403     __ bind(L_fill_elements);
2404     switch (t) {
2405       case T_BYTE:
2406         __ tbz(count, 0, L_fill_2);
2407         __ strb(value, Address(__ post(to, 1)));
2408         __ bind(L_fill_2);
2409         __ tbz(count, 1, L_fill_4);
2410         __ strh(value, Address(__ post(to, 2)));
2411         __ bind(L_fill_4);
2412         __ tbz(count, 2, L_exit2);
2413         __ strw(value, Address(to));
2414         break;
2415       case T_SHORT:
2416         __ tbz(count, 0, L_fill_4);
2417         __ strh(value, Address(__ post(to, 2)));
2418         __ bind(L_fill_4);
2419         __ tbz(count, 1, L_exit2);
2420         __ strw(value, Address(to));
2421         break;
2422       case T_INT:
2423         __ cbzw(count, L_exit2);
2424         __ strw(value, Address(to));
2425         break;
2426       default: ShouldNotReachHere();
2427     }
2428     __ bind(L_exit2);
2429     __ leave();
2430     __ ret(lr);
2431     return start;
2432   }
2433 
2434   void generate_arraycopy_stubs() {
2435     address entry;
2436     address entry_jbyte_arraycopy;
2437     address entry_jshort_arraycopy;
2438     address entry_jint_arraycopy;
2439     address entry_oop_arraycopy;
2440     address entry_jlong_arraycopy;
2441     address entry_checkcast_arraycopy;
2442 
2443     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2444     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2445 
2446     StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();
2447 
2448     //*** jbyte
2449     // Always need aligned and unaligned versions
2450     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2451                                                                                   "jbyte_disjoint_arraycopy");
2452     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2453                                                                                   &entry_jbyte_arraycopy,
2454                                                                                   "jbyte_arraycopy");
2455     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2456                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2457     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2458                                                                                   "arrayof_jbyte_arraycopy");
2459 
2460     //*** jshort
2461     // Always need aligned and unaligned versions
2462     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2463                                                                                     "jshort_disjoint_arraycopy");
2464     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2465                                                                                     &entry_jshort_arraycopy,
2466                                                                                     "jshort_arraycopy");
2467     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2468                                                                                     "arrayof_jshort_disjoint_arraycopy");
2469     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2470                                                                                     "arrayof_jshort_arraycopy");
2471 
2472     //*** jint
2473     // Aligned versions
2474     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2475                                                                                 "arrayof_jint_disjoint_arraycopy");
2476     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2477                                                                                 "arrayof_jint_arraycopy");
2478     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2479     // entry_jint_arraycopy always points to the unaligned version
2480     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2481                                                                                 "jint_disjoint_arraycopy");
2482     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2483                                                                                 &entry_jint_arraycopy,
2484                                                                                 "jint_arraycopy");
2485 
2486     //*** jlong
2487     // It is always aligned
2488     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2489                                                                                   "arrayof_jlong_disjoint_arraycopy");
2490     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2491                                                                                   "arrayof_jlong_arraycopy");
2492     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2493     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2494 
2495     //*** oops
2496     {
2497       // With compressed oops we need unaligned versions; notice that
2498       // we overwrite entry_oop_arraycopy.
2499       bool aligned = !UseCompressedOops;
2500 
2501       StubRoutines::_arrayof_oop_disjoint_arraycopy
2502         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2503                                      /*dest_uninitialized*/false);
2504       StubRoutines::_arrayof_oop_arraycopy
2505         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2506                                      /*dest_uninitialized*/false);
2507       // Aligned versions without pre-barriers
2508       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2509         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2510                                      /*dest_uninitialized*/true);
2511       StubRoutines::_arrayof_oop_arraycopy_uninit
2512         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2513                                      /*dest_uninitialized*/true);
2514     }
2515 
2516     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2517     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2518     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2519     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2520 
2521     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2522     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2523                                                                         /*dest_uninitialized*/true);
2524 
2525     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2526                                                               entry_jbyte_arraycopy,
2527                                                               entry_jshort_arraycopy,
2528                                                               entry_jint_arraycopy,
2529                                                               entry_jlong_arraycopy);
2530 
2531     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2532                                                                entry_jbyte_arraycopy,
2533                                                                entry_jshort_arraycopy,
2534                                                                entry_jint_arraycopy,
2535                                                                entry_oop_arraycopy,
2536                                                                entry_jlong_arraycopy,
2537                                                                entry_checkcast_arraycopy);
2538 
2539     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2540     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2541     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2542     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2543     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2544     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2545   }
2546 
2547   void generate_math_stubs() { Unimplemented(); }
2548 
2549   // Arguments:
2550   //
2551   // Inputs:
2552   //   c_rarg0   - source byte array address
2553   //   c_rarg1   - destination byte array address
2554   //   c_rarg2   - K (key) in little endian int array
2555   //
2556   address generate_aescrypt_encryptBlock() {
2557     __ align(CodeEntryAlignment);
2558     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2559 
2560     Label L_doLast;
2561 
2562     const Register from        = c_rarg0;  // source array address
2563     const Register to          = c_rarg1;  // destination array address
2564     const Register key         = c_rarg2;  // key array address
2565     const Register keylen      = rscratch1;
2566 
2567     address start = __ pc();
2568     __ enter();
2569 
2570     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
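         // keylen is the expanded key length in ints: 44, 52 or 60 for 128-, 192-
         // and 256-bit AES keys respectively, hence the comparisons against 44 and
         // 52 below that skip the extra rounds for shorter keys.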
2571 
2572     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2573 
2574     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2575     __ rev32(v1, __ T16B, v1);
2576     __ rev32(v2, __ T16B, v2);
2577     __ rev32(v3, __ T16B, v3);
2578     __ rev32(v4, __ T16B, v4);
2579     __ aese(v0, v1);
2580     __ aesmc(v0, v0);
2581     __ aese(v0, v2);
2582     __ aesmc(v0, v0);
2583     __ aese(v0, v3);
2584     __ aesmc(v0, v0);
2585     __ aese(v0, v4);
2586     __ aesmc(v0, v0);
2587 
2588     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2589     __ rev32(v1, __ T16B, v1);
2590     __ rev32(v2, __ T16B, v2);
2591     __ rev32(v3, __ T16B, v3);
2592     __ rev32(v4, __ T16B, v4);
2593     __ aese(v0, v1);
2594     __ aesmc(v0, v0);
2595     __ aese(v0, v2);
2596     __ aesmc(v0, v0);
2597     __ aese(v0, v3);
2598     __ aesmc(v0, v0);
2599     __ aese(v0, v4);
2600     __ aesmc(v0, v0);
2601 
2602     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2603     __ rev32(v1, __ T16B, v1);
2604     __ rev32(v2, __ T16B, v2);
2605 
2606     __ cmpw(keylen, 44);
2607     __ br(Assembler::EQ, L_doLast);
2608 
2609     __ aese(v0, v1);
2610     __ aesmc(v0, v0);
2611     __ aese(v0, v2);
2612     __ aesmc(v0, v0);
2613 
2614     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2615     __ rev32(v1, __ T16B, v1);
2616     __ rev32(v2, __ T16B, v2);
2617 
2618     __ cmpw(keylen, 52);
2619     __ br(Assembler::EQ, L_doLast);
2620 
2621     __ aese(v0, v1);
2622     __ aesmc(v0, v0);
2623     __ aese(v0, v2);
2624     __ aesmc(v0, v0);
2625 
2626     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2627     __ rev32(v1, __ T16B, v1);
2628     __ rev32(v2, __ T16B, v2);
2629 
2630     __ BIND(L_doLast);
2631 
2632     __ aese(v0, v1);
2633     __ aesmc(v0, v0);
2634     __ aese(v0, v2);
2635 
2636     __ ld1(v1, __ T16B, key);
2637     __ rev32(v1, __ T16B, v1);
2638     __ eor(v0, __ T16B, v0, v1);
2639 
2640     __ st1(v0, __ T16B, to);
2641 
2642     __ mov(r0, 0);
2643 
2644     __ leave();
2645     __ ret(lr);
2646 
2647     return start;
2648   }
2649 
2650   // Arguments:
2651   //
2652   // Inputs:
2653   //   c_rarg0   - source byte array address
2654   //   c_rarg1   - destination byte array address
2655   //   c_rarg2   - K (key) in little endian int array
2656   //
2657   address generate_aescrypt_decryptBlock() {
2658     assert(UseAES, "need AES instruction support");
2659     __ align(CodeEntryAlignment);
2660     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2661     Label L_doLast;
2662 
2663     const Register from        = c_rarg0;  // source array address
2664     const Register to          = c_rarg1;  // destination array address
2665     const Register key         = c_rarg2;  // key array address
2666     const Register keylen      = rscratch1;
2667 
2668     address start = __ pc();
2669     __ enter(); // required for proper stackwalking of RuntimeStub frame
2670 
2671     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2672 
2673     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2674 
2675     __ ld1(v5, __ T16B, __ post(key, 16));
2676     __ rev32(v5, __ T16B, v5);
2677 
2678     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2679     __ rev32(v1, __ T16B, v1);
2680     __ rev32(v2, __ T16B, v2);
2681     __ rev32(v3, __ T16B, v3);
2682     __ rev32(v4, __ T16B, v4);
2683     __ aesd(v0, v1);
2684     __ aesimc(v0, v0);
2685     __ aesd(v0, v2);
2686     __ aesimc(v0, v0);
2687     __ aesd(v0, v3);
2688     __ aesimc(v0, v0);
2689     __ aesd(v0, v4);
2690     __ aesimc(v0, v0);
2691 
2692     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2693     __ rev32(v1, __ T16B, v1);
2694     __ rev32(v2, __ T16B, v2);
2695     __ rev32(v3, __ T16B, v3);
2696     __ rev32(v4, __ T16B, v4);
2697     __ aesd(v0, v1);
2698     __ aesimc(v0, v0);
2699     __ aesd(v0, v2);
2700     __ aesimc(v0, v0);
2701     __ aesd(v0, v3);
2702     __ aesimc(v0, v0);
2703     __ aesd(v0, v4);
2704     __ aesimc(v0, v0);
2705 
2706     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2707     __ rev32(v1, __ T16B, v1);
2708     __ rev32(v2, __ T16B, v2);
2709 
2710     __ cmpw(keylen, 44);
2711     __ br(Assembler::EQ, L_doLast);
2712 
2713     __ aesd(v0, v1);
2714     __ aesimc(v0, v0);
2715     __ aesd(v0, v2);
2716     __ aesimc(v0, v0);
2717 
2718     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2719     __ rev32(v1, __ T16B, v1);
2720     __ rev32(v2, __ T16B, v2);
2721 
2722     __ cmpw(keylen, 52);
2723     __ br(Assembler::EQ, L_doLast);
2724 
2725     __ aesd(v0, v1);
2726     __ aesimc(v0, v0);
2727     __ aesd(v0, v2);
2728     __ aesimc(v0, v0);
2729 
2730     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2731     __ rev32(v1, __ T16B, v1);
2732     __ rev32(v2, __ T16B, v2);
2733 
2734     __ BIND(L_doLast);
2735 
2736     __ aesd(v0, v1);
2737     __ aesimc(v0, v0);
2738     __ aesd(v0, v2);
2739 
2740     __ eor(v0, __ T16B, v0, v5);
2741 
2742     __ st1(v0, __ T16B, to);
2743 
2744     __ mov(r0, 0);
2745 
2746     __ leave();
2747     __ ret(lr);
2748 
2749     return start;
2750   }
2751 
2752   // Arguments:
2753   //
2754   // Inputs:
2755   //   c_rarg0   - source byte array address
2756   //   c_rarg1   - destination byte array address
2757   //   c_rarg2   - K (key) in little endian int array
2758   //   c_rarg3   - r vector byte array address
2759   //   c_rarg4   - input length
2760   //
2761   // Output:
2762   //   x0        - input length
2763   //
2764   address generate_cipherBlockChaining_encryptAESCrypt() {
2765     assert(UseAES, "need AES instruction support");
2766     __ align(CodeEntryAlignment);
2767     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2768 
2769     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2770 
2771     const Register from        = c_rarg0;  // source array address
2772     const Register to          = c_rarg1;  // destination array address
2773     const Register key         = c_rarg2;  // key array address
2774     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
2775                                            // and left with the results of the last encryption block
2776     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2777     const Register keylen      = rscratch1;
2778 
2779     address start = __ pc();
2780 
2781       __ enter();
2782 
2783       __ movw(rscratch2, len_reg);
2784 
2785       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2786 
2787       __ ld1(v0, __ T16B, rvec);
2788 
2789       __ cmpw(keylen, 52);
2790       __ br(Assembler::CC, L_loadkeys_44);
2791       __ br(Assembler::EQ, L_loadkeys_52);
2792 
2793       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2794       __ rev32(v17, __ T16B, v17);
2795       __ rev32(v18, __ T16B, v18);
2796     __ BIND(L_loadkeys_52);
2797       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2798       __ rev32(v19, __ T16B, v19);
2799       __ rev32(v20, __ T16B, v20);
2800     __ BIND(L_loadkeys_44);
2801       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2802       __ rev32(v21, __ T16B, v21);
2803       __ rev32(v22, __ T16B, v22);
2804       __ rev32(v23, __ T16B, v23);
2805       __ rev32(v24, __ T16B, v24);
2806       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2807       __ rev32(v25, __ T16B, v25);
2808       __ rev32(v26, __ T16B, v26);
2809       __ rev32(v27, __ T16B, v27);
2810       __ rev32(v28, __ T16B, v28);
2811       __ ld1(v29, v30, v31, __ T16B, key);
2812       __ rev32(v29, __ T16B, v29);
2813       __ rev32(v30, __ T16B, v30);
2814       __ rev32(v31, __ T16B, v31);
2815 
2816     __ BIND(L_aes_loop);
2817       __ ld1(v1, __ T16B, __ post(from, 16));
2818       __ eor(v0, __ T16B, v0, v1);
2819 
2820       __ br(Assembler::CC, L_rounds_44);
2821       __ br(Assembler::EQ, L_rounds_52);
2822 
2823       __ aese(v0, v17); __ aesmc(v0, v0);
2824       __ aese(v0, v18); __ aesmc(v0, v0);
2825     __ BIND(L_rounds_52);
2826       __ aese(v0, v19); __ aesmc(v0, v0);
2827       __ aese(v0, v20); __ aesmc(v0, v0);
2828     __ BIND(L_rounds_44);
2829       __ aese(v0, v21); __ aesmc(v0, v0);
2830       __ aese(v0, v22); __ aesmc(v0, v0);
2831       __ aese(v0, v23); __ aesmc(v0, v0);
2832       __ aese(v0, v24); __ aesmc(v0, v0);
2833       __ aese(v0, v25); __ aesmc(v0, v0);
2834       __ aese(v0, v26); __ aesmc(v0, v0);
2835       __ aese(v0, v27); __ aesmc(v0, v0);
2836       __ aese(v0, v28); __ aesmc(v0, v0);
2837       __ aese(v0, v29); __ aesmc(v0, v0);
2838       __ aese(v0, v30);
2839       __ eor(v0, __ T16B, v0, v31);
2840 
2841       __ st1(v0, __ T16B, __ post(to, 16));
2842 
2843       __ subw(len_reg, len_reg, 16);
2844       __ cbnzw(len_reg, L_aes_loop);
2845 
2846       __ st1(v0, __ T16B, rvec);
2847 
2848       __ mov(r0, rscratch2);
2849 
2850       __ leave();
2851       __ ret(lr);
2852 
2853       return start;
2854   }
2855 
2856   // Arguments:
2857   //
2858   // Inputs:
2859   //   c_rarg0   - source byte array address
2860   //   c_rarg1   - destination byte array address
2861   //   c_rarg2   - K (key) in little endian int array
2862   //   c_rarg3   - r vector byte array address
2863   //   c_rarg4   - input length
2864   //
2865   // Output:
2866   //   r0        - input length
2867   //
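       //
       // Roughly what this stub computes, as a C-style sketch (helper names
       // are illustrative only):
       //
       //   v = load16(rvec);                                  // previous ciphertext (or IV)
       //   for (i = 0; i < len; i += 16) {
       //     c = load16(from + i);
       //     store16(to + i, AES_decrypt_block(c, key) ^ v);  // CBC: decrypt, then xor
       //     v = c;                                           // chain on the ciphertext
       //   }
       //   store16(rvec, v);
       //   return len;
       //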
2868   address generate_cipherBlockChaining_decryptAESCrypt() {
2869     assert(UseAES, "need AES instructions support");
2870     __ align(CodeEntryAlignment);
2871     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2872 
2873     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2874 
2875     const Register from        = c_rarg0;  // source array address
2876     const Register to          = c_rarg1;  // destination array address
2877     const Register key         = c_rarg2;  // key array address
2878     const Register rvec        = c_rarg3;  // r byte array, initialized from the initialization vector;
2879                                            // on exit it holds the last input (ciphertext) block
2880     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2881     const Register keylen      = rscratch1;
2882 
2883     address start = __ pc();
2884 
2885       __ enter();
2886 
2887       __ movw(rscratch2, len_reg);
2888 
2889       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2890 
2891       __ ld1(v2, __ T16B, rvec);
2892 
2893       __ ld1(v31, __ T16B, __ post(key, 16));
2894       __ rev32(v31, __ T16B, v31);
2895 
2896       __ cmpw(keylen, 52);
2897       __ br(Assembler::CC, L_loadkeys_44);
2898       __ br(Assembler::EQ, L_loadkeys_52);
2899 
2900       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2901       __ rev32(v17, __ T16B, v17);
2902       __ rev32(v18, __ T16B, v18);
2903     __ BIND(L_loadkeys_52);
2904       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2905       __ rev32(v19, __ T16B, v19);
2906       __ rev32(v20, __ T16B, v20);
2907     __ BIND(L_loadkeys_44);
2908       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2909       __ rev32(v21, __ T16B, v21);
2910       __ rev32(v22, __ T16B, v22);
2911       __ rev32(v23, __ T16B, v23);
2912       __ rev32(v24, __ T16B, v24);
2913       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2914       __ rev32(v25, __ T16B, v25);
2915       __ rev32(v26, __ T16B, v26);
2916       __ rev32(v27, __ T16B, v27);
2917       __ rev32(v28, __ T16B, v28);
2918       __ ld1(v29, v30, __ T16B, key);
2919       __ rev32(v29, __ T16B, v29);
2920       __ rev32(v30, __ T16B, v30);
2921 
2922     __ BIND(L_aes_loop);
2923       __ ld1(v0, __ T16B, __ post(from, 16));
2924       __ orr(v1, __ T16B, v0, v0);
2925 
2926       __ br(Assembler::CC, L_rounds_44);
2927       __ br(Assembler::EQ, L_rounds_52);
2928 
2929       __ aesd(v0, v17); __ aesimc(v0, v0);
2930       __ aesd(v0, v18); __ aesimc(v0, v0);
2931     __ BIND(L_rounds_52);
2932       __ aesd(v0, v19); __ aesimc(v0, v0);
2933       __ aesd(v0, v20); __ aesimc(v0, v0);
2934     __ BIND(L_rounds_44);
2935       __ aesd(v0, v21); __ aesimc(v0, v0);
2936       __ aesd(v0, v22); __ aesimc(v0, v0);
2937       __ aesd(v0, v23); __ aesimc(v0, v0);
2938       __ aesd(v0, v24); __ aesimc(v0, v0);
2939       __ aesd(v0, v25); __ aesimc(v0, v0);
2940       __ aesd(v0, v26); __ aesimc(v0, v0);
2941       __ aesd(v0, v27); __ aesimc(v0, v0);
2942       __ aesd(v0, v28); __ aesimc(v0, v0);
2943       __ aesd(v0, v29); __ aesimc(v0, v0);
2944       __ aesd(v0, v30);
2945       __ eor(v0, __ T16B, v0, v31);
2946       __ eor(v0, __ T16B, v0, v2);
2947 
2948       __ st1(v0, __ T16B, __ post(to, 16));
2949       __ orr(v2, __ T16B, v1, v1);
2950 
2951       __ subw(len_reg, len_reg, 16);
2952       __ cbnzw(len_reg, L_aes_loop);
2953 
2954       __ st1(v2, __ T16B, rvec);
2955 
2956       __ mov(r0, rscratch2);
2957 
2958       __ leave();
2959       __ ret(lr);
2960 
2961     return start;
2962   }
2963 
2964   // Arguments:
2965   //
2966   // Inputs:
2967   //   c_rarg0   - byte[]  source+offset
2968   //   c_rarg1   - int[]   SHA.state
2969   //   c_rarg2   - int     offset
2970   //   c_rarg3   - int     limit
2971   //
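       //
       // A rough sketch of the control flow (sha1_compress is an illustrative
       // name, not a helper in this file):
       //
       //   do {
       //     sha1_compress(state, buf);             // one 64-byte block
       //     buf += 64; ofs += 64;
       //   } while (multi_block && ofs <= limit);
       //   if (multi_block) return ofs;             // updated offset
       //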
2972   address generate_sha1_implCompress(bool multi_block, const char *name) {
2973     __ align(CodeEntryAlignment);
2974     StubCodeMark mark(this, "StubRoutines", name);
2975     address start = __ pc();
2976 
2977     Register buf   = c_rarg0;
2978     Register state = c_rarg1;
2979     Register ofs   = c_rarg2;
2980     Register limit = c_rarg3;
2981 
2982     Label keys;
2983     Label sha1_loop;
2984 
2985     // load the keys (the four SHA-1 round constants) into v0..v3
2986     __ adr(rscratch1, keys);
2987     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
2988     // load the 5-word state into v6, v7
2989     __ ldrq(v6, Address(state, 0));
2990     __ ldrs(v7, Address(state, 16));
2991 
2992 
2993     __ BIND(sha1_loop);
2994     // load 64 bytes of data into v16..v19
2995     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
2996     __ rev32(v16, __ T16B, v16);
2997     __ rev32(v17, __ T16B, v17);
2998     __ rev32(v18, __ T16B, v18);
2999     __ rev32(v19, __ T16B, v19);
3000 
3001     // do the sha1
3002     __ addv(v4, __ T4S, v16, v0);
3003     __ orr(v20, __ T16B, v6, v6);
3004 
3005     FloatRegister d0 = v16;
3006     FloatRegister d1 = v17;
3007     FloatRegister d2 = v18;
3008     FloatRegister d3 = v19;
3009 
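         // Each iteration of the loop below retires four of SHA-1's 80 steps:
         // sha1c covers steps 0..19 (round < 5), sha1p covers 20..39 and
         // 60..79, and sha1m covers 40..59.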
3010     for (int round = 0; round < 20; round++) {
3011       FloatRegister tmp1 = (round & 1) ? v4 : v5;
3012       FloatRegister tmp2 = (round & 1) ? v21 : v22;
3013       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
3014       FloatRegister tmp4 = (round & 1) ? v5 : v4;
3015       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
3016 
3017       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
3018       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
3019       __ sha1h(tmp2, __ T4S, v20);
3020       if (round < 5)
3021         __ sha1c(v20, __ T4S, tmp3, tmp4);
3022       else if (round < 10 || round >= 15)
3023         __ sha1p(v20, __ T4S, tmp3, tmp4);
3024       else
3025         __ sha1m(v20, __ T4S, tmp3, tmp4);
3026       if (round < 16) __ sha1su1(d0, __ T4S, d3);
3027 
3028       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3029     }
3030 
3031     __ addv(v7, __ T2S, v7, v21);
3032     __ addv(v6, __ T4S, v6, v20);
3033 
3034     if (multi_block) {
3035       __ add(ofs, ofs, 64);
3036       __ cmp(ofs, limit);
3037       __ br(Assembler::LE, sha1_loop);
3038       __ mov(c_rarg0, ofs); // return ofs
3039     }
3040 
3041     __ strq(v6, Address(state, 0));
3042     __ strs(v7, Address(state, 16));
3043 
3044     __ ret(lr);
3045 
3046     __ bind(keys);
3047     __ emit_int32(0x5a827999);
3048     __ emit_int32(0x6ed9eba1);
3049     __ emit_int32(0x8f1bbcdc);
3050     __ emit_int32(0xca62c1d6);
3051 
3052     return start;
3053   }
3054 
3055 
3056   // Arguments:
3057   //
3058   // Inputs:
3059   //   c_rarg0   - byte[]  source+offset
3060   //   c_rarg1   - int[]   SHA.state
3061   //   c_rarg2   - int     offset
3062   //   c_rarg3   - int     limit
3063   //
3064   address generate_sha256_implCompress(bool multi_block, const char *name) {
3065     static const uint32_t round_consts[64] = {
3066       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3067       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3068       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3069       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3070       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3071       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3072       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3073       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3074       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3075       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3076       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3077       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3078       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3079       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3080       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3081       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3082     };
3083     __ align(CodeEntryAlignment);
3084     StubCodeMark mark(this, "StubRoutines", name);
3085     address start = __ pc();
3086 
3087     Register buf   = c_rarg0;
3088     Register state = c_rarg1;
3089     Register ofs   = c_rarg2;
3090     Register limit = c_rarg3;
3091 
3092     Label sha1_loop;
3093 
3094     __ stpd(v8, v9, __ pre(sp, -32));
3095     __ stpd(v10, v11, Address(sp, 16));
3096 
3097 // dga == v0
3098 // dgb == v1
3099 // dg0 == v2
3100 // dg1 == v3
3101 // dg2 == v4
3102 // t0 == v6
3103 // t1 == v7
3104 
3105     // load the 64 round constants (16 vectors of four) into v16..v31
3106     __ lea(rscratch1, ExternalAddress((address)round_consts));
3107     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3108     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3109     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3110     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3111 
3112     // load 8 words (256 bits) state
3113     __ ldpq(v0, v1, state);
3114 
3115     __ BIND(sha1_loop);
3116     // load 64 bytes of data into v8..v11
3117     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3118     __ rev32(v8, __ T16B, v8);
3119     __ rev32(v9, __ T16B, v9);
3120     __ rev32(v10, __ T16B, v10);
3121     __ rev32(v11, __ T16B, v11);
3122 
3123     __ addv(v6, __ T4S, v8, v16);
3124     __ orr(v2, __ T16B, v0, v0);
3125     __ orr(v3, __ T16B, v1, v1);
3126 
3127     FloatRegister d0 = v8;
3128     FloatRegister d1 = v9;
3129     FloatRegister d2 = v10;
3130     FloatRegister d3 = v11;
3131 
3132 
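         // Each iteration of this loop retires four of SHA-256's 64 rounds.
         // The w+K value for the next group of four rounds is computed one
         // iteration ahead (hence as_FloatRegister(round + 17), i.e. v17..v31),
         // while tmp2 still holds the value prepared in the previous iteration.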
3133     for (int round = 0; round < 16; round++) {
3134       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3135       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3136       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3137       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3138 
3139       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3140        __ orr(v4, __ T16B, v2, v2);
3141       if (round < 15)
3142         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3143       __ sha256h(v2, __ T4S, v3, tmp2);
3144       __ sha256h2(v3, __ T4S, v4, tmp2);
3145       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3146 
3147       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3148     }
3149 
3150     __ addv(v0, __ T4S, v0, v2);
3151     __ addv(v1, __ T4S, v1, v3);
3152 
3153     if (multi_block) {
3154       __ add(ofs, ofs, 64);
3155       __ cmp(ofs, limit);
3156       __ br(Assembler::LE, sha1_loop);
3157       __ mov(c_rarg0, ofs); // return ofs
3158     }
3159 
3160     __ ldpd(v10, v11, Address(sp, 16));
3161     __ ldpd(v8, v9, __ post(sp, 32));
3162 
3163     __ stpq(v0, v1, state);
3164 
3165     __ ret(lr);
3166 
3167     return start;
3168   }
3169 
3170 #ifndef BUILTIN_SIM
3171   // Safefetch stubs.
3172   void generate_safefetch(const char* name, int size, address* entry,
3173                           address* fault_pc, address* continuation_pc) {
3174     // safefetch signatures:
3175     //   int      SafeFetch32(int*      adr, int      errValue);
3176     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3177     //
3178     // arguments:
3179     //   c_rarg0 = adr
3180     //   c_rarg1 = errValue
3181     //
3182     // result:
3183     //   r0       = *adr or errValue
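         //
         // Conceptually (can_read() is illustrative; the real mechanism is the
         // fault_pc/continuation_pc pair resolved by the signal handler):
         //
         //   int SafeFetch32(int* adr, int errValue) {
         //     return can_read(adr) ? *adr : errValue;
         //   }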
3184 
3185     StubCodeMark mark(this, "StubRoutines", name);
3186 
3187     // Entry point, pc or function descriptor.
3188     *entry = __ pc();
3189 
3190     // Load *adr into c_rarg1, may fault.
3191     *fault_pc = __ pc();
3192     switch (size) {
3193       case 4:
3194         // int32_t
3195         __ ldrw(c_rarg1, Address(c_rarg0, 0));
3196         break;
3197       case 8:
3198         // int64_t
3199         __ ldr(c_rarg1, Address(c_rarg0, 0));
3200         break;
3201       default:
3202         ShouldNotReachHere();
3203     }
3204 
3205     // return errValue or *adr
3206     *continuation_pc = __ pc();
3207     __ mov(r0, c_rarg1);
3208     __ ret(lr);
3209   }
3210 #endif
3211 
3212   /**
3213    *  Arguments:
3214    *
3215    * Inputs:
3216    *   c_rarg0   - int crc
3217    *   c_rarg1   - byte* buf
3218    *   c_rarg2   - int length
3219    *
3220    * Output:
3221    *       r0   - int crc result
3222    */
3223   address generate_updateBytesCRC32() {
3224     assert(UseCRC32Intrinsics, "what are we doing here?");
3225 
3226     __ align(CodeEntryAlignment);
3227     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3228 
3229     address start = __ pc();
3230 
3231     const Register crc   = c_rarg0;  // crc
3232     const Register buf   = c_rarg1;  // source java byte array address
3233     const Register len   = c_rarg2;  // length
3234     const Register table0 = c_rarg3; // crc_table address
3235     const Register table1 = c_rarg4;
3236     const Register table2 = c_rarg5;
3237     const Register table3 = c_rarg6;
3238     const Register tmp3 = c_rarg7;
3239 
3240     BLOCK_COMMENT("Entry:");
3241     __ enter(); // required for proper stackwalking of RuntimeStub frame
3242 
3243     __ kernel_crc32(crc, buf, len,
3244               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3245 
3246     __ leave(); // required for proper stackwalking of RuntimeStub frame
3247     __ ret(lr);
3248 
3249     return start;
3250   }
3251 
3252   /**
3253    *  Arguments:
3254    *
3255    * Inputs:
3256    *   c_rarg0   - int crc
3257    *   c_rarg1   - byte* buf
3258    *   c_rarg2   - int length
3259    *   c_rarg3   - int* table
3260    *
3261    * Output:
3262    *       r0   - int crc result
3263    */
3264   address generate_updateBytesCRC32C() {
3265     assert(UseCRC32CIntrinsics, "what are we doing here?");
3266 
3267     __ align(CodeEntryAlignment);
3268     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3269 
3270     address start = __ pc();
3271 
3272     const Register crc   = c_rarg0;  // crc
3273     const Register buf   = c_rarg1;  // source java byte array address
3274     const Register len   = c_rarg2;  // length
3275     const Register table0 = c_rarg3; // crc_table address
3276     const Register table1 = c_rarg4;
3277     const Register table2 = c_rarg5;
3278     const Register table3 = c_rarg6;
3279     const Register tmp3 = c_rarg7;
3280 
3281     BLOCK_COMMENT("Entry:");
3282     __ enter(); // required for proper stackwalking of RuntimeStub frame
3283 
3284     __ kernel_crc32c(crc, buf, len,
3285               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3286 
3287     __ leave(); // required for proper stackwalking of RuntimeStub frame
3288     __ ret(lr);
3289 
3290     return start;
3291   }
3292 
3293   /**
3294    *  Arguments:
3295    *
3296    *  Inputs:
3297    *   c_rarg0   - int   adler
3298    *   c_rarg1   - byte* buff
3299    *   c_rarg2   - int   len
3300    *
3301    * Output:
3302    *   c_rarg0   - int adler result
3303    */
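       //
       // Adler-32, in outline (this is what the code below computes, with the
       // modulo reductions deferred as long as possible):
       //
       //   s1 = adler & 0xffff;  s2 = adler >> 16;
       //   for (i = 0; i < len; i++) { s1 += buff[i]; s2 += s1; }
       //   return ((s2 % 65521) << 16) | (s1 % 65521);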
3304   address generate_updateBytesAdler32() {
3305     __ align(CodeEntryAlignment);
3306     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3307     address start = __ pc();
3308 
3309     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3310 
3311     // Aliases
3312     Register adler  = c_rarg0;
3313     Register s1     = c_rarg0;
3314     Register s2     = c_rarg3;
3315     Register buff   = c_rarg1;
3316     Register len    = c_rarg2;
3317     Register nmax  = r4;
3318     Register base = r5;
3319     Register count = r6;
3320     Register temp0 = rscratch1;
3321     Register temp1 = rscratch2;
3322     Register temp2 = r7;
3323 
3324     // Max number of bytes we can process before having to take the mod
3325     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3326     unsigned long BASE = 0xfff1;
3327     unsigned long NMAX = 0x15B0;
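         // BASE = 0xfff1 = 2^16 - 15, so 2^16 == 15 (mod BASE).  The reductions
         // below therefore avoid a divide; roughly, in C:
         //   x = (x >> 16) * 15 + (x & 0xffff);   // applied twice when x may be large
         //   if (x >= BASE) x -= BASE;            // final conditional subtract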
3328 
3329     __ mov(base, BASE);
3330     __ mov(nmax, NMAX);
3331 
3332     // s1 is initialized to the lower 16 bits of adler
3333     // s2 is initialized to the upper 16 bits of adler
3334     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
3335     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
3336 
3337     // The pipelined loop needs at least 16 bytes for one iteration.
3338     // It does check this itself, but it is more efficient to skip straight to the cleanup loop for short inputs.
3339     __ cmp(len, 16);
3340     __ br(Assembler::HS, L_nmax);
3341     __ cbz(len, L_combine);
3342 
3343     __ bind(L_simple_by1_loop);
3344     __ ldrb(temp0, Address(__ post(buff, 1)));
3345     __ add(s1, s1, temp0);
3346     __ add(s2, s2, s1);
3347     __ subs(len, len, 1);
3348     __ br(Assembler::HI, L_simple_by1_loop);
3349 
3350     // s1 = s1 % BASE
3351     __ subs(temp0, s1, base);
3352     __ csel(s1, temp0, s1, Assembler::HS);
3353 
3354     // s2 = s2 % BASE
3355     __ lsr(temp0, s2, 16);
3356     __ lsl(temp1, temp0, 4);
3357     __ sub(temp1, temp1, temp0);
3358     __ add(s2, temp1, s2, ext::uxth);
3359 
3360     __ subs(temp0, s2, base);
3361     __ csel(s2, temp0, s2, Assembler::HS);
3362 
3363     __ b(L_combine);
3364 
3365     __ bind(L_nmax);
3366     __ subs(len, len, nmax);
3367     __ sub(count, nmax, 16);
3368     __ br(Assembler::LO, L_by16);
3369 
3370     __ bind(L_nmax_loop);
3371 
3372     __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3373 
3374     __ add(s1, s1, temp0, ext::uxtb);
3375     __ ubfx(temp2, temp0, 8, 8);
3376     __ add(s2, s2, s1);
3377     __ add(s1, s1, temp2);
3378     __ ubfx(temp2, temp0, 16, 8);
3379     __ add(s2, s2, s1);
3380     __ add(s1, s1, temp2);
3381     __ ubfx(temp2, temp0, 24, 8);
3382     __ add(s2, s2, s1);
3383     __ add(s1, s1, temp2);
3384     __ ubfx(temp2, temp0, 32, 8);
3385     __ add(s2, s2, s1);
3386     __ add(s1, s1, temp2);
3387     __ ubfx(temp2, temp0, 40, 8);
3388     __ add(s2, s2, s1);
3389     __ add(s1, s1, temp2);
3390     __ ubfx(temp2, temp0, 48, 8);
3391     __ add(s2, s2, s1);
3392     __ add(s1, s1, temp2);
3393     __ add(s2, s2, s1);
3394     __ add(s1, s1, temp0, Assembler::LSR, 56);
3395     __ add(s2, s2, s1);
3396 
3397     __ add(s1, s1, temp1, ext::uxtb);
3398     __ ubfx(temp2, temp1, 8, 8);
3399     __ add(s2, s2, s1);
3400     __ add(s1, s1, temp2);
3401     __ ubfx(temp2, temp1, 16, 8);
3402     __ add(s2, s2, s1);
3403     __ add(s1, s1, temp2);
3404     __ ubfx(temp2, temp1, 24, 8);
3405     __ add(s2, s2, s1);
3406     __ add(s1, s1, temp2);
3407     __ ubfx(temp2, temp1, 32, 8);
3408     __ add(s2, s2, s1);
3409     __ add(s1, s1, temp2);
3410     __ ubfx(temp2, temp1, 40, 8);
3411     __ add(s2, s2, s1);
3412     __ add(s1, s1, temp2);
3413     __ ubfx(temp2, temp1, 48, 8);
3414     __ add(s2, s2, s1);
3415     __ add(s1, s1, temp2);
3416     __ add(s2, s2, s1);
3417     __ add(s1, s1, temp1, Assembler::LSR, 56);
3418     __ add(s2, s2, s1);
3419 
3420     __ subs(count, count, 16);
3421     __ br(Assembler::HS, L_nmax_loop);
3422 
3423     // s1 = s1 % BASE
3424     __ lsr(temp0, s1, 16);
3425     __ lsl(temp1, temp0, 4);
3426     __ sub(temp1, temp1, temp0);
3427     __ add(temp1, temp1, s1, ext::uxth);
3428 
3429     __ lsr(temp0, temp1, 16);
3430     __ lsl(s1, temp0, 4);
3431     __ sub(s1, s1, temp0);
3432     __ add(s1, s1, temp1, ext::uxth);
3433 
3434     __ subs(temp0, s1, base);
3435     __ csel(s1, temp0, s1, Assembler::HS);
3436 
3437     // s2 = s2 % BASE
3438     __ lsr(temp0, s2, 16);
3439     __ lsl(temp1, temp0, 4);
3440     __ sub(temp1, temp1, temp0);
3441     __ add(temp1, temp1, s2, ext::uxth);
3442 
3443     __ lsr(temp0, temp1, 16);
3444     __ lsl(s2, temp0, 4);
3445     __ sub(s2, s2, temp0);
3446     __ add(s2, s2, temp1, ext::uxth);
3447 
3448     __ subs(temp0, s2, base);
3449     __ csel(s2, temp0, s2, Assembler::HS);
3450 
3451     __ subs(len, len, nmax);
3452     __ sub(count, nmax, 16);
3453     __ br(Assembler::HS, L_nmax_loop);
3454 
3455     __ bind(L_by16);
3456     __ adds(len, len, count);
3457     __ br(Assembler::LO, L_by1);
3458 
3459     __ bind(L_by16_loop);
3460 
3461     __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3462 
3463     __ add(s1, s1, temp0, ext::uxtb);
3464     __ ubfx(temp2, temp0, 8, 8);
3465     __ add(s2, s2, s1);
3466     __ add(s1, s1, temp2);
3467     __ ubfx(temp2, temp0, 16, 8);
3468     __ add(s2, s2, s1);
3469     __ add(s1, s1, temp2);
3470     __ ubfx(temp2, temp0, 24, 8);
3471     __ add(s2, s2, s1);
3472     __ add(s1, s1, temp2);
3473     __ ubfx(temp2, temp0, 32, 8);
3474     __ add(s2, s2, s1);
3475     __ add(s1, s1, temp2);
3476     __ ubfx(temp2, temp0, 40, 8);
3477     __ add(s2, s2, s1);
3478     __ add(s1, s1, temp2);
3479     __ ubfx(temp2, temp0, 48, 8);
3480     __ add(s2, s2, s1);
3481     __ add(s1, s1, temp2);
3482     __ add(s2, s2, s1);
3483     __ add(s1, s1, temp0, Assembler::LSR, 56);
3484     __ add(s2, s2, s1);
3485 
3486     __ add(s1, s1, temp1, ext::uxtb);
3487     __ ubfx(temp2, temp1, 8, 8);
3488     __ add(s2, s2, s1);
3489     __ add(s1, s1, temp2);
3490     __ ubfx(temp2, temp1, 16, 8);
3491     __ add(s2, s2, s1);
3492     __ add(s1, s1, temp2);
3493     __ ubfx(temp2, temp1, 24, 8);
3494     __ add(s2, s2, s1);
3495     __ add(s1, s1, temp2);
3496     __ ubfx(temp2, temp1, 32, 8);
3497     __ add(s2, s2, s1);
3498     __ add(s1, s1, temp2);
3499     __ ubfx(temp2, temp1, 40, 8);
3500     __ add(s2, s2, s1);
3501     __ add(s1, s1, temp2);
3502     __ ubfx(temp2, temp1, 48, 8);
3503     __ add(s2, s2, s1);
3504     __ add(s1, s1, temp2);
3505     __ add(s2, s2, s1);
3506     __ add(s1, s1, temp1, Assembler::LSR, 56);
3507     __ add(s2, s2, s1);
3508 
3509     __ subs(len, len, 16);
3510     __ br(Assembler::HS, L_by16_loop);
3511 
3512     __ bind(L_by1);
3513     __ adds(len, len, 15);
3514     __ br(Assembler::LO, L_do_mod);
3515 
3516     __ bind(L_by1_loop);
3517     __ ldrb(temp0, Address(__ post(buff, 1)));
3518     __ add(s1, temp0, s1);
3519     __ add(s2, s2, s1);
3520     __ subs(len, len, 1);
3521     __ br(Assembler::HS, L_by1_loop);
3522 
3523     __ bind(L_do_mod);
3524     // s1 = s1 % BASE
3525     __ lsr(temp0, s1, 16);
3526     __ lsl(temp1, temp0, 4);
3527     __ sub(temp1, temp1, temp0);
3528     __ add(temp1, temp1, s1, ext::uxth);
3529 
3530     __ lsr(temp0, temp1, 16);
3531     __ lsl(s1, temp0, 4);
3532     __ sub(s1, s1, temp0);
3533     __ add(s1, s1, temp1, ext::uxth);
3534 
3535     __ subs(temp0, s1, base);
3536     __ csel(s1, temp0, s1, Assembler::HS);
3537 
3538     // s2 = s2 % BASE
3539     __ lsr(temp0, s2, 16);
3540     __ lsl(temp1, temp0, 4);
3541     __ sub(temp1, temp1, temp0);
3542     __ add(temp1, temp1, s2, ext::uxth);
3543 
3544     __ lsr(temp0, temp1, 16);
3545     __ lsl(s2, temp0, 4);
3546     __ sub(s2, s2, temp0);
3547     __ add(s2, s2, temp1, ext::uxth);
3548 
3549     __ subs(temp0, s2, base);
3550     __ csel(s2, temp0, s2, Assembler::HS);
3551 
3552     // Combine lower bits and higher bits
3553     __ bind(L_combine);
3554     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3555 
3556     __ ret(lr);
3557 
3558     return start;
3559   }
3560 
3561   /**
3562    *  Arguments:
3563    *
3564    *  Input:
3565    *    c_rarg0   - x address
3566    *    c_rarg1   - x length
3567    *    c_rarg2   - y address
3568    *    c_rarg3   - y length
3569    *    c_rarg4   - z address
3570    *    c_rarg5   - z length
3571    */
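       //
       // In outline: multiplies the xlen-word magnitude at x by the ylen-word
       // magnitude at y into the zlen-word result at z; the multiply itself is
       // implemented in MacroAssembler::multiply_to_len.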
3572   address generate_multiplyToLen() {
3573     __ align(CodeEntryAlignment);
3574     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3575 
3576     address start = __ pc();
3577     const Register x     = r0;
3578     const Register xlen  = r1;
3579     const Register y     = r2;
3580     const Register ylen  = r3;
3581     const Register z     = r4;
3582     const Register zlen  = r5;
3583 
3584     const Register tmp1  = r10;
3585     const Register tmp2  = r11;
3586     const Register tmp3  = r12;
3587     const Register tmp4  = r13;
3588     const Register tmp5  = r14;
3589     const Register tmp6  = r15;
3590     const Register tmp7  = r16;
3591 
3592     BLOCK_COMMENT("Entry:");
3593     __ enter(); // required for proper stackwalking of RuntimeStub frame
3594     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3595     __ leave(); // required for proper stackwalking of RuntimeStub frame
3596     __ ret(lr);
3597 
3598     return start;
3599   }
3600 
3601   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3602                       FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3603                       FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3604     // Karatsuba multiplication performs a 128*128 -> 256-bit
3605     // multiplication in three 128-bit multiplications and a few
3606     // additions.
3607     //
3608     // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3609     // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3610     //
3611     // Inputs:
3612     //
3613     // A0 in a.d[0]     (subkey)
3614     // A1 in a.d[1]
3615     // (A1+A0) in a1_xor_a0.d[0]
3616     //
3617     // B0 in b.d[0]     (state)
3618     // B1 in b.d[1]
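         //
         // As a C-style sketch (clmul64 is an assumed 64x64 -> 128-bit
         // carry-less multiply, not a helper in this file):
         //
         //   c = clmul64(a1, b1);             // high product
         //   d = clmul64(a0, b0);             // low product
         //   e = clmul64(a1 ^ a0, b1 ^ b0);   // cross terms
         //   result = (c << 128) ^ ((e ^ c ^ d) << 64) ^ d;   // 256-bit product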
3619 
3620     __ ext(tmp1, __ T16B, b, b, 0x08);
3621     __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
3622     __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
3623     __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
3624     __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3625 
3626     __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3627     __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3628     __ eor(tmp2, __ T16B, tmp2, tmp4);
3629     __ eor(tmp2, __ T16B, tmp2, tmp3);
3630 
3631     // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3632     __ ins(result_hi, __ D, tmp2, 0, 1);
3633     __ ins(result_lo, __ D, tmp2, 1, 0);
3634   }
3635 
3636   void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3637                     FloatRegister p, FloatRegister z, FloatRegister t1) {
3638     const FloatRegister t0 = result;
3639 
3640     // The GCM field polynomial f is z^128 + p(z), where p =
3641     // z^7+z^2+z+1.
3642     //
3643     //    z^128 === -p(z)  (mod (z^128 + p(z)))
3644     //
3645     // so, given that the product we're reducing is
3646     //    a == lo + hi * z^128
3647     // substituting,
3648     //      === lo - hi * p(z)  (mod (z^128 + p(z)))
3649     //
3650     // we reduce by multiplying hi by p(z) and subtracting the result
3651     // from (i.e. XORing it with) lo.  Because p has no nonzero high
3652     // bits we can do this with two 64-bit multiplications, lo*p and
3653     // hi*p.
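         //
         // Roughly, with clmul64 an assumed 64x64 -> 128-bit carry-less multiply:
         //
         //   t = clmul64(hi.d[1], p);             // fold the top 64 bits of hi
         //   hi ^= t >> 64;  lo ^= t << 64;
         //   result = lo ^ clmul64(hi.d[0], p);   // fold the remaining 64 bits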
3654 
3655     __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3656     __ ext(t1, __ T16B, t0, z, 8);
3657     __ eor(hi, __ T16B, hi, t1);
3658     __ ext(t1, __ T16B, z, t0, 8);
3659     __ eor(lo, __ T16B, lo, t1);
3660     __ pmull(t0, __ T1Q, hi, p, __ T1D);
3661     __ eor(result, __ T16B, lo, t0);
3662   }
3663 
3664   /**
3665    *  Arguments:
3666    *
3667    *  Input:
3668    *  c_rarg0   - current state address
3669    *  c_rarg1   - H key address
3670    *  c_rarg2   - data address
3671    *  c_rarg3   - number of blocks
3672    *
3673    *  Output:
3674    *  Updated state at c_rarg0
3675    */
3676   address generate_ghash_processBlocks() {
3677     // Bafflingly, GCM uses little-endian for the byte order, but
3678     // big-endian for the bit order.  For example, the polynomial 1 is
3679     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
3680     //
3681     // So, we must either reverse the bytes in each word and do
3682     // everything big-endian or reverse the bits in each byte and do
3683     // it little-endian.  On AArch64 it's more idiomatic to reverse
3684     // the bits in each byte (we have an instruction, RBIT, to do
3685     // that) and keep the data in little-endian bit order throughout the
3686     // calculation, bit-reversing the inputs and outputs.
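         //
         // In outline (C-like; gf128_mul stands for the ghash_multiply +
         // ghash_reduce pair below):
         //
         //   for (int i = 0; i < blocks; i++)
         //     state = gf128_mul(state ^ data[i], subkeyH);   // all values bit-reversed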
3687 
3688     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
3689     __ align(wordSize * 2);
3690     address p = __ pc();
3691     __ emit_int64(0x87);  // The low-order bits of the field
3692                           // polynomial (i.e. p = z^7+z^2+z+1)
3693                           // repeated in the low and high parts of a
3694                           // 128-bit vector
3695     __ emit_int64(0x87);
3696 
3697     __ align(CodeEntryAlignment);
3698     address start = __ pc();
3699 
3700     Register state   = c_rarg0;
3701     Register subkeyH = c_rarg1;
3702     Register data    = c_rarg2;
3703     Register blocks  = c_rarg3;
3704 
3705     FloatRegister vzr = v30;
3706     __ eor(vzr, __ T16B, vzr, vzr); // zero register
3707 
3708     __ ldrq(v0, Address(state));
3709     __ ldrq(v1, Address(subkeyH));
3710 
3711     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
3712     __ rbit(v0, __ T16B, v0);
3713     __ rev64(v1, __ T16B, v1);
3714     __ rbit(v1, __ T16B, v1);
3715 
3716     __ ldrq(v26, p);
3717 
3718     __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
3719     __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
3720 
3721     {
3722       Label L_ghash_loop;
3723       __ bind(L_ghash_loop);
3724 
3725       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
3726                                                  // reversing each byte
3727       __ rbit(v2, __ T16B, v2);
3728       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
3729 
3730       // Multiply state in v2 by subkey in v1
3731       ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
3732                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
3733                      /*temps*/v6, v20, v18, v21);
3734       // Reduce v7:v5 by the field polynomial
3735       ghash_reduce(v0, v5, v7, v26, vzr, v20);
3736 
3737       __ sub(blocks, blocks, 1);
3738       __ cbnz(blocks, L_ghash_loop);
3739     }
3740 
3741     // The bit-reversed result is at this point in v0
3742     __ rev64(v1, __ T16B, v0);
3743     __ rbit(v1, __ T16B, v1);
3744 
3745     __ st1(v1, __ T16B, state);
3746     __ ret(lr);
3747 
3748     return start;
3749   }
3750 
3751   // Continuation point for throwing of implicit exceptions that are
3752   // not handled in the current activation. Fabricates an exception
3753   // oop and initiates normal exception dispatching in this
3754   // frame. Since we need to preserve callee-saved values (currently
3755   // only for C2, but done for C1 as well) we need a callee-saved oop
3756   // map and therefore have to make these stubs into RuntimeStubs
3757   // rather than BufferBlobs.  If the compiler needs all registers to
3758   // be preserved between the fault point and the exception handler
3759   // then it must assume responsibility for that in
3760   // AbstractCompiler::continuation_for_implicit_null_exception or
3761   // continuation_for_implicit_division_by_zero_exception. All other
3762   // implicit exceptions (e.g., NullPointerException or
3763   // AbstractMethodError on entry) are either at call sites or
3764   // otherwise assume that stack unwinding will be initiated, so
3765   // caller saved registers were assumed volatile in the compiler.
3766 
3767 #undef __
3768 #define __ masm->
3769 
3770   address generate_throw_exception(const char* name,
3771                                    address runtime_entry,
3772                                    Register arg1 = noreg,
3773                                    Register arg2 = noreg) {
3774     // Information about frame layout at time of blocking runtime call.
3775     // Note that we only have to preserve callee-saved registers since
3776     // the compilers are responsible for supplying a continuation point
3777     // if they expect all registers to be preserved.
3778     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
3779     enum layout {
3780       rfp_off = 0,
3781       rfp_off2,
3782       return_off,
3783       return_off2,
3784       framesize // inclusive of return address
3785     };
3786 
3787     int insts_size = 512;
3788     int locs_size  = 64;
3789 
3790     CodeBuffer code(name, insts_size, locs_size);
3791     OopMapSet* oop_maps  = new OopMapSet();
3792     MacroAssembler* masm = new MacroAssembler(&code);
3793 
3794     address start = __ pc();
3795 
3796     // This is an inlined and slightly modified version of call_VM
3797     // which has the ability to fetch the return PC out of
3798     // thread-local storage and also sets up last_Java_sp slightly
3799     // differently than the real call_VM
3800 
3801     __ enter(); // Save FP and LR before call
3802 
3803     assert(is_even(framesize/2), "sp not 16-byte aligned");
3804 
3805     // lr and fp are already in place
3806     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
3807 
3808     int frame_complete = __ pc() - start;
3809 
3810     // Set up last_Java_sp and last_Java_fp
3811     address the_pc = __ pc();
3812     __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1);
3813 
3814     // Call runtime
3815     if (arg1 != noreg) {
3816       assert(arg2 != c_rarg1, "clobbered");
3817       __ mov(c_rarg1, arg1);
3818     }
3819     if (arg2 != noreg) {
3820       __ mov(c_rarg2, arg2);
3821     }
3822     __ mov(c_rarg0, rthread);
3823     BLOCK_COMMENT("call runtime_entry");
3824     __ mov(rscratch1, runtime_entry);
3825     __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);
3826 
3827     // Generate oop map
3828     OopMap* map = new OopMap(framesize, 0);
3829 
3830     oop_maps->add_gc_map(the_pc - start, map);
3831 
3832     __ reset_last_Java_frame(true);
3833     __ maybe_isb();
3834 
3835     __ leave();
3836 
3837     // check for pending exceptions
3838 #ifdef ASSERT
3839     Label L;
3840     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
3841     __ cbnz(rscratch1, L);
3842     __ should_not_reach_here();
3843     __ bind(L);
3844 #endif // ASSERT
3845     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3846 
3847 
3848     // codeBlob framesize is in words (not VMRegImpl::slot_size)
3849     RuntimeStub* stub =
3850       RuntimeStub::new_runtime_stub(name,
3851                                     &code,
3852                                     frame_complete,
3853                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3854                                     oop_maps, false);
3855     return stub->entry_point();
3856   }
3857 
3858   class MontgomeryMultiplyGenerator : public MacroAssembler {
3859 
3860     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
3861       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
3862 
3863     RegSet _toSave;
3864     bool _squaring;
3865 
3866   public:
3867     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
3868       : MacroAssembler(as->code()), _squaring(squaring) {
3869 
3870       // Register allocation
3871 
3872       Register reg = c_rarg0;
3873       Pa_base = reg;       // Argument registers
3874       if (squaring)
3875         Pb_base = Pa_base;
3876       else
3877         Pb_base = ++reg;
3878       Pn_base = ++reg;
3879       Rlen= ++reg;
3880       inv = ++reg;
3881       Pm_base = ++reg;
3882 
3883                           // Working registers:
3884       Ra =  ++reg;        // The current digit of a, b, n, and m.
3885       Rb =  ++reg;
3886       Rm =  ++reg;
3887       Rn =  ++reg;
3888 
3889       Pa =  ++reg;        // Pointers to the current/next digit of a, b, n, and m.
3890       Pb =  ++reg;
3891       Pm =  ++reg;
3892       Pn =  ++reg;
3893 
3894       t0 =  ++reg;        // Three registers which form a
3895       t1 =  ++reg;        // triple-precision accumulator.
3896       t2 =  ++reg;
3897 
3898       Ri =  ++reg;        // Inner and outer loop indexes.
3899       Rj =  ++reg;
3900 
3901       Rhi_ab = ++reg;     // Product registers: low and high parts
3902       Rlo_ab = ++reg;     // of a*b and m*n.
3903       Rhi_mn = ++reg;
3904       Rlo_mn = ++reg;
3905 
3906       // r19 and up are callee-saved.
3907       _toSave = RegSet::range(r19, reg) + Pm_base;
3908     }
3909 
3910   private:
3911     void save_regs() {
3912       push(_toSave, sp);
3913     }
3914 
3915     void restore_regs() {
3916       pop(_toSave, sp);
3917     }
3918 
3919     template <typename T>
3920     void unroll_2(Register count, T block) {
3921       Label loop, end, odd;
3922       tbnz(count, 0, odd);
3923       cbz(count, end);
3924       align(16);
3925       bind(loop);
3926       (this->*block)();
3927       bind(odd);
3928       (this->*block)();
3929       subs(count, count, 2);
3930       br(Assembler::GT, loop);
3931       bind(end);
3932     }
3933 
3934     template <typename T>
3935     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
3936       Label loop, end, odd;
3937       tbnz(count, 0, odd);
3938       cbz(count, end);
3939       align(16);
3940       bind(loop);
3941       (this->*block)(d, s, tmp);
3942       bind(odd);
3943       (this->*block)(d, s, tmp);
3944       subs(count, count, 2);
3945       br(Assembler::GT, loop);
3946       bind(end);
3947     }
3948 
3949     void pre1(RegisterOrConstant i) {
3950       block_comment("pre1");
3951       // Pa = Pa_base;
3952       // Pb = Pb_base + i;
3953       // Pm = Pm_base;
3954       // Pn = Pn_base + i;
3955       // Ra = *Pa;
3956       // Rb = *Pb;
3957       // Rm = *Pm;
3958       // Rn = *Pn;
3959       ldr(Ra, Address(Pa_base));
3960       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
3961       ldr(Rm, Address(Pm_base));
3962       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3963       lea(Pa, Address(Pa_base));
3964       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
3965       lea(Pm, Address(Pm_base));
3966       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3967 
3968       // Zero the m*n result.
3969       mov(Rhi_mn, zr);
3970       mov(Rlo_mn, zr);
3971     }
3972 
3973     // The core multiply-accumulate step of a Montgomery
3974     // multiplication.  The idea is to schedule operations as a
3975     // pipeline so that instructions with long latencies (loads and
3976     // multiplies) have time to complete before their results are
3977     // used.  This most benefits in-order implementations of the
3978     // architecture but out-of-order ones also benefit.
3979     void step() {
3980       block_comment("step");
3981       // MACC(Ra, Rb, t0, t1, t2);
3982       // Ra = *++Pa;
3983       // Rb = *--Pb;
3984       umulh(Rhi_ab, Ra, Rb);
3985       mul(Rlo_ab, Ra, Rb);
3986       ldr(Ra, pre(Pa, wordSize));
3987       ldr(Rb, pre(Pb, -wordSize));
3988       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
3989                                        // previous iteration.
3990       // MACC(Rm, Rn, t0, t1, t2);
3991       // Rm = *++Pm;
3992       // Rn = *--Pn;
3993       umulh(Rhi_mn, Rm, Rn);
3994       mul(Rlo_mn, Rm, Rn);
3995       ldr(Rm, pre(Pm, wordSize));
3996       ldr(Rn, pre(Pn, -wordSize));
3997       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
3998     }
3999 
4000     void post1() {
4001       block_comment("post1");
4002 
4003       // MACC(Ra, Rb, t0, t1, t2);
4004       // Ra = *++Pa;
4005       // Rb = *--Pb;
4006       umulh(Rhi_ab, Ra, Rb);
4007       mul(Rlo_ab, Ra, Rb);
4008       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
4009       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4010 
4011       // *Pm = Rm = t0 * inv;
4012       mul(Rm, t0, inv);
4013       str(Rm, Address(Pm));
4014 
4015       // MACC(Rm, Rn, t0, t1, t2);
4016       // t0 = t1; t1 = t2; t2 = 0;
4017       umulh(Rhi_mn, Rm, Rn);
4018 
4019 #ifndef PRODUCT
4020       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
4021       {
4022         mul(Rlo_mn, Rm, Rn);
4023         add(Rlo_mn, t0, Rlo_mn);
4024         Label ok;
4025         cbz(Rlo_mn, ok); {
4026           stop("broken Montgomery multiply");
4027         } bind(ok);
4028       }
4029 #endif
4030       // We have very carefully set things up so that
4031       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
4032       // the lower half of Rm * Rn because we know the result already:
4033       // it must be -t0.  t0 + (-t0) must generate a carry iff
4034       // t0 != 0.  So, rather than do a mul and an adds we just set
4035       // the carry flag iff t0 is nonzero.
4036       //
4037       // mul(Rlo_mn, Rm, Rn);
4038       // adds(zr, t0, Rlo_mn);
4039       subs(zr, t0, 1); // Set carry iff t0 is nonzero
4040       adcs(t0, t1, Rhi_mn);
4041       adc(t1, t2, zr);
4042       mov(t2, zr);
4043     }
4044 
4045     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
4046       block_comment("pre2");
4047       // Pa = Pa_base + i-len;
4048       // Pb = Pb_base + len;
4049       // Pm = Pm_base + i-len;
4050       // Pn = Pn_base + len;
4051 
4052       if (i.is_register()) {
4053         sub(Rj, i.as_register(), len);
4054       } else {
4055         mov(Rj, i.as_constant());
4056         sub(Rj, Rj, len);
4057       }
4058       // Rj == i-len
4059 
4060       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
4061       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
4062       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
4063       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
4064 
4065       // Ra = *++Pa;
4066       // Rb = *--Pb;
4067       // Rm = *++Pm;
4068       // Rn = *--Pn;
4069       ldr(Ra, pre(Pa, wordSize));
4070       ldr(Rb, pre(Pb, -wordSize));
4071       ldr(Rm, pre(Pm, wordSize));
4072       ldr(Rn, pre(Pn, -wordSize));
4073 
4074       mov(Rhi_mn, zr);
4075       mov(Rlo_mn, zr);
4076     }
4077 
4078     void post2(RegisterOrConstant i, RegisterOrConstant len) {
4079       block_comment("post2");
4080       if (i.is_constant()) {
4081         mov(Rj, i.as_constant()-len.as_constant());
4082       } else {
4083         sub(Rj, i.as_register(), len);
4084       }
4085 
4086       adds(t0, t0, Rlo_mn); // The pending m*n, low part
4087 
4088       // As soon as we know the least significant digit of our result,
4089       // store it.
4090       // Pm_base[i-len] = t0;
4091       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
4092 
4093       // t0 = t1; t1 = t2; t2 = 0;
4094       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
4095       adc(t1, t2, zr);
4096       mov(t2, zr);
4097     }
4098 
4099     // A carry in t0 after Montgomery multiplication means that we
4100     // should subtract multiples of n from our result in m.  We'll
4101     // keep doing that until there is no carry.
4102     void normalize(RegisterOrConstant len) {
4103       block_comment("normalize");
4104       // while (t0)
4105       //   t0 = sub(Pm_base, Pn_base, t0, len);
4106       Label loop, post, again;
4107       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
4108       cbz(t0, post); {
4109         bind(again); {
4110           mov(i, zr);
4111           mov(cnt, len);
4112           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
4113           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4114           subs(zr, zr, zr); // set carry flag, i.e. no borrow
4115           align(16);
4116           bind(loop); {
4117             sbcs(Rm, Rm, Rn);
4118             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
4119             add(i, i, 1);
4120             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
4121             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4122             sub(cnt, cnt, 1);
4123           } cbnz(cnt, loop);
4124           sbc(t0, t0, zr);
4125         } cbnz(t0, again);
4126       } bind(post);
4127     }
4128 
4129     // Move memory at s to d, reversing words.
4130     //    Increments d to end of copied memory
4131     //    Destroys tmp1, tmp2
4132     //    Preserves len
4133     //    Leaves s pointing to the address which was in d at start
4134     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
4135       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
4136 
4137       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
4138       mov(tmp1, len);
4139       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
4140       sub(s, d, len, ext::uxtw, LogBytesPerWord);
4141     }
4142     // Helper for reverse(): moves one 64-bit word, swapping its 32-bit halves.
4143     void reverse1(Register d, Register s, Register tmp) {
4144       ldr(tmp, pre(s, -wordSize));
4145       ror(tmp, tmp, 32);
4146       str(tmp, post(d, wordSize));
4147     }
4148 
4149     void step_squaring() {
4150       // An extra ACC
4151       step();
4152       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4153     }
4154 
4155     void last_squaring(RegisterOrConstant i) {
4156       Label dont;
4157       // if ((i & 1) == 0) {
4158       tbnz(i.as_register(), 0, dont); {
4159         // MACC(Ra, Rb, t0, t1, t2);
4160         // Ra = *++Pa;
4161         // Rb = *--Pb;
4162         umulh(Rhi_ab, Ra, Rb);
4163         mul(Rlo_ab, Ra, Rb);
4164         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4165       } bind(dont);
4166     }
4167 
4168     void extra_step_squaring() {
4169       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
4170 
4171       // MACC(Rm, Rn, t0, t1, t2);
4172       // Rm = *++Pm;
4173       // Rn = *--Pn;
4174       umulh(Rhi_mn, Rm, Rn);
4175       mul(Rlo_mn, Rm, Rn);
4176       ldr(Rm, pre(Pm, wordSize));
4177       ldr(Rn, pre(Pn, -wordSize));
4178     }
4179 
4180     void post1_squaring() {
4181       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
4182 
4183       // *Pm = Rm = t0 * inv;
4184       mul(Rm, t0, inv);
4185       str(Rm, Address(Pm));
4186 
4187       // MACC(Rm, Rn, t0, t1, t2);
4188       // t0 = t1; t1 = t2; t2 = 0;
4189       umulh(Rhi_mn, Rm, Rn);
4190 
4191 #ifndef PRODUCT
4192       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
4193       {
4194         mul(Rlo_mn, Rm, Rn);
4195         add(Rlo_mn, t0, Rlo_mn);
4196         Label ok;
4197         cbz(Rlo_mn, ok); {
4198           stop("broken Montgomery multiply");
4199         } bind(ok);
4200       }
4201 #endif
4202       // We have very carefully set things up so that
4203       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
4204       // the lower half of Rm * Rn because we know the result already:
4205       // it must be -t0.  t0 + (-t0) must generate a carry iff
4206       // t0 != 0.  So, rather than do a mul and an adds we just set
4207       // the carry flag iff t0 is nonzero.
4208       //
4209       // mul(Rlo_mn, Rm, Rn);
4210       // adds(zr, t0, Rlo_mn);
4211       subs(zr, t0, 1); // Set carry iff t0 is nonzero
4212       adcs(t0, t1, Rhi_mn);
4213       adc(t1, t2, zr);
4214       mov(t2, zr);
4215     }
4216 
4217     void acc(Register Rhi, Register Rlo,
4218              Register t0, Register t1, Register t2) {
4219       adds(t0, t0, Rlo);
4220       adcs(t1, t1, Rhi);
4221       adc(t2, t2, zr);
4222     }
4223 
4224   public:
4225     /**
4226      * Fast Montgomery multiplication.  The derivation of the
4227      * algorithm is in A Cryptographic Library for the Motorola
4228      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
4229      *
4230      * Arguments:
4231      *
4232      * Inputs for multiplication:
4233      *   c_rarg0   - int array elements a
4234      *   c_rarg1   - int array elements b
4235      *   c_rarg2   - int array elements n (the modulus)
4236      *   c_rarg3   - int length
4237      *   c_rarg4   - int inv
4238      *   c_rarg5   - int array elements m (the result)
4239      *
4240      * Inputs for squaring:
4241      *   c_rarg0   - int array elements a
4242      *   c_rarg1   - int array elements n (the modulus)
4243      *   c_rarg2   - int length
4244      *   c_rarg3   - int inv
4245      *   c_rarg4   - int array elements m (the result)
4246      *
4247      */
4248     address generate_multiply() {
4249       Label argh, nothing;
4250       bind(argh);
4251       stop("MontgomeryMultiply total_allocation must be <= 8192");
4252 
4253       align(CodeEntryAlignment);
4254       address entry = pc();
4255 
4256       cbzw(Rlen, nothing);
4257 
4258       enter();
4259 
4260       // Make room.
4261       cmpw(Rlen, 512);
4262       br(Assembler::HI, argh);
4263       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
4264       andr(sp, Ra, -2 * wordSize);
4265 
4266       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
4267 
4268       {
4269         // Copy input args, reversing as we go.  We use Ra as a
4270         // temporary variable.
4271         reverse(Ra, Pa_base, Rlen, t0, t1);
4272         if (!_squaring)
4273           reverse(Ra, Pb_base, Rlen, t0, t1);
4274         reverse(Ra, Pn_base, Rlen, t0, t1);
4275       }
4276 
4277       // Push all call-saved registers and also Pm_base which we'll need
4278       // at the end.
4279       save_regs();
4280 
4281 #ifndef PRODUCT
4282       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
4283       {
4284         ldr(Rn, Address(Pn_base, 0));
4285         mul(Rlo_mn, Rn, inv);
4286         cmp(Rlo_mn, -1);
4287         Label ok;
4288         br(EQ, ok); {
4289           stop("broken inverse in Montgomery multiply");
4290         } bind(ok);
4291       }
4292 #endif
4293 
4294       mov(Pm_base, Ra);
4295 
4296       mov(t0, zr);
4297       mov(t1, zr);
4298       mov(t2, zr);
4299 
4300       block_comment("for (int i = 0; i < len; i++) {");
4301       mov(Ri, zr); {
4302         Label loop, end;
4303         cmpw(Ri, Rlen);
4304         br(Assembler::GE, end);
4305 
4306         bind(loop);
4307         pre1(Ri);
4308 
4309         block_comment("  for (j = i; j; j--) {"); {
4310           movw(Rj, Ri);
4311           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
4312         } block_comment("  } // j");
4313 
4314         post1();
4315         addw(Ri, Ri, 1);
4316         cmpw(Ri, Rlen);
4317         br(Assembler::LT, loop);
4318         bind(end);
4319         block_comment("} // i");
4320       }
4321 
4322       block_comment("for (int i = len; i < 2*len; i++) {");
4323       mov(Ri, Rlen); {
4324         Label loop, end;
4325         cmpw(Ri, Rlen, Assembler::LSL, 1);
4326         br(Assembler::GE, end);
4327 
4328         bind(loop);
4329         pre2(Ri, Rlen);
4330 
4331         block_comment("  for (j = len*2-i-1; j; j--) {"); {
4332           lslw(Rj, Rlen, 1);
4333           subw(Rj, Rj, Ri);
4334           subw(Rj, Rj, 1);
4335           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
4336         } block_comment("  } // j");
4337 
4338         post2(Ri, Rlen);
4339         addw(Ri, Ri, 1);
4340         cmpw(Ri, Rlen, Assembler::LSL, 1);
4341         br(Assembler::LT, loop);
4342         bind(end);
4343       }
4344       block_comment("} // i");
4345 
4346       normalize(Rlen);
4347 
4348       mov(Ra, Pm_base);  // Save Pm_base in Ra
4349       restore_regs();  // Restore caller's Pm_base
4350 
4351       // Copy our result into caller's Pm_base
4352       reverse(Pm_base, Ra, Rlen, t0, t1);
4353 
4354       leave();
4355       bind(nothing);
4356       ret(lr);
4357 
4358       return entry;
4359     }
4360     // In C, approximately:
4361 
4362     // void
4363     // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
4364     //                     unsigned long Pn_base[], unsigned long Pm_base[],
4365     //                     unsigned long inv, int len) {
4366     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4367     //   unsigned long *Pa, *Pb, *Pn, *Pm;
4368     //   unsigned long Ra, Rb, Rn, Rm;
4369 
4370     //   int i;
4371 
4372     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
4373 
4374     //   for (i = 0; i < len; i++) {
4375     //     int j;
4376 
4377     //     Pa = Pa_base;
4378     //     Pb = Pb_base + i;
4379     //     Pm = Pm_base;
4380     //     Pn = Pn_base + i;
4381 
4382     //     Ra = *Pa;
4383     //     Rb = *Pb;
4384     //     Rm = *Pm;
4385     //     Rn = *Pn;
4386 
4387     //     int iters = i;
4388     //     for (j = 0; iters--; j++) {
4389     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
4390     //       MACC(Ra, Rb, t0, t1, t2);
4391     //       Ra = *++Pa;
4392     //       Rb = *--Pb;
4393     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4394     //       MACC(Rm, Rn, t0, t1, t2);
4395     //       Rm = *++Pm;
4396     //       Rn = *--Pn;
4397     //     }
4398 
4399     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
4400     //     MACC(Ra, Rb, t0, t1, t2);
4401     //     *Pm = Rm = t0 * inv;
4402     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
4403     //     MACC(Rm, Rn, t0, t1, t2);
4404 
4405     //     assert(t0 == 0, "broken Montgomery multiply");
4406 
4407     //     t0 = t1; t1 = t2; t2 = 0;
4408     //   }
4409 
4410     //   for (i = len; i < 2*len; i++) {
4411     //     int j;
4412 
4413     //     Pa = Pa_base + i-len;
4414     //     Pb = Pb_base + len;
4415     //     Pm = Pm_base + i-len;
4416     //     Pn = Pn_base + len;
4417 
4418     //     Ra = *++Pa;
4419     //     Rb = *--Pb;
4420     //     Rm = *++Pm;
4421     //     Rn = *--Pn;
4422 
4423     //     int iters = len*2-i-1;
4424     //     for (j = i-len+1; iters--; j++) {
4425     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
4426     //       MACC(Ra, Rb, t0, t1, t2);
4427     //       Ra = *++Pa;
4428     //       Rb = *--Pb;
4429     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4430     //       MACC(Rm, Rn, t0, t1, t2);
4431     //       Rm = *++Pm;
4432     //       Rn = *--Pn;
4433     //     }
4434 
4435     //     Pm_base[i-len] = t0;
4436     //     t0 = t1; t1 = t2; t2 = 0;
4437     //   }
4438 
4439     //   while (t0)
4440     //     t0 = sub(Pm_base, Pn_base, t0, len);
4441     // }
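     //
     // The MACC helper assumed by the sketch above is not spelled out in
     // this file.  As an illustrative reading only (a sketch, not the code
     // the generator emits), it adds the full 64x64->128-bit product A*B
     // into the triple-precision accumulator (t0, t1, t2).  In C++ with a
     // GCC-style unsigned __int128:
     //
     // static inline void MACC(unsigned long A, unsigned long B,
     //                         unsigned long &t0, unsigned long &t1,
     //                         unsigned long &t2) {
     //   unsigned __int128 p = (unsigned __int128)A * B;
     //   unsigned long lo = (unsigned long)p, hi = (unsigned long)(p >> 64);
     //   t0 += lo;  hi += (t0 < lo);   // carry out of t0 folds into hi
     //   t1 += hi;  t2 += (t1 < hi);   // carry out of t1 folds into t2
     // }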
4442 
4443     /**
4444      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
4445      * multiplies than Montgomery multiplication, so it should be up to
4446      * 25% faster.  However, its loop control is more complex and it
4447      * may actually run slower on some machines.
4448      *
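          * Rough operation count behind the 25% figure (a back-of-the-
          * envelope estimate, not a measurement): for len-word operands, a
          * Montgomery multiply costs about len^2 word multiplies for a*b
          * plus len^2 for the m*n reduction, roughly 2*len^2 in total.
          * Squaring computes each cross term a[i]*a[j] only once and
          * doubles it, so the a*a half drops to about len^2/2 + len, giving
          * roughly 1.5*len^2 overall, i.e. asymptotically 25% fewer
          * multiplies.
          *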
4449      * Arguments:
4450      *
4451      * Inputs:
4452      *   c_rarg0   - int array elements a
4453      *   c_rarg1   - int array elements n (the modulus)
4454      *   c_rarg2   - int length
4455      *   c_rarg3   - long inv
4456      *   c_rarg4   - int array elements m (the result)
4457      *
4458      */
4459     address generate_square() {
4460       Label argh;
4461       bind(argh);
4462       stop("MontgomerySquare total_allocation must be <= 8192");
4463 
4464       align(CodeEntryAlignment);
4465       address entry = pc();
4466 
4467       enter();
4468 
4469       // Make room: check that Rlen <= 512 ints (so the 4 * Rlen * sizeof(jint)
4470       // allocation stays within the 8192-byte limit), then reserve
4471       // Rlen * 16 bytes of stack and re-align sp.
4472       cmpw(Rlen, 512);
4473       br(Assembler::HI, argh);
4474       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
4475       andr(sp, Ra, -2 * wordSize);
4474 
4475       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
4476 
4477       {
4478         // Copy input args, reversing as we go.  We use Ra as a
4479         // temporary variable.
4480         reverse(Ra, Pa_base, Rlen, t0, t1);
4481         reverse(Ra, Pn_base, Rlen, t0, t1);
4482       }
4483 
4484       // Push all callee-saved registers, and also Pm_base, which we'll need
4485       // at the end.
4486       save_regs();
4487 
4488       mov(Pm_base, Ra);
4489 
4490       mov(t0, zr);
4491       mov(t1, zr);
4492       mov(t2, zr);
4493 
4494       block_comment("for (int i = 0; i < len; i++) {");
4495       mov(Ri, zr); {
4496         Label loop, end;
4497         bind(loop);
4498         cmp(Ri, Rlen);
4499         br(Assembler::GE, end);
4500 
4501         pre1(Ri);
4502 
4503         block_comment("  for (j = (i+1)/2; j; j--) {"); {
4504           add(Rj, Ri, 1);
4505           lsr(Rj, Rj, 1);
4506           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4507         } block_comment("  } // j");
4508 
4509         last_squaring(Ri);
4510 
4511         block_comment("  for (j = i/2; j; j--) {"); {
4512           lsr(Rj, Ri, 1);
4513           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4514         } block_comment("  } // j");
4515 
4516         post1_squaring();
4517         add(Ri, Ri, 1);
4518         cmp(Ri, Rlen);
4519         br(Assembler::LT, loop);
4520 
4521         bind(end);
4522         block_comment("} // i");
4523       }
4524 
4525       block_comment("for (int i = len; i < 2*len; i++) {");
4526       mov(Ri, Rlen); {
4527         Label loop, end;
4528         bind(loop);
4529         cmp(Ri, Rlen, Assembler::LSL, 1);
4530         br(Assembler::GE, end);
4531 
4532         pre2(Ri, Rlen);
4533 
4534         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
4535           lsl(Rj, Rlen, 1);
4536           sub(Rj, Rj, Ri);
4537           sub(Rj, Rj, 1);
4538           lsr(Rj, Rj, 1);
4539           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4540         } block_comment("  } // j");
4541 
4542         last_squaring(Ri);
4543 
4544         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
4545           lsl(Rj, Rlen, 1);
4546           sub(Rj, Rj, Ri);
4547           lsr(Rj, Rj, 1);
4548           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4549         } block_comment("  } // j");
4550 
4551         post2(Ri, Rlen);
4552         add(Ri, Ri, 1);
4553         cmp(Ri, Rlen, Assembler::LSL, 1);
4554 
4555         br(Assembler::LT, loop);
4556         bind(end);
4557         block_comment("} // i");
4558       }
4559 
4560       normalize(Rlen);
4561 
4562       mov(Ra, Pm_base);  // Save Pm_base in Ra
4563       restore_regs();  // Restore caller's Pm_base
4564 
4565       // Copy our result into caller's Pm_base
4566       reverse(Pm_base, Ra, Rlen, t0, t1);
4567 
4568       leave();
4569       ret(lr);
4570 
4571       return entry;
4572     }
4573     // In C, approximately:
4574 
4575     // void
4576     // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
4577     //                   unsigned long Pm_base[], unsigned long inv, int len) {
4578     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4579     //   unsigned long *Pa, *Pb, *Pn, *Pm;
4580     //   unsigned long Ra, Rb, Rn, Rm;
4581 
4582     //   int i;
4583 
4584     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
4585 
4586     //   for (i = 0; i < len; i++) {
4587     //     int j;
4588 
4589     //     Pa = Pa_base;
4590     //     Pb = Pa_base + i;
4591     //     Pm = Pm_base;
4592     //     Pn = Pn_base + i;
4593 
4594     //     Ra = *Pa;
4595     //     Rb = *Pb;
4596     //     Rm = *Pm;
4597     //     Rn = *Pn;
4598 
4599     //     int iters = (i+1)/2;
4600     //     for (j = 0; iters--; j++) {
4601     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4602     //       MACC2(Ra, Rb, t0, t1, t2);
4603     //       Ra = *++Pa;
4604     //       Rb = *--Pb;
4605     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4606     //       MACC(Rm, Rn, t0, t1, t2);
4607     //       Rm = *++Pm;
4608     //       Rn = *--Pn;
4609     //     }
4610     //     if ((i & 1) == 0) {
4611     //       assert(Ra == Pa_base[j], "must be");
4612     //       MACC(Ra, Ra, t0, t1, t2);
4613     //     }
4614     //     iters = i/2;
4615     //     assert(iters == i-j, "must be");
4616     //     for (; iters--; j++) {
4617     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4618     //       MACC(Rm, Rn, t0, t1, t2);
4619     //       Rm = *++Pm;
4620     //       Rn = *--Pn;
4621     //     }
4622 
4623     //     *Pm = Rm = t0 * inv;
4624     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
4625     //     MACC(Rm, Rn, t0, t1, t2);
4626 
4627     //     assert(t0 == 0, "broken Montgomery multiply");
4628 
4629     //     t0 = t1; t1 = t2; t2 = 0;
4630     //   }
4631 
4632     //   for (i = len; i < 2*len; i++) {
4633     //     int start = i-len+1;
4634     //     int end = start + (len - start)/2;
4635     //     int j;
4636 
4637     //     Pa = Pa_base + i-len;
4638     //     Pb = Pa_base + len;
4639     //     Pm = Pm_base + i-len;
4640     //     Pn = Pn_base + len;
4641 
4642     //     Ra = *++Pa;
4643     //     Rb = *--Pb;
4644     //     Rm = *++Pm;
4645     //     Rn = *--Pn;
4646 
4647     //     int iters = (2*len-i-1)/2;
4648     //     assert(iters == end-start, "must be");
4649     //     for (j = start; iters--; j++) {
4650     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4651     //       MACC2(Ra, Rb, t0, t1, t2);
4652     //       Ra = *++Pa;
4653     //       Rb = *--Pb;
4654     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4655     //       MACC(Rm, Rn, t0, t1, t2);
4656     //       Rm = *++Pm;
4657     //       Rn = *--Pn;
4658     //     }
4659     //     if ((i & 1) == 0) {
4660     //       assert(Ra == Pa_base[j], "must be");
4661     //       MACC(Ra, Ra, t0, t1, t2);
4662     //     }
4663     //     iters = (2*len-i)/2;
4664     //     assert(iters == len-j, "must be");
4665     //     for (; iters--; j++) {
4666     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4667     //       MACC(Rm, Rn, t0, t1, t2);
4668     //       Rm = *++Pm;
4669     //       Rn = *--Pn;
4670     //     }
4671     //     Pm_base[i-len] = t0;
4672     //     t0 = t1; t1 = t2; t2 = 0;
4673     //   }
4674 
4675     //   while (t0)
4676     //     t0 = sub(Pm_base, Pn_base, t0, len);
4677     // }
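     //
     // Two helpers in the sketches above are assumed rather than shown.
     // MACC2(A, B, t0, t1, t2) accumulates 2*A*B, i.e. it behaves like two
     // MACC calls on the same operands.  sub() performs the final
     // conditional subtraction of the modulus; a minimal illustrative
     // version (an assumption for exposition, not the generator's
     // normalize() code) could be:
     //
     // static unsigned long sub(unsigned long Pm[], unsigned long Pn[],
     //                          unsigned long t0, int len) {
     //   unsigned long borrow = 0;
     //   for (int i = 0; i < len; i++) {
     //     unsigned __int128 d = (unsigned __int128)Pm[i] - Pn[i] - borrow;
     //     Pm[i] = (unsigned long)d;
     //     borrow = (unsigned long)(d >> 64) & 1;  // 1 if we borrowed
     //   }
     //   return t0 - borrow;  // fold the final borrow into the carry word
     // }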
4678   };
4679 
4680   // Initialization
4681   void generate_initial() {
4682     // Generate the initial stubs and initialize the entry points
4683 
4684     // These are entry points that exist on all platforms.  Note: this is
4685     // code that could be shared among different platforms; however, the
4686     // benefit seems to be smaller than the disadvantage of having a
4687     // much more complicated generator structure.  See also the comment in
4688     // stubRoutines.hpp.
4689 
4690     StubRoutines::_forward_exception_entry = generate_forward_exception();
4691 
4692     StubRoutines::_call_stub_entry =
4693       generate_call_stub(StubRoutines::_call_stub_return_address);
4694 
4695     // This entry is referenced by megamorphic call sites.
4696     StubRoutines::_catch_exception_entry = generate_catch_exception();
4697 
4698     // Build this early so it's available for the interpreter.
4699     StubRoutines::_throw_StackOverflowError_entry =
4700       generate_throw_exception("StackOverflowError throw_exception",
4701                                CAST_FROM_FN_PTR(address,
4702                                                 SharedRuntime::throw_StackOverflowError));
4703     StubRoutines::_throw_delayed_StackOverflowError_entry =
4704       generate_throw_exception("delayed StackOverflowError throw_exception",
4705                                CAST_FROM_FN_PTR(address,
4706                                                 SharedRuntime::throw_delayed_StackOverflowError));
4707     if (UseCRC32Intrinsics) {
4708       // Set the table address before generating the stubs that use it.
4709       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
4710       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
4711     }
4712   }
4713 
4714   void generate_all() {
4715     // support for verify_oop (must happen after universe_init)
4716     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
4717     StubRoutines::_throw_AbstractMethodError_entry =
4718       generate_throw_exception("AbstractMethodError throw_exception",
4719                                CAST_FROM_FN_PTR(address,
4720                                                 SharedRuntime::
4721                                                 throw_AbstractMethodError));
4722 
4723     StubRoutines::_throw_IncompatibleClassChangeError_entry =
4724       generate_throw_exception("IncompatibleClassChangeError throw_exception",
4725                                CAST_FROM_FN_PTR(address,
4726                                                 SharedRuntime::
4727                                                 throw_IncompatibleClassChangeError));
4728 
4729     StubRoutines::_throw_NullPointerException_at_call_entry =
4730       generate_throw_exception("NullPointerException at call throw_exception",
4731                                CAST_FROM_FN_PTR(address,
4732                                                 SharedRuntime::
4733                                                 throw_NullPointerException_at_call));
4734 
4735     // arraycopy stubs used by compilers
4736     generate_arraycopy_stubs();
4737 
4738     if (UseMultiplyToLenIntrinsic) {
4739       StubRoutines::_multiplyToLen = generate_multiplyToLen();
4740     }
4741 
4742     if (UseMontgomeryMultiplyIntrinsic) {
4743       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
4744       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
4745       StubRoutines::_montgomeryMultiply = g.generate_multiply();
4746     }
4747 
4748     if (UseMontgomerySquareIntrinsic) {
4749       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
4750       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
4751       // We use generate_multiply() rather than generate_square()
4752       // because it's faster for the sizes of modulus we care about.
4753       StubRoutines::_montgomerySquare = g.generate_multiply();
4754     }
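     // Because the generator above was constructed with squaring == true,
     // the emitted multiply code takes both multiplicands from the same
     // input array and so really does compute n*n; generate_square()
     // remains available in case its lower multiply count wins on some
     // hardware.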
4755 
4756 #ifndef BUILTIN_SIM
4757     // generate GHASH intrinsics code
4758     if (UseGHASHIntrinsics) {
4759       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
4760     }
4761 
4762     if (UseAESIntrinsics) {
4763       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
4764       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
4765       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
4766       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
4767     }
4768 
4769     if (UseSHA1Intrinsics) {
4770       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
4771       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
4772     }
4773     if (UseSHA256Intrinsics) {
4774       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
4775       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
4776     }
4777 
4778     if (UseCRC32CIntrinsics) {
4779       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
4780     }
4781 
4782     // generate Adler32 intrinsics code
4783     if (UseAdler32Intrinsics) {
4784       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
4785     }
4786 
4787     // Safefetch stubs.
4788     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
4789                                                        &StubRoutines::_safefetch32_fault_pc,
4790                                                        &StubRoutines::_safefetch32_continuation_pc);
4791     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
4792                                                        &StubRoutines::_safefetchN_fault_pc,
4793                                                        &StubRoutines::_safefetchN_continuation_pc);
4794 #endif
4795     StubRoutines::aarch64::set_completed();
4796   }
4797 
4798  public:
4799   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
4800     if (all) {
4801       generate_all();
4802     } else {
4803       generate_initial();
4804     }
4805   }
4806 }; // end class declaration
4807 
4808 void StubGenerator_generate(CodeBuffer* code, bool all) {
4809   StubGenerator g(code, all);
4810 }
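     // A note on use: StubGenerator_generate is the only entry point this
     // file exposes.  The VM's stub-routine initialization (see
     // stubRoutines.cpp) calls it twice during startup: first with
     // all == false for the initial stubs the interpreter needs, and later,
     // once the universe is initialized, with all == true for the remaining
     // stubs.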