1 /*
   2  * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "interpreter/interpreter.hpp"
  30 #include "nativeInst_aarch64.hpp"
  31 #include "oops/instanceOop.hpp"
  32 #include "oops/method.hpp"
  33 #include "oops/objArrayKlass.hpp"
  34 #include "oops/oop.inline.hpp"
  35 #include "prims/methodHandles.hpp"
  36 #include "runtime/frame.inline.hpp"
  37 #include "runtime/handles.inline.hpp"
  38 #include "runtime/sharedRuntime.hpp"
  39 #include "runtime/stubCodeGenerator.hpp"
  40 #include "runtime/stubRoutines.hpp"
  41 #include "runtime/thread.inline.hpp"
  42 #include "utilities/top.hpp"
  43 #ifdef COMPILER2
  44 #include "opto/runtime.hpp"
  45 #endif
  46 
  47 #ifdef BUILTIN_SIM
  48 #include "../../../../../../simulator/simulator.hpp"
  49 #endif
  50 
  51 // Declaration and definition of StubGenerator (no .hpp file).
  52 // For a more detailed description of the stub routine structure
  53 // see the comment in stubRoutines.hpp
  54 
  55 #undef __
  56 #define __ _masm->
  57 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
  58 
  59 #ifdef PRODUCT
  60 #define BLOCK_COMMENT(str) /* nothing */
  61 #else
  62 #define BLOCK_COMMENT(str) __ block_comment(str)
  63 #endif
  64 
  65 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  66 
  67 // Stub Code definitions
  68 
  69 class StubGenerator: public StubCodeGenerator {
  70  private:
  71 
  72 #ifdef PRODUCT
  73 #define inc_counter_np(counter) ((void)0)
  74 #else
  75   void inc_counter_np_(int& counter) {
  76     __ lea(rscratch2, ExternalAddress((address)&counter));
  77     __ ldrw(rscratch1, Address(rscratch2));
  78     __ addw(rscratch1, rscratch1, 1);
  79     __ strw(rscratch1, Address(rscratch2));
  80   }
  81 #define inc_counter_np(counter) \
  82   BLOCK_COMMENT("inc_counter " #counter); \
  83   inc_counter_np_(counter);
  84 #endif
  85 
  86   // Call stubs are used to call Java from C
  87   //
  88   // Arguments:
  89   //    c_rarg0:   call wrapper address                   address
  90   //    c_rarg1:   result                                 address
  91   //    c_rarg2:   result type                            BasicType
  92   //    c_rarg3:   method                                 Method*
  93   //    c_rarg4:   (interpreter) entry point              address
  94   //    c_rarg5:   parameters                             intptr_t*
  95   //    c_rarg6:   parameter size (in words)              int
  96   //    c_rarg7:   thread                                 Thread*
  97   //
  98   // There is no return from the stub itself as any Java result
  99   // is written to result
 100   //
 101   // we save r30 (lr) as the return PC at the base of the frame and
 102   // link r29 (fp) below it as the frame pointer installing sp (r31)
 103   // into fp.
 104   //
 105   // we save r0-r7, which accounts for all the c arguments.
 106   //
 107   // TODO: strictly do we need to save them all? they are treated as
 108   // volatile by C so could we omit saving the ones we are going to
 109   // place in global registers (thread? method?) or those we only use
 110   // during setup of the Java call?
 111   //
 112   // we don't need to save r8 which C uses as an indirect result location
 113   // return register.
 114   //
 115   // we don't need to save r9-r15 which both C and Java treat as
 116   // volatile
 117   //
 118   // we don't need to save r16-18 because Java does not use them
 119   //
 120   // we save r19-r28 which Java uses as scratch registers and C
 121   // expects to be callee-save
 122   //
 123   // we save the bottom 64 bits of each value stored in v8-v15; it is
 124   // the responsibility of the caller to preserve larger values.
 125   //
 126   // so the stub frame looks like this when we enter Java code
 127   //
 128   //     [ return_from_Java     ] <--- sp
 129   //     [ argument word n      ]
 130   //      ...
 131   // -27 [ argument word 1      ]
 132   // -26 [ saved v15            ] <--- sp_after_call
 133   // -25 [ saved v14            ]
 134   // -24 [ saved v13            ]
 135   // -23 [ saved v12            ]
 136   // -22 [ saved v11            ]
 137   // -21 [ saved v10            ]
 138   // -20 [ saved v9             ]
 139   // -19 [ saved v8             ]
 140   // -18 [ saved r28            ]
 141   // -17 [ saved r27            ]
 142   // -16 [ saved r26            ]
 143   // -15 [ saved r25            ]
 144   // -14 [ saved r24            ]
 145   // -13 [ saved r23            ]
 146   // -12 [ saved r22            ]
 147   // -11 [ saved r21            ]
 148   // -10 [ saved r20            ]
 149   //  -9 [ saved r19            ]
 150   //  -8 [ call wrapper    (r0) ]
 151   //  -7 [ result          (r1) ]
 152   //  -6 [ result type     (r2) ]
 153   //  -5 [ method          (r3) ]
 154   //  -4 [ entry point     (r4) ]
 155   //  -3 [ parameters      (r5) ]
 156   //  -2 [ parameter size  (r6) ]
 157   //  -1 [ thread (r7)          ]
 158   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 159   //   1 [ saved lr       (r30) ]
 160 
 161   // Call stub stack layout word offsets from fp
 162   enum call_stub_layout {
 163     sp_after_call_off = -26,
 164 
 165     d15_off            = -26,
 166     d14_off            = -25,
 167     d13_off            = -24,
 168     d12_off            = -23,
 169     d11_off            = -22,
 170     d10_off            = -21,
 171     d9_off             = -20,
 172     d8_off             = -19,
 173 
 174     r28_off            = -18,
 175     r27_off            = -17,
 176     r26_off            = -16,
 177     r25_off            = -15,
 178     r24_off            = -14,
 179     r23_off            = -13,
 180     r22_off            = -12,
 181     r21_off            = -11,
 182     r20_off            = -10,
 183     r19_off            =  -9,
 184     call_wrapper_off   =  -8,
 185     result_off         =  -7,
 186     result_type_off    =  -6,
 187     method_off         =  -5,
 188     entry_point_off    =  -4,
 189     parameters_off     =  -3,
 190     parameter_size_off =  -2,
 191     thread_off         =  -1,
 192     fp_f               =   0,
 193     retaddr_off        =   1,
 194   };
 195 
 196   address generate_call_stub(address& return_address) {
 197     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 198            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 199            "adjust this code");
 200 
 201     StubCodeMark mark(this, "StubRoutines", "call_stub");
 202     address start = __ pc();
 203 
 204     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 205 
 206     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 207     const Address result        (rfp, result_off         * wordSize);
 208     const Address result_type   (rfp, result_type_off    * wordSize);
 209     const Address method        (rfp, method_off         * wordSize);
 210     const Address entry_point   (rfp, entry_point_off    * wordSize);
 211     const Address parameters    (rfp, parameters_off     * wordSize);
 212     const Address parameter_size(rfp, parameter_size_off * wordSize);
 213 
 214     const Address thread        (rfp, thread_off         * wordSize);
 215 
 216     const Address d15_save      (rfp, d15_off * wordSize);
 217     const Address d14_save      (rfp, d14_off * wordSize);
 218     const Address d13_save      (rfp, d13_off * wordSize);
 219     const Address d12_save      (rfp, d12_off * wordSize);
 220     const Address d11_save      (rfp, d11_off * wordSize);
 221     const Address d10_save      (rfp, d10_off * wordSize);
 222     const Address d9_save       (rfp, d9_off * wordSize);
 223     const Address d8_save       (rfp, d8_off * wordSize);
 224 
 225     const Address r28_save      (rfp, r28_off * wordSize);
 226     const Address r27_save      (rfp, r27_off * wordSize);
 227     const Address r26_save      (rfp, r26_off * wordSize);
 228     const Address r25_save      (rfp, r25_off * wordSize);
 229     const Address r24_save      (rfp, r24_off * wordSize);
 230     const Address r23_save      (rfp, r23_off * wordSize);
 231     const Address r22_save      (rfp, r22_off * wordSize);
 232     const Address r21_save      (rfp, r21_off * wordSize);
 233     const Address r20_save      (rfp, r20_off * wordSize);
 234     const Address r19_save      (rfp, r19_off * wordSize);
 235 
 236     // stub code
 237 
 238     // we need a C prolog to bootstrap the x86 caller into the sim
 239     __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);
 240 
 241     address aarch64_entry = __ pc();
 242 
 243 #ifdef BUILTIN_SIM
 244     // Save sender's SP for stack traces.
 245     __ mov(rscratch1, sp);
 246     __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
 247 #endif
 248     // set up frame and move sp to end of save area
 249     __ enter();
 250     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 251 
 252     // save register parameters and Java scratch/global registers
 253     // n.b. we save thread even though it gets installed in
 254     // rthread because we want to sanity check rthread later
 255     __ str(c_rarg7,  thread);
 256     __ strw(c_rarg6, parameter_size);
 257     __ str(c_rarg5,  parameters);
 258     __ str(c_rarg4,  entry_point);
 259     __ str(c_rarg3,  method);
 260     __ str(c_rarg2,  result_type);
 261     __ str(c_rarg1,  result);
 262     __ str(c_rarg0,  call_wrapper);
 263     __ str(r19,      r19_save);
 264     __ str(r20,      r20_save);
 265     __ str(r21,      r21_save);
 266     __ str(r22,      r22_save);
 267     __ str(r23,      r23_save);
 268     __ str(r24,      r24_save);
 269     __ str(r25,      r25_save);
 270     __ str(r26,      r26_save);
 271     __ str(r27,      r27_save);
 272     __ str(r28,      r28_save);
 273 
 274     __ strd(v8,      d8_save);
 275     __ strd(v9,      d9_save);
 276     __ strd(v10,     d10_save);
 277     __ strd(v11,     d11_save);
 278     __ strd(v12,     d12_save);
 279     __ strd(v13,     d13_save);
 280     __ strd(v14,     d14_save);
 281     __ strd(v15,     d15_save);
 282 
 283     // install Java thread in global register now we have saved
 284     // whatever value it held
 285     __ mov(rthread, c_rarg7);
 286     // And method
 287     __ mov(rmethod, c_rarg3);
 288 
 289     // set up the heapbase register
 290     __ reinit_heapbase();
 291 
 292 #ifdef ASSERT
 293     // make sure we have no pending exceptions
 294     {
 295       Label L;
 296       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 297       __ cmp(rscratch1, (unsigned)NULL_WORD);
 298       __ br(Assembler::EQ, L);
 299       __ stop("StubRoutines::call_stub: entered with pending exception");
 300       __ BIND(L);
 301     }
 302 #endif
 303     // pass parameters if any
 304     __ mov(esp, sp);
 305     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 306     __ andr(sp, rscratch1, -2 * wordSize);
 307 
 308     BLOCK_COMMENT("pass parameters if any");
 309     Label parameters_done;
 310     // parameter count is still in c_rarg6
 311     // and parameter pointer identifying param 1 is in c_rarg5
 312     __ cbzw(c_rarg6, parameters_done);
 313 
 314     address loop = __ pc();
 315     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 316     __ subsw(c_rarg6, c_rarg6, 1);
 317     __ push(rscratch1);
 318     __ br(Assembler::GT, loop);
 319 
 320     __ BIND(parameters_done);
 321 
 322     // call Java entry -- passing methdoOop, and current sp
 323     //      rmethod: Method*
 324     //      r13: sender sp
 325     BLOCK_COMMENT("call Java function");
 326     __ mov(r13, sp);
 327     __ blr(c_rarg4);
 328 
 329     // tell the simulator we have returned to the stub
 330 
 331     // we do this here because the notify will already have been done
 332     // if we get to the next instruction via an exception
 333     //
 334     // n.b. adding this instruction here affects the calculation of
 335     // whether or not a routine returns to the call stub (used when
 336     // doing stack walks) since the normal test is to check the return
 337     // pc against the address saved below. so we may need to allow for
 338     // this extra instruction in the check.
 339 
 340     if (NotifySimulator) {
 341       __ notify(Assembler::method_reentry);
 342     }
 343     // save current address for use by exception handling code
 344 
 345     return_address = __ pc();
 346 
 347     // store result depending on type (everything that is not
 348     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 349     // n.b. this assumes Java returns an integral result in r0
 350     // and a floating result in j_farg0
 351     __ ldr(j_rarg2, result);
 352     Label is_long, is_float, is_double, exit;
 353     __ ldr(j_rarg1, result_type);
 354     __ cmp(j_rarg1, T_OBJECT);
 355     __ br(Assembler::EQ, is_long);
 356     __ cmp(j_rarg1, T_LONG);
 357     __ br(Assembler::EQ, is_long);
 358     __ cmp(j_rarg1, T_FLOAT);
 359     __ br(Assembler::EQ, is_float);
 360     __ cmp(j_rarg1, T_DOUBLE);
 361     __ br(Assembler::EQ, is_double);
 362 
 363     // handle T_INT case
 364     __ strw(r0, Address(j_rarg2));
 365 
 366     __ BIND(exit);
 367 
 368     // pop parameters
 369     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 370 
 371 #ifdef ASSERT
 372     // verify that threads correspond
 373     {
 374       Label L, S;
 375       __ ldr(rscratch1, thread);
 376       __ cmp(rthread, rscratch1);
 377       __ br(Assembler::NE, S);
 378       __ get_thread(rscratch1);
 379       __ cmp(rthread, rscratch1);
 380       __ br(Assembler::EQ, L);
 381       __ BIND(S);
 382       __ stop("StubRoutines::call_stub: threads must correspond");
 383       __ BIND(L);
 384     }
 385 #endif
 386 
 387     // restore callee-save registers
 388     __ ldrd(v15,      d15_save);
 389     __ ldrd(v14,      d14_save);
 390     __ ldrd(v13,      d13_save);
 391     __ ldrd(v12,      d12_save);
 392     __ ldrd(v11,      d11_save);
 393     __ ldrd(v10,      d10_save);
 394     __ ldrd(v9,       d9_save);
 395     __ ldrd(v8,       d8_save);
 396 
 397     __ ldr(r28,      r28_save);
 398     __ ldr(r27,      r27_save);
 399     __ ldr(r26,      r26_save);
 400     __ ldr(r25,      r25_save);
 401     __ ldr(r24,      r24_save);
 402     __ ldr(r23,      r23_save);
 403     __ ldr(r22,      r22_save);
 404     __ ldr(r21,      r21_save);
 405     __ ldr(r20,      r20_save);
 406     __ ldr(r19,      r19_save);
 407     __ ldr(c_rarg0,  call_wrapper);
 408     __ ldr(c_rarg1,  result);
 409     __ ldrw(c_rarg2, result_type);
 410     __ ldr(c_rarg3,  method);
 411     __ ldr(c_rarg4,  entry_point);
 412     __ ldr(c_rarg5,  parameters);
 413     __ ldr(c_rarg6,  parameter_size);
 414     __ ldr(c_rarg7,  thread);
 415 
 416 #ifndef PRODUCT
 417     // tell the simulator we are about to end Java execution
 418     if (NotifySimulator) {
 419       __ notify(Assembler::method_exit);
 420     }
 421 #endif
 422     // leave frame and return to caller
 423     __ leave();
 424     __ ret(lr);
 425 
 426     // handle return types different from T_INT
 427 
 428     __ BIND(is_long);
 429     __ str(r0, Address(j_rarg2, 0));
 430     __ br(Assembler::AL, exit);
 431 
 432     __ BIND(is_float);
 433     __ strs(j_farg0, Address(j_rarg2, 0));
 434     __ br(Assembler::AL, exit);
 435 
 436     __ BIND(is_double);
 437     __ strd(j_farg0, Address(j_rarg2, 0));
 438     __ br(Assembler::AL, exit);
 439 
 440     return start;
 441   }
 442 
 443   // Return point for a Java call if there's an exception thrown in
 444   // Java code.  The exception is caught and transformed into a
 445   // pending exception stored in JavaThread that can be tested from
 446   // within the VM.
 447   //
 448   // Note: Usually the parameters are removed by the callee. In case
 449   // of an exception crossing an activation frame boundary, that is
 450   // not the case if the callee is compiled code => need to setup the
 451   // rsp.
 452   //
 453   // r0: exception oop
 454 
 455   // NOTE: this is used as a target from the signal handler so it
 456   // needs an x86 prolog which returns into the current simulator
 457   // executing the generated catch_exception code. so the prolog
 458   // needs to install rax in a sim register and adjust the sim's
 459   // restart pc to enter the generated code at the start position
 460   // then return from native to simulated execution.
 461 
 462   address generate_catch_exception() {
 463     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 464     address start = __ pc();
 465 
 466     // same as in generate_call_stub():
 467     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 468     const Address thread        (rfp, thread_off         * wordSize);
 469 
 470 #ifdef ASSERT
 471     // verify that threads correspond
 472     {
 473       Label L, S;
 474       __ ldr(rscratch1, thread);
 475       __ cmp(rthread, rscratch1);
 476       __ br(Assembler::NE, S);
 477       __ get_thread(rscratch1);
 478       __ cmp(rthread, rscratch1);
 479       __ br(Assembler::EQ, L);
 480       __ bind(S);
 481       __ stop("StubRoutines::catch_exception: threads must correspond");
 482       __ bind(L);
 483     }
 484 #endif
 485 
 486     // set pending exception
 487     __ verify_oop(r0);
 488 
 489     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 490     __ mov(rscratch1, (address)__FILE__);
 491     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 492     __ movw(rscratch1, (int)__LINE__);
 493     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 494 
 495     // complete return to VM
 496     assert(StubRoutines::_call_stub_return_address != NULL,
 497            "_call_stub_return_address must have been generated before");
 498     __ b(StubRoutines::_call_stub_return_address);
 499 
 500     return start;
 501   }
 502 
 503   // Continuation point for runtime calls returning with a pending
 504   // exception.  The pending exception check happened in the runtime
 505   // or native call stub.  The pending exception in Thread is
 506   // converted into a Java-level exception.
 507   //
 508   // Contract with Java-level exception handlers:
 509   // r0: exception
 510   // r3: throwing pc
 511   //
 512   // NOTE: At entry of this stub, exception-pc must be in LR !!
 513 
 514   // NOTE: this is always used as a jump target within generated code
 515   // so it just needs to be generated code wiht no x86 prolog
 516 
 517   address generate_forward_exception() {
 518     StubCodeMark mark(this, "StubRoutines", "forward exception");
 519     address start = __ pc();
 520 
 521     // Upon entry, LR points to the return address returning into
 522     // Java (interpreted or compiled) code; i.e., the return address
 523     // becomes the throwing pc.
 524     //
 525     // Arguments pushed before the runtime call are still on the stack
 526     // but the exception handler will reset the stack pointer ->
 527     // ignore them.  A potential result in registers can be ignored as
 528     // well.
 529 
 530 #ifdef ASSERT
 531     // make sure this code is only executed if there is a pending exception
 532     {
 533       Label L;
 534       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 535       __ cbnz(rscratch1, L);
 536       __ stop("StubRoutines::forward exception: no pending exception (1)");
 537       __ bind(L);
 538     }
 539 #endif
 540 
 541     // compute exception handler into r19
 542 
 543     // call the VM to find the handler address associated with the
 544     // caller address. pass thread in r0 and caller pc (ret address)
 545     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 546     // the stack.
 547     __ mov(c_rarg1, lr);
 548     // lr will be trashed by the VM call so we move it to R19
 549     // (callee-saved) because we also need to pass it to the handler
 550     // returned by this call.
 551     __ mov(r19, lr);
 552     BLOCK_COMMENT("call exception_handler_for_return_address");
 553     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 554                          SharedRuntime::exception_handler_for_return_address),
 555                     rthread, c_rarg1);
 556     // we should not really care that lr is no longer the callee
 557     // address. we saved the value the handler needs in r19 so we can
 558     // just copy it to r3. however, the C2 handler will push its own
 559     // frame and then calls into the VM and the VM code asserts that
 560     // the PC for the frame above the handler belongs to a compiled
 561     // Java method. So, we restore lr here to satisfy that assert.
 562     __ mov(lr, r19);
 563     // setup r0 & r3 & clear pending exception
 564     __ mov(r3, r19);
 565     __ mov(r19, r0);
 566     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 567     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 568 
 569 #ifdef ASSERT
 570     // make sure exception is set
 571     {
 572       Label L;
 573       __ cbnz(r0, L);
 574       __ stop("StubRoutines::forward exception: no pending exception (2)");
 575       __ bind(L);
 576     }
 577 #endif
 578 
 579     // continue at exception handler
 580     // r0: exception
 581     // r3: throwing pc
 582     // r19: exception handler
 583     __ verify_oop(r0);
 584     __ br(r19);
 585 
 586     return start;
 587   }
 588 
 589   // Non-destructive plausibility checks for oops
 590   //
 591   // Arguments:
 592   //    r0: oop to verify
 593   //    rscratch1: error message
 594   //
 595   // Stack after saving c_rarg3:
 596   //    [tos + 0]: saved c_rarg3
 597   //    [tos + 1]: saved c_rarg2
 598   //    [tos + 2]: saved lr
 599   //    [tos + 3]: saved rscratch2
 600   //    [tos + 4]: saved r0
 601   //    [tos + 5]: saved rscratch1
 602   address generate_verify_oop() {
 603 
 604     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 605     address start = __ pc();
 606 
 607     Label exit, error;
 608 
 609     // save c_rarg2 and c_rarg3
 610     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 611 
 612     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 613     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 614     __ ldr(c_rarg3, Address(c_rarg2));
 615     __ add(c_rarg3, c_rarg3, 1);
 616     __ str(c_rarg3, Address(c_rarg2));
 617 
 618     // object is in r0
 619     // make sure object is 'reasonable'
 620     __ cbz(r0, exit); // if obj is NULL it is OK
 621 
 622     // Check if the oop is in the right area of memory
 623     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 624     __ andr(c_rarg2, r0, c_rarg3);
 625     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 626 
 627     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 628     // instruction here because the flags register is live.
 629     __ eor(c_rarg2, c_rarg2, c_rarg3);
 630     __ cbnz(c_rarg2, error);
 631 
 632     // make sure klass is 'reasonable', which is not zero.
 633     __ load_klass(r0, r0);  // get klass
 634     __ cbz(r0, error);      // if klass is NULL it is broken
 635 
 636     // return if everything seems ok
 637     __ bind(exit);
 638 
 639     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 640     __ ret(lr);
 641 
 642     // handle errors
 643     __ bind(error);
 644     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 645 
 646     __ push(RegSet::range(r0, r29), sp);
 647     // debug(char* msg, int64_t pc, int64_t regs[])
 648     __ mov(c_rarg0, rscratch1);      // pass address of error message
 649     __ mov(c_rarg1, lr);             // pass return address
 650     __ mov(c_rarg2, sp);             // pass address of regs on stack
 651 #ifndef PRODUCT
 652     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 653 #endif
 654     BLOCK_COMMENT("call MacroAssembler::debug");
 655     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 656     __ blrt(rscratch1, 3, 0, 1);
 657 
 658     return start;
 659   }
 660 
 661   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 662 
 663   // Generate code for an array write pre barrier
 664   //
 665   //     addr    -  starting address
 666   //     count   -  element count
 667   //     tmp     - scratch register
 668   //
 669   //     Destroy no registers!
 670   //
 671   void  gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
 672     BarrierSet* bs = Universe::heap()->barrier_set();
 673     switch (bs->kind()) {
 674     case BarrierSet::G1SATBCTLogging:
 675       // With G1, don't generate the call if we statically know that the target in uninitialized
 676       if (!dest_uninitialized) {
 677         __ push(RegSet::range(r0, r29), sp);         // integer registers except lr & sp
 678         if (count == c_rarg0) {
 679           if (addr == c_rarg1) {
 680             // exactly backwards!!
 681             __ stp(c_rarg0, c_rarg1, __ pre(sp, -2 * wordSize));
 682             __ ldp(c_rarg1, c_rarg0, __ post(sp, -2 * wordSize));
 683           } else {
 684             __ mov(c_rarg1, count);
 685             __ mov(c_rarg0, addr);
 686           }
 687         } else {
 688           __ mov(c_rarg0, addr);
 689           __ mov(c_rarg1, count);
 690         }
 691         __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2);
 692         __ pop(RegSet::range(r0, r29), sp);         // integer registers except lr & sp        }
 693         break;
 694       case BarrierSet::CardTableForRS:
 695       case BarrierSet::CardTableExtension:
 696       case BarrierSet::ModRef:
 697         break;
 698       default:
 699         ShouldNotReachHere();
 700 
 701       }
 702     }
 703   }
 704 
 705   //
 706   // Generate code for an array write post barrier
 707   //
 708   //  Input:
 709   //     start    - register containing starting address of destination array
 710   //     end      - register containing ending address of destination array
 711   //     scratch  - scratch register
 712   //
 713   //  The input registers are overwritten.
 714   //  The ending address is inclusive.
 715   void gen_write_ref_array_post_barrier(Register start, Register end, Register scratch) {
 716     assert_different_registers(start, end, scratch);
 717     BarrierSet* bs = Universe::heap()->barrier_set();
 718     switch (bs->kind()) {
 719       case BarrierSet::G1SATBCTLogging:
 720 
 721         {
 722           __ push(RegSet::range(r0, r29), sp);         // integer registers except lr & sp
 723           // must compute element count unless barrier set interface is changed (other platforms supply count)
 724           assert_different_registers(start, end, scratch);
 725           __ lea(scratch, Address(end, BytesPerHeapOop));
 726           __ sub(scratch, scratch, start);               // subtract start to get #bytes
 727           __ lsr(scratch, scratch, LogBytesPerHeapOop);  // convert to element count
 728           __ mov(c_rarg0, start);
 729           __ mov(c_rarg1, scratch);
 730           __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
 731           __ pop(RegSet::range(r0, r29), sp);         // integer registers except lr & sp        }
 732         }
 733         break;
 734       case BarrierSet::CardTableForRS:
 735       case BarrierSet::CardTableExtension:
 736         {
 737           CardTableModRefBS* ct = (CardTableModRefBS*)bs;
 738           assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
 739 
 740           Label L_loop;
 741 
 742            __ lsr(start, start, CardTableModRefBS::card_shift);
 743            __ lsr(end, end, CardTableModRefBS::card_shift);
 744            __ sub(end, end, start); // number of bytes to copy
 745 
 746           const Register count = end; // 'end' register contains bytes count now
 747           __ load_byte_map_base(scratch);
 748           __ add(start, start, scratch);
 749           if (UseConcMarkSweepGC) {
 750             __ membar(__ StoreStore);
 751           }
 752           __ BIND(L_loop);
 753           __ strb(zr, Address(start, count));
 754           __ subs(count, count, 1);
 755           __ br(Assembler::HS, L_loop);
 756         }
 757         break;
 758       default:
 759         ShouldNotReachHere();
 760 
 761     }
 762   }
 763 
 764   typedef enum {
 765     copy_forwards = 1,
 766     copy_backwards = -1
 767   } copy_direction;
 768 
 769   // Bulk copy of blocks of 8 words.
 770   //
 771   // count is a count of words.
 772   //
 773   // Precondition: count >= 2
 774   //
 775   // Postconditions:
 776   //
 777   // The least significant bit of count contains the remaining count
 778   // of words to copy.  The rest of count is trash.
 779   //
 780   // s and d are adjusted to point to the remaining words to copy
 781   //
 782   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 783                            copy_direction direction) {
 784     int unit = wordSize * direction;
 785 
 786     int offset;
 787     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 788       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 789     const Register stride = r13;
 790 
 791     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 792     assert_different_registers(s, d, count, rscratch1);
 793 
 794     Label again, large, small;
 795     const char *stub_name;
 796     if (direction == copy_forwards)
 797       stub_name = "foward_copy_longs";
 798     else
 799       stub_name = "backward_copy_longs";
 800     StubCodeMark mark(this, "StubRoutines", stub_name);
 801     __ align(CodeEntryAlignment);
 802     __ bind(start);
 803     __ cmp(count, 8);
 804     __ br(Assembler::LO, small);
 805     if (direction == copy_forwards) {
 806       __ sub(s, s, 2 * wordSize);
 807       __ sub(d, d, 2 * wordSize);
 808     }
 809     __ subs(count, count, 16);
 810     __ br(Assembler::GE, large);
 811 
 812     // 8 <= count < 16 words.  Copy 8.
 813     __ ldp(t0, t1, Address(s, 2 * unit));
 814     __ ldp(t2, t3, Address(s, 4 * unit));
 815     __ ldp(t4, t5, Address(s, 6 * unit));
 816     __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 817 
 818     __ stp(t0, t1, Address(d, 2 * unit));
 819     __ stp(t2, t3, Address(d, 4 * unit));
 820     __ stp(t4, t5, Address(d, 6 * unit));
 821     __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 822 
 823     if (direction == copy_forwards) {
 824       __ add(s, s, 2 * wordSize);
 825       __ add(d, d, 2 * wordSize);
 826     }
 827 
 828     {
 829       Label L1, L2;
 830       __ bind(small);
 831       __ tbz(count, exact_log2(4), L1);
 832       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 833       __ ldp(t2, t3, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 834       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 835       __ stp(t2, t3, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 836       __ bind(L1);
 837 
 838       __ tbz(count, 1, L2);
 839       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 840       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 841       __ bind(L2);
 842     }
 843 
 844     __ ret(lr);
 845 
 846     __ align(CodeEntryAlignment);
 847     __ bind(large);
 848 
 849     // Fill 8 registers
 850     __ ldp(t0, t1, Address(s, 2 * unit));
 851     __ ldp(t2, t3, Address(s, 4 * unit));
 852     __ ldp(t4, t5, Address(s, 6 * unit));
 853     __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 854 
 855     int prefetch = PrefetchCopyIntervalInBytes;
 856     bool use_stride = false;
 857     if (direction == copy_backwards) {
 858        use_stride = prefetch > 256;
 859        prefetch = -prefetch;
 860        if (use_stride) __ mov(stride, prefetch);
 861     }
 862 
 863     __ bind(again);
 864 
 865     if (PrefetchCopyIntervalInBytes > 0)
 866       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 867 
 868     __ stp(t0, t1, Address(d, 2 * unit));
 869     __ ldp(t0, t1, Address(s, 2 * unit));
 870     __ stp(t2, t3, Address(d, 4 * unit));
 871     __ ldp(t2, t3, Address(s, 4 * unit));
 872     __ stp(t4, t5, Address(d, 6 * unit));
 873     __ ldp(t4, t5, Address(s, 6 * unit));
 874     __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 875     __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 876 
 877     __ subs(count, count, 8);
 878     __ br(Assembler::HS, again);
 879 
 880     // Drain
 881     __ stp(t0, t1, Address(d, 2 * unit));
 882     __ stp(t2, t3, Address(d, 4 * unit));
 883     __ stp(t4, t5, Address(d, 6 * unit));
 884     __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 885 
 886     if (direction == copy_forwards) {
 887       __ add(s, s, 2 * wordSize);
 888       __ add(d, d, 2 * wordSize);
 889     }
 890 
 891     {
 892       Label L1, L2;
 893       __ tbz(count, exact_log2(4), L1);
 894       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 895       __ ldp(t2, t3, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 896       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 897       __ stp(t2, t3, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 898       __ bind(L1);
 899 
 900       __ tbz(count, 1, L2);
 901       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 902       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 903       __ bind(L2);
 904     }
 905 
 906     __ ret(lr);
 907   }
 908 
 909   // Small copy: less than 16 bytes.
 910   //
 911   // NB: Ignores all of the bits of count which represent more than 15
 912   // bytes, so a caller doesn't have to mask them.
 913 
 914   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
 915     bool is_backwards = step < 0;
 916     size_t granularity = uabs(step);
 917     int direction = is_backwards ? -1 : 1;
 918     int unit = wordSize * direction;
 919 
 920     Label Lpair, Lword, Lint, Lshort, Lbyte;
 921 
 922     assert(granularity
 923            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
 924 
 925     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
 926 
 927     // ??? I don't know if this bit-test-and-branch is the right thing
 928     // to do.  It does a lot of jumping, resulting in several
 929     // mispredicted branches.  It might make more sense to do this
 930     // with something like Duff's device with a single computed branch.
 931 
 932     __ tbz(count, 3 - exact_log2(granularity), Lword);
 933     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
 934     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
 935     __ bind(Lword);
 936 
 937     if (granularity <= sizeof (jint)) {
 938       __ tbz(count, 2 - exact_log2(granularity), Lint);
 939       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
 940       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
 941       __ bind(Lint);
 942     }
 943 
 944     if (granularity <= sizeof (jshort)) {
 945       __ tbz(count, 1 - exact_log2(granularity), Lshort);
 946       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
 947       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
 948       __ bind(Lshort);
 949     }
 950 
 951     if (granularity <= sizeof (jbyte)) {
 952       __ tbz(count, 0, Lbyte);
 953       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
 954       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
 955       __ bind(Lbyte);
 956     }
 957   }
 958 
 959   Label copy_f, copy_b;
 960 
 961   // All-singing all-dancing memory copy.
 962   //
 963   // Copy count units of memory from s to d.  The size of a unit is
 964   // step, which can be positive or negative depending on the direction
 965   // of copy.  If is_aligned is false, we align the source address.
 966   //
 967 
 968   void copy_memory(bool is_aligned, Register s, Register d,
 969                    Register count, Register tmp, int step) {
 970     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
 971     bool is_backwards = step < 0;
 972     int granularity = uabs(step);
 973     const Register t0 = r3, t1 = r4;
 974 
 975     if (is_backwards) {
 976       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
 977       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
 978     }
 979 
 980     Label tail;
 981 
 982     __ cmp(count, 16/granularity);
 983     __ br(Assembler::LO, tail);
 984 
 985     // Now we've got the small case out of the way we can align the
 986     // source address on a 2-word boundary.
 987 
 988     Label aligned;
 989 
 990     if (is_aligned) {
 991       // We may have to adjust by 1 word to get s 2-word-aligned.
 992       __ tbz(s, exact_log2(wordSize), aligned);
 993       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
 994       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
 995       __ sub(count, count, wordSize/granularity);
 996     } else {
 997       if (is_backwards) {
 998         __ andr(rscratch2, s, 2 * wordSize - 1);
 999       } else {
1000         __ neg(rscratch2, s);
1001         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1002       }
1003       // rscratch2 is the byte adjustment needed to align s.
1004       __ cbz(rscratch2, aligned);
1005       int shift = exact_log2(granularity);
1006       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1007       __ sub(count, count, rscratch2);
1008 
1009 #if 0
1010       // ?? This code is only correct for a disjoint copy.  It may or
1011       // may not make sense to use it in that case.
1012 
1013       // Copy the first pair; s and d may not be aligned.
1014       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1015       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1016 
1017       // Align s and d, adjust count
1018       if (is_backwards) {
1019         __ sub(s, s, rscratch2);
1020         __ sub(d, d, rscratch2);
1021       } else {
1022         __ add(s, s, rscratch2);
1023         __ add(d, d, rscratch2);
1024       }
1025 #else
1026       copy_memory_small(s, d, rscratch2, rscratch1, step);
1027 #endif
1028     }
1029 
1030     __ cmp(count, 16/granularity);
1031     __ br(Assembler::LT, tail);
1032     __ bind(aligned);
1033 
1034     // s is now 2-word-aligned.
1035 
1036     // We have a count of units and some trailing bytes.  Adjust the
1037     // count and do a bulk copy of words.
1038     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1039     if (direction == copy_forwards)
1040       __ bl(copy_f);
1041     else
1042       __ bl(copy_b);
1043 
1044     // And the tail.
1045 
1046     __ bind(tail);
1047     copy_memory_small(s, d, count, tmp, step);
1048   }
1049 
1050 
1051   void clobber_registers() {
1052 #ifdef ASSERT
1053     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1054     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1055     for (Register r = r3; r <= r18; r++)
1056       if (r != rscratch1) __ mov(r, rscratch1);
1057 #endif
1058   }
1059 
1060   // Scan over array at a for count oops, verifying each one.
1061   // Preserves a and count, clobbers rscratch1 and rscratch2.
1062   void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1063     Label loop, end;
1064     __ mov(rscratch1, a);
1065     __ mov(rscratch2, zr);
1066     __ bind(loop);
1067     __ cmp(rscratch2, count);
1068     __ br(Assembler::HS, end);
1069     if (size == (size_t)wordSize) {
1070       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1071       __ verify_oop(temp);
1072     } else {
1073       __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1074       __ decode_heap_oop(temp); // calls verify_oop
1075     }
1076     __ add(rscratch2, rscratch2, size);
1077     __ b(loop);
1078     __ bind(end);
1079   }
1080 
1081   // Arguments:
1082   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1083   //             ignored
1084   //   is_oop  - true => oop array, so generate store check code
1085   //   name    - stub name string
1086   //
1087   // Inputs:
1088   //   c_rarg0   - source array address
1089   //   c_rarg1   - destination array address
1090   //   c_rarg2   - element count, treated as ssize_t, can be zero
1091   //
1092   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1093   // the hardware handle it.  The two dwords within qwords that span
1094   // cache line boundaries will still be loaded and stored atomicly.
1095   //
1096   // Side Effects:
1097   //   disjoint_int_copy_entry is set to the no-overlap entry point
1098   //   used by generate_conjoint_int_oop_copy().
1099   //
1100   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
1101                                   const char *name, bool dest_uninitialized = false) {
1102     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1103     __ align(CodeEntryAlignment);
1104     StubCodeMark mark(this, "StubRoutines", name);
1105     address start = __ pc();
1106     __ enter();
1107 
1108     if (entry != NULL) {
1109       *entry = __ pc();
1110       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1111       BLOCK_COMMENT("Entry:");
1112     }
1113 
1114     if (is_oop) {
1115       __ push(RegSet::of(d, count), sp);
1116       // no registers are destroyed by this call
1117       gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
1118     }
1119     copy_memory(aligned, s, d, count, rscratch1, size);
1120     if (is_oop) {
1121       __ pop(RegSet::of(d, count), sp);
1122       if (VerifyOops)
1123         verify_oop_array(size, d, count, r16);
1124       __ sub(count, count, 1); // make an inclusive end pointer
1125       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1126       gen_write_ref_array_post_barrier(d, count, rscratch1);
1127     }
1128     __ leave();
1129     __ mov(r0, zr); // return 0
1130     __ ret(lr);
1131 #ifdef BUILTIN_SIM
1132     {
1133       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1134       sim->notifyCompile(const_cast<char*>(name), start);
1135     }
1136 #endif
1137     return start;
1138   }
1139 
1140   // Arguments:
1141   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1142   //             ignored
1143   //   is_oop  - true => oop array, so generate store check code
1144   //   name    - stub name string
1145   //
1146   // Inputs:
1147   //   c_rarg0   - source array address
1148   //   c_rarg1   - destination array address
1149   //   c_rarg2   - element count, treated as ssize_t, can be zero
1150   //
1151   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1152   // the hardware handle it.  The two dwords within qwords that span
1153   // cache line boundaries will still be loaded and stored atomicly.
1154   //
1155   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1156                                  address *entry, const char *name,
1157                                  bool dest_uninitialized = false) {
1158     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1159 
1160     StubCodeMark mark(this, "StubRoutines", name);
1161     address start = __ pc();
1162     __ enter();
1163 
1164     if (entry != NULL) {
1165       *entry = __ pc();
1166       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1167       BLOCK_COMMENT("Entry:");
1168     }
1169 
1170     // use fwd copy when (d-s) above_equal (count*size)
1171     __ sub(rscratch1, d, s);
1172     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1173     __ br(Assembler::HS, nooverlap_target);
1174 
1175     if (is_oop) {
1176       __ push(RegSet::of(d, count), sp);
1177       // no registers are destroyed by this call
1178       gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
1179     }
1180     copy_memory(aligned, s, d, count, rscratch1, -size);
1181     if (is_oop) {
1182       __ pop(RegSet::of(d, count), sp);
1183       if (VerifyOops)
1184         verify_oop_array(size, d, count, r16);
1185       __ sub(count, count, 1); // make an inclusive end pointer
1186       __ lea(count, Address(d, count, Address::uxtw(exact_log2(size))));
1187       gen_write_ref_array_post_barrier(d, count, rscratch1);
1188     }
1189     __ leave();
1190     __ mov(r0, zr); // return 0
1191     __ ret(lr);
1192 #ifdef BUILTIN_SIM
1193     {
1194       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1195       sim->notifyCompile(const_cast<char*>(name), start);
1196     }
1197 #endif
1198     return start;
1199 }
1200 
1201   // Arguments:
1202   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1203   //             ignored
1204   //   name    - stub name string
1205   //
1206   // Inputs:
1207   //   c_rarg0   - source array address
1208   //   c_rarg1   - destination array address
1209   //   c_rarg2   - element count, treated as ssize_t, can be zero
1210   //
1211   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1212   // we let the hardware handle it.  The one to eight bytes within words,
1213   // dwords or qwords that span cache line boundaries will still be loaded
1214   // and stored atomically.
1215   //
1216   // Side Effects:
1217   //   disjoint_byte_copy_entry is set to the no-overlap entry point  //
1218   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1219   // we let the hardware handle it.  The one to eight bytes within words,
1220   // dwords or qwords that span cache line boundaries will still be loaded
1221   // and stored atomically.
1222   //
1223   // Side Effects:
1224   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1225   //   used by generate_conjoint_byte_copy().
1226   //
1227   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1228     const bool not_oop = false;
1229     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1230   }
1231 
1232   // Arguments:
1233   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1234   //             ignored
1235   //   name    - stub name string
1236   //
1237   // Inputs:
1238   //   c_rarg0   - source array address
1239   //   c_rarg1   - destination array address
1240   //   c_rarg2   - element count, treated as ssize_t, can be zero
1241   //
1242   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1243   // we let the hardware handle it.  The one to eight bytes within words,
1244   // dwords or qwords that span cache line boundaries will still be loaded
1245   // and stored atomically.
1246   //
1247   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1248                                       address* entry, const char *name) {
1249     const bool not_oop = false;
1250     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1251   }
1252 
1253   // Arguments:
1254   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1255   //             ignored
1256   //   name    - stub name string
1257   //
1258   // Inputs:
1259   //   c_rarg0   - source array address
1260   //   c_rarg1   - destination array address
1261   //   c_rarg2   - element count, treated as ssize_t, can be zero
1262   //
1263   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1264   // let the hardware handle it.  The two or four words within dwords
1265   // or qwords that span cache line boundaries will still be loaded
1266   // and stored atomically.
1267   //
1268   // Side Effects:
1269   //   disjoint_short_copy_entry is set to the no-overlap entry point
1270   //   used by generate_conjoint_short_copy().
1271   //
1272   address generate_disjoint_short_copy(bool aligned,
1273                                        address* entry, const char *name) {
1274     const bool not_oop = false;
1275     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1276   }
1277 
1278   // Arguments:
1279   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1280   //             ignored
1281   //   name    - stub name string
1282   //
1283   // Inputs:
1284   //   c_rarg0   - source array address
1285   //   c_rarg1   - destination array address
1286   //   c_rarg2   - element count, treated as ssize_t, can be zero
1287   //
1288   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1289   // let the hardware handle it.  The two or four words within dwords
1290   // or qwords that span cache line boundaries will still be loaded
1291   // and stored atomically.
1292   //
1293   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1294                                        address *entry, const char *name) {
1295     const bool not_oop = false;
1296     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1297 
1298   }
1299   // Arguments:
1300   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1301   //             ignored
1302   //   name    - stub name string
1303   //
1304   // Inputs:
1305   //   c_rarg0   - source array address
1306   //   c_rarg1   - destination array address
1307   //   c_rarg2   - element count, treated as ssize_t, can be zero
1308   //
1309   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1310   // the hardware handle it.  The two dwords within qwords that span
1311   // cache line boundaries will still be loaded and stored atomicly.
1312   //
1313   // Side Effects:
1314   //   disjoint_int_copy_entry is set to the no-overlap entry point
1315   //   used by generate_conjoint_int_oop_copy().
1316   //
1317   address generate_disjoint_int_copy(bool aligned, address *entry,
1318                                          const char *name, bool dest_uninitialized = false) {
1319     const bool not_oop = false;
1320     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1321   }
1322 
1323   // Arguments:
1324   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1325   //             ignored
1326   //   name    - stub name string
1327   //
1328   // Inputs:
1329   //   c_rarg0   - source array address
1330   //   c_rarg1   - destination array address
1331   //   c_rarg2   - element count, treated as ssize_t, can be zero
1332   //
1333   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1334   // the hardware handle it.  The two dwords within qwords that span
1335   // cache line boundaries will still be loaded and stored atomicly.
1336   //
1337   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1338                                      address *entry, const char *name,
1339                                      bool dest_uninitialized = false) {
1340     const bool not_oop = false;
1341     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1342   }
1343 
1344 
1345   // Arguments:
1346   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1347   //             ignored
1348   //   name    - stub name string
1349   //
1350   // Inputs:
1351   //   c_rarg0   - source array address
1352   //   c_rarg1   - destination array address
1353   //   c_rarg2   - element count, treated as size_t, can be zero
1354   //
1355   // Side Effects:
1356   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1357   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1358   //
1359   address generate_disjoint_long_copy(bool aligned, address *entry,
1360                                           const char *name, bool dest_uninitialized = false) {
1361     const bool not_oop = false;
1362     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1363   }
1364 
1365   // Arguments:
1366   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1367   //             ignored
1368   //   name    - stub name string
1369   //
1370   // Inputs:
1371   //   c_rarg0   - source array address
1372   //   c_rarg1   - destination array address
1373   //   c_rarg2   - element count, treated as size_t, can be zero
1374   //
1375   address generate_conjoint_long_copy(bool aligned,
1376                                       address nooverlap_target, address *entry,
1377                                       const char *name, bool dest_uninitialized = false) {
1378     const bool not_oop = false;
1379     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1380   }
1381 
1382   // Arguments:
1383   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1384   //             ignored
1385   //   name    - stub name string
1386   //
1387   // Inputs:
1388   //   c_rarg0   - source array address
1389   //   c_rarg1   - destination array address
1390   //   c_rarg2   - element count, treated as size_t, can be zero
1391   //
1392   // Side Effects:
1393   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1394   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1395   //
1396   address generate_disjoint_oop_copy(bool aligned, address *entry,
1397                                      const char *name, bool dest_uninitialized = false) {
1398     const bool is_oop = true;
1399     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1400     return generate_disjoint_copy(size, aligned, is_oop, entry, name);
1401   }
1402 
1403   // Arguments:
1404   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1405   //             ignored
1406   //   name    - stub name string
1407   //
1408   // Inputs:
1409   //   c_rarg0   - source array address
1410   //   c_rarg1   - destination array address
1411   //   c_rarg2   - element count, treated as size_t, can be zero
1412   //
1413   address generate_conjoint_oop_copy(bool aligned,
1414                                      address nooverlap_target, address *entry,
1415                                      const char *name, bool dest_uninitialized = false) {
1416     const bool is_oop = true;
1417     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1418     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry, name);
1419   }
1420 
1421 
1422   // Helper for generating a dynamic type check.
1423   // Smashes rscratch1.
1424   void generate_type_check(Register sub_klass,
1425                            Register super_check_offset,
1426                            Register super_klass,
1427                            Label& L_success) {
1428     assert_different_registers(sub_klass, super_check_offset, super_klass);
1429 
1430     BLOCK_COMMENT("type_check:");
1431 
1432     Label L_miss;
1433 
1434     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1435                                      super_check_offset);
1436     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1437 
1438     // Fall through on failure!
1439     __ BIND(L_miss);
1440   }
1441 
1442   //
1443   //  Generate checkcasting array copy stub
1444   //
1445   //  Input:
1446   //    c_rarg0   - source array address
1447   //    c_rarg1   - destination array address
1448   //    c_rarg2   - element count, treated as ssize_t, can be zero
1449   //    c_rarg3   - size_t ckoff (super_check_offset)
1450   //    c_rarg4   - oop ckval (super_klass)
1451   //
1452   //  Output:
1453   //    r0 ==  0  -  success
1454   //    r0 == -1^K - failure, where K is partial transfer count
1455   //
1456   address generate_checkcast_copy(const char *name, address *entry,
1457                                   bool dest_uninitialized = false) {
1458 
1459     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1460 
1461     // Input registers (after setup_arg_regs)
1462     const Register from        = c_rarg0;   // source array address
1463     const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // elements count
1465     const Register ckoff       = c_rarg3;   // super_check_offset
1466     const Register ckval       = c_rarg4;   // super_klass
1467 
1468     // Registers used as temps (r18, r19, r20 are save-on-entry)
    const Register count_save  = r21;       // original elements count
1470     const Register start_to    = r20;       // destination array start address
1471     const Register copied_oop  = r18;       // actual oop copied
1472     const Register r19_klass   = r19;       // oop._klass
1473 
1474     //---------------------------------------------------------------
1475     // Assembler stub will be used for this call to arraycopy
1476     // if the two arrays are subtypes of Object[] but the
1477     // destination array type is not equal to or a supertype
1478     // of the source type.  Each element must be separately
1479     // checked.
1480 
1481     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1482                                copied_oop, r19_klass, count_save);
1483 
1484     __ align(CodeEntryAlignment);
1485     StubCodeMark mark(this, "StubRoutines", name);
1486     address start = __ pc();
1487 
1488     __ enter(); // required for proper stackwalking of RuntimeStub frame
1489 
1490 #ifdef ASSERT
1491     // caller guarantees that the arrays really are different
1492     // otherwise, we would have to make conjoint checks
1493     { Label L;
1494       array_overlap_test(L, TIMES_OOP);
1495       __ stop("checkcast_copy within a single array");
1496       __ bind(L);
1497     }
1498 #endif //ASSERT
1499 
1500     // Caller of this entry point must set up the argument registers.
1501     if (entry != NULL) {
1502       *entry = __ pc();
1503       BLOCK_COMMENT("Entry:");
1504     }
1505 
    // Empty array:  Nothing to do.
1507     __ cbz(count, L_done);
1508 
1509     __ push(RegSet::of(r18, r19, r20, r21), sp);
1510 
1511 #ifdef ASSERT
1512     BLOCK_COMMENT("assert consistent ckoff/ckval");
1513     // The ckoff and ckval must be mutually consistent,
1514     // even though caller generates both.
1515     { Label L;
1516       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1517       __ ldrw(start_to, Address(ckval, sco_offset));
1518       __ cmpw(ckoff, start_to);
1519       __ br(Assembler::EQ, L);
1520       __ stop("super_check_offset inconsistent");
1521       __ bind(L);
1522     }
1523 #endif //ASSERT
1524 
1525     // save the original count
1526     __ mov(count_save, count);
1527 
1528     // Copy from low to high addresses
1529     __ mov(start_to, to);              // Save destination array start address
1530     __ b(L_load_element);
1531 
1532     // ======== begin loop ========
1533     // (Loop is rotated; its entry is L_load_element.)
1534     // Loop control:
1535     //   for (; count != 0; count--) {
1536     //     copied_oop = load_heap_oop(from++);
1537     //     ... generate_type_check ...;
1538     //     store_heap_oop(to++, copied_oop);
1539     //   }
1540     __ align(OptoLoopAlignment);
1541 
1542     __ BIND(L_store_element);
1543     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop);  // store the oop
1544     __ sub(count, count, 1);
1545     __ cbz(count, L_do_card_marks);
1546 
1547     // ======== loop entry is here ========
1548     __ BIND(L_load_element);
1549     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop
1550     __ cbz(copied_oop, L_store_element);
1551 
1552     __ load_klass(r19_klass, copied_oop);// query the object klass
1553     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1554     // ======== end loop ========
1555 
1556     // It was a real error; we must depend on the caller to finish the job.
    // Register count = remaining oops, count_save = total oops.
1558     // Emit GC store barriers for the oops we have copied and report
1559     // their number to the caller.
1560 
    __ subs(count, count_save, count);     // K = partially copied oop count
    __ eon(count, count, zr);              // report (-1^K) to caller
1563     __ br(Assembler::EQ, L_done_pop);
1564 
1565     __ BIND(L_do_card_marks);
1566     __ add(to, to, -heapOopSize);         // make an inclusive end pointer
1567     gen_write_ref_array_post_barrier(start_to, to, rscratch1);
1568 
1569     __ bind(L_done_pop);
1570     __ pop(RegSet::of(r18, r19, r20, r21), sp);
1571     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1572 
1573     __ bind(L_done);
1574     __ mov(r0, count);
1575     __ leave();
1576     __ ret(lr);
1577 
1578     return start;
1579   }
1580 
1581   // Perform range checks on the proposed arraycopy.
1582   // Kills temp, but nothing else.
1583   // Also, clean the sign bits of src_pos and dst_pos.
1584   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1585                               Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
1587                               Register dst_pos, // destination position (c_rarg3)
1588                               Register length,
1589                               Register temp,
1590                               Label& L_failed) {
1591     BLOCK_COMMENT("arraycopy_range_checks:");
1592 
1593     assert_different_registers(rscratch1, temp);
1594 
1595     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1596     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1597     __ addw(temp, length, src_pos);
1598     __ cmpw(temp, rscratch1);
1599     __ br(Assembler::HI, L_failed);
1600 
1601     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1602     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1603     __ addw(temp, length, dst_pos);
1604     __ cmpw(temp, rscratch1);
1605     __ br(Assembler::HI, L_failed);
1606 
1607     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1608     __ movw(src_pos, src_pos);
1609     __ movw(dst_pos, dst_pos);
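    // (On AArch64 a 32-bit register write zero-extends into the full 64-bit
    //  register, so the two movw instructions above clear bits 63:32 of
    //  src_pos and dst_pos.)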
1610 
1611     BLOCK_COMMENT("arraycopy_range_checks done");
1612   }
1613 
  // This stub is currently only reached from a simple test routine;
  // a real implementation can be supplied once something actually
  // calls it with real work to do.
1617   static void fake_arraycopy_stub(address src, address dst, int count) {
1618     assert(count == 0, "huh?");
1619   }
1620 
1621 
1622   //
1623   //  Generate 'unsafe' array copy stub
1624   //  Though just as safe as the other stubs, it takes an unscaled
1625   //  size_t argument instead of an element count.
1626   //
1627   //  Input:
1628   //    c_rarg0   - source array address
1629   //    c_rarg1   - destination array address
1630   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1631   //
1632   // Examines the alignment of the operands and dispatches
1633   // to a long, int, short, or byte copy loop.
1634   //
1635   address generate_unsafe_copy(const char *name,
1636                                address byte_copy_entry) {
1637 #ifdef PRODUCT
1638     return StubRoutines::_jbyte_arraycopy;
1639 #else
1640     __ align(CodeEntryAlignment);
1641     StubCodeMark mark(this, "StubRoutines", name);
1642     address start = __ pc();
1643     __ enter(); // required for proper stackwalking of RuntimeStub frame
1644     // bump this on entry, not on exit:
1645     __ lea(rscratch2, ExternalAddress((address)&SharedRuntime::_unsafe_array_copy_ctr));
1646     __ incrementw(Address(rscratch2));
1647     __ b(RuntimeAddress(byte_copy_entry));
1648     return start;
1649 #endif
1650   }
1651 
1652   //
1653   //  Generate generic array copy stubs
1654   //
1655   //  Input:
1656   //    c_rarg0    -  src oop
1657   //    c_rarg1    -  src_pos (32-bits)
1658   //    c_rarg2    -  dst oop
1659   //    c_rarg3    -  dst_pos (32-bits)
1660   //    c_rarg4    -  element count (32-bits)
1661   //
1662   //  Output:
1663   //    r0 ==  0  -  success
1664   //    r0 == -1^K - failure, where K is partial transfer count
1665   //
1666   address generate_generic_copy(const char *name,
1667                                 address byte_copy_entry, address short_copy_entry,
1668                                 address int_copy_entry, address oop_copy_entry,
1669                                 address long_copy_entry, address checkcast_copy_entry) {
1670 
1671     Label L_failed, L_failed_0, L_objArray;
1672     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1673 
1674     // Input registers
1675     const Register src        = c_rarg0;  // source array oop
1676     const Register src_pos    = c_rarg1;  // source position
1677     const Register dst        = c_rarg2;  // destination array oop
1678     const Register dst_pos    = c_rarg3;  // destination position
1679     const Register length     = c_rarg4;
1680 
1681     StubCodeMark mark(this, "StubRoutines", name);
1682 
1683     __ align(CodeEntryAlignment);
1684     address start = __ pc();
1685 
1686     __ enter(); // required for proper stackwalking of RuntimeStub frame
1687 
1688     // bump this on entry, not on exit:
1689     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1690 
1691     //-----------------------------------------------------------------------
1692     // Assembler stub will be used for this call to arraycopy
1693     // if the following conditions are met:
1694     //
1695     // (1) src and dst must not be null.
1696     // (2) src_pos must not be negative.
1697     // (3) dst_pos must not be negative.
1698     // (4) length  must not be negative.
1699     // (5) src klass and dst klass should be the same and not NULL.
1700     // (6) src and dst should be arrays.
1701     // (7) src_pos + length must not exceed length of src.
1702     // (8) dst_pos + length must not exceed length of dst.
1703     //
1704 
1705     //  if (src == NULL) return -1;
1706     __ cbz(src, L_failed);
1707 
1708     //  if (src_pos < 0) return -1;
1709     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
1710 
1711     //  if (dst == NULL) return -1;
1712     __ cbz(dst, L_failed);
1713 
1714     //  if (dst_pos < 0) return -1;
1715     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
1716 
1717     // registers used as temp
1718     const Register scratch_length    = r16; // elements count to copy
1719     const Register scratch_src_klass = r17; // array klass
1720     const Register lh                = r18; // layout helper
1721 
1722     //  if (length < 0) return -1;
1723     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
1724     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
1725 
1726     __ load_klass(scratch_src_klass, src);
1727 #ifdef ASSERT
1728     //  assert(src->klass() != NULL);
1729     {
1730       BLOCK_COMMENT("assert klasses not null {");
1731       Label L1, L2;
1732       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
1733       __ bind(L1);
1734       __ stop("broken null klass");
1735       __ bind(L2);
1736       __ load_klass(rscratch1, dst);
1737       __ cbz(rscratch1, L1);     // this would be broken also
1738       BLOCK_COMMENT("} assert klasses not null done");
1739     }
1740 #endif
1741 
1742     // Load layout helper (32-bits)
1743     //
1744     //  |array_tag|     | header_size | element_type |     |log2_element_size|
1745     // 32        30    24            16              8     2                 0
1746     //
1747     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
1748     //
1749 
1750     const int lh_offset = in_bytes(Klass::layout_helper_offset());
1751 
1752     // Handle objArrays completely differently...
1753     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
1754     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
1755     __ movw(rscratch1, objArray_lh);
1756     __ eorw(rscratch2, lh, rscratch1);
1757     __ cbzw(rscratch2, L_objArray);
1758 
1759     //  if (src->klass() != dst->klass()) return -1;
1760     __ load_klass(rscratch2, dst);
1761     __ eor(rscratch2, rscratch2, scratch_src_klass);
1762     __ cbnz(rscratch2, L_failed);
1763 
1764     //  if (!src->is_Array()) return -1;
1765     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
1766 
1767     // At this point, it is known to be a typeArray (array_tag 0x3).
1768 #ifdef ASSERT
1769     {
1770       BLOCK_COMMENT("assert primitive array {");
1771       Label L;
1772       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
1773       __ cmpw(lh, rscratch2);
1774       __ br(Assembler::GE, L);
1775       __ stop("must be a primitive array");
1776       __ bind(L);
1777       BLOCK_COMMENT("} assert primitive array done");
1778     }
1779 #endif
1780 
1781     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
1782                            rscratch2, L_failed);
1783 
1784     // TypeArrayKlass
1785     //
1786     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
1787     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
1788     //
1789 
1790     const Register rscratch1_offset = rscratch1;    // array offset
1791     const Register r18_elsize = lh; // element size
1792 
1793     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
1794            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
1795     __ add(src, src, rscratch1_offset);           // src array offset
1796     __ add(dst, dst, rscratch1_offset);           // dst array offset
1797     BLOCK_COMMENT("choose copy loop based on element size");
1798 
1799     // next registers should be set before the jump to corresponding stub
1800     const Register from     = c_rarg0;  // source array address
1801     const Register to       = c_rarg1;  // destination array address
1802     const Register count    = c_rarg2;  // elements count
1803 
    // 'from', 'to' and 'count' must be written in this order, because they
    // alias 'src', 'src_pos' and 'dst' (c_rarg0..c_rarg2) and would otherwise
    // clobber values that are still needed.
1806 
1807     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
1808 
1809     // The possible values of elsize are 0-3, i.e. exact_log2(element
1810     // size in bytes).  We do a simple bitwise binary search.
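    // (Mapping: log2 elsize 0 -> jbyte, 1 -> jshort, 2 -> jint, 3 -> jlong;
    //  bit 1 separates byte/short from int/long, bit 0 picks within each pair.)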
1811   __ BIND(L_copy_bytes);
1812     __ tbnz(r18_elsize, 1, L_copy_ints);
1813     __ tbnz(r18_elsize, 0, L_copy_shorts);
1814     __ lea(from, Address(src, src_pos));// src_addr
1815     __ lea(to,   Address(dst, dst_pos));// dst_addr
1816     __ movw(count, scratch_length); // length
1817     __ b(RuntimeAddress(byte_copy_entry));
1818 
1819   __ BIND(L_copy_shorts);
1820     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
1821     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
1822     __ movw(count, scratch_length); // length
1823     __ b(RuntimeAddress(short_copy_entry));
1824 
1825   __ BIND(L_copy_ints);
1826     __ tbnz(r18_elsize, 0, L_copy_longs);
1827     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
1828     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
1829     __ movw(count, scratch_length); // length
1830     __ b(RuntimeAddress(int_copy_entry));
1831 
1832   __ BIND(L_copy_longs);
1833 #ifdef ASSERT
1834     {
1835       BLOCK_COMMENT("assert long copy {");
1836       Label L;
1837       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
1838       __ cmpw(r18_elsize, LogBytesPerLong);
1839       __ br(Assembler::EQ, L);
1840       __ stop("must be long copy, but elsize is wrong");
1841       __ bind(L);
1842       BLOCK_COMMENT("} assert long copy done");
1843     }
1844 #endif
1845     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
1846     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
1847     __ movw(count, scratch_length); // length
1848     __ b(RuntimeAddress(long_copy_entry));
1849 
1850     // ObjArrayKlass
1851   __ BIND(L_objArray);
1852     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
1853 
1854     Label L_plain_copy, L_checkcast_copy;
1855     //  test array classes for subtyping
1856     __ load_klass(r18, dst);
1857     __ cmp(scratch_src_klass, r18); // usual case is exact equality
1858     __ br(Assembler::NE, L_checkcast_copy);
1859 
1860     // Identically typed arrays can be copied without element-wise checks.
1861     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
1862                            rscratch2, L_failed);
1863 
1864     __ lea(from, Address(src, src_pos, Address::lsl(3)));
1865     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
1866     __ lea(to, Address(dst, dst_pos, Address::lsl(3)));
1867     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
1868     __ movw(count, scratch_length); // length
1869   __ BIND(L_plain_copy);
1870     __ b(RuntimeAddress(oop_copy_entry));
1871 
1872   __ BIND(L_checkcast_copy);
1873     // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
1874     {
1875       // Before looking at dst.length, make sure dst is also an objArray.
1876       __ ldrw(rscratch1, Address(r18, lh_offset));
1877       __ movw(rscratch2, objArray_lh);
1878       __ eorw(rscratch1, rscratch1, rscratch2);
1879       __ cbnzw(rscratch1, L_failed);
1880 
1881       // It is safe to examine both src.length and dst.length.
1882       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
1883                              r18, L_failed);
1884 
1885       const Register rscratch2_dst_klass = rscratch2;
1886       __ load_klass(rscratch2_dst_klass, dst); // reload
1887 
1888       // Marshal the base address arguments now, freeing registers.
1889       __ lea(from, Address(src, src_pos, Address::lsl(3)));
1890       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
1891       __ lea(to, Address(dst, dst_pos, Address::lsl(3)));
1892       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
1893       __ movw(count, length);           // length (reloaded)
1894       Register sco_temp = c_rarg3;      // this register is free now
1895       assert_different_registers(from, to, count, sco_temp,
1896                                  rscratch2_dst_klass, scratch_src_klass);
1897       // assert_clean_int(count, sco_temp);
1898 
1899       // Generate the type check.
1900       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
1901       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
1902       // assert_clean_int(sco_temp, r18);
1903       generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy);
1904 
1905       // Fetch destination element klass from the ObjArrayKlass header.
1906       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
1907       __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset));
1908       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
1909 
1910       // the checkcast_copy loop needs two extra arguments:
1911       assert(c_rarg3 == sco_temp, "#3 already in place");
1912       // Set up arguments for checkcast_copy_entry.
1913       __ mov(c_rarg4, rscratch2_dst_klass);  // dst.klass.element_klass
1914       __ b(RuntimeAddress(checkcast_copy_entry));
1915     }
1916 
1917   __ BIND(L_failed);
1918     __ mov(r0, -1);
1919     __ leave();   // required for proper stackwalking of RuntimeStub frame
1920     __ ret(lr);
1921 
1922     return start;
1923   }
1924 
1925   void generate_arraycopy_stubs() {
1926     address entry;
1927     address entry_jbyte_arraycopy;
1928     address entry_jshort_arraycopy;
1929     address entry_jint_arraycopy;
1930     address entry_oop_arraycopy;
1931     address entry_jlong_arraycopy;
1932     address entry_checkcast_arraycopy;
1933 
1934     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
1935     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
1936 
1937     //*** jbyte
1938     // Always need aligned and unaligned versions
1939     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
1940                                                                                   "jbyte_disjoint_arraycopy");
1941     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
1942                                                                                   &entry_jbyte_arraycopy,
1943                                                                                   "jbyte_arraycopy");
1944     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
1945                                                                                   "arrayof_jbyte_disjoint_arraycopy");
1946     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
1947                                                                                   "arrayof_jbyte_arraycopy");
1948 
1949     //*** jshort
1950     // Always need aligned and unaligned versions
1951     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
1952                                                                                     "jshort_disjoint_arraycopy");
1953     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
1954                                                                                     &entry_jshort_arraycopy,
1955                                                                                     "jshort_arraycopy");
1956     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
1957                                                                                     "arrayof_jshort_disjoint_arraycopy");
1958     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
1959                                                                                     "arrayof_jshort_arraycopy");
1960 
1961     //*** jint
1962     // Aligned versions
1963     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
1964                                                                                 "arrayof_jint_disjoint_arraycopy");
1965     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
1966                                                                                 "arrayof_jint_arraycopy");
1967     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
1968     // entry_jint_arraycopy always points to the unaligned version
1969     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
1970                                                                                 "jint_disjoint_arraycopy");
1971     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
1972                                                                                 &entry_jint_arraycopy,
1973                                                                                 "jint_arraycopy");
1974 
1975     //*** jlong
1976     // It is always aligned
1977     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
1978                                                                                   "arrayof_jlong_disjoint_arraycopy");
1979     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
1980                                                                                   "arrayof_jlong_arraycopy");
1981     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
1982     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
1983 
1984     //*** oops
1985     {
1986       // With compressed oops we need unaligned versions; notice that
1987       // we overwrite entry_oop_arraycopy.
1988       bool aligned = !UseCompressedOops;
1989 
1990       StubRoutines::_arrayof_oop_disjoint_arraycopy
1991         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy");
1992       StubRoutines::_arrayof_oop_arraycopy
1993         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy");
      // Versions for dest_uninitialized == true (no pre-barriers)
1995       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
1996         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
1997                                      /*dest_uninitialized*/true);
1998       StubRoutines::_arrayof_oop_arraycopy_uninit
1999         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2000                                      /*dest_uninitialized*/true);
2001     }
2002 
2003     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2004     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2005     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2006     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2007 
2008     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2009     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2010                                                                         /*dest_uninitialized*/true);
2011 
2012     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2013                                                               entry_jbyte_arraycopy);
2014 
2015     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2016                                                                entry_jbyte_arraycopy,
2017                                                                entry_jshort_arraycopy,
2018                                                                entry_jint_arraycopy,
2019                                                                entry_oop_arraycopy,
2020                                                                entry_jlong_arraycopy,
2021                                                                entry_checkcast_arraycopy);
2022 
2023   }
2024 
2025   void generate_math_stubs() { Unimplemented(); }
2026 
2027   // Arguments:
2028   //
2029   // Inputs:
2030   //   c_rarg0   - source byte array address
2031   //   c_rarg1   - destination byte array address
2032   //   c_rarg2   - K (key) in little endian int array
2033   //
2034   address generate_aescrypt_encryptBlock() {
2035     __ align(CodeEntryAlignment);
2036     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2037 
2038     Label L_doLast;
2039 
2040     const Register from        = c_rarg0;  // source array address
2041     const Register to          = c_rarg1;  // destination array address
2042     const Register key         = c_rarg2;  // key array address
2043     const Register keylen      = rscratch1;
2044 
2045     address start = __ pc();
2046     __ enter();
2047 
2048     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2049 
2050     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2051 
2052     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2053     __ rev32(v1, __ T16B, v1);
2054     __ rev32(v2, __ T16B, v2);
2055     __ rev32(v3, __ T16B, v3);
2056     __ rev32(v4, __ T16B, v4);
2057     __ aese(v0, v1);
2058     __ aesmc(v0, v0);
2059     __ aese(v0, v2);
2060     __ aesmc(v0, v0);
2061     __ aese(v0, v3);
2062     __ aesmc(v0, v0);
2063     __ aese(v0, v4);
2064     __ aesmc(v0, v0);
2065 
2066     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2067     __ rev32(v1, __ T16B, v1);
2068     __ rev32(v2, __ T16B, v2);
2069     __ rev32(v3, __ T16B, v3);
2070     __ rev32(v4, __ T16B, v4);
2071     __ aese(v0, v1);
2072     __ aesmc(v0, v0);
2073     __ aese(v0, v2);
2074     __ aesmc(v0, v0);
2075     __ aese(v0, v3);
2076     __ aesmc(v0, v0);
2077     __ aese(v0, v4);
2078     __ aesmc(v0, v0);
2079 
2080     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2081     __ rev32(v1, __ T16B, v1);
2082     __ rev32(v2, __ T16B, v2);
2083 
2084     __ cmpw(keylen, 44);
2085     __ br(Assembler::EQ, L_doLast);
2086 
2087     __ aese(v0, v1);
2088     __ aesmc(v0, v0);
2089     __ aese(v0, v2);
2090     __ aesmc(v0, v0);
2091 
2092     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2093     __ rev32(v1, __ T16B, v1);
2094     __ rev32(v2, __ T16B, v2);
2095 
2096     __ cmpw(keylen, 52);
2097     __ br(Assembler::EQ, L_doLast);
2098 
2099     __ aese(v0, v1);
2100     __ aesmc(v0, v0);
2101     __ aese(v0, v2);
2102     __ aesmc(v0, v0);
2103 
2104     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2105     __ rev32(v1, __ T16B, v1);
2106     __ rev32(v2, __ T16B, v2);
2107 
2108     __ BIND(L_doLast);
2109 
2110     __ aese(v0, v1);
2111     __ aesmc(v0, v0);
2112     __ aese(v0, v2);
2113 
2114     __ ld1(v1, __ T16B, key);
2115     __ rev32(v1, __ T16B, v1);
2116     __ eor(v0, __ T16B, v0, v1);
2117 
2118     __ st1(v0, __ T16B, to);
2119 
2120     __ mov(r0, 0);
2121 
2122     __ leave();
2123     __ ret(lr);
2124 
2125     return start;
2126   }
2127 
2128   // Arguments:
2129   //
2130   // Inputs:
2131   //   c_rarg0   - source byte array address
2132   //   c_rarg1   - destination byte array address
2133   //   c_rarg2   - K (key) in little endian int array
2134   //
2135   address generate_aescrypt_decryptBlock() {
    assert(UseAES, "need AES instructions");
2137     __ align(CodeEntryAlignment);
2138     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2139     Label L_doLast;
2140 
2141     const Register from        = c_rarg0;  // source array address
2142     const Register to          = c_rarg1;  // destination array address
2143     const Register key         = c_rarg2;  // key array address
2144     const Register keylen      = rscratch1;
2145 
2146     address start = __ pc();
2147     __ enter(); // required for proper stackwalking of RuntimeStub frame
2148 
2149     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2150 
2151     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2152 
2153     __ ld1(v5, __ T16B, __ post(key, 16));
2154     __ rev32(v5, __ T16B, v5);
2155 
2156     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2157     __ rev32(v1, __ T16B, v1);
2158     __ rev32(v2, __ T16B, v2);
2159     __ rev32(v3, __ T16B, v3);
2160     __ rev32(v4, __ T16B, v4);
2161     __ aesd(v0, v1);
2162     __ aesimc(v0, v0);
2163     __ aesd(v0, v2);
2164     __ aesimc(v0, v0);
2165     __ aesd(v0, v3);
2166     __ aesimc(v0, v0);
2167     __ aesd(v0, v4);
2168     __ aesimc(v0, v0);
2169 
2170     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2171     __ rev32(v1, __ T16B, v1);
2172     __ rev32(v2, __ T16B, v2);
2173     __ rev32(v3, __ T16B, v3);
2174     __ rev32(v4, __ T16B, v4);
2175     __ aesd(v0, v1);
2176     __ aesimc(v0, v0);
2177     __ aesd(v0, v2);
2178     __ aesimc(v0, v0);
2179     __ aesd(v0, v3);
2180     __ aesimc(v0, v0);
2181     __ aesd(v0, v4);
2182     __ aesimc(v0, v0);
2183 
2184     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2185     __ rev32(v1, __ T16B, v1);
2186     __ rev32(v2, __ T16B, v2);
2187 
2188     __ cmpw(keylen, 44);
2189     __ br(Assembler::EQ, L_doLast);
2190 
2191     __ aesd(v0, v1);
2192     __ aesimc(v0, v0);
2193     __ aesd(v0, v2);
2194     __ aesimc(v0, v0);
2195 
2196     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2197     __ rev32(v1, __ T16B, v1);
2198     __ rev32(v2, __ T16B, v2);
2199 
2200     __ cmpw(keylen, 52);
2201     __ br(Assembler::EQ, L_doLast);
2202 
2203     __ aesd(v0, v1);
2204     __ aesimc(v0, v0);
2205     __ aesd(v0, v2);
2206     __ aesimc(v0, v0);
2207 
2208     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2209     __ rev32(v1, __ T16B, v1);
2210     __ rev32(v2, __ T16B, v2);
2211 
2212     __ BIND(L_doLast);
2213 
2214     __ aesd(v0, v1);
2215     __ aesimc(v0, v0);
2216     __ aesd(v0, v2);
2217 
2218     __ eor(v0, __ T16B, v0, v5);
2219 
2220     __ st1(v0, __ T16B, to);
2221 
2222     __ mov(r0, 0);
2223 
2224     __ leave();
2225     __ ret(lr);
2226 
2227     return start;
2228   }
2229 
2230   // Arguments:
2231   //
2232   // Inputs:
2233   //   c_rarg0   - source byte array address
2234   //   c_rarg1   - destination byte array address
2235   //   c_rarg2   - K (key) in little endian int array
2236   //   c_rarg3   - r vector byte array address
2237   //   c_rarg4   - input length
2238   //
2239   // Output:
  //   r0        - input length
2241   //
2242   address generate_cipherBlockChaining_encryptAESCrypt() {
    assert(UseAES, "need AES instructions");
2244     __ align(CodeEntryAlignment);
2245     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2246 
2247     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2248 
2249     const Register from        = c_rarg0;  // source array address
2250     const Register to          = c_rarg1;  // destination array address
2251     const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector (IV) address,
                                           // and left holding the last encrypted block (the new chaining value)
2254     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2255     const Register keylen      = rscratch1;
2256 
2257     address start = __ pc();
2258       __ enter();
2259 
2260       __ mov(rscratch2, len_reg);
2261       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2262 
2263       __ ld1(v0, __ T16B, rvec);
2264 
2265       __ cmpw(keylen, 52);
2266       __ br(Assembler::CC, L_loadkeys_44);
2267       __ br(Assembler::EQ, L_loadkeys_52);
2268 
2269       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2270       __ rev32(v17, __ T16B, v17);
2271       __ rev32(v18, __ T16B, v18);
2272     __ BIND(L_loadkeys_52);
2273       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2274       __ rev32(v19, __ T16B, v19);
2275       __ rev32(v20, __ T16B, v20);
2276     __ BIND(L_loadkeys_44);
2277       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2278       __ rev32(v21, __ T16B, v21);
2279       __ rev32(v22, __ T16B, v22);
2280       __ rev32(v23, __ T16B, v23);
2281       __ rev32(v24, __ T16B, v24);
2282       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2283       __ rev32(v25, __ T16B, v25);
2284       __ rev32(v26, __ T16B, v26);
2285       __ rev32(v27, __ T16B, v27);
2286       __ rev32(v28, __ T16B, v28);
2287       __ ld1(v29, v30, v31, __ T16B, key);
2288       __ rev32(v29, __ T16B, v29);
2289       __ rev32(v30, __ T16B, v30);
2290       __ rev32(v31, __ T16B, v31);
2291 
2292     __ BIND(L_aes_loop);
2293       __ ld1(v1, __ T16B, __ post(from, 16));
2294       __ eor(v0, __ T16B, v0, v1);
2295 
2296       __ br(Assembler::CC, L_rounds_44);
2297       __ br(Assembler::EQ, L_rounds_52);
2298 
2299       __ aese(v0, v17); __ aesmc(v0, v0);
2300       __ aese(v0, v18); __ aesmc(v0, v0);
2301     __ BIND(L_rounds_52);
2302       __ aese(v0, v19); __ aesmc(v0, v0);
2303       __ aese(v0, v20); __ aesmc(v0, v0);
2304     __ BIND(L_rounds_44);
2305       __ aese(v0, v21); __ aesmc(v0, v0);
2306       __ aese(v0, v22); __ aesmc(v0, v0);
2307       __ aese(v0, v23); __ aesmc(v0, v0);
2308       __ aese(v0, v24); __ aesmc(v0, v0);
2309       __ aese(v0, v25); __ aesmc(v0, v0);
2310       __ aese(v0, v26); __ aesmc(v0, v0);
2311       __ aese(v0, v27); __ aesmc(v0, v0);
2312       __ aese(v0, v28); __ aesmc(v0, v0);
2313       __ aese(v0, v29); __ aesmc(v0, v0);
2314       __ aese(v0, v30);
2315       __ eor(v0, __ T16B, v0, v31);
2316 
2317       __ st1(v0, __ T16B, __ post(to, 16));
2318       __ sub(len_reg, len_reg, 16);
2319       __ cbnz(len_reg, L_aes_loop);
2320 
2321       __ st1(v0, __ T16B, rvec);
2322 
2323       __ mov(r0, rscratch2);
2324 
2325       __ leave();
2326       __ ret(lr);
2327 
2328       return start;
2329   }
2330 
2331   // Arguments:
2332   //
2333   // Inputs:
2334   //   c_rarg0   - source byte array address
2335   //   c_rarg1   - destination byte array address
2336   //   c_rarg2   - K (key) in little endian int array
2337   //   c_rarg3   - r vector byte array address
2338   //   c_rarg4   - input length
2339   //
2340   // Output:
2341   //   r0        - input length
2342   //
2343   address generate_cipherBlockChaining_decryptAESCrypt() {
    assert(UseAES, "need AES instructions");
2345     __ align(CodeEntryAlignment);
2346     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2347 
2348     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2349 
2350     const Register from        = c_rarg0;  // source array address
2351     const Register to          = c_rarg1;  // destination array address
2352     const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector (IV) address,
                                           // and left holding the last input ciphertext block (the new chaining value)
2355     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2356     const Register keylen      = rscratch1;
2357 
2358     address start = __ pc();
2359       __ enter();
2360 
2361       __ mov(rscratch2, len_reg);
2362       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2363 
2364       __ ld1(v2, __ T16B, rvec);
2365 
2366       __ ld1(v31, __ T16B, __ post(key, 16));
2367       __ rev32(v31, __ T16B, v31);
2368 
2369       __ cmpw(keylen, 52);
2370       __ br(Assembler::CC, L_loadkeys_44);
2371       __ br(Assembler::EQ, L_loadkeys_52);
2372 
2373       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2374       __ rev32(v17, __ T16B, v17);
2375       __ rev32(v18, __ T16B, v18);
2376     __ BIND(L_loadkeys_52);
2377       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2378       __ rev32(v19, __ T16B, v19);
2379       __ rev32(v20, __ T16B, v20);
2380     __ BIND(L_loadkeys_44);
2381       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2382       __ rev32(v21, __ T16B, v21);
2383       __ rev32(v22, __ T16B, v22);
2384       __ rev32(v23, __ T16B, v23);
2385       __ rev32(v24, __ T16B, v24);
2386       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2387       __ rev32(v25, __ T16B, v25);
2388       __ rev32(v26, __ T16B, v26);
2389       __ rev32(v27, __ T16B, v27);
2390       __ rev32(v28, __ T16B, v28);
2391       __ ld1(v29, v30, __ T16B, key);
2392       __ rev32(v29, __ T16B, v29);
2393       __ rev32(v30, __ T16B, v30);
2394 
2395     __ BIND(L_aes_loop);
2396       __ ld1(v0, __ T16B, __ post(from, 16));
      __ orr(v1, __ T16B, v0, v0);   // save the ciphertext block in v1 for chaining
2398 
2399       __ br(Assembler::CC, L_rounds_44);
2400       __ br(Assembler::EQ, L_rounds_52);
2401 
2402       __ aesd(v0, v17); __ aesimc(v0, v0);
2403       __ aesd(v0, v18); __ aesimc(v0, v0);
2404     __ BIND(L_rounds_52);
2405       __ aesd(v0, v19); __ aesimc(v0, v0);
2406       __ aesd(v0, v20); __ aesimc(v0, v0);
2407     __ BIND(L_rounds_44);
2408       __ aesd(v0, v21); __ aesimc(v0, v0);
2409       __ aesd(v0, v22); __ aesimc(v0, v0);
2410       __ aesd(v0, v23); __ aesimc(v0, v0);
2411       __ aesd(v0, v24); __ aesimc(v0, v0);
2412       __ aesd(v0, v25); __ aesimc(v0, v0);
2413       __ aesd(v0, v26); __ aesimc(v0, v0);
2414       __ aesd(v0, v27); __ aesimc(v0, v0);
2415       __ aesd(v0, v28); __ aesimc(v0, v0);
2416       __ aesd(v0, v29); __ aesimc(v0, v0);
2417       __ aesd(v0, v30);
2418       __ eor(v0, __ T16B, v0, v31);
2419       __ eor(v0, __ T16B, v0, v2);
2420 
2421       __ st1(v0, __ T16B, __ post(to, 16));
      __ orr(v2, __ T16B, v1, v1);   // the saved ciphertext becomes the next chaining value
2423 
2424       __ sub(len_reg, len_reg, 16);
2425       __ cbnz(len_reg, L_aes_loop);
2426 
2427       __ st1(v2, __ T16B, rvec);
2428 
2429       __ mov(r0, rscratch2);
2430 
2431       __ leave();
2432       __ ret(lr);
2433 
2434     return start;
2435   }
2436 
2437   // Arguments:
2438   //
2439   // Inputs:
2440   //   c_rarg0   - byte[]  source+offset
2441   //   c_rarg1   - int[]   SHA.state
2442   //   c_rarg2   - int     offset
2443   //   c_rarg3   - int     limit
2444   //
2445   address generate_sha1_implCompress(bool multi_block, const char *name) {
2446     __ align(CodeEntryAlignment);
2447     StubCodeMark mark(this, "StubRoutines", name);
2448     address start = __ pc();
2449 
2450     Register buf   = c_rarg0;
2451     Register state = c_rarg1;
2452     Register ofs   = c_rarg2;
2453     Register limit = c_rarg3;
2454 
2455     Label keys;
2456     Label sha1_loop;
2457 
2458     // load the keys into v0..v3
2459     __ adr(rscratch1, keys);
2460     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
    // load the 5-word (20-byte) state into v6, v7
2462     __ ldrq(v6, Address(state, 0));
2463     __ ldrs(v7, Address(state, 16));
2464 
2465 
2466     __ BIND(sha1_loop);
2467     // load 64 bytes of data into v16..v19
2468     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
2469     __ rev32(v16, __ T16B, v16);
2470     __ rev32(v17, __ T16B, v17);
2471     __ rev32(v18, __ T16B, v18);
2472     __ rev32(v19, __ T16B, v19);
2473 
2474     // do the sha1
2475     __ addv(v4, __ T4S, v16, v0);
2476     __ orr(v20, __ T16B, v6, v6);
2477 
2478     FloatRegister d0 = v16;
2479     FloatRegister d1 = v17;
2480     FloatRegister d2 = v18;
2481     FloatRegister d3 = v19;
2482 
2483     for (int round = 0; round < 20; round++) {
2484       FloatRegister tmp1 = (round & 1) ? v4 : v5;
2485       FloatRegister tmp2 = (round & 1) ? v21 : v22;
2486       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
2487       FloatRegister tmp4 = (round & 1) ? v5 : v4;
2488       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
2489 
2490       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
2491       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
2492       __ sha1h(tmp2, __ T4S, v20);
2493       if (round < 5)
2494         __ sha1c(v20, __ T4S, tmp3, tmp4);
2495       else if (round < 10 || round >= 15)
2496         __ sha1p(v20, __ T4S, tmp3, tmp4);
2497       else
2498         __ sha1m(v20, __ T4S, tmp3, tmp4);
2499       if (round < 16) __ sha1su1(d0, __ T4S, d3);
2500 
2501       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
2502     }
2503 
2504     __ addv(v7, __ T2S, v7, v21);
2505     __ addv(v6, __ T4S, v6, v20);
2506 
2507     if (multi_block) {
2508       __ add(ofs, ofs, 64);
2509       __ cmp(ofs, limit);
2510       __ br(Assembler::LE, sha1_loop);
2511       __ mov(c_rarg0, ofs); // return ofs
2512     }
2513 
2514     __ strq(v6, Address(state, 0));
2515     __ strs(v7, Address(state, 16));
2516 
2517     __ ret(lr);
2518 
2519     __ bind(keys);
2520     __ emit_int32(0x5a827999);
2521     __ emit_int32(0x6ed9eba1);
2522     __ emit_int32(0x8f1bbcdc);
2523     __ emit_int32(0xca62c1d6);
2524 
2525     return start;
2526   }
2527 
2528 
2529   // Arguments:
2530   //
2531   // Inputs:
2532   //   c_rarg0   - byte[]  source+offset
2533   //   c_rarg1   - int[]   SHA.state
2534   //   c_rarg2   - int     offset
2535   //   c_rarg3   - int     limit
2536   //
2537   address generate_sha256_implCompress(bool multi_block, const char *name) {
2538     static const uint32_t round_consts[64] = {
2539       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
2540       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
2541       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
2542       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
2543       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
2544       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
2545       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
2546       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
2547       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
2548       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
2549       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
2550       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
2551       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
2552       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
2553       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
2554       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
2555     };
2556     __ align(CodeEntryAlignment);
2557     StubCodeMark mark(this, "StubRoutines", name);
2558     address start = __ pc();
2559 
2560     Register buf   = c_rarg0;
2561     Register state = c_rarg1;
2562     Register ofs   = c_rarg2;
2563     Register limit = c_rarg3;
2564 
    Label sha256_loop;
2566 
2567     __ stpd(v8, v9, __ pre(sp, -32));
2568     __ stpd(v10, v11, Address(sp, 16));
2569 
    // dga == v0
    // dgb == v1
    // dg0 == v2
    // dg1 == v3
    // dg2 == v4
    // t0 == v6
    // t1 == v7
2577 
2578     // load 16 keys to v16..v31
2579     __ lea(rscratch1, ExternalAddress((address)round_consts));
2580     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
2581     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
2582     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
2583     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
2584 
    // load the 8-word (256-bit) state
2586     __ ldpq(v0, v1, state);
2587 
    __ BIND(sha256_loop);
2589     // load 64 bytes of data into v8..v11
2590     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
2591     __ rev32(v8, __ T16B, v8);
2592     __ rev32(v9, __ T16B, v9);
2593     __ rev32(v10, __ T16B, v10);
2594     __ rev32(v11, __ T16B, v11);
2595 
2596     __ addv(v6, __ T4S, v8, v16);
2597     __ orr(v2, __ T16B, v0, v0);
2598     __ orr(v3, __ T16B, v1, v1);
2599 
2600     FloatRegister d0 = v8;
2601     FloatRegister d1 = v9;
2602     FloatRegister d2 = v10;
2603     FloatRegister d3 = v11;
2604 
2605 
2606     for (int round = 0; round < 16; round++) {
2607       FloatRegister tmp1 = (round & 1) ? v6 : v7;
2608       FloatRegister tmp2 = (round & 1) ? v7 : v6;
2609       FloatRegister tmp3 = (round & 1) ? v2 : v4;
2610       FloatRegister tmp4 = (round & 1) ? v4 : v2;
2611 
2612       if (round < 12) __ sha256su0(d0, __ T4S, d1);
      __ orr(v4, __ T16B, v2, v2);
2614       if (round < 15)
2615         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
2616       __ sha256h(v2, __ T4S, v3, tmp2);
2617       __ sha256h2(v3, __ T4S, v4, tmp2);
2618       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
2619 
2620       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
2621     }
2622 
2623     __ addv(v0, __ T4S, v0, v2);
2624     __ addv(v1, __ T4S, v1, v3);
2625 
2626     if (multi_block) {
2627       __ add(ofs, ofs, 64);
2628       __ cmp(ofs, limit);
      __ br(Assembler::LE, sha256_loop);
2630       __ mov(c_rarg0, ofs); // return ofs
2631     }
2632 
2633     __ ldpd(v10, v11, Address(sp, 16));
2634     __ ldpd(v8, v9, __ post(sp, 32));
2635 
2636     __ stpq(v0, v1, state);
2637 
2638     __ ret(lr);
2639 
2640     return start;
2641   }
2642 
2643 #ifndef BUILTIN_SIM
2644   // Safefetch stubs.
2645   void generate_safefetch(const char* name, int size, address* entry,
2646                           address* fault_pc, address* continuation_pc) {
2647     // safefetch signatures:
2648     //   int      SafeFetch32(int*      adr, int      errValue);
2649     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
2650     //
2651     // arguments:
2652     //   c_rarg0 = adr
2653     //   c_rarg1 = errValue
2654     //
2655     // result:
    //   r0 = *adr or errValue
2657 
2658     StubCodeMark mark(this, "StubRoutines", name);
2659 
2660     // Entry point, pc or function descriptor.
2661     *entry = __ pc();
2662 
2663     // Load *adr into c_rarg1, may fault.
2664     *fault_pc = __ pc();
2665     switch (size) {
2666       case 4:
2667         // int32_t
2668         __ ldrw(c_rarg1, Address(c_rarg0, 0));
2669         break;
2670       case 8:
2671         // int64_t
2672         __ ldr(c_rarg1, Address(c_rarg0, 0));
2673         break;
2674       default:
2675         ShouldNotReachHere();
2676     }
2677 
2678     // return errValue or *adr
2679     *continuation_pc = __ pc();
2680     __ mov(r0, c_rarg1);
2681     __ ret(lr);
2682   }
2683 #endif
2684 
2685   /**
2686    *  Arguments:
2687    *
2688    * Inputs:
2689    *   c_rarg0   - int crc
2690    *   c_rarg1   - byte* buf
2691    *   c_rarg2   - int length
2692    *
   * Output:
   *       r0    - int crc result
2695    */
2696   address generate_updateBytesCRC32() {
2697     assert(UseCRC32Intrinsics, "what are we doing here?");
2698 
2699     __ align(CodeEntryAlignment);
2700     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
2701 
2702     address start = __ pc();
2703 
2704     const Register crc   = c_rarg0;  // crc
2705     const Register buf   = c_rarg1;  // source java byte array address
2706     const Register len   = c_rarg2;  // length
2707     const Register table0 = c_rarg3; // crc_table address
2708     const Register table1 = c_rarg4;
2709     const Register table2 = c_rarg5;
2710     const Register table3 = c_rarg6;
2711     const Register tmp3 = c_rarg7;
2712 
2713     BLOCK_COMMENT("Entry:");
2714     __ enter(); // required for proper stackwalking of RuntimeStub frame
2715 
2716     __ kernel_crc32(crc, buf, len,
2717               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
2718 
2719     __ leave(); // required for proper stackwalking of RuntimeStub frame
2720     __ ret(lr);
2721 
2722     return start;
2723   }
2724 
2725   /**
2726    *  Arguments:
2727    *
2728    * Inputs:
2729    *   c_rarg0   - int crc
2730    *   c_rarg1   - byte* buf
2731    *   c_rarg2   - int length
2732    *   c_rarg3   - int* table
2733    *
   * Output:
2735    *       r0   - int crc result
2736    */
2737   address generate_updateBytesCRC32C() {
2738     assert(UseCRC32CIntrinsics, "what are we doing here?");
2739 
2740     __ align(CodeEntryAlignment);
2741     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
2742 
2743     address start = __ pc();
2744 
2745     const Register crc   = c_rarg0;  // crc
2746     const Register buf   = c_rarg1;  // source java byte array address
2747     const Register len   = c_rarg2;  // length
2748     const Register table0 = c_rarg3; // crc_table address
2749     const Register table1 = c_rarg4;
2750     const Register table2 = c_rarg5;
2751     const Register table3 = c_rarg6;
2752     const Register tmp3 = c_rarg7;
2753 
2754     BLOCK_COMMENT("Entry:");
2755     __ enter(); // required for proper stackwalking of RuntimeStub frame
2756 
2757     __ kernel_crc32c(crc, buf, len,
2758               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
2759 
2760     __ leave(); // required for proper stackwalking of RuntimeStub frame
2761     __ ret(lr);
2762 
2763     return start;
2764   }
2765 
2766   /***
2767    *  Arguments:
2768    *
2769    *  Inputs:
2770    *   c_rarg0   - int   adler
2771    *   c_rarg1   - byte* buff
2772    *   c_rarg2   - int   len
2773    *
2774    * Output:
2775    *   c_rarg0   - int adler result
2776    */
2777   address generate_updateBytesAdler32() {
2778     __ align(CodeEntryAlignment);
2779     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
2780     address start = __ pc();
2781 
2782     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
2783 
2784     // Aliases
2785     Register adler  = c_rarg0;
2786     Register s1     = c_rarg0;
2787     Register s2     = c_rarg3;
2788     Register buff   = c_rarg1;
2789     Register len    = c_rarg2;
    Register nmax   = r4;
    Register base   = r5;
    Register count  = r6;
    Register temp0  = rscratch1;
    Register temp1  = rscratch2;
    Register temp2  = r7;
2796 
2797     // Max number of bytes we can process before having to take the mod
2798     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
2799     unsigned long BASE = 0xfff1;
2800     unsigned long NMAX = 0x15B0;
2801 
2802     __ mov(base, BASE);
2803     __ mov(nmax, NMAX);
2804 
2805     // s1 is initialized to the lower 16 bits of adler
2806     // s2 is initialized to the upper 16 bits of adler
2807     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
2808     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
2809 
    // The pipelined loop needs at least 16 bytes for one iteration; it checks
    // this itself, but for short inputs it is cheaper to branch straight to
    // the byte-at-a-time cleanup loop.
2812     __ cmp(len, 16);
2813     __ br(Assembler::HS, L_nmax);
2814     __ cbz(len, L_combine);
2815 
2816     __ bind(L_simple_by1_loop);
2817     __ ldrb(temp0, Address(__ post(buff, 1)));
2818     __ add(s1, s1, temp0);
2819     __ add(s2, s2, s1);
2820     __ subs(len, len, 1);
2821     __ br(Assembler::HI, L_simple_by1_loop);
2822 
2823     // s1 = s1 % BASE
2824     __ subs(temp0, s1, base);
2825     __ csel(s1, temp0, s1, Assembler::HS);
2826 
2827     // s2 = s2 % BASE
2828     __ lsr(temp0, s2, 16);
2829     __ lsl(temp1, temp0, 4);
2830     __ sub(temp1, temp1, temp0);
2831     __ add(s2, temp1, s2, ext::uxth);
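    // (The four instructions above use 2^16 mod 65521 == 15 to fold s2 into
    //  (s2 >> 16) * 15 + (s2 & 0xffff), which is congruent to s2 mod BASE and
    //  small enough for the single conditional subtract below to finish the
    //  reduction.)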
2832 
2833     __ subs(temp0, s2, base);
2834     __ csel(s2, temp0, s2, Assembler::HS);
2835 
2836     __ b(L_combine);
2837 
2838     __ bind(L_nmax);
2839     __ subs(len, len, nmax);
2840     __ sub(count, nmax, 16);
2841     __ br(Assembler::LO, L_by16);
2842 
2843     __ bind(L_nmax_loop);
2844 
2845     __ ldp(temp0, temp1, Address(__ post(buff, 16)));
2846 
2847     __ add(s1, s1, temp0, ext::uxtb);
2848     __ ubfx(temp2, temp0, 8, 8);
2849     __ add(s2, s2, s1);
2850     __ add(s1, s1, temp2);
2851     __ ubfx(temp2, temp0, 16, 8);
2852     __ add(s2, s2, s1);
2853     __ add(s1, s1, temp2);
2854     __ ubfx(temp2, temp0, 24, 8);
2855     __ add(s2, s2, s1);
2856     __ add(s1, s1, temp2);
2857     __ ubfx(temp2, temp0, 32, 8);
2858     __ add(s2, s2, s1);
2859     __ add(s1, s1, temp2);
2860     __ ubfx(temp2, temp0, 40, 8);
2861     __ add(s2, s2, s1);
2862     __ add(s1, s1, temp2);
2863     __ ubfx(temp2, temp0, 48, 8);
2864     __ add(s2, s2, s1);
2865     __ add(s1, s1, temp2);
2866     __ add(s2, s2, s1);
2867     __ add(s1, s1, temp0, Assembler::LSR, 56);
2868     __ add(s2, s2, s1);
2869 
2870     __ add(s1, s1, temp1, ext::uxtb);
2871     __ ubfx(temp2, temp1, 8, 8);
2872     __ add(s2, s2, s1);
2873     __ add(s1, s1, temp2);
2874     __ ubfx(temp2, temp1, 16, 8);
2875     __ add(s2, s2, s1);
2876     __ add(s1, s1, temp2);
2877     __ ubfx(temp2, temp1, 24, 8);
2878     __ add(s2, s2, s1);
2879     __ add(s1, s1, temp2);
2880     __ ubfx(temp2, temp1, 32, 8);
2881     __ add(s2, s2, s1);
2882     __ add(s1, s1, temp2);
2883     __ ubfx(temp2, temp1, 40, 8);
2884     __ add(s2, s2, s1);
2885     __ add(s1, s1, temp2);
2886     __ ubfx(temp2, temp1, 48, 8);
2887     __ add(s2, s2, s1);
2888     __ add(s1, s1, temp2);
2889     __ add(s2, s2, s1);
2890     __ add(s1, s1, temp1, Assembler::LSR, 56);
2891     __ add(s2, s2, s1);
2892 
2893     __ subs(count, count, 16);
2894     __ br(Assembler::HS, L_nmax_loop);
2895 
2896     // s1 = s1 % BASE
2897     __ lsr(temp0, s1, 16);
2898     __ lsl(temp1, temp0, 4);
2899     __ sub(temp1, temp1, temp0);
2900     __ add(temp1, temp1, s1, ext::uxth);
2901 
2902     __ lsr(temp0, temp1, 16);
2903     __ lsl(s1, temp0, 4);
2904     __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext::uxth);
2906 
2907     __ subs(temp0, s1, base);
2908     __ csel(s1, temp0, s1, Assembler::HS);
2909 
2910     // s2 = s2 % BASE
2911     __ lsr(temp0, s2, 16);
2912     __ lsl(temp1, temp0, 4);
2913     __ sub(temp1, temp1, temp0);
2914     __ add(temp1, temp1, s2, ext::uxth);
2915 
2916     __ lsr(temp0, temp1, 16);
2917     __ lsl(s2, temp0, 4);
2918     __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext::uxth);
2920 
2921     __ subs(temp0, s2, base);
2922     __ csel(s2, temp0, s2, Assembler::HS);
2923 
2924     __ subs(len, len, nmax);
2925     __ sub(count, nmax, 16);
2926     __ br(Assembler::HS, L_nmax_loop);
2927 
2928     __ bind(L_by16);
2929     __ adds(len, len, count);
2930     __ br(Assembler::LO, L_by1);
2931 
2932     __ bind(L_by16_loop);
2933 
2934     __ ldp(temp0, temp1, Address(__ post(buff, 16)));
2935 
2936     __ add(s1, s1, temp0, ext::uxtb);
2937     __ ubfx(temp2, temp0, 8, 8);
2938     __ add(s2, s2, s1);
2939     __ add(s1, s1, temp2);
2940     __ ubfx(temp2, temp0, 16, 8);
2941     __ add(s2, s2, s1);
2942     __ add(s1, s1, temp2);
2943     __ ubfx(temp2, temp0, 24, 8);
2944     __ add(s2, s2, s1);
2945     __ add(s1, s1, temp2);
2946     __ ubfx(temp2, temp0, 32, 8);
2947     __ add(s2, s2, s1);
2948     __ add(s1, s1, temp2);
2949     __ ubfx(temp2, temp0, 40, 8);
2950     __ add(s2, s2, s1);
2951     __ add(s1, s1, temp2);
2952     __ ubfx(temp2, temp0, 48, 8);
2953     __ add(s2, s2, s1);
2954     __ add(s1, s1, temp2);
2955     __ add(s2, s2, s1);
2956     __ add(s1, s1, temp0, Assembler::LSR, 56);
2957     __ add(s2, s2, s1);
2958 
2959     __ add(s1, s1, temp1, ext::uxtb);
2960     __ ubfx(temp2, temp1, 8, 8);
2961     __ add(s2, s2, s1);
2962     __ add(s1, s1, temp2);
2963     __ ubfx(temp2, temp1, 16, 8);
2964     __ add(s2, s2, s1);
2965     __ add(s1, s1, temp2);
2966     __ ubfx(temp2, temp1, 24, 8);
2967     __ add(s2, s2, s1);
2968     __ add(s1, s1, temp2);
2969     __ ubfx(temp2, temp1, 32, 8);
2970     __ add(s2, s2, s1);
2971     __ add(s1, s1, temp2);
2972     __ ubfx(temp2, temp1, 40, 8);
2973     __ add(s2, s2, s1);
2974     __ add(s1, s1, temp2);
2975     __ ubfx(temp2, temp1, 48, 8);
2976     __ add(s2, s2, s1);
2977     __ add(s1, s1, temp2);
2978     __ add(s2, s2, s1);
2979     __ add(s1, s1, temp1, Assembler::LSR, 56);
2980     __ add(s2, s2, s1);
2981 
2982     __ subs(len, len, 16);
2983     __ br(Assembler::HS, L_by16_loop);
2984 
2985     __ bind(L_by1);
2986     __ adds(len, len, 15);
2987     __ br(Assembler::LO, L_do_mod);
2988 
2989     __ bind(L_by1_loop);
2990     __ ldrb(temp0, Address(__ post(buff, 1)));
2991     __ add(s1, temp0, s1);
2992     __ add(s2, s2, s1);
2993     __ subs(len, len, 1);
2994     __ br(Assembler::HS, L_by1_loop);
2995 
2996     __ bind(L_do_mod);
2997     // s1 = s1 % BASE
2998     __ lsr(temp0, s1, 16);
2999     __ lsl(temp1, temp0, 4);
3000     __ sub(temp1, temp1, temp0);
3001     __ add(temp1, temp1, s1, ext::uxth);
3002 
3003     __ lsr(temp0, temp1, 16);
3004     __ lsl(s1, temp0, 4);
3005     __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext::uxth);
3007 
3008     __ subs(temp0, s1, base);
3009     __ csel(s1, temp0, s1, Assembler::HS);
3010 
3011     // s2 = s2 % BASE
3012     __ lsr(temp0, s2, 16);
3013     __ lsl(temp1, temp0, 4);
3014     __ sub(temp1, temp1, temp0);
3015     __ add(temp1, temp1, s2, ext::uxth);
3016 
3017     __ lsr(temp0, temp1, 16);
3018     __ lsl(s2, temp0, 4);
3019     __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext::uxth);
3021 
3022     __ subs(temp0, s2, base);
3023     __ csel(s2, temp0, s2, Assembler::HS);
3024 
3025     // Combine lower bits and higher bits
3026     __ bind(L_combine);
3027     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3028 
3029     __ ret(lr);
3030 
3031     return start;
3032   }
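
  // In C, approximately (a sketch of the algorithm implemented above,
  // not the exact code layout; BASE == 65521 and NMAX == 5552):

  // int updateBytesAdler32(int adler, unsigned char *buff, int len) {
  //   unsigned long s1 = adler & 0xffff;
  //   unsigned long s2 = (adler >> 16) & 0xffff;
  //   while (len > 0) {
  //     int n = len < NMAX ? len : NMAX;
  //     len -= n;
  //     while (n--) {
  //       s1 += *buff++;
  //       s2 += s1;
  //     }
  //     s1 %= BASE;
  //     s2 %= BASE;
  //   }
  //   return s1 | (s2 << 16);
  // }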
3033 
3034   /**
3035    *  Arguments:
3036    *
3037    *  Input:
3038    *    c_rarg0   - x address
3039    *    c_rarg1   - x length
3040    *    c_rarg2   - y address
   *    c_rarg3   - y length
3042    *    c_rarg4   - z address
3043    *    c_rarg5   - z length
3044    */
3045   address generate_multiplyToLen() {
3046     __ align(CodeEntryAlignment);
3047     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3048 
3049     address start = __ pc();
3050     const Register x     = r0;
3051     const Register xlen  = r1;
3052     const Register y     = r2;
3053     const Register ylen  = r3;
3054     const Register z     = r4;
3055     const Register zlen  = r5;
3056 
3057     const Register tmp1  = r10;
3058     const Register tmp2  = r11;
3059     const Register tmp3  = r12;
3060     const Register tmp4  = r13;
3061     const Register tmp5  = r14;
3062     const Register tmp6  = r15;
3063     const Register tmp7  = r16;
3064 
3065     BLOCK_COMMENT("Entry:");
3066     __ enter(); // required for proper stackwalking of RuntimeStub frame
3067     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3068     __ leave(); // required for proper stackwalking of RuntimeStub frame
3069     __ ret(lr);
3070 
3071     return start;
3072   }
3073 
3074   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3075                       FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3076                       FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3077     // Karatsuba multiplication performs a 128*128 -> 256-bit
3078     // multiplication in three 128-bit multiplications and a few
3079     // additions.
3080     //
3081     // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3082     // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3083     //
3084     // Inputs:
3085     //
3086     // A0 in a.d[0]     (subkey)
3087     // A1 in a.d[1]
3088     // (A1+A0) in a1_xor_a0.d[0]
3089     //
3090     // B0 in b.d[0]     (state)
3091     // B1 in b.d[1]
3092 
3093     __ ext(tmp1, __ T16B, b, b, 0x08);
3094     __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
3095     __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
3096     __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
3097     __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3098 
3099     __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3100     __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3101     __ eor(tmp2, __ T16B, tmp2, tmp4);
3102     __ eor(tmp2, __ T16B, tmp2, tmp3);
3103 
3104     // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3105     __ ins(result_hi, __ D, tmp2, 0, 1);
3106     __ ins(result_lo, __ D, tmp2, 1, 0);
3107   }
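
  // In pseudocode (a sketch of the arithmetic above, not generated
  // code), with clmul() denoting a 64x64 -> 128-bit carry-less
  // multiply and ^ denoting XOR:
  //
  //   C = clmul(A1, B1);            // high product
  //   D = clmul(A0, B0);            // low product
  //   E = clmul(A1 ^ A0, B1 ^ B0);  // middle product
  //   M = C ^ D ^ E;                // middle 128 bits
  //   result = (C << 128) ^ (M << 64) ^ D;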
3108 
3109   void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3110                     FloatRegister p, FloatRegister z, FloatRegister t1) {
3111     const FloatRegister t0 = result;
3112 
3113     // The GCM field polynomial f is z^128 + p(z), where p =
3114     // z^7+z^2+z+1.
3115     //
3116     //    z^128 === -p(z)  (mod (z^128 + p(z)))
3117     //
3118     // so, given that the product we're reducing is
3119     //    a == lo + hi * z^128
3120     // substituting,
3121     //      === lo - hi * p(z)  (mod (z^128 + p(z)))
3122     //
3123     // we reduce by multiplying hi by p(z) and subtracting the result
3124     // from (i.e. XORing it with) lo.  Because p has no nonzero high
    // bits we can do this with two 64-bit carry-less multiplications,
    // one for each 64-bit half of hi.
3127 
3128     __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3129     __ ext(t1, __ T16B, t0, z, 8);
3130     __ eor(hi, __ T16B, hi, t1);
3131     __ ext(t1, __ T16B, z, t0, 8);
3132     __ eor(lo, __ T16B, lo, t1);
3133     __ pmull(t0, __ T1Q, hi, p, __ T1D);
3134     __ eor(result, __ T16B, lo, t0);
3135   }
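
  // In pseudocode (a sketch of the reduction above, not generated
  // code), with * denoting carry-less multiplication and the 256-bit
  // input split as a == lo + (hi1:hi0) * z^128:
  //
  //   t       = hi1 * p;         // at most 71 bits
  //   hi0    ^= t >> 64;         // fold the overflow of t back in
  //   lo     ^= t << 64;
  //   result  = lo ^ (hi0 * p);  // second fold completes the reduction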
3136 
3137   /**
3138    *  Arguments:
3139    *
3140    *  Input:
3141    *  c_rarg0   - current state address
3142    *  c_rarg1   - H key address
3143    *  c_rarg2   - data address
3144    *  c_rarg3   - number of blocks
3145    *
3146    *  Output:
3147    *  Updated state at c_rarg0
3148    */
3149   address generate_ghash_processBlocks() {
3150     // Bafflingly, GCM uses little-endian for the byte order, but
3151     // big-endian for the bit order.  For example, the polynomial 1 is
3152     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
3153     //
3154     // So, we must either reverse the bytes in each word and do
3155     // everything big-endian or reverse the bits in each byte and do
3156     // it little-endian.  On AArch64 it's more idiomatic to reverse
3157     // the bits in each byte (we have an instruction, RBIT, to do
    // that) and keep the data in little-endian bit order throughout the
3159     // calculation, bit-reversing the inputs and outputs.
3160 
3161     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
3162     __ align(wordSize * 2);
3163     address p = __ pc();
3164     __ emit_int64(0x87);  // The low-order bits of the field
3165                           // polynomial (i.e. p = z^7+z^2+z+1)
3166                           // repeated in the low and high parts of a
3167                           // 128-bit vector
3168     __ emit_int64(0x87);
3169 
3170     __ align(CodeEntryAlignment);
3171     address start = __ pc();
3172 
3173     Register state   = c_rarg0;
3174     Register subkeyH = c_rarg1;
3175     Register data    = c_rarg2;
3176     Register blocks  = c_rarg3;
3177 
3178     FloatRegister vzr = v30;
3179     __ eor(vzr, __ T16B, vzr, vzr); // zero register
3180 
3181     __ ldrq(v0, Address(state));
3182     __ ldrq(v1, Address(subkeyH));
3183 
3184     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
3185     __ rbit(v0, __ T16B, v0);
3186     __ rev64(v1, __ T16B, v1);
3187     __ rbit(v1, __ T16B, v1);
3188 
3189     __ ldrq(v26, p);
3190 
    __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v16
3192     __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
3193 
3194     {
3195       Label L_ghash_loop;
3196       __ bind(L_ghash_loop);
3197 
3198       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
3199                                                  // reversing each byte
3200       __ rbit(v2, __ T16B, v2);
3201       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
3202 
3203       // Multiply state in v2 by subkey in v1
3204       ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
3205                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
3206                      /*temps*/v6, v20, v18, v21);
3207       // Reduce v7:v5 by the field polynomial
3208       ghash_reduce(v0, v5, v7, v26, vzr, v20);
3209 
3210       __ sub(blocks, blocks, 1);
3211       __ cbnz(blocks, L_ghash_loop);
3212     }
3213 
3214     // The bit-reversed result is at this point in v0
3215     __ rev64(v1, __ T16B, v0);
3216     __ rbit(v1, __ T16B, v1);
3217 
3218     __ st1(v1, __ T16B, state);
3219     __ ret(lr);
3220 
3221     return start;
3222   }
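
  // In pseudocode, the loop above computes (a sketch, not generated
  // code), with * denoting multiplication in GF(2^128) modulo
  // z^128 + z^7 + z^2 + z + 1:
  //
  //   for (i = 0; i < blocks; i++)
  //     state = (state ^ data[i]) * H;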
3223 
3224   // Continuation point for throwing of implicit exceptions that are
3225   // not handled in the current activation. Fabricates an exception
3226   // oop and initiates normal exception dispatching in this
3227   // frame. Since we need to preserve callee-saved values (currently
3228   // only for C2, but done for C1 as well) we need a callee-saved oop
3229   // map and therefore have to make these stubs into RuntimeStubs
3230   // rather than BufferBlobs.  If the compiler needs all registers to
3231   // be preserved between the fault point and the exception handler
3232   // then it must assume responsibility for that in
3233   // AbstractCompiler::continuation_for_implicit_null_exception or
3234   // continuation_for_implicit_division_by_zero_exception. All other
3235   // implicit exceptions (e.g., NullPointerException or
3236   // AbstractMethodError on entry) are either at call sites or
3237   // otherwise assume that stack unwinding will be initiated, so
3238   // caller saved registers were assumed volatile in the compiler.
3239 
3240 #undef __
3241 #define __ masm->
3242 
3243   address generate_throw_exception(const char* name,
3244                                    address runtime_entry,
3245                                    Register arg1 = noreg,
3246                                    Register arg2 = noreg) {
3247     // Information about frame layout at time of blocking runtime call.
3248     // Note that we only have to preserve callee-saved registers since
3249     // the compilers are responsible for supplying a continuation point
3250     // if they expect all registers to be preserved.
3251     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
3252     enum layout {
3253       rfp_off = 0,
3254       rfp_off2,
3255       return_off,
3256       return_off2,
3257       framesize // inclusive of return address
3258     };
3259 
3260     int insts_size = 512;
3261     int locs_size  = 64;
3262 
3263     CodeBuffer code(name, insts_size, locs_size);
3264     OopMapSet* oop_maps  = new OopMapSet();
3265     MacroAssembler* masm = new MacroAssembler(&code);
3266 
3267     address start = __ pc();
3268 
3269     // This is an inlined and slightly modified version of call_VM
3270     // which has the ability to fetch the return PC out of
3271     // thread-local storage and also sets up last_Java_sp slightly
3272     // differently than the real call_VM
3273 
3274     __ enter(); // Save FP and LR before call
3275 
3276     assert(is_even(framesize/2), "sp not 16-byte aligned");
3277 
3278     // lr and fp are already in place
3279     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
3280 
3281     int frame_complete = __ pc() - start;
3282 
3283     // Set up last_Java_sp and last_Java_fp
3284     address the_pc = __ pc();
3285     __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1);
3286 
3287     // Call runtime
3288     if (arg1 != noreg) {
3289       assert(arg2 != c_rarg1, "clobbered");
3290       __ mov(c_rarg1, arg1);
3291     }
3292     if (arg2 != noreg) {
3293       __ mov(c_rarg2, arg2);
3294     }
3295     __ mov(c_rarg0, rthread);
3296     BLOCK_COMMENT("call runtime_entry");
3297     __ mov(rscratch1, runtime_entry);
3298     __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);
3299 
3300     // Generate oop map
3301     OopMap* map = new OopMap(framesize, 0);
3302 
3303     oop_maps->add_gc_map(the_pc - start, map);
3304 
3305     __ reset_last_Java_frame(true, true);
3306     __ maybe_isb();
3307 
3308     __ leave();
3309 
3310     // check for pending exceptions
3311 #ifdef ASSERT
3312     Label L;
3313     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
3314     __ cbnz(rscratch1, L);
3315     __ should_not_reach_here();
3316     __ bind(L);
3317 #endif // ASSERT
3318     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3319 
3320 
3321     // codeBlob framesize is in words (not VMRegImpl::slot_size)
3322     RuntimeStub* stub =
3323       RuntimeStub::new_runtime_stub(name,
3324                                     &code,
3325                                     frame_complete,
3326                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3327                                     oop_maps, false);
3328     return stub->entry_point();
3329   }
3330 
3331   class MontgomeryMultiplyGenerator : public MacroAssembler {
3332 
3333     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
3334       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
3335 
3336     RegSet _toSave;
3337     bool _squaring;
3338 
3339   public:
3340     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
3341       : MacroAssembler(as->code()), _squaring(squaring) {
3342 
3343       // Register allocation
3344 
3345       Register reg = c_rarg0;
3346       Pa_base = reg;       // Argument registers
3347       if (squaring)
3348         Pb_base = Pa_base;
3349       else
3350         Pb_base = ++reg;
3351       Pn_base = ++reg;
      Rlen = ++reg;
3353       inv = ++reg;
3354       Pm_base = ++reg;
3355 
3356                           // Working registers:
3357       Ra =  ++reg;        // The current digit of a, b, n, and m.
3358       Rb =  ++reg;
3359       Rm =  ++reg;
3360       Rn =  ++reg;
3361 
3362       Pa =  ++reg;        // Pointers to the current/next digit of a, b, n, and m.
3363       Pb =  ++reg;
3364       Pm =  ++reg;
3365       Pn =  ++reg;
3366 
3367       t0 =  ++reg;        // Three registers which form a
      t1 =  ++reg;        // triple-precision accumulator.
3369       t2 =  ++reg;
3370 
3371       Ri =  ++reg;        // Inner and outer loop indexes.
3372       Rj =  ++reg;
3373 
3374       Rhi_ab = ++reg;     // Product registers: low and high parts
3375       Rlo_ab = ++reg;     // of a*b and m*n.
3376       Rhi_mn = ++reg;
3377       Rlo_mn = ++reg;
3378 
3379       // r19 and up are callee-saved.
3380       _toSave = RegSet::range(r19, reg) + Pm_base;
3381     }
3382 
3383   private:
3384     void save_regs() {
3385       push(_toSave, sp);
3386     }
3387 
3388     void restore_regs() {
3389       pop(_toSave, sp);
3390     }
3391 
3392     template <typename T>
3393     void unroll_2(Register count, T block) {
3394       Label loop, end, odd;
3395       tbnz(count, 0, odd);
3396       cbz(count, end);
3397       align(16);
3398       bind(loop);
3399       (this->*block)();
3400       bind(odd);
3401       (this->*block)();
3402       subs(count, count, 2);
3403       br(Assembler::GT, loop);
3404       bind(end);
3405     }
3406 
3407     template <typename T>
3408     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
3409       Label loop, end, odd;
3410       tbnz(count, 0, odd);
3411       cbz(count, end);
3412       align(16);
3413       bind(loop);
3414       (this->*block)(d, s, tmp);
3415       bind(odd);
3416       (this->*block)(d, s, tmp);
3417       subs(count, count, 2);
3418       br(Assembler::GT, loop);
3419       bind(end);
3420     }
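
    // In C, approximately (a sketch of the control flow generated by
    // unroll_2 above, which unrolls the body twice and handles an odd
    // trip count by entering at the second copy):
    //
    //   if (count & 1) goto odd;
    //   if (count == 0) goto end;
    //  loop: block();
    //  odd:  block();
    //        count -= 2;
    //        if ((long)count > 0) goto loop;
    //  end:  ;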
3421 
3422     void pre1(RegisterOrConstant i) {
3423       block_comment("pre1");
3424       // Pa = Pa_base;
3425       // Pb = Pb_base + i;
3426       // Pm = Pm_base;
3427       // Pn = Pn_base + i;
3428       // Ra = *Pa;
3429       // Rb = *Pb;
3430       // Rm = *Pm;
3431       // Rn = *Pn;
3432       ldr(Ra, Address(Pa_base));
3433       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
3434       ldr(Rm, Address(Pm_base));
3435       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3436       lea(Pa, Address(Pa_base));
3437       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
3438       lea(Pm, Address(Pm_base));
3439       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3440 
3441       // Zero the m*n result.
3442       mov(Rhi_mn, zr);
3443       mov(Rlo_mn, zr);
3444     }
3445 
3446     // The core multiply-accumulate step of a Montgomery
3447     // multiplication.  The idea is to schedule operations as a
3448     // pipeline so that instructions with long latencies (loads and
3449     // multiplies) have time to complete before their results are
    // used.  This benefits in-order implementations of the
    // architecture the most, but out-of-order ones also benefit.
3452     void step() {
3453       block_comment("step");
3454       // MACC(Ra, Rb, t0, t1, t2);
3455       // Ra = *++Pa;
3456       // Rb = *--Pb;
3457       umulh(Rhi_ab, Ra, Rb);
3458       mul(Rlo_ab, Ra, Rb);
3459       ldr(Ra, pre(Pa, wordSize));
3460       ldr(Rb, pre(Pb, -wordSize));
3461       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
3462                                        // previous iteration.
3463       // MACC(Rm, Rn, t0, t1, t2);
3464       // Rm = *++Pm;
3465       // Rn = *--Pn;
3466       umulh(Rhi_mn, Rm, Rn);
3467       mul(Rlo_mn, Rm, Rn);
3468       ldr(Rm, pre(Pm, wordSize));
3469       ldr(Rn, pre(Pn, -wordSize));
3470       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
3471     }
3472 
3473     void post1() {
3474       block_comment("post1");
3475 
3476       // MACC(Ra, Rb, t0, t1, t2);
3477       // Ra = *++Pa;
3478       // Rb = *--Pb;
3479       umulh(Rhi_ab, Ra, Rb);
3480       mul(Rlo_ab, Ra, Rb);
3481       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
3482       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
3483 
3484       // *Pm = Rm = t0 * inv;
3485       mul(Rm, t0, inv);
3486       str(Rm, Address(Pm));
3487 
3488       // MACC(Rm, Rn, t0, t1, t2);
3489       // t0 = t1; t1 = t2; t2 = 0;
3490       umulh(Rhi_mn, Rm, Rn);
3491 
3492 #ifndef PRODUCT
3493       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
3494       {
3495         mul(Rlo_mn, Rm, Rn);
3496         add(Rlo_mn, t0, Rlo_mn);
3497         Label ok;
3498         cbz(Rlo_mn, ok); {
3499           stop("broken Montgomery multiply");
3500         } bind(ok);
3501       }
3502 #endif
3503       // We have very carefully set things up so that
3504       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
3505       // the lower half of Rm * Rn because we know the result already:
3506       // it must be -t0.  t0 + (-t0) must generate a carry iff
3507       // t0 != 0.  So, rather than do a mul and an adds we just set
3508       // the carry flag iff t0 is nonzero.
3509       //
3510       // mul(Rlo_mn, Rm, Rn);
3511       // adds(zr, t0, Rlo_mn);
3512       subs(zr, t0, 1); // Set carry iff t0 is nonzero
3513       adcs(t0, t1, Rhi_mn);
3514       adc(t1, t2, zr);
3515       mov(t2, zr);
3516     }
3517 
3518     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
3519       block_comment("pre2");
3520       // Pa = Pa_base + i-len;
3521       // Pb = Pb_base + len;
3522       // Pm = Pm_base + i-len;
3523       // Pn = Pn_base + len;
3524 
3525       if (i.is_register()) {
3526         sub(Rj, i.as_register(), len);
3527       } else {
3528         mov(Rj, i.as_constant());
3529         sub(Rj, Rj, len);
3530       }
3531       // Rj == i-len
3532 
3533       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
3534       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
3535       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
3536       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
3537 
3538       // Ra = *++Pa;
3539       // Rb = *--Pb;
3540       // Rm = *++Pm;
3541       // Rn = *--Pn;
3542       ldr(Ra, pre(Pa, wordSize));
3543       ldr(Rb, pre(Pb, -wordSize));
3544       ldr(Rm, pre(Pm, wordSize));
3545       ldr(Rn, pre(Pn, -wordSize));
3546 
3547       mov(Rhi_mn, zr);
3548       mov(Rlo_mn, zr);
3549     }
3550 
3551     void post2(RegisterOrConstant i, RegisterOrConstant len) {
3552       block_comment("post2");
3553       if (i.is_constant()) {
3554         mov(Rj, i.as_constant()-len.as_constant());
3555       } else {
3556         sub(Rj, i.as_register(), len);
3557       }
3558 
3559       adds(t0, t0, Rlo_mn); // The pending m*n, low part
3560 
3561       // As soon as we know the least significant digit of our result,
3562       // store it.
3563       // Pm_base[i-len] = t0;
3564       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
3565 
3566       // t0 = t1; t1 = t2; t2 = 0;
3567       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
3568       adc(t1, t2, zr);
3569       mov(t2, zr);
3570     }
3571 
3572     // A carry in t0 after Montgomery multiplication means that we
3573     // should subtract multiples of n from our result in m.  We'll
3574     // keep doing that until there is no carry.
3575     void normalize(RegisterOrConstant len) {
3576       block_comment("normalize");
3577       // while (t0)
3578       //   t0 = sub(Pm_base, Pn_base, t0, len);
3579       Label loop, post, again;
3580       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
3581       cbz(t0, post); {
3582         bind(again); {
3583           mov(i, zr);
3584           mov(cnt, len);
3585           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
3586           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3587           subs(zr, zr, zr); // set carry flag, i.e. no borrow
3588           align(16);
3589           bind(loop); {
3590             sbcs(Rm, Rm, Rn);
3591             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
3592             add(i, i, 1);
3593             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
3594             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3595             sub(cnt, cnt, 1);
3596           } cbnz(cnt, loop);
3597           sbc(t0, t0, zr);
3598         } cbnz(t0, again);
3599       } bind(post);
3600     }
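
    // In C, approximately (a sketch of one pass of the subtraction
    // performed by the loop above; sub() is the helper named in the
    // comment, not a routine defined elsewhere):
    //
    //   unsigned long sub(unsigned long Pm[], unsigned long Pn[],
    //                     unsigned long t0, int len) {
    //     unsigned long borrow = 0;
    //     for (int i = 0; i < len; i++) {
    //       unsigned long s = Pn[i] + borrow;
    //       unsigned long b = (s < borrow) | (Pm[i] < s);
    //       Pm[i] -= s;
    //       borrow = b;
    //     }
    //     return t0 - borrow;
    //   }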
3601 
3602     // Move memory at s to d, reversing words.
3603     //    Increments d to end of copied memory
3604     //    Destroys tmp1, tmp2
3605     //    Preserves len
3606     //    Leaves s pointing to the address which was in d at start
3607     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
3608       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
3609 
3610       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
3611       mov(tmp1, len);
3612       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
3613       sub(s, d, len, ext::uxtw, LogBytesPerWord);
3614     }
3615     // where
3616     void reverse1(Register d, Register s, Register tmp) {
3617       ldr(tmp, pre(s, -wordSize));
3618       ror(tmp, tmp, 32);
3619       str(tmp, post(d, wordSize));
3620     }
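
    // In C, approximately (a sketch of reverse() above; len counts
    // 64-bit words, and swapping the 32-bit halves of each word puts
    // the jint digits into fully reversed order):
    //
    //   void reverse(unsigned long *&d, unsigned long *&s, int len) {
    //     s += len;
    //     for (int i = 0; i < len; i++) {
    //       unsigned long x = *--s;
    //       *d++ = (x << 32) | (x >> 32);
    //     }
    //     s = d - len;  // s now points where d pointed on entry
    //   }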
3621 
3622     void step_squaring() {
3623       // An extra ACC
3624       step();
3625       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
3626     }
3627 
3628     void last_squaring(RegisterOrConstant i) {
3629       Label dont;
3630       // if ((i & 1) == 0) {
3631       tbnz(i.as_register(), 0, dont); {
3632         // MACC(Ra, Rb, t0, t1, t2);
3633         // Ra = *++Pa;
3634         // Rb = *--Pb;
3635         umulh(Rhi_ab, Ra, Rb);
3636         mul(Rlo_ab, Ra, Rb);
3637         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
3638       } bind(dont);
3639     }
3640 
3641     void extra_step_squaring() {
3642       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
3643 
3644       // MACC(Rm, Rn, t0, t1, t2);
3645       // Rm = *++Pm;
3646       // Rn = *--Pn;
3647       umulh(Rhi_mn, Rm, Rn);
3648       mul(Rlo_mn, Rm, Rn);
3649       ldr(Rm, pre(Pm, wordSize));
3650       ldr(Rn, pre(Pn, -wordSize));
3651     }
3652 
3653     void post1_squaring() {
3654       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
3655 
3656       // *Pm = Rm = t0 * inv;
3657       mul(Rm, t0, inv);
3658       str(Rm, Address(Pm));
3659 
3660       // MACC(Rm, Rn, t0, t1, t2);
3661       // t0 = t1; t1 = t2; t2 = 0;
3662       umulh(Rhi_mn, Rm, Rn);
3663 
3664 #ifndef PRODUCT
3665       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
3666       {
3667         mul(Rlo_mn, Rm, Rn);
3668         add(Rlo_mn, t0, Rlo_mn);
3669         Label ok;
3670         cbz(Rlo_mn, ok); {
3671           stop("broken Montgomery multiply");
3672         } bind(ok);
3673       }
3674 #endif
3675       // We have very carefully set things up so that
3676       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
3677       // the lower half of Rm * Rn because we know the result already:
3678       // it must be -t0.  t0 + (-t0) must generate a carry iff
3679       // t0 != 0.  So, rather than do a mul and an adds we just set
3680       // the carry flag iff t0 is nonzero.
3681       //
3682       // mul(Rlo_mn, Rm, Rn);
3683       // adds(zr, t0, Rlo_mn);
3684       subs(zr, t0, 1); // Set carry iff t0 is nonzero
3685       adcs(t0, t1, Rhi_mn);
3686       adc(t1, t2, zr);
3687       mov(t2, zr);
3688     }
3689 
3690     void acc(Register Rhi, Register Rlo,
3691              Register t0, Register t1, Register t2) {
3692       adds(t0, t0, Rlo);
3693       adcs(t1, t1, Rhi);
3694       adc(t2, t2, zr);
3695     }
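
    // In C++, the MACC step referred to in the comments is approximately
    // (a sketch; t2:t1:t0 is the triple-precision accumulator, and the
    // 128-bit type is an assumption made for illustration):
    //
    //   void MACC(unsigned long A, unsigned long B,
    //             unsigned long &t0, unsigned long &t1, unsigned long &t2) {
    //     unsigned __int128 p = (unsigned __int128)A * B;
    //     unsigned __int128 s = (unsigned __int128)t0 + (unsigned long)p;
    //     t0 = (unsigned long)s;
    //     s = (s >> 64) + t1 + (unsigned long)(p >> 64);
    //     t1 = (unsigned long)s;
    //     t2 += (unsigned long)(s >> 64);
    //   }
    //
    // MACC2 is the same but adds the product A*B twice.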
3696 
3697   public:
3698     /**
3699      * Fast Montgomery multiplication.  The derivation of the
3700      * algorithm is in A Cryptographic Library for the Motorola
3701      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
3702      *
3703      * Arguments:
3704      *
3705      * Inputs for multiplication:
3706      *   c_rarg0   - int array elements a
3707      *   c_rarg1   - int array elements b
3708      *   c_rarg2   - int array elements n (the modulus)
3709      *   c_rarg3   - int length
3710      *   c_rarg4   - int inv
3711      *   c_rarg5   - int array elements m (the result)
3712      *
3713      * Inputs for squaring:
3714      *   c_rarg0   - int array elements a
3715      *   c_rarg1   - int array elements n (the modulus)
3716      *   c_rarg2   - int length
3717      *   c_rarg3   - int inv
3718      *   c_rarg4   - int array elements m (the result)
3719      *
3720      */
3721     address generate_multiply() {
3722       Label argh, nothing;
3723       bind(argh);
3724       stop("MontgomeryMultiply total_allocation must be <= 8192");
3725 
3726       align(CodeEntryAlignment);
3727       address entry = pc();
3728 
3729       cbzw(Rlen, nothing);
3730 
3731       enter();
3732 
3733       // Make room.
3734       cmpw(Rlen, 512);
3735       br(Assembler::HI, argh);
3736       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
3737       andr(sp, Ra, -2 * wordSize);
3738 
3739       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
3740 
3741       {
3742         // Copy input args, reversing as we go.  We use Ra as a
3743         // temporary variable.
3744         reverse(Ra, Pa_base, Rlen, t0, t1);
3745         if (!_squaring)
3746           reverse(Ra, Pb_base, Rlen, t0, t1);
3747         reverse(Ra, Pn_base, Rlen, t0, t1);
3748       }
3749 
3750       // Push all call-saved registers and also Pm_base which we'll need
3751       // at the end.
3752       save_regs();
3753 
3754 #ifndef PRODUCT
3755       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
3756       {
3757         ldr(Rn, Address(Pn_base, 0));
3758         mul(Rlo_mn, Rn, inv);
3759         cmp(Rlo_mn, -1);
3760         Label ok;
3761         br(EQ, ok); {
3762           stop("broken inverse in Montgomery multiply");
3763         } bind(ok);
3764       }
3765 #endif
3766 
3767       mov(Pm_base, Ra);
3768 
3769       mov(t0, zr);
3770       mov(t1, zr);
3771       mov(t2, zr);
3772 
3773       block_comment("for (int i = 0; i < len; i++) {");
3774       mov(Ri, zr); {
3775         Label loop, end;
3776         cmpw(Ri, Rlen);
3777         br(Assembler::GE, end);
3778 
3779         bind(loop);
3780         pre1(Ri);
3781 
3782         block_comment("  for (j = i; j; j--) {"); {
3783           movw(Rj, Ri);
3784           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
3785         } block_comment("  } // j");
3786 
3787         post1();
3788         addw(Ri, Ri, 1);
3789         cmpw(Ri, Rlen);
3790         br(Assembler::LT, loop);
3791         bind(end);
3792         block_comment("} // i");
3793       }
3794 
3795       block_comment("for (int i = len; i < 2*len; i++) {");
3796       mov(Ri, Rlen); {
3797         Label loop, end;
3798         cmpw(Ri, Rlen, Assembler::LSL, 1);
3799         br(Assembler::GE, end);
3800 
3801         bind(loop);
3802         pre2(Ri, Rlen);
3803 
3804         block_comment("  for (j = len*2-i-1; j; j--) {"); {
3805           lslw(Rj, Rlen, 1);
3806           subw(Rj, Rj, Ri);
3807           subw(Rj, Rj, 1);
3808           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
3809         } block_comment("  } // j");
3810 
3811         post2(Ri, Rlen);
3812         addw(Ri, Ri, 1);
3813         cmpw(Ri, Rlen, Assembler::LSL, 1);
3814         br(Assembler::LT, loop);
3815         bind(end);
3816       }
3817       block_comment("} // i");
3818 
3819       normalize(Rlen);
3820 
3821       mov(Ra, Pm_base);  // Save Pm_base in Ra
3822       restore_regs();  // Restore caller's Pm_base
3823 
3824       // Copy our result into caller's Pm_base
3825       reverse(Pm_base, Ra, Rlen, t0, t1);
3826 
3827       leave();
3828       bind(nothing);
3829       ret(lr);
3830 
3831       return entry;
3832     }
3833     // In C, approximately:
3834 
3835     // void
3836     // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
3837     //                     unsigned long Pn_base[], unsigned long Pm_base[],
3838     //                     unsigned long inv, int len) {
3839     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3840     //   unsigned long *Pa, *Pb, *Pn, *Pm;
3841     //   unsigned long Ra, Rb, Rn, Rm;
3842 
3843     //   int i;
3844 
3845     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
3846 
3847     //   for (i = 0; i < len; i++) {
3848     //     int j;
3849 
3850     //     Pa = Pa_base;
3851     //     Pb = Pb_base + i;
3852     //     Pm = Pm_base;
3853     //     Pn = Pn_base + i;
3854 
3855     //     Ra = *Pa;
3856     //     Rb = *Pb;
3857     //     Rm = *Pm;
3858     //     Rn = *Pn;
3859 
3860     //     int iters = i;
3861     //     for (j = 0; iters--; j++) {
3862     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
3863     //       MACC(Ra, Rb, t0, t1, t2);
3864     //       Ra = *++Pa;
3865     //       Rb = *--Pb;
3866     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
3867     //       MACC(Rm, Rn, t0, t1, t2);
3868     //       Rm = *++Pm;
3869     //       Rn = *--Pn;
3870     //     }
3871 
3872     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
3873     //     MACC(Ra, Rb, t0, t1, t2);
3874     //     *Pm = Rm = t0 * inv;
3875     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
3876     //     MACC(Rm, Rn, t0, t1, t2);
3877 
3878     //     assert(t0 == 0, "broken Montgomery multiply");
3879 
3880     //     t0 = t1; t1 = t2; t2 = 0;
3881     //   }
3882 
3883     //   for (i = len; i < 2*len; i++) {
3884     //     int j;
3885 
3886     //     Pa = Pa_base + i-len;
3887     //     Pb = Pb_base + len;
3888     //     Pm = Pm_base + i-len;
3889     //     Pn = Pn_base + len;
3890 
3891     //     Ra = *++Pa;
3892     //     Rb = *--Pb;
3893     //     Rm = *++Pm;
3894     //     Rn = *--Pn;
3895 
3896     //     int iters = len*2-i-1;
3897     //     for (j = i-len+1; iters--; j++) {
3898     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
3899     //       MACC(Ra, Rb, t0, t1, t2);
3900     //       Ra = *++Pa;
3901     //       Rb = *--Pb;
3902     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
3903     //       MACC(Rm, Rn, t0, t1, t2);
3904     //       Rm = *++Pm;
3905     //       Rn = *--Pn;
3906     //     }
3907 
3908     //     Pm_base[i-len] = t0;
3909     //     t0 = t1; t1 = t2; t2 = 0;
3910     //   }
3911 
3912     //   while (t0)
3913     //     t0 = sub(Pm_base, Pn_base, t0, len);
3914     // }
3915 
3916     /**
3917      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
3918      * multiplies than Montgomery multiplication so it should be up to
3919      * 25% faster.  However, its loop control is more complex and it
3920      * may actually run slower on some machines.
3921      *
3922      * Arguments:
3923      *
3924      * Inputs:
3925      *   c_rarg0   - int array elements a
3926      *   c_rarg1   - int array elements n (the modulus)
3927      *   c_rarg2   - int length
3928      *   c_rarg3   - int inv
3929      *   c_rarg4   - int array elements m (the result)
3930      *
3931      */
3932     address generate_square() {
3933       Label argh;
3934       bind(argh);
3935       stop("MontgomeryMultiply total_allocation must be <= 8192");
3936 
3937       align(CodeEntryAlignment);
3938       address entry = pc();
3939 
3940       enter();
3941 
3942       // Make room.
3943       cmpw(Rlen, 512);
3944       br(Assembler::HI, argh);
3945       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
3946       andr(sp, Ra, -2 * wordSize);
3947 
3948       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
3949 
3950       {
3951         // Copy input args, reversing as we go.  We use Ra as a
3952         // temporary variable.
3953         reverse(Ra, Pa_base, Rlen, t0, t1);
3954         reverse(Ra, Pn_base, Rlen, t0, t1);
3955       }
3956 
3957       // Push all call-saved registers and also Pm_base which we'll need
3958       // at the end.
3959       save_regs();
3960 
3961       mov(Pm_base, Ra);
3962 
3963       mov(t0, zr);
3964       mov(t1, zr);
3965       mov(t2, zr);
3966 
3967       block_comment("for (int i = 0; i < len; i++) {");
3968       mov(Ri, zr); {
3969         Label loop, end;
3970         bind(loop);
3971         cmp(Ri, Rlen);
3972         br(Assembler::GE, end);
3973 
3974         pre1(Ri);
3975 
        block_comment("  for (j = (i+1)/2; j; j--) {"); {
3977           add(Rj, Ri, 1);
3978           lsr(Rj, Rj, 1);
3979           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
3980         } block_comment("  } // j");
3981 
3982         last_squaring(Ri);
3983 
3984         block_comment("  for (j = i/2; j; j--) {"); {
3985           lsr(Rj, Ri, 1);
3986           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
3987         } block_comment("  } // j");
3988 
3989         post1_squaring();
3990         add(Ri, Ri, 1);
3991         cmp(Ri, Rlen);
3992         br(Assembler::LT, loop);
3993 
3994         bind(end);
3995         block_comment("} // i");
3996       }
3997 
3998       block_comment("for (int i = len; i < 2*len; i++) {");
3999       mov(Ri, Rlen); {
4000         Label loop, end;
4001         bind(loop);
4002         cmp(Ri, Rlen, Assembler::LSL, 1);
4003         br(Assembler::GE, end);
4004 
4005         pre2(Ri, Rlen);
4006 
4007         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
4008           lsl(Rj, Rlen, 1);
4009           sub(Rj, Rj, Ri);
4010           sub(Rj, Rj, 1);
4011           lsr(Rj, Rj, 1);
4012           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4013         } block_comment("  } // j");
4014 
4015         last_squaring(Ri);
4016 
4017         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
4018           lsl(Rj, Rlen, 1);
4019           sub(Rj, Rj, Ri);
4020           lsr(Rj, Rj, 1);
4021           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4022         } block_comment("  } // j");
4023 
4024         post2(Ri, Rlen);
4025         add(Ri, Ri, 1);
4026         cmp(Ri, Rlen, Assembler::LSL, 1);
4027 
4028         br(Assembler::LT, loop);
4029         bind(end);
4030         block_comment("} // i");
4031       }
4032 
4033       normalize(Rlen);
4034 
4035       mov(Ra, Pm_base);  // Save Pm_base in Ra
4036       restore_regs();  // Restore caller's Pm_base
4037 
4038       // Copy our result into caller's Pm_base
4039       reverse(Pm_base, Ra, Rlen, t0, t1);
4040 
4041       leave();
4042       ret(lr);
4043 
4044       return entry;
4045     }
4046     // In C, approximately:
4047 
4048     // void
4049     // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
4050     //                   unsigned long Pm_base[], unsigned long inv, int len) {
4051     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4052     //   unsigned long *Pa, *Pb, *Pn, *Pm;
4053     //   unsigned long Ra, Rb, Rn, Rm;
4054 
4055     //   int i;
4056 
4057     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
4058 
4059     //   for (i = 0; i < len; i++) {
4060     //     int j;
4061 
4062     //     Pa = Pa_base;
4063     //     Pb = Pa_base + i;
4064     //     Pm = Pm_base;
4065     //     Pn = Pn_base + i;
4066 
4067     //     Ra = *Pa;
4068     //     Rb = *Pb;
4069     //     Rm = *Pm;
4070     //     Rn = *Pn;
4071 
4072     //     int iters = (i+1)/2;
4073     //     for (j = 0; iters--; j++) {
4074     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4075     //       MACC2(Ra, Rb, t0, t1, t2);
4076     //       Ra = *++Pa;
4077     //       Rb = *--Pb;
4078     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4079     //       MACC(Rm, Rn, t0, t1, t2);
4080     //       Rm = *++Pm;
4081     //       Rn = *--Pn;
4082     //     }
4083     //     if ((i & 1) == 0) {
4084     //       assert(Ra == Pa_base[j], "must be");
4085     //       MACC(Ra, Ra, t0, t1, t2);
4086     //     }
4087     //     iters = i/2;
4088     //     assert(iters == i-j, "must be");
4089     //     for (; iters--; j++) {
4090     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4091     //       MACC(Rm, Rn, t0, t1, t2);
4092     //       Rm = *++Pm;
4093     //       Rn = *--Pn;
4094     //     }
4095 
4096     //     *Pm = Rm = t0 * inv;
4097     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
4098     //     MACC(Rm, Rn, t0, t1, t2);
4099 
4100     //     assert(t0 == 0, "broken Montgomery multiply");
4101 
4102     //     t0 = t1; t1 = t2; t2 = 0;
4103     //   }
4104 
4105     //   for (i = len; i < 2*len; i++) {
4106     //     int start = i-len+1;
4107     //     int end = start + (len - start)/2;
4108     //     int j;
4109 
4110     //     Pa = Pa_base + i-len;
4111     //     Pb = Pa_base + len;
4112     //     Pm = Pm_base + i-len;
4113     //     Pn = Pn_base + len;
4114 
4115     //     Ra = *++Pa;
4116     //     Rb = *--Pb;
4117     //     Rm = *++Pm;
4118     //     Rn = *--Pn;
4119 
4120     //     int iters = (2*len-i-1)/2;
4121     //     assert(iters == end-start, "must be");
4122     //     for (j = start; iters--; j++) {
4123     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4124     //       MACC2(Ra, Rb, t0, t1, t2);
4125     //       Ra = *++Pa;
4126     //       Rb = *--Pb;
4127     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4128     //       MACC(Rm, Rn, t0, t1, t2);
4129     //       Rm = *++Pm;
4130     //       Rn = *--Pn;
4131     //     }
4132     //     if ((i & 1) == 0) {
4133     //       assert(Ra == Pa_base[j], "must be");
4134     //       MACC(Ra, Ra, t0, t1, t2);
4135     //     }
4136     //     iters =  (2*len-i)/2;
4137     //     assert(iters == len-j, "must be");
4138     //     for (; iters--; j++) {
4139     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4140     //       MACC(Rm, Rn, t0, t1, t2);
4141     //       Rm = *++Pm;
4142     //       Rn = *--Pn;
4143     //     }
4144     //     Pm_base[i-len] = t0;
4145     //     t0 = t1; t1 = t2; t2 = 0;
4146     //   }
4147 
4148     //   while (t0)
4149     //     t0 = sub(Pm_base, Pn_base, t0, len);
4150     // }
4151   };
4152 
4153   // Initialization
4154   void generate_initial() {
    // Generates the initial stubs and initializes the entry points
4156 
    // entry points that exist on all platforms.  Note: This is code
4158     // that could be shared among different platforms - however the
4159     // benefit seems to be smaller than the disadvantage of having a
4160     // much more complicated generator structure. See also comment in
4161     // stubRoutines.hpp.
4162 
4163     StubRoutines::_forward_exception_entry = generate_forward_exception();
4164 
4165     StubRoutines::_call_stub_entry =
4166       generate_call_stub(StubRoutines::_call_stub_return_address);
4167 
4168     // is referenced by megamorphic call
4169     StubRoutines::_catch_exception_entry = generate_catch_exception();
4170 
4171     // Build this early so it's available for the interpreter.
4172     StubRoutines::_throw_StackOverflowError_entry =
4173       generate_throw_exception("StackOverflowError throw_exception",
4174                                CAST_FROM_FN_PTR(address,
4175                                                 SharedRuntime::
4176                                                 throw_StackOverflowError));
4177     if (UseCRC32Intrinsics) {
      // set the table address before stub generation, which uses it
4179       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
4180       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
4181     }
4182   }
4183 
4184   void generate_all() {
4185     // support for verify_oop (must happen after universe_init)
4186     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
4187     StubRoutines::_throw_AbstractMethodError_entry =
4188       generate_throw_exception("AbstractMethodError throw_exception",
4189                                CAST_FROM_FN_PTR(address,
4190                                                 SharedRuntime::
4191                                                 throw_AbstractMethodError));
4192 
4193     StubRoutines::_throw_IncompatibleClassChangeError_entry =
4194       generate_throw_exception("IncompatibleClassChangeError throw_exception",
4195                                CAST_FROM_FN_PTR(address,
4196                                                 SharedRuntime::
4197                                                 throw_IncompatibleClassChangeError));
4198 
4199     StubRoutines::_throw_NullPointerException_at_call_entry =
4200       generate_throw_exception("NullPointerException at call throw_exception",
4201                                CAST_FROM_FN_PTR(address,
4202                                                 SharedRuntime::
4203                                                 throw_NullPointerException_at_call));
4204 
4205     // arraycopy stubs used by compilers
4206     generate_arraycopy_stubs();
4207 
4208     if (UseMultiplyToLenIntrinsic) {
4209       StubRoutines::_multiplyToLen = generate_multiplyToLen();
4210     }
4211 
4212     if (UseMontgomeryMultiplyIntrinsic) {
4213       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
4214       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
4215       StubRoutines::_montgomeryMultiply = g.generate_multiply();
4216     }
4217 
4218     if (UseMontgomerySquareIntrinsic) {
4219       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
4220       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
4221       // We use generate_multiply() rather than generate_square()
4222       // because it's faster for the sizes of modulus we care about.
4223       StubRoutines::_montgomerySquare = g.generate_multiply();
4224     }
4225 
4226 #ifndef BUILTIN_SIM
4227     // generate GHASH intrinsics code
4228     if (UseGHASHIntrinsics) {
4229       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
4230     }
4231 
4232     if (UseAESIntrinsics) {
4233       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
4234       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
4235       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
4236       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
4237     }
4238 
4239     if (UseSHA1Intrinsics) {
4240       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
4241       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
4242     }
4243     if (UseSHA256Intrinsics) {
4244       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
4245       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
4246     }
4247 
4248     if (UseCRC32CIntrinsics) {
4249       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
4250     }
4251 
4252     // generate Adler32 intrinsics code
4253     if (UseAdler32Intrinsics) {
4254       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
4255     }
4256 
4257     // Safefetch stubs.
4258     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
4259                                                        &StubRoutines::_safefetch32_fault_pc,
4260                                                        &StubRoutines::_safefetch32_continuation_pc);
4261     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
4262                                                        &StubRoutines::_safefetchN_fault_pc,
4263                                                        &StubRoutines::_safefetchN_continuation_pc);
4264 #endif
4265   }
4266 
4267  public:
4268   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
4269     if (all) {
4270       generate_all();
4271     } else {
4272       generate_initial();
4273     }
4274   }
4275 }; // end class declaration
4276 
4277 void StubGenerator_generate(CodeBuffer* code, bool all) {
4278   StubGenerator g(code, all);
4279 }