1 /*
   2  * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "interpreter/interpreter.hpp"
  30 #include "nativeInst_aarch64.hpp"
  31 #include "oops/instanceOop.hpp"
  32 #include "oops/method.hpp"
  33 #include "oops/objArrayKlass.hpp"
  34 #include "oops/oop.inline.hpp"
  35 #include "prims/methodHandles.hpp"
  36 #include "runtime/frame.inline.hpp"
  37 #include "runtime/handles.inline.hpp"
  38 #include "runtime/sharedRuntime.hpp"
  39 #include "runtime/stubCodeGenerator.hpp"
  40 #include "runtime/stubRoutines.hpp"
  41 #include "runtime/thread.inline.hpp"
  42 #include "utilities/top.hpp"
  43 #ifdef COMPILER2
  44 #include "opto/runtime.hpp"
  45 #endif
  46 
  47 #ifdef BUILTIN_SIM
  48 #include "../../../../../../simulator/simulator.hpp"
  49 #endif
  50 
  51 // Declaration and definition of StubGenerator (no .hpp file).
  52 // For a more detailed description of the stub routine structure
  53 // see the comment in stubRoutines.hpp
  54 
  55 #undef __
  56 #define __ _masm->
  57 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
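// TIMES_OOP scales an array index by the in-memory size of a heap oop:
// 4 bytes with compressed oops (shift of 2), 8 bytes otherwise (shift of 3),
// with the index sign-extended from 32 bits.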
  58 
  59 #ifdef PRODUCT
  60 #define BLOCK_COMMENT(str) /* nothing */
  61 #else
  62 #define BLOCK_COMMENT(str) __ block_comment(str)
  63 #endif
  64 
  65 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  66 
  67 // Stub Code definitions
  68 
  69 class StubGenerator: public StubCodeGenerator {
  70  private:
  71 
  72 #ifdef PRODUCT
  73 #define inc_counter_np(counter) ((void)0)
  74 #else
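  // Note: the load/increment/store sequence below is not atomic, so
  // concurrent callers may lose counts; these counters only feed rough
  // non-product statistics.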
  75   void inc_counter_np_(int& counter) {
  76     __ lea(rscratch2, ExternalAddress((address)&counter));
  77     __ ldrw(rscratch1, Address(rscratch2));
  78     __ addw(rscratch1, rscratch1, 1);
  79     __ strw(rscratch1, Address(rscratch2));
  80   }
  81 #define inc_counter_np(counter) \
  82   BLOCK_COMMENT("inc_counter " #counter); \
  83   inc_counter_np_(counter);
  84 #endif
  85 
  86   // Call stubs are used to call Java from C
  87   //
  88   // Arguments:
  89   //    c_rarg0:   call wrapper address                   address
  90   //    c_rarg1:   result                                 address
  91   //    c_rarg2:   result type                            BasicType
  92   //    c_rarg3:   method                                 Method*
  93   //    c_rarg4:   (interpreter) entry point              address
  94   //    c_rarg5:   parameters                             intptr_t*
  95   //    c_rarg6:   parameter size (in words)              int
  96   //    c_rarg7:   thread                                 Thread*
  97   //
  98   // There is no return from the stub itself as any Java result
  99   // is written to result
 100   //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer, then install sp (r31)
  // into fp.
 104   //
 105   // we save r0-r7, which accounts for all the c arguments.
 106   //
 107   // TODO: strictly do we need to save them all? they are treated as
 108   // volatile by C so could we omit saving the ones we are going to
 109   // place in global registers (thread? method?) or those we only use
 110   // during setup of the Java call?
 111   //
 112   // we don't need to save r8 which C uses as an indirect result location
 113   // return register.
 114   //
 115   // we don't need to save r9-r15 which both C and Java treat as
 116   // volatile
 117   //
 118   // we don't need to save r16-18 because Java does not use them
 119   //
 120   // we save r19-r28 which Java uses as scratch registers and C
 121   // expects to be callee-save
 122   //
 123   // we save the bottom 64 bits of each value stored in v8-v15; it is
 124   // the responsibility of the caller to preserve larger values.
 125   //
 126   // so the stub frame looks like this when we enter Java code
 127   //
 128   //     [ return_from_Java     ] <--- sp
 129   //     [ argument word n      ]
 130   //      ...
 131   // -27 [ argument word 1      ]
 132   // -26 [ saved v15            ] <--- sp_after_call
 133   // -25 [ saved v14            ]
 134   // -24 [ saved v13            ]
 135   // -23 [ saved v12            ]
 136   // -22 [ saved v11            ]
 137   // -21 [ saved v10            ]
 138   // -20 [ saved v9             ]
 139   // -19 [ saved v8             ]
 140   // -18 [ saved r28            ]
 141   // -17 [ saved r27            ]
 142   // -16 [ saved r26            ]
 143   // -15 [ saved r25            ]
 144   // -14 [ saved r24            ]
 145   // -13 [ saved r23            ]
 146   // -12 [ saved r22            ]
 147   // -11 [ saved r21            ]
 148   // -10 [ saved r20            ]
 149   //  -9 [ saved r19            ]
 150   //  -8 [ call wrapper    (r0) ]
 151   //  -7 [ result          (r1) ]
 152   //  -6 [ result type     (r2) ]
 153   //  -5 [ method          (r3) ]
 154   //  -4 [ entry point     (r4) ]
 155   //  -3 [ parameters      (r5) ]
 156   //  -2 [ parameter size  (r6) ]
  //  -1 [ thread          (r7) ]
 158   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 159   //   1 [ saved lr       (r30) ]
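  //
  // For reference, the C side reaches this stub through a function pointer
  // whose parameters mirror the register assignments above; roughly (the
  // authoritative declaration is the CallStub typedef in stubRoutines.hpp):
  //
  //   typedef void (*CallStub)(address   link,
  //                            intptr_t* result,
  //                            BasicType result_type,
  //                            Method*   method,
  //                            address   entry_point,
  //                            intptr_t* parameters,
  //                            int       size_of_parameters,
  //                            TRAPS);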
 160 
 161   // Call stub stack layout word offsets from fp
 162   enum call_stub_layout {
 163     sp_after_call_off = -26,
 164 
 165     d15_off            = -26,
 166     d14_off            = -25,
 167     d13_off            = -24,
 168     d12_off            = -23,
 169     d11_off            = -22,
 170     d10_off            = -21,
 171     d9_off             = -20,
 172     d8_off             = -19,
 173 
 174     r28_off            = -18,
 175     r27_off            = -17,
 176     r26_off            = -16,
 177     r25_off            = -15,
 178     r24_off            = -14,
 179     r23_off            = -13,
 180     r22_off            = -12,
 181     r21_off            = -11,
 182     r20_off            = -10,
 183     r19_off            =  -9,
 184     call_wrapper_off   =  -8,
 185     result_off         =  -7,
 186     result_type_off    =  -6,
 187     method_off         =  -5,
 188     entry_point_off    =  -4,
 189     parameters_off     =  -3,
 190     parameter_size_off =  -2,
 191     thread_off         =  -1,
 192     fp_f               =   0,
 193     retaddr_off        =   1,
 194   };
 195 
 196   address generate_call_stub(address& return_address) {
 197     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 198            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 199            "adjust this code");
 200 
 201     StubCodeMark mark(this, "StubRoutines", "call_stub");
 202     address start = __ pc();
 203 
 204     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 205 
 206     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 207     const Address result        (rfp, result_off         * wordSize);
 208     const Address result_type   (rfp, result_type_off    * wordSize);
 209     const Address method        (rfp, method_off         * wordSize);
 210     const Address entry_point   (rfp, entry_point_off    * wordSize);
 211     const Address parameters    (rfp, parameters_off     * wordSize);
 212     const Address parameter_size(rfp, parameter_size_off * wordSize);
 213 
 214     const Address thread        (rfp, thread_off         * wordSize);
 215 
 216     const Address d15_save      (rfp, d15_off * wordSize);
 217     const Address d14_save      (rfp, d14_off * wordSize);
 218     const Address d13_save      (rfp, d13_off * wordSize);
 219     const Address d12_save      (rfp, d12_off * wordSize);
 220     const Address d11_save      (rfp, d11_off * wordSize);
 221     const Address d10_save      (rfp, d10_off * wordSize);
 222     const Address d9_save       (rfp, d9_off * wordSize);
 223     const Address d8_save       (rfp, d8_off * wordSize);
 224 
 225     const Address r28_save      (rfp, r28_off * wordSize);
 226     const Address r27_save      (rfp, r27_off * wordSize);
 227     const Address r26_save      (rfp, r26_off * wordSize);
 228     const Address r25_save      (rfp, r25_off * wordSize);
 229     const Address r24_save      (rfp, r24_off * wordSize);
 230     const Address r23_save      (rfp, r23_off * wordSize);
 231     const Address r22_save      (rfp, r22_off * wordSize);
 232     const Address r21_save      (rfp, r21_off * wordSize);
 233     const Address r20_save      (rfp, r20_off * wordSize);
 234     const Address r19_save      (rfp, r19_off * wordSize);
 235 
 236     // stub code
 237 
 238     // we need a C prolog to bootstrap the x86 caller into the sim
 239     __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);
 240 
 241     address aarch64_entry = __ pc();
 242 
 243 #ifdef BUILTIN_SIM
 244     // Save sender's SP for stack traces.
 245     __ mov(rscratch1, sp);
 246     __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
 247 #endif
 248     // set up frame and move sp to end of save area
 249     __ enter();
 250     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 251 
 252     // save register parameters and Java scratch/global registers
 253     // n.b. we save thread even though it gets installed in
 254     // rthread because we want to sanity check rthread later
 255     __ str(c_rarg7,  thread);
 256     __ strw(c_rarg6, parameter_size);
 257     __ str(c_rarg5,  parameters);
 258     __ str(c_rarg4,  entry_point);
 259     __ str(c_rarg3,  method);
 260     __ str(c_rarg2,  result_type);
 261     __ str(c_rarg1,  result);
 262     __ str(c_rarg0,  call_wrapper);
 263     __ str(r19,      r19_save);
 264     __ str(r20,      r20_save);
 265     __ str(r21,      r21_save);
 266     __ str(r22,      r22_save);
 267     __ str(r23,      r23_save);
 268     __ str(r24,      r24_save);
 269     __ str(r25,      r25_save);
 270     __ str(r26,      r26_save);
 271     __ str(r27,      r27_save);
 272     __ str(r28,      r28_save);
 273 
 274     __ strd(v8,      d8_save);
 275     __ strd(v9,      d9_save);
 276     __ strd(v10,     d10_save);
 277     __ strd(v11,     d11_save);
 278     __ strd(v12,     d12_save);
 279     __ strd(v13,     d13_save);
 280     __ strd(v14,     d14_save);
 281     __ strd(v15,     d15_save);
 282 
 283     // install Java thread in global register now we have saved
 284     // whatever value it held
 285     __ mov(rthread, c_rarg7);
 286     // And method
 287     __ mov(rmethod, c_rarg3);
 288 
 289     // set up the heapbase register
 290     __ reinit_heapbase();
 291 
 292 #ifdef ASSERT
 293     // make sure we have no pending exceptions
 294     {
 295       Label L;
 296       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 297       __ cmp(rscratch1, (unsigned)NULL_WORD);
 298       __ br(Assembler::EQ, L);
 299       __ stop("StubRoutines::call_stub: entered with pending exception");
 300       __ BIND(L);
 301     }
 302 #endif
 303     // pass parameters if any
 304     __ mov(esp, sp);
 305     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 306     __ andr(sp, rscratch1, -2 * wordSize);
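    // n.b. AAPCS64 requires sp to be 16-byte aligned, so round the new top
    // of stack down to a 2-word (16-byte) boundary even when the parameter
    // count is odd.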
 307 
 308     BLOCK_COMMENT("pass parameters if any");
 309     Label parameters_done;
 310     // parameter count is still in c_rarg6
 311     // and parameter pointer identifying param 1 is in c_rarg5
 312     __ cbzw(c_rarg6, parameters_done);
 313 
 314     address loop = __ pc();
 315     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 316     __ subsw(c_rarg6, c_rarg6, 1);
 317     __ push(rscratch1);
 318     __ br(Assembler::GT, loop);
 319 
 320     __ BIND(parameters_done);
 321 
    // call Java entry -- passing Method* and current sp
 323     //      rmethod: Method*
 324     //      r13: sender sp
 325     BLOCK_COMMENT("call Java function");
 326     __ mov(r13, sp);
 327     __ blr(c_rarg4);
 328 
 329     // tell the simulator we have returned to the stub
 330 
 331     // we do this here because the notify will already have been done
 332     // if we get to the next instruction via an exception
 333     //
 334     // n.b. adding this instruction here affects the calculation of
 335     // whether or not a routine returns to the call stub (used when
 336     // doing stack walks) since the normal test is to check the return
 337     // pc against the address saved below. so we may need to allow for
 338     // this extra instruction in the check.
 339 
 340     if (NotifySimulator) {
 341       __ notify(Assembler::method_reentry);
 342     }
 343     // save current address for use by exception handling code
 344 
 345     return_address = __ pc();
 346 
 347     // store result depending on type (everything that is not
 348     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 349     // n.b. this assumes Java returns an integral result in r0
 350     // and a floating result in j_farg0
 351     __ ldr(j_rarg2, result);
 352     Label is_long, is_float, is_double, exit;
 353     __ ldr(j_rarg1, result_type);
 354     __ cmp(j_rarg1, T_OBJECT);
 355     __ br(Assembler::EQ, is_long);
 356     __ cmp(j_rarg1, T_LONG);
 357     __ br(Assembler::EQ, is_long);
 358     __ cmp(j_rarg1, T_FLOAT);
 359     __ br(Assembler::EQ, is_float);
 360     __ cmp(j_rarg1, T_DOUBLE);
 361     __ br(Assembler::EQ, is_double);
 362 
 363     // handle T_INT case
 364     __ strw(r0, Address(j_rarg2));
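    // (T_BOOLEAN, T_BYTE, T_CHAR and T_SHORT results arrive already widened
    // to a 32-bit int in r0, so the single strw above covers them as well)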
 365 
 366     __ BIND(exit);
 367 
 368     // pop parameters
 369     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 370 
 371 #ifdef ASSERT
 372     // verify that threads correspond
 373     {
 374       Label L, S;
 375       __ ldr(rscratch1, thread);
 376       __ cmp(rthread, rscratch1);
 377       __ br(Assembler::NE, S);
 378       __ get_thread(rscratch1);
 379       __ cmp(rthread, rscratch1);
 380       __ br(Assembler::EQ, L);
 381       __ BIND(S);
 382       __ stop("StubRoutines::call_stub: threads must correspond");
 383       __ BIND(L);
 384     }
 385 #endif
 386 
 387     // restore callee-save registers
 388     __ ldrd(v15,      d15_save);
 389     __ ldrd(v14,      d14_save);
 390     __ ldrd(v13,      d13_save);
 391     __ ldrd(v12,      d12_save);
 392     __ ldrd(v11,      d11_save);
 393     __ ldrd(v10,      d10_save);
 394     __ ldrd(v9,       d9_save);
 395     __ ldrd(v8,       d8_save);
 396 
 397     __ ldr(r28,      r28_save);
 398     __ ldr(r27,      r27_save);
 399     __ ldr(r26,      r26_save);
 400     __ ldr(r25,      r25_save);
 401     __ ldr(r24,      r24_save);
 402     __ ldr(r23,      r23_save);
 403     __ ldr(r22,      r22_save);
 404     __ ldr(r21,      r21_save);
 405     __ ldr(r20,      r20_save);
 406     __ ldr(r19,      r19_save);
 407     __ ldr(c_rarg0,  call_wrapper);
 408     __ ldr(c_rarg1,  result);
 409     __ ldrw(c_rarg2, result_type);
 410     __ ldr(c_rarg3,  method);
 411     __ ldr(c_rarg4,  entry_point);
 412     __ ldr(c_rarg5,  parameters);
 413     __ ldr(c_rarg6,  parameter_size);
 414     __ ldr(c_rarg7,  thread);
 415 
 416 #ifndef PRODUCT
 417     // tell the simulator we are about to end Java execution
 418     if (NotifySimulator) {
 419       __ notify(Assembler::method_exit);
 420     }
 421 #endif
 422     // leave frame and return to caller
 423     __ leave();
 424     __ ret(lr);
 425 
 426     // handle return types different from T_INT
 427 
 428     __ BIND(is_long);
 429     __ str(r0, Address(j_rarg2, 0));
 430     __ br(Assembler::AL, exit);
 431 
 432     __ BIND(is_float);
 433     __ strs(j_farg0, Address(j_rarg2, 0));
 434     __ br(Assembler::AL, exit);
 435 
 436     __ BIND(is_double);
 437     __ strd(j_farg0, Address(j_rarg2, 0));
 438     __ br(Assembler::AL, exit);
 439 
 440     return start;
 441   }
 442 
 443   // Return point for a Java call if there's an exception thrown in
 444   // Java code.  The exception is caught and transformed into a
 445   // pending exception stored in JavaThread that can be tested from
 446   // within the VM.
 447   //
 448   // Note: Usually the parameters are removed by the callee. In case
 449   // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to set up the
  // sp.
 452   //
 453   // r0: exception oop
 454 
 455   // NOTE: this is used as a target from the signal handler so it
 456   // needs an x86 prolog which returns into the current simulator
 457   // executing the generated catch_exception code. so the prolog
 458   // needs to install rax in a sim register and adjust the sim's
 459   // restart pc to enter the generated code at the start position
 460   // then return from native to simulated execution.
 461 
 462   address generate_catch_exception() {
 463     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 464     address start = __ pc();
 465 
 466     // same as in generate_call_stub():
 467     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 468     const Address thread        (rfp, thread_off         * wordSize);
 469 
 470 #ifdef ASSERT
 471     // verify that threads correspond
 472     {
 473       Label L, S;
 474       __ ldr(rscratch1, thread);
 475       __ cmp(rthread, rscratch1);
 476       __ br(Assembler::NE, S);
 477       __ get_thread(rscratch1);
 478       __ cmp(rthread, rscratch1);
 479       __ br(Assembler::EQ, L);
 480       __ bind(S);
 481       __ stop("StubRoutines::catch_exception: threads must correspond");
 482       __ bind(L);
 483     }
 484 #endif
 485 
 486     // set pending exception
 487     __ verify_oop(r0);
 488 
 489     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 490     __ mov(rscratch1, (address)__FILE__);
 491     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 492     __ movw(rscratch1, (int)__LINE__);
 493     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 494 
 495     // complete return to VM
 496     assert(StubRoutines::_call_stub_return_address != NULL,
 497            "_call_stub_return_address must have been generated before");
 498     __ b(StubRoutines::_call_stub_return_address);
 499 
 500     return start;
 501   }
 502 
 503   // Continuation point for runtime calls returning with a pending
 504   // exception.  The pending exception check happened in the runtime
 505   // or native call stub.  The pending exception in Thread is
 506   // converted into a Java-level exception.
 507   //
 508   // Contract with Java-level exception handlers:
 509   // r0: exception
 510   // r3: throwing pc
 511   //
 512   // NOTE: At entry of this stub, exception-pc must be in LR !!
 513 
 514   // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog
 516 
 517   address generate_forward_exception() {
 518     StubCodeMark mark(this, "StubRoutines", "forward exception");
 519     address start = __ pc();
 520 
 521     // Upon entry, LR points to the return address returning into
 522     // Java (interpreted or compiled) code; i.e., the return address
 523     // becomes the throwing pc.
 524     //
 525     // Arguments pushed before the runtime call are still on the stack
 526     // but the exception handler will reset the stack pointer ->
 527     // ignore them.  A potential result in registers can be ignored as
 528     // well.
 529 
 530 #ifdef ASSERT
 531     // make sure this code is only executed if there is a pending exception
 532     {
 533       Label L;
 534       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 535       __ cbnz(rscratch1, L);
 536       __ stop("StubRoutines::forward exception: no pending exception (1)");
 537       __ bind(L);
 538     }
 539 #endif
 540 
 541     // compute exception handler into r19
 542 
 543     // call the VM to find the handler address associated with the
 544     // caller address. pass thread in r0 and caller pc (ret address)
 545     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 546     // the stack.
 547     __ mov(c_rarg1, lr);
 548     // lr will be trashed by the VM call so we move it to R19
 549     // (callee-saved) because we also need to pass it to the handler
 550     // returned by this call.
 551     __ mov(r19, lr);
 552     BLOCK_COMMENT("call exception_handler_for_return_address");
 553     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 554                          SharedRuntime::exception_handler_for_return_address),
 555                     rthread, c_rarg1);
 556     // we should not really care that lr is no longer the callee
 557     // address. we saved the value the handler needs in r19 so we can
 558     // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM, and the VM code asserts that
 560     // the PC for the frame above the handler belongs to a compiled
 561     // Java method. So, we restore lr here to satisfy that assert.
 562     __ mov(lr, r19);
 563     // setup r0 & r3 & clear pending exception
 564     __ mov(r3, r19);
 565     __ mov(r19, r0);
 566     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 567     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 568 
 569 #ifdef ASSERT
 570     // make sure exception is set
 571     {
 572       Label L;
 573       __ cbnz(r0, L);
 574       __ stop("StubRoutines::forward exception: no pending exception (2)");
 575       __ bind(L);
 576     }
 577 #endif
 578 
 579     // continue at exception handler
 580     // r0: exception
 581     // r3: throwing pc
 582     // r19: exception handler
 583     __ verify_oop(r0);
 584     __ br(r19);
 585 
 586     return start;
 587   }
 588 
 589   // Non-destructive plausibility checks for oops
 590   //
 591   // Arguments:
 592   //    r0: oop to verify
 593   //    rscratch1: error message
 594   //
 595   // Stack after saving c_rarg3:
 596   //    [tos + 0]: saved c_rarg3
 597   //    [tos + 1]: saved c_rarg2
 598   //    [tos + 2]: saved lr
 599   //    [tos + 3]: saved rscratch2
 600   //    [tos + 4]: saved r0
 601   //    [tos + 5]: saved rscratch1
 602   address generate_verify_oop() {
 603 
 604     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 605     address start = __ pc();
 606 
 607     Label exit, error;
 608 
 609     // save c_rarg2 and c_rarg3
 610     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 611 
 612     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 613     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 614     __ ldr(c_rarg3, Address(c_rarg2));
 615     __ add(c_rarg3, c_rarg3, 1);
 616     __ str(c_rarg3, Address(c_rarg2));
 617 
 618     // object is in r0
 619     // make sure object is 'reasonable'
 620     __ cbz(r0, exit); // if obj is NULL it is OK
 621 
 622     // Check if the oop is in the right area of memory
 623     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 624     __ andr(c_rarg2, r0, c_rarg3);
 625     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 626 
 627     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 628     // instruction here because the flags register is live.
 629     __ eor(c_rarg2, c_rarg2, c_rarg3);
 630     __ cbnz(c_rarg2, error);
 631 
    // make sure klass is 'reasonable', i.e. not NULL.
 633     __ load_klass(r0, r0);  // get klass
 634     __ cbz(r0, error);      // if klass is NULL it is broken
 635 
 636     // return if everything seems ok
 637     __ bind(exit);
 638 
 639     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 640     __ ret(lr);
 641 
 642     // handle errors
 643     __ bind(error);
 644     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 645 
 646     __ push(RegSet::range(r0, r29), sp);
 647     // debug(char* msg, int64_t pc, int64_t regs[])
 648     __ mov(c_rarg0, rscratch1);      // pass address of error message
 649     __ mov(c_rarg1, lr);             // pass return address
 650     __ mov(c_rarg2, sp);             // pass address of regs on stack
 651 #ifndef PRODUCT
 652     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 653 #endif
 654     BLOCK_COMMENT("call MacroAssembler::debug");
 655     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 656     __ blrt(rscratch1, 3, 0, 1);
 657 
 658     return start;
 659   }
 660 
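  // n.b. array_overlap_test below branches unconditionally to L_no_overlap,
  // so the debug-only overlap check in generate_checkcast_copy never fires
  // on this port.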
 661   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 662 
 663   // Generate code for an array write pre barrier
 664   //
 665   //     addr    -  starting address
 666   //     count   -  element count
 667   //     tmp     - scratch register
 668   //
 669   //     Destroy no registers!
 670   //
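  //     (The "destroys nothing" guarantee is met by saving and restoring
  //     r0-r29 around the runtime call below; only the G1 case emits any
  //     code here.)
  //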
 671   void  gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
 672     BarrierSet* bs = Universe::heap()->barrier_set();
 673     switch (bs->kind()) {
 674     case BarrierSet::G1SATBCTLogging:
      // With G1, don't generate the call if we statically know that the target is uninitialized
 676       if (!dest_uninitialized) {
 677         __ push(RegSet::range(r0, r29), sp);         // integer registers except lr & sp
 678         if (count == c_rarg0) {
 679           if (addr == c_rarg1) {
 680             // exactly backwards!!
 681             __ stp(c_rarg0, c_rarg1, __ pre(sp, -2 * wordSize));
 682             __ ldp(c_rarg1, c_rarg0, __ post(sp, -2 * wordSize));
 683           } else {
 684             __ mov(c_rarg1, count);
 685             __ mov(c_rarg0, addr);
 686           }
 687         } else {
 688           __ mov(c_rarg0, addr);
 689           __ mov(c_rarg1, count);
 690         }
 691         __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2);
        __ pop(RegSet::range(r0, r29), sp);         // integer registers except lr & sp
      }
      break;
    case BarrierSet::CardTableForRS:
    case BarrierSet::CardTableExtension:
    case BarrierSet::ModRef:
      break;
    default:
      ShouldNotReachHere();
    }
  }
 704 
 705   //
 706   // Generate code for an array write post barrier
 707   //
 708   //  Input:
 709   //     start    - register containing starting address of destination array
 710   //     end      - register containing ending address of destination array
 711   //     scratch  - scratch register
 712   //
 713   //  The input registers are overwritten.
 714   //  The ending address is inclusive.
 715   void gen_write_ref_array_post_barrier(Register start, Register end, Register scratch) {
 716     assert_different_registers(start, end, scratch);
 717     BarrierSet* bs = Universe::heap()->barrier_set();
 718     switch (bs->kind()) {
 719       case BarrierSet::G1SATBCTLogging:
 720 
 721         {
 722           __ push(RegSet::range(r0, r29), sp);         // integer registers except lr & sp
 723           // must compute element count unless barrier set interface is changed (other platforms supply count)
 724           assert_different_registers(start, end, scratch);
 725           __ lea(scratch, Address(end, BytesPerHeapOop));
 726           __ sub(scratch, scratch, start);               // subtract start to get #bytes
 727           __ lsr(scratch, scratch, LogBytesPerHeapOop);  // convert to element count
 728           __ mov(c_rarg0, start);
 729           __ mov(c_rarg1, scratch);
 730           __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
          __ pop(RegSet::range(r0, r29), sp);         // integer registers except lr & sp
 732         }
 733         break;
 734       case BarrierSet::CardTableForRS:
 735       case BarrierSet::CardTableExtension:
 736         {
 737           CardTableModRefBS* ct = (CardTableModRefBS*)bs;
 738           assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
 739 
 740           Label L_loop;
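          // Dirty each card covering [start, end]: shift both addresses
          // down to card indices, bias by the byte map base, then store
          // zero (the dirty value) into every card table byte in the loop.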
 741 
 742            __ lsr(start, start, CardTableModRefBS::card_shift);
 743            __ lsr(end, end, CardTableModRefBS::card_shift);
 744            __ sub(end, end, start); // number of bytes to copy
 745 
 746           const Register count = end; // 'end' register contains bytes count now
 747           __ load_byte_map_base(scratch);
 748           __ add(start, start, scratch);
 749           if (UseConcMarkSweepGC) {
 750             __ membar(__ StoreStore);
 751           }
 752           __ BIND(L_loop);
 753           __ strb(zr, Address(start, count));
 754           __ subs(count, count, 1);
 755           __ br(Assembler::HS, L_loop);
 756         }
 757         break;
 758       default:
 759         ShouldNotReachHere();
 760 
 761     }
 762   }
 763 
 764   typedef enum {
 765     copy_forwards = 1,
 766     copy_backwards = -1
 767   } copy_direction;
 768 
 769   // Bulk copy of blocks of 8 words.
 770   //
 771   // count is a count of words.
 772   //
 773   // Precondition: count >= 2
 774   //
 775   // Postconditions:
 776   //
 777   // The least significant bit of count contains the remaining count
 778   // of words to copy.  The rest of count is trash.
 779   //
 780   // s and d are adjusted to point to the remaining words to copy
 781   //
 782   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 783                            copy_direction direction) {
 784     int unit = wordSize * direction;
 785 
 786     int offset;
 787     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 788       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 789 
 790     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 791     assert_different_registers(s, d, count, rscratch1);
 792 
 793     Label again, large, small;
 794     const char *stub_name;
 795     if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
 797     else
 798       stub_name = "backward_copy_longs";
 799     StubCodeMark mark(this, "StubRoutines", stub_name);
 800     __ align(6);
 801     __ bind(start);
 802     __ cmp(count, 8);
 803     __ br(Assembler::LO, small);
 804     if (direction == copy_forwards) {
 805       __ sub(s, s, 2 * wordSize);
 806       __ sub(d, d, 2 * wordSize);
 807     }
 808     __ subs(count, count, 16);
 809     __ br(Assembler::GE, large);
 810 
 811     // 8 <= count < 16 words.  Copy 8.
 812     __ ldp(t0, t1, Address(s, 2 * unit));
 813     __ ldp(t2, t3, Address(s, 4 * unit));
 814     __ ldp(t4, t5, Address(s, 6 * unit));
 815     __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 816 
 817     __ stp(t0, t1, Address(d, 2 * unit));
 818     __ stp(t2, t3, Address(d, 4 * unit));
 819     __ stp(t4, t5, Address(d, 6 * unit));
 820     __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 821 
 822     if (direction == copy_forwards) {
 823       __ add(s, s, 2 * wordSize);
 824       __ add(d, d, 2 * wordSize);
 825     }
 826 
 827     {
 828       Label L1, L2;
 829       __ bind(small);
 830       __ tbz(count, exact_log2(4), L1);
 831       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 832       __ ldp(t2, t3, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 833       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 834       __ stp(t2, t3, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 835       __ bind(L1);
 836 
 837       __ tbz(count, 1, L2);
 838       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 839       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 840       __ bind(L2);
 841     }
 842 
 843     __ ret(lr);
 844 
 845     __ align(6);
 846     __ bind(large);
 847 
 848     // Fill 8 registers
 849     __ ldp(t0, t1, Address(s, 2 * unit));
 850     __ ldp(t2, t3, Address(s, 4 * unit));
 851     __ ldp(t4, t5, Address(s, 6 * unit));
 852     __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 853 
 854     __ bind(again);
 855 
 856     if (direction == copy_forwards && PrefetchCopyIntervalInBytes > 0)
 857       __ prfm(Address(s, PrefetchCopyIntervalInBytes), PLDL1KEEP);
 858 
 859     __ stp(t0, t1, Address(d, 2 * unit));
 860     __ ldp(t0, t1, Address(s, 2 * unit));
 861     __ stp(t2, t3, Address(d, 4 * unit));
 862     __ ldp(t2, t3, Address(s, 4 * unit));
 863     __ stp(t4, t5, Address(d, 6 * unit));
 864     __ ldp(t4, t5, Address(s, 6 * unit));
 865     __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 866     __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 867 
 868     __ subs(count, count, 8);
 869     __ br(Assembler::HS, again);
 870 
 871     // Drain
 872     __ stp(t0, t1, Address(d, 2 * unit));
 873     __ stp(t2, t3, Address(d, 4 * unit));
 874     __ stp(t4, t5, Address(d, 6 * unit));
 875     __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 876 
 877     if (direction == copy_forwards) {
 878       __ add(s, s, 2 * wordSize);
 879       __ add(d, d, 2 * wordSize);
 880     }
 881 
 882     {
 883       Label L1, L2;
 884       __ tbz(count, exact_log2(4), L1);
 885       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 886       __ ldp(t2, t3, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 887       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 888       __ stp(t2, t3, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 889       __ bind(L1);
 890 
 891       __ tbz(count, 1, L2);
 892       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 893       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 894       __ bind(L2);
 895     }
 896 
 897     __ ret(lr);
 898   }
 899 
 900   // Small copy: less than 16 bytes.
 901   //
 902   // NB: Ignores all of the bits of count which represent more than 15
 903   // bytes, so a caller doesn't have to mask them.
 904 
 905   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
 906     bool is_backwards = step < 0;
 907     size_t granularity = uabs(step);
 908     int direction = is_backwards ? -1 : 1;
 909     int unit = wordSize * direction;
 910 
 911     Label Lpair, Lword, Lint, Lshort, Lbyte;
 912 
 913     assert(granularity
 914            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
 915 
 916     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
 917 
 918     // ??? I don't know if this bit-test-and-branch is the right thing
 919     // to do.  It does a lot of jumping, resulting in several
 920     // mispredicted branches.  It might make more sense to do this
 921     // with something like Duff's device with a single computed branch.
 922 
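    // Worked example: for a byte copy (granularity == 1) the tests below
    // look at bit 3 of count (8 bytes left?), then bit 2 (4 bytes?), bit 1
    // (2 bytes?) and bit 0 (1 byte?); for an int copy (granularity == 4)
    // only bits 1 and 0 of the element count matter, selecting an 8-byte
    // and then a 4-byte move.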
 923     __ tbz(count, 3 - exact_log2(granularity), Lword);
 924     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
 925     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
 926     __ bind(Lword);
 927 
 928     if (granularity <= sizeof (jint)) {
 929       __ tbz(count, 2 - exact_log2(granularity), Lint);
 930       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
 931       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
 932       __ bind(Lint);
 933     }
 934 
 935     if (granularity <= sizeof (jshort)) {
 936       __ tbz(count, 1 - exact_log2(granularity), Lshort);
 937       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
 938       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
 939       __ bind(Lshort);
 940     }
 941 
 942     if (granularity <= sizeof (jbyte)) {
 943       __ tbz(count, 0, Lbyte);
 944       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
 945       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
 946       __ bind(Lbyte);
 947     }
 948   }
 949 
 950   Label copy_f, copy_b;
 951 
 952   // All-singing all-dancing memory copy.
 953   //
 954   // Copy count units of memory from s to d.  The size of a unit is
 955   // step, which can be positive or negative depending on the direction
 956   // of copy.  If is_aligned is false, we align the source address.
 957   //
 958 
 959   void copy_memory(bool is_aligned, Register s, Register d,
 960                    Register count, Register tmp, int step) {
 961     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
 962     bool is_backwards = step < 0;
 963     int granularity = uabs(step);
 964     const Register t0 = r3, t1 = r4;
 965 
 966     if (is_backwards) {
 967       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
 968       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
 969     }
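    // For a backwards copy, point s and d one past the end of their arrays
    // so the pre-decrement addressing below walks down through the data.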
 970 
 971     Label tail;
 972 
 973     __ cmp(count, 16/granularity);
 974     __ br(Assembler::LO, tail);
 975 
 976     // Now we've got the small case out of the way we can align the
 977     // source address on a 2-word boundary.
 978 
 979     Label aligned;
 980 
 981     if (is_aligned) {
 982       // We may have to adjust by 1 word to get s 2-word-aligned.
 983       __ tbz(s, exact_log2(wordSize), aligned);
 984       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
 985       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
 986       __ sub(count, count, wordSize/granularity);
 987     } else {
 988       if (is_backwards) {
 989         __ andr(rscratch2, s, 2 * wordSize - 1);
 990       } else {
 991         __ neg(rscratch2, s);
 992         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
 993       }
 994       // rscratch2 is the byte adjustment needed to align s.
 995       __ cbz(rscratch2, aligned);
 996       int shift = exact_log2(granularity);
 997       if (shift)  __ lsr(rscratch2, rscratch2, shift);
 998       __ sub(count, count, rscratch2);
 999 
1000 #if 0
1001       // ?? This code is only correct for a disjoint copy.  It may or
1002       // may not make sense to use it in that case.
1003 
1004       // Copy the first pair; s and d may not be aligned.
1005       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1006       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1007 
1008       // Align s and d, adjust count
1009       if (is_backwards) {
1010         __ sub(s, s, rscratch2);
1011         __ sub(d, d, rscratch2);
1012       } else {
1013         __ add(s, s, rscratch2);
1014         __ add(d, d, rscratch2);
1015       }
1016 #else
1017       copy_memory_small(s, d, rscratch2, rscratch1, step);
1018 #endif
1019     }
1020 
1021     __ cmp(count, 16/granularity);
1022     __ br(Assembler::LT, tail);
1023     __ bind(aligned);
1024 
1025     // s is now 2-word-aligned.
1026 
1027     // We have a count of units and some trailing bytes.  Adjust the
1028     // count and do a bulk copy of words.
1029     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
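    // The bulk copy blocks were presumably generated with the word count in
    // rscratch2 and s/d in the registers used here; on return the low bits
    // of the original count still describe what is left (possibly one odd
    // word plus a sub-word tail), which copy_memory_small handles below.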
1030     if (direction == copy_forwards)
1031       __ bl(copy_f);
1032     else
1033       __ bl(copy_b);
1034 
1035     // And the tail.
1036 
1037     __ bind(tail);
1038     copy_memory_small(s, d, count, tmp, step);
1039   }
1040 
1041 
1042   void clobber_registers() {
1043 #ifdef ASSERT
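    // Build the pattern 0xdeadbeefdeadbeef in rscratch1 and copy it into
    // r3-r18 (other than rscratch1 itself) so stale register contents are
    // easy to spot in a debugger.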
1044     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1045     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1046     for (Register r = r3; r <= r18; r++)
1047       if (r != rscratch1) __ mov(r, rscratch1);
1048 #endif
1049   }
1050 
1051   // Scan over array at a for count oops, verifying each one.
1052   // Preserves a and count, clobbers rscratch1 and rscratch2.
1053   void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1054     Label loop, end;
1055     __ mov(rscratch1, a);
1056     __ mov(rscratch2, zr);
1057     __ bind(loop);
1058     __ cmp(rscratch2, count);
1059     __ br(Assembler::HS, end);
1060     if (size == (size_t)wordSize) {
1061       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1062       __ verify_oop(temp);
1063     } else {
1064       __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
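      // n.b. the callers in this file pass r16 as 'temp', so the narrow oop
      // just loaded into r16 is the value decode_heap_oop verifies.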
1065       __ decode_heap_oop(temp); // calls verify_oop
1066     }
1067     __ add(rscratch2, rscratch2, size);
1068     __ b(loop);
1069     __ bind(end);
1070   }
1071 
1072   // Arguments:
1073   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1074   //             ignored
1075   //   is_oop  - true => oop array, so generate store check code
1076   //   name    - stub name string
1077   //
1078   // Inputs:
1079   //   c_rarg0   - source array address
1080   //   c_rarg1   - destination array address
1081   //   c_rarg2   - element count, treated as ssize_t, can be zero
1082   //
1083   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1084   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1086   //
1087   // Side Effects:
1088   //   disjoint_int_copy_entry is set to the no-overlap entry point
1089   //   used by generate_conjoint_int_oop_copy().
1090   //
1091   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
1092                                   const char *name, bool dest_uninitialized = false) {
1093     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1094     __ align(CodeEntryAlignment);
1095     StubCodeMark mark(this, "StubRoutines", name);
1096     address start = __ pc();
1097     __ enter();
1098 
1099     if (entry != NULL) {
1100       *entry = __ pc();
1101       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1102       BLOCK_COMMENT("Entry:");
1103     }
1104 
1105     if (is_oop) {
1106       __ push(RegSet::of(d, count), sp);
1107       // no registers are destroyed by this call
1108       gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
1109     }
1110     copy_memory(aligned, s, d, count, rscratch1, size);
1111     if (is_oop) {
1112       __ pop(RegSet::of(d, count), sp);
1113       if (VerifyOops)
1114         verify_oop_array(size, d, count, r16);
1115       __ sub(count, count, 1); // make an inclusive end pointer
1116       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1117       gen_write_ref_array_post_barrier(d, count, rscratch1);
1118     }
1119     __ leave();
1120     __ mov(r0, zr); // return 0
1121     __ ret(lr);
1122 #ifdef BUILTIN_SIM
1123     {
1124       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1125       sim->notifyCompile(const_cast<char*>(name), start);
1126     }
1127 #endif
1128     return start;
1129   }
1130 
1131   // Arguments:
1132   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1133   //             ignored
1134   //   is_oop  - true => oop array, so generate store check code
1135   //   name    - stub name string
1136   //
1137   // Inputs:
1138   //   c_rarg0   - source array address
1139   //   c_rarg1   - destination array address
1140   //   c_rarg2   - element count, treated as ssize_t, can be zero
1141   //
1142   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1143   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1145   //
1146   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1147                                  address *entry, const char *name,
1148                                  bool dest_uninitialized = false) {
1149     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1150 
1151     StubCodeMark mark(this, "StubRoutines", name);
1152     address start = __ pc();
1153     __ enter();
1154 
1155     if (entry != NULL) {
1156       *entry = __ pc();
1157       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1158       BLOCK_COMMENT("Entry:");
1159     }
1160 
1161     // no overlap when (d-s) above_equal (count*size)
1162     __ sub(rscratch1, d, s);
1163     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1164     __ br(Assembler::HS, nooverlap_target);
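    // The comparison is unsigned, so when d < s the subtraction wraps to a
    // large value and we branch as well: a forward copy is always safe when
    // the destination starts below the source.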
1165 
1166     if (is_oop) {
1167       __ push(RegSet::of(d, count), sp);
1168       // no registers are destroyed by this call
1169       gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
1170     }
1171     copy_memory(aligned, s, d, count, rscratch1, -size);
1172     if (is_oop) {
1173       __ pop(RegSet::of(d, count), sp);
1174       if (VerifyOops)
1175         verify_oop_array(size, d, count, r16);
1176       __ sub(count, count, 1); // make an inclusive end pointer
1177       __ lea(count, Address(d, count, Address::uxtw(exact_log2(size))));
1178       gen_write_ref_array_post_barrier(d, count, rscratch1);
1179     }
1180     __ leave();
1181     __ mov(r0, zr); // return 0
1182     __ ret(lr);
1183 #ifdef BUILTIN_SIM
1184     {
1185       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1186       sim->notifyCompile(const_cast<char*>(name), start);
1187     }
1188 #endif
1189     return start;
  }
1191 
1192   // Arguments:
1193   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1194   //             ignored
1195   //   name    - stub name string
1196   //
1197   // Inputs:
1198   //   c_rarg0   - source array address
1199   //   c_rarg1   - destination array address
1200   //   c_rarg2   - element count, treated as ssize_t, can be zero
1201   //
1202   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1203   // we let the hardware handle it.  The one to eight bytes within words,
1204   // dwords or qwords that span cache line boundaries will still be loaded
1205   // and stored atomically.
1206   //
1214   // Side Effects:
1215   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1216   //   used by generate_conjoint_byte_copy().
1217   //
1218   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1219     const bool not_oop = false;
1220     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1221   }
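  // (These generators are hooked into the StubRoutines arraycopy entry
  // points elsewhere in this file; a rough sketch of that wiring, with the
  // exact names and flags living in generate_arraycopy_stubs():
  //
  //   StubRoutines::_jbyte_disjoint_arraycopy =
  //       generate_disjoint_byte_copy(false, &entry, "jbyte_disjoint_arraycopy");
  //
  // and similarly for the conjoint and wider-element variants.)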
1222 
1223   // Arguments:
1224   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1225   //             ignored
1226   //   name    - stub name string
1227   //
1228   // Inputs:
1229   //   c_rarg0   - source array address
1230   //   c_rarg1   - destination array address
1231   //   c_rarg2   - element count, treated as ssize_t, can be zero
1232   //
1233   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1234   // we let the hardware handle it.  The one to eight bytes within words,
1235   // dwords or qwords that span cache line boundaries will still be loaded
1236   // and stored atomically.
1237   //
1238   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1239                                       address* entry, const char *name) {
1240     const bool not_oop = false;
1241     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1242   }
1243 
1244   // Arguments:
1245   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1246   //             ignored
1247   //   name    - stub name string
1248   //
1249   // Inputs:
1250   //   c_rarg0   - source array address
1251   //   c_rarg1   - destination array address
1252   //   c_rarg2   - element count, treated as ssize_t, can be zero
1253   //
1254   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1255   // let the hardware handle it.  The two or four words within dwords
1256   // or qwords that span cache line boundaries will still be loaded
1257   // and stored atomically.
1258   //
1259   // Side Effects:
1260   //   disjoint_short_copy_entry is set to the no-overlap entry point
1261   //   used by generate_conjoint_short_copy().
1262   //
1263   address generate_disjoint_short_copy(bool aligned,
1264                                        address* entry, const char *name) {
1265     const bool not_oop = false;
1266     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1267   }
1268 
1269   // Arguments:
1270   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1271   //             ignored
1272   //   name    - stub name string
1273   //
1274   // Inputs:
1275   //   c_rarg0   - source array address
1276   //   c_rarg1   - destination array address
1277   //   c_rarg2   - element count, treated as ssize_t, can be zero
1278   //
1279   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1280   // let the hardware handle it.  The two or four words within dwords
1281   // or qwords that span cache line boundaries will still be loaded
1282   // and stored atomically.
1283   //
1284   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1285                                        address *entry, const char *name) {
1286     const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
1291   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1292   //             ignored
1293   //   name    - stub name string
1294   //
1295   // Inputs:
1296   //   c_rarg0   - source array address
1297   //   c_rarg1   - destination array address
1298   //   c_rarg2   - element count, treated as ssize_t, can be zero
1299   //
1300   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1301   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1303   //
1304   // Side Effects:
1305   //   disjoint_int_copy_entry is set to the no-overlap entry point
1306   //   used by generate_conjoint_int_oop_copy().
1307   //
1308   address generate_disjoint_int_copy(bool aligned, address *entry,
1309                                          const char *name, bool dest_uninitialized = false) {
1310     const bool not_oop = false;
1311     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1312   }
1313 
1314   // Arguments:
1315   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1316   //             ignored
1317   //   name    - stub name string
1318   //
1319   // Inputs:
1320   //   c_rarg0   - source array address
1321   //   c_rarg1   - destination array address
1322   //   c_rarg2   - element count, treated as ssize_t, can be zero
1323   //
1324   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1325   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1327   //
1328   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1329                                      address *entry, const char *name,
1330                                      bool dest_uninitialized = false) {
1331     const bool not_oop = false;
1332     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1333   }
1334 
1335 
1336   // Arguments:
1337   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1338   //             ignored
1339   //   name    - stub name string
1340   //
1341   // Inputs:
1342   //   c_rarg0   - source array address
1343   //   c_rarg1   - destination array address
1344   //   c_rarg2   - element count, treated as size_t, can be zero
1345   //
1346   // Side Effects:
1347   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1348   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1349   //
1350   address generate_disjoint_long_copy(bool aligned, address *entry,
1351                                           const char *name, bool dest_uninitialized = false) {
1352     const bool not_oop = false;
1353     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1354   }
1355 
1356   // Arguments:
1357   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1358   //             ignored
1359   //   name    - stub name string
1360   //
1361   // Inputs:
1362   //   c_rarg0   - source array address
1363   //   c_rarg1   - destination array address
1364   //   c_rarg2   - element count, treated as size_t, can be zero
1365   //
1366   address generate_conjoint_long_copy(bool aligned,
1367                                       address nooverlap_target, address *entry,
1368                                       const char *name, bool dest_uninitialized = false) {
1369     const bool not_oop = false;
1370     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1371   }
1372 
1373   // Arguments:
1374   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1375   //             ignored
1376   //   name    - stub name string
1377   //
1378   // Inputs:
1379   //   c_rarg0   - source array address
1380   //   c_rarg1   - destination array address
1381   //   c_rarg2   - element count, treated as size_t, can be zero
1382   //
1383   // Side Effects:
1384   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1385   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1386   //
1387   address generate_disjoint_oop_copy(bool aligned, address *entry,
1388                                      const char *name, bool dest_uninitialized = false) {
1389     const bool is_oop = true;
1390     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1391     return generate_disjoint_copy(size, aligned, is_oop, entry, name);
1392   }
1393 
1394   // Arguments:
1395   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1396   //             ignored
1397   //   name    - stub name string
1398   //
1399   // Inputs:
1400   //   c_rarg0   - source array address
1401   //   c_rarg1   - destination array address
1402   //   c_rarg2   - element count, treated as size_t, can be zero
1403   //
1404   address generate_conjoint_oop_copy(bool aligned,
1405                                      address nooverlap_target, address *entry,
1406                                      const char *name, bool dest_uninitialized = false) {
1407     const bool is_oop = true;
1408     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1409     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry, name);
1410   }
1411 
1412 
1413   // Helper for generating a dynamic type check.
1414   // Smashes rscratch1.
1415   void generate_type_check(Register sub_klass,
1416                            Register super_check_offset,
1417                            Register super_klass,
1418                            Label& L_success) {
1419     assert_different_registers(sub_klass, super_check_offset, super_klass);
1420 
1421     BLOCK_COMMENT("type_check:");
1422 
1423     Label L_miss;
1424 
1425     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1426                                      super_check_offset);
1427     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1428 
1429     // Fall through on failure!
1430     __ BIND(L_miss);
1431   }
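
       // Roughly, the fast/slow subtype check above amounts to (a sketch, not
       // the exact Klass layout):
       //   if (sub_klass == super_klass)                              goto L_success;
       //   if (*(sub_klass + super_check_offset) == super_klass)      goto L_success;
       //   if (super_klass is listed in sub_klass->secondary_supers)  goto L_success;
       //   // otherwise fall through to L_miss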
1432 
1433   //
1434   //  Generate checkcasting array copy stub
1435   //
1436   //  Input:
1437   //    c_rarg0   - source array address
1438   //    c_rarg1   - destination array address
1439   //    c_rarg2   - element count, treated as ssize_t, can be zero
1440   //    c_rarg3   - size_t ckoff (super_check_offset)
1441   //    c_rarg4   - oop ckval (super_klass)
1442   //
1443   //  Output:
1444   //    r0 ==  0  -  success
1445   //    r0 == -1^K - failure, where K is partial transfer count
1446   //
1447   address generate_checkcast_copy(const char *name, address *entry,
1448                                   bool dest_uninitialized = false) {
1449 
1450     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1451 
1452     // Input registers (after setup_arg_regs)
1453     const Register from        = c_rarg0;   // source array address
1454     const Register to          = c_rarg1;   // destination array address
1455     const Register count       = c_rarg2;   // elements count
1456     const Register ckoff       = c_rarg3;   // super_check_offset
1457     const Register ckval       = c_rarg4;   // super_klass
1458 
1459     // Registers used as temps (r18, r19, r20 and r21 are saved on entry)
1460     const Register count_save  = r21;       // original elements count
1461     const Register start_to    = r20;       // destination array start address
1462     const Register copied_oop  = r18;       // actual oop copied
1463     const Register r19_klass   = r19;       // oop._klass
1464 
1465     //---------------------------------------------------------------
1466     // Assembler stub will be used for this call to arraycopy
1467     // if the two arrays are subtypes of Object[] but the
1468     // destination array type is not equal to or a supertype
1469     // of the source type.  Each element must be separately
1470     // checked.
1471 
1472     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1473                                copied_oop, r19_klass, count_save);
1474 
1475     __ align(CodeEntryAlignment);
1476     StubCodeMark mark(this, "StubRoutines", name);
1477     address start = __ pc();
1478 
1479     __ enter(); // required for proper stackwalking of RuntimeStub frame
1480 
1481 #ifdef ASSERT
1482     // caller guarantees that the arrays really are different
1483     // otherwise, we would have to make conjoint checks
1484     { Label L;
1485       array_overlap_test(L, TIMES_OOP);
1486       __ stop("checkcast_copy within a single array");
1487       __ bind(L);
1488     }
1489 #endif //ASSERT
1490 
1491     // Caller of this entry point must set up the argument registers.
1492     if (entry != NULL) {
1493       *entry = __ pc();
1494       BLOCK_COMMENT("Entry:");
1495     }
1496 
1497      // Empty array:  Nothing to do.
1498     __ cbz(count, L_done);
1499 
1500     __ push(RegSet::of(r18, r19, r20, r21), sp);
1501 
1502 #ifdef ASSERT
1503     BLOCK_COMMENT("assert consistent ckoff/ckval");
1504     // The ckoff and ckval must be mutually consistent,
1505     // even though caller generates both.
1506     { Label L;
1507       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1508       __ ldrw(start_to, Address(ckval, sco_offset));
1509       __ cmpw(ckoff, start_to);
1510       __ br(Assembler::EQ, L);
1511       __ stop("super_check_offset inconsistent");
1512       __ bind(L);
1513     }
1514 #endif //ASSERT
1515 
1516     // save the original count
1517     __ mov(count_save, count);
1518 
1519     // Copy from low to high addresses
1520     __ mov(start_to, to);              // Save destination array start address
1521     __ b(L_load_element);
1522 
1523     // ======== begin loop ========
1524     // (Loop is rotated; its entry is L_load_element.)
1525     // Loop control:
1526     //   for (; count != 0; count--) {
1527     //     copied_oop = load_heap_oop(from++);
1528     //     ... generate_type_check ...;
1529     //     store_heap_oop(to++, copied_oop);
1530     //   }
1531     __ align(OptoLoopAlignment);
1532 
1533     __ BIND(L_store_element);
1534     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop);  // store the oop
1535     __ sub(count, count, 1);
1536     __ cbz(count, L_do_card_marks);
1537 
1538     // ======== loop entry is here ========
1539     __ BIND(L_load_element);
1540     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop
1541     __ cbz(copied_oop, L_store_element);
1542 
1543     __ load_klass(r19_klass, copied_oop); // query the object klass
1544     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1545     // ======== end loop ========
1546 
1547     // It was a real error; we must depend on the caller to finish the job.
1548     // Register count = remaining oops, count_save = total oops.
1549     // Emit GC store barriers for the oops we have copied and report
1550     // their number to the caller.
1551 
1552     __ subs(count, count_save, count);     // K = partially copied oop count
1553     __ eon(count, count, zr);                   // report (-1^K) to caller
1554     __ br(Assembler::EQ, L_done_pop);
1555 
1556     __ BIND(L_do_card_marks);
1557     __ add(to, to, -heapOopSize);         // make an inclusive end pointer
1558     gen_write_ref_array_post_barrier(start_to, to, rscratch1);
1559 
1560     __ bind(L_done_pop);
1561     __ pop(RegSet::of(r18, r19, r20, r21), sp);
1562     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1563 
1564     __ bind(L_done);
1565     __ mov(r0, count);
1566     __ leave();
1567     __ ret(lr);
1568 
1569     return start;
1570   }
1571 
1572   // Perform range checks on the proposed arraycopy.
1573   // Kills temp, but nothing else.
1574   // Also, clean the sign bits of src_pos and dst_pos.
1575   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1576                               Register src_pos, // source position (c_rarg1)
1577                               Register dst,     // destination array oop (c_rarg2)
1578                               Register dst_pos, // destination position (c_rarg3)
1579                               Register length,
1580                               Register temp,
1581                               Label& L_failed) {
1582     BLOCK_COMMENT("arraycopy_range_checks:");
1583 
1584     assert_different_registers(rscratch1, temp);
1585 
1586     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1587     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1588     __ addw(temp, length, src_pos);
1589     __ cmpw(temp, rscratch1);
1590     __ br(Assembler::HI, L_failed);
1591 
1592     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1593     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1594     __ addw(temp, length, dst_pos);
1595     __ cmpw(temp, rscratch1);
1596     __ br(Assembler::HI, L_failed);
1597 
1598     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1599     __ movw(src_pos, src_pos);
1600     __ movw(dst_pos, dst_pos);
1601 
1602     BLOCK_COMMENT("arraycopy_range_checks done");
1603   }
1604 
1605   // These stubs get called from some dumb test routine.
1606   // I'll write them properly when they're called from
1607   // something that's actually doing something.
1608   static void fake_arraycopy_stub(address src, address dst, int count) {
1609     assert(count == 0, "huh?");
1610   }
1611 
1612 
1613   //
1614   //  Generate 'unsafe' array copy stub
1615   //  Though just as safe as the other stubs, it takes an unscaled
1616   //  size_t argument instead of an element count.
1617   //
1618   //  Input:
1619   //    c_rarg0   - source array address
1620   //    c_rarg1   - destination array address
1621   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1622   //
1623   // Simply branches to the byte copy stub, which copes with operands of
1624   // arbitrary alignment.
1625   //
1626   address generate_unsafe_copy(const char *name,
1627                                address byte_copy_entry) {
1628 #ifdef PRODUCT
1629     return StubRoutines::_jbyte_arraycopy;
1630 #else
1631     __ align(CodeEntryAlignment);
1632     StubCodeMark mark(this, "StubRoutines", name);
1633     address start = __ pc();
1634     __ enter(); // required for proper stackwalking of RuntimeStub frame
1635     // bump this on entry, not on exit:
1636     __ lea(rscratch2, ExternalAddress((address)&SharedRuntime::_unsafe_array_copy_ctr));
1637     __ incrementw(Address(rscratch2));
1638     __ b(RuntimeAddress(byte_copy_entry));
1639     return start;
1640 #endif
1641   }
1642 
1643   //
1644   //  Generate generic array copy stubs
1645   //
1646   //  Input:
1647   //    c_rarg0    -  src oop
1648   //    c_rarg1    -  src_pos (32-bits)
1649   //    c_rarg2    -  dst oop
1650   //    c_rarg3    -  dst_pos (32-bits)
1651   //    c_rarg4    -  element count (32-bits)
1652   //
1653   //  Output:
1654   //    r0 ==  0  -  success
1655   //    r0 == -1^K - failure, where K is partial transfer count
1656   //
1657   address generate_generic_copy(const char *name,
1658                                 address byte_copy_entry, address short_copy_entry,
1659                                 address int_copy_entry, address oop_copy_entry,
1660                                 address long_copy_entry, address checkcast_copy_entry) {
1661 
1662     Label L_failed, L_failed_0, L_objArray;
1663     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1664 
1665     // Input registers
1666     const Register src        = c_rarg0;  // source array oop
1667     const Register src_pos    = c_rarg1;  // source position
1668     const Register dst        = c_rarg2;  // destination array oop
1669     const Register dst_pos    = c_rarg3;  // destination position
1670     const Register length     = c_rarg4;
1671 
1672     StubCodeMark mark(this, "StubRoutines", name);
1673 
1674     __ align(CodeEntryAlignment);
1675     address start = __ pc();
1676 
1677     __ enter(); // required for proper stackwalking of RuntimeStub frame
1678 
1679     // bump this on entry, not on exit:
1680     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1681 
1682     //-----------------------------------------------------------------------
1683     // Assembler stub will be used for this call to arraycopy
1684     // if the following conditions are met:
1685     //
1686     // (1) src and dst must not be null.
1687     // (2) src_pos must not be negative.
1688     // (3) dst_pos must not be negative.
1689     // (4) length  must not be negative.
1690     // (5) src klass and dst klass should be the same and not NULL.
1691     // (6) src and dst should be arrays.
1692     // (7) src_pos + length must not exceed length of src.
1693     // (8) dst_pos + length must not exceed length of dst.
1694     //
1695 
1696     //  if (src == NULL) return -1;
1697     __ cbz(src, L_failed);
1698 
1699     //  if (src_pos < 0) return -1;
1700     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
1701 
1702     //  if (dst == NULL) return -1;
1703     __ cbz(dst, L_failed);
1704 
1705     //  if (dst_pos < 0) return -1;
1706     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
1707 
1708     // registers used as temp
1709     const Register scratch_length    = r16; // elements count to copy
1710     const Register scratch_src_klass = r17; // array klass
1711     const Register lh                = r18; // layout helper
1712 
1713     //  if (length < 0) return -1;
1714     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
1715     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
1716 
1717     __ load_klass(scratch_src_klass, src);
1718 #ifdef ASSERT
1719     //  assert(src->klass() != NULL);
1720     {
1721       BLOCK_COMMENT("assert klasses not null {");
1722       Label L1, L2;
1723       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
1724       __ bind(L1);
1725       __ stop("broken null klass");
1726       __ bind(L2);
1727       __ load_klass(rscratch1, dst);
1728       __ cbz(rscratch1, L1);     // this would be broken also
1729       BLOCK_COMMENT("} assert klasses not null done");
1730     }
1731 #endif
1732 
1733     // Load layout helper (32-bits)
1734     //
1735     //  |array_tag|     | header_size | element_type |     |log2_element_size|
1736     // 32        30    24            16              8     2                 0
1737     //
1738     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
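         //   For example, a typeArray of jint has array_tag 0x3 (so lh is negative
         //   as a 32-bit int) and log2_element_size == 2, whereas every objArray's
         //   lh is exactly Klass::array_layout_helper(T_OBJECT).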
1739     //
1740 
1741     const int lh_offset = in_bytes(Klass::layout_helper_offset());
1742 
1743     // Handle objArrays completely differently...
1744     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
1745     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
1746     __ movw(rscratch1, objArray_lh);
1747     __ eorw(rscratch2, lh, rscratch1);
1748     __ cbzw(rscratch2, L_objArray);
1749 
1750     //  if (src->klass() != dst->klass()) return -1;
1751     __ load_klass(rscratch2, dst);
1752     __ eor(rscratch2, rscratch2, scratch_src_klass);
1753     __ cbnz(rscratch2, L_failed);
1754 
1755     //  if (!src->is_Array()) return -1;
1756     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
1757 
1758     // At this point, it is known to be a typeArray (array_tag 0x3).
1759 #ifdef ASSERT
1760     {
1761       BLOCK_COMMENT("assert primitive array {");
1762       Label L;
1763       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
1764       __ cmpw(lh, rscratch2);
1765       __ br(Assembler::GE, L);
1766       __ stop("must be a primitive array");
1767       __ bind(L);
1768       BLOCK_COMMENT("} assert primitive array done");
1769     }
1770 #endif
1771 
1772     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
1773                            rscratch2, L_failed);
1774 
1775     // TypeArrayKlass
1776     //
1777     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
1778     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
1779     //
1780 
1781     const Register rscratch1_offset = rscratch1;    // array offset
1782     const Register r18_elsize = lh; // element size
1783 
1784     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
1785            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
1786     __ add(src, src, rscratch1_offset);           // src array offset
1787     __ add(dst, dst, rscratch1_offset);           // dst array offset
1788     BLOCK_COMMENT("choose copy loop based on element size");
1789 
1790     // next registers should be set before the jump to corresponding stub
1791     const Register from     = c_rarg0;  // source array address
1792     const Register to       = c_rarg1;  // destination array address
1793     const Register count    = c_rarg2;  // elements count
1794 
1795     // 'from', 'to' and 'count' must be set in this order, because they
1796     // alias the incoming 'src', 'src_pos' and 'dst' registers.
1797 
1798     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
1799 
1800     // The possible values of elsize are 0-3, i.e. exact_log2(element
1801     // size in bytes).  We do a simple bitwise binary search.
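         // Concretely, the dispatch below keys off the two low bits of the
         // log2 element size: 0b00 -> byte copy, 0b01 -> short copy,
         // 0b10 -> int copy, 0b11 -> long copy.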
1802   __ BIND(L_copy_bytes);
1803     __ tbnz(r18_elsize, 1, L_copy_ints);
1804     __ tbnz(r18_elsize, 0, L_copy_shorts);
1805     __ lea(from, Address(src, src_pos));// src_addr
1806     __ lea(to,   Address(dst, dst_pos));// dst_addr
1807     __ movw(count, scratch_length); // length
1808     __ b(RuntimeAddress(byte_copy_entry));
1809 
1810   __ BIND(L_copy_shorts);
1811     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
1812     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
1813     __ movw(count, scratch_length); // length
1814     __ b(RuntimeAddress(short_copy_entry));
1815 
1816   __ BIND(L_copy_ints);
1817     __ tbnz(r18_elsize, 0, L_copy_longs);
1818     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
1819     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
1820     __ movw(count, scratch_length); // length
1821     __ b(RuntimeAddress(int_copy_entry));
1822 
1823   __ BIND(L_copy_longs);
1824 #ifdef ASSERT
1825     {
1826       BLOCK_COMMENT("assert long copy {");
1827       Label L;
1828       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
1829       __ cmpw(r18_elsize, LogBytesPerLong);
1830       __ br(Assembler::EQ, L);
1831       __ stop("must be long copy, but elsize is wrong");
1832       __ bind(L);
1833       BLOCK_COMMENT("} assert long copy done");
1834     }
1835 #endif
1836     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
1837     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
1838     __ movw(count, scratch_length); // length
1839     __ b(RuntimeAddress(long_copy_entry));
1840 
1841     // ObjArrayKlass
1842   __ BIND(L_objArray);
1843     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
1844 
1845     Label L_plain_copy, L_checkcast_copy;
1846     //  test array classes for subtyping
1847     __ load_klass(r18, dst);
1848     __ cmp(scratch_src_klass, r18); // usual case is exact equality
1849     __ br(Assembler::NE, L_checkcast_copy);
1850 
1851     // Identically typed arrays can be copied without element-wise checks.
1852     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
1853                            rscratch2, L_failed);
1854 
1855     __ lea(from, Address(src, src_pos, Address::lsl(3)));
1856     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
1857     __ lea(to, Address(dst, dst_pos, Address::lsl(3)));
1858     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
1859     __ movw(count, scratch_length); // length
1860   __ BIND(L_plain_copy);
1861     __ b(RuntimeAddress(oop_copy_entry));
1862 
1863   __ BIND(L_checkcast_copy);
1864     // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
1865     {
1866       // Before looking at dst.length, make sure dst is also an objArray.
1867       __ ldrw(rscratch1, Address(r18, lh_offset));
1868       __ movw(rscratch2, objArray_lh);
1869       __ eorw(rscratch1, rscratch1, rscratch2);
1870       __ cbnzw(rscratch1, L_failed);
1871 
1872       // It is safe to examine both src.length and dst.length.
1873       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
1874                              r18, L_failed);
1875 
1876       const Register rscratch2_dst_klass = rscratch2;
1877       __ load_klass(rscratch2_dst_klass, dst); // reload
1878 
1879       // Marshal the base address arguments now, freeing registers.
1880       __ lea(from, Address(src, src_pos, Address::lsl(3)));
1881       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
1882       __ lea(to, Address(dst, dst_pos, Address::lsl(3)));
1883       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
1884       __ movw(count, length);           // length (reloaded)
1885       Register sco_temp = c_rarg3;      // this register is free now
1886       assert_different_registers(from, to, count, sco_temp,
1887                                  rscratch2_dst_klass, scratch_src_klass);
1888       // assert_clean_int(count, sco_temp);
1889 
1890       // Generate the type check.
1891       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
1892       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
1893       // assert_clean_int(sco_temp, r18);
1894       generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy);
1895 
1896       // Fetch destination element klass from the ObjArrayKlass header.
1897       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
1898       __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset));
1899       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
1900 
1901       // the checkcast_copy loop needs two extra arguments:
1902       assert(c_rarg3 == sco_temp, "#3 already in place");
1903       // Set up arguments for checkcast_copy_entry.
1904       __ mov(c_rarg4, rscratch2_dst_klass);  // dst.klass.element_klass
1905       __ b(RuntimeAddress(checkcast_copy_entry));
1906     }
1907 
1908   __ BIND(L_failed);
1909     __ mov(r0, -1);
1910     __ leave();   // required for proper stackwalking of RuntimeStub frame
1911     __ ret(lr);
1912 
1913     return start;
1914   }
1915 
1916   void generate_arraycopy_stubs() {
1917     address entry;
1918     address entry_jbyte_arraycopy;
1919     address entry_jshort_arraycopy;
1920     address entry_jint_arraycopy;
1921     address entry_oop_arraycopy;
1922     address entry_jlong_arraycopy;
1923     address entry_checkcast_arraycopy;
1924 
1925     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
1926     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
1927 
1928     //*** jbyte
1929     // Always need aligned and unaligned versions
1930     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
1931                                                                                   "jbyte_disjoint_arraycopy");
1932     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
1933                                                                                   &entry_jbyte_arraycopy,
1934                                                                                   "jbyte_arraycopy");
1935     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
1936                                                                                   "arrayof_jbyte_disjoint_arraycopy");
1937     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
1938                                                                                   "arrayof_jbyte_arraycopy");
1939 
1940     //*** jshort
1941     // Always need aligned and unaligned versions
1942     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
1943                                                                                     "jshort_disjoint_arraycopy");
1944     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
1945                                                                                     &entry_jshort_arraycopy,
1946                                                                                     "jshort_arraycopy");
1947     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
1948                                                                                     "arrayof_jshort_disjoint_arraycopy");
1949     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
1950                                                                                     "arrayof_jshort_arraycopy");
1951 
1952     //*** jint
1953     // Aligned versions
1954     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
1955                                                                                 "arrayof_jint_disjoint_arraycopy");
1956     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
1957                                                                                 "arrayof_jint_arraycopy");
1958     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
1959     // entry_jint_arraycopy always points to the unaligned version
1960     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
1961                                                                                 "jint_disjoint_arraycopy");
1962     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
1963                                                                                 &entry_jint_arraycopy,
1964                                                                                 "jint_arraycopy");
1965 
1966     //*** jlong
1967     // It is always aligned
1968     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
1969                                                                                   "arrayof_jlong_disjoint_arraycopy");
1970     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
1971                                                                                   "arrayof_jlong_arraycopy");
1972     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
1973     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
1974 
1975     //*** oops
1976     {
1977       // With compressed oops we need unaligned versions; notice that
1978       // we overwrite entry_oop_arraycopy.
1979       bool aligned = !UseCompressedOops;
1980 
1981       StubRoutines::_arrayof_oop_disjoint_arraycopy
1982         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy");
1983       StubRoutines::_arrayof_oop_arraycopy
1984         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy");
1985       // Aligned versions without pre-barriers
1986       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
1987         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
1988                                      /*dest_uninitialized*/true);
1989       StubRoutines::_arrayof_oop_arraycopy_uninit
1990         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
1991                                      /*dest_uninitialized*/true);
1992     }
1993 
1994     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
1995     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
1996     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
1997     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
1998 
1999     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2000     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2001                                                                         /*dest_uninitialized*/true);
2002 
2003     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2004                                                               entry_jbyte_arraycopy);
2005 
2006     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2007                                                                entry_jbyte_arraycopy,
2008                                                                entry_jshort_arraycopy,
2009                                                                entry_jint_arraycopy,
2010                                                                entry_oop_arraycopy,
2011                                                                entry_jlong_arraycopy,
2012                                                                entry_checkcast_arraycopy);
2013 
2014   }
2015 
2016   void generate_math_stubs() { Unimplemented(); }
2017 
2018   // Arguments:
2019   //
2020   // Inputs:
2021   //   c_rarg0   - source byte array address
2022   //   c_rarg1   - destination byte array address
2023   //   c_rarg2   - K (key) in little endian int array
2024   //
2025   address generate_aescrypt_encryptBlock() {
2026     __ align(CodeEntryAlignment);
2027     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2028 
2029     Label L_doLast;
2030 
2031     const Register from        = c_rarg0;  // source array address
2032     const Register to          = c_rarg1;  // destination array address
2033     const Register key         = c_rarg2;  // key array address
2034     const Register keylen      = rscratch1;
2035 
2036     address start = __ pc();
2037     __ enter();
2038 
2039     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
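         // keylen is the number of 32-bit words in the expanded key schedule:
         // 44 for AES-128, 52 for AES-192 and 60 for AES-256 (10/12/14 rounds),
         // which is what the cmpw(keylen, 44/52) checks below branch on.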
2040 
2041     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2042 
2043     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2044     __ rev32(v1, __ T16B, v1);
2045     __ rev32(v2, __ T16B, v2);
2046     __ rev32(v3, __ T16B, v3);
2047     __ rev32(v4, __ T16B, v4);
2048     __ aese(v0, v1);
2049     __ aesmc(v0, v0);
2050     __ aese(v0, v2);
2051     __ aesmc(v0, v0);
2052     __ aese(v0, v3);
2053     __ aesmc(v0, v0);
2054     __ aese(v0, v4);
2055     __ aesmc(v0, v0);
2056 
2057     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2058     __ rev32(v1, __ T16B, v1);
2059     __ rev32(v2, __ T16B, v2);
2060     __ rev32(v3, __ T16B, v3);
2061     __ rev32(v4, __ T16B, v4);
2062     __ aese(v0, v1);
2063     __ aesmc(v0, v0);
2064     __ aese(v0, v2);
2065     __ aesmc(v0, v0);
2066     __ aese(v0, v3);
2067     __ aesmc(v0, v0);
2068     __ aese(v0, v4);
2069     __ aesmc(v0, v0);
2070 
2071     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2072     __ rev32(v1, __ T16B, v1);
2073     __ rev32(v2, __ T16B, v2);
2074 
2075     __ cmpw(keylen, 44);
2076     __ br(Assembler::EQ, L_doLast);
2077 
2078     __ aese(v0, v1);
2079     __ aesmc(v0, v0);
2080     __ aese(v0, v2);
2081     __ aesmc(v0, v0);
2082 
2083     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2084     __ rev32(v1, __ T16B, v1);
2085     __ rev32(v2, __ T16B, v2);
2086 
2087     __ cmpw(keylen, 52);
2088     __ br(Assembler::EQ, L_doLast);
2089 
2090     __ aese(v0, v1);
2091     __ aesmc(v0, v0);
2092     __ aese(v0, v2);
2093     __ aesmc(v0, v0);
2094 
2095     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2096     __ rev32(v1, __ T16B, v1);
2097     __ rev32(v2, __ T16B, v2);
2098 
2099     __ BIND(L_doLast);
2100 
2101     __ aese(v0, v1);
2102     __ aesmc(v0, v0);
2103     __ aese(v0, v2);
2104 
2105     __ ld1(v1, __ T16B, key);
2106     __ rev32(v1, __ T16B, v1);
2107     __ eor(v0, __ T16B, v0, v1);
2108 
2109     __ st1(v0, __ T16B, to);
2110 
2111     __ mov(r0, 0);
2112 
2113     __ leave();
2114     __ ret(lr);
2115 
2116     return start;
2117   }
2118 
2119   // Arguments:
2120   //
2121   // Inputs:
2122   //   c_rarg0   - source byte array address
2123   //   c_rarg1   - destination byte array address
2124   //   c_rarg2   - K (key) in little endian int array
2125   //
2126   address generate_aescrypt_decryptBlock() {
2127     assert(UseAES, "need AES instruction support");
2128     __ align(CodeEntryAlignment);
2129     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2130     Label L_doLast;
2131 
2132     const Register from        = c_rarg0;  // source array address
2133     const Register to          = c_rarg1;  // destination array address
2134     const Register key         = c_rarg2;  // key array address
2135     const Register keylen      = rscratch1;
2136 
2137     address start = __ pc();
2138     __ enter(); // required for proper stackwalking of RuntimeStub frame
2139 
2140     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
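         // As in the encrypt stub, keylen (44/52/60 words) selects 10/12/14
         // rounds.  The first 16 bytes of the schedule are kept in v5 and
         // applied by the final eor rather than in the aesd/aesimc chain.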
2141 
2142     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2143 
2144     __ ld1(v5, __ T16B, __ post(key, 16));
2145     __ rev32(v5, __ T16B, v5);
2146 
2147     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2148     __ rev32(v1, __ T16B, v1);
2149     __ rev32(v2, __ T16B, v2);
2150     __ rev32(v3, __ T16B, v3);
2151     __ rev32(v4, __ T16B, v4);
2152     __ aesd(v0, v1);
2153     __ aesimc(v0, v0);
2154     __ aesd(v0, v2);
2155     __ aesimc(v0, v0);
2156     __ aesd(v0, v3);
2157     __ aesimc(v0, v0);
2158     __ aesd(v0, v4);
2159     __ aesimc(v0, v0);
2160 
2161     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2162     __ rev32(v1, __ T16B, v1);
2163     __ rev32(v2, __ T16B, v2);
2164     __ rev32(v3, __ T16B, v3);
2165     __ rev32(v4, __ T16B, v4);
2166     __ aesd(v0, v1);
2167     __ aesimc(v0, v0);
2168     __ aesd(v0, v2);
2169     __ aesimc(v0, v0);
2170     __ aesd(v0, v3);
2171     __ aesimc(v0, v0);
2172     __ aesd(v0, v4);
2173     __ aesimc(v0, v0);
2174 
2175     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2176     __ rev32(v1, __ T16B, v1);
2177     __ rev32(v2, __ T16B, v2);
2178 
2179     __ cmpw(keylen, 44);
2180     __ br(Assembler::EQ, L_doLast);
2181 
2182     __ aesd(v0, v1);
2183     __ aesimc(v0, v0);
2184     __ aesd(v0, v2);
2185     __ aesimc(v0, v0);
2186 
2187     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2188     __ rev32(v1, __ T16B, v1);
2189     __ rev32(v2, __ T16B, v2);
2190 
2191     __ cmpw(keylen, 52);
2192     __ br(Assembler::EQ, L_doLast);
2193 
2194     __ aesd(v0, v1);
2195     __ aesimc(v0, v0);
2196     __ aesd(v0, v2);
2197     __ aesimc(v0, v0);
2198 
2199     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2200     __ rev32(v1, __ T16B, v1);
2201     __ rev32(v2, __ T16B, v2);
2202 
2203     __ BIND(L_doLast);
2204 
2205     __ aesd(v0, v1);
2206     __ aesimc(v0, v0);
2207     __ aesd(v0, v2);
2208 
2209     __ eor(v0, __ T16B, v0, v5);
2210 
2211     __ st1(v0, __ T16B, to);
2212 
2213     __ mov(r0, 0);
2214 
2215     __ leave();
2216     __ ret(lr);
2217 
2218     return start;
2219   }
2220 
2221   // Arguments:
2222   //
2223   // Inputs:
2224   //   c_rarg0   - source byte array address
2225   //   c_rarg1   - destination byte array address
2226   //   c_rarg2   - K (key) in little endian int array
2227   //   c_rarg3   - r vector byte array address
2228   //   c_rarg4   - input length
2229   //
2230   // Output:
2231   //   r0        - input length
2232   //
2233   address generate_cipherBlockChaining_encryptAESCrypt() {
2234     assert(UseAES, "need AES instruction support");
2235     __ align(CodeEntryAlignment);
2236     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2237 
2238     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2239 
2240     const Register from        = c_rarg0;  // source array address
2241     const Register to          = c_rarg1;  // destination array address
2242     const Register key         = c_rarg2;  // key array address
2243     const Register rvec        = c_rarg3;  // r vector byte array, initialized from the init vector (IV)
2244                                            // and left holding the last ciphertext block on exit
2245     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2246     const Register keylen      = rscratch1;
2247 
2248     address start = __ pc();
2249       __ enter();
2250 
2251       __ mov(rscratch2, len_reg);
2252       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2253 
2254       __ ld1(v0, __ T16B, rvec);
2255 
2256       __ cmpw(keylen, 52);
2257       __ br(Assembler::CC, L_loadkeys_44);
2258       __ br(Assembler::EQ, L_loadkeys_52);
2259 
2260       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2261       __ rev32(v17, __ T16B, v17);
2262       __ rev32(v18, __ T16B, v18);
2263     __ BIND(L_loadkeys_52);
2264       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2265       __ rev32(v19, __ T16B, v19);
2266       __ rev32(v20, __ T16B, v20);
2267     __ BIND(L_loadkeys_44);
2268       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2269       __ rev32(v21, __ T16B, v21);
2270       __ rev32(v22, __ T16B, v22);
2271       __ rev32(v23, __ T16B, v23);
2272       __ rev32(v24, __ T16B, v24);
2273       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2274       __ rev32(v25, __ T16B, v25);
2275       __ rev32(v26, __ T16B, v26);
2276       __ rev32(v27, __ T16B, v27);
2277       __ rev32(v28, __ T16B, v28);
2278       __ ld1(v29, v30, v31, __ T16B, key);
2279       __ rev32(v29, __ T16B, v29);
2280       __ rev32(v30, __ T16B, v30);
2281       __ rev32(v31, __ T16B, v31);
2282 
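         // Main CBC loop: for each 16-byte block, C[i] = E_K(P[i] ^ C[i-1]);
         // v0 carries the chaining value (the IV, then each ciphertext block).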
2283     __ BIND(L_aes_loop);
2284       __ ld1(v1, __ T16B, __ post(from, 16));
2285       __ eor(v0, __ T16B, v0, v1);
2286 
2287       __ br(Assembler::CC, L_rounds_44);
2288       __ br(Assembler::EQ, L_rounds_52);
2289 
2290       __ aese(v0, v17); __ aesmc(v0, v0);
2291       __ aese(v0, v18); __ aesmc(v0, v0);
2292     __ BIND(L_rounds_52);
2293       __ aese(v0, v19); __ aesmc(v0, v0);
2294       __ aese(v0, v20); __ aesmc(v0, v0);
2295     __ BIND(L_rounds_44);
2296       __ aese(v0, v21); __ aesmc(v0, v0);
2297       __ aese(v0, v22); __ aesmc(v0, v0);
2298       __ aese(v0, v23); __ aesmc(v0, v0);
2299       __ aese(v0, v24); __ aesmc(v0, v0);
2300       __ aese(v0, v25); __ aesmc(v0, v0);
2301       __ aese(v0, v26); __ aesmc(v0, v0);
2302       __ aese(v0, v27); __ aesmc(v0, v0);
2303       __ aese(v0, v28); __ aesmc(v0, v0);
2304       __ aese(v0, v29); __ aesmc(v0, v0);
2305       __ aese(v0, v30);
2306       __ eor(v0, __ T16B, v0, v31);
2307 
2308       __ st1(v0, __ T16B, __ post(to, 16));
2309       __ sub(len_reg, len_reg, 16);
2310       __ cbnz(len_reg, L_aes_loop);
2311 
2312       __ st1(v0, __ T16B, rvec);
2313 
2314       __ mov(r0, rscratch2);
2315 
2316       __ leave();
2317       __ ret(lr);
2318 
2319       return start;
2320   }
2321 
2322   // Arguments:
2323   //
2324   // Inputs:
2325   //   c_rarg0   - source byte array address
2326   //   c_rarg1   - destination byte array address
2327   //   c_rarg2   - K (key) in little endian int array
2328   //   c_rarg3   - r vector byte array address
2329   //   c_rarg4   - input length
2330   //
2331   // Output:
2332   //   r0        - input length
2333   //
2334   address generate_cipherBlockChaining_decryptAESCrypt() {
2335     assert(UseAES, "need AES instruction support");
2336     __ align(CodeEntryAlignment);
2337     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2338 
2339     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2340 
2341     const Register from        = c_rarg0;  // source array address
2342     const Register to          = c_rarg1;  // destination array address
2343     const Register key         = c_rarg2;  // key array address
2344     const Register rvec        = c_rarg3;  // r vector byte array, initialized from the init vector (IV)
2345                                            // and left holding the last input ciphertext block on exit
2346     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2347     const Register keylen      = rscratch1;
2348 
2349     address start = __ pc();
2350       __ enter();
2351 
2352       __ mov(rscratch2, len_reg);
2353       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2354 
2355       __ ld1(v2, __ T16B, rvec);
2356 
2357       __ ld1(v31, __ T16B, __ post(key, 16));
2358       __ rev32(v31, __ T16B, v31);
2359 
2360       __ cmpw(keylen, 52);
2361       __ br(Assembler::CC, L_loadkeys_44);
2362       __ br(Assembler::EQ, L_loadkeys_52);
2363 
2364       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2365       __ rev32(v17, __ T16B, v17);
2366       __ rev32(v18, __ T16B, v18);
2367     __ BIND(L_loadkeys_52);
2368       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2369       __ rev32(v19, __ T16B, v19);
2370       __ rev32(v20, __ T16B, v20);
2371     __ BIND(L_loadkeys_44);
2372       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2373       __ rev32(v21, __ T16B, v21);
2374       __ rev32(v22, __ T16B, v22);
2375       __ rev32(v23, __ T16B, v23);
2376       __ rev32(v24, __ T16B, v24);
2377       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2378       __ rev32(v25, __ T16B, v25);
2379       __ rev32(v26, __ T16B, v26);
2380       __ rev32(v27, __ T16B, v27);
2381       __ rev32(v28, __ T16B, v28);
2382       __ ld1(v29, v30, __ T16B, key);
2383       __ rev32(v29, __ T16B, v29);
2384       __ rev32(v30, __ T16B, v30);
2385 
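         // Main CBC loop: for each 16-byte block, P[i] = D_K(C[i]) ^ C[i-1];
         // v2 holds the previous ciphertext (initially the IV) and v1 saves the
         // current ciphertext before v0 is decrypted in place.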
2386     __ BIND(L_aes_loop);
2387       __ ld1(v0, __ T16B, __ post(from, 16));
2388       __ orr(v1, __ T16B, v0, v0);
2389 
2390       __ br(Assembler::CC, L_rounds_44);
2391       __ br(Assembler::EQ, L_rounds_52);
2392 
2393       __ aesd(v0, v17); __ aesimc(v0, v0);
2394       __ aesd(v0, v18); __ aesimc(v0, v0);
2395     __ BIND(L_rounds_52);
2396       __ aesd(v0, v19); __ aesimc(v0, v0);
2397       __ aesd(v0, v20); __ aesimc(v0, v0);
2398     __ BIND(L_rounds_44);
2399       __ aesd(v0, v21); __ aesimc(v0, v0);
2400       __ aesd(v0, v22); __ aesimc(v0, v0);
2401       __ aesd(v0, v23); __ aesimc(v0, v0);
2402       __ aesd(v0, v24); __ aesimc(v0, v0);
2403       __ aesd(v0, v25); __ aesimc(v0, v0);
2404       __ aesd(v0, v26); __ aesimc(v0, v0);
2405       __ aesd(v0, v27); __ aesimc(v0, v0);
2406       __ aesd(v0, v28); __ aesimc(v0, v0);
2407       __ aesd(v0, v29); __ aesimc(v0, v0);
2408       __ aesd(v0, v30);
2409       __ eor(v0, __ T16B, v0, v31);
2410       __ eor(v0, __ T16B, v0, v2);
2411 
2412       __ st1(v0, __ T16B, __ post(to, 16));
2413       __ orr(v2, __ T16B, v1, v1);
2414 
2415       __ sub(len_reg, len_reg, 16);
2416       __ cbnz(len_reg, L_aes_loop);
2417 
2418       __ st1(v2, __ T16B, rvec);
2419 
2420       __ mov(r0, rscratch2);
2421 
2422       __ leave();
2423       __ ret(lr);
2424 
2425     return start;
2426   }
2427 
2428   // Arguments:
2429   //
2430   // Inputs:
2431   //   c_rarg0   - byte[]  source+offset
2432   //   c_rarg1   - int[]   SHA.state
2433   //   c_rarg2   - int     offset
2434   //   c_rarg3   - int     limit
2435   //
2436   address generate_sha1_implCompress(bool multi_block, const char *name) {
2437     __ align(CodeEntryAlignment);
2438     StubCodeMark mark(this, "StubRoutines", name);
2439     address start = __ pc();
2440 
2441     Register buf   = c_rarg0;
2442     Register state = c_rarg1;
2443     Register ofs   = c_rarg2;
2444     Register limit = c_rarg3;
2445 
2446     Label keys;
2447     Label sha1_loop;
2448 
2449     // load the keys into v0..v3
2450     __ adr(rscratch1, keys);
2451     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
2452     // load 5 words state into v6, v7
2453     __ ldrq(v6, Address(state, 0));
2454     __ ldrs(v7, Address(state, 16));
2455 
2456 
2457     __ BIND(sha1_loop);
2458     // load 64 bytes of data into v16..v19
2459     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
2460     __ rev32(v16, __ T16B, v16);
2461     __ rev32(v17, __ T16B, v17);
2462     __ rev32(v18, __ T16B, v18);
2463     __ rev32(v19, __ T16B, v19);
2464 
2465     // do the sha1
2466     __ addv(v4, __ T4S, v16, v0);
2467     __ orr(v20, __ T16B, v6, v6);
2468 
2469     FloatRegister d0 = v16;
2470     FloatRegister d1 = v17;
2471     FloatRegister d2 = v18;
2472     FloatRegister d3 = v19;
2473 
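         // 80 SHA-1 rounds, processed four at a time over 20 iterations.  The
         // round function cycles through Ch (sha1c, rounds 0-19), Parity
         // (sha1p, 20-39), Maj (sha1m, 40-59) and Parity again (sha1p, 60-79);
         // sha1su0/sha1su1 extend the message schedule in the first 16 iterations.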
2474     for (int round = 0; round < 20; round++) {
2475       FloatRegister tmp1 = (round & 1) ? v4 : v5;
2476       FloatRegister tmp2 = (round & 1) ? v21 : v22;
2477       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
2478       FloatRegister tmp4 = (round & 1) ? v5 : v4;
2479       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
2480 
2481       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
2482       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
2483       __ sha1h(tmp2, __ T4S, v20);
2484       if (round < 5)
2485         __ sha1c(v20, __ T4S, tmp3, tmp4);
2486       else if (round < 10 || round >= 15)
2487         __ sha1p(v20, __ T4S, tmp3, tmp4);
2488       else
2489         __ sha1m(v20, __ T4S, tmp3, tmp4);
2490       if (round < 16) __ sha1su1(d0, __ T4S, d3);
2491 
2492       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
2493     }
2494 
2495     __ addv(v7, __ T2S, v7, v21);
2496     __ addv(v6, __ T4S, v6, v20);
2497 
2498     if (multi_block) {
2499       __ add(ofs, ofs, 64);
2500       __ cmp(ofs, limit);
2501       __ br(Assembler::LE, sha1_loop);
2502       __ mov(c_rarg0, ofs); // return ofs
2503     }
2504 
2505     __ strq(v6, Address(state, 0));
2506     __ strs(v7, Address(state, 16));
2507 
2508     __ ret(lr);
2509 
2510     __ bind(keys);
2511     __ emit_int32(0x5a827999);
2512     __ emit_int32(0x6ed9eba1);
2513     __ emit_int32(0x8f1bbcdc);
2514     __ emit_int32(0xca62c1d6);
2515 
2516     return start;
2517   }
2518 
2519 
2520   // Arguments:
2521   //
2522   // Inputs:
2523   //   c_rarg0   - byte[]  source+offset
2524   //   c_rarg1   - int[]   SHA.state
2525   //   c_rarg2   - int     offset
2526   //   c_rarg3   - int     limit
2527   //
2528   address generate_sha256_implCompress(bool multi_block, const char *name) {
2529     static const uint32_t round_consts[64] = {
2530       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
2531       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
2532       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
2533       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
2534       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
2535       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
2536       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
2537       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
2538       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
2539       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
2540       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
2541       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
2542       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
2543       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
2544       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
2545       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
2546     };
2547     __ align(CodeEntryAlignment);
2548     StubCodeMark mark(this, "StubRoutines", name);
2549     address start = __ pc();
2550 
2551     Register buf   = c_rarg0;
2552     Register state = c_rarg1;
2553     Register ofs   = c_rarg2;
2554     Register limit = c_rarg3;
2555 
2556     Label sha1_loop;
2557 
2558     __ stpd(v8, v9, __ pre(sp, -32));
2559     __ stpd(v10, v11, Address(sp, 16));
2560 
2561 // dga == v0
2562 // dgb == v1
2563 // dg0 == v2
2564 // dg1 == v3
2565 // dg2 == v4
2566 // t0 == v6
2567 // t1 == v7
2568 
2569     // load 16 keys to v16..v31
2570     __ lea(rscratch1, ExternalAddress((address)round_consts));
2571     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
2572     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
2573     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
2574     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
2575 
2576     // load 8 words (256 bits) state
2577     __ ldpq(v0, v1, state);
2578 
2579     __ BIND(sha1_loop);
2580     // load 64 bytes of data into v8..v11
2581     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
2582     __ rev32(v8, __ T16B, v8);
2583     __ rev32(v9, __ T16B, v9);
2584     __ rev32(v10, __ T16B, v10);
2585     __ rev32(v11, __ T16B, v11);
2586 
2587     __ addv(v6, __ T4S, v8, v16);
2588     __ orr(v2, __ T16B, v0, v0);
2589     __ orr(v3, __ T16B, v1, v1);
2590 
2591     FloatRegister d0 = v8;
2592     FloatRegister d1 = v9;
2593     FloatRegister d2 = v10;
2594     FloatRegister d3 = v11;
2595 
2596 
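         // 64 SHA-256 rounds, processed four at a time over 16 iterations:
         // sha256h/sha256h2 update the two 128-bit halves of the state using the
         // pre-added key+schedule words, and sha256su0/sha256su1 extend the
         // message schedule in the first 12 iterations.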
2597     for (int round = 0; round < 16; round++) {
2598       FloatRegister tmp1 = (round & 1) ? v6 : v7;
2599       FloatRegister tmp2 = (round & 1) ? v7 : v6;
2600       FloatRegister tmp3 = (round & 1) ? v2 : v4;
2601       FloatRegister tmp4 = (round & 1) ? v4 : v2;
2602 
2603       if (round < 12) __ sha256su0(d0, __ T4S, d1);
2604        __ orr(v4, __ T16B, v2, v2);
2605       if (round < 15)
2606         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
2607       __ sha256h(v2, __ T4S, v3, tmp2);
2608       __ sha256h2(v3, __ T4S, v4, tmp2);
2609       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
2610 
2611       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
2612     }
2613 
2614     __ addv(v0, __ T4S, v0, v2);
2615     __ addv(v1, __ T4S, v1, v3);
2616 
2617     if (multi_block) {
2618       __ add(ofs, ofs, 64);
2619       __ cmp(ofs, limit);
2620       __ br(Assembler::LE, sha1_loop);
2621       __ mov(c_rarg0, ofs); // return ofs
2622     }
2623 
2624     __ ldpd(v10, v11, Address(sp, 16));
2625     __ ldpd(v8, v9, __ post(sp, 32));
2626 
2627     __ stpq(v0, v1, state);
2628 
2629     __ ret(lr);
2630 
2631     return start;
2632   }
2633 
2634 #ifndef BUILTIN_SIM
2635   // Safefetch stubs.
2636   void generate_safefetch(const char* name, int size, address* entry,
2637                           address* fault_pc, address* continuation_pc) {
2638     // safefetch signatures:
2639     //   int      SafeFetch32(int*      adr, int      errValue);
2640     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
2641     //
2642     // arguments:
2643     //   c_rarg0 = adr
2644     //   c_rarg1 = errValue
2645     //
2646     // result:
2647     //   r0 = *adr or errValue
2648 
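         // If the load at *fault_pc faults, the VM's signal handler is expected
         // to resume execution at *continuation_pc; c_rarg1 then still holds
         // errValue, so r0 ends up as either the loaded value or errValue.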
2649     StubCodeMark mark(this, "StubRoutines", name);
2650 
2651     // Entry point, pc or function descriptor.
2652     *entry = __ pc();
2653 
2654     // Load *adr into c_rarg1, may fault.
2655     *fault_pc = __ pc();
2656     switch (size) {
2657       case 4:
2658         // int32_t
2659         __ ldrw(c_rarg1, Address(c_rarg0, 0));
2660         break;
2661       case 8:
2662         // int64_t
2663         __ ldr(c_rarg1, Address(c_rarg0, 0));
2664         break;
2665       default:
2666         ShouldNotReachHere();
2667     }
2668 
2669     // return errValue or *adr
2670     *continuation_pc = __ pc();
2671     __ mov(r0, c_rarg1);
2672     __ ret(lr);
2673   }
2674 #endif
2675 
2676   /**
2677    *  Arguments:
2678    *
2679    * Inputs:
2680    *   c_rarg0   - int crc
2681    *   c_rarg1   - byte* buf
2682    *   c_rarg2   - int length
2683    *
2684    * Output:
2685    *       r0   - int crc result
2686    */
2687   address generate_updateBytesCRC32() {
2688     assert(UseCRC32Intrinsics, "what are we doing here?");
2689 
2690     __ align(CodeEntryAlignment);
2691     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
2692 
2693     address start = __ pc();
2694 
2695     const Register crc   = c_rarg0;  // crc
2696     const Register buf   = c_rarg1;  // source java byte array address
2697     const Register len   = c_rarg2;  // length
2698     const Register table0 = c_rarg3; // crc_table address
2699     const Register table1 = c_rarg4;
2700     const Register table2 = c_rarg5;
2701     const Register table3 = c_rarg6;
2702     const Register tmp3 = c_rarg7;
2703 
2704     BLOCK_COMMENT("Entry:");
2705     __ enter(); // required for proper stackwalking of RuntimeStub frame
2706 
2707     __ kernel_crc32(crc, buf, len,
2708               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
2709 
2710     __ leave(); // required for proper stackwalking of RuntimeStub frame
2711     __ ret(lr);
2712 
2713     return start;
2714   }
2715 
2716   /**
2717    *  Arguments:
2718    *
2719    * Inputs:
2720    *   c_rarg0   - int crc
2721    *   c_rarg1   - byte* buf
2722    *   c_rarg2   - int length
2723    *   c_rarg3   - int* table
2724    *
2725    * Output:
2726    *       r0   - int crc result
2727    */
2728   address generate_updateBytesCRC32C() {
2729     assert(UseCRC32CIntrinsics, "what are we doing here?");
2730 
2731     __ align(CodeEntryAlignment);
2732     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
2733 
2734     address start = __ pc();
2735 
2736     const Register crc   = c_rarg0;  // crc
2737     const Register buf   = c_rarg1;  // source java byte array address
2738     const Register len   = c_rarg2;  // length
2739     const Register table0 = c_rarg3; // crc_table address
2740     const Register table1 = c_rarg4;
2741     const Register table2 = c_rarg5;
2742     const Register table3 = c_rarg6;
2743     const Register tmp3 = c_rarg7;
2744 
2745     BLOCK_COMMENT("Entry:");
2746     __ enter(); // required for proper stackwalking of RuntimeStub frame
2747 
2748     __ kernel_crc32c(crc, buf, len,
2749               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
2750 
2751     __ leave(); // required for proper stackwalking of RuntimeStub frame
2752     __ ret(lr);
2753 
2754     return start;
2755   }
2756 
2757   /**
2758    *  Arguments:
2759    *
2760    *  Inputs:
2761    *   c_rarg0   - int   adler
2762    *   c_rarg1   - byte* buff
2763    *   c_rarg2   - int   len
2764    *
2765    * Output:
2766    *   c_rarg0   - int adler result
2767    */
2768   address generate_updateBytesAdler32() {
2769     __ align(CodeEntryAlignment);
2770     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
2771     address start = __ pc();
2772 
2773     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
2774 
2775     // Aliases
2776     Register adler  = c_rarg0;
2777     Register s1     = c_rarg0;
2778     Register s2     = c_rarg3;
2779     Register buff   = c_rarg1;
2780     Register len    = c_rarg2;
2781     Register nmax  = r4;
2782     Register base = r5;
2783     Register count = r6;
2784     Register temp0 = rscratch1;
2785     Register temp1 = rscratch2;
2786     Register temp2 = r7;
2787 
2788     // Max number of bytes we can process before having to take the mod
2789     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
2790     unsigned long BASE = 0xfff1;
2791     unsigned long NMAX = 0x15B0;
2792 
2793     __ mov(base, BASE);
2794     __ mov(nmax, NMAX);
2795 
2796     // s1 is initialized to the lower 16 bits of adler
2797     // s2 is initialized to the upper 16 bits of adler
2798     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
2799     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
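
         // Adler-32 recurrence, per input byte b:
         //   s1 = (s1 + b)  mod 65521
         //   s2 = (s2 + s1) mod 65521
         // with the checksum being (s2 << 16) | s1; the mod is deferred and only
         // taken every NMAX bytes (or at the end).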
2800 
2801     // The pipelined loop needs at least 16 bytes per iteration.  It checks
2802     // this itself, but for short inputs it is cheaper to skip straight to the cleanup loop.
2803     __ cmp(len, 16);
2804     __ br(Assembler::HS, L_nmax);
2805     __ cbz(len, L_combine);
2806 
2807     __ bind(L_simple_by1_loop);
2808     __ ldrb(temp0, Address(__ post(buff, 1)));
2809     __ add(s1, s1, temp0);
2810     __ add(s2, s2, s1);
2811     __ subs(len, len, 1);
2812     __ br(Assembler::HI, L_simple_by1_loop);
2813 
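         // The reductions below use 2^16 == 15 (mod 65521): x mod BASE can be
         // computed by folding x into (x & 0xffff) + 15 * (x >> 16) (the
         // lsr/lsl/sub/add sequences) and finishing with a conditional subtract
         // of BASE.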
2814     // s1 = s1 % BASE
2815     __ subs(temp0, s1, base);
2816     __ csel(s1, temp0, s1, Assembler::HS);
2817 
2818     // s2 = s2 % BASE
2819     __ lsr(temp0, s2, 16);
2820     __ lsl(temp1, temp0, 4);
2821     __ sub(temp1, temp1, temp0);
2822     __ add(s2, temp1, s2, ext::uxth);
2823 
2824     __ subs(temp0, s2, base);
2825     __ csel(s2, temp0, s2, Assembler::HS);
2826 
2827     __ b(L_combine);
2828 
2829     __ bind(L_nmax);
2830     __ subs(len, len, nmax);
2831     __ sub(count, nmax, 16);
2832     __ br(Assembler::LO, L_by16);
2833 
2834     __ bind(L_nmax_loop);
2835 
2836     __ ldp(temp0, temp1, Address(__ post(buff, 16)));
2837 
2838     __ add(s1, s1, temp0, ext::uxtb);
2839     __ ubfx(temp2, temp0, 8, 8);
2840     __ add(s2, s2, s1);
2841     __ add(s1, s1, temp2);
2842     __ ubfx(temp2, temp0, 16, 8);
2843     __ add(s2, s2, s1);
2844     __ add(s1, s1, temp2);
2845     __ ubfx(temp2, temp0, 24, 8);
2846     __ add(s2, s2, s1);
2847     __ add(s1, s1, temp2);
2848     __ ubfx(temp2, temp0, 32, 8);
2849     __ add(s2, s2, s1);
2850     __ add(s1, s1, temp2);
2851     __ ubfx(temp2, temp0, 40, 8);
2852     __ add(s2, s2, s1);
2853     __ add(s1, s1, temp2);
2854     __ ubfx(temp2, temp0, 48, 8);
2855     __ add(s2, s2, s1);
2856     __ add(s1, s1, temp2);
2857     __ add(s2, s2, s1);
2858     __ add(s1, s1, temp0, Assembler::LSR, 56);
2859     __ add(s2, s2, s1);
2860 
2861     __ add(s1, s1, temp1, ext::uxtb);
2862     __ ubfx(temp2, temp1, 8, 8);
2863     __ add(s2, s2, s1);
2864     __ add(s1, s1, temp2);
2865     __ ubfx(temp2, temp1, 16, 8);
2866     __ add(s2, s2, s1);
2867     __ add(s1, s1, temp2);
2868     __ ubfx(temp2, temp1, 24, 8);
2869     __ add(s2, s2, s1);
2870     __ add(s1, s1, temp2);
2871     __ ubfx(temp2, temp1, 32, 8);
2872     __ add(s2, s2, s1);
2873     __ add(s1, s1, temp2);
2874     __ ubfx(temp2, temp1, 40, 8);
2875     __ add(s2, s2, s1);
2876     __ add(s1, s1, temp2);
2877     __ ubfx(temp2, temp1, 48, 8);
2878     __ add(s2, s2, s1);
2879     __ add(s1, s1, temp2);
2880     __ add(s2, s2, s1);
2881     __ add(s1, s1, temp1, Assembler::LSR, 56);
2882     __ add(s2, s2, s1);
2883 
2884     __ subs(count, count, 16);
2885     __ br(Assembler::HS, L_nmax_loop);
2886 
2887     // s1 = s1 % BASE
2888     __ lsr(temp0, s1, 16);
2889     __ lsl(temp1, temp0, 4);
2890     __ sub(temp1, temp1, temp0);
2891     __ add(temp1, temp1, s1, ext::uxth);
2892 
2893     __ lsr(temp0, temp1, 16);
2894     __ lsl(s1, temp0, 4);
2895     __ sub(s1, s1, temp0);
2896     __ add(s1, s1, temp1, ext::uxth);
2897 
2898     __ subs(temp0, s1, base);
2899     __ csel(s1, temp0, s1, Assembler::HS);
2900 
2901     // s2 = s2 % BASE
2902     __ lsr(temp0, s2, 16);
2903     __ lsl(temp1, temp0, 4);
2904     __ sub(temp1, temp1, temp0);
2905     __ add(temp1, temp1, s2, ext::uxth);
2906 
2907     __ lsr(temp0, temp1, 16);
2908     __ lsl(s2, temp0, 4);
2909     __ sub(s2, s2, temp0);
2910     __ add(s2, s2, temp1, ext::uxth);
2911 
2912     __ subs(temp0, s2, base);
2913     __ csel(s2, temp0, s2, Assembler::HS);
2914 
2915     __ subs(len, len, nmax);
2916     __ sub(count, nmax, 16);
2917     __ br(Assembler::HS, L_nmax_loop);
2918 
2919     __ bind(L_by16);
2920     __ adds(len, len, count);
2921     __ br(Assembler::LO, L_by1);
2922 
2923     __ bind(L_by16_loop);
2924 
2925     __ ldp(temp0, temp1, Address(__ post(buff, 16)));
2926 
2927     __ add(s1, s1, temp0, ext::uxtb);
2928     __ ubfx(temp2, temp0, 8, 8);
2929     __ add(s2, s2, s1);
2930     __ add(s1, s1, temp2);
2931     __ ubfx(temp2, temp0, 16, 8);
2932     __ add(s2, s2, s1);
2933     __ add(s1, s1, temp2);
2934     __ ubfx(temp2, temp0, 24, 8);
2935     __ add(s2, s2, s1);
2936     __ add(s1, s1, temp2);
2937     __ ubfx(temp2, temp0, 32, 8);
2938     __ add(s2, s2, s1);
2939     __ add(s1, s1, temp2);
2940     __ ubfx(temp2, temp0, 40, 8);
2941     __ add(s2, s2, s1);
2942     __ add(s1, s1, temp2);
2943     __ ubfx(temp2, temp0, 48, 8);
2944     __ add(s2, s2, s1);
2945     __ add(s1, s1, temp2);
2946     __ add(s2, s2, s1);
2947     __ add(s1, s1, temp0, Assembler::LSR, 56);
2948     __ add(s2, s2, s1);
2949 
2950     __ add(s1, s1, temp1, ext::uxtb);
2951     __ ubfx(temp2, temp1, 8, 8);
2952     __ add(s2, s2, s1);
2953     __ add(s1, s1, temp2);
2954     __ ubfx(temp2, temp1, 16, 8);
2955     __ add(s2, s2, s1);
2956     __ add(s1, s1, temp2);
2957     __ ubfx(temp2, temp1, 24, 8);
2958     __ add(s2, s2, s1);
2959     __ add(s1, s1, temp2);
2960     __ ubfx(temp2, temp1, 32, 8);
2961     __ add(s2, s2, s1);
2962     __ add(s1, s1, temp2);
2963     __ ubfx(temp2, temp1, 40, 8);
2964     __ add(s2, s2, s1);
2965     __ add(s1, s1, temp2);
2966     __ ubfx(temp2, temp1, 48, 8);
2967     __ add(s2, s2, s1);
2968     __ add(s1, s1, temp2);
2969     __ add(s2, s2, s1);
2970     __ add(s1, s1, temp1, Assembler::LSR, 56);
2971     __ add(s2, s2, s1);
2972 
2973     __ subs(len, len, 16);
2974     __ br(Assembler::HS, L_by16_loop);
2975 
2976     __ bind(L_by1);
2977     __ adds(len, len, 15);
2978     __ br(Assembler::LO, L_do_mod);
2979 
2980     __ bind(L_by1_loop);
2981     __ ldrb(temp0, Address(__ post(buff, 1)));
2982     __ add(s1, temp0, s1);
2983     __ add(s2, s2, s1);
2984     __ subs(len, len, 1);
2985     __ br(Assembler::HS, L_by1_loop);
2986 
2987     __ bind(L_do_mod);
2988     // s1 = s1 % BASE
2989     __ lsr(temp0, s1, 16);
2990     __ lsl(temp1, temp0, 4);
2991     __ sub(temp1, temp1, temp0);
2992     __ add(temp1, temp1, s1, ext::uxth);
2993 
2994     __ lsr(temp0, temp1, 16);
2995     __ lsl(s1, temp0, 4);
2996     __ sub(s1, s1, temp0);
2997     __ add(s1, s1, temp1, ext::uxth);
2998 
2999     __ subs(temp0, s1, base);
3000     __ csel(s1, temp0, s1, Assembler::HS);
3001 
3002     // s2 = s2 % BASE
3003     __ lsr(temp0, s2, 16);
3004     __ lsl(temp1, temp0, 4);
3005     __ sub(temp1, temp1, temp0);
3006     __ add(temp1, temp1, s2, ext::uxth);
3007 
3008     __ lsr(temp0, temp1, 16);
3009     __ lsl(s2, temp0, 4);
3010     __ sub(s2, s2, temp0);
3011     __ add(s2, s2, temp1, ext::uxth);
3012 
3013     __ subs(temp0, s2, base);
3014     __ csel(s2, temp0, s2, Assembler::HS);
3015 
3016     // Combine lower bits and higher bits
3017     __ bind(L_combine);
3018     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3019 
3020     __ ret(lr);
3021 
3022     return start;
3023   }
3024 
3025   /**
3026    *  Arguments:
3027    *
3028    *  Input:
3029    *    c_rarg0   - x address
3030    *    c_rarg1   - x length
3031    *    c_rarg2   - y address
3032    *    c_rarg3   - y length
3033    *    c_rarg4   - z address
3034    *    c_rarg5   - z length
3035    */
3036   address generate_multiplyToLen() {
3037     __ align(CodeEntryAlignment);
3038     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3039 
3040     address start = __ pc();
3041     const Register x     = r0;
3042     const Register xlen  = r1;
3043     const Register y     = r2;
3044     const Register ylen  = r3;
3045     const Register z     = r4;
3046     const Register zlen  = r5;
3047 
3048     const Register tmp1  = r10;
3049     const Register tmp2  = r11;
3050     const Register tmp3  = r12;
3051     const Register tmp4  = r13;
3052     const Register tmp5  = r14;
3053     const Register tmp6  = r15;
3054     const Register tmp7  = r16;
3055 
3056     BLOCK_COMMENT("Entry:");
3057     __ enter(); // required for proper stackwalking of RuntimeStub frame
3058     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3059     __ leave(); // required for proper stackwalking of RuntimeStub frame
3060     __ ret(lr);
3061 
3062     return start;
3063   }
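       // In C, approximately, multiply_to_len computes the schoolbook
       // product of two big-endian int arrays (a sketch which assumes z is
       // zero-initialized and zlen == xlen + ylen; the real helper handles
       // the initial pass and carries somewhat differently):
       //
       //   for (int i = xlen - 1; i >= 0; i--) {
       //     unsigned long carry = 0;
       //     for (int j = ylen - 1, k = i + j + 1; j >= 0; j--, k--) {
       //       unsigned long p = (unsigned long)x[i] * y[j] + z[k] + carry;
       //       z[k] = (unsigned int)p;
       //       carry = p >> 32;
       //     }
       //     z[i] = (unsigned int)carry;
       //   }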
3064 
3065   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3066                       FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3067                       FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3068     // Karatsuba multiplication performs a 128*128 -> 256-bit
3069     // multiplication in three 128-bit multiplications and a few
3070     // additions.
3071     //
3072     // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3073     // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3074     //
3075     // Inputs:
3076     //
3077     // A0 in a.d[0]     (subkey)
3078     // A1 in a.d[1]
3079     // (A1+A0) in a1_xor_a0.d[0]
3080     //
3081     // B0 in b.d[0]     (state)
3082     // B1 in b.d[1]
3083 
3084     __ ext(tmp1, __ T16B, b, b, 0x08);
3085     __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
3086     __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
3087     __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
3088     __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3089 
3090     __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3091     __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3092     __ eor(tmp2, __ T16B, tmp2, tmp4);
3093     __ eor(tmp2, __ T16B, tmp2, tmp3);
3094 
3095     // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3096     __ ins(result_hi, __ D, tmp2, 0, 1);
3097     __ ins(result_lo, __ D, tmp2, 1, 0);
3098   }
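       // In C-like pseudocode the combination above computes, with 128-bit
       // values, ^ meaning GF(2) addition and clmul a carry-less multiply
       // (a sketch of the identity only, not of the exact lane shuffling):
       //
       //   C      = clmul(A1, B1);
       //   D      = clmul(A0, B0);
       //   E      = clmul(A1 ^ A0, B1 ^ B0);
       //   mid    = C ^ D ^ E;                     // == A1*B0 ^ A0*B1
       //   result = (C << 128) ^ (mid << 64) ^ D;  // 256-bit product in hi:lo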
3099 
3100   void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3101                     FloatRegister p, FloatRegister z, FloatRegister t1) {
3102     const FloatRegister t0 = result;
3103 
3104     // The GCM field polynomial f is z^128 + p(z), where p =
3105     // z^7+z^2+z+1.
3106     //
3107     //    z^128 === -p(z)  (mod (z^128 + p(z)))
3108     //
3109     // so, given that the product we're reducing is
3110     //    a == lo + hi * z^128
3111     // substituting,
3112     //      === lo - hi * p(z)  (mod (z^128 + p(z)))
3113     //
3114     // we reduce by multiplying hi by p(z) and subtracting the result
3115     // from (i.e. XORing it with) lo.  Because p has no nonzero high
3116     // bits we can do this with two 64-bit multiplications, lo*p and
3117     // hi*p.
3118 
3119     __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3120     __ ext(t1, __ T16B, t0, z, 8);
3121     __ eor(hi, __ T16B, hi, t1);
3122     __ ext(t1, __ T16B, z, t0, 8);
3123     __ eor(lo, __ T16B, lo, t1);
3124     __ pmull(t0, __ T1Q, hi, p, __ T1D);
3125     __ eor(result, __ T16B, lo, t0);
3126   }
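       // In terms of the field arithmetic, the code above computes
       // (a sketch; the vector code folds in two 64-bit steps):
       //
       //   // product == lo ^ (hi << 128), and z^128 == p (mod f), so
       //   result = lo ^ clmul(hi, p);
       //
       // clmul(hi, p) can itself spill a few bits past bit 127, which is
       // why the high half of hi is folded down first.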
3127 
3128   /**
3129    *  Arguments:
3130    *
3131    *  Input:
3132    *  c_rarg0   - current state address
3133    *  c_rarg1   - H key address
3134    *  c_rarg2   - data address
3135    *  c_rarg3   - number of blocks
3136    *
3137    *  Output:
3138    *  Updated state at c_rarg0
3139    */
3140   address generate_ghash_processBlocks() {
3141     // Bafflingly, GCM uses little-endian for the byte order, but
3142     // big-endian for the bit order.  For example, the polynomial 1 is
3143     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
3144     //
3145     // So, we must either reverse the bytes in each word and do
3146     // everything big-endian or reverse the bits in each byte and do
3147     // it little-endian.  On AArch64 it's more idiomatic to reverse
3148     // the bits in each byte (we have an instruction, RBIT, to do
3149     // that) and keep the data in little-endian bit order throughout the
3150     // calculation, bit-reversing the inputs and outputs.
3151 
3152     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
3153     __ align(wordSize * 2);
3154     address p = __ pc();
3155     __ emit_int64(0x87);  // The low-order bits of the field
3156                           // polynomial (i.e. p = z^7+z^2+z+1)
3157                           // repeated in the low and high parts of a
3158                           // 128-bit vector
3159     __ emit_int64(0x87);
3160 
3161     __ align(CodeEntryAlignment);
3162     address start = __ pc();
3163 
3164     Register state   = c_rarg0;
3165     Register subkeyH = c_rarg1;
3166     Register data    = c_rarg2;
3167     Register blocks  = c_rarg3;
3168 
3169     FloatRegister vzr = v30;
3170     __ eor(vzr, __ T16B, vzr, vzr); // zero register
3171 
3172     __ ldrq(v0, Address(state));
3173     __ ldrq(v1, Address(subkeyH));
3174 
3175     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
3176     __ rbit(v0, __ T16B, v0);
3177     __ rev64(v1, __ T16B, v1);
3178     __ rbit(v1, __ T16B, v1);
3179 
3180     __ ldrq(v26, p);
3181 
3182     __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v16
3183     __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
3184 
3185     {
3186       Label L_ghash_loop;
3187       __ bind(L_ghash_loop);
3188 
3189       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
3190                                                  // reversing each byte
3191       __ rbit(v2, __ T16B, v2);
3192       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
3193 
3194       // Multiply state in v2 by subkey in v1
3195       ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
3196                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
3197                      /*temps*/v6, v20, v18, v21);
3198       // Reduce v7:v5 by the field polynomial
3199       ghash_reduce(v0, v5, v7, v26, vzr, v20);
3200 
3201       __ sub(blocks, blocks, 1);
3202       __ cbnz(blocks, L_ghash_loop);
3203     }
3204 
3205     // The bit-reversed result is at this point in v0
3206     __ rev64(v1, __ T16B, v0);
3207     __ rbit(v1, __ T16B, v1);
3208 
3209     __ st1(v1, __ T16B, state);
3210     __ ret(lr);
3211 
3212     return start;
3213   }
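       // In pseudocode the loop above implements the standard GHASH update
       // (a sketch; the bit-reversal bookkeeping described at the top of
       // this stub is folded into gf128_mul):
       //
       //   for (long i = 0; i < blocks; i++) {
       //     state = gf128_mul(state ^ data[i], H);  // multiply in GF(2^128)
       //   }
       //   // write state back to the buffer at c_rarg0
       //
       // where gf128_mul is the Karatsuba multiply followed by the
       // polynomial reduction implemented above.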
3214 
3215   // Continuation point for throwing of implicit exceptions that are
3216   // not handled in the current activation. Fabricates an exception
3217   // oop and initiates normal exception dispatching in this
3218   // frame. Since we need to preserve callee-saved values (currently
3219   // only for C2, but done for C1 as well) we need a callee-saved oop
3220   // map and therefore have to make these stubs into RuntimeStubs
3221   // rather than BufferBlobs.  If the compiler needs all registers to
3222   // be preserved between the fault point and the exception handler
3223   // then it must assume responsibility for that in
3224   // AbstractCompiler::continuation_for_implicit_null_exception or
3225   // continuation_for_implicit_division_by_zero_exception. All other
3226   // implicit exceptions (e.g., NullPointerException or
3227   // AbstractMethodError on entry) are either at call sites or
3228   // otherwise assume that stack unwinding will be initiated, so
3229   // caller saved registers were assumed volatile in the compiler.
3230 
3231 #undef __
3232 #define __ masm->
3233 
3234   address generate_throw_exception(const char* name,
3235                                    address runtime_entry,
3236                                    Register arg1 = noreg,
3237                                    Register arg2 = noreg) {
3238     // Information about frame layout at time of blocking runtime call.
3239     // Note that we only have to preserve callee-saved registers since
3240     // the compilers are responsible for supplying a continuation point
3241     // if they expect all registers to be preserved.
3242     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
3243     enum layout {
3244       rfp_off = 0,
3245       rfp_off2,
3246       return_off,
3247       return_off2,
3248       framesize // inclusive of return address
3249     };
3250 
3251     int insts_size = 512;
3252     int locs_size  = 64;
3253 
3254     CodeBuffer code(name, insts_size, locs_size);
3255     OopMapSet* oop_maps  = new OopMapSet();
3256     MacroAssembler* masm = new MacroAssembler(&code);
3257 
3258     address start = __ pc();
3259 
3260     // This is an inlined and slightly modified version of call_VM
3261     // which has the ability to fetch the return PC out of
3262     // thread-local storage and also sets up last_Java_sp slightly
3263     // differently than the real call_VM
3264 
3265     __ enter(); // Save FP and LR before call
3266 
3267     assert(is_even(framesize/2), "sp not 16-byte aligned");
3268 
3269     // lr and fp are already in place
3270     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
3271 
3272     int frame_complete = __ pc() - start;
3273 
3274     // Set up last_Java_sp and last_Java_fp
3275     address the_pc = __ pc();
3276     __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1);
3277 
3278     // Call runtime
3279     if (arg1 != noreg) {
3280       assert(arg2 != c_rarg1, "clobbered");
3281       __ mov(c_rarg1, arg1);
3282     }
3283     if (arg2 != noreg) {
3284       __ mov(c_rarg2, arg2);
3285     }
3286     __ mov(c_rarg0, rthread);
3287     BLOCK_COMMENT("call runtime_entry");
3288     __ mov(rscratch1, runtime_entry);
3289     __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);
3290 
3291     // Generate oop map
3292     OopMap* map = new OopMap(framesize, 0);
3293 
3294     oop_maps->add_gc_map(the_pc - start, map);
3295 
3296     __ reset_last_Java_frame(true, true);
3297     __ maybe_isb();
3298 
3299     __ leave();
3300 
3301     // check for pending exceptions
3302 #ifdef ASSERT
3303     Label L;
3304     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
3305     __ cbnz(rscratch1, L);
3306     __ should_not_reach_here();
3307     __ bind(L);
3308 #endif // ASSERT
3309     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3310 
3311 
3312     // codeBlob framesize is in words (not VMRegImpl::slot_size)
3313     RuntimeStub* stub =
3314       RuntimeStub::new_runtime_stub(name,
3315                                     &code,
3316                                     frame_complete,
3317                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3318                                     oop_maps, false);
3319     return stub->entry_point();
3320   }
3321 
3322   class MontgomeryMultiplyGenerator : public MacroAssembler {
3323 
3324     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
3325       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
3326 
3327     RegSet _toSave;
3328     bool _squaring;
3329 
3330   public:
3331     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
3332       : MacroAssembler(as->code()), _squaring(squaring) {
3333 
3334       // Register allocation
3335 
3336       Register reg = c_rarg0;
3337       Pa_base = reg;       // Argument registers
3338       if (squaring)
3339         Pb_base = Pa_base;
3340       else
3341         Pb_base = ++reg;
3342       Pn_base = ++reg;
3343       Rlen = ++reg;
3344       inv = ++reg;
3345       Pm_base = ++reg;
3346 
3347                           // Working registers:
3348       Ra =  ++reg;        // The current digit of a, b, n, and m.
3349       Rb =  ++reg;
3350       Rm =  ++reg;
3351       Rn =  ++reg;
3352 
3353       Pa =  ++reg;        // Pointers to the current/next digit of a, b, n, and m.
3354       Pb =  ++reg;
3355       Pm =  ++reg;
3356       Pn =  ++reg;
3357 
3358       t0 =  ++reg;        // Three registers which form a
3359       t1 =  ++reg;        // triple-precision accumulator.
3360       t2 =  ++reg;
3361 
3362       Ri =  ++reg;        // Inner and outer loop indexes.
3363       Rj =  ++reg;
3364 
3365       Rhi_ab = ++reg;     // Product registers: low and high parts
3366       Rlo_ab = ++reg;     // of a*b and m*n.
3367       Rhi_mn = ++reg;
3368       Rlo_mn = ++reg;
3369 
3370       // r19 and up are callee-saved.
3371       _toSave = RegSet::range(r19, reg) + Pm_base;
3372     }
3373 
3374   private:
3375     void save_regs() {
3376       push(_toSave, sp);
3377     }
3378 
3379     void restore_regs() {
3380       pop(_toSave, sp);
3381     }
3382 
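         // Run `block` exactly `count` times (count may be zero or odd),
         // emitting two copies of the block per loop iteration.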
3383     template <typename T>
3384     void unroll_2(Register count, T block) {
3385       Label loop, end, odd;
3386       tbnz(count, 0, odd);
3387       cbz(count, end);
3388       align(16);
3389       bind(loop);
3390       (this->*block)();
3391       bind(odd);
3392       (this->*block)();
3393       subs(count, count, 2);
3394       br(Assembler::GT, loop);
3395       bind(end);
3396     }
3397 
3398     template <typename T>
3399     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
3400       Label loop, end, odd;
3401       tbnz(count, 0, odd);
3402       cbz(count, end);
3403       align(16);
3404       bind(loop);
3405       (this->*block)(d, s, tmp);
3406       bind(odd);
3407       (this->*block)(d, s, tmp);
3408       subs(count, count, 2);
3409       br(Assembler::GT, loop);
3410       bind(end);
3411     }
3412 
3413     void pre1(RegisterOrConstant i) {
3414       block_comment("pre1");
3415       // Pa = Pa_base;
3416       // Pb = Pb_base + i;
3417       // Pm = Pm_base;
3418       // Pn = Pn_base + i;
3419       // Ra = *Pa;
3420       // Rb = *Pb;
3421       // Rm = *Pm;
3422       // Rn = *Pn;
3423       ldr(Ra, Address(Pa_base));
3424       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
3425       ldr(Rm, Address(Pm_base));
3426       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3427       lea(Pa, Address(Pa_base));
3428       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
3429       lea(Pm, Address(Pm_base));
3430       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3431 
3432       // Zero the m*n result.
3433       mov(Rhi_mn, zr);
3434       mov(Rlo_mn, zr);
3435     }
3436 
3437     // The core multiply-accumulate step of a Montgomery
3438     // multiplication.  The idea is to schedule operations as a
3439     // pipeline so that instructions with long latencies (loads and
3440     // multiplies) have time to complete before their results are
3441     // used.  This most benefits in-order implementations of the
3442     // architecture but out-of-order ones also benefit.
3443     void step() {
3444       block_comment("step");
3445       // MACC(Ra, Rb, t0, t1, t2);
3446       // Ra = *++Pa;
3447       // Rb = *--Pb;
3448       umulh(Rhi_ab, Ra, Rb);
3449       mul(Rlo_ab, Ra, Rb);
3450       ldr(Ra, pre(Pa, wordSize));
3451       ldr(Rb, pre(Pb, -wordSize));
3452       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
3453                                        // previous iteration.
3454       // MACC(Rm, Rn, t0, t1, t2);
3455       // Rm = *++Pm;
3456       // Rn = *--Pn;
3457       umulh(Rhi_mn, Rm, Rn);
3458       mul(Rlo_mn, Rm, Rn);
3459       ldr(Rm, pre(Pm, wordSize));
3460       ldr(Rn, pre(Pn, -wordSize));
3461       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
3462     }
3463 
3464     void post1() {
3465       block_comment("post1");
3466 
3467       // MACC(Ra, Rb, t0, t1, t2);
3468       // Ra = *++Pa;
3469       // Rb = *--Pb;
3470       umulh(Rhi_ab, Ra, Rb);
3471       mul(Rlo_ab, Ra, Rb);
3472       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
3473       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
3474 
3475       // *Pm = Rm = t0 * inv;
3476       mul(Rm, t0, inv);
3477       str(Rm, Address(Pm));
3478 
3479       // MACC(Rm, Rn, t0, t1, t2);
3480       // t0 = t1; t1 = t2; t2 = 0;
3481       umulh(Rhi_mn, Rm, Rn);
3482 
3483 #ifndef PRODUCT
3484       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
3485       {
3486         mul(Rlo_mn, Rm, Rn);
3487         add(Rlo_mn, t0, Rlo_mn);
3488         Label ok;
3489         cbz(Rlo_mn, ok); {
3490           stop("broken Montgomery multiply");
3491         } bind(ok);
3492       }
3493 #endif
3494       // We have very carefully set things up so that
3495       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
3496       // the lower half of Rm * Rn because we know the result already:
3497       // it must be -t0.  t0 + (-t0) must generate a carry iff
3498       // t0 != 0.  So, rather than do a mul and an adds we just set
3499       // the carry flag iff t0 is nonzero.
3500       //
3501       // mul(Rlo_mn, Rm, Rn);
3502       // adds(zr, t0, Rlo_mn);
3503       subs(zr, t0, 1); // Set carry iff t0 is nonzero
3504       adcs(t0, t1, Rhi_mn);
3505       adc(t1, t2, zr);
3506       mov(t2, zr);
3507     }
3508 
3509     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
3510       block_comment("pre2");
3511       // Pa = Pa_base + i-len;
3512       // Pb = Pb_base + len;
3513       // Pm = Pm_base + i-len;
3514       // Pn = Pn_base + len;
3515 
3516       if (i.is_register()) {
3517         sub(Rj, i.as_register(), len);
3518       } else {
3519         mov(Rj, i.as_constant());
3520         sub(Rj, Rj, len);
3521       }
3522       // Rj == i-len
3523 
3524       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
3525       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
3526       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
3527       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
3528 
3529       // Ra = *++Pa;
3530       // Rb = *--Pb;
3531       // Rm = *++Pm;
3532       // Rn = *--Pn;
3533       ldr(Ra, pre(Pa, wordSize));
3534       ldr(Rb, pre(Pb, -wordSize));
3535       ldr(Rm, pre(Pm, wordSize));
3536       ldr(Rn, pre(Pn, -wordSize));
3537 
3538       mov(Rhi_mn, zr);
3539       mov(Rlo_mn, zr);
3540     }
3541 
3542     void post2(RegisterOrConstant i, RegisterOrConstant len) {
3543       block_comment("post2");
3544       if (i.is_constant()) {
3545         mov(Rj, i.as_constant()-len.as_constant());
3546       } else {
3547         sub(Rj, i.as_register(), len);
3548       }
3549 
3550       adds(t0, t0, Rlo_mn); // The pending m*n, low part
3551 
3552       // As soon as we know the least significant digit of our result,
3553       // store it.
3554       // Pm_base[i-len] = t0;
3555       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
3556 
3557       // t0 = t1; t1 = t2; t2 = 0;
3558       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
3559       adc(t1, t2, zr);
3560       mov(t2, zr);
3561     }
3562 
3563     // A carry in t0 after Montgomery multiplication means that we
3564     // should subtract multiples of n from our result in m.  We'll
3565     // keep doing that until there is no carry.
3566     void normalize(RegisterOrConstant len) {
3567       block_comment("normalize");
3568       // while (t0)
3569       //   t0 = sub(Pm_base, Pn_base, t0, len);
3570       Label loop, post, again;
3571       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
3572       cbz(t0, post); {
3573         bind(again); {
3574           mov(i, zr);
3575           mov(cnt, len);
3576           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
3577           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3578           subs(zr, zr, zr); // set carry flag, i.e. no borrow
3579           align(16);
3580           bind(loop); {
3581             sbcs(Rm, Rm, Rn);
3582             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
3583             add(i, i, 1);
3584             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
3585             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3586             sub(cnt, cnt, 1);
3587           } cbnz(cnt, loop);
3588           sbc(t0, t0, zr);
3589         } cbnz(t0, again);
3590       } bind(post);
3591     }
3592 
3593     // Move memory at s to d, reversing words.
3594     //    Increments d to end of copied memory
3595     //    Destroys tmp1, tmp2
3596     //    Preserves len
3597     //    Leaves s pointing to the address which was in d at start
3598     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
3599       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
3600 
3601       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
3602       mov(tmp1, len);
3603       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
3604       sub(s, d, len, ext::uxtw, LogBytesPerWord);
3605     }
3606     // where
3607     void reverse1(Register d, Register s, Register tmp) {
3608       ldr(tmp, pre(s, -wordSize));
3609       ror(tmp, tmp, 32);
3610       str(tmp, post(d, wordSize));
3611     }
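         // In C, approximately (a sketch; the pointer bookkeeping described
         // above reverse() is omitted):
         //
         //   void reverse(unsigned long *d, unsigned long *s, int len) {
         //     for (int i = 0; i < len; i++) {
         //       unsigned long x = s[len - 1 - i];
         //       d[i] = (x << 32) | (x >> 32);  // swap 32-bit halves (ror 32)
         //     }
         //   }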
3612 
3613     void step_squaring() {
3614       // An extra ACC
3615       step();
3616       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
3617     }
3618 
3619     void last_squaring(RegisterOrConstant i) {
3620       Label dont;
3621       // if ((i & 1) == 0) {
3622       tbnz(i.as_register(), 0, dont); {
3623         // MACC(Ra, Rb, t0, t1, t2);
3624         // Ra = *++Pa;
3625         // Rb = *--Pb;
3626         umulh(Rhi_ab, Ra, Rb);
3627         mul(Rlo_ab, Ra, Rb);
3628         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
3629       } bind(dont);
3630     }
3631 
3632     void extra_step_squaring() {
3633       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
3634 
3635       // MACC(Rm, Rn, t0, t1, t2);
3636       // Rm = *++Pm;
3637       // Rn = *--Pn;
3638       umulh(Rhi_mn, Rm, Rn);
3639       mul(Rlo_mn, Rm, Rn);
3640       ldr(Rm, pre(Pm, wordSize));
3641       ldr(Rn, pre(Pn, -wordSize));
3642     }
3643 
3644     void post1_squaring() {
3645       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
3646 
3647       // *Pm = Rm = t0 * inv;
3648       mul(Rm, t0, inv);
3649       str(Rm, Address(Pm));
3650 
3651       // MACC(Rm, Rn, t0, t1, t2);
3652       // t0 = t1; t1 = t2; t2 = 0;
3653       umulh(Rhi_mn, Rm, Rn);
3654 
3655 #ifndef PRODUCT
3656       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
3657       {
3658         mul(Rlo_mn, Rm, Rn);
3659         add(Rlo_mn, t0, Rlo_mn);
3660         Label ok;
3661         cbz(Rlo_mn, ok); {
3662           stop("broken Montgomery multiply");
3663         } bind(ok);
3664       }
3665 #endif
3666       // We have very carefully set things up so that
3667       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
3668       // the lower half of Rm * Rn because we know the result already:
3669       // it must be -t0.  t0 + (-t0) must generate a carry iff
3670       // t0 != 0.  So, rather than do a mul and an adds we just set
3671       // the carry flag iff t0 is nonzero.
3672       //
3673       // mul(Rlo_mn, Rm, Rn);
3674       // adds(zr, t0, Rlo_mn);
3675       subs(zr, t0, 1); // Set carry iff t0 is nonzero
3676       adcs(t0, t1, Rhi_mn);
3677       adc(t1, t2, zr);
3678       mov(t2, zr);
3679     }
3680 
3681     void acc(Register Rhi, Register Rlo,
3682              Register t0, Register t1, Register t2) {
3683       adds(t0, t0, Rlo);
3684       adcs(t1, t1, Rhi);
3685       adc(t2, t2, zr);
3686     }
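         // For the commented reference code after generate_multiply() and
         // generate_square(), MACC can be read approximately as the
         // following C (a sketch; the macro is not defined in this file,
         // and MACC2 accumulates the product twice):
         //
         //   #define MACC(A, B, t0, t1, t2) do {                              \
         //     unsigned __int128 p = (unsigned __int128)(A) * (B);            \
         //     unsigned __int128 s = (unsigned __int128)t0 + (unsigned long)p; \
         //     t0 = (unsigned long)s;                                         \
         //     s = (unsigned __int128)t1 + (unsigned long)(p >> 64)           \
         //         + (unsigned long)(s >> 64);                                \
         //     t1 = (unsigned long)s;                                         \
         //     t2 += (unsigned long)(s >> 64);                                \
         //   } while (0)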
3687 
3688   public:
3689     /**
3690      * Fast Montgomery multiplication.  The derivation of the
3691      * algorithm is in A Cryptographic Library for the Motorola
3692      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
3693      *
3694      * Arguments:
3695      *
3696      * Inputs for multiplication:
3697      *   c_rarg0   - int array elements a
3698      *   c_rarg1   - int array elements b
3699      *   c_rarg2   - int array elements n (the modulus)
3700      *   c_rarg3   - int length
3701      *   c_rarg4   - int inv
3702      *   c_rarg5   - int array elements m (the result)
3703      *
3704      * Inputs for squaring:
3705      *   c_rarg0   - int array elements a
3706      *   c_rarg1   - int array elements n (the modulus)
3707      *   c_rarg2   - int length
3708      *   c_rarg3   - int inv
3709      *   c_rarg4   - int array elements m (the result)
3710      *
3711      */
3712     address generate_multiply() {
3713       Label argh, nothing;
3714       bind(argh);
3715       stop("MontgomeryMultiply total_allocation must be <= 8192");
3716 
3717       align(CodeEntryAlignment);
3718       address entry = pc();
3719 
3720       cbzw(Rlen, nothing);
3721 
3722       enter();
3723 
3724       // Make room.
3725       cmpw(Rlen, 512);
3726       br(Assembler::HI, argh);
3727       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
3728       andr(sp, Ra, -2 * wordSize);
3729 
3730       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
3731 
3732       {
3733         // Copy input args, reversing as we go.  We use Ra as a
3734         // temporary variable.
3735         reverse(Ra, Pa_base, Rlen, t0, t1);
3736         if (!_squaring)
3737           reverse(Ra, Pb_base, Rlen, t0, t1);
3738         reverse(Ra, Pn_base, Rlen, t0, t1);
3739       }
3740 
3741       // Push all call-saved registers and also Pm_base which we'll need
3742       // at the end.
3743       save_regs();
3744 
3745 #ifndef PRODUCT
3746       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
3747       {
3748         ldr(Rn, Address(Pn_base, 0));
3749         mul(Rlo_mn, Rn, inv);
3750         cmp(Rlo_mn, -1);
3751         Label ok;
3752         br(EQ, ok); {
3753           stop("broken inverse in Montgomery multiply");
3754         } bind(ok);
3755       }
3756 #endif
3757 
3758       mov(Pm_base, Ra);
3759 
3760       mov(t0, zr);
3761       mov(t1, zr);
3762       mov(t2, zr);
3763 
3764       block_comment("for (int i = 0; i < len; i++) {");
3765       mov(Ri, zr); {
3766         Label loop, end;
3767         cmpw(Ri, Rlen);
3768         br(Assembler::GE, end);
3769 
3770         bind(loop);
3771         pre1(Ri);
3772 
3773         block_comment("  for (j = i; j; j--) {"); {
3774           movw(Rj, Ri);
3775           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
3776         } block_comment("  } // j");
3777 
3778         post1();
3779         addw(Ri, Ri, 1);
3780         cmpw(Ri, Rlen);
3781         br(Assembler::LT, loop);
3782         bind(end);
3783         block_comment("} // i");
3784       }
3785 
3786       block_comment("for (int i = len; i < 2*len; i++) {");
3787       mov(Ri, Rlen); {
3788         Label loop, end;
3789         cmpw(Ri, Rlen, Assembler::LSL, 1);
3790         br(Assembler::GE, end);
3791 
3792         bind(loop);
3793         pre2(Ri, Rlen);
3794 
3795         block_comment("  for (j = len*2-i-1; j; j--) {"); {
3796           lslw(Rj, Rlen, 1);
3797           subw(Rj, Rj, Ri);
3798           subw(Rj, Rj, 1);
3799           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
3800         } block_comment("  } // j");
3801 
3802         post2(Ri, Rlen);
3803         addw(Ri, Ri, 1);
3804         cmpw(Ri, Rlen, Assembler::LSL, 1);
3805         br(Assembler::LT, loop);
3806         bind(end);
3807       }
3808       block_comment("} // i");
3809 
3810       normalize(Rlen);
3811 
3812       mov(Ra, Pm_base);  // Save Pm_base in Ra
3813       restore_regs();  // Restore caller's Pm_base
3814 
3815       // Copy our result into caller's Pm_base
3816       reverse(Pm_base, Ra, Rlen, t0, t1);
3817 
3818       leave();
3819       bind(nothing);
3820       ret(lr);
3821 
3822       return entry;
3823     }
3824     // In C, approximately:
3825 
3826     // void
3827     // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
3828     //                     unsigned long Pn_base[], unsigned long Pm_base[],
3829     //                     unsigned long inv, int len) {
3830     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3831     //   unsigned long *Pa, *Pb, *Pn, *Pm;
3832     //   unsigned long Ra, Rb, Rn, Rm;
3833 
3834     //   int i;
3835 
3836     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
3837 
3838     //   for (i = 0; i < len; i++) {
3839     //     int j;
3840 
3841     //     Pa = Pa_base;
3842     //     Pb = Pb_base + i;
3843     //     Pm = Pm_base;
3844     //     Pn = Pn_base + i;
3845 
3846     //     Ra = *Pa;
3847     //     Rb = *Pb;
3848     //     Rm = *Pm;
3849     //     Rn = *Pn;
3850 
3851     //     int iters = i;
3852     //     for (j = 0; iters--; j++) {
3853     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
3854     //       MACC(Ra, Rb, t0, t1, t2);
3855     //       Ra = *++Pa;
3856     //       Rb = *--Pb;
3857     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
3858     //       MACC(Rm, Rn, t0, t1, t2);
3859     //       Rm = *++Pm;
3860     //       Rn = *--Pn;
3861     //     }
3862 
3863     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
3864     //     MACC(Ra, Rb, t0, t1, t2);
3865     //     *Pm = Rm = t0 * inv;
3866     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
3867     //     MACC(Rm, Rn, t0, t1, t2);
3868 
3869     //     assert(t0 == 0, "broken Montgomery multiply");
3870 
3871     //     t0 = t1; t1 = t2; t2 = 0;
3872     //   }
3873 
3874     //   for (i = len; i < 2*len; i++) {
3875     //     int j;
3876 
3877     //     Pa = Pa_base + i-len;
3878     //     Pb = Pb_base + len;
3879     //     Pm = Pm_base + i-len;
3880     //     Pn = Pn_base + len;
3881 
3882     //     Ra = *++Pa;
3883     //     Rb = *--Pb;
3884     //     Rm = *++Pm;
3885     //     Rn = *--Pn;
3886 
3887     //     int iters = len*2-i-1;
3888     //     for (j = i-len+1; iters--; j++) {
3889     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
3890     //       MACC(Ra, Rb, t0, t1, t2);
3891     //       Ra = *++Pa;
3892     //       Rb = *--Pb;
3893     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
3894     //       MACC(Rm, Rn, t0, t1, t2);
3895     //       Rm = *++Pm;
3896     //       Rn = *--Pn;
3897     //     }
3898 
3899     //     Pm_base[i-len] = t0;
3900     //     t0 = t1; t1 = t2; t2 = 0;
3901     //   }
3902 
3903     //   while (t0)
3904     //     t0 = sub(Pm_base, Pn_base, t0, len);
3905     // }
3906 
3907     /**
3908      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
3909      * multiplies than Montgomery multiplication so it should be up to
3910      * 25% faster.  However, its loop control is more complex and it
3911      * may actually run slower on some machines.
3912      *
3913      * Arguments:
3914      *
3915      * Inputs:
3916      *   c_rarg0   - int array elements a
3917      *   c_rarg1   - int array elements n (the modulus)
3918      *   c_rarg2   - int length
3919      *   c_rarg3   - int inv
3920      *   c_rarg4   - int array elements m (the result)
3921      *
3922      */
3923     address generate_square() {
3924       Label argh;
3925       bind(argh);
3926       stop("MontgomeryMultiply total_allocation must be <= 8192");
3927 
3928       align(CodeEntryAlignment);
3929       address entry = pc();
3930 
3931       enter();
3932 
3933       // Make room.
3934       cmpw(Rlen, 512);
3935       br(Assembler::HI, argh);
3936       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
3937       andr(sp, Ra, -2 * wordSize);
3938 
3939       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
3940 
3941       {
3942         // Copy input args, reversing as we go.  We use Ra as a
3943         // temporary variable.
3944         reverse(Ra, Pa_base, Rlen, t0, t1);
3945         reverse(Ra, Pn_base, Rlen, t0, t1);
3946       }
3947 
3948       // Push all call-saved registers and also Pm_base which we'll need
3949       // at the end.
3950       save_regs();
3951 
3952       mov(Pm_base, Ra);
3953 
3954       mov(t0, zr);
3955       mov(t1, zr);
3956       mov(t2, zr);
3957 
3958       block_comment("for (int i = 0; i < len; i++) {");
3959       mov(Ri, zr); {
3960         Label loop, end;
3961         bind(loop);
3962         cmp(Ri, Rlen);
3963         br(Assembler::GE, end);
3964 
3965         pre1(Ri);
3966 
3967         block_comment("for (j = (i+1)/2; j; j--) {"); {
3968           add(Rj, Ri, 1);
3969           lsr(Rj, Rj, 1);
3970           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
3971         } block_comment("  } // j");
3972 
3973         last_squaring(Ri);
3974 
3975         block_comment("  for (j = i/2; j; j--) {"); {
3976           lsr(Rj, Ri, 1);
3977           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
3978         } block_comment("  } // j");
3979 
3980         post1_squaring();
3981         add(Ri, Ri, 1);
3982         cmp(Ri, Rlen);
3983         br(Assembler::LT, loop);
3984 
3985         bind(end);
3986         block_comment("} // i");
3987       }
3988 
3989       block_comment("for (int i = len; i < 2*len; i++) {");
3990       mov(Ri, Rlen); {
3991         Label loop, end;
3992         bind(loop);
3993         cmp(Ri, Rlen, Assembler::LSL, 1);
3994         br(Assembler::GE, end);
3995 
3996         pre2(Ri, Rlen);
3997 
3998         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
3999           lsl(Rj, Rlen, 1);
4000           sub(Rj, Rj, Ri);
4001           sub(Rj, Rj, 1);
4002           lsr(Rj, Rj, 1);
4003           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4004         } block_comment("  } // j");
4005 
4006         last_squaring(Ri);
4007 
4008         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
4009           lsl(Rj, Rlen, 1);
4010           sub(Rj, Rj, Ri);
4011           lsr(Rj, Rj, 1);
4012           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4013         } block_comment("  } // j");
4014 
4015         post2(Ri, Rlen);
4016         add(Ri, Ri, 1);
4017         cmp(Ri, Rlen, Assembler::LSL, 1);
4018 
4019         br(Assembler::LT, loop);
4020         bind(end);
4021         block_comment("} // i");
4022       }
4023 
4024       normalize(Rlen);
4025 
4026       mov(Ra, Pm_base);  // Save Pm_base in Ra
4027       restore_regs();  // Restore caller's Pm_base
4028 
4029       // Copy our result into caller's Pm_base
4030       reverse(Pm_base, Ra, Rlen, t0, t1);
4031 
4032       leave();
4033       ret(lr);
4034 
4035       return entry;
4036     }
4037     // In C, approximately:
4038 
4039     // void
4040     // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
4041     //                   unsigned long Pm_base[], unsigned long inv, int len) {
4042     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4043     //   unsigned long *Pa, *Pb, *Pn, *Pm;
4044     //   unsigned long Ra, Rb, Rn, Rm;
4045 
4046     //   int i;
4047 
4048     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
4049 
4050     //   for (i = 0; i < len; i++) {
4051     //     int j;
4052 
4053     //     Pa = Pa_base;
4054     //     Pb = Pa_base + i;
4055     //     Pm = Pm_base;
4056     //     Pn = Pn_base + i;
4057 
4058     //     Ra = *Pa;
4059     //     Rb = *Pb;
4060     //     Rm = *Pm;
4061     //     Rn = *Pn;
4062 
4063     //     int iters = (i+1)/2;
4064     //     for (j = 0; iters--; j++) {
4065     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4066     //       MACC2(Ra, Rb, t0, t1, t2);
4067     //       Ra = *++Pa;
4068     //       Rb = *--Pb;
4069     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4070     //       MACC(Rm, Rn, t0, t1, t2);
4071     //       Rm = *++Pm;
4072     //       Rn = *--Pn;
4073     //     }
4074     //     if ((i & 1) == 0) {
4075     //       assert(Ra == Pa_base[j], "must be");
4076     //       MACC(Ra, Ra, t0, t1, t2);
4077     //     }
4078     //     iters = i/2;
4079     //     assert(iters == i-j, "must be");
4080     //     for (; iters--; j++) {
4081     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4082     //       MACC(Rm, Rn, t0, t1, t2);
4083     //       Rm = *++Pm;
4084     //       Rn = *--Pn;
4085     //     }
4086 
4087     //     *Pm = Rm = t0 * inv;
4088     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
4089     //     MACC(Rm, Rn, t0, t1, t2);
4090 
4091     //     assert(t0 == 0, "broken Montgomery multiply");
4092 
4093     //     t0 = t1; t1 = t2; t2 = 0;
4094     //   }
4095 
4096     //   for (i = len; i < 2*len; i++) {
4097     //     int start = i-len+1;
4098     //     int end = start + (len - start)/2;
4099     //     int j;
4100 
4101     //     Pa = Pa_base + i-len;
4102     //     Pb = Pa_base + len;
4103     //     Pm = Pm_base + i-len;
4104     //     Pn = Pn_base + len;
4105 
4106     //     Ra = *++Pa;
4107     //     Rb = *--Pb;
4108     //     Rm = *++Pm;
4109     //     Rn = *--Pn;
4110 
4111     //     int iters = (2*len-i-1)/2;
4112     //     assert(iters == end-start, "must be");
4113     //     for (j = start; iters--; j++) {
4114     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4115     //       MACC2(Ra, Rb, t0, t1, t2);
4116     //       Ra = *++Pa;
4117     //       Rb = *--Pb;
4118     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4119     //       MACC(Rm, Rn, t0, t1, t2);
4120     //       Rm = *++Pm;
4121     //       Rn = *--Pn;
4122     //     }
4123     //     if ((i & 1) == 0) {
4124     //       assert(Ra == Pa_base[j], "must be");
4125     //       MACC(Ra, Ra, t0, t1, t2);
4126     //     }
4127     //     iters =  (2*len-i)/2;
4128     //     assert(iters == len-j, "must be");
4129     //     for (; iters--; j++) {
4130     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4131     //       MACC(Rm, Rn, t0, t1, t2);
4132     //       Rm = *++Pm;
4133     //       Rn = *--Pn;
4134     //     }
4135     //     Pm_base[i-len] = t0;
4136     //     t0 = t1; t1 = t2; t2 = 0;
4137     //   }
4138 
4139     //   while (t0)
4140     //     t0 = sub(Pm_base, Pn_base, t0, len);
4141     // }
4142   };
4143 
4144   // Initialization
4145   void generate_initial() {
4146     // Generate initial stubs and initialize the entry points
4147 
4148     // Entry points that exist on all platforms. Note: this is code
4149     // that could be shared among different platforms - however the
4150     // benefit seems to be smaller than the disadvantage of having a
4151     // much more complicated generator structure. See also comment in
4152     // stubRoutines.hpp.
4153 
4154     StubRoutines::_forward_exception_entry = generate_forward_exception();
4155 
4156     StubRoutines::_call_stub_entry =
4157       generate_call_stub(StubRoutines::_call_stub_return_address);
4158 
4159     // This entry is referenced by megamorphic calls
4160     StubRoutines::_catch_exception_entry = generate_catch_exception();
4161 
4162     // Build this early so it's available for the interpreter.
4163     StubRoutines::_throw_StackOverflowError_entry =
4164       generate_throw_exception("StackOverflowError throw_exception",
4165                                CAST_FROM_FN_PTR(address,
4166                                                 SharedRuntime::
4167                                                 throw_StackOverflowError));
4168     if (UseCRC32Intrinsics) {
4169       // Set the table address before generating the stub that uses it
4170       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
4171       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
4172     }
4173   }
4174 
4175   void generate_all() {
4176     // support for verify_oop (must happen after universe_init)
4177     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
4178     StubRoutines::_throw_AbstractMethodError_entry =
4179       generate_throw_exception("AbstractMethodError throw_exception",
4180                                CAST_FROM_FN_PTR(address,
4181                                                 SharedRuntime::
4182                                                 throw_AbstractMethodError));
4183 
4184     StubRoutines::_throw_IncompatibleClassChangeError_entry =
4185       generate_throw_exception("IncompatibleClassChangeError throw_exception",
4186                                CAST_FROM_FN_PTR(address,
4187                                                 SharedRuntime::
4188                                                 throw_IncompatibleClassChangeError));
4189 
4190     StubRoutines::_throw_NullPointerException_at_call_entry =
4191       generate_throw_exception("NullPointerException at call throw_exception",
4192                                CAST_FROM_FN_PTR(address,
4193                                                 SharedRuntime::
4194                                                 throw_NullPointerException_at_call));
4195 
4196     // arraycopy stubs used by compilers
4197     generate_arraycopy_stubs();
4198 
4199     if (UseMultiplyToLenIntrinsic) {
4200       StubRoutines::_multiplyToLen = generate_multiplyToLen();
4201     }
4202 
4203     if (UseMontgomeryMultiplyIntrinsic) {
4204       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
4205       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
4206       StubRoutines::_montgomeryMultiply = g.generate_multiply();
4207     }
4208 
4209     if (UseMontgomerySquareIntrinsic) {
4210       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
4211       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
4212       // We use generate_multiply() rather than generate_square()
4213       // because it's faster for the sizes of modulus we care about.
4214       StubRoutines::_montgomerySquare = g.generate_multiply();
4215     }
4216 
4217 #ifndef BUILTIN_SIM
4218     // generate GHASH intrinsics code
4219     if (UseGHASHIntrinsics) {
4220       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
4221     }
4222 
4223     if (UseAESIntrinsics) {
4224       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
4225       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
4226       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
4227       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
4228     }
4229 
4230     if (UseSHA1Intrinsics) {
4231       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
4232       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
4233     }
4234     if (UseSHA256Intrinsics) {
4235       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
4236       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
4237     }
4238 
4239     if (UseCRC32CIntrinsics) {
4240       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
4241     }
4242 
4243     // generate Adler32 intrinsics code
4244     if (UseAdler32Intrinsics) {
4245       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
4246     }
4247 
4248     // Safefetch stubs.
4249     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
4250                                                        &StubRoutines::_safefetch32_fault_pc,
4251                                                        &StubRoutines::_safefetch32_continuation_pc);
4252     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
4253                                                        &StubRoutines::_safefetchN_fault_pc,
4254                                                        &StubRoutines::_safefetchN_continuation_pc);
4255 #endif
4256   }
4257 
4258  public:
4259   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
4260     if (all) {
4261       generate_all();
4262     } else {
4263       generate_initial();
4264     }
4265   }
4266 }; // end class declaration
4267 
4268 void StubGenerator_generate(CodeBuffer* code, bool all) {
4269   StubGenerator g(code, all);
4270 }