1 /*
   2  * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "interpreter/interpreter.hpp"
  30 #include "nativeInst_aarch64.hpp"
  31 #include "oops/instanceOop.hpp"
  32 #include "oops/method.hpp"
  33 #include "oops/objArrayKlass.hpp"
  34 #include "oops/oop.inline.hpp"
  35 #include "prims/methodHandles.hpp"
  36 #include "runtime/frame.inline.hpp"
  37 #include "runtime/handles.inline.hpp"
  38 #include "runtime/sharedRuntime.hpp"
  39 #include "runtime/stubCodeGenerator.hpp"
  40 #include "runtime/stubRoutines.hpp"
  41 #include "runtime/thread.inline.hpp"
  42 #include "utilities/top.hpp"
  43 #ifdef COMPILER2
  44 #include "opto/runtime.hpp"
  45 #endif
  46 
  47 #ifdef BUILTIN_SIM
  48 #include "../../../../../../simulator/simulator.hpp"
  49 #endif
  50 
  51 // Declaration and definition of StubGenerator (no .hpp file).
  52 // For a more detailed description of the stub routine structure
  53 // see the comment in stubRoutines.hpp
  54 
  55 #undef __
  56 #define __ _masm->
  57 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
  58 
  59 #ifdef PRODUCT
  60 #define BLOCK_COMMENT(str) /* nothing */
  61 #else
  62 #define BLOCK_COMMENT(str) __ block_comment(str)
  63 #endif
  64 
  65 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  66 
  67 // Stub Code definitions
  68 
  69 class StubGenerator: public StubCodeGenerator {
  70  private:
  71 
  72 #ifdef PRODUCT
  73 #define inc_counter_np(counter) ((void)0)
  74 #else
  75   void inc_counter_np_(int& counter) {
  76     __ lea(rscratch2, ExternalAddress((address)&counter));
  77     __ ldrw(rscratch1, Address(rscratch2));
  78     __ addw(rscratch1, rscratch1, 1);
  79     __ strw(rscratch1, Address(rscratch2));
  80   }
  81 #define inc_counter_np(counter) \
  82   BLOCK_COMMENT("inc_counter " #counter); \
  83   inc_counter_np_(counter);
  84 #endif
  85 
  86   // Call stubs are used to call Java from C
  87   //
  88   // Arguments:
  89   //    c_rarg0:   call wrapper address                   address
  90   //    c_rarg1:   result                                 address
  91   //    c_rarg2:   result type                            BasicType
  92   //    c_rarg3:   method                                 Method*
  93   //    c_rarg4:   (interpreter) entry point              address
  94   //    c_rarg5:   parameters                             intptr_t*
  95   //    c_rarg6:   parameter size (in words)              int
  96   //    c_rarg7:   thread                                 Thread*
  97   //
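  // These arguments mirror the CallStub function pointer type declared
  // in stubRoutines.hpp and used by JavaCalls::call_helper(). A rough
  // sketch of the C-side view (illustration only; consult the actual
  // typedef for the authoritative signature):
  //
  //   typedef void (*CallStub)(address   link,               // c_rarg0
  //                            intptr_t* result,             // c_rarg1
  //                            BasicType result_type,        // c_rarg2
  //                            Method*   method,             // c_rarg3
  //                            address   entry_point,        // c_rarg4
  //                            intptr_t* parameters,         // c_rarg5
  //                            int       size_of_parameters, // c_rarg6
  //                            TRAPS);                       // c_rarg7 (thread)
  //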
  98   // There is no return from the stub itself as any Java result
  99   // is written to result
 100   //
  // we store r30 (lr) as the return PC at the base of the frame and
  // r29 (fp) just below it, then install the frame pointer by copying
  // the new sp (r31) into fp (r29).
 104   //
 105   // we save r0-r7, which accounts for all the c arguments.
 106   //
 107   // TODO: strictly do we need to save them all? they are treated as
 108   // volatile by C so could we omit saving the ones we are going to
 109   // place in global registers (thread? method?) or those we only use
 110   // during setup of the Java call?
 111   //
 112   // we don't need to save r8 which C uses as an indirect result location
 113   // return register.
 114   //
 115   // we don't need to save r9-r15 which both C and Java treat as
 116   // volatile
 117   //
 118   // we don't need to save r16-18 because Java does not use them
 119   //
 120   // we save r19-r28 which Java uses as scratch registers and C
 121   // expects to be callee-save
 122   //
 123   // we save the bottom 64 bits of each value stored in v8-v15; it is
 124   // the responsibility of the caller to preserve larger values.
 125   //
 126   // so the stub frame looks like this when we enter Java code
 127   //
 128   //     [ return_from_Java     ] <--- sp
 129   //     [ argument word n      ]
 130   //      ...
 131   // -27 [ argument word 1      ]
 132   // -26 [ saved v15            ] <--- sp_after_call
 133   // -25 [ saved v14            ]
 134   // -24 [ saved v13            ]
 135   // -23 [ saved v12            ]
 136   // -22 [ saved v11            ]
 137   // -21 [ saved v10            ]
 138   // -20 [ saved v9             ]
 139   // -19 [ saved v8             ]
 140   // -18 [ saved r28            ]
 141   // -17 [ saved r27            ]
 142   // -16 [ saved r26            ]
 143   // -15 [ saved r25            ]
 144   // -14 [ saved r24            ]
 145   // -13 [ saved r23            ]
 146   // -12 [ saved r22            ]
 147   // -11 [ saved r21            ]
 148   // -10 [ saved r20            ]
 149   //  -9 [ saved r19            ]
 150   //  -8 [ call wrapper    (r0) ]
 151   //  -7 [ result          (r1) ]
 152   //  -6 [ result type     (r2) ]
 153   //  -5 [ method          (r3) ]
 154   //  -4 [ entry point     (r4) ]
 155   //  -3 [ parameters      (r5) ]
 156   //  -2 [ parameter size  (r6) ]
 157   //  -1 [ thread (r7)          ]
 158   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 159   //   1 [ saved lr       (r30) ]
 160 
 161   // Call stub stack layout word offsets from fp
 162   enum call_stub_layout {
 163     sp_after_call_off = -26,
 164 
 165     d15_off            = -26,
 166     d14_off            = -25,
 167     d13_off            = -24,
 168     d12_off            = -23,
 169     d11_off            = -22,
 170     d10_off            = -21,
 171     d9_off             = -20,
 172     d8_off             = -19,
 173 
 174     r28_off            = -18,
 175     r27_off            = -17,
 176     r26_off            = -16,
 177     r25_off            = -15,
 178     r24_off            = -14,
 179     r23_off            = -13,
 180     r22_off            = -12,
 181     r21_off            = -11,
 182     r20_off            = -10,
 183     r19_off            =  -9,
 184     call_wrapper_off   =  -8,
 185     result_off         =  -7,
 186     result_type_off    =  -6,
 187     method_off         =  -5,
 188     entry_point_off    =  -4,
 189     parameters_off     =  -3,
 190     parameter_size_off =  -2,
 191     thread_off         =  -1,
 192     fp_f               =   0,
 193     retaddr_off        =   1,
 194   };
 195 
 196   address generate_call_stub(address& return_address) {
 197     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 198            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 199            "adjust this code");
 200 
 201     StubCodeMark mark(this, "StubRoutines", "call_stub");
 202     address start = __ pc();
 203 
 204     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 205 
 206     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 207     const Address result        (rfp, result_off         * wordSize);
 208     const Address result_type   (rfp, result_type_off    * wordSize);
 209     const Address method        (rfp, method_off         * wordSize);
 210     const Address entry_point   (rfp, entry_point_off    * wordSize);
 211     const Address parameters    (rfp, parameters_off     * wordSize);
 212     const Address parameter_size(rfp, parameter_size_off * wordSize);
 213 
 214     const Address thread        (rfp, thread_off         * wordSize);
 215 
 216     const Address d15_save      (rfp, d15_off * wordSize);
 217     const Address d14_save      (rfp, d14_off * wordSize);
 218     const Address d13_save      (rfp, d13_off * wordSize);
 219     const Address d12_save      (rfp, d12_off * wordSize);
 220     const Address d11_save      (rfp, d11_off * wordSize);
 221     const Address d10_save      (rfp, d10_off * wordSize);
 222     const Address d9_save       (rfp, d9_off * wordSize);
 223     const Address d8_save       (rfp, d8_off * wordSize);
 224 
 225     const Address r28_save      (rfp, r28_off * wordSize);
 226     const Address r27_save      (rfp, r27_off * wordSize);
 227     const Address r26_save      (rfp, r26_off * wordSize);
 228     const Address r25_save      (rfp, r25_off * wordSize);
 229     const Address r24_save      (rfp, r24_off * wordSize);
 230     const Address r23_save      (rfp, r23_off * wordSize);
 231     const Address r22_save      (rfp, r22_off * wordSize);
 232     const Address r21_save      (rfp, r21_off * wordSize);
 233     const Address r20_save      (rfp, r20_off * wordSize);
 234     const Address r19_save      (rfp, r19_off * wordSize);
 235 
 236     // stub code
 237 
 238     // we need a C prolog to bootstrap the x86 caller into the sim
 239     __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);
 240 
 241     address aarch64_entry = __ pc();
 242 
 243 #ifdef BUILTIN_SIM
 244     // Save sender's SP for stack traces.
 245     __ mov(rscratch1, sp);
 246     __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
 247 #endif
 248     // set up frame and move sp to end of save area
 249     __ enter();
 250     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 251 
 252     // save register parameters and Java scratch/global registers
 253     // n.b. we save thread even though it gets installed in
 254     // rthread because we want to sanity check rthread later
 255     __ str(c_rarg7,  thread);
 256     __ strw(c_rarg6, parameter_size);
 257     __ str(c_rarg5,  parameters);
 258     __ str(c_rarg4,  entry_point);
 259     __ str(c_rarg3,  method);
 260     __ str(c_rarg2,  result_type);
 261     __ str(c_rarg1,  result);
 262     __ str(c_rarg0,  call_wrapper);
 263     __ str(r19,      r19_save);
 264     __ str(r20,      r20_save);
 265     __ str(r21,      r21_save);
 266     __ str(r22,      r22_save);
 267     __ str(r23,      r23_save);
 268     __ str(r24,      r24_save);
 269     __ str(r25,      r25_save);
 270     __ str(r26,      r26_save);
 271     __ str(r27,      r27_save);
 272     __ str(r28,      r28_save);
 273 
 274     __ strd(v8,      d8_save);
 275     __ strd(v9,      d9_save);
 276     __ strd(v10,     d10_save);
 277     __ strd(v11,     d11_save);
 278     __ strd(v12,     d12_save);
 279     __ strd(v13,     d13_save);
 280     __ strd(v14,     d14_save);
 281     __ strd(v15,     d15_save);
 282 
 283     // install Java thread in global register now we have saved
 284     // whatever value it held
 285     __ mov(rthread, c_rarg7);
 286     // And method
 287     __ mov(rmethod, c_rarg3);
 288 
 289     // set up the heapbase register
 290     __ reinit_heapbase();
 291 
 292 #ifdef ASSERT
 293     // make sure we have no pending exceptions
 294     {
 295       Label L;
 296       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 297       __ cmp(rscratch1, (unsigned)NULL_WORD);
 298       __ br(Assembler::EQ, L);
 299       __ stop("StubRoutines::call_stub: entered with pending exception");
 300       __ BIND(L);
 301     }
 302 #endif
 303     // pass parameters if any
 304     __ mov(esp, sp);
 305     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 306     __ andr(sp, rscratch1, -2 * wordSize);
 307 
 308     BLOCK_COMMENT("pass parameters if any");
 309     Label parameters_done;
 310     // parameter count is still in c_rarg6
 311     // and parameter pointer identifying param 1 is in c_rarg5
 312     __ cbzw(c_rarg6, parameters_done);
 313 
 314     address loop = __ pc();
 315     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 316     __ subsw(c_rarg6, c_rarg6, 1);
 317     __ push(rscratch1);
 318     __ br(Assembler::GT, loop);
 319 
 320     __ BIND(parameters_done);
 321 
    // call Java entry -- passing Method* and current sp
 323     //      rmethod: Method*
 324     //      r13: sender sp
 325     BLOCK_COMMENT("call Java function");
 326     __ mov(r13, sp);
 327     __ blr(c_rarg4);
 328 
 329     // tell the simulator we have returned to the stub
 330 
 331     // we do this here because the notify will already have been done
 332     // if we get to the next instruction via an exception
 333     //
 334     // n.b. adding this instruction here affects the calculation of
 335     // whether or not a routine returns to the call stub (used when
 336     // doing stack walks) since the normal test is to check the return
 337     // pc against the address saved below. so we may need to allow for
 338     // this extra instruction in the check.
 339 
 340     if (NotifySimulator) {
 341       __ notify(Assembler::method_reentry);
 342     }
 343     // save current address for use by exception handling code
 344 
 345     return_address = __ pc();
 346 
 347     // store result depending on type (everything that is not
 348     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 349     // n.b. this assumes Java returns an integral result in r0
 350     // and a floating result in j_farg0
 351     __ ldr(j_rarg2, result);
 352     Label is_long, is_float, is_double, exit;
 353     __ ldr(j_rarg1, result_type);
 354     __ cmp(j_rarg1, T_OBJECT);
 355     __ br(Assembler::EQ, is_long);
 356     __ cmp(j_rarg1, T_LONG);
 357     __ br(Assembler::EQ, is_long);
 358     __ cmp(j_rarg1, T_FLOAT);
 359     __ br(Assembler::EQ, is_float);
 360     __ cmp(j_rarg1, T_DOUBLE);
 361     __ br(Assembler::EQ, is_double);
 362 
 363     // handle T_INT case
 364     __ strw(r0, Address(j_rarg2));
 365 
 366     __ BIND(exit);
 367 
 368     // pop parameters
 369     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 370 
 371 #ifdef ASSERT
 372     // verify that threads correspond
 373     {
 374       Label L, S;
 375       __ ldr(rscratch1, thread);
 376       __ cmp(rthread, rscratch1);
 377       __ br(Assembler::NE, S);
 378       __ get_thread(rscratch1);
 379       __ cmp(rthread, rscratch1);
 380       __ br(Assembler::EQ, L);
 381       __ BIND(S);
 382       __ stop("StubRoutines::call_stub: threads must correspond");
 383       __ BIND(L);
 384     }
 385 #endif
 386 
 387     // restore callee-save registers
 388     __ ldrd(v15,      d15_save);
 389     __ ldrd(v14,      d14_save);
 390     __ ldrd(v13,      d13_save);
 391     __ ldrd(v12,      d12_save);
 392     __ ldrd(v11,      d11_save);
 393     __ ldrd(v10,      d10_save);
 394     __ ldrd(v9,       d9_save);
 395     __ ldrd(v8,       d8_save);
 396 
 397     __ ldr(r28,      r28_save);
 398     __ ldr(r27,      r27_save);
 399     __ ldr(r26,      r26_save);
 400     __ ldr(r25,      r25_save);
 401     __ ldr(r24,      r24_save);
 402     __ ldr(r23,      r23_save);
 403     __ ldr(r22,      r22_save);
 404     __ ldr(r21,      r21_save);
 405     __ ldr(r20,      r20_save);
 406     __ ldr(r19,      r19_save);
 407     __ ldr(c_rarg0,  call_wrapper);
 408     __ ldr(c_rarg1,  result);
 409     __ ldrw(c_rarg2, result_type);
 410     __ ldr(c_rarg3,  method);
 411     __ ldr(c_rarg4,  entry_point);
 412     __ ldr(c_rarg5,  parameters);
 413     __ ldr(c_rarg6,  parameter_size);
 414     __ ldr(c_rarg7,  thread);
 415 
 416 #ifndef PRODUCT
 417     // tell the simulator we are about to end Java execution
 418     if (NotifySimulator) {
 419       __ notify(Assembler::method_exit);
 420     }
 421 #endif
 422     // leave frame and return to caller
 423     __ leave();
 424     __ ret(lr);
 425 
 426     // handle return types different from T_INT
 427 
 428     __ BIND(is_long);
 429     __ str(r0, Address(j_rarg2, 0));
 430     __ br(Assembler::AL, exit);
 431 
 432     __ BIND(is_float);
 433     __ strs(j_farg0, Address(j_rarg2, 0));
 434     __ br(Assembler::AL, exit);
 435 
 436     __ BIND(is_double);
 437     __ strd(j_farg0, Address(j_rarg2, 0));
 438     __ br(Assembler::AL, exit);
 439 
 440     return start;
 441   }
 442 
 443   // Return point for a Java call if there's an exception thrown in
 444   // Java code.  The exception is caught and transformed into a
 445   // pending exception stored in JavaThread that can be tested from
 446   // within the VM.
 447   //
 448   // Note: Usually the parameters are removed by the callee. In case
 449   // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to set up the
  // sp.
 452   //
 453   // r0: exception oop
 454 
 455   // NOTE: this is used as a target from the signal handler so it
 456   // needs an x86 prolog which returns into the current simulator
 457   // executing the generated catch_exception code. so the prolog
 458   // needs to install rax in a sim register and adjust the sim's
 459   // restart pc to enter the generated code at the start position
 460   // then return from native to simulated execution.
 461 
 462   address generate_catch_exception() {
 463     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 464     address start = __ pc();
 465 
 466     // same as in generate_call_stub():
 467     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 468     const Address thread        (rfp, thread_off         * wordSize);
 469 
 470 #ifdef ASSERT
 471     // verify that threads correspond
 472     {
 473       Label L, S;
 474       __ ldr(rscratch1, thread);
 475       __ cmp(rthread, rscratch1);
 476       __ br(Assembler::NE, S);
 477       __ get_thread(rscratch1);
 478       __ cmp(rthread, rscratch1);
 479       __ br(Assembler::EQ, L);
 480       __ bind(S);
 481       __ stop("StubRoutines::catch_exception: threads must correspond");
 482       __ bind(L);
 483     }
 484 #endif
 485 
 486     // set pending exception
 487     __ verify_oop(r0);
 488 
 489     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 490     __ mov(rscratch1, (address)__FILE__);
 491     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 492     __ movw(rscratch1, (int)__LINE__);
 493     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 494 
 495     // complete return to VM
 496     assert(StubRoutines::_call_stub_return_address != NULL,
 497            "_call_stub_return_address must have been generated before");
 498     __ b(StubRoutines::_call_stub_return_address);
 499 
 500     return start;
 501   }
 502 
 503   // Continuation point for runtime calls returning with a pending
 504   // exception.  The pending exception check happened in the runtime
 505   // or native call stub.  The pending exception in Thread is
 506   // converted into a Java-level exception.
 507   //
 508   // Contract with Java-level exception handlers:
 509   // r0: exception
 510   // r3: throwing pc
 511   //
 512   // NOTE: At entry of this stub, exception-pc must be in LR !!
 513 
 514   // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog
 516 
 517   address generate_forward_exception() {
 518     StubCodeMark mark(this, "StubRoutines", "forward exception");
 519     address start = __ pc();
 520 
 521     // Upon entry, LR points to the return address returning into
 522     // Java (interpreted or compiled) code; i.e., the return address
 523     // becomes the throwing pc.
 524     //
 525     // Arguments pushed before the runtime call are still on the stack
 526     // but the exception handler will reset the stack pointer ->
 527     // ignore them.  A potential result in registers can be ignored as
 528     // well.
 529 
 530 #ifdef ASSERT
 531     // make sure this code is only executed if there is a pending exception
 532     {
 533       Label L;
 534       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 535       __ cbnz(rscratch1, L);
 536       __ stop("StubRoutines::forward exception: no pending exception (1)");
 537       __ bind(L);
 538     }
 539 #endif
 540 
 541     // compute exception handler into r19
 542 
 543     // call the VM to find the handler address associated with the
 544     // caller address. pass thread in r0 and caller pc (ret address)
 545     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 546     // the stack.
 547     __ mov(c_rarg1, lr);
 548     // lr will be trashed by the VM call so we move it to R19
 549     // (callee-saved) because we also need to pass it to the handler
 550     // returned by this call.
 551     __ mov(r19, lr);
 552     BLOCK_COMMENT("call exception_handler_for_return_address");
 553     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 554                          SharedRuntime::exception_handler_for_return_address),
 555                     rthread, c_rarg1);
 556     // we should not really care that lr is no longer the callee
 557     // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM, and the VM code asserts that
 560     // the PC for the frame above the handler belongs to a compiled
 561     // Java method. So, we restore lr here to satisfy that assert.
 562     __ mov(lr, r19);
 563     // setup r0 & r3 & clear pending exception
 564     __ mov(r3, r19);
 565     __ mov(r19, r0);
 566     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 567     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 568 
 569 #ifdef ASSERT
 570     // make sure exception is set
 571     {
 572       Label L;
 573       __ cbnz(r0, L);
 574       __ stop("StubRoutines::forward exception: no pending exception (2)");
 575       __ bind(L);
 576     }
 577 #endif
 578 
 579     // continue at exception handler
 580     // r0: exception
 581     // r3: throwing pc
 582     // r19: exception handler
 583     __ verify_oop(r0);
 584     __ br(r19);
 585 
 586     return start;
 587   }
 588 
 589   // Non-destructive plausibility checks for oops
 590   //
 591   // Arguments:
 592   //    r0: oop to verify
 593   //    rscratch1: error message
 594   //
 595   // Stack after saving c_rarg3:
 596   //    [tos + 0]: saved c_rarg3
 597   //    [tos + 1]: saved c_rarg2
 598   //    [tos + 2]: saved lr
 599   //    [tos + 3]: saved rscratch2
 600   //    [tos + 4]: saved r0
 601   //    [tos + 5]: saved rscratch1
 602   address generate_verify_oop() {
 603 
 604     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 605     address start = __ pc();
 606 
 607     Label exit, error;
 608 
 609     // save c_rarg2 and c_rarg3
 610     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 611 
 612     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 613     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 614     __ ldr(c_rarg3, Address(c_rarg2));
 615     __ add(c_rarg3, c_rarg3, 1);
 616     __ str(c_rarg3, Address(c_rarg2));
 617 
 618     // object is in r0
 619     // make sure object is 'reasonable'
 620     __ cbz(r0, exit); // if obj is NULL it is OK
 621 
 622     // Check if the oop is in the right area of memory
 623     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 624     __ andr(c_rarg2, r0, c_rarg3);
 625     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 626 
 627     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 628     // instruction here because the flags register is live.
 629     __ eor(c_rarg2, c_rarg2, c_rarg3);
 630     __ cbnz(c_rarg2, error);
 631 
    // make sure klass is 'reasonable', i.e. not null.
 633     __ load_klass(r0, r0);  // get klass
 634     __ cbz(r0, error);      // if klass is NULL it is broken
 635 
 636     // return if everything seems ok
 637     __ bind(exit);
 638 
 639     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 640     __ ret(lr);
 641 
 642     // handle errors
 643     __ bind(error);
 644     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 645 
 646     __ push(RegSet::range(r0, r29), sp);
 647     // debug(char* msg, int64_t pc, int64_t regs[])
 648     __ mov(c_rarg0, rscratch1);      // pass address of error message
 649     __ mov(c_rarg1, lr);             // pass return address
 650     __ mov(c_rarg2, sp);             // pass address of regs on stack
 651 #ifndef PRODUCT
 652     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 653 #endif
 654     BLOCK_COMMENT("call MacroAssembler::debug");
 655     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 656     __ blrt(rscratch1, 3, 0, 1);
 657 
 658     return start;
 659   }
 660 
 661   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 662 
 663   // Generate code for an array write pre barrier
 664   //
 665   //     addr    -  starting address
 666   //     count   -  element count
 667   //     tmp     - scratch register
 668   //
 669   //     Destroy no registers!
 670   //
 671   void  gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
 672     BarrierSet* bs = Universe::heap()->barrier_set();
 673     switch (bs->kind()) {
 674     case BarrierSet::G1SATBCTLogging:
      // With G1, don't generate the call if we statically know that the target is uninitialized
 676       if (!dest_uninitialized) {
 677         __ push(RegSet::range(r0, r29), sp);         // integer registers except lr & sp
 678         if (count == c_rarg0) {
 679           if (addr == c_rarg1) {
 680             // exactly backwards!!
 681             __ stp(c_rarg0, c_rarg1, __ pre(sp, -2 * wordSize));
 682             __ ldp(c_rarg1, c_rarg0, __ post(sp, -2 * wordSize));
 683           } else {
 684             __ mov(c_rarg1, count);
 685             __ mov(c_rarg0, addr);
 686           }
 687         } else {
 688           __ mov(c_rarg0, addr);
 689           __ mov(c_rarg1, count);
 690         }
 691         __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2);
        __ pop(RegSet::range(r0, r29), sp);         // integer registers except lr & sp
      }
      break;
    case BarrierSet::CardTableForRS:
    case BarrierSet::CardTableExtension:
    case BarrierSet::ModRef:
      break;
    default:
      ShouldNotReachHere();
    }
  }
 704 
 705   //
 706   // Generate code for an array write post barrier
 707   //
 708   //  Input:
 709   //     start    - register containing starting address of destination array
 710   //     end      - register containing ending address of destination array
 711   //     scratch  - scratch register
 712   //
 713   //  The input registers are overwritten.
 714   //  The ending address is inclusive.
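  //
  //  For the card-table cases below, the effect is simply to dirty one
  //  byte per card covered by [start, end].  A rough C sketch of the
  //  loop emitted below (names as used in CardTableModRefBS; offsets
  //  not verified here):
  //
  //    for (intptr_t card = start >> card_shift; card <= end >> card_shift; card++)
  //      byte_map_base[card] = 0;   // dirty_card_val()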
 715   void gen_write_ref_array_post_barrier(Register start, Register end, Register scratch) {
 716     assert_different_registers(start, end, scratch);
 717     BarrierSet* bs = Universe::heap()->barrier_set();
 718     switch (bs->kind()) {
 719       case BarrierSet::G1SATBCTLogging:
 720 
 721         {
 722           __ push(RegSet::range(r0, r29), sp);         // integer registers except lr & sp
 723           // must compute element count unless barrier set interface is changed (other platforms supply count)
 724           assert_different_registers(start, end, scratch);
 725           __ lea(scratch, Address(end, BytesPerHeapOop));
 726           __ sub(scratch, scratch, start);               // subtract start to get #bytes
 727           __ lsr(scratch, scratch, LogBytesPerHeapOop);  // convert to element count
 728           __ mov(c_rarg0, start);
 729           __ mov(c_rarg1, scratch);
 730           __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
          __ pop(RegSet::range(r0, r29), sp);         // integer registers except lr & sp
 732         }
 733         break;
 734       case BarrierSet::CardTableForRS:
 735       case BarrierSet::CardTableExtension:
 736         {
 737           CardTableModRefBS* ct = (CardTableModRefBS*)bs;
 738           assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
 739 
 740           Label L_loop;
 741 
          __ lsr(start, start, CardTableModRefBS::card_shift);
          __ lsr(end, end, CardTableModRefBS::card_shift);
          __ sub(end, end, start); // number of bytes to copy
 745 
 746           const Register count = end; // 'end' register contains bytes count now
 747           __ load_byte_map_base(scratch);
 748           __ add(start, start, scratch);
 749           if (UseConcMarkSweepGC) {
 750             __ membar(__ StoreStore);
 751           }
 752           __ BIND(L_loop);
 753           __ strb(zr, Address(start, count));
 754           __ subs(count, count, 1);
 755           __ br(Assembler::HS, L_loop);
 756         }
 757         break;
 758       default:
 759         ShouldNotReachHere();
 760 
 761     }
 762   }
 763 
 764   typedef enum {
 765     copy_forwards = 1,
 766     copy_backwards = -1
 767   } copy_direction;
 768 
 769   // Bulk copy of blocks of 8 words.
 770   //
 771   // count is a count of words.
 772   //
 773   // Precondition: count >= 2
 774   //
 775   // Postconditions:
 776   //
 777   // The least significant bit of count contains the remaining count
 778   // of words to copy.  The rest of count is trash.
 779   //
 780   // s and d are adjusted to point to the remaining words to copy
 781   //
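  // The generated blob is used as a leaf subroutine: copy_memory()
  // branches to it with "bl" (so lr holds the return address) after
  // placing the word count in the count register, and the blob returns
  // with ret(lr).
  //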
 782   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 783                            copy_direction direction) {
 784     int unit = wordSize * direction;
 785 
 786     int offset;
 787     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 788       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 789 
 790     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 791     assert_different_registers(s, d, count, rscratch1);
 792 
 793     Label again, large, small;
 794     __ align(6);
 795     __ bind(start);
 796     __ cmp(count, 8);
 797     __ br(Assembler::LO, small);
 798     if (direction == copy_forwards) {
 799       __ sub(s, s, 2 * wordSize);
 800       __ sub(d, d, 2 * wordSize);
 801     }
 802     __ subs(count, count, 16);
 803     __ br(Assembler::GE, large);
 804 
 805     // 8 <= count < 16 words.  Copy 8.
 806     __ ldp(t0, t1, Address(s, 2 * unit));
 807     __ ldp(t2, t3, Address(s, 4 * unit));
 808     __ ldp(t4, t5, Address(s, 6 * unit));
 809     __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 810 
 811     __ stp(t0, t1, Address(d, 2 * unit));
 812     __ stp(t2, t3, Address(d, 4 * unit));
 813     __ stp(t4, t5, Address(d, 6 * unit));
 814     __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 815 
 816     if (direction == copy_forwards) {
 817       __ add(s, s, 2 * wordSize);
 818       __ add(d, d, 2 * wordSize);
 819     }
 820 
 821     {
 822       Label L1, L2;
 823       __ bind(small);
 824       __ tbz(count, exact_log2(4), L1);
 825       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 826       __ ldp(t2, t3, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 827       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 828       __ stp(t2, t3, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 829       __ bind(L1);
 830 
 831       __ tbz(count, 1, L2);
 832       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 833       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 834       __ bind(L2);
 835     }
 836 
 837     __ ret(lr);
 838 
 839     __ align(6);
 840     __ bind(large);
 841 
 842     // Fill 8 registers
 843     __ ldp(t0, t1, Address(s, 2 * unit));
 844     __ ldp(t2, t3, Address(s, 4 * unit));
 845     __ ldp(t4, t5, Address(s, 6 * unit));
 846     __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 847 
 848     __ bind(again);
 849 
 850     if (direction == copy_forwards && PrefetchCopyIntervalInBytes > 0)
 851       __ prfm(Address(s, PrefetchCopyIntervalInBytes), PLDL1KEEP);
 852 
 853     __ stp(t0, t1, Address(d, 2 * unit));
 854     __ ldp(t0, t1, Address(s, 2 * unit));
 855     __ stp(t2, t3, Address(d, 4 * unit));
 856     __ ldp(t2, t3, Address(s, 4 * unit));
 857     __ stp(t4, t5, Address(d, 6 * unit));
 858     __ ldp(t4, t5, Address(s, 6 * unit));
 859     __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 860     __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 861 
 862     __ subs(count, count, 8);
 863     __ br(Assembler::HS, again);
 864 
 865     // Drain
 866     __ stp(t0, t1, Address(d, 2 * unit));
 867     __ stp(t2, t3, Address(d, 4 * unit));
 868     __ stp(t4, t5, Address(d, 6 * unit));
 869     __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 870 
 871     if (direction == copy_forwards) {
 872       __ add(s, s, 2 * wordSize);
 873       __ add(d, d, 2 * wordSize);
 874     }
 875 
 876     {
 877       Label L1, L2;
 878       __ tbz(count, exact_log2(4), L1);
 879       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 880       __ ldp(t2, t3, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 881       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 882       __ stp(t2, t3, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 883       __ bind(L1);
 884 
 885       __ tbz(count, 1, L2);
 886       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 887       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 888       __ bind(L2);
 889     }
 890 
 891     __ ret(lr);
 892   }
 893 
 894   // Small copy: less than 16 bytes.
 895   //
 896   // NB: Ignores all of the bits of count which represent more than 15
 897   // bytes, so a caller doesn't have to mask them.
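  //
  // The residue is moved by testing the low bits of count one at a
  // time.  For example (sketch, byte copy, granularity == 1) a residual
  // count of 13 == 0b1101 is handled as an 8-byte, then a 4-byte, then
  // a 1-byte transfer, matching the tbz tests below.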
 898 
 899   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
 900     bool is_backwards = step < 0;
 901     size_t granularity = uabs(step);
 902     int direction = is_backwards ? -1 : 1;
 903     int unit = wordSize * direction;
 904 
 905     Label Lpair, Lword, Lint, Lshort, Lbyte;
 906 
 907     assert(granularity
 908            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
 909 
 910     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
 911 
 912     // ??? I don't know if this bit-test-and-branch is the right thing
 913     // to do.  It does a lot of jumping, resulting in several
 914     // mispredicted branches.  It might make more sense to do this
 915     // with something like Duff's device with a single computed branch.
 916 
 917     __ tbz(count, 3 - exact_log2(granularity), Lword);
 918     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
 919     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
 920     __ bind(Lword);
 921 
 922     if (granularity <= sizeof (jint)) {
 923       __ tbz(count, 2 - exact_log2(granularity), Lint);
 924       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
 925       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
 926       __ bind(Lint);
 927     }
 928 
 929     if (granularity <= sizeof (jshort)) {
 930       __ tbz(count, 1 - exact_log2(granularity), Lshort);
 931       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
 932       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
 933       __ bind(Lshort);
 934     }
 935 
 936     if (granularity <= sizeof (jbyte)) {
 937       __ tbz(count, 0, Lbyte);
 938       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
 939       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
 940       __ bind(Lbyte);
 941     }
 942   }
 943 
 944   Label copy_f, copy_b;
 945 
 946   // All-singing all-dancing memory copy.
 947   //
 948   // Copy count units of memory from s to d.  The size of a unit is
 949   // step, which can be positive or negative depending on the direction
 950   // of copy.  If is_aligned is false, we align the source address.
 951   //
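  // Rough flow (sketch): copies of fewer than 16 bytes' worth of units
  // go straight to copy_memory_small; otherwise the source is aligned
  // to a 2-word boundary, the bulk of the data is moved in 8-word
  // blocks by calling copy_f or copy_b, and the remaining tail is
  // finished by copy_memory_small.
  //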
 952 
 953   void copy_memory(bool is_aligned, Register s, Register d,
 954                    Register count, Register tmp, int step) {
 955     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
 956     bool is_backwards = step < 0;
 957     int granularity = uabs(step);
 958     const Register t0 = r3, t1 = r4;
 959 
 960     if (is_backwards) {
 961       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
 962       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
 963     }
 964 
 965     Label done, tail;
 966 
 967     __ cmp(count, 16/granularity);
 968     __ br(Assembler::LO, tail);
 969 
 970     // Now we've got the small case out of the way we can align the
 971     // source address on a 2-word boundary.
 972 
 973     Label aligned;
 974 
 975     if (is_aligned) {
 976       // We may have to adjust by 1 word to get s 2-word-aligned.
 977       __ tbz(s, exact_log2(wordSize), aligned);
 978       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
 979       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
 980       __ sub(count, count, wordSize/granularity);
 981     } else {
 982       if (is_backwards) {
 983         __ andr(rscratch2, s, 2 * wordSize - 1);
 984       } else {
 985         __ neg(rscratch2, s);
 986         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
 987       }
 988       // rscratch2 is the byte adjustment needed to align s.
 989       __ cbz(rscratch2, aligned);
 990       __ lsr(rscratch2, rscratch2, exact_log2(granularity));
 991       __ sub(count, count, rscratch2);
 992 
 993 #if 0
 994       // ?? This code is only correct for a disjoint copy.  It may or
 995       // may not make sense to use it in that case.
 996 
 997       // Copy the first pair; s and d may not be aligned.
 998       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
 999       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1000 
1001       // Align s and d, adjust count
1002       if (is_backwards) {
1003         __ sub(s, s, rscratch2);
1004         __ sub(d, d, rscratch2);
1005       } else {
1006         __ add(s, s, rscratch2);
1007         __ add(d, d, rscratch2);
1008       }
1009 #else
1010       copy_memory_small(s, d, rscratch2, rscratch1, step);
1011 #endif
1012     }
1013 
1014     __ cmp(count, 16/granularity);
1015     __ br(Assembler::LT, tail);
1016     __ bind(aligned);
1017 
1018     // s is now 2-word-aligned.
1019 
1020     // We have a count of units and some trailing bytes.  Adjust the
1021     // count and do a bulk copy of words.
1022     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1023     if (direction == copy_forwards)
1024       __ bl(copy_f);
1025     else
1026       __ bl(copy_b);
1027 
1028     // And the tail.
1029 
1030     __ bind(tail);
1031     copy_memory_small(s, d, count, tmp, step);
1032   }
1033 
1034 
1035   void clobber_registers() {
1036 #ifdef ASSERT
1037     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1038     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1039     for (Register r = r3; r <= r18; r++)
1040       if (r != rscratch1) __ mov(r, rscratch1);
1041 #endif
1042   }
1043 
1044   // Scan over array at a for count oops, verifying each one.
1045   // Preserves a and count, clobbers rscratch1 and rscratch2.
1046   void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1047     Label loop, end;
1048     __ mov(rscratch1, a);
1049     __ mov(rscratch2, zr);
1050     __ bind(loop);
1051     __ cmp(rscratch2, count);
1052     __ br(Assembler::HS, end);
1053     if (size == (size_t)wordSize) {
1054       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1055       __ verify_oop(temp);
1056     } else {
1057       __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1058       __ decode_heap_oop(temp); // calls verify_oop
1059     }
1060     __ add(rscratch2, rscratch2, size);
1061     __ b(loop);
1062     __ bind(end);
1063   }
1064 
1065   // Arguments:
1066   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1067   //             ignored
1068   //   is_oop  - true => oop array, so generate store check code
1069   //   name    - stub name string
1070   //
1071   // Inputs:
1072   //   c_rarg0   - source array address
1073   //   c_rarg1   - destination array address
1074   //   c_rarg2   - element count, treated as ssize_t, can be zero
1075   //
1076   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1077   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1079   //
1080   // Side Effects:
1081   //   disjoint_int_copy_entry is set to the no-overlap entry point
1082   //   used by generate_conjoint_int_oop_copy().
1083   //
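  // For illustration, the stubs produced here are typically installed
  // into StubRoutines later in this file (in generate_arraycopy_stubs());
  // a rough sketch of that wiring, using the usual field names but not
  // verified here:
  //
  //   StubRoutines::_jbyte_disjoint_arraycopy =
  //       generate_disjoint_byte_copy(false, &entry, "jbyte_disjoint_arraycopy");
  //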
1084   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
1085                                   const char *name, bool dest_uninitialized = false) {
1086     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1087     __ align(CodeEntryAlignment);
1088     StubCodeMark mark(this, "StubRoutines", name);
1089     address start = __ pc();
1090     __ enter();
1091 
1092     if (entry != NULL) {
1093       *entry = __ pc();
1094       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1095       BLOCK_COMMENT("Entry:");
1096     }
1097 
1098     if (is_oop) {
1099       __ push(RegSet::of(d, count), sp);
1100       // no registers are destroyed by this call
1101       gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
1102     }
1103     copy_memory(aligned, s, d, count, rscratch1, size);
1104     if (is_oop) {
1105       __ pop(RegSet::of(d, count), sp);
1106       if (VerifyOops)
1107         verify_oop_array(size, d, count, r16);
1108       __ sub(count, count, 1); // make an inclusive end pointer
1109       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1110       gen_write_ref_array_post_barrier(d, count, rscratch1);
1111     }
1112     __ leave();
1113     __ mov(r0, zr); // return 0
1114     __ ret(lr);
1115 #ifdef BUILTIN_SIM
1116     {
1117       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1118       sim->notifyCompile(const_cast<char*>(name), start);
1119     }
1120 #endif
1121     return start;
1122   }
1123 
1124   // Arguments:
1125   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1126   //             ignored
1127   //   is_oop  - true => oop array, so generate store check code
1128   //   name    - stub name string
1129   //
1130   // Inputs:
1131   //   c_rarg0   - source array address
1132   //   c_rarg1   - destination array address
1133   //   c_rarg2   - element count, treated as ssize_t, can be zero
1134   //
1135   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1136   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1138   //
1139   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1140                                  address *entry, const char *name,
1141                                  bool dest_uninitialized = false) {
1142     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1143 
1144     StubCodeMark mark(this, "StubRoutines", name);
1145     address start = __ pc();
1146     __ enter();
1147 
1148     if (entry != NULL) {
1149       *entry = __ pc();
1150       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1151       BLOCK_COMMENT("Entry:");
1152     }
1153     __ cmp(d, s);
1154     __ br(Assembler::LS, nooverlap_target);
1155 
1156     if (is_oop) {
1157       __ push(RegSet::of(d, count), sp);
1158       // no registers are destroyed by this call
1159       gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
1160     }
1161     copy_memory(aligned, s, d, count, rscratch1, -size);
1162     if (is_oop) {
1163       __ pop(RegSet::of(d, count), sp);
1164       if (VerifyOops)
1165         verify_oop_array(size, d, count, r16);
1166       __ sub(count, count, 1); // make an inclusive end pointer
1167       __ lea(count, Address(d, count, Address::uxtw(exact_log2(size))));
1168       gen_write_ref_array_post_barrier(d, count, rscratch1);
1169     }
1170     __ leave();
1171     __ mov(r0, zr); // return 0
1172     __ ret(lr);
1173 #ifdef BUILTIN_SIM
1174     {
1175       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1176       sim->notifyCompile(const_cast<char*>(name), start);
1177     }
1178 #endif
1179     return start;
  }
1181 
1182   // Arguments:
1183   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1184   //             ignored
1185   //   name    - stub name string
1186   //
1187   // Inputs:
1188   //   c_rarg0   - source array address
1189   //   c_rarg1   - destination array address
1190   //   c_rarg2   - element count, treated as ssize_t, can be zero
1191   //
1192   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1193   // we let the hardware handle it.  The one to eight bytes within words,
1194   // dwords or qwords that span cache line boundaries will still be loaded
1195   // and stored atomically.
1196   //
1204   // Side Effects:
1205   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1206   //   used by generate_conjoint_byte_copy().
1207   //
1208   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1209     const bool not_oop = false;
1210     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1211   }
1212 
1213   // Arguments:
1214   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1215   //             ignored
1216   //   name    - stub name string
1217   //
1218   // Inputs:
1219   //   c_rarg0   - source array address
1220   //   c_rarg1   - destination array address
1221   //   c_rarg2   - element count, treated as ssize_t, can be zero
1222   //
1223   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1224   // we let the hardware handle it.  The one to eight bytes within words,
1225   // dwords or qwords that span cache line boundaries will still be loaded
1226   // and stored atomically.
1227   //
1228   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1229                                       address* entry, const char *name) {
1230     const bool not_oop = false;
1231     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1232   }
1233 
1234   // Arguments:
1235   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1236   //             ignored
1237   //   name    - stub name string
1238   //
1239   // Inputs:
1240   //   c_rarg0   - source array address
1241   //   c_rarg1   - destination array address
1242   //   c_rarg2   - element count, treated as ssize_t, can be zero
1243   //
1244   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1245   // let the hardware handle it.  The two or four words within dwords
1246   // or qwords that span cache line boundaries will still be loaded
1247   // and stored atomically.
1248   //
1249   // Side Effects:
1250   //   disjoint_short_copy_entry is set to the no-overlap entry point
1251   //   used by generate_conjoint_short_copy().
1252   //
1253   address generate_disjoint_short_copy(bool aligned,
1254                                        address* entry, const char *name) {
1255     const bool not_oop = false;
1256     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1257   }
1258 
1259   // Arguments:
1260   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1261   //             ignored
1262   //   name    - stub name string
1263   //
1264   // Inputs:
1265   //   c_rarg0   - source array address
1266   //   c_rarg1   - destination array address
1267   //   c_rarg2   - element count, treated as ssize_t, can be zero
1268   //
1269   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1270   // let the hardware handle it.  The two or four words within dwords
1271   // or qwords that span cache line boundaries will still be loaded
1272   // and stored atomically.
1273   //
1274   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1275                                        address *entry, const char *name) {
1276     const bool not_oop = false;
1277     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
1281   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1282   //             ignored
1283   //   name    - stub name string
1284   //
1285   // Inputs:
1286   //   c_rarg0   - source array address
1287   //   c_rarg1   - destination array address
1288   //   c_rarg2   - element count, treated as ssize_t, can be zero
1289   //
1290   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1291   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1293   //
1294   // Side Effects:
1295   //   disjoint_int_copy_entry is set to the no-overlap entry point
1296   //   used by generate_conjoint_int_oop_copy().
1297   //
1298   address generate_disjoint_int_copy(bool aligned, address *entry,
1299                                          const char *name, bool dest_uninitialized = false) {
1300     const bool not_oop = false;
1301     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1302   }
1303 
1304   // Arguments:
1305   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1306   //             ignored
1307   //   name    - stub name string
1308   //
1309   // Inputs:
1310   //   c_rarg0   - source array address
1311   //   c_rarg1   - destination array address
1312   //   c_rarg2   - element count, treated as ssize_t, can be zero
1313   //
1314   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1315   // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1317   //
1318   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1319                                      address *entry, const char *name,
1320                                      bool dest_uninitialized = false) {
1321     const bool not_oop = false;
1322     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1323   }
1324 
1325 
1326   // Arguments:
1327   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1328   //             ignored
1329   //   name    - stub name string
1330   //
1331   // Inputs:
1332   //   c_rarg0   - source array address
1333   //   c_rarg1   - destination array address
1334   //   c_rarg2   - element count, treated as size_t, can be zero
1335   //
1336   // Side Effects:
1337   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1338   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1339   //
1340   address generate_disjoint_long_copy(bool aligned, address *entry,
1341                                           const char *name, bool dest_uninitialized = false) {
1342     const bool not_oop = false;
1343     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1344   }
1345 
1346   // Arguments:
1347   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1348   //             ignored
1349   //   name    - stub name string
1350   //
1351   // Inputs:
1352   //   c_rarg0   - source array address
1353   //   c_rarg1   - destination array address
1354   //   c_rarg2   - element count, treated as size_t, can be zero
1355   //
1356   address generate_conjoint_long_copy(bool aligned,
1357                                       address nooverlap_target, address *entry,
1358                                       const char *name, bool dest_uninitialized = false) {
1359     const bool not_oop = false;
1360     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1361   }
1362 
1363   // Arguments:
1364   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1365   //             ignored
1366   //   name    - stub name string
1367   //
1368   // Inputs:
1369   //   c_rarg0   - source array address
1370   //   c_rarg1   - destination array address
1371   //   c_rarg2   - element count, treated as size_t, can be zero
1372   //
1373   // Side Effects:
1374   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1375   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1376   //
1377   address generate_disjoint_oop_copy(bool aligned, address *entry,
1378                                      const char *name, bool dest_uninitialized = false) {
1379     const bool is_oop = true;
1380     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1381     return generate_disjoint_copy(size, aligned, is_oop, entry, name);
1382   }
1383 
1384   // Arguments:
1385   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1386   //             ignored
1387   //   name    - stub name string
1388   //
1389   // Inputs:
1390   //   c_rarg0   - source array address
1391   //   c_rarg1   - destination array address
1392   //   c_rarg2   - element count, treated as size_t, can be zero
1393   //
1394   address generate_conjoint_oop_copy(bool aligned,
1395                                      address nooverlap_target, address *entry,
1396                                      const char *name, bool dest_uninitialized = false) {
1397     const bool is_oop = true;
1398     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1399     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry, name);
1400   }
1401 
1402 
1403   // Helper for generating a dynamic type check.
1404   // Smashes rscratch1.
1405   void generate_type_check(Register sub_klass,
1406                            Register super_check_offset,
1407                            Register super_klass,
1408                            Label& L_success) {
1409     assert_different_registers(sub_klass, super_check_offset, super_klass);
1410 
1411     BLOCK_COMMENT("type_check:");
1412 
1413     Label L_miss;
1414 
1415     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1416                                      super_check_offset);
1417     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1418 
1419     // Fall through on failure!
1420     __ BIND(L_miss);
1421   }
1422 
1423   //
1424   //  Generate checkcasting array copy stub
1425   //
1426   //  Input:
1427   //    c_rarg0   - source array address
1428   //    c_rarg1   - destination array address
1429   //    c_rarg2   - element count, treated as ssize_t, can be zero
1430   //    c_rarg3   - size_t ckoff (super_check_offset)
1431   //    c_rarg4   - oop ckval (super_klass)
1432   //
1433   //  Output:
1434   //    r0 ==  0  -  success
1435   //    r0 == -1^K - failure, where K is partial transfer count
1436   //
1437   address generate_checkcast_copy(const char *name, address *entry,
1438                                   bool dest_uninitialized = false) {
1439 
1440     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1441 
1442     // Input registers (after setup_arg_regs)
1443     const Register from        = c_rarg0;   // source array address
1444     const Register to          = c_rarg1;   // destination array address
1445     const Register count       = c_rarg2;   // elements count
1446     const Register ckoff       = c_rarg3;   // super_check_offset
1447     const Register ckval       = c_rarg4;   // super_klass
1448 
1449     // Registers used as temps (r18, r19, r20 are save-on-entry)
1450     const Register count_save  = r21;       // original elements count
1451     const Register start_to    = r20;       // destination array start address
1452     const Register copied_oop  = r18;       // actual oop copied
1453     const Register r19_klass   = r19;       // oop._klass
1454 
1455     //---------------------------------------------------------------
1456     // Assembler stub will be used for this call to arraycopy
1457     // if the two arrays are subtypes of Object[] but the
1458     // destination array type is not equal to or a supertype
1459     // of the source type.  Each element must be separately
1460     // checked.
1461 
1462     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1463                                copied_oop, r19_klass, count_save);
1464 
1465     __ align(CodeEntryAlignment);
1466     StubCodeMark mark(this, "StubRoutines", name);
1467     address start = __ pc();
1468 
1469     __ enter(); // required for proper stackwalking of RuntimeStub frame
1470 
1471 #ifdef ASSERT
1472     // caller guarantees that the arrays really are different
1473     // otherwise, we would have to make conjoint checks
1474     { Label L;
1475       array_overlap_test(L, TIMES_OOP);
1476       __ stop("checkcast_copy within a single array");
1477       __ bind(L);
1478     }
1479 #endif //ASSERT
1480 
1481     // Caller of this entry point must set up the argument registers.
1482     if (entry != NULL) {
1483       *entry = __ pc();
1484       BLOCK_COMMENT("Entry:");
1485     }
1486 
1487     // Empty array:  Nothing to do.
1488     __ cbz(count, L_done);
1489 
1490     __ push(RegSet::of(r18, r19, r20, r21), sp);
1491 
1492 #ifdef ASSERT
1493     BLOCK_COMMENT("assert consistent ckoff/ckval");
1494     // The ckoff and ckval must be mutually consistent,
1495     // even though caller generates both.
1496     { Label L;
1497       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1498       __ ldrw(start_to, Address(ckval, sco_offset));
1499       __ cmpw(ckoff, start_to);
1500       __ br(Assembler::EQ, L);
1501       __ stop("super_check_offset inconsistent");
1502       __ bind(L);
1503     }
1504 #endif //ASSERT
1505 
1506     // save the original count
1507     __ mov(count_save, count);
1508 
1509     // Copy from low to high addresses
1510     __ mov(start_to, to);              // Save destination array start address
1511     __ b(L_load_element);
1512 
1513     // ======== begin loop ========
1514     // (Loop is rotated; its entry is L_load_element.)
1515     // Loop control:
1516     //   for (; count != 0; count--) {
1517     //     copied_oop = load_heap_oop(from++);
1518     //     ... generate_type_check ...;
1519     //     store_heap_oop(to++, copied_oop);
1520     //   }
1521     __ align(OptoLoopAlignment);
1522 
1523     __ BIND(L_store_element);
1524     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop);  // store the oop
1525     __ sub(count, count, 1);
1526     __ cbz(count, L_do_card_marks);
1527 
1528     // ======== loop entry is here ========
1529     __ BIND(L_load_element);
1530     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop
1531     __ cbz(copied_oop, L_store_element);
1532 
1533     __ load_klass(r19_klass, copied_oop);// query the object klass
1534     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1535     // ======== end loop ========
1536 
1537     // It was a real error; we must depend on the caller to finish the job.
1538     // Register count = remaining oops, count_orig = total oops.
1539     // Emit GC store barriers for the oops we have copied and report
1540     // their number to the caller.
1541 
1542     __ subs(count, count_save, count);     // K = partially copied oop count
1543     __ eon(count, count, zr);              // report (-1^K) to caller
1544     __ br(Assembler::EQ, L_done_pop);
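         // Informal note: the EQ branch above re-uses the flags from the subs,
         // skipping the card marks when no oops were copied (K == 0, r0 == -1).
         // In general the caller receives r0 == ~K == -(K+1) on failure and can
         // recover the partial transfer count as K == ~r0; r0 == 0 is produced
         // only on the full-success path at L_done.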
1545 
1546     __ BIND(L_do_card_marks);
1547     __ add(to, to, -heapOopSize);         // make an inclusive end pointer
1548     gen_write_ref_array_post_barrier(start_to, to, rscratch1);
1549 
1550     __ bind(L_done_pop);
1551     __ pop(RegSet::of(r18, r19, r20, r21), sp);
1552     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1553 
1554     __ bind(L_done);
1555     __ mov(r0, count);
1556     __ leave();
1557     __ ret(lr);
1558 
1559     return start;
1560   }
1561 
1562   // Perform range checks on the proposed arraycopy.
1563   // Kills temp, but nothing else.
1564   // Also, clean the sign bits of src_pos and dst_pos.
1565   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1566                               Register src_pos, // source position (c_rarg1)
1567                               Register dst,     // destination array oop (c_rarg2)
1568                               Register dst_pos, // destination position (c_rarg3)
1569                               Register length,
1570                               Register temp,
1571                               Label& L_failed) {
1572     BLOCK_COMMENT("arraycopy_range_checks:");
1573 
1574     assert_different_registers(rscratch1, temp);
1575 
1576     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1577     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1578     __ addw(temp, length, src_pos);
1579     __ cmpw(temp, rscratch1);
1580     __ br(Assembler::HI, L_failed);
1581 
1582     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1583     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1584     __ addw(temp, length, dst_pos);
1585     __ cmpw(temp, rscratch1);
1586     __ br(Assembler::HI, L_failed);
1587 
1588     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1589     __ movw(src_pos, src_pos);
1590     __ movw(dst_pos, dst_pos);
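         // (movw with the same source and destination writes the 32-bit W
         // register, which zero-extends into the upper half of the X register,
         // clearing any stale high bits.)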
1591 
1592     BLOCK_COMMENT("arraycopy_range_checks done");
1593   }
1594 
1595   // This stub is currently only called from a simple test routine.
1596   // It will be implemented properly once it is called from code
1597   // that actually copies data.
1598   static void fake_arraycopy_stub(address src, address dst, int count) {
1599     assert(count == 0, "huh?");
1600   }
1601 
1602 
1603   //
1604   //  Generate 'unsafe' array copy stub
1605   //  Though just as safe as the other stubs, it takes an unscaled
1606   //  size_t argument instead of an element count.
1607   //
1608   //  Input:
1609   //    c_rarg0   - source array address
1610   //    c_rarg1   - destination array address
1611   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1612   //
1613   // This version simply forwards to the byte copy stub; other ports
1614   // dispatch on operand alignment to long, int, short, or byte loops.
1615   //
1616   address generate_unsafe_copy(const char *name,
1617                                address byte_copy_entry) {
1618 #ifdef PRODUCT
1619     return StubRoutines::_jbyte_arraycopy;
1620 #else
1621     __ align(CodeEntryAlignment);
1622     StubCodeMark mark(this, "StubRoutines", name);
1623     address start = __ pc();
1624     __ enter(); // required for proper stackwalking of RuntimeStub frame
1625     // bump this on entry, not on exit:
1626     __ lea(rscratch2, ExternalAddress((address)&SharedRuntime::_unsafe_array_copy_ctr));
1627     __ incrementw(Address(rscratch2));
1628     __ b(RuntimeAddress(byte_copy_entry));
1629     return start;
1630 #endif
1631   }
1632 
1633   //
1634   //  Generate generic array copy stubs
1635   //
1636   //  Input:
1637   //    c_rarg0    -  src oop
1638   //    c_rarg1    -  src_pos (32-bits)
1639   //    c_rarg2    -  dst oop
1640   //    c_rarg3    -  dst_pos (32-bits)
1641   //    c_rarg4    -  element count (32-bits)
1642   //
1643   //  Output:
1644   //    r0 ==  0  -  success
1645   //    r0 == -1^K - failure, where K is partial transfer count
1646   //
1647   address generate_generic_copy(const char *name,
1648                                 address byte_copy_entry, address short_copy_entry,
1649                                 address int_copy_entry, address oop_copy_entry,
1650                                 address long_copy_entry, address checkcast_copy_entry) {
1651 
1652     Label L_failed, L_failed_0, L_objArray;
1653     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1654 
1655     // Input registers
1656     const Register src        = c_rarg0;  // source array oop
1657     const Register src_pos    = c_rarg1;  // source position
1658     const Register dst        = c_rarg2;  // destination array oop
1659     const Register dst_pos    = c_rarg3;  // destination position
1660     const Register length     = c_rarg4;
1661 
1662     StubCodeMark mark(this, "StubRoutines", name);
1663 
1664     __ align(CodeEntryAlignment);
1665     address start = __ pc();
1666 
1667     __ enter(); // required for proper stackwalking of RuntimeStub frame
1668 
1669     // bump this on entry, not on exit:
1670     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1671 
1672     //-----------------------------------------------------------------------
1673     // Assembler stub will be used for this call to arraycopy
1674     // if the following conditions are met:
1675     //
1676     // (1) src and dst must not be null.
1677     // (2) src_pos must not be negative.
1678     // (3) dst_pos must not be negative.
1679     // (4) length  must not be negative.
1680     // (5) src klass and dst klass should be the same and not NULL.
1681     // (6) src and dst should be arrays.
1682     // (7) src_pos + length must not exceed length of src.
1683     // (8) dst_pos + length must not exceed length of dst.
1684     //
1685 
1686     //  if (src == NULL) return -1;
1687     __ cbz(src, L_failed);
1688 
1689     //  if (src_pos < 0) return -1;
1690     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
1691 
1692     //  if (dst == NULL) return -1;
1693     __ cbz(dst, L_failed);
1694 
1695     //  if (dst_pos < 0) return -1;
1696     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
1697 
1698     // registers used as temp
1699     const Register scratch_length    = r16; // elements count to copy
1700     const Register scratch_src_klass = r17; // array klass
1701     const Register lh                = r18; // layout helper
1702 
1703     //  if (length < 0) return -1;
1704     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
1705     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
1706 
1707     __ load_klass(scratch_src_klass, src);
1708 #ifdef ASSERT
1709     //  assert(src->klass() != NULL);
1710     {
1711       BLOCK_COMMENT("assert klasses not null {");
1712       Label L1, L2;
1713       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
1714       __ bind(L1);
1715       __ stop("broken null klass");
1716       __ bind(L2);
1717       __ load_klass(rscratch1, dst);
1718       __ cbz(rscratch1, L1);     // this would be broken also
1719       BLOCK_COMMENT("} assert klasses not null done");
1720     }
1721 #endif
1722 
1723     // Load layout helper (32-bits)
1724     //
1725     //  |array_tag|     | header_size | element_type |     |log2_element_size|
1726     // 32        30    24            16              8     2                 0
1727     //
1728     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
1729     //
1730 
1731     const int lh_offset = in_bytes(Klass::layout_helper_offset());
1732 
1733     // Handle objArrays completely differently...
1734     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
1735     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
1736     __ movw(rscratch1, objArray_lh);
1737     __ eorw(rscratch2, lh, rscratch1);
1738     __ cbzw(rscratch2, L_objArray);
1739 
1740     //  if (src->klass() != dst->klass()) return -1;
1741     __ load_klass(rscratch2, dst);
1742     __ eor(rscratch2, rscratch2, scratch_src_klass);
1743     __ cbnz(rscratch2, L_failed);
1744 
1745     //  if (!src->is_Array()) return -1;
1746     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
1747 
1748     // At this point, it is known to be a typeArray (array_tag 0x3).
1749 #ifdef ASSERT
1750     {
1751       BLOCK_COMMENT("assert primitive array {");
1752       Label L;
1753       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
1754       __ cmpw(lh, rscratch2);
1755       __ br(Assembler::GE, L);
1756       __ stop("must be a primitive array");
1757       __ bind(L);
1758       BLOCK_COMMENT("} assert primitive array done");
1759     }
1760 #endif
1761 
1762     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
1763                            rscratch2, L_failed);
1764 
1765     // TypeArrayKlass
1766     //
1767     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
1768     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
1769     //
1770 
1771     const Register rscratch1_offset = rscratch1;    // array offset
1772     const Register r18_elsize = lh; // element size
1773 
1774     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
1775            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
1776     __ add(src, src, rscratch1_offset);           // src array offset
1777     __ add(dst, dst, rscratch1_offset);           // dst array offset
1778     BLOCK_COMMENT("choose copy loop based on element size");
1779 
1780     // next registers should be set before the jump to corresponding stub
1781     const Register from     = c_rarg0;  // source array address
1782     const Register to       = c_rarg1;  // destination array address
1783     const Register count    = c_rarg2;  // elements count
1784 
1785     // 'from', 'to' and 'count' must be assigned in this order because
1786     // they alias 'src', 'src_pos' and 'dst' respectively.
1787 
1788     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
1789 
1790     // The possible values of elsize are 0-3, i.e. exact_log2(element
1791     // size in bytes).  We do a simple bitwise binary search.
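         // Informally: elsize 0 = byte, 1 = short, 2 = int, 3 = long; bit 1
         // picks the byte/short vs. int/long half and bit 0 picks the element
         // size within that half.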
1792   __ BIND(L_copy_bytes);
1793     __ tbnz(r18_elsize, 1, L_copy_ints);
1794     __ tbnz(r18_elsize, 0, L_copy_shorts);
1795     __ lea(from, Address(src, src_pos));// src_addr
1796     __ lea(to,   Address(dst, dst_pos));// dst_addr
1797     __ movw(count, scratch_length); // length
1798     __ b(RuntimeAddress(byte_copy_entry));
1799 
1800   __ BIND(L_copy_shorts);
1801     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
1802     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
1803     __ movw(count, scratch_length); // length
1804     __ b(RuntimeAddress(short_copy_entry));
1805 
1806   __ BIND(L_copy_ints);
1807     __ tbnz(r18_elsize, 0, L_copy_longs);
1808     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
1809     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
1810     __ movw(count, scratch_length); // length
1811     __ b(RuntimeAddress(int_copy_entry));
1812 
1813   __ BIND(L_copy_longs);
1814 #ifdef ASSERT
1815     {
1816       BLOCK_COMMENT("assert long copy {");
1817       Label L;
1818       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
1819       __ cmpw(r18_elsize, LogBytesPerLong);
1820       __ br(Assembler::EQ, L);
1821       __ stop("must be long copy, but elsize is wrong");
1822       __ bind(L);
1823       BLOCK_COMMENT("} assert long copy done");
1824     }
1825 #endif
1826     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
1827     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
1828     __ movw(count, scratch_length); // length
1829     __ b(RuntimeAddress(long_copy_entry));
1830 
1831     // ObjArrayKlass
1832   __ BIND(L_objArray);
1833     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
1834 
1835     Label L_plain_copy, L_checkcast_copy;
1836     //  test array classes for subtyping
1837     __ load_klass(r18, dst);
1838     __ cmp(scratch_src_klass, r18); // usual case is exact equality
1839     __ br(Assembler::NE, L_checkcast_copy);
1840 
1841     // Identically typed arrays can be copied without element-wise checks.
1842     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
1843                            rscratch2, L_failed);
1844 
1845     __ lea(from, Address(src, src_pos, Address::lsl(3)));
1846     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
1847     __ lea(to, Address(dst, dst_pos, Address::lsl(3)));
1848     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
1849     __ movw(count, scratch_length); // length
1850   __ BIND(L_plain_copy);
1851     __ b(RuntimeAddress(oop_copy_entry));
1852 
1853   __ BIND(L_checkcast_copy);
1854     // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
1855     {
1856       // Before looking at dst.length, make sure dst is also an objArray.
1857       __ ldrw(rscratch1, Address(r18, lh_offset));
1858       __ movw(rscratch2, objArray_lh);
1859       __ eorw(rscratch1, rscratch1, rscratch2);
1860       __ cbnzw(rscratch1, L_failed);
1861 
1862       // It is safe to examine both src.length and dst.length.
1863       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
1864                              r18, L_failed);
1865 
1866       const Register rscratch2_dst_klass = rscratch2;
1867       __ load_klass(rscratch2_dst_klass, dst); // reload
1868 
1869       // Marshal the base address arguments now, freeing registers.
1870       __ lea(from, Address(src, src_pos, Address::lsl(3)));
1871       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
1872       __ lea(to, Address(dst, dst_pos, Address::lsl(3)));
1873       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
1874       __ movw(count, length);           // length (reloaded)
1875       Register sco_temp = c_rarg3;      // this register is free now
1876       assert_different_registers(from, to, count, sco_temp,
1877                                  rscratch2_dst_klass, scratch_src_klass);
1878       // assert_clean_int(count, sco_temp);
1879 
1880       // Generate the type check.
1881       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
1882       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
1883       // assert_clean_int(sco_temp, r18);
1884       generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy);
1885 
1886       // Fetch destination element klass from the ObjArrayKlass header.
1887       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
1888       __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset));
1889       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
1890 
1891       // the checkcast_copy loop needs two extra arguments:
1892       assert(c_rarg3 == sco_temp, "#3 already in place");
1893       // Set up arguments for checkcast_copy_entry.
1894       __ mov(c_rarg4, rscratch2_dst_klass);  // dst.klass.element_klass
1895       __ b(RuntimeAddress(checkcast_copy_entry));
1896     }
1897 
1898   __ BIND(L_failed);
1899     __ mov(r0, -1);
1900     __ leave();   // required for proper stackwalking of RuntimeStub frame
1901     __ ret(lr);
1902 
1903     return start;
1904   }
1905 
1906   void generate_arraycopy_stubs() {
1907     address entry;
1908     address entry_jbyte_arraycopy;
1909     address entry_jshort_arraycopy;
1910     address entry_jint_arraycopy;
1911     address entry_oop_arraycopy;
1912     address entry_jlong_arraycopy;
1913     address entry_checkcast_arraycopy;
1914 
1915     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
1916     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
1917 
1918     //*** jbyte
1919     // Always need aligned and unaligned versions
1920     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
1921                                                                                   "jbyte_disjoint_arraycopy");
1922     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
1923                                                                                   &entry_jbyte_arraycopy,
1924                                                                                   "jbyte_arraycopy");
1925     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
1926                                                                                   "arrayof_jbyte_disjoint_arraycopy");
1927     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
1928                                                                                   "arrayof_jbyte_arraycopy");
1929 
1930     //*** jshort
1931     // Always need aligned and unaligned versions
1932     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
1933                                                                                     "jshort_disjoint_arraycopy");
1934     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
1935                                                                                     &entry_jshort_arraycopy,
1936                                                                                     "jshort_arraycopy");
1937     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
1938                                                                                     "arrayof_jshort_disjoint_arraycopy");
1939     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
1940                                                                                     "arrayof_jshort_arraycopy");
1941 
1942     //*** jint
1943     // Aligned versions
1944     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
1945                                                                                 "arrayof_jint_disjoint_arraycopy");
1946     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
1947                                                                                 "arrayof_jint_arraycopy");
1948     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
1949     // entry_jint_arraycopy always points to the unaligned version
1950     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
1951                                                                                 "jint_disjoint_arraycopy");
1952     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
1953                                                                                 &entry_jint_arraycopy,
1954                                                                                 "jint_arraycopy");
1955 
1956     //*** jlong
1957     // It is always aligned
1958     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
1959                                                                                   "arrayof_jlong_disjoint_arraycopy");
1960     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
1961                                                                                   "arrayof_jlong_arraycopy");
1962     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
1963     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
1964 
1965     //*** oops
1966     {
1967       // With compressed oops we need unaligned versions; notice that
1968       // we overwrite entry_oop_arraycopy.
1969       bool aligned = !UseCompressedOops;
1970 
1971       StubRoutines::_arrayof_oop_disjoint_arraycopy
1972         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy");
1973       StubRoutines::_arrayof_oop_arraycopy
1974         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy");
1975       // Aligned versions without pre-barriers
1976       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
1977         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
1978                                      /*dest_uninitialized*/true);
1979       StubRoutines::_arrayof_oop_arraycopy_uninit
1980         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
1981                                      /*dest_uninitialized*/true);
1982     }
1983 
1984     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
1985     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
1986     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
1987     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
1988 
1989     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
1990     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
1991                                                                         /*dest_uninitialized*/true);
1992 
1993     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
1994                                                               entry_jbyte_arraycopy);
1995 
1996     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
1997                                                                entry_jbyte_arraycopy,
1998                                                                entry_jshort_arraycopy,
1999                                                                entry_jint_arraycopy,
2000                                                                entry_oop_arraycopy,
2001                                                                entry_jlong_arraycopy,
2002                                                                entry_checkcast_arraycopy);
2003 
2004   }
2005 
2006   void generate_math_stubs() { Unimplemented(); }
2007 
2008   // Arguments:
2009   //
2010   // Inputs:
2011   //   c_rarg0   - source byte array address
2012   //   c_rarg1   - destination byte array address
2013   //   c_rarg2   - K (key) in little endian int array
2014   //
2015   address generate_aescrypt_encryptBlock() {
2016     __ align(CodeEntryAlignment);
2017     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2018 
2019     Label L_doLast;
2020 
2021     const Register from        = c_rarg0;  // source array address
2022     const Register to          = c_rarg1;  // destination array address
2023     const Register key         = c_rarg2;  // key array address
2024     const Register keylen      = rscratch1;
2025 
2026     address start = __ pc();
2027     __ enter();
2028 
2029     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
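         // keylen is the length of the expanded key in 32-bit words: 44, 52 or
         // 60 for AES-128, AES-192 and AES-256 respectively (10, 12 or 14 rounds).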
2030 
2031     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2032 
2033     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2034     __ rev32(v1, __ T16B, v1);
2035     __ rev32(v2, __ T16B, v2);
2036     __ rev32(v3, __ T16B, v3);
2037     __ rev32(v4, __ T16B, v4);
2038     __ aese(v0, v1);
2039     __ aesmc(v0, v0);
2040     __ aese(v0, v2);
2041     __ aesmc(v0, v0);
2042     __ aese(v0, v3);
2043     __ aesmc(v0, v0);
2044     __ aese(v0, v4);
2045     __ aesmc(v0, v0);
2046 
2047     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2048     __ rev32(v1, __ T16B, v1);
2049     __ rev32(v2, __ T16B, v2);
2050     __ rev32(v3, __ T16B, v3);
2051     __ rev32(v4, __ T16B, v4);
2052     __ aese(v0, v1);
2053     __ aesmc(v0, v0);
2054     __ aese(v0, v2);
2055     __ aesmc(v0, v0);
2056     __ aese(v0, v3);
2057     __ aesmc(v0, v0);
2058     __ aese(v0, v4);
2059     __ aesmc(v0, v0);
2060 
2061     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2062     __ rev32(v1, __ T16B, v1);
2063     __ rev32(v2, __ T16B, v2);
2064 
2065     __ cmpw(keylen, 44);
2066     __ br(Assembler::EQ, L_doLast);
2067 
2068     __ aese(v0, v1);
2069     __ aesmc(v0, v0);
2070     __ aese(v0, v2);
2071     __ aesmc(v0, v0);
2072 
2073     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2074     __ rev32(v1, __ T16B, v1);
2075     __ rev32(v2, __ T16B, v2);
2076 
2077     __ cmpw(keylen, 52);
2078     __ br(Assembler::EQ, L_doLast);
2079 
2080     __ aese(v0, v1);
2081     __ aesmc(v0, v0);
2082     __ aese(v0, v2);
2083     __ aesmc(v0, v0);
2084 
2085     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2086     __ rev32(v1, __ T16B, v1);
2087     __ rev32(v2, __ T16B, v2);
2088 
2089     __ BIND(L_doLast);
2090 
2091     __ aese(v0, v1);
2092     __ aesmc(v0, v0);
2093     __ aese(v0, v2);
2094 
2095     __ ld1(v1, __ T16B, key);
2096     __ rev32(v1, __ T16B, v1);
2097     __ eor(v0, __ T16B, v0, v1);
2098 
2099     __ st1(v0, __ T16B, to);
2100 
2101     __ mov(r0, 0);
2102 
2103     __ leave();
2104     __ ret(lr);
2105 
2106     return start;
2107   }
2108 
2109   // Arguments:
2110   //
2111   // Inputs:
2112   //   c_rarg0   - source byte array address
2113   //   c_rarg1   - destination byte array address
2114   //   c_rarg2   - K (key) in little endian int array
2115   //
2116   address generate_aescrypt_decryptBlock() {
2117     assert(UseAES, "need AES instructions support");
2118     __ align(CodeEntryAlignment);
2119     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2120     Label L_doLast;
2121 
2122     const Register from        = c_rarg0;  // source array address
2123     const Register to          = c_rarg1;  // destination array address
2124     const Register key         = c_rarg2;  // key array address
2125     const Register keylen      = rscratch1;
2126 
2127     address start = __ pc();
2128     __ enter(); // required for proper stackwalking of RuntimeStub frame
2129 
2130     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2131 
2132     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2133 
2134     __ ld1(v5, __ T16B, __ post(key, 16));
2135     __ rev32(v5, __ T16B, v5);
2136 
2137     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2138     __ rev32(v1, __ T16B, v1);
2139     __ rev32(v2, __ T16B, v2);
2140     __ rev32(v3, __ T16B, v3);
2141     __ rev32(v4, __ T16B, v4);
2142     __ aesd(v0, v1);
2143     __ aesimc(v0, v0);
2144     __ aesd(v0, v2);
2145     __ aesimc(v0, v0);
2146     __ aesd(v0, v3);
2147     __ aesimc(v0, v0);
2148     __ aesd(v0, v4);
2149     __ aesimc(v0, v0);
2150 
2151     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2152     __ rev32(v1, __ T16B, v1);
2153     __ rev32(v2, __ T16B, v2);
2154     __ rev32(v3, __ T16B, v3);
2155     __ rev32(v4, __ T16B, v4);
2156     __ aesd(v0, v1);
2157     __ aesimc(v0, v0);
2158     __ aesd(v0, v2);
2159     __ aesimc(v0, v0);
2160     __ aesd(v0, v3);
2161     __ aesimc(v0, v0);
2162     __ aesd(v0, v4);
2163     __ aesimc(v0, v0);
2164 
2165     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2166     __ rev32(v1, __ T16B, v1);
2167     __ rev32(v2, __ T16B, v2);
2168 
2169     __ cmpw(keylen, 44);
2170     __ br(Assembler::EQ, L_doLast);
2171 
2172     __ aesd(v0, v1);
2173     __ aesimc(v0, v0);
2174     __ aesd(v0, v2);
2175     __ aesimc(v0, v0);
2176 
2177     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2178     __ rev32(v1, __ T16B, v1);
2179     __ rev32(v2, __ T16B, v2);
2180 
2181     __ cmpw(keylen, 52);
2182     __ br(Assembler::EQ, L_doLast);
2183 
2184     __ aesd(v0, v1);
2185     __ aesimc(v0, v0);
2186     __ aesd(v0, v2);
2187     __ aesimc(v0, v0);
2188 
2189     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2190     __ rev32(v1, __ T16B, v1);
2191     __ rev32(v2, __ T16B, v2);
2192 
2193     __ BIND(L_doLast);
2194 
2195     __ aesd(v0, v1);
2196     __ aesimc(v0, v0);
2197     __ aesd(v0, v2);
2198 
2199     __ eor(v0, __ T16B, v0, v5);
2200 
2201     __ st1(v0, __ T16B, to);
2202 
2203     __ mov(r0, 0);
2204 
2205     __ leave();
2206     __ ret(lr);
2207 
2208     return start;
2209   }
2210 
2211   // Arguments:
2212   //
2213   // Inputs:
2214   //   c_rarg0   - source byte array address
2215   //   c_rarg1   - destination byte array address
2216   //   c_rarg2   - K (key) in little endian int array
2217   //   c_rarg3   - r vector byte array address
2218   //   c_rarg4   - input length
2219   //
2220   // Output:
2221   //   r0        - input length
2222   //
2223   address generate_cipherBlockChaining_encryptAESCrypt() {
2224     assert(UseAES, "need AES instructions support");
2225     __ align(CodeEntryAlignment);
2226     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2227 
2228     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2229 
2230     const Register from        = c_rarg0;  // source array address
2231     const Register to          = c_rarg1;  // destination array address
2232     const Register key         = c_rarg2;  // key array address
2233     const Register rvec        = c_rarg3;  // r byte array, initialized from the initvector array address,
2234                                            // left holding the last ciphertext block produced
2235     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2236     const Register keylen      = rscratch1;
2237 
2238     address start = __ pc();
2239       __ enter();
2240 
2241       __ mov(rscratch2, len_reg);
2242       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2243 
2244       __ ld1(v0, __ T16B, rvec);
2245 
2246       __ cmpw(keylen, 52);
2247       __ br(Assembler::CC, L_loadkeys_44);
2248       __ br(Assembler::EQ, L_loadkeys_52);
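           // Note: no flag-setting instruction executes between the
           // cmpw(keylen, 52) above and the conditional branches at the top of
           // L_aes_loop, so the same comparison result selects the round count
           // for every block.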
2249 
2250       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2251       __ rev32(v17, __ T16B, v17);
2252       __ rev32(v18, __ T16B, v18);
2253     __ BIND(L_loadkeys_52);
2254       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2255       __ rev32(v19, __ T16B, v19);
2256       __ rev32(v20, __ T16B, v20);
2257     __ BIND(L_loadkeys_44);
2258       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2259       __ rev32(v21, __ T16B, v21);
2260       __ rev32(v22, __ T16B, v22);
2261       __ rev32(v23, __ T16B, v23);
2262       __ rev32(v24, __ T16B, v24);
2263       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2264       __ rev32(v25, __ T16B, v25);
2265       __ rev32(v26, __ T16B, v26);
2266       __ rev32(v27, __ T16B, v27);
2267       __ rev32(v28, __ T16B, v28);
2268       __ ld1(v29, v30, v31, __ T16B, key);
2269       __ rev32(v29, __ T16B, v29);
2270       __ rev32(v30, __ T16B, v30);
2271       __ rev32(v31, __ T16B, v31);
2272 
2273     __ BIND(L_aes_loop);
2274       __ ld1(v1, __ T16B, __ post(from, 16));
2275       __ eor(v0, __ T16B, v0, v1);
2276 
2277       __ br(Assembler::CC, L_rounds_44);
2278       __ br(Assembler::EQ, L_rounds_52);
2279 
2280       __ aese(v0, v17); __ aesmc(v0, v0);
2281       __ aese(v0, v18); __ aesmc(v0, v0);
2282     __ BIND(L_rounds_52);
2283       __ aese(v0, v19); __ aesmc(v0, v0);
2284       __ aese(v0, v20); __ aesmc(v0, v0);
2285     __ BIND(L_rounds_44);
2286       __ aese(v0, v21); __ aesmc(v0, v0);
2287       __ aese(v0, v22); __ aesmc(v0, v0);
2288       __ aese(v0, v23); __ aesmc(v0, v0);
2289       __ aese(v0, v24); __ aesmc(v0, v0);
2290       __ aese(v0, v25); __ aesmc(v0, v0);
2291       __ aese(v0, v26); __ aesmc(v0, v0);
2292       __ aese(v0, v27); __ aesmc(v0, v0);
2293       __ aese(v0, v28); __ aesmc(v0, v0);
2294       __ aese(v0, v29); __ aesmc(v0, v0);
2295       __ aese(v0, v30);
2296       __ eor(v0, __ T16B, v0, v31);
2297 
2298       __ st1(v0, __ T16B, __ post(to, 16));
2299       __ sub(len_reg, len_reg, 16);
2300       __ cbnz(len_reg, L_aes_loop);
2301 
2302       __ st1(v0, __ T16B, rvec);
2303 
2304       __ mov(r0, rscratch2);
2305 
2306       __ leave();
2307       __ ret(lr);
2308 
2309       return start;
2310   }
2311 
2312   // Arguments:
2313   //
2314   // Inputs:
2315   //   c_rarg0   - source byte array address
2316   //   c_rarg1   - destination byte array address
2317   //   c_rarg2   - K (key) in little endian int array
2318   //   c_rarg3   - r vector byte array address
2319   //   c_rarg4   - input length
2320   //
2321   // Output:
2322   //   r0        - input length
2323   //
2324   address generate_cipherBlockChaining_decryptAESCrypt() {
2325     assert(UseAES, "need AES instructions support");
2326     __ align(CodeEntryAlignment);
2327     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2328 
2329     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2330 
2331     const Register from        = c_rarg0;  // source array address
2332     const Register to          = c_rarg1;  // destination array address
2333     const Register key         = c_rarg2;  // key array address
2334     const Register rvec        = c_rarg3;  // r byte array, initialized from the initvector array address,
2335                                            // left holding the last ciphertext block processed
2336     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2337     const Register keylen      = rscratch1;
2338 
2339     address start = __ pc();
2340       __ enter();
2341 
2342       __ mov(rscratch2, len_reg);
2343       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2344 
2345       __ ld1(v2, __ T16B, rvec);
2346 
2347       __ ld1(v31, __ T16B, __ post(key, 16));
2348       __ rev32(v31, __ T16B, v31);
2349 
2350       __ cmpw(keylen, 52);
2351       __ br(Assembler::CC, L_loadkeys_44);
2352       __ br(Assembler::EQ, L_loadkeys_52);
2353 
2354       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2355       __ rev32(v17, __ T16B, v17);
2356       __ rev32(v18, __ T16B, v18);
2357     __ BIND(L_loadkeys_52);
2358       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2359       __ rev32(v19, __ T16B, v19);
2360       __ rev32(v20, __ T16B, v20);
2361     __ BIND(L_loadkeys_44);
2362       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2363       __ rev32(v21, __ T16B, v21);
2364       __ rev32(v22, __ T16B, v22);
2365       __ rev32(v23, __ T16B, v23);
2366       __ rev32(v24, __ T16B, v24);
2367       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2368       __ rev32(v25, __ T16B, v25);
2369       __ rev32(v26, __ T16B, v26);
2370       __ rev32(v27, __ T16B, v27);
2371       __ rev32(v28, __ T16B, v28);
2372       __ ld1(v29, v30, __ T16B, key);
2373       __ rev32(v29, __ T16B, v29);
2374       __ rev32(v30, __ T16B, v30);
2375 
2376     __ BIND(L_aes_loop);
2377       __ ld1(v0, __ T16B, __ post(from, 16));
2378       __ orr(v1, __ T16B, v0, v0);
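           // Keep an untouched copy of the ciphertext block in v1: after the
           // block is decrypted it becomes the chaining value (v2) for the
           // next iteration.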
2379 
2380       __ br(Assembler::CC, L_rounds_44);
2381       __ br(Assembler::EQ, L_rounds_52);
2382 
2383       __ aesd(v0, v17); __ aesimc(v0, v0);
2384       __ aesd(v0, v18); __ aesimc(v0, v0);
2385     __ BIND(L_rounds_52);
2386       __ aesd(v0, v19); __ aesimc(v0, v0);
2387       __ aesd(v0, v20); __ aesimc(v0, v0);
2388     __ BIND(L_rounds_44);
2389       __ aesd(v0, v21); __ aesimc(v0, v0);
2390       __ aesd(v0, v22); __ aesimc(v0, v0);
2391       __ aesd(v0, v23); __ aesimc(v0, v0);
2392       __ aesd(v0, v24); __ aesimc(v0, v0);
2393       __ aesd(v0, v25); __ aesimc(v0, v0);
2394       __ aesd(v0, v26); __ aesimc(v0, v0);
2395       __ aesd(v0, v27); __ aesimc(v0, v0);
2396       __ aesd(v0, v28); __ aesimc(v0, v0);
2397       __ aesd(v0, v29); __ aesimc(v0, v0);
2398       __ aesd(v0, v30);
2399       __ eor(v0, __ T16B, v0, v31);
2400       __ eor(v0, __ T16B, v0, v2);
2401 
2402       __ st1(v0, __ T16B, __ post(to, 16));
2403       __ orr(v2, __ T16B, v1, v1);
2404 
2405       __ sub(len_reg, len_reg, 16);
2406       __ cbnz(len_reg, L_aes_loop);
2407 
2408       __ st1(v2, __ T16B, rvec);
2409 
2410       __ mov(r0, rscratch2);
2411 
2412       __ leave();
2413       __ ret(lr);
2414 
2415     return start;
2416   }
2417 
2418   // Arguments:
2419   //
2420   // Inputs:
2421   //   c_rarg0   - byte[]  source+offset
2422   //   c_rarg1   - int[]   SHA.state
2423   //   c_rarg2   - int     offset
2424   //   c_rarg3   - int     limit
2425   //
2426   address generate_sha1_implCompress(bool multi_block, const char *name) {
2427     __ align(CodeEntryAlignment);
2428     StubCodeMark mark(this, "StubRoutines", name);
2429     address start = __ pc();
2430 
2431     Register buf   = c_rarg0;
2432     Register state = c_rarg1;
2433     Register ofs   = c_rarg2;
2434     Register limit = c_rarg3;
2435 
2436     Label keys;
2437     Label sha1_loop;
2438 
2439     // load the keys into v0..v3
2440     __ adr(rscratch1, keys);
2441     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
2442     // load 5 words state into v6, v7
2443     __ ldrq(v6, Address(state, 0));
2444     __ ldrs(v7, Address(state, 16));
2445 
2446 
2447     __ BIND(sha1_loop);
2448     // load 64 bytes of data into v16..v19
2449     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
2450     __ rev32(v16, __ T16B, v16);
2451     __ rev32(v17, __ T16B, v17);
2452     __ rev32(v18, __ T16B, v18);
2453     __ rev32(v19, __ T16B, v19);
2454 
2455     // do the sha1
2456     __ addv(v4, __ T4S, v16, v0);
2457     __ orr(v20, __ T16B, v6, v6);
2458 
2459     FloatRegister d0 = v16;
2460     FloatRegister d1 = v17;
2461     FloatRegister d2 = v18;
2462     FloatRegister d3 = v19;
2463 
2464     for (int round = 0; round < 20; round++) {
2465       FloatRegister tmp1 = (round & 1) ? v4 : v5;
2466       FloatRegister tmp2 = (round & 1) ? v21 : v22;
2467       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
2468       FloatRegister tmp4 = (round & 1) ? v5 : v4;
2469       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
2470 
2471       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
2472       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
2473       __ sha1h(tmp2, __ T4S, v20);
2474       if (round < 5)
2475         __ sha1c(v20, __ T4S, tmp3, tmp4);
2476       else if (round < 10 || round >= 15)
2477         __ sha1p(v20, __ T4S, tmp3, tmp4);
2478       else
2479         __ sha1m(v20, __ T4S, tmp3, tmp4);
2480       if (round < 16) __ sha1su1(d0, __ T4S, d3);
2481 
2482       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
2483     }
2484 
2485     __ addv(v7, __ T2S, v7, v21);
2486     __ addv(v6, __ T4S, v6, v20);
2487 
2488     if (multi_block) {
2489       __ add(ofs, ofs, 64);
2490       __ cmp(ofs, limit);
2491       __ br(Assembler::LE, sha1_loop);
2492       __ mov(c_rarg0, ofs); // return ofs
2493     }
2494 
2495     __ strq(v6, Address(state, 0));
2496     __ strs(v7, Address(state, 16));
2497 
2498     __ ret(lr);
2499 
2500     __ bind(keys);
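         // The four SHA-1 round constants, one per group of 20 rounds.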
2501     __ emit_int32(0x5a827999);
2502     __ emit_int32(0x6ed9eba1);
2503     __ emit_int32(0x8f1bbcdc);
2504     __ emit_int32(0xca62c1d6);
2505 
2506     return start;
2507   }
2508 
2509 
2510   // Arguments:
2511   //
2512   // Inputs:
2513   //   c_rarg0   - byte[]  source+offset
2514   //   c_rarg1   - int[]   SHA.state
2515   //   c_rarg2   - int     offset
2516   //   c_rarg3   - int     limit
2517   //
2518   address generate_sha256_implCompress(bool multi_block, const char *name) {
2519     static const uint32_t round_consts[64] = {
2520       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
2521       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
2522       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
2523       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
2524       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
2525       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
2526       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
2527       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
2528       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
2529       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
2530       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
2531       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
2532       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
2533       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
2534       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
2535       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
2536     };
2537     __ align(CodeEntryAlignment);
2538     StubCodeMark mark(this, "StubRoutines", name);
2539     address start = __ pc();
2540 
2541     Register buf   = c_rarg0;
2542     Register state = c_rarg1;
2543     Register ofs   = c_rarg2;
2544     Register limit = c_rarg3;
2545 
2546     Label sha256_loop;
2547 
2548     __ stpd(v8, v9, __ pre(sp, -32));
2549     __ stpd(v10, v11, Address(sp, 16));
2550 
2551     // dga == v0
2552     // dgb == v1
2553     // dg0 == v2
2554     // dg1 == v3
2555     // dg2 == v4
2556     // t0 == v6
2557     // t1 == v7
2558 
2559     // load 16 keys to v16..v31
2560     __ lea(rscratch1, ExternalAddress((address)round_consts));
2561     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
2562     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
2563     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
2564     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
2565 
2566     // load 8 words (256 bits) state
2567     __ ldpq(v0, v1, state);
2568 
2569     __ BIND(sha256_loop);
2570     // load 64 bytes of data into v8..v11
2571     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
2572     __ rev32(v8, __ T16B, v8);
2573     __ rev32(v9, __ T16B, v9);
2574     __ rev32(v10, __ T16B, v10);
2575     __ rev32(v11, __ T16B, v11);
2576 
2577     __ addv(v6, __ T4S, v8, v16);
2578     __ orr(v2, __ T16B, v0, v0);
2579     __ orr(v3, __ T16B, v1, v1);
2580 
2581     FloatRegister d0 = v8;
2582     FloatRegister d1 = v9;
2583     FloatRegister d2 = v10;
2584     FloatRegister d3 = v11;
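         // Each iteration of this generator loop emits the code for four of the
         // 64 SHA-256 rounds (16 x 4).  The 16 groups of round constants are
         // pre-loaded into v16..v31; v16 is consumed just above and
         // as_FloatRegister(round + 17) below picks up the constants for the
         // following group inside the loop (an informal reading of the schedule).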
2585 
2586 
2587     for (int round = 0; round < 16; round++) {
2588       FloatRegister tmp1 = (round & 1) ? v6 : v7;
2589       FloatRegister tmp2 = (round & 1) ? v7 : v6;
2590       FloatRegister tmp3 = (round & 1) ? v2 : v4;
2591       FloatRegister tmp4 = (round & 1) ? v4 : v2;
2592 
2593       if (round < 12) __ sha256su0(d0, __ T4S, d1);
2594        __ orr(v4, __ T16B, v2, v2);
2595       if (round < 15)
2596         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
2597       __ sha256h(v2, __ T4S, v3, tmp2);
2598       __ sha256h2(v3, __ T4S, v4, tmp2);
2599       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
2600 
2601       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
2602     }
2603 
2604     __ addv(v0, __ T4S, v0, v2);
2605     __ addv(v1, __ T4S, v1, v3);
2606 
2607     if (multi_block) {
2608       __ add(ofs, ofs, 64);
2609       __ cmp(ofs, limit);
2610       __ br(Assembler::LE, sha256_loop);
2611       __ mov(c_rarg0, ofs); // return ofs
2612     }
2613 
2614     __ ldpd(v10, v11, Address(sp, 16));
2615     __ ldpd(v8, v9, __ post(sp, 32));
2616 
2617     __ stpq(v0, v1, state);
2618 
2619     __ ret(lr);
2620 
2621     return start;
2622   }
2623 
2624 #ifndef BUILTIN_SIM
2625   // Safefetch stubs.
2626   void generate_safefetch(const char* name, int size, address* entry,
2627                           address* fault_pc, address* continuation_pc) {
2628     // safefetch signatures:
2629     //   int      SafeFetch32(int*      adr, int      errValue);
2630     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
2631     //
2632     // arguments:
2633     //   c_rarg0 = adr
2634     //   c_rarg1 = errValue
2635     //
2636     // result:
2637     //   r0       = *adr or errValue
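         //
         // If the load below faults, the signal handler recognises the faulting
         // pc (== *fault_pc) and resumes execution at *continuation_pc, so the
         // stub returns errValue instead of crashing (informal description of
         // the SafeFetch mechanism).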
2638 
2639     StubCodeMark mark(this, "StubRoutines", name);
2640 
2641     // Entry point, pc or function descriptor.
2642     *entry = __ pc();
2643 
2644     // Load *adr into c_rarg1, may fault.
2645     *fault_pc = __ pc();
2646     switch (size) {
2647       case 4:
2648         // int32_t
2649         __ ldrw(c_rarg1, Address(c_rarg0, 0));
2650         break;
2651       case 8:
2652         // int64_t
2653         __ ldr(c_rarg1, Address(c_rarg0, 0));
2654         break;
2655       default:
2656         ShouldNotReachHere();
2657     }
2658 
2659     // return errValue or *adr
2660     *continuation_pc = __ pc();
2661     __ mov(r0, c_rarg1);
2662     __ ret(lr);
2663   }
2664 #endif
2665 
2666   /**
2667    *  Arguments:
2668    *
2669    * Inputs:
2670    *   c_rarg0   - int crc
2671    *   c_rarg1   - byte* buf
2672    *   c_rarg2   - int length
2673    *
2674    * Output:
2675    *       r0   - int crc result
2676    */
2677   address generate_updateBytesCRC32() {
2678     assert(UseCRC32Intrinsics, "what are we doing here?");
2679 
2680     __ align(CodeEntryAlignment);
2681     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
2682 
2683     address start = __ pc();
2684 
2685     const Register crc   = c_rarg0;  // crc
2686     const Register buf   = c_rarg1;  // source java byte array address
2687     const Register len   = c_rarg2;  // length
2688     const Register table0 = c_rarg3; // crc_table address
2689     const Register table1 = c_rarg4;
2690     const Register table2 = c_rarg5;
2691     const Register table3 = c_rarg6;
2692     const Register tmp3 = c_rarg7;
2693 
2694     BLOCK_COMMENT("Entry:");
2695     __ enter(); // required for proper stackwalking of RuntimeStub frame
2696 
2697     __ kernel_crc32(crc, buf, len,
2698               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
2699 
2700     __ leave(); // required for proper stackwalking of RuntimeStub frame
2701     __ ret(lr);
2702 
2703     return start;
2704   }
2705 
2706   /**
2707    *  Arguments:
2708    *
2709    * Inputs:
2710    *   c_rarg0   - int crc
2711    *   c_rarg1   - byte* buf
2712    *   c_rarg2   - int length
2713    *   c_rarg3   - int* table
2714    *
2715    * Output:
2716    *       r0   - int crc result
2717    */
2718   address generate_updateBytesCRC32C() {
2719     assert(UseCRC32CIntrinsics, "what are we doing here?");
2720 
2721     __ align(CodeEntryAlignment);
2722     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
2723 
2724     address start = __ pc();
2725 
2726     const Register crc   = c_rarg0;  // crc
2727     const Register buf   = c_rarg1;  // source java byte array address
2728     const Register len   = c_rarg2;  // length
2729     const Register table0 = c_rarg3; // crc_table address
2730     const Register table1 = c_rarg4;
2731     const Register table2 = c_rarg5;
2732     const Register table3 = c_rarg6;
2733     const Register tmp3 = c_rarg7;
2734 
2735     BLOCK_COMMENT("Entry:");
2736     __ enter(); // required for proper stackwalking of RuntimeStub frame
2737 
2738     __ kernel_crc32c(crc, buf, len,
2739               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
2740 
2741     __ leave(); // required for proper stackwalking of RuntimeStub frame
2742     __ ret(lr);
2743 
2744     return start;
2745   }
2746 
2747   /**
2748    *  Arguments:
2749    *
2750    *  Inputs:
2751    *   c_rarg0   - int   adler
2752    *   c_rarg1   - byte* buff
2753    *   c_rarg2   - int   len
2754    *
2755    * Output:
2756    *   c_rarg0   - int adler result
2757    */
2758   address generate_updateBytesAdler32() {
2759     __ align(CodeEntryAlignment);
2760     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
2761     address start = __ pc();
2762 
2763     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
2764 
2765     // Aliases
2766     Register adler  = c_rarg0;
2767     Register s1     = c_rarg0;
2768     Register s2     = c_rarg3;
2769     Register buff   = c_rarg1;
2770     Register len    = c_rarg2;
2771     Register nmax   = r4;
2772     Register base   = r5;
2773     Register count  = r6;
2774     Register temp0  = rscratch1;
2775     Register temp1  = rscratch2;
2776     Register temp2  = r7;
2777 
2778     // Max number of bytes we can process before having to take the mod
2779     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
2780     unsigned long BASE = 0xfff1;
2781     unsigned long NMAX = 0x15B0;
2782 
2783     __ mov(base, BASE);
2784     __ mov(nmax, NMAX);
2785 
2786     // s1 is initialized to the lower 16 bits of adler
2787     // s2 is initialized to the upper 16 bits of adler
2788     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
2789     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
2790 
2791     // The pipelined loop needs at least 16 elements for one iteration.
2792     // The loop checks this itself, but it is cheaper to branch straight to the cleanup loop for short inputs.
2793     __ cmp(len, 16);
2794     __ br(Assembler::HS, L_nmax);
2795     __ cbz(len, L_combine);
2796 
2797     __ bind(L_simple_by1_loop);
2798     __ ldrb(temp0, Address(__ post(buff, 1)));
2799     __ add(s1, s1, temp0);
2800     __ add(s2, s2, s1);
2801     __ subs(len, len, 1);
2802     __ br(Assembler::HI, L_simple_by1_loop);
2803 
2804     // s1 = s1 % BASE
2805     __ subs(temp0, s1, base);
2806     __ csel(s1, temp0, s1, Assembler::HS);
2807 
2808     // s2 = s2 % BASE
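         // (Since 2^16 mod 65521 == 15, x mod BASE can be partially reduced as
         // (x >> 16) * 15 + (x & 0xffff); the subs/csel pair below removes the
         // final multiple of BASE if one remains.)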
2809     __ lsr(temp0, s2, 16);
2810     __ lsl(temp1, temp0, 4);
2811     __ sub(temp1, temp1, temp0);
2812     __ add(s2, temp1, s2, ext::uxth);
2813 
2814     __ subs(temp0, s2, base);
2815     __ csel(s2, temp0, s2, Assembler::HS);
2816 
2817     __ b(L_combine);
2818 
2819     __ bind(L_nmax);
2820     __ subs(len, len, nmax);
2821     __ sub(count, nmax, 16);
2822     __ br(Assembler::LO, L_by16);
2823 
2824     __ bind(L_nmax_loop);
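         // Process 16 bytes per iteration: ldp fetches two 64-bit words and the
         // ubfx/add sequences accumulate each byte into s1 while s2 gathers the
         // running sum of s1 after every byte (the standard Adler-32 recurrence).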
2825 
2826     __ ldp(temp0, temp1, Address(__ post(buff, 16)));
2827 
2828     __ add(s1, s1, temp0, ext::uxtb);
2829     __ ubfx(temp2, temp0, 8, 8);
2830     __ add(s2, s2, s1);
2831     __ add(s1, s1, temp2);
2832     __ ubfx(temp2, temp0, 16, 8);
2833     __ add(s2, s2, s1);
2834     __ add(s1, s1, temp2);
2835     __ ubfx(temp2, temp0, 24, 8);
2836     __ add(s2, s2, s1);
2837     __ add(s1, s1, temp2);
2838     __ ubfx(temp2, temp0, 32, 8);
2839     __ add(s2, s2, s1);
2840     __ add(s1, s1, temp2);
2841     __ ubfx(temp2, temp0, 40, 8);
2842     __ add(s2, s2, s1);
2843     __ add(s1, s1, temp2);
2844     __ ubfx(temp2, temp0, 48, 8);
2845     __ add(s2, s2, s1);
2846     __ add(s1, s1, temp2);
2847     __ add(s2, s2, s1);
2848     __ add(s1, s1, temp0, Assembler::LSR, 56);
2849     __ add(s2, s2, s1);
2850 
2851     __ add(s1, s1, temp1, ext::uxtb);
2852     __ ubfx(temp2, temp1, 8, 8);
2853     __ add(s2, s2, s1);
2854     __ add(s1, s1, temp2);
2855     __ ubfx(temp2, temp1, 16, 8);
2856     __ add(s2, s2, s1);
2857     __ add(s1, s1, temp2);
2858     __ ubfx(temp2, temp1, 24, 8);
2859     __ add(s2, s2, s1);
2860     __ add(s1, s1, temp2);
2861     __ ubfx(temp2, temp1, 32, 8);
2862     __ add(s2, s2, s1);
2863     __ add(s1, s1, temp2);
2864     __ ubfx(temp2, temp1, 40, 8);
2865     __ add(s2, s2, s1);
2866     __ add(s1, s1, temp2);
2867     __ ubfx(temp2, temp1, 48, 8);
2868     __ add(s2, s2, s1);
2869     __ add(s1, s1, temp2);
2870     __ add(s2, s2, s1);
2871     __ add(s1, s1, temp1, Assembler::LSR, 56);
2872     __ add(s2, s2, s1);
2873 
2874     __ subs(count, count, 16);
2875     __ br(Assembler::HS, L_nmax_loop);
2876 
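    // After as many as NMAX bytes, the running sums (s2 in particular) can
    // occupy the full 32 bits.  Two rounds of the folding
    // x === (x & 0xffff) + 15 * (x >> 16)  (mod BASE = 2^16 - 15)
    // bring any 32-bit value below 2*BASE; the conditional subtract then
    // completes the reduction.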
2877     // s1 = s1 % BASE
2878     __ lsr(temp0, s1, 16);
2879     __ lsl(temp1, temp0, 4);
2880     __ sub(temp1, temp1, temp0);
2881     __ add(temp1, temp1, s1, ext::uxth);
2882 
2883     __ lsr(temp0, temp1, 16);
2884     __ lsl(s1, temp0, 4);
2885     __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext::uxth);
2887 
2888     __ subs(temp0, s1, base);
2889     __ csel(s1, temp0, s1, Assembler::HS);
2890 
2891     // s2 = s2 % BASE
2892     __ lsr(temp0, s2, 16);
2893     __ lsl(temp1, temp0, 4);
2894     __ sub(temp1, temp1, temp0);
2895     __ add(temp1, temp1, s2, ext::uxth);
2896 
2897     __ lsr(temp0, temp1, 16);
2898     __ lsl(s2, temp0, 4);
2899     __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext::uxth);
2901 
2902     __ subs(temp0, s2, base);
2903     __ csel(s2, temp0, s2, Assembler::HS);
2904 
2905     __ subs(len, len, nmax);
2906     __ sub(count, nmax, 16);
2907     __ br(Assembler::HS, L_nmax_loop);
2908 
2909     __ bind(L_by16);
2910     __ adds(len, len, count);
2911     __ br(Assembler::LO, L_by1);
2912 
2913     __ bind(L_by16_loop);
2914 
2915     __ ldp(temp0, temp1, Address(__ post(buff, 16)));
2916 
2917     __ add(s1, s1, temp0, ext::uxtb);
2918     __ ubfx(temp2, temp0, 8, 8);
2919     __ add(s2, s2, s1);
2920     __ add(s1, s1, temp2);
2921     __ ubfx(temp2, temp0, 16, 8);
2922     __ add(s2, s2, s1);
2923     __ add(s1, s1, temp2);
2924     __ ubfx(temp2, temp0, 24, 8);
2925     __ add(s2, s2, s1);
2926     __ add(s1, s1, temp2);
2927     __ ubfx(temp2, temp0, 32, 8);
2928     __ add(s2, s2, s1);
2929     __ add(s1, s1, temp2);
2930     __ ubfx(temp2, temp0, 40, 8);
2931     __ add(s2, s2, s1);
2932     __ add(s1, s1, temp2);
2933     __ ubfx(temp2, temp0, 48, 8);
2934     __ add(s2, s2, s1);
2935     __ add(s1, s1, temp2);
2936     __ add(s2, s2, s1);
2937     __ add(s1, s1, temp0, Assembler::LSR, 56);
2938     __ add(s2, s2, s1);
2939 
2940     __ add(s1, s1, temp1, ext::uxtb);
2941     __ ubfx(temp2, temp1, 8, 8);
2942     __ add(s2, s2, s1);
2943     __ add(s1, s1, temp2);
2944     __ ubfx(temp2, temp1, 16, 8);
2945     __ add(s2, s2, s1);
2946     __ add(s1, s1, temp2);
2947     __ ubfx(temp2, temp1, 24, 8);
2948     __ add(s2, s2, s1);
2949     __ add(s1, s1, temp2);
2950     __ ubfx(temp2, temp1, 32, 8);
2951     __ add(s2, s2, s1);
2952     __ add(s1, s1, temp2);
2953     __ ubfx(temp2, temp1, 40, 8);
2954     __ add(s2, s2, s1);
2955     __ add(s1, s1, temp2);
2956     __ ubfx(temp2, temp1, 48, 8);
2957     __ add(s2, s2, s1);
2958     __ add(s1, s1, temp2);
2959     __ add(s2, s2, s1);
2960     __ add(s1, s1, temp1, Assembler::LSR, 56);
2961     __ add(s2, s2, s1);
2962 
2963     __ subs(len, len, 16);
2964     __ br(Assembler::HS, L_by16_loop);
2965 
2966     __ bind(L_by1);
2967     __ adds(len, len, 15);
2968     __ br(Assembler::LO, L_do_mod);
2969 
2970     __ bind(L_by1_loop);
2971     __ ldrb(temp0, Address(__ post(buff, 1)));
2972     __ add(s1, temp0, s1);
2973     __ add(s2, s2, s1);
2974     __ subs(len, len, 1);
2975     __ br(Assembler::HS, L_by1_loop);
2976 
2977     __ bind(L_do_mod);
2978     // s1 = s1 % BASE
2979     __ lsr(temp0, s1, 16);
2980     __ lsl(temp1, temp0, 4);
2981     __ sub(temp1, temp1, temp0);
2982     __ add(temp1, temp1, s1, ext::uxth);
2983 
2984     __ lsr(temp0, temp1, 16);
2985     __ lsl(s1, temp0, 4);
2986     __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext::uxth);
2988 
2989     __ subs(temp0, s1, base);
2990     __ csel(s1, temp0, s1, Assembler::HS);
2991 
2992     // s2 = s2 % BASE
2993     __ lsr(temp0, s2, 16);
2994     __ lsl(temp1, temp0, 4);
2995     __ sub(temp1, temp1, temp0);
2996     __ add(temp1, temp1, s2, ext::uxth);
2997 
2998     __ lsr(temp0, temp1, 16);
2999     __ lsl(s2, temp0, 4);
3000     __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext::uxth);
3002 
3003     __ subs(temp0, s2, base);
3004     __ csel(s2, temp0, s2, Assembler::HS);
3005 
3006     // Combine lower bits and higher bits
3007     __ bind(L_combine);
3008     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3009 
3010     __ ret(lr);
3011 
3012     return start;
3013   }
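
  // In C, approximately (a sketch of the Adler-32 update this stub
  // performs; the NMAX blocking and 16-byte unrolling above are omitted,
  // so this version would overflow for very long inputs):
  //
  //   jint updateBytesAdler32(jint adler, unsigned char *buff, int len) {
  //     unsigned long s1 = adler & 0xffff;
  //     unsigned long s2 = (adler >> 16) & 0xffff;
  //     while (len-- > 0) {
  //       s1 += *buff++;
  //       s2 += s1;
  //     }
  //     s1 %= 0xfff1;
  //     s2 %= 0xfff1;
  //     return (jint)(s2 << 16 | s1);
  //   }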
3014 
3015   /**
3016    *  Arguments:
3017    *
3018    *  Input:
3019    *    c_rarg0   - x address
3020    *    c_rarg1   - x length
3021    *    c_rarg2   - y address
   *    c_rarg3   - y length
3023    *    c_rarg4   - z address
3024    *    c_rarg5   - z length
3025    */
3026   address generate_multiplyToLen() {
3027     __ align(CodeEntryAlignment);
3028     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3029 
3030     address start = __ pc();
3031     const Register x     = r0;
3032     const Register xlen  = r1;
3033     const Register y     = r2;
3034     const Register ylen  = r3;
3035     const Register z     = r4;
3036     const Register zlen  = r5;
3037 
3038     const Register tmp1  = r10;
3039     const Register tmp2  = r11;
3040     const Register tmp3  = r12;
3041     const Register tmp4  = r13;
3042     const Register tmp5  = r14;
3043     const Register tmp6  = r15;
3044     const Register tmp7  = r16;
3045 
3046     BLOCK_COMMENT("Entry:");
3047     __ enter(); // required for proper stackwalking of RuntimeStub frame
3048     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3049     __ leave(); // required for proper stackwalking of RuntimeStub frame
3050     __ ret(lr);
3051 
3052     return start;
3053   }
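
  // In C, approximately (a sketch of the semantics only; the real work is
  // done by MacroAssembler::multiply_to_len.  z receives the (xlen+ylen)-word
  // product of x and y, with all arrays stored most-significant jint first,
  // as in java.math.BigInteger):
  //
  //   for (int k = 0; k < xlen + ylen; k++)
  //     z[k] = 0;
  //   for (int i = xlen - 1; i >= 0; i--) {
  //     unsigned long carry = 0;
  //     for (int j = ylen - 1, k = i + j + 1; j >= 0; j--, k--) {
  //       unsigned long p = (unsigned long)(unsigned int)x[i] * (unsigned int)y[j]
  //                       + (unsigned int)z[k] + carry;
  //       z[k] = (jint)p;
  //       carry = p >> 32;
  //     }
  //     z[i] = (jint)carry;
  //   }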
3054 
3055   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3056                       FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3057                       FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3058     // Karatsuba multiplication performs a 128*128 -> 256-bit
3059     // multiplication in three 128-bit multiplications and a few
3060     // additions.
3061     //
3062     // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3063     // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3064     //
3065     // Inputs:
3066     //
3067     // A0 in a.d[0]     (subkey)
3068     // A1 in a.d[1]
3069     // (A1+A0) in a1_xor_a0.d[0]
3070     //
3071     // B0 in b.d[0]     (state)
3072     // B1 in b.d[1]
3073 
3074     __ ext(tmp1, __ T16B, b, b, 0x08);
3075     __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
3076     __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
3077     __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
3078     __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3079 
3080     __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3081     __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3082     __ eor(tmp2, __ T16B, tmp2, tmp4);
3083     __ eor(tmp2, __ T16B, tmp2, tmp3);
3084 
3085     // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3086     __ ins(result_hi, __ D, tmp2, 0, 1);
3087     __ ins(result_lo, __ D, tmp2, 1, 0);
3088   }
3089 
3090   void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3091                     FloatRegister p, FloatRegister z, FloatRegister t1) {
3092     const FloatRegister t0 = result;
3093 
3094     // The GCM field polynomial f is z^128 + p(z), where p =
3095     // z^7+z^2+z+1.
3096     //
3097     //    z^128 === -p(z)  (mod (z^128 + p(z)))
3098     //
3099     // so, given that the product we're reducing is
3100     //    a == lo + hi * z^128
3101     // substituting,
3102     //      === lo - hi * p(z)  (mod (z^128 + p(z)))
3103     //
3104     // we reduce by multiplying hi by p(z) and subtracting the result
3105     // from (i.e. XORing it with) lo.  Because p has no nonzero high
3106     // bits we can do this with two 64-bit multiplications, lo*p and
3107     // hi*p.
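    //
    // In pseudocode, with both 64-bit halves of the p register holding
    // the constant 0x87 (the low-order bits of p(z)) and z holding zero:
    //
    //   t      = clmul(hi.d[1], 0x87);    // fold the top 64 bits first
    //   hi    ^= t >> 64;
    //   lo    ^= t << 64;
    //   result = lo ^ clmul(hi.d[0], 0x87);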
3108 
3109     __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3110     __ ext(t1, __ T16B, t0, z, 8);
3111     __ eor(hi, __ T16B, hi, t1);
3112     __ ext(t1, __ T16B, z, t0, 8);
3113     __ eor(lo, __ T16B, lo, t1);
3114     __ pmull(t0, __ T1Q, hi, p, __ T1D);
3115     __ eor(result, __ T16B, lo, t0);
3116   }
3117 
3118   /**
3119    *  Arguments:
3120    *
3121    *  Input:
3122    *  c_rarg0   - current state address
3123    *  c_rarg1   - H key address
3124    *  c_rarg2   - data address
3125    *  c_rarg3   - number of blocks
3126    *
3127    *  Output:
3128    *  Updated state at c_rarg0
3129    */
3130   address generate_ghash_processBlocks() {
3131     // Bafflingly, GCM uses little-endian for the byte order, but
3132     // big-endian for the bit order.  For example, the polynomial 1 is
3133     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
3134     //
3135     // So, we must either reverse the bytes in each word and do
3136     // everything big-endian or reverse the bits in each byte and do
3137     // it little-endian.  On AArch64 it's more idiomatic to reverse
3138     // the bits in each byte (we have an instruction, RBIT, to do
    // that) and keep the data in little-endian bit order throughout the
3140     // calculation, bit-reversing the inputs and outputs.
3141 
3142     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
3143     __ align(wordSize * 2);
3144     address p = __ pc();
3145     __ emit_int64(0x87);  // The low-order bits of the field
3146                           // polynomial (i.e. p = z^7+z^2+z+1)
3147                           // repeated in the low and high parts of a
3148                           // 128-bit vector
3149     __ emit_int64(0x87);
3150 
3151     __ align(CodeEntryAlignment);
3152     address start = __ pc();
3153 
3154     Register state   = c_rarg0;
3155     Register subkeyH = c_rarg1;
3156     Register data    = c_rarg2;
3157     Register blocks  = c_rarg3;
3158 
3159     FloatRegister vzr = v30;
3160     __ eor(vzr, __ T16B, vzr, vzr); // zero register
3161 
3162     __ ldrq(v0, Address(state));
3163     __ ldrq(v1, Address(subkeyH));
3164 
3165     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
3166     __ rbit(v0, __ T16B, v0);
3167     __ rev64(v1, __ T16B, v1);
3168     __ rbit(v1, __ T16B, v1);
3169 
3170     __ ldrq(v26, p);
3171 
    __ ext(v16, __ T16B, v1, v1, 0x08); // v16 = long-swapped subkeyH
    __ eor(v16, __ T16B, v16, v1);      // xor the two halves of subkeyH (Karatsuba: (A1+A0))
3174 
3175     {
3176       Label L_ghash_loop;
3177       __ bind(L_ghash_loop);
3178 
3179       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
3180                                                  // reversing each byte
3181       __ rbit(v2, __ T16B, v2);
3182       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
3183 
3184       // Multiply state in v2 by subkey in v1
3185       ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
3186                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
3187                      /*temps*/v6, v20, v18, v21);
3188       // Reduce v7:v5 by the field polynomial
3189       ghash_reduce(v0, v5, v7, v26, vzr, v20);
3190 
3191       __ sub(blocks, blocks, 1);
3192       __ cbnz(blocks, L_ghash_loop);
3193     }
3194 
3195     // The bit-reversed result is at this point in v0
3196     __ rev64(v1, __ T16B, v0);
3197     __ rbit(v1, __ T16B, v1);
3198 
3199     __ st1(v1, __ T16B, state);
3200     __ ret(lr);
3201 
3202     return start;
3203   }
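
  // In C, approximately (a sketch of what the loop above computes;
  // uint128 is a notional 128-bit type and gf128_mul stands for
  // carry-less multiplication reduced mod z^128 + z^7 + z^2 + z + 1):
  //
  //   void ghash_processBlocks(uint128 *state, uint128 *subkeyH,
  //                            uint128 *data, int blocks) {
  //     while (blocks-- > 0)
  //       *state = gf128_mul(*state ^ *data++, *subkeyH);
  //   }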
3204 
3205   // Continuation point for throwing of implicit exceptions that are
3206   // not handled in the current activation. Fabricates an exception
3207   // oop and initiates normal exception dispatching in this
3208   // frame. Since we need to preserve callee-saved values (currently
3209   // only for C2, but done for C1 as well) we need a callee-saved oop
3210   // map and therefore have to make these stubs into RuntimeStubs
3211   // rather than BufferBlobs.  If the compiler needs all registers to
3212   // be preserved between the fault point and the exception handler
3213   // then it must assume responsibility for that in
3214   // AbstractCompiler::continuation_for_implicit_null_exception or
3215   // continuation_for_implicit_division_by_zero_exception. All other
3216   // implicit exceptions (e.g., NullPointerException or
3217   // AbstractMethodError on entry) are either at call sites or
3218   // otherwise assume that stack unwinding will be initiated, so
3219   // caller saved registers were assumed volatile in the compiler.
3220 
3221 #undef __
3222 #define __ masm->
3223 
3224   address generate_throw_exception(const char* name,
3225                                    address runtime_entry,
3226                                    Register arg1 = noreg,
3227                                    Register arg2 = noreg) {
3228     // Information about frame layout at time of blocking runtime call.
3229     // Note that we only have to preserve callee-saved registers since
3230     // the compilers are responsible for supplying a continuation point
3231     // if they expect all registers to be preserved.
3232     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
3233     enum layout {
3234       rfp_off = 0,
3235       rfp_off2,
3236       return_off,
3237       return_off2,
3238       framesize // inclusive of return address
3239     };
3240 
3241     int insts_size = 512;
3242     int locs_size  = 64;
3243 
3244     CodeBuffer code(name, insts_size, locs_size);
3245     OopMapSet* oop_maps  = new OopMapSet();
3246     MacroAssembler* masm = new MacroAssembler(&code);
3247 
3248     address start = __ pc();
3249 
3250     // This is an inlined and slightly modified version of call_VM
3251     // which has the ability to fetch the return PC out of
3252     // thread-local storage and also sets up last_Java_sp slightly
3253     // differently than the real call_VM
3254 
3255     __ enter(); // Save FP and LR before call
3256 
3257     assert(is_even(framesize/2), "sp not 16-byte aligned");
3258 
3259     // lr and fp are already in place
3260     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
3261 
3262     int frame_complete = __ pc() - start;
3263 
3264     // Set up last_Java_sp and last_Java_fp
3265     address the_pc = __ pc();
3266     __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1);
3267 
3268     // Call runtime
3269     if (arg1 != noreg) {
3270       assert(arg2 != c_rarg1, "clobbered");
3271       __ mov(c_rarg1, arg1);
3272     }
3273     if (arg2 != noreg) {
3274       __ mov(c_rarg2, arg2);
3275     }
3276     __ mov(c_rarg0, rthread);
3277     BLOCK_COMMENT("call runtime_entry");
3278     __ mov(rscratch1, runtime_entry);
3279     __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);
3280 
3281     // Generate oop map
3282     OopMap* map = new OopMap(framesize, 0);
3283 
3284     oop_maps->add_gc_map(the_pc - start, map);
3285 
3286     __ reset_last_Java_frame(true, true);
3287     __ maybe_isb();
3288 
3289     __ leave();
3290 
3291     // check for pending exceptions
3292 #ifdef ASSERT
3293     Label L;
3294     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
3295     __ cbnz(rscratch1, L);
3296     __ should_not_reach_here();
3297     __ bind(L);
3298 #endif // ASSERT
3299     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3300 
3301 
3302     // codeBlob framesize is in words (not VMRegImpl::slot_size)
3303     RuntimeStub* stub =
3304       RuntimeStub::new_runtime_stub(name,
3305                                     &code,
3306                                     frame_complete,
3307                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3308                                     oop_maps, false);
3309     return stub->entry_point();
3310   }
3311 
3312   class MontgomeryMultiplyGenerator : public MacroAssembler {
3313 
3314     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
3315       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
3316 
3317     RegSet _toSave;
3318     bool _squaring;
3319 
3320   public:
3321     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
3322       : MacroAssembler(as->code()), _squaring(squaring) {
3323 
3324       // Register allocation
3325 
3326       Register reg = c_rarg0;
3327       Pa_base = reg;       // Argument registers
3328       if (squaring)
3329         Pb_base = Pa_base;
3330       else
3331         Pb_base = ++reg;
3332       Pn_base = ++reg;
      Rlen = ++reg;
3334       inv = ++reg;
3335       Pm_base = ++reg;
3336 
3337                           // Working registers:
3338       Ra =  ++reg;        // The current digit of a, b, n, and m.
3339       Rb =  ++reg;
3340       Rm =  ++reg;
3341       Rn =  ++reg;
3342 
3343       Pa =  ++reg;        // Pointers to the current/next digit of a, b, n, and m.
3344       Pb =  ++reg;
3345       Pm =  ++reg;
3346       Pn =  ++reg;
3347 
3348       t0 =  ++reg;        // Three registers which form a
      t1 =  ++reg;        // triple-precision accumulator.
3350       t2 =  ++reg;
3351 
3352       Ri =  ++reg;        // Inner and outer loop indexes.
3353       Rj =  ++reg;
3354 
3355       Rhi_ab = ++reg;     // Product registers: low and high parts
3356       Rlo_ab = ++reg;     // of a*b and m*n.
3357       Rhi_mn = ++reg;
3358       Rlo_mn = ++reg;
3359 
3360       // r19 and up are callee-saved.
3361       _toSave = RegSet::range(r19, reg) + Pm_base;
3362     }
3363 
3364   private:
3365     void save_regs() {
3366       push(_toSave, sp);
3367     }
3368 
3369     void restore_regs() {
3370       pop(_toSave, sp);
3371     }
3372 
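    // Emit "block" exactly count times, two copies per loop iteration.
    // An odd count is handled by branching into the middle of the
    // unrolled pair; a zero count skips the loop entirely.
    // In C, approximately:
    //
    //   if (count & 1) goto odd;
    //   if (count == 0) goto end;
    //   loop: block();
    //   odd:  block();
    //         if ((count -= 2) > 0) goto loop;
    //   end:  ;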
3373     template <typename T>
3374     void unroll_2(Register count, T block) {
3375       Label loop, end, odd;
3376       tbnz(count, 0, odd);
3377       cbz(count, end);
3378       align(16);
3379       bind(loop);
3380       (this->*block)();
3381       bind(odd);
3382       (this->*block)();
3383       subs(count, count, 2);
3384       br(Assembler::GT, loop);
3385       bind(end);
3386     }
3387 
3388     template <typename T>
3389     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
3390       Label loop, end, odd;
3391       tbnz(count, 0, odd);
3392       cbz(count, end);
3393       align(16);
3394       bind(loop);
3395       (this->*block)(d, s, tmp);
3396       bind(odd);
3397       (this->*block)(d, s, tmp);
3398       subs(count, count, 2);
3399       br(Assembler::GT, loop);
3400       bind(end);
3401     }
3402 
3403     void pre1(RegisterOrConstant i) {
3404       block_comment("pre1");
3405       // Pa = Pa_base;
3406       // Pb = Pb_base + i;
3407       // Pm = Pm_base;
3408       // Pn = Pn_base + i;
3409       // Ra = *Pa;
3410       // Rb = *Pb;
3411       // Rm = *Pm;
3412       // Rn = *Pn;
3413       ldr(Ra, Address(Pa_base));
3414       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
3415       ldr(Rm, Address(Pm_base));
3416       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3417       lea(Pa, Address(Pa_base));
3418       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
3419       lea(Pm, Address(Pm_base));
3420       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3421 
3422       // Zero the m*n result.
3423       mov(Rhi_mn, zr);
3424       mov(Rlo_mn, zr);
3425     }
3426 
3427     // The core multiply-accumulate step of a Montgomery
3428     // multiplication.  The idea is to schedule operations as a
3429     // pipeline so that instructions with long latencies (loads and
3430     // multiplies) have time to complete before their results are
3431     // used.  This most benefits in-order implementations of the
3432     // architecture but out-of-order ones also benefit.
3433     void step() {
3434       block_comment("step");
3435       // MACC(Ra, Rb, t0, t1, t2);
3436       // Ra = *++Pa;
3437       // Rb = *--Pb;
3438       umulh(Rhi_ab, Ra, Rb);
3439       mul(Rlo_ab, Ra, Rb);
3440       ldr(Ra, pre(Pa, wordSize));
3441       ldr(Rb, pre(Pb, -wordSize));
3442       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
3443                                        // previous iteration.
3444       // MACC(Rm, Rn, t0, t1, t2);
3445       // Rm = *++Pm;
3446       // Rn = *--Pn;
3447       umulh(Rhi_mn, Rm, Rn);
3448       mul(Rlo_mn, Rm, Rn);
3449       ldr(Rm, pre(Pm, wordSize));
3450       ldr(Rn, pre(Pn, -wordSize));
3451       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
3452     }
3453 
3454     void post1() {
3455       block_comment("post1");
3456 
3457       // MACC(Ra, Rb, t0, t1, t2);
3458       // Ra = *++Pa;
3459       // Rb = *--Pb;
3460       umulh(Rhi_ab, Ra, Rb);
3461       mul(Rlo_ab, Ra, Rb);
3462       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
3463       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
3464 
3465       // *Pm = Rm = t0 * inv;
3466       mul(Rm, t0, inv);
3467       str(Rm, Address(Pm));
3468 
3469       // MACC(Rm, Rn, t0, t1, t2);
3470       // t0 = t1; t1 = t2; t2 = 0;
3471       umulh(Rhi_mn, Rm, Rn);
3472 
3473 #ifndef PRODUCT
3474       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
3475       {
3476         mul(Rlo_mn, Rm, Rn);
3477         add(Rlo_mn, t0, Rlo_mn);
3478         Label ok;
3479         cbz(Rlo_mn, ok); {
3480           stop("broken Montgomery multiply");
3481         } bind(ok);
3482       }
3483 #endif
3484       // We have very carefully set things up so that
3485       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
3486       // the lower half of Rm * Rn because we know the result already:
3487       // it must be -t0.  t0 + (-t0) must generate a carry iff
3488       // t0 != 0.  So, rather than do a mul and an adds we just set
3489       // the carry flag iff t0 is nonzero.
3490       //
3491       // mul(Rlo_mn, Rm, Rn);
3492       // adds(zr, t0, Rlo_mn);
3493       subs(zr, t0, 1); // Set carry iff t0 is nonzero
3494       adcs(t0, t1, Rhi_mn);
3495       adc(t1, t2, zr);
3496       mov(t2, zr);
3497     }
3498 
3499     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
3500       block_comment("pre2");
3501       // Pa = Pa_base + i-len;
3502       // Pb = Pb_base + len;
3503       // Pm = Pm_base + i-len;
3504       // Pn = Pn_base + len;
3505 
3506       if (i.is_register()) {
3507         sub(Rj, i.as_register(), len);
3508       } else {
3509         mov(Rj, i.as_constant());
3510         sub(Rj, Rj, len);
3511       }
3512       // Rj == i-len
3513 
3514       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
3515       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
3516       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
3517       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
3518 
3519       // Ra = *++Pa;
3520       // Rb = *--Pb;
3521       // Rm = *++Pm;
3522       // Rn = *--Pn;
3523       ldr(Ra, pre(Pa, wordSize));
3524       ldr(Rb, pre(Pb, -wordSize));
3525       ldr(Rm, pre(Pm, wordSize));
3526       ldr(Rn, pre(Pn, -wordSize));
3527 
3528       mov(Rhi_mn, zr);
3529       mov(Rlo_mn, zr);
3530     }
3531 
3532     void post2(RegisterOrConstant i, RegisterOrConstant len) {
3533       block_comment("post2");
3534       if (i.is_constant()) {
3535         mov(Rj, i.as_constant()-len.as_constant());
3536       } else {
3537         sub(Rj, i.as_register(), len);
3538       }
3539 
3540       adds(t0, t0, Rlo_mn); // The pending m*n, low part
3541 
3542       // As soon as we know the least significant digit of our result,
3543       // store it.
3544       // Pm_base[i-len] = t0;
3545       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
3546 
3547       // t0 = t1; t1 = t2; t2 = 0;
3548       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
3549       adc(t1, t2, zr);
3550       mov(t2, zr);
3551     }
3552 
3553     // A carry in t0 after Montgomery multiplication means that we
3554     // should subtract multiples of n from our result in m.  We'll
3555     // keep doing that until there is no carry.
3556     void normalize(RegisterOrConstant len) {
3557       block_comment("normalize");
3558       // while (t0)
3559       //   t0 = sub(Pm_base, Pn_base, t0, len);
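      // where sub() subtracts n from m as a multi-precision value and
      // returns the borrow out of the top word, which is then deducted
      // from t0.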
3560       Label loop, post, again;
3561       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
3562       cbz(t0, post); {
3563         bind(again); {
3564           mov(i, zr);
3565           mov(cnt, len);
3566           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
3567           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3568           subs(zr, zr, zr); // set carry flag, i.e. no borrow
3569           align(16);
3570           bind(loop); {
3571             sbcs(Rm, Rm, Rn);
3572             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
3573             add(i, i, 1);
3574             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
3575             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3576             sub(cnt, cnt, 1);
3577           } cbnz(cnt, loop);
3578           sbc(t0, t0, zr);
3579         } cbnz(t0, again);
3580       } bind(post);
3581     }
3582 
3583     // Move memory at s to d, reversing words.
3584     //    Increments d to end of copied memory
3585     //    Destroys tmp1, tmp2
3586     //    Preserves len
3587     //    Leaves s pointing to the address which was in d at start
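    //    Also swaps the 32-bit halves of each 64-bit word; the net effect
    //    is to reverse the order of the 2*len 32-bit words, converting
    //    between the most-significant-first jint arrays supplied by the
    //    caller and the least-significant-first 64-bit digits used
    //    internally.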
3588     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
3589       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
3590 
3591       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
3592       mov(tmp1, len);
3593       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
3594       sub(s, d, len, ext::uxtw, LogBytesPerWord);
3595     }
    // where reverse1 copies one 64-bit word, swapping its 32-bit halves:
3597     void reverse1(Register d, Register s, Register tmp) {
3598       ldr(tmp, pre(s, -wordSize));
3599       ror(tmp, tmp, 32);
3600       str(tmp, post(d, wordSize));
3601     }
3602 
3603     void step_squaring() {
      // As step(), plus an extra accumulate of the a*b product: when
      // squaring, each off-diagonal partial product counts twice.
3605       step();
3606       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
3607     }
3608 
3609     void last_squaring(RegisterOrConstant i) {
3610       Label dont;
3611       // if ((i & 1) == 0) {
3612       tbnz(i.as_register(), 0, dont); {
3613         // MACC(Ra, Rb, t0, t1, t2);
3614         // Ra = *++Pa;
3615         // Rb = *--Pb;
3616         umulh(Rhi_ab, Ra, Rb);
3617         mul(Rlo_ab, Ra, Rb);
3618         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
3619       } bind(dont);
3620     }
3621 
3622     void extra_step_squaring() {
3623       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
3624 
3625       // MACC(Rm, Rn, t0, t1, t2);
3626       // Rm = *++Pm;
3627       // Rn = *--Pn;
3628       umulh(Rhi_mn, Rm, Rn);
3629       mul(Rlo_mn, Rm, Rn);
3630       ldr(Rm, pre(Pm, wordSize));
3631       ldr(Rn, pre(Pn, -wordSize));
3632     }
3633 
3634     void post1_squaring() {
3635       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
3636 
3637       // *Pm = Rm = t0 * inv;
3638       mul(Rm, t0, inv);
3639       str(Rm, Address(Pm));
3640 
3641       // MACC(Rm, Rn, t0, t1, t2);
3642       // t0 = t1; t1 = t2; t2 = 0;
3643       umulh(Rhi_mn, Rm, Rn);
3644 
3645 #ifndef PRODUCT
3646       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
3647       {
3648         mul(Rlo_mn, Rm, Rn);
3649         add(Rlo_mn, t0, Rlo_mn);
3650         Label ok;
3651         cbz(Rlo_mn, ok); {
3652           stop("broken Montgomery multiply");
3653         } bind(ok);
3654       }
3655 #endif
3656       // We have very carefully set things up so that
3657       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
3658       // the lower half of Rm * Rn because we know the result already:
3659       // it must be -t0.  t0 + (-t0) must generate a carry iff
3660       // t0 != 0.  So, rather than do a mul and an adds we just set
3661       // the carry flag iff t0 is nonzero.
3662       //
3663       // mul(Rlo_mn, Rm, Rn);
3664       // adds(zr, t0, Rlo_mn);
3665       subs(zr, t0, 1); // Set carry iff t0 is nonzero
3666       adcs(t0, t1, Rhi_mn);
3667       adc(t1, t2, zr);
3668       mov(t2, zr);
3669     }
3670 
3671     void acc(Register Rhi, Register Rlo,
3672              Register t0, Register t1, Register t2) {
3673       adds(t0, t0, Rlo);
3674       adcs(t1, t1, Rhi);
3675       adc(t2, t2, zr);
3676     }
3677 
3678   public:
3679     /**
3680      * Fast Montgomery multiplication.  The derivation of the
3681      * algorithm is in A Cryptographic Library for the Motorola
3682      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
3683      *
3684      * Arguments:
3685      *
3686      * Inputs for multiplication:
3687      *   c_rarg0   - int array elements a
3688      *   c_rarg1   - int array elements b
3689      *   c_rarg2   - int array elements n (the modulus)
3690      *   c_rarg3   - int length
3691      *   c_rarg4   - int inv
3692      *   c_rarg5   - int array elements m (the result)
3693      *
3694      * Inputs for squaring:
3695      *   c_rarg0   - int array elements a
3696      *   c_rarg1   - int array elements n (the modulus)
3697      *   c_rarg2   - int length
3698      *   c_rarg3   - int inv
3699      *   c_rarg4   - int array elements m (the result)
3700      *
3701      */
3702     address generate_multiply() {
3703       Label argh, nothing;
3704       bind(argh);
3705       stop("MontgomeryMultiply total_allocation must be <= 8192");
3706 
3707       align(CodeEntryAlignment);
3708       address entry = pc();
3709 
3710       cbzw(Rlen, nothing);
3711 
3712       enter();
3713 
3714       // Make room.
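      // The stack area holds four arrays of Rlen ints: reversed copies of
      // a, b and n, plus the result m (when squaring, b is not copied, but
      // the same amount of space is reserved).  Capping Rlen at 512 ints
      // keeps the total allocation at or below 8192 bytes.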
3715       cmpw(Rlen, 512);
3716       br(Assembler::HI, argh);
3717       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
3718       andr(sp, Ra, -2 * wordSize);
3719 
3720       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
3721 
3722       {
3723         // Copy input args, reversing as we go.  We use Ra as a
3724         // temporary variable.
3725         reverse(Ra, Pa_base, Rlen, t0, t1);
3726         if (!_squaring)
3727           reverse(Ra, Pb_base, Rlen, t0, t1);
3728         reverse(Ra, Pn_base, Rlen, t0, t1);
3729       }
3730 
3731       // Push all call-saved registers and also Pm_base which we'll need
3732       // at the end.
3733       save_regs();
3734 
3735 #ifndef PRODUCT
3736       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
3737       {
3738         ldr(Rn, Address(Pn_base, 0));
3739         mul(Rlo_mn, Rn, inv);
3740         cmp(Rlo_mn, -1);
3741         Label ok;
3742         br(EQ, ok); {
3743           stop("broken inverse in Montgomery multiply");
3744         } bind(ok);
3745       }
3746 #endif
3747 
3748       mov(Pm_base, Ra);
3749 
3750       mov(t0, zr);
3751       mov(t1, zr);
3752       mov(t2, zr);
3753 
3754       block_comment("for (int i = 0; i < len; i++) {");
3755       mov(Ri, zr); {
3756         Label loop, end;
3757         cmpw(Ri, Rlen);
3758         br(Assembler::GE, end);
3759 
3760         bind(loop);
3761         pre1(Ri);
3762 
3763         block_comment("  for (j = i; j; j--) {"); {
3764           movw(Rj, Ri);
3765           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
3766         } block_comment("  } // j");
3767 
3768         post1();
3769         addw(Ri, Ri, 1);
3770         cmpw(Ri, Rlen);
3771         br(Assembler::LT, loop);
3772         bind(end);
3773         block_comment("} // i");
3774       }
3775 
3776       block_comment("for (int i = len; i < 2*len; i++) {");
3777       mov(Ri, Rlen); {
3778         Label loop, end;
3779         cmpw(Ri, Rlen, Assembler::LSL, 1);
3780         br(Assembler::GE, end);
3781 
3782         bind(loop);
3783         pre2(Ri, Rlen);
3784 
3785         block_comment("  for (j = len*2-i-1; j; j--) {"); {
3786           lslw(Rj, Rlen, 1);
3787           subw(Rj, Rj, Ri);
3788           subw(Rj, Rj, 1);
3789           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
3790         } block_comment("  } // j");
3791 
3792         post2(Ri, Rlen);
3793         addw(Ri, Ri, 1);
3794         cmpw(Ri, Rlen, Assembler::LSL, 1);
3795         br(Assembler::LT, loop);
3796         bind(end);
3797       }
3798       block_comment("} // i");
3799 
3800       normalize(Rlen);
3801 
3802       mov(Ra, Pm_base);  // Save Pm_base in Ra
3803       restore_regs();  // Restore caller's Pm_base
3804 
3805       // Copy our result into caller's Pm_base
3806       reverse(Pm_base, Ra, Rlen, t0, t1);
3807 
3808       leave();
3809       bind(nothing);
3810       ret(lr);
3811 
3812       return entry;
3813     }
3814     // In C, approximately:
3815 
3816     // void
3817     // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
3818     //                     unsigned long Pn_base[], unsigned long Pm_base[],
3819     //                     unsigned long inv, int len) {
3820     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3821     //   unsigned long *Pa, *Pb, *Pn, *Pm;
3822     //   unsigned long Ra, Rb, Rn, Rm;
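    //   // MACC(A, B, t0, t1, t2) adds the 128-bit product A*B into the
    //   // triple-precision accumulator <t2:t1:t0>; in the assembly above
    //   // it is the umulh/mul pair followed by acc().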
3823 
3824     //   int i;
3825 
3826     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
3827 
3828     //   for (i = 0; i < len; i++) {
3829     //     int j;
3830 
3831     //     Pa = Pa_base;
3832     //     Pb = Pb_base + i;
3833     //     Pm = Pm_base;
3834     //     Pn = Pn_base + i;
3835 
3836     //     Ra = *Pa;
3837     //     Rb = *Pb;
3838     //     Rm = *Pm;
3839     //     Rn = *Pn;
3840 
3841     //     int iters = i;
3842     //     for (j = 0; iters--; j++) {
3843     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
3844     //       MACC(Ra, Rb, t0, t1, t2);
3845     //       Ra = *++Pa;
3846     //       Rb = *--Pb;
3847     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
3848     //       MACC(Rm, Rn, t0, t1, t2);
3849     //       Rm = *++Pm;
3850     //       Rn = *--Pn;
3851     //     }
3852 
3853     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
3854     //     MACC(Ra, Rb, t0, t1, t2);
3855     //     *Pm = Rm = t0 * inv;
3856     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
3857     //     MACC(Rm, Rn, t0, t1, t2);
3858 
3859     //     assert(t0 == 0, "broken Montgomery multiply");
3860 
3861     //     t0 = t1; t1 = t2; t2 = 0;
3862     //   }
3863 
3864     //   for (i = len; i < 2*len; i++) {
3865     //     int j;
3866 
3867     //     Pa = Pa_base + i-len;
3868     //     Pb = Pb_base + len;
3869     //     Pm = Pm_base + i-len;
3870     //     Pn = Pn_base + len;
3871 
3872     //     Ra = *++Pa;
3873     //     Rb = *--Pb;
3874     //     Rm = *++Pm;
3875     //     Rn = *--Pn;
3876 
3877     //     int iters = len*2-i-1;
3878     //     for (j = i-len+1; iters--; j++) {
3879     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
3880     //       MACC(Ra, Rb, t0, t1, t2);
3881     //       Ra = *++Pa;
3882     //       Rb = *--Pb;
3883     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
3884     //       MACC(Rm, Rn, t0, t1, t2);
3885     //       Rm = *++Pm;
3886     //       Rn = *--Pn;
3887     //     }
3888 
3889     //     Pm_base[i-len] = t0;
3890     //     t0 = t1; t1 = t2; t2 = 0;
3891     //   }
3892 
3893     //   while (t0)
3894     //     t0 = sub(Pm_base, Pn_base, t0, len);
3895     // }
3896 
3897     /**
3898      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
3899      * multiplies than Montgomery multiplication so it should be up to
3900      * 25% faster.  However, its loop control is more complex and it
3901      * may actually run slower on some machines.
3902      *
3903      * Arguments:
3904      *
3905      * Inputs:
3906      *   c_rarg0   - int array elements a
3907      *   c_rarg1   - int array elements n (the modulus)
3908      *   c_rarg2   - int length
3909      *   c_rarg3   - int inv
3910      *   c_rarg4   - int array elements m (the result)
3911      *
3912      */
3913     address generate_square() {
3914       Label argh;
3915       bind(argh);
3916       stop("MontgomeryMultiply total_allocation must be <= 8192");
3917 
3918       align(CodeEntryAlignment);
3919       address entry = pc();
3920 
3921       enter();
3922 
3923       // Make room.
3924       cmpw(Rlen, 512);
3925       br(Assembler::HI, argh);
3926       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
3927       andr(sp, Ra, -2 * wordSize);
3928 
3929       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
3930 
3931       {
3932         // Copy input args, reversing as we go.  We use Ra as a
3933         // temporary variable.
3934         reverse(Ra, Pa_base, Rlen, t0, t1);
3935         reverse(Ra, Pn_base, Rlen, t0, t1);
3936       }
3937 
3938       // Push all call-saved registers and also Pm_base which we'll need
3939       // at the end.
3940       save_regs();
3941 
3942       mov(Pm_base, Ra);
3943 
3944       mov(t0, zr);
3945       mov(t1, zr);
3946       mov(t2, zr);
3947 
3948       block_comment("for (int i = 0; i < len; i++) {");
3949       mov(Ri, zr); {
3950         Label loop, end;
3951         bind(loop);
3952         cmp(Ri, Rlen);
3953         br(Assembler::GE, end);
3954 
3955         pre1(Ri);
3956 
3957         block_comment("for (j = (i+1)/2; j; j--) {"); {
3958           add(Rj, Ri, 1);
3959           lsr(Rj, Rj, 1);
3960           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
3961         } block_comment("  } // j");
3962 
3963         last_squaring(Ri);
3964 
3965         block_comment("  for (j = i/2; j; j--) {"); {
3966           lsr(Rj, Ri, 1);
3967           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
3968         } block_comment("  } // j");
3969 
3970         post1_squaring();
3971         add(Ri, Ri, 1);
3972         cmp(Ri, Rlen);
3973         br(Assembler::LT, loop);
3974 
3975         bind(end);
3976         block_comment("} // i");
3977       }
3978 
3979       block_comment("for (int i = len; i < 2*len; i++) {");
3980       mov(Ri, Rlen); {
3981         Label loop, end;
3982         bind(loop);
3983         cmp(Ri, Rlen, Assembler::LSL, 1);
3984         br(Assembler::GE, end);
3985 
3986         pre2(Ri, Rlen);
3987 
3988         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
3989           lsl(Rj, Rlen, 1);
3990           sub(Rj, Rj, Ri);
3991           sub(Rj, Rj, 1);
3992           lsr(Rj, Rj, 1);
3993           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
3994         } block_comment("  } // j");
3995 
3996         last_squaring(Ri);
3997 
3998         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
3999           lsl(Rj, Rlen, 1);
4000           sub(Rj, Rj, Ri);
4001           lsr(Rj, Rj, 1);
4002           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4003         } block_comment("  } // j");
4004 
4005         post2(Ri, Rlen);
4006         add(Ri, Ri, 1);
4007         cmp(Ri, Rlen, Assembler::LSL, 1);
4008 
4009         br(Assembler::LT, loop);
4010         bind(end);
4011         block_comment("} // i");
4012       }
4013 
4014       normalize(Rlen);
4015 
4016       mov(Ra, Pm_base);  // Save Pm_base in Ra
4017       restore_regs();  // Restore caller's Pm_base
4018 
4019       // Copy our result into caller's Pm_base
4020       reverse(Pm_base, Ra, Rlen, t0, t1);
4021 
4022       leave();
4023       ret(lr);
4024 
4025       return entry;
4026     }
4027     // In C, approximately:
4028 
4029     // void
4030     // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
4031     //                   unsigned long Pm_base[], unsigned long inv, int len) {
4032     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4033     //   unsigned long *Pa, *Pb, *Pn, *Pm;
4034     //   unsigned long Ra, Rb, Rn, Rm;
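    //   // MACC2(A, B, t0, t1, t2) adds the product A*B into the
    //   // accumulator twice: when squaring, each off-diagonal partial
    //   // product a[i]*a[j] (i != j) occurs twice in the result.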
4035 
4036     //   int i;
4037 
4038     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
4039 
4040     //   for (i = 0; i < len; i++) {
4041     //     int j;
4042 
4043     //     Pa = Pa_base;
4044     //     Pb = Pa_base + i;
4045     //     Pm = Pm_base;
4046     //     Pn = Pn_base + i;
4047 
4048     //     Ra = *Pa;
4049     //     Rb = *Pb;
4050     //     Rm = *Pm;
4051     //     Rn = *Pn;
4052 
4053     //     int iters = (i+1)/2;
4054     //     for (j = 0; iters--; j++) {
4055     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4056     //       MACC2(Ra, Rb, t0, t1, t2);
4057     //       Ra = *++Pa;
4058     //       Rb = *--Pb;
4059     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4060     //       MACC(Rm, Rn, t0, t1, t2);
4061     //       Rm = *++Pm;
4062     //       Rn = *--Pn;
4063     //     }
4064     //     if ((i & 1) == 0) {
4065     //       assert(Ra == Pa_base[j], "must be");
4066     //       MACC(Ra, Ra, t0, t1, t2);
4067     //     }
4068     //     iters = i/2;
4069     //     assert(iters == i-j, "must be");
4070     //     for (; iters--; j++) {
4071     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4072     //       MACC(Rm, Rn, t0, t1, t2);
4073     //       Rm = *++Pm;
4074     //       Rn = *--Pn;
4075     //     }
4076 
4077     //     *Pm = Rm = t0 * inv;
4078     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
4079     //     MACC(Rm, Rn, t0, t1, t2);
4080 
4081     //     assert(t0 == 0, "broken Montgomery multiply");
4082 
4083     //     t0 = t1; t1 = t2; t2 = 0;
4084     //   }
4085 
4086     //   for (i = len; i < 2*len; i++) {
4087     //     int start = i-len+1;
4088     //     int end = start + (len - start)/2;
4089     //     int j;
4090 
4091     //     Pa = Pa_base + i-len;
4092     //     Pb = Pa_base + len;
4093     //     Pm = Pm_base + i-len;
4094     //     Pn = Pn_base + len;
4095 
4096     //     Ra = *++Pa;
4097     //     Rb = *--Pb;
4098     //     Rm = *++Pm;
4099     //     Rn = *--Pn;
4100 
4101     //     int iters = (2*len-i-1)/2;
4102     //     assert(iters == end-start, "must be");
4103     //     for (j = start; iters--; j++) {
4104     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4105     //       MACC2(Ra, Rb, t0, t1, t2);
4106     //       Ra = *++Pa;
4107     //       Rb = *--Pb;
4108     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4109     //       MACC(Rm, Rn, t0, t1, t2);
4110     //       Rm = *++Pm;
4111     //       Rn = *--Pn;
4112     //     }
4113     //     if ((i & 1) == 0) {
4114     //       assert(Ra == Pa_base[j], "must be");
4115     //       MACC(Ra, Ra, t0, t1, t2);
4116     //     }
4117     //     iters =  (2*len-i)/2;
4118     //     assert(iters == len-j, "must be");
4119     //     for (; iters--; j++) {
4120     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4121     //       MACC(Rm, Rn, t0, t1, t2);
4122     //       Rm = *++Pm;
4123     //       Rn = *--Pn;
4124     //     }
4125     //     Pm_base[i-len] = t0;
4126     //     t0 = t1; t1 = t2; t2 = 0;
4127     //   }
4128 
4129     //   while (t0)
4130     //     t0 = sub(Pm_base, Pn_base, t0, len);
4131     // }
4132   };
4133 
4134   // Initialization
4135   void generate_initial() {
    // Generate the initial stubs and initialize the entry points.
4137 
    // Entry points that exist on all platforms.  Note: this is code
    // that could be shared among different platforms - however, the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also the comment
    // in stubRoutines.hpp.
4143 
4144     StubRoutines::_forward_exception_entry = generate_forward_exception();
4145 
4146     StubRoutines::_call_stub_entry =
4147       generate_call_stub(StubRoutines::_call_stub_return_address);
4148 
    // This entry is referenced by megamorphic call sites.
4150     StubRoutines::_catch_exception_entry = generate_catch_exception();
4151 
4152     // Build this early so it's available for the interpreter.
4153     StubRoutines::_throw_StackOverflowError_entry =
4154       generate_throw_exception("StackOverflowError throw_exception",
4155                                CAST_FROM_FN_PTR(address,
4156                                                 SharedRuntime::
4157                                                 throw_StackOverflowError));
4158     if (UseCRC32Intrinsics) {
      // Set the table address before generating the stubs that use it.
4160       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
4161       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
4162     }
4163   }
4164 
4165   void generate_all() {
4166     // support for verify_oop (must happen after universe_init)
4167     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
4168     StubRoutines::_throw_AbstractMethodError_entry =
4169       generate_throw_exception("AbstractMethodError throw_exception",
4170                                CAST_FROM_FN_PTR(address,
4171                                                 SharedRuntime::
4172                                                 throw_AbstractMethodError));
4173 
4174     StubRoutines::_throw_IncompatibleClassChangeError_entry =
4175       generate_throw_exception("IncompatibleClassChangeError throw_exception",
4176                                CAST_FROM_FN_PTR(address,
4177                                                 SharedRuntime::
4178                                                 throw_IncompatibleClassChangeError));
4179 
4180     StubRoutines::_throw_NullPointerException_at_call_entry =
4181       generate_throw_exception("NullPointerException at call throw_exception",
4182                                CAST_FROM_FN_PTR(address,
4183                                                 SharedRuntime::
4184                                                 throw_NullPointerException_at_call));
4185 
4186     // arraycopy stubs used by compilers
4187     generate_arraycopy_stubs();
4188 
4189     if (UseMultiplyToLenIntrinsic) {
4190       StubRoutines::_multiplyToLen = generate_multiplyToLen();
4191     }
4192 
4193     if (UseMontgomeryMultiplyIntrinsic) {
4194       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
4195       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
4196       StubRoutines::_montgomeryMultiply = g.generate_multiply();
4197     }
4198 
4199     if (UseMontgomerySquareIntrinsic) {
4200       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
4201       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
4202       // We use generate_multiply() rather than generate_square()
4203       // because it's faster for the sizes of modulus we care about.
4204       StubRoutines::_montgomerySquare = g.generate_multiply();
4205     }
4206 
4207 #ifndef BUILTIN_SIM
4208     // generate GHASH intrinsics code
4209     if (UseGHASHIntrinsics) {
4210       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
4211     }
4212 
4213     if (UseAESIntrinsics) {
4214       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
4215       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
4216       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
4217       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
4218     }
4219 
4220     if (UseSHA1Intrinsics) {
4221       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
4222       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
4223     }
4224     if (UseSHA256Intrinsics) {
4225       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
4226       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
4227     }
4228 
4229     if (UseCRC32CIntrinsics) {
4230       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
4231     }
4232 
4233     // generate Adler32 intrinsics code
4234     if (UseAdler32Intrinsics) {
4235       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
4236     }
4237 
4238     // Safefetch stubs.
4239     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
4240                                                        &StubRoutines::_safefetch32_fault_pc,
4241                                                        &StubRoutines::_safefetch32_continuation_pc);
4242     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
4243                                                        &StubRoutines::_safefetchN_fault_pc,
4244                                                        &StubRoutines::_safefetchN_continuation_pc);
4245 #endif
4246   }
4247 
4248  public:
4249   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
4250     if (all) {
4251       generate_all();
4252     } else {
4253       generate_initial();
4254     }
4255   }
4256 }; // end class declaration
4257 
4258 void StubGenerator_generate(CodeBuffer* code, bool all) {
4259   StubGenerator g(code, all);
4260 }