1 /*
   2  * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "interpreter/interpreter.hpp"
  30 #include "nativeInst_aarch64.hpp"
  31 #include "oops/instanceOop.hpp"
  32 #include "oops/method.hpp"
  33 #include "oops/objArrayKlass.hpp"
  34 #include "oops/oop.inline.hpp"
  35 #include "prims/methodHandles.hpp"
  36 #include "runtime/frame.inline.hpp"
  37 #include "runtime/handles.inline.hpp"
  38 #include "runtime/sharedRuntime.hpp"
  39 #include "runtime/stubCodeGenerator.hpp"
  40 #include "runtime/stubRoutines.hpp"
  41 #include "runtime/thread.inline.hpp"
  42 #include "utilities/top.hpp"
  43 #ifdef COMPILER2
  44 #include "opto/runtime.hpp"
  45 #endif
  46 
  47 #ifdef BUILTIN_SIM
  48 #include "../../../../../../simulator/simulator.hpp"
  49 #endif
  50 
  51 // Declaration and definition of StubGenerator (no .hpp file).
  52 // For a more detailed description of the stub routine structure
  53 // see the comment in stubRoutines.hpp
  54 
  55 #undef __
  56 #define __ _masm->
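     // TIMES_OOP scales a signed 32-bit index by the in-heap oop size:
     // 4 bytes when compressed oops are enabled, 8 bytes otherwise.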
  57 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
  58 
  59 #ifdef PRODUCT
  60 #define BLOCK_COMMENT(str) /* nothing */
  61 #else
  62 #define BLOCK_COMMENT(str) __ block_comment(str)
  63 #endif
  64 
  65 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  66 
  67 // Stub Code definitions
  68 
  69 class StubGenerator: public StubCodeGenerator {
  70  private:
  71 
  72 #ifdef PRODUCT
  73 #define inc_counter_np(counter) ((void)0)
  74 #else
  75   void inc_counter_np_(int& counter) {
  76     __ lea(rscratch2, ExternalAddress((address)&counter));
  77     __ ldrw(rscratch1, Address(rscratch2));
  78     __ addw(rscratch1, rscratch1, 1);
  79     __ strw(rscratch1, Address(rscratch2));
  80   }
  81 #define inc_counter_np(counter) \
  82   BLOCK_COMMENT("inc_counter " #counter); \
  83   inc_counter_np_(counter);
  84 #endif
  85 
  86   // Call stubs are used to call Java from C
  87   //
  88   // Arguments:
  89   //    c_rarg0:   call wrapper address                   address
  90   //    c_rarg1:   result                                 address
  91   //    c_rarg2:   result type                            BasicType
  92   //    c_rarg3:   method                                 Method*
  93   //    c_rarg4:   (interpreter) entry point              address
  94   //    c_rarg5:   parameters                             intptr_t*
  95   //    c_rarg6:   parameter size (in words)              int
  96   //    c_rarg7:   thread                                 Thread*
  97   //
  98   // There is no return from the stub itself as any Java result
  99   // is written to result
 100   //
 101   // we save r30 (lr) as the return PC at the base of the frame and
  102   // link r29 (fp) below it as the frame pointer, installing sp (r31)
 103   // into fp.
 104   //
 105   // we save r0-r7, which accounts for all the c arguments.
 106   //
 107   // TODO: strictly do we need to save them all? they are treated as
 108   // volatile by C so could we omit saving the ones we are going to
 109   // place in global registers (thread? method?) or those we only use
 110   // during setup of the Java call?
 111   //
 112   // we don't need to save r8 which C uses as an indirect result location
 113   // return register.
 114   //
 115   // we don't need to save r9-r15 which both C and Java treat as
 116   // volatile
 117   //
 118   // we don't need to save r16-18 because Java does not use them
 119   //
 120   // we save r19-r28 which Java uses as scratch registers and C
 121   // expects to be callee-save
 122   //
 123   // we save the bottom 64 bits of each value stored in v8-v15; it is
 124   // the responsibility of the caller to preserve larger values.
 125   //
 126   // so the stub frame looks like this when we enter Java code
 127   //
 128   //     [ return_from_Java     ] <--- sp
 129   //     [ argument word n      ]
 130   //      ...
 131   // -27 [ argument word 1      ]
 132   // -26 [ saved v15            ] <--- sp_after_call
 133   // -25 [ saved v14            ]
 134   // -24 [ saved v13            ]
 135   // -23 [ saved v12            ]
 136   // -22 [ saved v11            ]
 137   // -21 [ saved v10            ]
 138   // -20 [ saved v9             ]
 139   // -19 [ saved v8             ]
 140   // -18 [ saved r28            ]
 141   // -17 [ saved r27            ]
 142   // -16 [ saved r26            ]
 143   // -15 [ saved r25            ]
 144   // -14 [ saved r24            ]
 145   // -13 [ saved r23            ]
 146   // -12 [ saved r22            ]
 147   // -11 [ saved r21            ]
 148   // -10 [ saved r20            ]
 149   //  -9 [ saved r19            ]
 150   //  -8 [ call wrapper    (r0) ]
 151   //  -7 [ result          (r1) ]
 152   //  -6 [ result type     (r2) ]
 153   //  -5 [ method          (r3) ]
 154   //  -4 [ entry point     (r4) ]
 155   //  -3 [ parameters      (r5) ]
 156   //  -2 [ parameter size  (r6) ]
 157   //  -1 [ thread (r7)          ]
 158   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 159   //   1 [ saved lr       (r30) ]
 160 
 161   // Call stub stack layout word offsets from fp
 162   enum call_stub_layout {
 163     sp_after_call_off = -26,
 164 
 165     d15_off            = -26,
 166     d13_off            = -24,
 167     d11_off            = -22,
 168     d9_off             = -20,
 169 
 170     r28_off            = -18,
 171     r26_off            = -16,
 172     r24_off            = -14,
 173     r22_off            = -12,
 174     r20_off            = -10,
 175     call_wrapper_off   =  -8,
 176     result_off         =  -7,
 177     result_type_off    =  -6,
 178     method_off         =  -5,
 179     entry_point_off    =  -4,
 180     parameter_size_off =  -2,
 181     thread_off         =  -1,
 182     fp_f               =   0,
 183     retaddr_off        =   1,
 184   };
 185 
 186   address generate_call_stub(address& return_address) {
 187     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 188            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 189            "adjust this code");
 190 
 191     StubCodeMark mark(this, "StubRoutines", "call_stub");
 192     address start = __ pc();
 193 
 194     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 195 
 196     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 197     const Address result        (rfp, result_off         * wordSize);
 198     const Address result_type   (rfp, result_type_off    * wordSize);
 199     const Address method        (rfp, method_off         * wordSize);
 200     const Address entry_point   (rfp, entry_point_off    * wordSize);
 201     const Address parameter_size(rfp, parameter_size_off * wordSize);
 202 
 203     const Address thread        (rfp, thread_off         * wordSize);
 204 
 205     const Address d15_save      (rfp, d15_off * wordSize);
 206     const Address d13_save      (rfp, d13_off * wordSize);
 207     const Address d11_save      (rfp, d11_off * wordSize);
 208     const Address d9_save       (rfp, d9_off * wordSize);
 209 
 210     const Address r28_save      (rfp, r28_off * wordSize);
 211     const Address r26_save      (rfp, r26_off * wordSize);
 212     const Address r24_save      (rfp, r24_off * wordSize);
 213     const Address r22_save      (rfp, r22_off * wordSize);
 214     const Address r20_save      (rfp, r20_off * wordSize);
 215 
 216     // stub code
 217 
 218     // we need a C prolog to bootstrap the x86 caller into the sim
 219     __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);
 220 
 221     address aarch64_entry = __ pc();
 222 
 223 #ifdef BUILTIN_SIM
 224     // Save sender's SP for stack traces.
 225     __ mov(rscratch1, sp);
 226     __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
 227 #endif
 228     // set up frame and move sp to end of save area
 229     __ enter();
 230     __ sub(sp, rfp, -sp_after_call_off * wordSize);
 231 
 232     // save register parameters and Java scratch/global registers
 233     // n.b. we save thread even though it gets installed in
 234     // rthread because we want to sanity check rthread later
 235     __ str(c_rarg7,  thread);
 236     __ strw(c_rarg6, parameter_size);
 237     __ stp(c_rarg4, c_rarg5,  entry_point);
 238     __ stp(c_rarg2, c_rarg3,  result_type);
 239     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 240 
 241     __ stp(r20, r19,   r20_save);
 242     __ stp(r22, r21,   r22_save);
 243     __ stp(r24, r23,   r24_save);
 244     __ stp(r26, r25,   r26_save);
 245     __ stp(r28, r27,   r28_save);
 246 
 247     __ stpd(v9,  v8,   d9_save);
 248     __ stpd(v11, v10,  d11_save);
 249     __ stpd(v13, v12,  d13_save);
 250     __ stpd(v15, v14,  d15_save);
 251 
 252     // install Java thread in global register now we have saved
 253     // whatever value it held
 254     __ mov(rthread, c_rarg7);
 255     // And method
 256     __ mov(rmethod, c_rarg3);
 257 
 258     // set up the heapbase register
 259     __ reinit_heapbase();
 260 
 261 #ifdef ASSERT
 262     // make sure we have no pending exceptions
 263     {
 264       Label L;
 265       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 266       __ cmp(rscratch1, (unsigned)NULL_WORD);
 267       __ br(Assembler::EQ, L);
 268       __ stop("StubRoutines::call_stub: entered with pending exception");
 269       __ BIND(L);
 270     }
 271 #endif
 272     // pass parameters if any
 273     __ mov(esp, sp);
 274     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 275     __ andr(sp, rscratch1, -2 * wordSize);
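          // sp is now esp minus the parameter area, rounded down to keep
          // 16-byte stack alignment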
 276 
 277     BLOCK_COMMENT("pass parameters if any");
 278     Label parameters_done;
 279     // parameter count is still in c_rarg6
 280     // and parameter pointer identifying param 1 is in c_rarg5
 281     __ cbzw(c_rarg6, parameters_done);
 282 
 283     address loop = __ pc();
 284     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 285     __ subsw(c_rarg6, c_rarg6, 1);
 286     __ push(rscratch1);
 287     __ br(Assembler::GT, loop);
 288 
 289     __ BIND(parameters_done);
 290 
  291     // call Java entry -- passing the method and current sp
 292     //      rmethod: Method*
 293     //      r13: sender sp
 294     BLOCK_COMMENT("call Java function");
 295     __ mov(r13, sp);
 296     __ blr(c_rarg4);
 297 
 298     // tell the simulator we have returned to the stub
 299 
 300     // we do this here because the notify will already have been done
 301     // if we get to the next instruction via an exception
 302     //
 303     // n.b. adding this instruction here affects the calculation of
 304     // whether or not a routine returns to the call stub (used when
 305     // doing stack walks) since the normal test is to check the return
 306     // pc against the address saved below. so we may need to allow for
 307     // this extra instruction in the check.
 308 
 309     if (NotifySimulator) {
 310       __ notify(Assembler::method_reentry);
 311     }
 312     // save current address for use by exception handling code
 313 
 314     return_address = __ pc();
 315 
 316     // store result depending on type (everything that is not
 317     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 318     // n.b. this assumes Java returns an integral result in r0
 319     // and a floating result in j_farg0
 320     __ ldr(j_rarg2, result);
 321     Label is_long, is_float, is_double, exit;
 322     __ ldr(j_rarg1, result_type);
 323     __ cmp(j_rarg1, T_OBJECT);
 324     __ br(Assembler::EQ, is_long);
 325     __ cmp(j_rarg1, T_LONG);
 326     __ br(Assembler::EQ, is_long);
 327     __ cmp(j_rarg1, T_FLOAT);
 328     __ br(Assembler::EQ, is_float);
 329     __ cmp(j_rarg1, T_DOUBLE);
 330     __ br(Assembler::EQ, is_double);
 331 
 332     // handle T_INT case
 333     __ strw(r0, Address(j_rarg2));
 334 
 335     __ BIND(exit);
 336 
 337     // pop parameters
 338     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 339 
 340 #ifdef ASSERT
 341     // verify that threads correspond
 342     {
 343       Label L, S;
 344       __ ldr(rscratch1, thread);
 345       __ cmp(rthread, rscratch1);
 346       __ br(Assembler::NE, S);
 347       __ get_thread(rscratch1);
 348       __ cmp(rthread, rscratch1);
 349       __ br(Assembler::EQ, L);
 350       __ BIND(S);
 351       __ stop("StubRoutines::call_stub: threads must correspond");
 352       __ BIND(L);
 353     }
 354 #endif
 355 
 356     // restore callee-save registers
 357     __ ldpd(v15, v14,  d15_save);
 358     __ ldpd(v13, v12,  d13_save);
 359     __ ldpd(v11, v10,  d11_save);
 360     __ ldpd(v9,  v8,   d9_save);
 361 
 362     __ ldp(r28, r27,   r28_save);
 363     __ ldp(r26, r25,   r26_save);
 364     __ ldp(r24, r23,   r24_save);
 365     __ ldp(r22, r21,   r22_save);
 366     __ ldp(r20, r19,   r20_save);
 367 
 368     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 369     __ ldrw(c_rarg2, result_type);
 370     __ ldr(c_rarg3,  method);
 371     __ ldp(c_rarg4, c_rarg5,  entry_point);
 372     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 373 
 374 #ifndef PRODUCT
 375     // tell the simulator we are about to end Java execution
 376     if (NotifySimulator) {
 377       __ notify(Assembler::method_exit);
 378     }
 379 #endif
 380     // leave frame and return to caller
 381     __ leave();
 382     __ ret(lr);
 383 
 384     // handle return types different from T_INT
 385 
 386     __ BIND(is_long);
 387     __ str(r0, Address(j_rarg2, 0));
 388     __ br(Assembler::AL, exit);
 389 
 390     __ BIND(is_float);
 391     __ strs(j_farg0, Address(j_rarg2, 0));
 392     __ br(Assembler::AL, exit);
 393 
 394     __ BIND(is_double);
 395     __ strd(j_farg0, Address(j_rarg2, 0));
 396     __ br(Assembler::AL, exit);
 397 
 398     return start;
 399   }
 400 
 401   // Return point for a Java call if there's an exception thrown in
 402   // Java code.  The exception is caught and transformed into a
 403   // pending exception stored in JavaThread that can be tested from
 404   // within the VM.
 405   //
 406   // Note: Usually the parameters are removed by the callee. In case
 407   // of an exception crossing an activation frame boundary, that is
  408   // not the case if the callee is compiled code => need to set up the
  409   // sp.
 410   //
 411   // r0: exception oop
 412 
 413   // NOTE: this is used as a target from the signal handler so it
 414   // needs an x86 prolog which returns into the current simulator
 415   // executing the generated catch_exception code. so the prolog
 416   // needs to install rax in a sim register and adjust the sim's
 417   // restart pc to enter the generated code at the start position
 418   // then return from native to simulated execution.
 419 
 420   address generate_catch_exception() {
 421     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 422     address start = __ pc();
 423 
 424     // same as in generate_call_stub():
 425     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 426     const Address thread        (rfp, thread_off         * wordSize);
 427 
 428 #ifdef ASSERT
 429     // verify that threads correspond
 430     {
 431       Label L, S;
 432       __ ldr(rscratch1, thread);
 433       __ cmp(rthread, rscratch1);
 434       __ br(Assembler::NE, S);
 435       __ get_thread(rscratch1);
 436       __ cmp(rthread, rscratch1);
 437       __ br(Assembler::EQ, L);
 438       __ bind(S);
 439       __ stop("StubRoutines::catch_exception: threads must correspond");
 440       __ bind(L);
 441     }
 442 #endif
 443 
 444     // set pending exception
 445     __ verify_oop(r0);
 446 
 447     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 448     __ mov(rscratch1, (address)__FILE__);
 449     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 450     __ movw(rscratch1, (int)__LINE__);
 451     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 452 
 453     // complete return to VM
 454     assert(StubRoutines::_call_stub_return_address != NULL,
 455            "_call_stub_return_address must have been generated before");
 456     __ b(StubRoutines::_call_stub_return_address);
 457 
 458     return start;
 459   }
 460 
 461   // Continuation point for runtime calls returning with a pending
 462   // exception.  The pending exception check happened in the runtime
 463   // or native call stub.  The pending exception in Thread is
 464   // converted into a Java-level exception.
 465   //
 466   // Contract with Java-level exception handlers:
 467   // r0: exception
 468   // r3: throwing pc
 469   //
 470   // NOTE: At entry of this stub, exception-pc must be in LR !!
 471 
 472   // NOTE: this is always used as a jump target within generated code
  473   // so it just needs to be generated code with no x86 prolog
 474 
 475   address generate_forward_exception() {
 476     StubCodeMark mark(this, "StubRoutines", "forward exception");
 477     address start = __ pc();
 478 
 479     // Upon entry, LR points to the return address returning into
 480     // Java (interpreted or compiled) code; i.e., the return address
 481     // becomes the throwing pc.
 482     //
 483     // Arguments pushed before the runtime call are still on the stack
 484     // but the exception handler will reset the stack pointer ->
 485     // ignore them.  A potential result in registers can be ignored as
 486     // well.
 487 
 488 #ifdef ASSERT
 489     // make sure this code is only executed if there is a pending exception
 490     {
 491       Label L;
 492       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 493       __ cbnz(rscratch1, L);
 494       __ stop("StubRoutines::forward exception: no pending exception (1)");
 495       __ bind(L);
 496     }
 497 #endif
 498 
 499     // compute exception handler into r19
 500 
 501     // call the VM to find the handler address associated with the
 502     // caller address. pass thread in r0 and caller pc (ret address)
 503     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 504     // the stack.
 505     __ mov(c_rarg1, lr);
 506     // lr will be trashed by the VM call so we move it to R19
 507     // (callee-saved) because we also need to pass it to the handler
 508     // returned by this call.
 509     __ mov(r19, lr);
 510     BLOCK_COMMENT("call exception_handler_for_return_address");
 511     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 512                          SharedRuntime::exception_handler_for_return_address),
 513                     rthread, c_rarg1);
 514     // we should not really care that lr is no longer the callee
 515     // address. we saved the value the handler needs in r19 so we can
 516     // just copy it to r3. however, the C2 handler will push its own
  517     // frame and then call into the VM, and the VM code asserts that
 518     // the PC for the frame above the handler belongs to a compiled
 519     // Java method. So, we restore lr here to satisfy that assert.
 520     __ mov(lr, r19);
 521     // setup r0 & r3 & clear pending exception
 522     __ mov(r3, r19);
 523     __ mov(r19, r0);
 524     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 525     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 526 
 527 #ifdef ASSERT
 528     // make sure exception is set
 529     {
 530       Label L;
 531       __ cbnz(r0, L);
 532       __ stop("StubRoutines::forward exception: no pending exception (2)");
 533       __ bind(L);
 534     }
 535 #endif
 536 
 537     // continue at exception handler
 538     // r0: exception
 539     // r3: throwing pc
 540     // r19: exception handler
 541     __ verify_oop(r0);
 542     __ br(r19);
 543 
 544     return start;
 545   }
 546 
 547   // Non-destructive plausibility checks for oops
 548   //
 549   // Arguments:
 550   //    r0: oop to verify
 551   //    rscratch1: error message
 552   //
 553   // Stack after saving c_rarg3:
 554   //    [tos + 0]: saved c_rarg3
 555   //    [tos + 1]: saved c_rarg2
 556   //    [tos + 2]: saved lr
 557   //    [tos + 3]: saved rscratch2
 558   //    [tos + 4]: saved r0
 559   //    [tos + 5]: saved rscratch1
 560   address generate_verify_oop() {
 561 
 562     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 563     address start = __ pc();
 564 
 565     Label exit, error;
 566 
 567     // save c_rarg2 and c_rarg3
 568     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 569 
 570     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 571     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 572     __ ldr(c_rarg3, Address(c_rarg2));
 573     __ add(c_rarg3, c_rarg3, 1);
 574     __ str(c_rarg3, Address(c_rarg2));
 575 
 576     // object is in r0
 577     // make sure object is 'reasonable'
 578     __ cbz(r0, exit); // if obj is NULL it is OK
 579 
 580     // Check if the oop is in the right area of memory
 581     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 582     __ andr(c_rarg2, r0, c_rarg3);
 583     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 584 
 585     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 586     // instruction here because the flags register is live.
 587     __ eor(c_rarg2, c_rarg2, c_rarg3);
 588     __ cbnz(c_rarg2, error);
 589 
  590     // make sure klass is 'reasonable', i.e. not null.
 591     __ load_klass(r0, r0);  // get klass
 592     __ cbz(r0, error);      // if klass is NULL it is broken
 593 
 594     // return if everything seems ok
 595     __ bind(exit);
 596 
 597     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 598     __ ret(lr);
 599 
 600     // handle errors
 601     __ bind(error);
 602     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 603 
 604     __ push(RegSet::range(r0, r29), sp);
 605     // debug(char* msg, int64_t pc, int64_t regs[])
 606     __ mov(c_rarg0, rscratch1);      // pass address of error message
 607     __ mov(c_rarg1, lr);             // pass return address
 608     __ mov(c_rarg2, sp);             // pass address of regs on stack
 609 #ifndef PRODUCT
 610     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 611 #endif
 612     BLOCK_COMMENT("call MacroAssembler::debug");
 613     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 614     __ blrt(rscratch1, 3, 0, 1);
 615 
 616     return start;
 617   }
 618 
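       // array_overlap_test is effectively a no-op on AArch64: it unconditionally
       // branches to the no-overlap continuation.  Overlapping copies are handled
       // by the conjoint copy stubs instead, which fall back to the corresponding
       // disjoint entry when a forward copy is safe (see generate_conjoint_copy below).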
 619   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 620 
 621   // Generate code for an array write pre barrier
 622   //
 623   //     addr    -  starting address
 624   //     count   -  element count
 625   //     tmp     - scratch register
 626   //
 627   //     Destroy no registers except rscratch1 and rscratch2
 628   //
 629   void  gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
 630     BarrierSet* bs = Universe::heap()->barrier_set();
 631     switch (bs->kind()) {
 632     case BarrierSet::G1SATBCTLogging:
  633       // With G1, don't generate the call if we statically know that the target is uninitialized
 634       if (!dest_uninitialized) {
 635         __ push_call_clobbered_registers();
 636         if (count == c_rarg0) {
 637           if (addr == c_rarg1) {
 638             // exactly backwards!!
 639             __ mov(rscratch1, c_rarg0);
 640             __ mov(c_rarg0, c_rarg1);
 641             __ mov(c_rarg1, rscratch1);
 642           } else {
 643             __ mov(c_rarg1, count);
 644             __ mov(c_rarg0, addr);
 645           }
 646         } else {
 647           __ mov(c_rarg0, addr);
 648           __ mov(c_rarg1, count);
 649         }
 650         __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2);
 651         __ pop_call_clobbered_registers();
 652         break;
 653       case BarrierSet::CardTableForRS:
 654       case BarrierSet::CardTableExtension:
 655       case BarrierSet::ModRef:
 656         break;
 657       default:
 658         ShouldNotReachHere();
 659 
 660       }
 661     }
 662   }
 663 
 664   //
 665   // Generate code for an array write post barrier
 666   //
 667   //  Input:
 668   //     start    - register containing starting address of destination array
 669   //     end      - register containing ending address of destination array
 670   //     scratch  - scratch register
 671   //
 672   //  The input registers are overwritten.
 673   //  The ending address is inclusive.
 674   void gen_write_ref_array_post_barrier(Register start, Register end, Register scratch) {
 675     assert_different_registers(start, end, scratch);
 676     BarrierSet* bs = Universe::heap()->barrier_set();
 677     switch (bs->kind()) {
 678       case BarrierSet::G1SATBCTLogging:
 679 
 680         {
 681           __ push_call_clobbered_registers();
 682           // must compute element count unless barrier set interface is changed (other platforms supply count)
 683           assert_different_registers(start, end, scratch);
 684           __ lea(scratch, Address(end, BytesPerHeapOop));
 685           __ sub(scratch, scratch, start);               // subtract start to get #bytes
 686           __ lsr(scratch, scratch, LogBytesPerHeapOop);  // convert to element count
 687           __ mov(c_rarg0, start);
 688           __ mov(c_rarg1, scratch);
 689           __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
 690           __ pop_call_clobbered_registers();
 691         }
 692         break;
 693       case BarrierSet::CardTableForRS:
 694       case BarrierSet::CardTableExtension:
 695         {
 696           CardTableModRefBS* ct = (CardTableModRefBS*)bs;
 697           assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
 698 
 699           Label L_loop;
 700 
 701            __ lsr(start, start, CardTableModRefBS::card_shift);
 702            __ lsr(end, end, CardTableModRefBS::card_shift);
 703            __ sub(end, end, start); // number of bytes to copy
 704 
 705           const Register count = end; // 'end' register contains bytes count now
 706           __ load_byte_map_base(scratch);
 707           __ add(start, start, scratch);
 708           if (UseConcMarkSweepGC) {
 709             __ membar(__ StoreStore);
 710           }
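               // walk backwards, dirtying (zeroing) one card byte per iteration
               // from (start + count) down to start, inclusive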
 711           __ BIND(L_loop);
 712           __ strb(zr, Address(start, count));
 713           __ subs(count, count, 1);
 714           __ br(Assembler::HS, L_loop);
 715         }
 716         break;
 717       default:
 718         ShouldNotReachHere();
 719 
 720     }
 721   }
 722 
 723   typedef enum {
 724     copy_forwards = 1,
 725     copy_backwards = -1
 726   } copy_direction;
 727 
 728   // Bulk copy of blocks of 8 words.
 729   //
 730   // count is a count of words.
 731   //
 732   // Precondition: count >= 8
 733   //
 734   // Postconditions:
 735   //
 736   // The least significant bit of count contains the remaining count
 737   // of words to copy.  The rest of count is trash.
 738   //
 739   // s and d are adjusted to point to the remaining words to copy
 740   //
 741   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 742                            copy_direction direction) {
 743     int unit = wordSize * direction;
 744     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 745 
 746     int offset;
 747     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 748       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 749     const Register stride = r13;
 750 
 751     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 752     assert_different_registers(s, d, count, rscratch1);
 753 
 754     Label again, drain;
 755     const char *stub_name;
 756     if (direction == copy_forwards)
  757       stub_name = "forward_copy_longs";
 758     else
 759       stub_name = "backward_copy_longs";
 760     StubCodeMark mark(this, "StubRoutines", stub_name);
 761     __ align(CodeEntryAlignment);
 762     __ bind(start);
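         // For a forward copy, bias s and d downwards so the loop below can use
         // positive immediate offsets together with pre-indexed writeback.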
 763     if (direction == copy_forwards) {
 764       __ sub(s, s, bias);
 765       __ sub(d, d, bias);
 766     }
 767 
 768 #ifdef ASSERT
 769     // Make sure we are never given < 8 words
 770     {
 771       Label L;
 772       __ cmp(count, 8);
 773       __ br(Assembler::GE, L);
  774       __ stop("generate_copy_longs called with < 8 words");
 775       __ bind(L);
 776     }
 777 #endif
 778 
 779     // Fill 8 registers
 780     if (UseSIMDForMemoryOps) {
 781       __ ldpq(v0, v1, Address(s, 4 * unit));
 782       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 783     } else {
 784       __ ldp(t0, t1, Address(s, 2 * unit));
 785       __ ldp(t2, t3, Address(s, 4 * unit));
 786       __ ldp(t4, t5, Address(s, 6 * unit));
 787       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 788     }
 789 
 790     __ subs(count, count, 16);
 791     __ br(Assembler::LO, drain);
 792 
 793     int prefetch = PrefetchCopyIntervalInBytes;
 794     bool use_stride = false;
 795     if (direction == copy_backwards) {
 796        use_stride = prefetch > 256;
 797        prefetch = -prefetch;
 798        if (use_stride) __ mov(stride, prefetch);
 799     }
 800 
 801     __ bind(again);
 802 
 803     if (PrefetchCopyIntervalInBytes > 0)
 804       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
 805 
 806     if (UseSIMDForMemoryOps) {
 807       __ stpq(v0, v1, Address(d, 4 * unit));
 808       __ ldpq(v0, v1, Address(s, 4 * unit));
 809       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 810       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 811     } else {
 812       __ stp(t0, t1, Address(d, 2 * unit));
 813       __ ldp(t0, t1, Address(s, 2 * unit));
 814       __ stp(t2, t3, Address(d, 4 * unit));
 815       __ ldp(t2, t3, Address(s, 4 * unit));
 816       __ stp(t4, t5, Address(d, 6 * unit));
 817       __ ldp(t4, t5, Address(s, 6 * unit));
 818       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 819       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
 820     }
 821 
 822     __ subs(count, count, 8);
 823     __ br(Assembler::HS, again);
 824 
 825     // Drain
 826     __ bind(drain);
 827     if (UseSIMDForMemoryOps) {
 828       __ stpq(v0, v1, Address(d, 4 * unit));
 829       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
 830     } else {
 831       __ stp(t0, t1, Address(d, 2 * unit));
 832       __ stp(t2, t3, Address(d, 4 * unit));
 833       __ stp(t4, t5, Address(d, 6 * unit));
 834       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
 835     }
 836 
 837     {
 838       Label L1, L2;
 839       __ tbz(count, exact_log2(4), L1);
 840       if (UseSIMDForMemoryOps) {
 841         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
 842         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
 843       } else {
 844         __ ldp(t0, t1, Address(s, 2 * unit));
 845         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
 846         __ stp(t0, t1, Address(d, 2 * unit));
 847         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
 848       }
 849       __ bind(L1);
 850 
 851       if (direction == copy_forwards) {
 852         __ add(s, s, bias);
 853         __ add(d, d, bias);
 854       }
 855 
 856       __ tbz(count, 1, L2);
 857       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
 858       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
 859       __ bind(L2);
 860     }
 861 
 862     __ ret(lr);
 863   }
 864 
 865   // Small copy: less than 16 bytes.
 866   //
 867   // NB: Ignores all of the bits of count which represent more than 15
 868   // bytes, so a caller doesn't have to mask them.
 869 
 870   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
 871     bool is_backwards = step < 0;
 872     size_t granularity = uabs(step);
 873     int direction = is_backwards ? -1 : 1;
 874     int unit = wordSize * direction;
 875 
 876     Label Lpair, Lword, Lint, Lshort, Lbyte;
 877 
 878     assert(granularity
 879            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
 880 
 881     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
 882 
 883     // ??? I don't know if this bit-test-and-branch is the right thing
 884     // to do.  It does a lot of jumping, resulting in several
 885     // mispredicted branches.  It might make more sense to do this
 886     // with something like Duff's device with a single computed branch.
 887 
 888     __ tbz(count, 3 - exact_log2(granularity), Lword);
 889     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
 890     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
 891     __ bind(Lword);
 892 
 893     if (granularity <= sizeof (jint)) {
 894       __ tbz(count, 2 - exact_log2(granularity), Lint);
 895       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
 896       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
 897       __ bind(Lint);
 898     }
 899 
 900     if (granularity <= sizeof (jshort)) {
 901       __ tbz(count, 1 - exact_log2(granularity), Lshort);
 902       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
 903       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
 904       __ bind(Lshort);
 905     }
 906 
 907     if (granularity <= sizeof (jbyte)) {
 908       __ tbz(count, 0, Lbyte);
 909       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
 910       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
 911       __ bind(Lbyte);
 912     }
 913   }
 914 
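       // Entry points for the bulk eight-word copy loops emitted by
       // generate_copy_longs(); copy_memory() below branches and links
       // to them for large copies.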
 915   Label copy_f, copy_b;
 916 
 917   // All-singing all-dancing memory copy.
 918   //
 919   // Copy count units of memory from s to d.  The size of a unit is
 920   // step, which can be positive or negative depending on the direction
 921   // of copy.  If is_aligned is false, we align the source address.
 922   //
 923 
 924   void copy_memory(bool is_aligned, Register s, Register d,
 925                    Register count, Register tmp, int step) {
 926     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
 927     bool is_backwards = step < 0;
 928     int granularity = uabs(step);
 929     const Register t0 = r3, t1 = r4;
 930 
  931     // Copies of at most 80 bytes (96 with SIMD) are done inline.  Direction
  932     // doesn't matter because we always load all the data before writing anything.
 933     Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
 934     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
 935     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
 936     const Register send = r17, dend = r18;
 937 
 938     if (PrefetchCopyIntervalInBytes > 0)
 939       __ prfm(Address(s, 0), PLDL1KEEP);
 940     __ cmp(count, (UseSIMDForMemoryOps ? 96:80)/granularity);
 941     __ br(Assembler::HI, copy_big);
 942 
 943     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
 944     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
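         // send and dend point just past the last source and destination bytes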
 945 
 946     __ cmp(count, 16/granularity);
 947     __ br(Assembler::LS, copy16);
 948 
 949     __ cmp(count, 64/granularity);
 950     __ br(Assembler::HI, copy80);
 951 
 952     __ cmp(count, 32/granularity);
 953     __ br(Assembler::LS, copy32);
 954 
 955     // 33..64 bytes
 956     if (UseSIMDForMemoryOps) {
 957       __ ldpq(v0, v1, Address(s, 0));
 958       __ ldpq(v2, v3, Address(send, -32));
 959       __ stpq(v0, v1, Address(d, 0));
 960       __ stpq(v2, v3, Address(dend, -32));
 961     } else {
 962       __ ldp(t0, t1, Address(s, 0));
 963       __ ldp(t2, t3, Address(s, 16));
 964       __ ldp(t4, t5, Address(send, -32));
 965       __ ldp(t6, t7, Address(send, -16));
 966 
 967       __ stp(t0, t1, Address(d, 0));
 968       __ stp(t2, t3, Address(d, 16));
 969       __ stp(t4, t5, Address(dend, -32));
 970       __ stp(t6, t7, Address(dend, -16));
 971     }
 972     __ b(finish);
 973 
 974     // 17..32 bytes
 975     __ bind(copy32);
 976     __ ldp(t0, t1, Address(s, 0));
 977     __ ldp(t2, t3, Address(send, -16));
 978     __ stp(t0, t1, Address(d, 0));
 979     __ stp(t2, t3, Address(dend, -16));
 980     __ b(finish);
 981 
 982     // 65..80/96 bytes
  983     // (96 bytes if SIMD because we do 32 bytes per instruction)
 984     __ bind(copy80);
 985     if (UseSIMDForMemoryOps) {
 986       __ ldpq(v0, v1, Address(s, 0));
 987       __ ldpq(v2, v3, Address(s, 32));
 988       __ ldpq(v4, v5, Address(send, -32));
 989       __ stpq(v0, v1, Address(d, 0));
 990       __ stpq(v2, v3, Address(d, 32));
 991       __ stpq(v4, v5, Address(dend, -32));
 992     } else {
 993       __ ldp(t0, t1, Address(s, 0));
 994       __ ldp(t2, t3, Address(s, 16));
 995       __ ldp(t4, t5, Address(s, 32));
 996       __ ldp(t6, t7, Address(s, 48));
 997       __ ldp(t8, t9, Address(send, -16));
 998 
 999       __ stp(t0, t1, Address(d, 0));
1000       __ stp(t2, t3, Address(d, 16));
1001       __ stp(t4, t5, Address(d, 32));
1002       __ stp(t6, t7, Address(d, 48));
1003       __ stp(t8, t9, Address(dend, -16));
1004     }
1005     __ b(finish);
1006 
1007     // 0..16 bytes
1008     __ bind(copy16);
1009     __ cmp(count, 8/granularity);
1010     __ br(Assembler::LO, copy8);
1011 
1012     // 8..16 bytes
1013     __ ldr(t0, Address(s, 0));
1014     __ ldr(t1, Address(send, -8));
1015     __ str(t0, Address(d, 0));
1016     __ str(t1, Address(dend, -8));
1017     __ b(finish);
1018 
1019     if (granularity < 8) {
1020       // 4..7 bytes
1021       __ bind(copy8);
1022       __ tbz(count, 2 - exact_log2(granularity), copy4);
1023       __ ldrw(t0, Address(s, 0));
1024       __ ldrw(t1, Address(send, -4));
1025       __ strw(t0, Address(d, 0));
1026       __ strw(t1, Address(dend, -4));
1027       __ b(finish);
1028       if (granularity < 4) {
1029         // 0..3 bytes
1030         __ bind(copy4);
1031         __ cbz(count, finish); // get rid of 0 case
1032         if (granularity == 2) {
1033           __ ldrh(t0, Address(s, 0));
1034           __ strh(t0, Address(d, 0));
1035         } else { // granularity == 1
1036           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1037           // the first and last byte.
1038           // Handle the 3 byte case by loading and storing base + count/2
1039           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
 1040           // This does mean that in the 1 byte case we load/store the same
1041           // byte 3 times.
1042           __ lsr(count, count, 1);
1043           __ ldrb(t0, Address(s, 0));
1044           __ ldrb(t1, Address(send, -1));
1045           __ ldrb(t2, Address(s, count));
1046           __ strb(t0, Address(d, 0));
1047           __ strb(t1, Address(dend, -1));
1048           __ strb(t2, Address(d, count));
1049         }
1050         __ b(finish);
1051       }
1052     }
1053 
1054     __ bind(copy_big);
1055     if (is_backwards) {
1056       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1057       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1058     }
1059 
 1060     // Now that we've got the small case out of the way, we can align the
1061     // source address on a 2-word boundary.
1062 
1063     Label aligned;
1064 
1065     if (is_aligned) {
1066       // We may have to adjust by 1 word to get s 2-word-aligned.
1067       __ tbz(s, exact_log2(wordSize), aligned);
1068       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1069       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1070       __ sub(count, count, wordSize/granularity);
1071     } else {
1072       if (is_backwards) {
1073         __ andr(rscratch2, s, 2 * wordSize - 1);
1074       } else {
1075         __ neg(rscratch2, s);
1076         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1077       }
1078       // rscratch2 is the byte adjustment needed to align s.
1079       __ cbz(rscratch2, aligned);
1080       int shift = exact_log2(granularity);
1081       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1082       __ sub(count, count, rscratch2);
1083 
1084 #if 0
1085       // ?? This code is only correct for a disjoint copy.  It may or
1086       // may not make sense to use it in that case.
1087 
1088       // Copy the first pair; s and d may not be aligned.
1089       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1090       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1091 
1092       // Align s and d, adjust count
1093       if (is_backwards) {
1094         __ sub(s, s, rscratch2);
1095         __ sub(d, d, rscratch2);
1096       } else {
1097         __ add(s, s, rscratch2);
1098         __ add(d, d, rscratch2);
1099       }
1100 #else
1101       copy_memory_small(s, d, rscratch2, rscratch1, step);
1102 #endif
1103     }
1104 
1105     __ bind(aligned);
1106 
1107     // s is now 2-word-aligned.
1108 
1109     // We have a count of units and some trailing bytes.  Adjust the
1110     // count and do a bulk copy of words.
1111     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1112     if (direction == copy_forwards)
1113       __ bl(copy_f);
1114     else
1115       __ bl(copy_b);
1116 
1117     // And the tail.
1118     copy_memory_small(s, d, count, tmp, step);
1119 
1120     if (granularity >= 8) __ bind(copy8);
1121     if (granularity >= 4) __ bind(copy4);
1122     __ bind(finish);
1123   }
1124 
1125 
1126   void clobber_registers() {
1127 #ifdef ASSERT
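         // fill r3..r18 (other than rscratch1, which holds the pattern) with
         // 0xdeadbeefdeadbeef so that use of a stale value is easy to spot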
1128     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1129     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1130     for (Register r = r3; r <= r18; r++)
1131       if (r != rscratch1) __ mov(r, rscratch1);
1132 #endif
1133   }
1134 
1135   // Scan over array at a for count oops, verifying each one.
1136   // Preserves a and count, clobbers rscratch1 and rscratch2.
1137   void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1138     Label loop, end;
1139     __ mov(rscratch1, a);
1140     __ mov(rscratch2, zr);
1141     __ bind(loop);
1142     __ cmp(rscratch2, count);
1143     __ br(Assembler::HS, end);
1144     if (size == (size_t)wordSize) {
1145       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1146       __ verify_oop(temp);
1147     } else {
 1148       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1149       __ decode_heap_oop(temp); // calls verify_oop
1150     }
1151     __ add(rscratch2, rscratch2, size);
1152     __ b(loop);
1153     __ bind(end);
1154   }
1155 
1156   // Arguments:
1157   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1158   //             ignored
1159   //   is_oop  - true => oop array, so generate store check code
1160   //   name    - stub name string
1161   //
1162   // Inputs:
1163   //   c_rarg0   - source array address
1164   //   c_rarg1   - destination array address
1165   //   c_rarg2   - element count, treated as ssize_t, can be zero
1166   //
1167   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1168   // the hardware handle it.  The two dwords within qwords that span
 1169   // cache line boundaries will still be loaded and stored atomically.
1170   //
1171   // Side Effects:
1172   //   disjoint_int_copy_entry is set to the no-overlap entry point
1173   //   used by generate_conjoint_int_oop_copy().
1174   //
1175   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
1176                                   const char *name, bool dest_uninitialized = false) {
1177     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1178     __ align(CodeEntryAlignment);
1179     StubCodeMark mark(this, "StubRoutines", name);
1180     address start = __ pc();
1181     __ enter();
1182 
1183     if (entry != NULL) {
1184       *entry = __ pc();
1185       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1186       BLOCK_COMMENT("Entry:");
1187     }
1188 
1189     if (is_oop) {
1190       __ push(RegSet::of(d, count), sp);
1191       // no registers are destroyed by this call
1192       gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
1193     }
1194     copy_memory(aligned, s, d, count, rscratch1, size);
1195     if (is_oop) {
1196       __ pop(RegSet::of(d, count), sp);
1197       if (VerifyOops)
1198         verify_oop_array(size, d, count, r16);
1199       __ sub(count, count, 1); // make an inclusive end pointer
1200       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1201       gen_write_ref_array_post_barrier(d, count, rscratch1);
1202     }
1203     __ leave();
1204     __ mov(r0, zr); // return 0
1205     __ ret(lr);
1206 #ifdef BUILTIN_SIM
1207     {
1208       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1209       sim->notifyCompile(const_cast<char*>(name), start);
1210     }
1211 #endif
1212     return start;
1213   }
1214 
1215   // Arguments:
1216   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1217   //             ignored
1218   //   is_oop  - true => oop array, so generate store check code
1219   //   name    - stub name string
1220   //
1221   // Inputs:
1222   //   c_rarg0   - source array address
1223   //   c_rarg1   - destination array address
1224   //   c_rarg2   - element count, treated as ssize_t, can be zero
1225   //
1226   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1227   // the hardware handle it.  The two dwords within qwords that span
 1228   // cache line boundaries will still be loaded and stored atomically.
1229   //
1230   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1231                                  address *entry, const char *name,
1232                                  bool dest_uninitialized = false) {
1233     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1234 
1235     StubCodeMark mark(this, "StubRoutines", name);
1236     address start = __ pc();
1237     __ enter();
1238 
1239     if (entry != NULL) {
1240       *entry = __ pc();
1241       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1242       BLOCK_COMMENT("Entry:");
1243     }
1244 
1245     // use fwd copy when (d-s) above_equal (count*size)
1246     __ sub(rscratch1, d, s);
1247     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1248     __ br(Assembler::HS, nooverlap_target);
1249 
1250     if (is_oop) {
1251       __ push(RegSet::of(d, count), sp);
1252       // no registers are destroyed by this call
1253       gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
1254     }
1255     copy_memory(aligned, s, d, count, rscratch1, -size);
1256     if (is_oop) {
1257       __ pop(RegSet::of(d, count), sp);
1258       if (VerifyOops)
1259         verify_oop_array(size, d, count, r16);
1260       __ sub(count, count, 1); // make an inclusive end pointer
1261       __ lea(count, Address(d, count, Address::uxtw(exact_log2(size))));
1262       gen_write_ref_array_post_barrier(d, count, rscratch1);
1263     }
1264     __ leave();
1265     __ mov(r0, zr); // return 0
1266     __ ret(lr);
1267 #ifdef BUILTIN_SIM
1268     {
1269       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1270       sim->notifyCompile(const_cast<char*>(name), start);
1271     }
1272 #endif
1273     return start;
 1274   }
1275 
1276   // Arguments:
1277   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1278   //             ignored
1279   //   name    - stub name string
1280   //
1281   // Inputs:
1282   //   c_rarg0   - source array address
1283   //   c_rarg1   - destination array address
1284   //   c_rarg2   - element count, treated as ssize_t, can be zero
1285   //
1286   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1287   // we let the hardware handle it.  The one to eight bytes within words,
1288   // dwords or qwords that span cache line boundaries will still be loaded
1289   // and stored atomically.
1297   //
1298   // Side Effects:
1299   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1300   //   used by generate_conjoint_byte_copy().
1301   //
1302   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1303     const bool not_oop = false;
1304     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1305   }
1306 
1307   // Arguments:
1308   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1309   //             ignored
1310   //   name    - stub name string
1311   //
1312   // Inputs:
1313   //   c_rarg0   - source array address
1314   //   c_rarg1   - destination array address
1315   //   c_rarg2   - element count, treated as ssize_t, can be zero
1316   //
1317   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1318   // we let the hardware handle it.  The one to eight bytes within words,
1319   // dwords or qwords that span cache line boundaries will still be loaded
1320   // and stored atomically.
1321   //
1322   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1323                                       address* entry, const char *name) {
1324     const bool not_oop = false;
1325     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1326   }
1327 
1328   // Arguments:
1329   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1330   //             ignored
1331   //   name    - stub name string
1332   //
1333   // Inputs:
1334   //   c_rarg0   - source array address
1335   //   c_rarg1   - destination array address
1336   //   c_rarg2   - element count, treated as ssize_t, can be zero
1337   //
1338   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1339   // let the hardware handle it.  The two or four words within dwords
1340   // or qwords that span cache line boundaries will still be loaded
1341   // and stored atomically.
1342   //
1343   // Side Effects:
1344   //   disjoint_short_copy_entry is set to the no-overlap entry point
1345   //   used by generate_conjoint_short_copy().
1346   //
1347   address generate_disjoint_short_copy(bool aligned,
1348                                        address* entry, const char *name) {
1349     const bool not_oop = false;
1350     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1351   }
1352 
1353   // Arguments:
1354   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1355   //             ignored
1356   //   name    - stub name string
1357   //
1358   // Inputs:
1359   //   c_rarg0   - source array address
1360   //   c_rarg1   - destination array address
1361   //   c_rarg2   - element count, treated as ssize_t, can be zero
1362   //
1363   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1364   // let the hardware handle it.  The two or four words within dwords
1365   // or qwords that span cache line boundaries will still be loaded
1366   // and stored atomically.
1367   //
1368   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1369                                        address *entry, const char *name) {
1370     const bool not_oop = false;
1371     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
 1372   }
 1373 
1374   // Arguments:
1375   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1376   //             ignored
1377   //   name    - stub name string
1378   //
1379   // Inputs:
1380   //   c_rarg0   - source array address
1381   //   c_rarg1   - destination array address
1382   //   c_rarg2   - element count, treated as ssize_t, can be zero
1383   //
1384   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1385   // the hardware handle it.  The two dwords within qwords that span
 1386   // cache line boundaries will still be loaded and stored atomically.
1387   //
1388   // Side Effects:
1389   //   disjoint_int_copy_entry is set to the no-overlap entry point
1390   //   used by generate_conjoint_int_oop_copy().
1391   //
1392   address generate_disjoint_int_copy(bool aligned, address *entry,
1393                                          const char *name, bool dest_uninitialized = false) {
1394     const bool not_oop = false;
1395     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1396   }
1397 
1398   // Arguments:
1399   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1400   //             ignored
1401   //   name    - stub name string
1402   //
1403   // Inputs:
1404   //   c_rarg0   - source array address
1405   //   c_rarg1   - destination array address
1406   //   c_rarg2   - element count, treated as ssize_t, can be zero
1407   //
1408   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1409   // the hardware handle it.  The two dwords within qwords that span
 1410   // cache line boundaries will still be loaded and stored atomically.
1411   //
1412   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1413                                      address *entry, const char *name,
1414                                      bool dest_uninitialized = false) {
1415     const bool not_oop = false;
1416     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1417   }
1418 
1419 
1420   // Arguments:
1421   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1422   //             ignored
1423   //   name    - stub name string
1424   //
1425   // Inputs:
1426   //   c_rarg0   - source array address
1427   //   c_rarg1   - destination array address
1428   //   c_rarg2   - element count, treated as size_t, can be zero
1429   //
1430   // Side Effects:
1431   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1432   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1433   //
1434   address generate_disjoint_long_copy(bool aligned, address *entry,
1435                                           const char *name, bool dest_uninitialized = false) {
1436     const bool not_oop = false;
1437     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1438   }
1439 
1440   // Arguments:
1441   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1442   //             ignored
1443   //   name    - stub name string
1444   //
1445   // Inputs:
1446   //   c_rarg0   - source array address
1447   //   c_rarg1   - destination array address
1448   //   c_rarg2   - element count, treated as size_t, can be zero
1449   //
1450   address generate_conjoint_long_copy(bool aligned,
1451                                       address nooverlap_target, address *entry,
1452                                       const char *name, bool dest_uninitialized = false) {
1453     const bool not_oop = false;
1454     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1455   }
1456 
1457   // Arguments:
1458   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1459   //             ignored
1460   //   name    - stub name string
1461   //
1462   // Inputs:
1463   //   c_rarg0   - source array address
1464   //   c_rarg1   - destination array address
1465   //   c_rarg2   - element count, treated as size_t, can be zero
1466   //
1467   // Side Effects:
1468   //   disjoint_oop_copy_entry is set to the no-overlap entry point
1469   //   used by generate_conjoint_oop_copy().
1470   //
1471   address generate_disjoint_oop_copy(bool aligned, address *entry,
1472                                      const char *name, bool dest_uninitialized) {
1473     const bool is_oop = true;
1474     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1475     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1476   }
1477 
1478   // Arguments:
1479   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1480   //             ignored
1481   //   name    - stub name string
1482   //
1483   // Inputs:
1484   //   c_rarg0   - source array address
1485   //   c_rarg1   - destination array address
1486   //   c_rarg2   - element count, treated as size_t, can be zero
1487   //
1488   address generate_conjoint_oop_copy(bool aligned,
1489                                      address nooverlap_target, address *entry,
1490                                      const char *name, bool dest_uninitialized) {
1491     const bool is_oop = true;
1492     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1493     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1494                                   name, dest_uninitialized);
1495   }
1496 
1497 
1498   // Helper for generating a dynamic type check.
1499   // Smashes rscratch1.
1500   void generate_type_check(Register sub_klass,
1501                            Register super_check_offset,
1502                            Register super_klass,
1503                            Label& L_success) {
1504     assert_different_registers(sub_klass, super_check_offset, super_klass);
1505 
1506     BLOCK_COMMENT("type_check:");
1507 
1508     Label L_miss;
1509 
1510     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1511                                      super_check_offset);
1512     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1513 
1514     // Fall through on failure!
1515     __ BIND(L_miss);
1516   }
1517 
1518   //
1519   //  Generate checkcasting array copy stub
1520   //
1521   //  Input:
1522   //    c_rarg0   - source array address
1523   //    c_rarg1   - destination array address
1524   //    c_rarg2   - element count, treated as ssize_t, can be zero
1525   //    c_rarg3   - size_t ckoff (super_check_offset)
1526   //    c_rarg4   - oop ckval (super_klass)
1527   //
1528   //  Output:
1529   //    r0 ==  0  -  success
1530   //    r0 == -1^K - failure, where K is partial transfer count
1531   //
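  //  In C-like pseudocode this stub behaves roughly as follows (a sketch
  //  only; the helper names are illustrative, not real VM functions):
  //
  //    int checkcast_copy(oop* from, oop* to, size_t count,
  //                       int ckoff, Klass* ckval) {
  //      for (size_t k = 0; k < count; k++) {
  //        oop o = from[k];                                  // load_heap_oop
  //        if (o != NULL && !is_subtype(klass_of(o), ckval, ckoff))
  //          return ~(int)k;                                 // -1^K, K oops already copied
  //        to[k] = o;                                        // store_heap_oop
  //      }
  //      return 0;                                           // success
  //    }
  //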
1532   address generate_checkcast_copy(const char *name, address *entry,
1533                                   bool dest_uninitialized = false) {
1534 
1535     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1536 
1537     // Input registers (after setup_arg_regs)
1538     const Register from        = c_rarg0;   // source array address
1539     const Register to          = c_rarg1;   // destination array address
1540     const Register count       = c_rarg2;   // elements count
1541     const Register ckoff       = c_rarg3;   // super_check_offset
1542     const Register ckval       = c_rarg4;   // super_klass
1543 
1544     // Registers used as temps (r18, r19, r20 are save-on-entry)
1545     const Register count_save  = r21;       // orig elements count
1546     const Register start_to    = r20;       // destination array start address
1547     const Register copied_oop  = r18;       // actual oop copied
1548     const Register r19_klass   = r19;       // oop._klass
1549 
1550     //---------------------------------------------------------------
1551     // Assembler stub will be used for this call to arraycopy
1552     // if the two arrays are subtypes of Object[] but the
1553     // destination array type is not equal to or a supertype
1554     // of the source type.  Each element must be separately
1555     // checked.
1556 
1557     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1558                                copied_oop, r19_klass, count_save);
1559 
1560     __ align(CodeEntryAlignment);
1561     StubCodeMark mark(this, "StubRoutines", name);
1562     address start = __ pc();
1563 
1564     __ enter(); // required for proper stackwalking of RuntimeStub frame
1565 
1566 #ifdef ASSERT
1567     // caller guarantees that the arrays really are different
1568     // otherwise, we would have to make conjoint checks
1569     { Label L;
1570       array_overlap_test(L, TIMES_OOP);
1571       __ stop("checkcast_copy within a single array");
1572       __ bind(L);
1573     }
1574 #endif //ASSERT
1575 
1576     // Caller of this entry point must set up the argument registers.
1577     if (entry != NULL) {
1578       *entry = __ pc();
1579       BLOCK_COMMENT("Entry:");
1580     }
1581 
1582      // Empty array:  Nothing to do.
1583     __ cbz(count, L_done);
1584 
1585     __ push(RegSet::of(r18, r19, r20, r21), sp);
1586 
1587 #ifdef ASSERT
1588     BLOCK_COMMENT("assert consistent ckoff/ckval");
1589     // The ckoff and ckval must be mutually consistent,
1590     // even though caller generates both.
1591     { Label L;
1592       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1593       __ ldrw(start_to, Address(ckval, sco_offset));
1594       __ cmpw(ckoff, start_to);
1595       __ br(Assembler::EQ, L);
1596       __ stop("super_check_offset inconsistent");
1597       __ bind(L);
1598     }
1599 #endif //ASSERT
1600 
1601     gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);
1602 
1603     // save the original count
1604     __ mov(count_save, count);
1605 
1606     // Copy from low to high addresses
1607     __ mov(start_to, to);              // Save destination array start address
1608     __ b(L_load_element);
1609 
1610     // ======== begin loop ========
1611     // (Loop is rotated; its entry is L_load_element.)
1612     // Loop control:
1613     //   for (; count != 0; count--) {
1614     //     copied_oop = load_heap_oop(from++);
1615     //     ... generate_type_check ...;
1616     //     store_heap_oop(to++, copied_oop);
1617     //   }
1618     __ align(OptoLoopAlignment);
1619 
1620     __ BIND(L_store_element);
1621     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop);  // store the oop
1622     __ sub(count, count, 1);
1623     __ cbz(count, L_do_card_marks);
1624 
1625     // ======== loop entry is here ========
1626     __ BIND(L_load_element);
1627     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop
1628     __ cbz(copied_oop, L_store_element);
1629 
1630     __ load_klass(r19_klass, copied_oop);// query the object klass
1631     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1632     // ======== end loop ========
1633 
1634     // It was a real error; we must depend on the caller to finish the job.
1635     // Register count = remaining oops, count_orig = total oops.
1636     // Emit GC store barriers for the oops we have copied and report
1637     // their number to the caller.
1638 
1639     __ subs(count, count_save, count);     // K = partially copied oop count
1640     __ eon(count, count, zr);                   // report (-1^K) to caller
1641     __ br(Assembler::EQ, L_done_pop);
1642 
1643     __ BIND(L_do_card_marks);
1644     __ add(to, to, -heapOopSize);         // make an inclusive end pointer
1645     gen_write_ref_array_post_barrier(start_to, to, rscratch1);
1646 
1647     __ bind(L_done_pop);
1648     __ pop(RegSet::of(r18, r19, r20, r21), sp);
1649     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1650 
1651     __ bind(L_done);
1652     __ mov(r0, count);
1653     __ leave();
1654     __ ret(lr);
1655 
1656     return start;
1657   }
1658 
1659   // Perform range checks on the proposed arraycopy.
1660   // Kills temp, but nothing else.
1661   // Also, clean the sign bits of src_pos and dst_pos.
1662   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1663                               Register src_pos, // source position (c_rarg1)
1664                               Register dst,     // destination array oop (c_rarg2)
1665                               Register dst_pos, // destination position (c_rarg3)
1666                               Register length,
1667                               Register temp,
1668                               Label& L_failed) {
1669     BLOCK_COMMENT("arraycopy_range_checks:");
1670 
1671     assert_different_registers(rscratch1, temp);
1672 
1673     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1674     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1675     __ addw(temp, length, src_pos);
1676     __ cmpw(temp, rscratch1);
1677     __ br(Assembler::HI, L_failed);
1678 
1679     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1680     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1681     __ addw(temp, length, dst_pos);
1682     __ cmpw(temp, rscratch1);
1683     __ br(Assembler::HI, L_failed);
1684 
1685     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1686     __ movw(src_pos, src_pos);
1687     __ movw(dst_pos, dst_pos);
1688 
1689     BLOCK_COMMENT("arraycopy_range_checks done");
1690   }
1691 
1692   // These stubs are currently only called from a simple test routine.
1693   // They will be written properly once they are called from code that
1694   // actually does something useful.
1695   static void fake_arraycopy_stub(address src, address dst, int count) {
1696     assert(count == 0, "huh?");
1697   }
1698 
1699 
1700   //
1701   //  Generate 'unsafe' array copy stub
1702   //  Though just as safe as the other stubs, it takes an unscaled
1703   //  size_t argument instead of an element count.
1704   //
1705   //  Input:
1706   //    c_rarg0   - source array address
1707   //    c_rarg1   - destination array address
1708   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1709   //
1710   // Simply forwards to the byte copy stub: since the count is given
1711   // in bytes, that stub handles operands of any alignment.
1712   //
1713   address generate_unsafe_copy(const char *name,
1714                                address byte_copy_entry) {
1715 #ifdef PRODUCT
1716     return StubRoutines::_jbyte_arraycopy;
1717 #else
1718     __ align(CodeEntryAlignment);
1719     StubCodeMark mark(this, "StubRoutines", name);
1720     address start = __ pc();
1721     __ enter(); // required for proper stackwalking of RuntimeStub frame
1722     // bump this on entry, not on exit:
1723     __ lea(rscratch2, ExternalAddress((address)&SharedRuntime::_unsafe_array_copy_ctr));
1724     __ incrementw(Address(rscratch2));
1725     __ b(RuntimeAddress(byte_copy_entry));
1726     return start;
1727 #endif
1728   }
1729 
1730   //
1731   //  Generate generic array copy stubs
1732   //
1733   //  Input:
1734   //    c_rarg0    -  src oop
1735   //    c_rarg1    -  src_pos (32-bits)
1736   //    c_rarg2    -  dst oop
1737   //    c_rarg3    -  dst_pos (32-bits)
1738   //    c_rarg4    -  element count (32-bits)
1739   //
1740   //  Output:
1741   //    r0 ==  0  -  success
1742   //    r0 == -1^K - failure, where K is partial transfer count
1743   //
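  //  The stub validates its arguments and then tail-calls one of the more
  //  specific copy stubs; roughly (a sketch, with the individual checks
  //  spelled out in the comments below):
  //
  //    if (any null/sign/range/type check fails)     return -1;
  //    if (both are the same primitive array type)   goto {byte,short,int,long}_copy;
  //    if (both are objArrays of the same klass)     goto oop_copy;
  //    else                                          goto checkcast_copy;  // per-element checks
  //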
1744   address generate_generic_copy(const char *name,
1745                                 address byte_copy_entry, address short_copy_entry,
1746                                 address int_copy_entry, address oop_copy_entry,
1747                                 address long_copy_entry, address checkcast_copy_entry) {
1748 
1749     Label L_failed, L_failed_0, L_objArray;
1750     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
1751 
1752     // Input registers
1753     const Register src        = c_rarg0;  // source array oop
1754     const Register src_pos    = c_rarg1;  // source position
1755     const Register dst        = c_rarg2;  // destination array oop
1756     const Register dst_pos    = c_rarg3;  // destination position
1757     const Register length     = c_rarg4;
1758 
1759     StubCodeMark mark(this, "StubRoutines", name);
1760 
1761     __ align(CodeEntryAlignment);
1762     address start = __ pc();
1763 
1764     __ enter(); // required for proper stackwalking of RuntimeStub frame
1765 
1766     // bump this on entry, not on exit:
1767     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
1768 
1769     //-----------------------------------------------------------------------
1770     // Assembler stub will be used for this call to arraycopy
1771     // if the following conditions are met:
1772     //
1773     // (1) src and dst must not be null.
1774     // (2) src_pos must not be negative.
1775     // (3) dst_pos must not be negative.
1776     // (4) length  must not be negative.
1777     // (5) src klass and dst klass should be the same and not NULL.
1778     // (6) src and dst should be arrays.
1779     // (7) src_pos + length must not exceed length of src.
1780     // (8) dst_pos + length must not exceed length of dst.
1781     //
1782 
1783     //  if (src == NULL) return -1;
1784     __ cbz(src, L_failed);
1785 
1786     //  if (src_pos < 0) return -1;
1787     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
1788 
1789     //  if (dst == NULL) return -1;
1790     __ cbz(dst, L_failed);
1791 
1792     //  if (dst_pos < 0) return -1;
1793     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
1794 
1795     // registers used as temp
1796     const Register scratch_length    = r16; // elements count to copy
1797     const Register scratch_src_klass = r17; // array klass
1798     const Register lh                = r18; // layout helper
1799 
1800     //  if (length < 0) return -1;
1801     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
1802     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
1803 
1804     __ load_klass(scratch_src_klass, src);
1805 #ifdef ASSERT
1806     //  assert(src->klass() != NULL);
1807     {
1808       BLOCK_COMMENT("assert klasses not null {");
1809       Label L1, L2;
1810       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
1811       __ bind(L1);
1812       __ stop("broken null klass");
1813       __ bind(L2);
1814       __ load_klass(rscratch1, dst);
1815       __ cbz(rscratch1, L1);     // this would be broken also
1816       BLOCK_COMMENT("} assert klasses not null done");
1817     }
1818 #endif
1819 
1820     // Load layout helper (32-bits)
1821     //
1822     //  |array_tag|     | header_size | element_type |     |log2_element_size|
1823     // 32        30    24            16              8     2                 0
1824     //
1825     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
1826     //
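    // For reference, the fields used below could be decoded in C as
    // (a sketch using the Klass::_lh_* constants):
    //
    //   int log2_esize   = lh & Klass::_lh_log2_element_size_mask;
    //   int header_bytes = (lh >> Klass::_lh_header_size_shift)
    //                      & Klass::_lh_header_size_mask;
    //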
1827 
1828     const int lh_offset = in_bytes(Klass::layout_helper_offset());
1829 
1830     // Handle objArrays completely differently...
1831     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
1832     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
1833     __ movw(rscratch1, objArray_lh);
1834     __ eorw(rscratch2, lh, rscratch1);
1835     __ cbzw(rscratch2, L_objArray);
1836 
1837     //  if (src->klass() != dst->klass()) return -1;
1838     __ load_klass(rscratch2, dst);
1839     __ eor(rscratch2, rscratch2, scratch_src_klass);
1840     __ cbnz(rscratch2, L_failed);
1841 
1842     //  if (!src->is_Array()) return -1;
1843     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
1844 
1845     // At this point, it is known to be a typeArray (array_tag 0x3).
1846 #ifdef ASSERT
1847     {
1848       BLOCK_COMMENT("assert primitive array {");
1849       Label L;
1850       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
1851       __ cmpw(lh, rscratch2);
1852       __ br(Assembler::GE, L);
1853       __ stop("must be a primitive array");
1854       __ bind(L);
1855       BLOCK_COMMENT("} assert primitive array done");
1856     }
1857 #endif
1858 
1859     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
1860                            rscratch2, L_failed);
1861 
1862     // TypeArrayKlass
1863     //
1864     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
1865     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
1866     //
1867 
1868     const Register rscratch1_offset = rscratch1;    // array offset
1869     const Register r18_elsize = lh; // element size
1870 
1871     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
1872            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
1873     __ add(src, src, rscratch1_offset);           // src array offset
1874     __ add(dst, dst, rscratch1_offset);           // dst array offset
1875     BLOCK_COMMENT("choose copy loop based on element size");
1876 
1877     // next registers should be set before the jump to corresponding stub
1878     const Register from     = c_rarg0;  // source array address
1879     const Register to       = c_rarg1;  // destination array address
1880     const Register count    = c_rarg2;  // elements count
1881 
1882     // 'from', 'to', 'count' registers should be set in this order
1883     // since they are the same as 'src', 'src_pos', 'dst'.
1884 
1885     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
1886 
1887     // The possible values of elsize are 0-3, i.e. exact_log2(element
1888     // size in bytes).  We do a simple bitwise binary search.
1889   __ BIND(L_copy_bytes);
1890     __ tbnz(r18_elsize, 1, L_copy_ints);
1891     __ tbnz(r18_elsize, 0, L_copy_shorts);
1892     __ lea(from, Address(src, src_pos));// src_addr
1893     __ lea(to,   Address(dst, dst_pos));// dst_addr
1894     __ movw(count, scratch_length); // length
1895     __ b(RuntimeAddress(byte_copy_entry));
1896 
1897   __ BIND(L_copy_shorts);
1898     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
1899     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
1900     __ movw(count, scratch_length); // length
1901     __ b(RuntimeAddress(short_copy_entry));
1902 
1903   __ BIND(L_copy_ints);
1904     __ tbnz(r18_elsize, 0, L_copy_longs);
1905     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
1906     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
1907     __ movw(count, scratch_length); // length
1908     __ b(RuntimeAddress(int_copy_entry));
1909 
1910   __ BIND(L_copy_longs);
1911 #ifdef ASSERT
1912     {
1913       BLOCK_COMMENT("assert long copy {");
1914       Label L;
1915       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
1916       __ cmpw(r18_elsize, LogBytesPerLong);
1917       __ br(Assembler::EQ, L);
1918       __ stop("must be long copy, but elsize is wrong");
1919       __ bind(L);
1920       BLOCK_COMMENT("} assert long copy done");
1921     }
1922 #endif
1923     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
1924     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
1925     __ movw(count, scratch_length); // length
1926     __ b(RuntimeAddress(long_copy_entry));
1927 
1928     // ObjArrayKlass
1929   __ BIND(L_objArray);
1930     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
1931 
1932     Label L_plain_copy, L_checkcast_copy;
1933     //  test array classes for subtyping
1934     __ load_klass(r18, dst);
1935     __ cmp(scratch_src_klass, r18); // usual case is exact equality
1936     __ br(Assembler::NE, L_checkcast_copy);
1937 
1938     // Identically typed arrays can be copied without element-wise checks.
1939     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
1940                            rscratch2, L_failed);
1941 
1942     __ lea(from, Address(src, src_pos, Address::lsl(3)));
1943     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
1944     __ lea(to, Address(dst, dst_pos, Address::lsl(3)));
1945     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
1946     __ movw(count, scratch_length); // length
1947   __ BIND(L_plain_copy);
1948     __ b(RuntimeAddress(oop_copy_entry));
1949 
1950   __ BIND(L_checkcast_copy);
1951     // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
1952     {
1953       // Before looking at dst.length, make sure dst is also an objArray.
1954       __ ldrw(rscratch1, Address(r18, lh_offset));
1955       __ movw(rscratch2, objArray_lh);
1956       __ eorw(rscratch1, rscratch1, rscratch2);
1957       __ cbnzw(rscratch1, L_failed);
1958 
1959       // It is safe to examine both src.length and dst.length.
1960       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
1961                              r18, L_failed);
1962 
1963       const Register rscratch2_dst_klass = rscratch2;
1964       __ load_klass(rscratch2_dst_klass, dst); // reload
1965 
1966       // Marshal the base address arguments now, freeing registers.
1967       __ lea(from, Address(src, src_pos, Address::lsl(3)));
1968       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
1969       __ lea(to, Address(dst, dst_pos, Address::lsl(3)));
1970       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
1971       __ movw(count, length);           // length (reloaded)
1972       Register sco_temp = c_rarg3;      // this register is free now
1973       assert_different_registers(from, to, count, sco_temp,
1974                                  rscratch2_dst_klass, scratch_src_klass);
1975       // assert_clean_int(count, sco_temp);
1976 
1977       // Generate the type check.
1978       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
1979       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
1980       // assert_clean_int(sco_temp, r18);
1981       generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy);
1982 
1983       // Fetch destination element klass from the ObjArrayKlass header.
1984       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
1985       __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset));
1986       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
1987 
1988       // the checkcast_copy loop needs two extra arguments:
1989       assert(c_rarg3 == sco_temp, "#3 already in place");
1990       // Set up arguments for checkcast_copy_entry.
1991       __ mov(c_rarg4, rscratch2_dst_klass);  // dst.klass.element_klass
1992       __ b(RuntimeAddress(checkcast_copy_entry));
1993     }
1994 
1995   __ BIND(L_failed);
1996     __ mov(r0, -1);
1997     __ leave();   // required for proper stackwalking of RuntimeStub frame
1998     __ ret(lr);
1999 
2000     return start;
2001   }
2002 
2003   void generate_arraycopy_stubs() {
2004     address entry;
2005     address entry_jbyte_arraycopy;
2006     address entry_jshort_arraycopy;
2007     address entry_jint_arraycopy;
2008     address entry_oop_arraycopy;
2009     address entry_jlong_arraycopy;
2010     address entry_checkcast_arraycopy;
2011 
2012     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2013     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2014 
2015     //*** jbyte
2016     // Always need aligned and unaligned versions
2017     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2018                                                                                   "jbyte_disjoint_arraycopy");
2019     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2020                                                                                   &entry_jbyte_arraycopy,
2021                                                                                   "jbyte_arraycopy");
2022     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2023                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2024     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2025                                                                                   "arrayof_jbyte_arraycopy");
2026 
2027     //*** jshort
2028     // Always need aligned and unaligned versions
2029     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2030                                                                                     "jshort_disjoint_arraycopy");
2031     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2032                                                                                     &entry_jshort_arraycopy,
2033                                                                                     "jshort_arraycopy");
2034     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2035                                                                                     "arrayof_jshort_disjoint_arraycopy");
2036     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2037                                                                                     "arrayof_jshort_arraycopy");
2038 
2039     //*** jint
2040     // Aligned versions
2041     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2042                                                                                 "arrayof_jint_disjoint_arraycopy");
2043     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2044                                                                                 "arrayof_jint_arraycopy");
2045     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2046     // entry_jint_arraycopy always points to the unaligned version
2047     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2048                                                                                 "jint_disjoint_arraycopy");
2049     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2050                                                                                 &entry_jint_arraycopy,
2051                                                                                 "jint_arraycopy");
2052 
2053     //*** jlong
2054     // It is always aligned
2055     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2056                                                                                   "arrayof_jlong_disjoint_arraycopy");
2057     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2058                                                                                   "arrayof_jlong_arraycopy");
2059     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2060     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2061 
2062     //*** oops
2063     {
2064       // With compressed oops we need unaligned versions; notice that
2065       // we overwrite entry_oop_arraycopy.
2066       bool aligned = !UseCompressedOops;
2067 
2068       StubRoutines::_arrayof_oop_disjoint_arraycopy
2069         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2070                                      /*dest_uninitialized*/false);
2071       StubRoutines::_arrayof_oop_arraycopy
2072         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2073                                      /*dest_uninitialized*/false);
2074       // Aligned versions without pre-barriers
2075       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2076         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2077                                      /*dest_uninitialized*/true);
2078       StubRoutines::_arrayof_oop_arraycopy_uninit
2079         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2080                                      /*dest_uninitialized*/true);
2081     }
2082 
2083     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2084     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2085     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2086     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2087 
2088     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2089     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2090                                                                         /*dest_uninitialized*/true);
2091 
2092     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2093                                                               entry_jbyte_arraycopy);
2094 
2095     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2096                                                                entry_jbyte_arraycopy,
2097                                                                entry_jshort_arraycopy,
2098                                                                entry_jint_arraycopy,
2099                                                                entry_oop_arraycopy,
2100                                                                entry_jlong_arraycopy,
2101                                                                entry_checkcast_arraycopy);
2102 
2103   }
2104 
2105   void generate_math_stubs() { Unimplemented(); }
2106 
2107   // Arguments:
2108   //
2109   // Inputs:
2110   //   c_rarg0   - source byte array address
2111   //   c_rarg1   - destination byte array address
2112   //   c_rarg2   - K (key) in little endian int array
2113   //
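  // The expanded key occupies 4*(rounds+1) 32-bit ints, so its length
  // selects the AES variant handled below; roughly (a sketch):
  //
  //   rounds = (keylen == 44) ? 10      // AES-128
  //          : (keylen == 52) ? 12      // AES-192
  //          :                  14;     // AES-256
  //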
2114   address generate_aescrypt_encryptBlock() {
2115     __ align(CodeEntryAlignment);
2116     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2117 
2118     Label L_doLast;
2119 
2120     const Register from        = c_rarg0;  // source array address
2121     const Register to          = c_rarg1;  // destination array address
2122     const Register key         = c_rarg2;  // key array address
2123     const Register keylen      = rscratch1;
2124 
2125     address start = __ pc();
2126     __ enter();
2127 
2128     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2129 
2130     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2131 
2132     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2133     __ rev32(v1, __ T16B, v1);
2134     __ rev32(v2, __ T16B, v2);
2135     __ rev32(v3, __ T16B, v3);
2136     __ rev32(v4, __ T16B, v4);
2137     __ aese(v0, v1);
2138     __ aesmc(v0, v0);
2139     __ aese(v0, v2);
2140     __ aesmc(v0, v0);
2141     __ aese(v0, v3);
2142     __ aesmc(v0, v0);
2143     __ aese(v0, v4);
2144     __ aesmc(v0, v0);
2145 
2146     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2147     __ rev32(v1, __ T16B, v1);
2148     __ rev32(v2, __ T16B, v2);
2149     __ rev32(v3, __ T16B, v3);
2150     __ rev32(v4, __ T16B, v4);
2151     __ aese(v0, v1);
2152     __ aesmc(v0, v0);
2153     __ aese(v0, v2);
2154     __ aesmc(v0, v0);
2155     __ aese(v0, v3);
2156     __ aesmc(v0, v0);
2157     __ aese(v0, v4);
2158     __ aesmc(v0, v0);
2159 
2160     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2161     __ rev32(v1, __ T16B, v1);
2162     __ rev32(v2, __ T16B, v2);
2163 
2164     __ cmpw(keylen, 44);
2165     __ br(Assembler::EQ, L_doLast);
2166 
2167     __ aese(v0, v1);
2168     __ aesmc(v0, v0);
2169     __ aese(v0, v2);
2170     __ aesmc(v0, v0);
2171 
2172     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2173     __ rev32(v1, __ T16B, v1);
2174     __ rev32(v2, __ T16B, v2);
2175 
2176     __ cmpw(keylen, 52);
2177     __ br(Assembler::EQ, L_doLast);
2178 
2179     __ aese(v0, v1);
2180     __ aesmc(v0, v0);
2181     __ aese(v0, v2);
2182     __ aesmc(v0, v0);
2183 
2184     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2185     __ rev32(v1, __ T16B, v1);
2186     __ rev32(v2, __ T16B, v2);
2187 
2188     __ BIND(L_doLast);
2189 
2190     __ aese(v0, v1);
2191     __ aesmc(v0, v0);
2192     __ aese(v0, v2);
2193 
2194     __ ld1(v1, __ T16B, key);
2195     __ rev32(v1, __ T16B, v1);
2196     __ eor(v0, __ T16B, v0, v1);
2197 
2198     __ st1(v0, __ T16B, to);
2199 
2200     __ mov(r0, 0);
2201 
2202     __ leave();
2203     __ ret(lr);
2204 
2205     return start;
2206   }
2207 
2208   // Arguments:
2209   //
2210   // Inputs:
2211   //   c_rarg0   - source byte array address
2212   //   c_rarg1   - destination byte array address
2213   //   c_rarg2   - K (key) in little endian int array
2214   //
2215   address generate_aescrypt_decryptBlock() {
2216     assert(UseAES, "need AES instruction support");
2217     __ align(CodeEntryAlignment);
2218     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2219     Label L_doLast;
2220 
2221     const Register from        = c_rarg0;  // source array address
2222     const Register to          = c_rarg1;  // destination array address
2223     const Register key         = c_rarg2;  // key array address
2224     const Register keylen      = rscratch1;
2225 
2226     address start = __ pc();
2227     __ enter(); // required for proper stackwalking of RuntimeStub frame
2228 
2229     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2230 
2231     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2232 
2233     __ ld1(v5, __ T16B, __ post(key, 16));
2234     __ rev32(v5, __ T16B, v5);
2235 
2236     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2237     __ rev32(v1, __ T16B, v1);
2238     __ rev32(v2, __ T16B, v2);
2239     __ rev32(v3, __ T16B, v3);
2240     __ rev32(v4, __ T16B, v4);
2241     __ aesd(v0, v1);
2242     __ aesimc(v0, v0);
2243     __ aesd(v0, v2);
2244     __ aesimc(v0, v0);
2245     __ aesd(v0, v3);
2246     __ aesimc(v0, v0);
2247     __ aesd(v0, v4);
2248     __ aesimc(v0, v0);
2249 
2250     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2251     __ rev32(v1, __ T16B, v1);
2252     __ rev32(v2, __ T16B, v2);
2253     __ rev32(v3, __ T16B, v3);
2254     __ rev32(v4, __ T16B, v4);
2255     __ aesd(v0, v1);
2256     __ aesimc(v0, v0);
2257     __ aesd(v0, v2);
2258     __ aesimc(v0, v0);
2259     __ aesd(v0, v3);
2260     __ aesimc(v0, v0);
2261     __ aesd(v0, v4);
2262     __ aesimc(v0, v0);
2263 
2264     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2265     __ rev32(v1, __ T16B, v1);
2266     __ rev32(v2, __ T16B, v2);
2267 
2268     __ cmpw(keylen, 44);
2269     __ br(Assembler::EQ, L_doLast);
2270 
2271     __ aesd(v0, v1);
2272     __ aesimc(v0, v0);
2273     __ aesd(v0, v2);
2274     __ aesimc(v0, v0);
2275 
2276     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2277     __ rev32(v1, __ T16B, v1);
2278     __ rev32(v2, __ T16B, v2);
2279 
2280     __ cmpw(keylen, 52);
2281     __ br(Assembler::EQ, L_doLast);
2282 
2283     __ aesd(v0, v1);
2284     __ aesimc(v0, v0);
2285     __ aesd(v0, v2);
2286     __ aesimc(v0, v0);
2287 
2288     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2289     __ rev32(v1, __ T16B, v1);
2290     __ rev32(v2, __ T16B, v2);
2291 
2292     __ BIND(L_doLast);
2293 
2294     __ aesd(v0, v1);
2295     __ aesimc(v0, v0);
2296     __ aesd(v0, v2);
2297 
2298     __ eor(v0, __ T16B, v0, v5);
2299 
2300     __ st1(v0, __ T16B, to);
2301 
2302     __ mov(r0, 0);
2303 
2304     __ leave();
2305     __ ret(lr);
2306 
2307     return start;
2308   }
2309 
2310   // Arguments:
2311   //
2312   // Inputs:
2313   //   c_rarg0   - source byte array address
2314   //   c_rarg1   - destination byte array address
2315   //   c_rarg2   - K (key) in little endian int array
2316   //   c_rarg3   - r vector byte array address
2317   //   c_rarg4   - input length
2318   //
2319   // Output:
2320   //   r0        - input length
2321   //
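  // This is standard CBC encryption; per 16-byte block (a sketch, with
  // C[-1] taken from the r vector):
  //
  //   C[i] = AES_encrypt(P[i] ^ C[i-1], key)
  //
  // On return the r vector holds the last ciphertext block, so a later
  // call can continue the chain.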
2322   address generate_cipherBlockChaining_encryptAESCrypt() {
2323     assert(UseAES, "need AES instruction support");
2324     __ align(CodeEntryAlignment);
2325     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2326 
2327     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2328 
2329     const Register from        = c_rarg0;  // source array address
2330     const Register to          = c_rarg1;  // destination array address
2331     const Register key         = c_rarg2;  // key array address
2332     const Register rvec        = c_rarg3;  // r byte array address, initialized from the init vector
2333                                            // and left holding the last ciphertext block on return
2334     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2335     const Register keylen      = rscratch1;
2336 
2337     address start = __ pc();
2338       __ enter();
2339 
2340       __ mov(rscratch2, len_reg);
2341       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2342 
2343       __ ld1(v0, __ T16B, rvec);
2344 
2345       __ cmpw(keylen, 52);
2346       __ br(Assembler::CC, L_loadkeys_44);
2347       __ br(Assembler::EQ, L_loadkeys_52);
2348 
2349       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2350       __ rev32(v17, __ T16B, v17);
2351       __ rev32(v18, __ T16B, v18);
2352     __ BIND(L_loadkeys_52);
2353       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2354       __ rev32(v19, __ T16B, v19);
2355       __ rev32(v20, __ T16B, v20);
2356     __ BIND(L_loadkeys_44);
2357       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2358       __ rev32(v21, __ T16B, v21);
2359       __ rev32(v22, __ T16B, v22);
2360       __ rev32(v23, __ T16B, v23);
2361       __ rev32(v24, __ T16B, v24);
2362       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2363       __ rev32(v25, __ T16B, v25);
2364       __ rev32(v26, __ T16B, v26);
2365       __ rev32(v27, __ T16B, v27);
2366       __ rev32(v28, __ T16B, v28);
2367       __ ld1(v29, v30, v31, __ T16B, key);
2368       __ rev32(v29, __ T16B, v29);
2369       __ rev32(v30, __ T16B, v30);
2370       __ rev32(v31, __ T16B, v31);
2371 
2372     __ BIND(L_aes_loop);
2373       __ ld1(v1, __ T16B, __ post(from, 16));
2374       __ eor(v0, __ T16B, v0, v1);
2375 
2376       __ br(Assembler::CC, L_rounds_44);
2377       __ br(Assembler::EQ, L_rounds_52);
2378 
2379       __ aese(v0, v17); __ aesmc(v0, v0);
2380       __ aese(v0, v18); __ aesmc(v0, v0);
2381     __ BIND(L_rounds_52);
2382       __ aese(v0, v19); __ aesmc(v0, v0);
2383       __ aese(v0, v20); __ aesmc(v0, v0);
2384     __ BIND(L_rounds_44);
2385       __ aese(v0, v21); __ aesmc(v0, v0);
2386       __ aese(v0, v22); __ aesmc(v0, v0);
2387       __ aese(v0, v23); __ aesmc(v0, v0);
2388       __ aese(v0, v24); __ aesmc(v0, v0);
2389       __ aese(v0, v25); __ aesmc(v0, v0);
2390       __ aese(v0, v26); __ aesmc(v0, v0);
2391       __ aese(v0, v27); __ aesmc(v0, v0);
2392       __ aese(v0, v28); __ aesmc(v0, v0);
2393       __ aese(v0, v29); __ aesmc(v0, v0);
2394       __ aese(v0, v30);
2395       __ eor(v0, __ T16B, v0, v31);
2396 
2397       __ st1(v0, __ T16B, __ post(to, 16));
2398       __ sub(len_reg, len_reg, 16);
2399       __ cbnz(len_reg, L_aes_loop);
2400 
2401       __ st1(v0, __ T16B, rvec);
2402 
2403       __ mov(r0, rscratch2);
2404 
2405       __ leave();
2406       __ ret(lr);
2407 
2408       return start;
2409   }
2410 
2411   // Arguments:
2412   //
2413   // Inputs:
2414   //   c_rarg0   - source byte array address
2415   //   c_rarg1   - destination byte array address
2416   //   c_rarg2   - K (key) in little endian int array
2417   //   c_rarg3   - r vector byte array address
2418   //   c_rarg4   - input length
2419   //
2420   // Output:
2421   //   r0        - input length
2422   //
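  // This is standard CBC decryption; per 16-byte block (a sketch, with
  // C[-1] taken from the r vector):
  //
  //   P[i] = AES_decrypt(C[i], key) ^ C[i-1]
  //
  // On return the r vector holds the last ciphertext block processed.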
2423   address generate_cipherBlockChaining_decryptAESCrypt() {
2424     assert(UseAES, "need AES instruction support");
2425     __ align(CodeEntryAlignment);
2426     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2427 
2428     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2429 
2430     const Register from        = c_rarg0;  // source array address
2431     const Register to          = c_rarg1;  // destination array address
2432     const Register key         = c_rarg2;  // key array address
2433     const Register rvec        = c_rarg3;  // r byte array address, initialized from the init vector
2434                                            // and left holding the last ciphertext block processed
2435     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2436     const Register keylen      = rscratch1;
2437 
2438     address start = __ pc();
2439       __ enter();
2440 
2441       __ mov(rscratch2, len_reg);
2442       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2443 
2444       __ ld1(v2, __ T16B, rvec);
2445 
2446       __ ld1(v31, __ T16B, __ post(key, 16));
2447       __ rev32(v31, __ T16B, v31);
2448 
2449       __ cmpw(keylen, 52);
2450       __ br(Assembler::CC, L_loadkeys_44);
2451       __ br(Assembler::EQ, L_loadkeys_52);
2452 
2453       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2454       __ rev32(v17, __ T16B, v17);
2455       __ rev32(v18, __ T16B, v18);
2456     __ BIND(L_loadkeys_52);
2457       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2458       __ rev32(v19, __ T16B, v19);
2459       __ rev32(v20, __ T16B, v20);
2460     __ BIND(L_loadkeys_44);
2461       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2462       __ rev32(v21, __ T16B, v21);
2463       __ rev32(v22, __ T16B, v22);
2464       __ rev32(v23, __ T16B, v23);
2465       __ rev32(v24, __ T16B, v24);
2466       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2467       __ rev32(v25, __ T16B, v25);
2468       __ rev32(v26, __ T16B, v26);
2469       __ rev32(v27, __ T16B, v27);
2470       __ rev32(v28, __ T16B, v28);
2471       __ ld1(v29, v30, __ T16B, key);
2472       __ rev32(v29, __ T16B, v29);
2473       __ rev32(v30, __ T16B, v30);
2474 
2475     __ BIND(L_aes_loop);
2476       __ ld1(v0, __ T16B, __ post(from, 16));
2477       __ orr(v1, __ T16B, v0, v0);
2478 
2479       __ br(Assembler::CC, L_rounds_44);
2480       __ br(Assembler::EQ, L_rounds_52);
2481 
2482       __ aesd(v0, v17); __ aesimc(v0, v0);
2483       __ aesd(v0, v18); __ aesimc(v0, v0);
2484     __ BIND(L_rounds_52);
2485       __ aesd(v0, v19); __ aesimc(v0, v0);
2486       __ aesd(v0, v20); __ aesimc(v0, v0);
2487     __ BIND(L_rounds_44);
2488       __ aesd(v0, v21); __ aesimc(v0, v0);
2489       __ aesd(v0, v22); __ aesimc(v0, v0);
2490       __ aesd(v0, v23); __ aesimc(v0, v0);
2491       __ aesd(v0, v24); __ aesimc(v0, v0);
2492       __ aesd(v0, v25); __ aesimc(v0, v0);
2493       __ aesd(v0, v26); __ aesimc(v0, v0);
2494       __ aesd(v0, v27); __ aesimc(v0, v0);
2495       __ aesd(v0, v28); __ aesimc(v0, v0);
2496       __ aesd(v0, v29); __ aesimc(v0, v0);
2497       __ aesd(v0, v30);
2498       __ eor(v0, __ T16B, v0, v31);
2499       __ eor(v0, __ T16B, v0, v2);
2500 
2501       __ st1(v0, __ T16B, __ post(to, 16));
2502       __ orr(v2, __ T16B, v1, v1);
2503 
2504       __ sub(len_reg, len_reg, 16);
2505       __ cbnz(len_reg, L_aes_loop);
2506 
2507       __ st1(v2, __ T16B, rvec);
2508 
2509       __ mov(r0, rscratch2);
2510 
2511       __ leave();
2512       __ ret(lr);
2513 
2514     return start;
2515   }
2516 
2517   // Arguments:
2518   //
2519   // Inputs:
2520   //   c_rarg0   - byte[]  source+offset
2521   //   c_rarg1   - int[]   SHA.state
2522   //   c_rarg2   - int     offset
2523   //   c_rarg3   - int     limit
2524   //
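  // When multi_block is true the stub keeps consuming 64-byte blocks and
  // advancing 'ofs' while ofs <= limit, returning the updated offset in
  // c_rarg0; roughly (a sketch):
  //
  //   do { sha1_block(state, buf); buf += 64; ofs += 64; } while (ofs <= limit);
  //   return ofs;
  //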
2525   address generate_sha1_implCompress(bool multi_block, const char *name) {
2526     __ align(CodeEntryAlignment);
2527     StubCodeMark mark(this, "StubRoutines", name);
2528     address start = __ pc();
2529 
2530     Register buf   = c_rarg0;
2531     Register state = c_rarg1;
2532     Register ofs   = c_rarg2;
2533     Register limit = c_rarg3;
2534 
2535     Label keys;
2536     Label sha1_loop;
2537 
2538     // load the keys into v0..v3
2539     __ adr(rscratch1, keys);
2540     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
2541     // load 5 words state into v6, v7
2542     __ ldrq(v6, Address(state, 0));
2543     __ ldrs(v7, Address(state, 16));
2544 
2545 
2546     __ BIND(sha1_loop);
2547     // load 64 bytes of data into v16..v19
2548     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
2549     __ rev32(v16, __ T16B, v16);
2550     __ rev32(v17, __ T16B, v17);
2551     __ rev32(v18, __ T16B, v18);
2552     __ rev32(v19, __ T16B, v19);
2553 
2554     // do the sha1
2555     __ addv(v4, __ T4S, v16, v0);
2556     __ orr(v20, __ T16B, v6, v6);
2557 
2558     FloatRegister d0 = v16;
2559     FloatRegister d1 = v17;
2560     FloatRegister d2 = v18;
2561     FloatRegister d3 = v19;
2562 
2563     for (int round = 0; round < 20; round++) {
2564       FloatRegister tmp1 = (round & 1) ? v4 : v5;
2565       FloatRegister tmp2 = (round & 1) ? v21 : v22;
2566       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
2567       FloatRegister tmp4 = (round & 1) ? v5 : v4;
2568       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
2569 
2570       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
2571       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
2572       __ sha1h(tmp2, __ T4S, v20);
2573       if (round < 5)
2574         __ sha1c(v20, __ T4S, tmp3, tmp4);
2575       else if (round < 10 || round >= 15)
2576         __ sha1p(v20, __ T4S, tmp3, tmp4);
2577       else
2578         __ sha1m(v20, __ T4S, tmp3, tmp4);
2579       if (round < 16) __ sha1su1(d0, __ T4S, d3);
2580 
2581       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
2582     }
2583 
2584     __ addv(v7, __ T2S, v7, v21);
2585     __ addv(v6, __ T4S, v6, v20);
2586 
2587     if (multi_block) {
2588       __ add(ofs, ofs, 64);
2589       __ cmp(ofs, limit);
2590       __ br(Assembler::LE, sha1_loop);
2591       __ mov(c_rarg0, ofs); // return ofs
2592     }
2593 
2594     __ strq(v6, Address(state, 0));
2595     __ strs(v7, Address(state, 16));
2596 
2597     __ ret(lr);
2598 
2599     __ bind(keys);
2600     __ emit_int32(0x5a827999);
2601     __ emit_int32(0x6ed9eba1);
2602     __ emit_int32(0x8f1bbcdc);
2603     __ emit_int32(0xca62c1d6);
2604 
2605     return start;
2606   }
2607 
2608 
2609   // Arguments:
2610   //
2611   // Inputs:
2612   //   c_rarg0   - byte[]  source+offset
2613   //   c_rarg1   - int[]   SHA.state
2614   //   c_rarg2   - int     offset
2615   //   c_rarg3   - int     limit
2616   //
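  // The 64 round constants below are preloaded into v16..v31 (four per
  // register); the main loop then runs 16 iterations, with each
  // sha256h/sha256h2 pair advancing the hash by four rounds, giving the
  // usual 64 rounds per 64-byte block.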
2617   address generate_sha256_implCompress(bool multi_block, const char *name) {
2618     static const uint32_t round_consts[64] = {
2619       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
2620       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
2621       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
2622       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
2623       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
2624       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
2625       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
2626       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
2627       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
2628       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
2629       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
2630       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
2631       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
2632       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
2633       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
2634       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
2635     };
2636     __ align(CodeEntryAlignment);
2637     StubCodeMark mark(this, "StubRoutines", name);
2638     address start = __ pc();
2639 
2640     Register buf   = c_rarg0;
2641     Register state = c_rarg1;
2642     Register ofs   = c_rarg2;
2643     Register limit = c_rarg3;
2644 
2645     Label sha1_loop;
2646 
2647     __ stpd(v8, v9, __ pre(sp, -32));
2648     __ stpd(v10, v11, Address(sp, 16));
2649 
2650 // dga == v0
2651 // dgb == v1
2652 // dg0 == v2
2653 // dg1 == v3
2654 // dg2 == v4
2655 // t0 == v6
2656 // t1 == v7
2657 
2658     // load 16 keys to v16..v31
2659     __ lea(rscratch1, ExternalAddress((address)round_consts));
2660     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
2661     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
2662     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
2663     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
2664 
2665     // load 8 words (256 bits) state
2666     __ ldpq(v0, v1, state);
2667 
2668     __ BIND(sha1_loop);
2669     // load 64 bytes of data into v8..v11
2670     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
2671     __ rev32(v8, __ T16B, v8);
2672     __ rev32(v9, __ T16B, v9);
2673     __ rev32(v10, __ T16B, v10);
2674     __ rev32(v11, __ T16B, v11);
2675 
2676     __ addv(v6, __ T4S, v8, v16);
2677     __ orr(v2, __ T16B, v0, v0);
2678     __ orr(v3, __ T16B, v1, v1);
2679 
2680     FloatRegister d0 = v8;
2681     FloatRegister d1 = v9;
2682     FloatRegister d2 = v10;
2683     FloatRegister d3 = v11;
2684 
2685 
2686     for (int round = 0; round < 16; round++) {
2687       FloatRegister tmp1 = (round & 1) ? v6 : v7;
2688       FloatRegister tmp2 = (round & 1) ? v7 : v6;
2689       FloatRegister tmp3 = (round & 1) ? v2 : v4;
2690       FloatRegister tmp4 = (round & 1) ? v4 : v2;
2691 
2692       if (round < 12) __ sha256su0(d0, __ T4S, d1);
2693        __ orr(v4, __ T16B, v2, v2);
2694       if (round < 15)
2695         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
2696       __ sha256h(v2, __ T4S, v3, tmp2);
2697       __ sha256h2(v3, __ T4S, v4, tmp2);
2698       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
2699 
2700       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
2701     }
2702 
2703     __ addv(v0, __ T4S, v0, v2);
2704     __ addv(v1, __ T4S, v1, v3);
2705 
2706     if (multi_block) {
2707       __ add(ofs, ofs, 64);
2708       __ cmp(ofs, limit);
2709       __ br(Assembler::LE, sha1_loop);
2710       __ mov(c_rarg0, ofs); // return ofs
2711     }
2712 
2713     __ ldpd(v10, v11, Address(sp, 16));
2714     __ ldpd(v8, v9, __ post(sp, 32));
2715 
2716     __ stpq(v0, v1, state);
2717 
2718     __ ret(lr);
2719 
2720     return start;
2721   }
2722 
2723 #ifndef BUILTIN_SIM
2724   // Safefetch stubs.
2725   void generate_safefetch(const char* name, int size, address* entry,
2726                           address* fault_pc, address* continuation_pc) {
2727     // safefetch signatures:
2728     //   int      SafeFetch32(int*      adr, int      errValue);
2729     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
2730     //
2731     // arguments:
2732     //   c_rarg0 = adr
2733     //   c_rarg1 = errValue
2734     //
2735     // result:
2736     //   r0 = *adr or errValue
2737 
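    // The fault_pc/continuation_pc pair is what makes the fetch "safe":
    // if the load below faults, the signal handler recognizes fault_pc
    // and resumes execution at continuation_pc, so the caller simply
    // sees errValue instead of a crash.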
2738     StubCodeMark mark(this, "StubRoutines", name);
2739 
2740     // Entry point, pc or function descriptor.
2741     *entry = __ pc();
2742 
2743     // Load *adr into c_rarg1, may fault.
2744     *fault_pc = __ pc();
2745     switch (size) {
2746       case 4:
2747         // int32_t
2748         __ ldrw(c_rarg1, Address(c_rarg0, 0));
2749         break;
2750       case 8:
2751         // int64_t
2752         __ ldr(c_rarg1, Address(c_rarg0, 0));
2753         break;
2754       default:
2755         ShouldNotReachHere();
2756     }
2757 
2758     // return errValue or *adr
2759     *continuation_pc = __ pc();
2760     __ mov(r0, c_rarg1);
2761     __ ret(lr);
2762   }
2763 #endif
2764 
2765   /**
2766    *  Arguments:
2767    *
2768    * Inputs:
2769    *   c_rarg0   - int crc
2770    *   c_rarg1   - byte* buf
2771    *   c_rarg2   - int length
2772    *
2773    * Output:
2774    *       r0    - int crc result
2775    */
2776   address generate_updateBytesCRC32() {
2777     assert(UseCRC32Intrinsics, "what are we doing here?");
2778 
2779     __ align(CodeEntryAlignment);
2780     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
2781 
2782     address start = __ pc();
2783 
2784     const Register crc   = c_rarg0;  // crc
2785     const Register buf   = c_rarg1;  // source java byte array address
2786     const Register len   = c_rarg2;  // length
2787     const Register table0 = c_rarg3; // crc_table address
2788     const Register table1 = c_rarg4;
2789     const Register table2 = c_rarg5;
2790     const Register table3 = c_rarg6;
2791     const Register tmp3 = c_rarg7;
2792 
2793     BLOCK_COMMENT("Entry:");
2794     __ enter(); // required for proper stackwalking of RuntimeStub frame
2795 
2796     __ kernel_crc32(crc, buf, len,
2797               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
2798 
2799     __ leave(); // required for proper stackwalking of RuntimeStub frame
2800     __ ret(lr);
2801 
2802     return start;
2803   }
2804 
2805   /**
2806    *  Arguments:
2807    *
2808    * Inputs:
2809    *   c_rarg0   - int crc
2810    *   c_rarg1   - byte* buf
2811    *   c_rarg2   - int length
2812    *   c_rarg3   - int* table
2813    *
2814    * Output:
2815    *       r0   - int crc result
2816    */
2817   address generate_updateBytesCRC32C() {
2818     assert(UseCRC32CIntrinsics, "what are we doing here?");
2819 
2820     __ align(CodeEntryAlignment);
2821     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
2822 
2823     address start = __ pc();
2824 
2825     const Register crc   = c_rarg0;  // crc
2826     const Register buf   = c_rarg1;  // source java byte array address
2827     const Register len   = c_rarg2;  // length
2828     const Register table0 = c_rarg3; // crc_table address
2829     const Register table1 = c_rarg4;
2830     const Register table2 = c_rarg5;
2831     const Register table3 = c_rarg6;
2832     const Register tmp3 = c_rarg7;
2833 
2834     BLOCK_COMMENT("Entry:");
2835     __ enter(); // required for proper stackwalking of RuntimeStub frame
2836 
2837     __ kernel_crc32c(crc, buf, len,
2838               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
2839 
2840     __ leave(); // required for proper stackwalking of RuntimeStub frame
2841     __ ret(lr);
2842 
2843     return start;
2844   }
2845 
2846   /**
2847    *  Arguments:
2848    *
2849    *  Inputs:
2850    *   c_rarg0   - int   adler
2851    *   c_rarg1   - byte* buff
2852    *   c_rarg2   - int   len
2853    *
2854    * Output:
2855    *   c_rarg0   - int adler result
2856    */
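  // Adler-32 maintains two running sums modulo BASE = 65521 (a sketch):
  //
  //   s1 = 1; s2 = 0;
  //   for each byte b { s1 = (s1 + b) % 65521; s2 = (s2 + s1) % 65521; }
  //   adler = (s2 << 16) | s1;
  //
  // The stub below defers the expensive modulo, which is safe for at
  // most NMAX bytes at a time (see the NMAX comment inside).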
2857   address generate_updateBytesAdler32() {
2858     __ align(CodeEntryAlignment);
2859     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
2860     address start = __ pc();
2861 
2862     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
2863 
2864     // Aliases
2865     Register adler  = c_rarg0;
2866     Register s1     = c_rarg0;
2867     Register s2     = c_rarg3;
2868     Register buff   = c_rarg1;
2869     Register len    = c_rarg2;
2870     Register nmax  = r4;
2871     Register base = r5;
2872     Register count = r6;
2873     Register temp0 = rscratch1;
2874     Register temp1 = rscratch2;
2875     Register temp2 = r7;
2876 
2877     // Max number of bytes we can process before having to take the mod
2878     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
2879     unsigned long BASE = 0xfff1;
2880     unsigned long NMAX = 0x15B0;
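
         // In C, approximately, the scalar Adler-32 update this stub implements
         // (a reference sketch only; the generated code below unrolls and pipelines it):
         //
         //   do {
         //     int n = len < NMAX ? len : NMAX;
         //     len -= n;
         //     while (n-- > 0) {
         //       s1 += *buff++;
         //       s2 += s1;
         //     }
         //     s1 %= BASE;
         //     s2 %= BASE;
         //   } while (len > 0);
         //   return (s2 << 16) | s1;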
2881 
2882     __ mov(base, BASE);
2883     __ mov(nmax, NMAX);
2884 
2885     // s1 is initialized to the lower 16 bits of adler
2886     // s2 is initialized to the upper 16 bits of adler
2887     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
2888     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
2889 
2890     // The pipelined loop needs at least 16 elements for one iteration.
2891     // It would detect this itself, but it is more efficient to branch straight to the cleanup loop here.
2892     __ cmp(len, 16);
2893     __ br(Assembler::HS, L_nmax);
2894     __ cbz(len, L_combine);
2895 
2896     __ bind(L_simple_by1_loop);
2897     __ ldrb(temp0, Address(__ post(buff, 1)));
2898     __ add(s1, s1, temp0);
2899     __ add(s2, s2, s1);
2900     __ subs(len, len, 1);
2901     __ br(Assembler::HI, L_simple_by1_loop);
2902 
2903     // s1 = s1 % BASE (s1 < 2 * BASE here, so one conditional subtract suffices)
2904     __ subs(temp0, s1, base);
2905     __ csel(s1, temp0, s1, Assembler::HS);
2906 
2907     // s2 = s2 % BASE: fold the top bits using 2^16 == 15 (mod BASE), then conditionally subtract BASE
2908     __ lsr(temp0, s2, 16);
2909     __ lsl(temp1, temp0, 4);
2910     __ sub(temp1, temp1, temp0);
2911     __ add(s2, temp1, s2, ext::uxth);
2912 
2913     __ subs(temp0, s2, base);
2914     __ csel(s2, temp0, s2, Assembler::HS);
2915 
2916     __ b(L_combine);
2917 
2918     __ bind(L_nmax);
2919     __ subs(len, len, nmax);
2920     __ sub(count, nmax, 16);
2921     __ br(Assembler::LO, L_by16);
2922 
2923     __ bind(L_nmax_loop);
2924 
2925     __ ldp(temp0, temp1, Address(__ post(buff, 16)));
2926 
2927     __ add(s1, s1, temp0, ext::uxtb);
2928     __ ubfx(temp2, temp0, 8, 8);
2929     __ add(s2, s2, s1);
2930     __ add(s1, s1, temp2);
2931     __ ubfx(temp2, temp0, 16, 8);
2932     __ add(s2, s2, s1);
2933     __ add(s1, s1, temp2);
2934     __ ubfx(temp2, temp0, 24, 8);
2935     __ add(s2, s2, s1);
2936     __ add(s1, s1, temp2);
2937     __ ubfx(temp2, temp0, 32, 8);
2938     __ add(s2, s2, s1);
2939     __ add(s1, s1, temp2);
2940     __ ubfx(temp2, temp0, 40, 8);
2941     __ add(s2, s2, s1);
2942     __ add(s1, s1, temp2);
2943     __ ubfx(temp2, temp0, 48, 8);
2944     __ add(s2, s2, s1);
2945     __ add(s1, s1, temp2);
2946     __ add(s2, s2, s1);
2947     __ add(s1, s1, temp0, Assembler::LSR, 56);
2948     __ add(s2, s2, s1);
2949 
2950     __ add(s1, s1, temp1, ext::uxtb);
2951     __ ubfx(temp2, temp1, 8, 8);
2952     __ add(s2, s2, s1);
2953     __ add(s1, s1, temp2);
2954     __ ubfx(temp2, temp1, 16, 8);
2955     __ add(s2, s2, s1);
2956     __ add(s1, s1, temp2);
2957     __ ubfx(temp2, temp1, 24, 8);
2958     __ add(s2, s2, s1);
2959     __ add(s1, s1, temp2);
2960     __ ubfx(temp2, temp1, 32, 8);
2961     __ add(s2, s2, s1);
2962     __ add(s1, s1, temp2);
2963     __ ubfx(temp2, temp1, 40, 8);
2964     __ add(s2, s2, s1);
2965     __ add(s1, s1, temp2);
2966     __ ubfx(temp2, temp1, 48, 8);
2967     __ add(s2, s2, s1);
2968     __ add(s1, s1, temp2);
2969     __ add(s2, s2, s1);
2970     __ add(s1, s1, temp1, Assembler::LSR, 56);
2971     __ add(s2, s2, s1);
2972 
2973     __ subs(count, count, 16);
2974     __ br(Assembler::HS, L_nmax_loop);
2975 
2976     // s1 = s1 % BASE
2977     __ lsr(temp0, s1, 16);
2978     __ lsl(temp1, temp0, 4);
2979     __ sub(temp1, temp1, temp0);
2980     __ add(temp1, temp1, s1, ext::uxth);
2981 
2982     __ lsr(temp0, temp1, 16);
2983     __ lsl(s1, temp0, 4);
2984     __ sub(s1, s1, temp0);
2985     __ add(s1, s1, temp1, ext::uxth);
2986 
2987     __ subs(temp0, s1, base);
2988     __ csel(s1, temp0, s1, Assembler::HS);
2989 
2990     // s2 = s2 % BASE
2991     __ lsr(temp0, s2, 16);
2992     __ lsl(temp1, temp0, 4);
2993     __ sub(temp1, temp1, temp0);
2994     __ add(temp1, temp1, s2, ext::uxth);
2995 
2996     __ lsr(temp0, temp1, 16);
2997     __ lsl(s2, temp0, 4);
2998     __ sub(s2, s2, temp0);
2999     __ add(s2, s2, temp1, ext::uxth);
3000 
3001     __ subs(temp0, s2, base);
3002     __ csel(s2, temp0, s2, Assembler::HS);
3003 
3004     __ subs(len, len, nmax);
3005     __ sub(count, nmax, 16);
3006     __ br(Assembler::HS, L_nmax_loop);
3007 
3008     __ bind(L_by16);
3009     __ adds(len, len, count);
3010     __ br(Assembler::LO, L_by1);
3011 
3012     __ bind(L_by16_loop);
3013 
3014     __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3015 
3016     __ add(s1, s1, temp0, ext::uxtb);
3017     __ ubfx(temp2, temp0, 8, 8);
3018     __ add(s2, s2, s1);
3019     __ add(s1, s1, temp2);
3020     __ ubfx(temp2, temp0, 16, 8);
3021     __ add(s2, s2, s1);
3022     __ add(s1, s1, temp2);
3023     __ ubfx(temp2, temp0, 24, 8);
3024     __ add(s2, s2, s1);
3025     __ add(s1, s1, temp2);
3026     __ ubfx(temp2, temp0, 32, 8);
3027     __ add(s2, s2, s1);
3028     __ add(s1, s1, temp2);
3029     __ ubfx(temp2, temp0, 40, 8);
3030     __ add(s2, s2, s1);
3031     __ add(s1, s1, temp2);
3032     __ ubfx(temp2, temp0, 48, 8);
3033     __ add(s2, s2, s1);
3034     __ add(s1, s1, temp2);
3035     __ add(s2, s2, s1);
3036     __ add(s1, s1, temp0, Assembler::LSR, 56);
3037     __ add(s2, s2, s1);
3038 
3039     __ add(s1, s1, temp1, ext::uxtb);
3040     __ ubfx(temp2, temp1, 8, 8);
3041     __ add(s2, s2, s1);
3042     __ add(s1, s1, temp2);
3043     __ ubfx(temp2, temp1, 16, 8);
3044     __ add(s2, s2, s1);
3045     __ add(s1, s1, temp2);
3046     __ ubfx(temp2, temp1, 24, 8);
3047     __ add(s2, s2, s1);
3048     __ add(s1, s1, temp2);
3049     __ ubfx(temp2, temp1, 32, 8);
3050     __ add(s2, s2, s1);
3051     __ add(s1, s1, temp2);
3052     __ ubfx(temp2, temp1, 40, 8);
3053     __ add(s2, s2, s1);
3054     __ add(s1, s1, temp2);
3055     __ ubfx(temp2, temp1, 48, 8);
3056     __ add(s2, s2, s1);
3057     __ add(s1, s1, temp2);
3058     __ add(s2, s2, s1);
3059     __ add(s1, s1, temp1, Assembler::LSR, 56);
3060     __ add(s2, s2, s1);
3061 
3062     __ subs(len, len, 16);
3063     __ br(Assembler::HS, L_by16_loop);
3064 
3065     __ bind(L_by1);
3066     __ adds(len, len, 15);
3067     __ br(Assembler::LO, L_do_mod);
3068 
3069     __ bind(L_by1_loop);
3070     __ ldrb(temp0, Address(__ post(buff, 1)));
3071     __ add(s1, temp0, s1);
3072     __ add(s2, s2, s1);
3073     __ subs(len, len, 1);
3074     __ br(Assembler::HS, L_by1_loop);
3075 
3076     __ bind(L_do_mod);
3077     // s1 = s1 % BASE
3078     __ lsr(temp0, s1, 16);
3079     __ lsl(temp1, temp0, 4);
3080     __ sub(temp1, temp1, temp0);
3081     __ add(temp1, temp1, s1, ext::uxth);
3082 
3083     __ lsr(temp0, temp1, 16);
3084     __ lsl(s1, temp0, 4);
3085     __ sub(s1, s1, temp0);
3086     __ add(s1, s1, temp1, ext::uxth);
3087 
3088     __ subs(temp0, s1, base);
3089     __ csel(s1, temp0, s1, Assembler::HS);
3090 
3091     // s2 = s2 % BASE
3092     __ lsr(temp0, s2, 16);
3093     __ lsl(temp1, temp0, 4);
3094     __ sub(temp1, temp1, temp0);
3095     __ add(temp1, temp1, s2, ext::uxth);
3096 
3097     __ lsr(temp0, temp1, 16);
3098     __ lsl(s2, temp0, 4);
3099     __ sub(s2, s2, temp0);
3100     __ add(s2, s2, temp1, ext::uxth);
3101 
3102     __ subs(temp0, s2, base);
3103     __ csel(s2, temp0, s2, Assembler::HS);
3104 
3105     // Combine lower bits and higher bits
3106     __ bind(L_combine);
3107     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3108 
3109     __ ret(lr);
3110 
3111     return start;
3112   }
3113 
3114   /**
3115    *  Arguments:
3116    *
3117    *  Input:
3118    *    c_rarg0   - x address
3119    *    c_rarg1   - x length
3120    *    c_rarg2   - y address
3121    *    c_rarg3   - y length
3122    *    c_rarg4   - z address
3123    *    c_rarg5   - z length
3124    */
3125   address generate_multiplyToLen() {
3126     __ align(CodeEntryAlignment);
3127     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3128 
3129     address start = __ pc();
3130     const Register x     = r0;
3131     const Register xlen  = r1;
3132     const Register y     = r2;
3133     const Register ylen  = r3;
3134     const Register z     = r4;
3135     const Register zlen  = r5;
3136 
3137     const Register tmp1  = r10;
3138     const Register tmp2  = r11;
3139     const Register tmp3  = r12;
3140     const Register tmp4  = r13;
3141     const Register tmp5  = r14;
3142     const Register tmp6  = r15;
3143     const Register tmp7  = r16;
3144 
3145     BLOCK_COMMENT("Entry:");
3146     __ enter(); // required for proper stackwalking of RuntimeStub frame
3147     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3148     __ leave(); // required for proper stackwalking of RuntimeStub frame
3149     __ ret(lr);
3150 
3151     return start;
3152   }
3153 
3154   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3155                       FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3156                       FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3157     // Karatsuba multiplication performs a 128*128 -> 256-bit carry-less
3158     // multiplication using three 64*64 -> 128-bit multiplications and a
3159     // few additions (XORs in GF(2)).
3160     //
3161     // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3162     // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3163     //
3164     // Inputs:
3165     //
3166     // A0 in a.d[0]     (subkey)
3167     // A1 in a.d[1]
3168     // (A1+A0) in a1_xor_a0.d[0]
3169     //
3170     // B0 in b.d[0]     (state)
3171     // B1 in b.d[1]
3172 
3173     __ ext(tmp1, __ T16B, b, b, 0x08);
3174     __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
3175     __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
3176     __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
3177     __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3178 
3179     __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3180     __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3181     __ eor(tmp2, __ T16B, tmp2, tmp4);
3182     __ eor(tmp2, __ T16B, tmp2, tmp3);
3183 
3184     // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3185     __ ins(result_hi, __ D, tmp2, 0, 1);
3186     __ ins(result_lo, __ D, tmp2, 1, 0);
3187   }
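
       // In C, approximately, the Karatsuba step above (a sketch: clmul()
       // stands in for a 64x64 -> 128-bit carry-less multiply, which has no
       // portable C spelling, and u128 for a 128-bit XOR-arithmetic type):
       //
       //   u128 C   = clmul(A1, B1);            // pmull2
       //   u128 D   = clmul(A0, B0);            // pmull
       //   u128 E   = clmul(A1 ^ A0, B1 ^ B0);  // pmull with a1_xor_a0
       //   u128 mid = E ^ C ^ D;                // middle 128 bits of the product
       //   result_hi = C ^ (mid >> 64);         // upper 128 bits
       //   result_lo = D ^ (mid << 64);         // lower 128 bits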
3188 
3189   void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3190                     FloatRegister p, FloatRegister z, FloatRegister t1) {
3191     const FloatRegister t0 = result;
3192 
3193     // The GCM field polynomial f is z^128 + p(z), where p =
3194     // z^7+z^2+z+1.
3195     //
3196     //    z^128 === -p(z)  (mod (z^128 + p(z)))
3197     //
3198     // so, given that the product we're reducing is
3199     //    a == lo + hi * z^128
3200     // substituting,
3201     //      === lo - hi * p(z)  (mod (z^128 + p(z)))
3202     //
3203     // we reduce by multiplying hi by p(z) and subtracting the result
3204     // from (i.e. XORing it with) lo.  Because p has no nonzero high
3205     // bits we can do this with two 64-bit multiplications, one for
3206     // each 64-bit half of hi.
3207 
3208     __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3209     __ ext(t1, __ T16B, t0, z, 8);
3210     __ eor(hi, __ T16B, hi, t1);
3211     __ ext(t1, __ T16B, z, t0, 8);
3212     __ eor(lo, __ T16B, lo, t1);
3213     __ pmull(t0, __ T1Q, hi, p, __ T1D);
3214     __ eor(result, __ T16B, lo, t0);
3215   }
3216 
3217   /**
3218    *  Arguments:
3219    *
3220    *  Input:
3221    *  c_rarg0   - current state address
3222    *  c_rarg1   - H key address
3223    *  c_rarg2   - data address
3224    *  c_rarg3   - number of blocks
3225    *
3226    *  Output:
3227    *  Updated state at c_rarg0
3228    */
3229   address generate_ghash_processBlocks() {
3230     // Bafflingly, GCM uses little-endian for the byte order, but
3231     // big-endian for the bit order.  For example, the polynomial 1 is
3232     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
3233     //
3234     // So, we must either reverse the bytes in each word and do
3235     // everything big-endian or reverse the bits in each byte and do
3236     // it little-endian.  On AArch64 it's more idiomatic to reverse
3237     // the bits in each byte (we have an instruction, RBIT, to do
3238     // that) and keep the data in little-endian bit order throughout the
3239     // calculation, bit-reversing the inputs and outputs.
3240 
3241     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
3242     __ align(wordSize * 2);
3243     address p = __ pc();
3244     __ emit_int64(0x87);  // The low-order bits of the field
3245                           // polynomial (i.e. p = z^7+z^2+z+1)
3246                           // repeated in the low and high parts of a
3247                           // 128-bit vector
3248     __ emit_int64(0x87);
3249 
3250     __ align(CodeEntryAlignment);
3251     address start = __ pc();
3252 
3253     Register state   = c_rarg0;
3254     Register subkeyH = c_rarg1;
3255     Register data    = c_rarg2;
3256     Register blocks  = c_rarg3;
3257 
3258     FloatRegister vzr = v30;
3259     __ eor(vzr, __ T16B, vzr, vzr); // zero register
3260 
3261     __ ldrq(v0, Address(state));
3262     __ ldrq(v1, Address(subkeyH));
3263 
3264     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
3265     __ rbit(v0, __ T16B, v0);
3266     __ rev64(v1, __ T16B, v1);
3267     __ rbit(v1, __ T16B, v1);
3268 
3269     __ ldrq(v26, p);
3270 
3271     __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
3272     __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
3273 
3274     {
3275       Label L_ghash_loop;
3276       __ bind(L_ghash_loop);
3277 
3278       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
3279                                                  // reversing each byte
3280       __ rbit(v2, __ T16B, v2);
3281       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
3282 
3283       // Multiply state in v2 by subkey in v1
3284       ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
3285                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
3286                      /*temps*/v6, v20, v18, v21);
3287       // Reduce v7:v5 by the field polynomial
3288       ghash_reduce(v0, v5, v7, v26, vzr, v20);
3289 
3290       __ sub(blocks, blocks, 1);
3291       __ cbnz(blocks, L_ghash_loop);
3292     }
3293 
3294     // The bit-reversed result is at this point in v0
3295     __ rev64(v1, __ T16B, v0);
3296     __ rbit(v1, __ T16B, v1);
3297 
3298     __ st1(v1, __ T16B, state);
3299     __ ret(lr);
3300 
3301     return start;
3302   }
3303 
3304   // Continuation point for throwing of implicit exceptions that are
3305   // not handled in the current activation. Fabricates an exception
3306   // oop and initiates normal exception dispatching in this
3307   // frame. Since we need to preserve callee-saved values (currently
3308   // only for C2, but done for C1 as well) we need a callee-saved oop
3309   // map and therefore have to make these stubs into RuntimeStubs
3310   // rather than BufferBlobs.  If the compiler needs all registers to
3311   // be preserved between the fault point and the exception handler
3312   // then it must assume responsibility for that in
3313   // AbstractCompiler::continuation_for_implicit_null_exception or
3314   // continuation_for_implicit_division_by_zero_exception. All other
3315   // implicit exceptions (e.g., NullPointerException or
3316   // AbstractMethodError on entry) are either at call sites or
3317   // otherwise assume that stack unwinding will be initiated, so
3318   // caller saved registers were assumed volatile in the compiler.
3319 
3320 #undef __
3321 #define __ masm->
3322 
3323   address generate_throw_exception(const char* name,
3324                                    address runtime_entry,
3325                                    Register arg1 = noreg,
3326                                    Register arg2 = noreg) {
3327     // Information about frame layout at time of blocking runtime call.
3328     // Note that we only have to preserve callee-saved registers since
3329     // the compilers are responsible for supplying a continuation point
3330     // if they expect all registers to be preserved.
3331     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
3332     enum layout {
3333       rfp_off = 0,
3334       rfp_off2,
3335       return_off,
3336       return_off2,
3337       framesize // inclusive of return address
3338     };
3339 
3340     int insts_size = 512;
3341     int locs_size  = 64;
3342 
3343     CodeBuffer code(name, insts_size, locs_size);
3344     OopMapSet* oop_maps  = new OopMapSet();
3345     MacroAssembler* masm = new MacroAssembler(&code);
3346 
3347     address start = __ pc();
3348 
3349     // This is an inlined and slightly modified version of call_VM
3350     // which has the ability to fetch the return PC out of
3351     // thread-local storage and also sets up last_Java_sp slightly
3352     // differently than the real call_VM
3353 
3354     __ enter(); // Save FP and LR before call
3355 
3356     assert(is_even(framesize/2), "sp not 16-byte aligned");
3357 
3358     // lr and fp are already in place
3359     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
3360 
3361     int frame_complete = __ pc() - start;
3362 
3363     // Set up last_Java_sp and last_Java_fp
3364     address the_pc = __ pc();
3365     __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1);
3366 
3367     // Call runtime
3368     if (arg1 != noreg) {
3369       assert(arg2 != c_rarg1, "clobbered");
3370       __ mov(c_rarg1, arg1);
3371     }
3372     if (arg2 != noreg) {
3373       __ mov(c_rarg2, arg2);
3374     }
3375     __ mov(c_rarg0, rthread);
3376     BLOCK_COMMENT("call runtime_entry");
3377     __ mov(rscratch1, runtime_entry);
3378     __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);
3379 
3380     // Generate oop map
3381     OopMap* map = new OopMap(framesize, 0);
3382 
3383     oop_maps->add_gc_map(the_pc - start, map);
3384 
3385     __ reset_last_Java_frame(true, true);
3386     __ maybe_isb();
3387 
3388     __ leave();
3389 
3390     // check for pending exceptions
3391 #ifdef ASSERT
3392     Label L;
3393     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
3394     __ cbnz(rscratch1, L);
3395     __ should_not_reach_here();
3396     __ bind(L);
3397 #endif // ASSERT
3398     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3399 
3400 
3401     // codeBlob framesize is in words (not VMRegImpl::slot_size)
3402     RuntimeStub* stub =
3403       RuntimeStub::new_runtime_stub(name,
3404                                     &code,
3405                                     frame_complete,
3406                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3407                                     oop_maps, false);
3408     return stub->entry_point();
3409   }
3410 
3411   class MontgomeryMultiplyGenerator : public MacroAssembler {
3412 
3413     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
3414       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
3415 
3416     RegSet _toSave;
3417     bool _squaring;
3418 
3419   public:
3420     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
3421       : MacroAssembler(as->code()), _squaring(squaring) {
3422 
3423       // Register allocation
3424 
3425       Register reg = c_rarg0;
3426       Pa_base = reg;       // Argument registers
3427       if (squaring)
3428         Pb_base = Pa_base;
3429       else
3430         Pb_base = ++reg;
3431       Pn_base = ++reg;
3432       Rlen = ++reg;
3433       inv = ++reg;
3434       Pm_base = ++reg;
3435 
3436                           // Working registers:
3437       Ra =  ++reg;        // The current digit of a, b, n, and m.
3438       Rb =  ++reg;
3439       Rm =  ++reg;
3440       Rn =  ++reg;
3441 
3442       Pa =  ++reg;        // Pointers to the current/next digit of a, b, n, and m.
3443       Pb =  ++reg;
3444       Pm =  ++reg;
3445       Pn =  ++reg;
3446 
3447       t0 =  ++reg;        // Three registers which form a
3448       t1 =  ++reg;        // triple-precision accumulator.
3449       t2 =  ++reg;
3450 
3451       Ri =  ++reg;        // Inner and outer loop indexes.
3452       Rj =  ++reg;
3453 
3454       Rhi_ab = ++reg;     // Product registers: low and high parts
3455       Rlo_ab = ++reg;     // of a*b and m*n.
3456       Rhi_mn = ++reg;
3457       Rlo_mn = ++reg;
3458 
3459       // r19 and up are callee-saved.
3460       _toSave = RegSet::range(r19, reg) + Pm_base;
3461     }
3462 
3463   private:
3464     void save_regs() {
3465       push(_toSave, sp);
3466     }
3467 
3468     void restore_regs() {
3469       pop(_toSave, sp);
3470     }
3471 
3472     template <typename T>
3473     void unroll_2(Register count, T block) {
3474       Label loop, end, odd;
3475       tbnz(count, 0, odd);
3476       cbz(count, end);
3477       align(16);
3478       bind(loop);
3479       (this->*block)();
3480       bind(odd);
3481       (this->*block)();
3482       subs(count, count, 2);
3483       br(Assembler::GT, loop);
3484       bind(end);
3485     }
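
         // In C, approximately (a sketch: `block' is emitted twice per
         // iteration, and an odd count falls into the second copy first):
         //
         //       if (count & 1) goto odd;
         //       if (count == 0) goto end;
         //   loop: block();
         //   odd:  block();
         //         count -= 2;
         //         if (count > 0) goto loop;
         //   end:  ;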
3486 
3487     template <typename T>
3488     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
3489       Label loop, end, odd;
3490       tbnz(count, 0, odd);
3491       cbz(count, end);
3492       align(16);
3493       bind(loop);
3494       (this->*block)(d, s, tmp);
3495       bind(odd);
3496       (this->*block)(d, s, tmp);
3497       subs(count, count, 2);
3498       br(Assembler::GT, loop);
3499       bind(end);
3500     }
3501 
3502     void pre1(RegisterOrConstant i) {
3503       block_comment("pre1");
3504       // Pa = Pa_base;
3505       // Pb = Pb_base + i;
3506       // Pm = Pm_base;
3507       // Pn = Pn_base + i;
3508       // Ra = *Pa;
3509       // Rb = *Pb;
3510       // Rm = *Pm;
3511       // Rn = *Pn;
3512       ldr(Ra, Address(Pa_base));
3513       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
3514       ldr(Rm, Address(Pm_base));
3515       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3516       lea(Pa, Address(Pa_base));
3517       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
3518       lea(Pm, Address(Pm_base));
3519       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3520 
3521       // Zero the m*n result.
3522       mov(Rhi_mn, zr);
3523       mov(Rlo_mn, zr);
3524     }
3525 
3526     // The core multiply-accumulate step of a Montgomery
3527     // multiplication.  The idea is to schedule operations as a
3528     // pipeline so that instructions with long latencies (loads and
3529     // multiplies) have time to complete before their results are
3530     // used.  This most benefits in-order implementations of the
3531     // architecture but out-of-order ones also benefit.
3532     void step() {
3533       block_comment("step");
3534       // MACC(Ra, Rb, t0, t1, t2);
3535       // Ra = *++Pa;
3536       // Rb = *--Pb;
3537       umulh(Rhi_ab, Ra, Rb);
3538       mul(Rlo_ab, Ra, Rb);
3539       ldr(Ra, pre(Pa, wordSize));
3540       ldr(Rb, pre(Pb, -wordSize));
3541       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
3542                                        // previous iteration.
3543       // MACC(Rm, Rn, t0, t1, t2);
3544       // Rm = *++Pm;
3545       // Rn = *--Pn;
3546       umulh(Rhi_mn, Rm, Rn);
3547       mul(Rlo_mn, Rm, Rn);
3548       ldr(Rm, pre(Pm, wordSize));
3549       ldr(Rn, pre(Pn, -wordSize));
3550       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
3551     }
3552 
3553     void post1() {
3554       block_comment("post1");
3555 
3556       // MACC(Ra, Rb, t0, t1, t2);
3557       // Ra = *++Pa;
3558       // Rb = *--Pb;
3559       umulh(Rhi_ab, Ra, Rb);
3560       mul(Rlo_ab, Ra, Rb);
3561       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
3562       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
3563 
3564       // *Pm = Rm = t0 * inv;
3565       mul(Rm, t0, inv);
3566       str(Rm, Address(Pm));
3567 
3568       // MACC(Rm, Rn, t0, t1, t2);
3569       // t0 = t1; t1 = t2; t2 = 0;
3570       umulh(Rhi_mn, Rm, Rn);
3571 
3572 #ifndef PRODUCT
3573       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
3574       {
3575         mul(Rlo_mn, Rm, Rn);
3576         add(Rlo_mn, t0, Rlo_mn);
3577         Label ok;
3578         cbz(Rlo_mn, ok); {
3579           stop("broken Montgomery multiply");
3580         } bind(ok);
3581       }
3582 #endif
3583       // We have very carefully set things up so that
3584       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
3585       // the lower half of Rm * Rn because we know the result already:
3586       // it must be -t0.  t0 + (-t0) must generate a carry iff
3587       // t0 != 0.  So, rather than do a mul and an adds we just set
3588       // the carry flag iff t0 is nonzero.
3589       //
3590       // mul(Rlo_mn, Rm, Rn);
3591       // adds(zr, t0, Rlo_mn);
3592       subs(zr, t0, 1); // Set carry iff t0 is nonzero
3593       adcs(t0, t1, Rhi_mn);
3594       adc(t1, t2, zr);
3595       mov(t2, zr);
3596     }
3597 
3598     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
3599       block_comment("pre2");
3600       // Pa = Pa_base + i-len;
3601       // Pb = Pb_base + len;
3602       // Pm = Pm_base + i-len;
3603       // Pn = Pn_base + len;
3604 
3605       if (i.is_register()) {
3606         sub(Rj, i.as_register(), len);
3607       } else {
3608         mov(Rj, i.as_constant());
3609         sub(Rj, Rj, len);
3610       }
3611       // Rj == i-len
3612 
3613       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
3614       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
3615       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
3616       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
3617 
3618       // Ra = *++Pa;
3619       // Rb = *--Pb;
3620       // Rm = *++Pm;
3621       // Rn = *--Pn;
3622       ldr(Ra, pre(Pa, wordSize));
3623       ldr(Rb, pre(Pb, -wordSize));
3624       ldr(Rm, pre(Pm, wordSize));
3625       ldr(Rn, pre(Pn, -wordSize));
3626 
3627       mov(Rhi_mn, zr);
3628       mov(Rlo_mn, zr);
3629     }
3630 
3631     void post2(RegisterOrConstant i, RegisterOrConstant len) {
3632       block_comment("post2");
3633       if (i.is_constant()) {
3634         mov(Rj, i.as_constant()-len.as_constant());
3635       } else {
3636         sub(Rj, i.as_register(), len);
3637       }
3638 
3639       adds(t0, t0, Rlo_mn); // The pending m*n, low part
3640 
3641       // As soon as we know the least significant digit of our result,
3642       // store it.
3643       // Pm_base[i-len] = t0;
3644       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
3645 
3646       // t0 = t1; t1 = t2; t2 = 0;
3647       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
3648       adc(t1, t2, zr);
3649       mov(t2, zr);
3650     }
3651 
3652     // A carry in t0 after Montgomery multiplication means that we
3653     // should subtract multiples of n from our result in m.  We'll
3654     // keep doing that until there is no carry.
3655     void normalize(RegisterOrConstant len) {
3656       block_comment("normalize");
3657       // while (t0)
3658       //   t0 = sub(Pm_base, Pn_base, t0, len);
3659       Label loop, post, again;
3660       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
3661       cbz(t0, post); {
3662         bind(again); {
3663           mov(i, zr);
3664           mov(cnt, len);
3665           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
3666           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3667           subs(zr, zr, zr); // set carry flag, i.e. no borrow
3668           align(16);
3669           bind(loop); {
3670             sbcs(Rm, Rm, Rn);
3671             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
3672             add(i, i, 1);
3673             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
3674             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3675             sub(cnt, cnt, 1);
3676           } cbnz(cnt, loop);
3677           sbc(t0, t0, zr);
3678         } cbnz(t0, again);
3679       } bind(post);
3680     }
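
         // In C, approximately, the helper sub() referenced above (a sketch:
         // subtract n from m with borrow propagation, then return t0 minus
         // the final borrow):
         //
         //   unsigned long sub(unsigned long Pm_base[], unsigned long Pn_base[],
         //                     unsigned long t0, int len) {
         //     unsigned long borrow = 0;
         //     for (int i = 0; i < len; i++) {
         //       unsigned long m = Pm_base[i], n = Pn_base[i];
         //       Pm_base[i] = m - n - borrow;
         //       borrow = (m < n) || (m == n && borrow);
         //     }
         //     return t0 - borrow;
         //   }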
3681 
3682     // Move memory at s to d, reversing words.
3683     //    Increments d to end of copied memory
3684     //    Destroys tmp1, tmp2
3685     //    Preserves len
3686     //    Leaves s pointing to the address which was in d at start
3687     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
3688       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
3689 
3690       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
3691       mov(tmp1, len);
3692       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
3693       sub(s, d, len, ext::uxtw, LogBytesPerWord);
3694     }
3695     // where
3696     void reverse1(Register d, Register s, Register tmp) {
3697       ldr(tmp, pre(s, -wordSize));
3698       ror(tmp, tmp, 32);
3699       str(tmp, post(d, wordSize));
3700     }
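
         // In C, approximately, reverse()/reverse1() above (a sketch: the
         // inputs are Java int arrays handled here as 64-bit words, so
         // reversing the ints means reversing the longwords and swapping the
         // two halves of each):
         //
         //   juint *S = (juint *)s + 2*len, *D = (juint *)d;
         //   for (int i = 0; i < len; i++) {
         //     S -= 2;
         //     D[0] = S[1];   // the ror #32 swaps the 32-bit halves
         //     D[1] = S[0];   // of each longword as it is copied
         //     D += 2;
         //   }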
3701 
3702     void step_squaring() {
3703       // An extra ACC
3704       step();
3705       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
3706     }
3707 
3708     void last_squaring(RegisterOrConstant i) {
3709       Label dont;
3710       // if ((i & 1) == 0) {
3711       tbnz(i.as_register(), 0, dont); {
3712         // MACC(Ra, Rb, t0, t1, t2);
3713         // Ra = *++Pa;
3714         // Rb = *--Pb;
3715         umulh(Rhi_ab, Ra, Rb);
3716         mul(Rlo_ab, Ra, Rb);
3717         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
3718       } bind(dont);
3719     }
3720 
3721     void extra_step_squaring() {
3722       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
3723 
3724       // MACC(Rm, Rn, t0, t1, t2);
3725       // Rm = *++Pm;
3726       // Rn = *--Pn;
3727       umulh(Rhi_mn, Rm, Rn);
3728       mul(Rlo_mn, Rm, Rn);
3729       ldr(Rm, pre(Pm, wordSize));
3730       ldr(Rn, pre(Pn, -wordSize));
3731     }
3732 
3733     void post1_squaring() {
3734       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
3735 
3736       // *Pm = Rm = t0 * inv;
3737       mul(Rm, t0, inv);
3738       str(Rm, Address(Pm));
3739 
3740       // MACC(Rm, Rn, t0, t1, t2);
3741       // t0 = t1; t1 = t2; t2 = 0;
3742       umulh(Rhi_mn, Rm, Rn);
3743 
3744 #ifndef PRODUCT
3745       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
3746       {
3747         mul(Rlo_mn, Rm, Rn);
3748         add(Rlo_mn, t0, Rlo_mn);
3749         Label ok;
3750         cbz(Rlo_mn, ok); {
3751           stop("broken Montgomery multiply");
3752         } bind(ok);
3753       }
3754 #endif
3755       // We have very carefully set things up so that
3756       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
3757       // the lower half of Rm * Rn because we know the result already:
3758       // it must be -t0.  t0 + (-t0) must generate a carry iff
3759       // t0 != 0.  So, rather than do a mul and an adds we just set
3760       // the carry flag iff t0 is nonzero.
3761       //
3762       // mul(Rlo_mn, Rm, Rn);
3763       // adds(zr, t0, Rlo_mn);
3764       subs(zr, t0, 1); // Set carry iff t0 is nonzero
3765       adcs(t0, t1, Rhi_mn);
3766       adc(t1, t2, zr);
3767       mov(t2, zr);
3768     }
3769 
3770     void acc(Register Rhi, Register Rlo,
3771              Register t0, Register t1, Register t2) {
3772       adds(t0, t0, Rlo);
3773       adcs(t1, t1, Rhi);
3774       adc(t2, t2, zr);
3775     }
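
         // In C, approximately: acc() is the accumulate half of the MACC()
         // used in the pseudocode after generate_multiply(), adding the
         // 128-bit product Rhi:Rlo into the triple-precision accumulator
         // t2:t1:t0 with carry propagation:
         //
         //   t0 += Rlo;          // adds
         //   t1 += Rhi + carry;  // adcs
         //   t2 += carry;        // adc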
3776 
3777   public:
3778     /**
3779      * Fast Montgomery multiplication.  The derivation of the
3780      * algorithm is in A Cryptographic Library for the Motorola
3781      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
3782      *
3783      * Arguments:
3784      *
3785      * Inputs for multiplication:
3786      *   c_rarg0   - int array elements a
3787      *   c_rarg1   - int array elements b
3788      *   c_rarg2   - int array elements n (the modulus)
3789      *   c_rarg3   - int length
3790      *   c_rarg4   - int inv
3791      *   c_rarg5   - int array elements m (the result)
3792      *
3793      * Inputs for squaring:
3794      *   c_rarg0   - int array elements a
3795      *   c_rarg1   - int array elements n (the modulus)
3796      *   c_rarg2   - int length
3797      *   c_rarg3   - int inv
3798      *   c_rarg4   - int array elements m (the result)
3799      *
3800      */
3801     address generate_multiply() {
3802       Label argh, nothing;
3803       bind(argh);
3804       stop("MontgomeryMultiply total_allocation must be <= 8192");
3805 
3806       align(CodeEntryAlignment);
3807       address entry = pc();
3808 
3809       cbzw(Rlen, nothing);
3810 
3811       enter();
3812 
3813       // Make room.
3814       cmpw(Rlen, 512);
3815       br(Assembler::HI, argh);
3816       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
3817       andr(sp, Ra, -2 * wordSize);
3818 
3819       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
3820 
3821       {
3822         // Copy input args, reversing as we go.  We use Ra as a
3823         // temporary variable.
3824         reverse(Ra, Pa_base, Rlen, t0, t1);
3825         if (!_squaring)
3826           reverse(Ra, Pb_base, Rlen, t0, t1);
3827         reverse(Ra, Pn_base, Rlen, t0, t1);
3828       }
3829 
3830       // Push all call-saved registers and also Pm_base which we'll need
3831       // at the end.
3832       save_regs();
3833 
3834 #ifndef PRODUCT
3835       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
3836       {
3837         ldr(Rn, Address(Pn_base, 0));
3838         mul(Rlo_mn, Rn, inv);
3839         cmp(Rlo_mn, -1);
3840         Label ok;
3841         br(EQ, ok); {
3842           stop("broken inverse in Montgomery multiply");
3843         } bind(ok);
3844       }
3845 #endif
3846 
3847       mov(Pm_base, Ra);
3848 
3849       mov(t0, zr);
3850       mov(t1, zr);
3851       mov(t2, zr);
3852 
3853       block_comment("for (int i = 0; i < len; i++) {");
3854       mov(Ri, zr); {
3855         Label loop, end;
3856         cmpw(Ri, Rlen);
3857         br(Assembler::GE, end);
3858 
3859         bind(loop);
3860         pre1(Ri);
3861 
3862         block_comment("  for (j = i; j; j--) {"); {
3863           movw(Rj, Ri);
3864           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
3865         } block_comment("  } // j");
3866 
3867         post1();
3868         addw(Ri, Ri, 1);
3869         cmpw(Ri, Rlen);
3870         br(Assembler::LT, loop);
3871         bind(end);
3872         block_comment("} // i");
3873       }
3874 
3875       block_comment("for (int i = len; i < 2*len; i++) {");
3876       mov(Ri, Rlen); {
3877         Label loop, end;
3878         cmpw(Ri, Rlen, Assembler::LSL, 1);
3879         br(Assembler::GE, end);
3880 
3881         bind(loop);
3882         pre2(Ri, Rlen);
3883 
3884         block_comment("  for (j = len*2-i-1; j; j--) {"); {
3885           lslw(Rj, Rlen, 1);
3886           subw(Rj, Rj, Ri);
3887           subw(Rj, Rj, 1);
3888           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
3889         } block_comment("  } // j");
3890 
3891         post2(Ri, Rlen);
3892         addw(Ri, Ri, 1);
3893         cmpw(Ri, Rlen, Assembler::LSL, 1);
3894         br(Assembler::LT, loop);
3895         bind(end);
3896       }
3897       block_comment("} // i");
3898 
3899       normalize(Rlen);
3900 
3901       mov(Ra, Pm_base);  // Save Pm_base in Ra
3902       restore_regs();  // Restore caller's Pm_base
3903 
3904       // Copy our result into caller's Pm_base
3905       reverse(Pm_base, Ra, Rlen, t0, t1);
3906 
3907       leave();
3908       bind(nothing);
3909       ret(lr);
3910 
3911       return entry;
3912     }
3913     // In C, approximately:
3914 
3915     // void
3916     // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
3917     //                     unsigned long Pn_base[], unsigned long Pm_base[],
3918     //                     unsigned long inv, int len) {
3919     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
3920     //   unsigned long *Pa, *Pb, *Pn, *Pm;
3921     //   unsigned long Ra, Rb, Rn, Rm;
3922 
3923     //   int i;
3924 
3925     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
3926 
3927     //   for (i = 0; i < len; i++) {
3928     //     int j;
3929 
3930     //     Pa = Pa_base;
3931     //     Pb = Pb_base + i;
3932     //     Pm = Pm_base;
3933     //     Pn = Pn_base + i;
3934 
3935     //     Ra = *Pa;
3936     //     Rb = *Pb;
3937     //     Rm = *Pm;
3938     //     Rn = *Pn;
3939 
3940     //     int iters = i;
3941     //     for (j = 0; iters--; j++) {
3942     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
3943     //       MACC(Ra, Rb, t0, t1, t2);
3944     //       Ra = *++Pa;
3945     //       Rb = *--Pb;
3946     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
3947     //       MACC(Rm, Rn, t0, t1, t2);
3948     //       Rm = *++Pm;
3949     //       Rn = *--Pn;
3950     //     }
3951 
3952     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
3953     //     MACC(Ra, Rb, t0, t1, t2);
3954     //     *Pm = Rm = t0 * inv;
3955     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
3956     //     MACC(Rm, Rn, t0, t1, t2);
3957 
3958     //     assert(t0 == 0, "broken Montgomery multiply");
3959 
3960     //     t0 = t1; t1 = t2; t2 = 0;
3961     //   }
3962 
3963     //   for (i = len; i < 2*len; i++) {
3964     //     int j;
3965 
3966     //     Pa = Pa_base + i-len;
3967     //     Pb = Pb_base + len;
3968     //     Pm = Pm_base + i-len;
3969     //     Pn = Pn_base + len;
3970 
3971     //     Ra = *++Pa;
3972     //     Rb = *--Pb;
3973     //     Rm = *++Pm;
3974     //     Rn = *--Pn;
3975 
3976     //     int iters = len*2-i-1;
3977     //     for (j = i-len+1; iters--; j++) {
3978     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
3979     //       MACC(Ra, Rb, t0, t1, t2);
3980     //       Ra = *++Pa;
3981     //       Rb = *--Pb;
3982     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
3983     //       MACC(Rm, Rn, t0, t1, t2);
3984     //       Rm = *++Pm;
3985     //       Rn = *--Pn;
3986     //     }
3987 
3988     //     Pm_base[i-len] = t0;
3989     //     t0 = t1; t1 = t2; t2 = 0;
3990     //   }
3991 
3992     //   while (t0)
3993     //     t0 = sub(Pm_base, Pn_base, t0, len);
3994     // }
3995 
3996     /**
3997      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
3998      * multiplies than Montgomery multiplication so it should be up to
3999      * 25% faster.  However, its loop control is more complex and it
4000      * may actually run slower on some machines.
4001      *
4002      * Arguments:
4003      *
4004      * Inputs:
4005      *   c_rarg0   - int array elements a
4006      *   c_rarg1   - int array elements n (the modulus)
4007      *   c_rarg2   - int length
4008      *   c_rarg3   - int inv
4009      *   c_rarg4   - int array elements m (the result)
4010      *
4011      */
4012     address generate_square() {
4013       Label argh;
4014       bind(argh);
4015       stop("MontgomeryMultiply total_allocation must be <= 8192");
4016 
4017       align(CodeEntryAlignment);
4018       address entry = pc();
4019 
4020       enter();
4021 
4022       // Make room.
4023       cmpw(Rlen, 512);
4024       br(Assembler::HI, argh);
4025       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
4026       andr(sp, Ra, -2 * wordSize);
4027 
4028       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
4029 
4030       {
4031         // Copy input args, reversing as we go.  We use Ra as a
4032         // temporary variable.
4033         reverse(Ra, Pa_base, Rlen, t0, t1);
4034         reverse(Ra, Pn_base, Rlen, t0, t1);
4035       }
4036 
4037       // Push all call-saved registers and also Pm_base which we'll need
4038       // at the end.
4039       save_regs();
4040 
4041       mov(Pm_base, Ra);
4042 
4043       mov(t0, zr);
4044       mov(t1, zr);
4045       mov(t2, zr);
4046 
4047       block_comment("for (int i = 0; i < len; i++) {");
4048       mov(Ri, zr); {
4049         Label loop, end;
4050         bind(loop);
4051         cmp(Ri, Rlen);
4052         br(Assembler::GE, end);
4053 
4054         pre1(Ri);
4055 
4056         block_comment("for (j = (i+1)/2; j; j--) {"); {
4057           add(Rj, Ri, 1);
4058           lsr(Rj, Rj, 1);
4059           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4060         } block_comment("  } // j");
4061 
4062         last_squaring(Ri);
4063 
4064         block_comment("  for (j = i/2; j; j--) {"); {
4065           lsr(Rj, Ri, 1);
4066           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4067         } block_comment("  } // j");
4068 
4069         post1_squaring();
4070         add(Ri, Ri, 1);
4071         cmp(Ri, Rlen);
4072         br(Assembler::LT, loop);
4073 
4074         bind(end);
4075         block_comment("} // i");
4076       }
4077 
4078       block_comment("for (int i = len; i < 2*len; i++) {");
4079       mov(Ri, Rlen); {
4080         Label loop, end;
4081         bind(loop);
4082         cmp(Ri, Rlen, Assembler::LSL, 1);
4083         br(Assembler::GE, end);
4084 
4085         pre2(Ri, Rlen);
4086 
4087         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
4088           lsl(Rj, Rlen, 1);
4089           sub(Rj, Rj, Ri);
4090           sub(Rj, Rj, 1);
4091           lsr(Rj, Rj, 1);
4092           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4093         } block_comment("  } // j");
4094 
4095         last_squaring(Ri);
4096 
4097         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
4098           lsl(Rj, Rlen, 1);
4099           sub(Rj, Rj, Ri);
4100           lsr(Rj, Rj, 1);
4101           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4102         } block_comment("  } // j");
4103 
4104         post2(Ri, Rlen);
4105         add(Ri, Ri, 1);
4106         cmp(Ri, Rlen, Assembler::LSL, 1);
4107 
4108         br(Assembler::LT, loop);
4109         bind(end);
4110         block_comment("} // i");
4111       }
4112 
4113       normalize(Rlen);
4114 
4115       mov(Ra, Pm_base);  // Save Pm_base in Ra
4116       restore_regs();  // Restore caller's Pm_base
4117 
4118       // Copy our result into caller's Pm_base
4119       reverse(Pm_base, Ra, Rlen, t0, t1);
4120 
4121       leave();
4122       ret(lr);
4123 
4124       return entry;
4125     }
4126     // In C, approximately:
4127 
4128     // void
4129     // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
4130     //                   unsigned long Pm_base[], unsigned long inv, int len) {
4131     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4132     //   unsigned long *Pa, *Pb, *Pn, *Pm;
4133     //   unsigned long Ra, Rb, Rn, Rm;
4134 
4135     //   int i;
4136 
4137     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
4138 
4139     //   for (i = 0; i < len; i++) {
4140     //     int j;
4141 
4142     //     Pa = Pa_base;
4143     //     Pb = Pa_base + i;
4144     //     Pm = Pm_base;
4145     //     Pn = Pn_base + i;
4146 
4147     //     Ra = *Pa;
4148     //     Rb = *Pb;
4149     //     Rm = *Pm;
4150     //     Rn = *Pn;
4151 
4152     //     int iters = (i+1)/2;
4153     //     for (j = 0; iters--; j++) {
4154     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4155     //       MACC2(Ra, Rb, t0, t1, t2);
4156     //       Ra = *++Pa;
4157     //       Rb = *--Pb;
4158     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4159     //       MACC(Rm, Rn, t0, t1, t2);
4160     //       Rm = *++Pm;
4161     //       Rn = *--Pn;
4162     //     }
4163     //     if ((i & 1) == 0) {
4164     //       assert(Ra == Pa_base[j], "must be");
4165     //       MACC(Ra, Ra, t0, t1, t2);
4166     //     }
4167     //     iters = i/2;
4168     //     assert(iters == i-j, "must be");
4169     //     for (; iters--; j++) {
4170     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4171     //       MACC(Rm, Rn, t0, t1, t2);
4172     //       Rm = *++Pm;
4173     //       Rn = *--Pn;
4174     //     }
4175 
4176     //     *Pm = Rm = t0 * inv;
4177     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
4178     //     MACC(Rm, Rn, t0, t1, t2);
4179 
4180     //     assert(t0 == 0, "broken Montgomery multiply");
4181 
4182     //     t0 = t1; t1 = t2; t2 = 0;
4183     //   }
4184 
4185     //   for (i = len; i < 2*len; i++) {
4186     //     int start = i-len+1;
4187     //     int end = start + (len - start)/2;
4188     //     int j;
4189 
4190     //     Pa = Pa_base + i-len;
4191     //     Pb = Pa_base + len;
4192     //     Pm = Pm_base + i-len;
4193     //     Pn = Pn_base + len;
4194 
4195     //     Ra = *++Pa;
4196     //     Rb = *--Pb;
4197     //     Rm = *++Pm;
4198     //     Rn = *--Pn;
4199 
4200     //     int iters = (2*len-i-1)/2;
4201     //     assert(iters == end-start, "must be");
4202     //     for (j = start; iters--; j++) {
4203     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4204     //       MACC2(Ra, Rb, t0, t1, t2);
4205     //       Ra = *++Pa;
4206     //       Rb = *--Pb;
4207     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4208     //       MACC(Rm, Rn, t0, t1, t2);
4209     //       Rm = *++Pm;
4210     //       Rn = *--Pn;
4211     //     }
4212     //     if ((i & 1) == 0) {
4213     //       assert(Ra == Pa_base[j], "must be");
4214     //       MACC(Ra, Ra, t0, t1, t2);
4215     //     }
4216     //     iters =  (2*len-i)/2;
4217     //     assert(iters == len-j, "must be");
4218     //     for (; iters--; j++) {
4219     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4220     //       MACC(Rm, Rn, t0, t1, t2);
4221     //       Rm = *++Pm;
4222     //       Rn = *--Pn;
4223     //     }
4224     //     Pm_base[i-len] = t0;
4225     //     t0 = t1; t1 = t2; t2 = 0;
4226     //   }
4227 
4228     //   while (t0)
4229     //     t0 = sub(Pm_base, Pn_base, t0, len);
4230     // }
4231   };
4232 
4233   // Initialization
4234   void generate_initial() {
4235     // Generate the initial stubs and initialize their entry points
4236 
4237     // Entry points that exist on all platforms.  Note: This is code
4238     // that could be shared among different platforms - however the
4239     // benefit seems to be smaller than the disadvantage of having a
4240     // much more complicated generator structure. See also comment in
4241     // stubRoutines.hpp.
4242 
4243     StubRoutines::_forward_exception_entry = generate_forward_exception();
4244 
4245     StubRoutines::_call_stub_entry =
4246       generate_call_stub(StubRoutines::_call_stub_return_address);
4247 
4248     // is referenced by megamorphic call
4249     StubRoutines::_catch_exception_entry = generate_catch_exception();
4250 
4251     // Build this early so it's available for the interpreter.
4252     StubRoutines::_throw_StackOverflowError_entry =
4253       generate_throw_exception("StackOverflowError throw_exception",
4254                                CAST_FROM_FN_PTR(address,
4255                                                 SharedRuntime::
4256                                                 throw_StackOverflowError));
4257     if (UseCRC32Intrinsics) {
4258       // Set the table address before generating the stubs that use it
4259       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
4260       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
4261     }
4262   }
4263 
4264   void generate_all() {
4265     // support for verify_oop (must happen after universe_init)
4266     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
4267     StubRoutines::_throw_AbstractMethodError_entry =
4268       generate_throw_exception("AbstractMethodError throw_exception",
4269                                CAST_FROM_FN_PTR(address,
4270                                                 SharedRuntime::
4271                                                 throw_AbstractMethodError));
4272 
4273     StubRoutines::_throw_IncompatibleClassChangeError_entry =
4274       generate_throw_exception("IncompatibleClassChangeError throw_exception",
4275                                CAST_FROM_FN_PTR(address,
4276                                                 SharedRuntime::
4277                                                 throw_IncompatibleClassChangeError));
4278 
4279     StubRoutines::_throw_NullPointerException_at_call_entry =
4280       generate_throw_exception("NullPointerException at call throw_exception",
4281                                CAST_FROM_FN_PTR(address,
4282                                                 SharedRuntime::
4283                                                 throw_NullPointerException_at_call));
4284 
4285     // arraycopy stubs used by compilers
4286     generate_arraycopy_stubs();
4287 
4288     if (UseMultiplyToLenIntrinsic) {
4289       StubRoutines::_multiplyToLen = generate_multiplyToLen();
4290     }
4291 
4292     if (UseMontgomeryMultiplyIntrinsic) {
4293       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
4294       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
4295       StubRoutines::_montgomeryMultiply = g.generate_multiply();
4296     }
4297 
4298     if (UseMontgomerySquareIntrinsic) {
4299       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
4300       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
4301       // We use generate_multiply() rather than generate_square()
4302       // because it's faster for the sizes of modulus we care about.
4303       StubRoutines::_montgomerySquare = g.generate_multiply();
4304     }
4305 
4306 #ifndef BUILTIN_SIM
4307     // generate GHASH intrinsics code
4308     if (UseGHASHIntrinsics) {
4309       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
4310     }
4311 
4312     if (UseAESIntrinsics) {
4313       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
4314       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
4315       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
4316       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
4317     }
4318 
4319     if (UseSHA1Intrinsics) {
4320       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
4321       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
4322     }
4323     if (UseSHA256Intrinsics) {
4324       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
4325       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
4326     }
4327 
4328     if (UseCRC32CIntrinsics) {
4329       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
4330     }
4331 
4332     // generate Adler32 intrinsics code
4333     if (UseAdler32Intrinsics) {
4334       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
4335     }
4336 
4337     // Safefetch stubs.
4338     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
4339                                                        &StubRoutines::_safefetch32_fault_pc,
4340                                                        &StubRoutines::_safefetch32_continuation_pc);
4341     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
4342                                                        &StubRoutines::_safefetchN_fault_pc,
4343                                                        &StubRoutines::_safefetchN_continuation_pc);
4344 #endif
4345   }
4346 
4347  public:
4348   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
4349     if (all) {
4350       generate_all();
4351     } else {
4352       generate_initial();
4353     }
4354   }
4355 }; // end class declaration
4356 
4357 void StubGenerator_generate(CodeBuffer* code, bool all) {
4358   StubGenerator g(code, all);
4359 }