1 /*
   2  * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.hpp"
  28 #include "asm/macroAssembler.inline.hpp"
  29 #include "gc/shenandoah/brooksPointer.hpp"
  30 #include "gc/shenandoah/shenandoahBarrierSet.hpp"
  31 #include "gc/shenandoah/shenandoahHeap.hpp"
  32 #include "gc/shenandoah/shenandoahHeapRegion.hpp"
  33 #include "interpreter/interpreter.hpp"
  34 #include "nativeInst_aarch64.hpp"
  35 #include "oops/instanceOop.hpp"
  36 #include "oops/method.hpp"
  37 #include "oops/objArrayKlass.hpp"
  38 #include "oops/oop.inline.hpp"
  39 #include "prims/methodHandles.hpp"
  40 #include "runtime/frame.inline.hpp"
  41 #include "runtime/handles.inline.hpp"
  42 #include "runtime/sharedRuntime.hpp"
  43 #include "runtime/stubCodeGenerator.hpp"
  44 #include "runtime/stubRoutines.hpp"
  45 #include "runtime/thread.inline.hpp"
  46 #ifdef COMPILER2
  47 #include "opto/runtime.hpp"
  48 #endif
  49 
  50 #ifdef BUILTIN_SIM
  51 #include "../../../../../../simulator/simulator.hpp"
  52 #endif
  53 
  54 // Declaration and definition of StubGenerator (no .hpp file).
  55 // For a more detailed description of the stub routine structure
  56 // see the comment in stubRoutines.hpp
  57 
  58 #undef __
  59 #define __ _masm->
  60 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
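     // TIMES_OOP scales an array index by the size of an in-heap oop:
     // 4 bytes when UseCompressedOops is enabled, 8 bytes otherwise.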
  61 
  62 #ifdef PRODUCT
  63 #define BLOCK_COMMENT(str) /* nothing */
  64 #else
  65 #define BLOCK_COMMENT(str) __ block_comment(str)
  66 #endif
  67 
  68 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  69 
  70 // Stub Code definitions
  71 
  72 class StubGenerator: public StubCodeGenerator {
  73  private:
  74 
  75 #ifdef PRODUCT
  76 #define inc_counter_np(counter) ((void)0)
  77 #else
  78   void inc_counter_np_(int& counter) {
  79     __ lea(rscratch2, ExternalAddress((address)&counter));
  80     __ ldrw(rscratch1, Address(rscratch2));
  81     __ addw(rscratch1, rscratch1, 1);
  82     __ strw(rscratch1, Address(rscratch2));
  83   }
  84 #define inc_counter_np(counter) \
  85   BLOCK_COMMENT("inc_counter " #counter); \
  86   inc_counter_np_(counter);
  87 #endif
  88 
  89   // Call stubs are used to call Java from C
  90   //
  91   // Arguments:
  92   //    c_rarg0:   call wrapper address                   address
  93   //    c_rarg1:   result                                 address
  94   //    c_rarg2:   result type                            BasicType
  95   //    c_rarg3:   method                                 Method*
  96   //    c_rarg4:   (interpreter) entry point              address
  97   //    c_rarg5:   parameters                             intptr_t*
  98   //    c_rarg6:   parameter size (in words)              int
  99   //    c_rarg7:   thread                                 Thread*
 100   //
 101   // There is no return from the stub itself as any Java result
 102   // is written to result
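       //
       // On the C side this corresponds roughly to the CallStub function
       // pointer type declared in stubRoutines.hpp, i.e. something like
       //
       //   typedef void (*CallStub)(address link, intptr_t* result,
       //                            BasicType result_type, Method* method,
       //                            address entry_point, intptr_t* parameters,
       //                            int size_of_parameters, Thread* thread);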
 103   //
 104   // we save r30 (lr) as the return PC at the base of the frame and
  105   // link r29 (fp) below it as the frame pointer, installing sp (r31)
 106   // into fp.
 107   //
 108   // we save r0-r7, which accounts for all the c arguments.
 109   //
 110   // TODO: strictly do we need to save them all? they are treated as
 111   // volatile by C so could we omit saving the ones we are going to
 112   // place in global registers (thread? method?) or those we only use
 113   // during setup of the Java call?
 114   //
 115   // we don't need to save r8 which C uses as an indirect result location
 116   // return register.
 117   //
 118   // we don't need to save r9-r15 which both C and Java treat as
 119   // volatile
 120   //
 121   // we don't need to save r16-18 because Java does not use them
 122   //
 123   // we save r19-r28 which Java uses as scratch registers and C
 124   // expects to be callee-save
 125   //
 126   // we save the bottom 64 bits of each value stored in v8-v15; it is
 127   // the responsibility of the caller to preserve larger values.
 128   //
 129   // so the stub frame looks like this when we enter Java code
 130   //
 131   //     [ return_from_Java     ] <--- sp
 132   //     [ argument word n      ]
 133   //      ...
 134   // -27 [ argument word 1      ]
 135   // -26 [ saved v15            ] <--- sp_after_call
 136   // -25 [ saved v14            ]
 137   // -24 [ saved v13            ]
 138   // -23 [ saved v12            ]
 139   // -22 [ saved v11            ]
 140   // -21 [ saved v10            ]
 141   // -20 [ saved v9             ]
 142   // -19 [ saved v8             ]
 143   // -18 [ saved r28            ]
 144   // -17 [ saved r27            ]
 145   // -16 [ saved r26            ]
 146   // -15 [ saved r25            ]
 147   // -14 [ saved r24            ]
 148   // -13 [ saved r23            ]
 149   // -12 [ saved r22            ]
 150   // -11 [ saved r21            ]
 151   // -10 [ saved r20            ]
 152   //  -9 [ saved r19            ]
 153   //  -8 [ call wrapper    (r0) ]
 154   //  -7 [ result          (r1) ]
 155   //  -6 [ result type     (r2) ]
 156   //  -5 [ method          (r3) ]
 157   //  -4 [ entry point     (r4) ]
 158   //  -3 [ parameters      (r5) ]
 159   //  -2 [ parameter size  (r6) ]
  160   //  -1 [ thread          (r7) ]
 161   //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
 162   //   1 [ saved lr       (r30) ]
 163 
 164   // Call stub stack layout word offsets from fp
 165   enum call_stub_layout {
 166     sp_after_call_off = -26,
 167 
 168     d15_off            = -26,
 169     d13_off            = -24,
 170     d11_off            = -22,
 171     d9_off             = -20,
 172 
 173     r28_off            = -18,
 174     r26_off            = -16,
 175     r24_off            = -14,
 176     r22_off            = -12,
 177     r20_off            = -10,
 178     call_wrapper_off   =  -8,
 179     result_off         =  -7,
 180     result_type_off    =  -6,
 181     method_off         =  -5,
 182     entry_point_off    =  -4,
 183     parameter_size_off =  -2,
 184     thread_off         =  -1,
 185     fp_f               =   0,
 186     retaddr_off        =   1,
 187   };
 188 
 189   address generate_call_stub(address& return_address) {
 190     assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
 191            (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
 192            "adjust this code");
 193 
 194     StubCodeMark mark(this, "StubRoutines", "call_stub");
 195     address start = __ pc();
 196 
 197     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 198 
 199     const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
 200     const Address result        (rfp, result_off         * wordSize);
 201     const Address result_type   (rfp, result_type_off    * wordSize);
 202     const Address method        (rfp, method_off         * wordSize);
 203     const Address entry_point   (rfp, entry_point_off    * wordSize);
 204     const Address parameter_size(rfp, parameter_size_off * wordSize);
 205 
 206     const Address thread        (rfp, thread_off         * wordSize);
 207 
 208     const Address d15_save      (rfp, d15_off * wordSize);
 209     const Address d13_save      (rfp, d13_off * wordSize);
 210     const Address d11_save      (rfp, d11_off * wordSize);
 211     const Address d9_save       (rfp, d9_off * wordSize);
 212 
 213     const Address r28_save      (rfp, r28_off * wordSize);
 214     const Address r26_save      (rfp, r26_off * wordSize);
 215     const Address r24_save      (rfp, r24_off * wordSize);
 216     const Address r22_save      (rfp, r22_off * wordSize);
 217     const Address r20_save      (rfp, r20_off * wordSize);
 218 
 219     // stub code
 220 
 221     // we need a C prolog to bootstrap the x86 caller into the sim
 222     __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);
 223 
 224     address aarch64_entry = __ pc();
 225 
 226 #ifdef BUILTIN_SIM
 227     // Save sender's SP for stack traces.
 228     __ mov(rscratch1, sp);
 229     __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
 230 #endif
 231     // set up frame and move sp to end of save area
 232     __ enter();
 233     __ sub(sp, rfp, -sp_after_call_off * wordSize);
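         // n.b. sp_after_call_off is negative (-26), so this drops sp 26
         // words below the new fp, covering the whole register save area.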
 234 
 235     // save register parameters and Java scratch/global registers
 236     // n.b. we save thread even though it gets installed in
 237     // rthread because we want to sanity check rthread later
 238     __ str(c_rarg7,  thread);
 239     __ strw(c_rarg6, parameter_size);
 240     __ stp(c_rarg4, c_rarg5,  entry_point);
 241     __ stp(c_rarg2, c_rarg3,  result_type);
 242     __ stp(c_rarg0, c_rarg1,  call_wrapper);
 243 
 244     __ stp(r20, r19,   r20_save);
 245     __ stp(r22, r21,   r22_save);
 246     __ stp(r24, r23,   r24_save);
 247     __ stp(r26, r25,   r26_save);
 248     __ stp(r28, r27,   r28_save);
 249 
 250     __ stpd(v9,  v8,   d9_save);
 251     __ stpd(v11, v10,  d11_save);
 252     __ stpd(v13, v12,  d13_save);
 253     __ stpd(v15, v14,  d15_save);
 254 
 255     // install Java thread in global register now we have saved
 256     // whatever value it held
 257     __ mov(rthread, c_rarg7);
 258     // And method
 259     __ mov(rmethod, c_rarg3);
 260 
 261     // set up the heapbase register
 262     __ reinit_heapbase();
 263 
 264 #ifdef ASSERT
 265     // make sure we have no pending exceptions
 266     {
 267       Label L;
 268       __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
 269       __ cmp(rscratch1, (unsigned)NULL_WORD);
 270       __ br(Assembler::EQ, L);
 271       __ stop("StubRoutines::call_stub: entered with pending exception");
 272       __ BIND(L);
 273     }
 274 #endif
 275     // pass parameters if any
 276     __ mov(esp, sp);
 277     __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
 278     __ andr(sp, rscratch1, -2 * wordSize);
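         // rscratch1 is sp less the space needed for the incoming Java
         // arguments; the andr rounds it down so sp stays 16-byte aligned
         // while the parameters are pushed.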
 279 
 280     BLOCK_COMMENT("pass parameters if any");
 281     Label parameters_done;
 282     // parameter count is still in c_rarg6
 283     // and parameter pointer identifying param 1 is in c_rarg5
 284     __ cbzw(c_rarg6, parameters_done);
 285 
 286     address loop = __ pc();
 287     __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
 288     __ subsw(c_rarg6, c_rarg6, 1);
 289     __ push(rscratch1);
 290     __ br(Assembler::GT, loop);
 291 
 292     __ BIND(parameters_done);
 293 
 294     // call Java entry -- passing the Method* and current sp
 295     //      rmethod: Method*
 296     //      r13: sender sp
 297     BLOCK_COMMENT("call Java function");
 298     __ mov(r13, sp);
 299     __ blr(c_rarg4);
 300 
 301     // tell the simulator we have returned to the stub
 302 
 303     // we do this here because the notify will already have been done
 304     // if we get to the next instruction via an exception
 305     //
 306     // n.b. adding this instruction here affects the calculation of
 307     // whether or not a routine returns to the call stub (used when
 308     // doing stack walks) since the normal test is to check the return
 309     // pc against the address saved below. so we may need to allow for
 310     // this extra instruction in the check.
 311 
 312     if (NotifySimulator) {
 313       __ notify(Assembler::method_reentry);
 314     }
 315     // save current address for use by exception handling code
 316 
 317     return_address = __ pc();
 318 
 319     // store result depending on type (everything that is not
 320     // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
 321     // n.b. this assumes Java returns an integral result in r0
 322     // and a floating result in j_farg0
 323     __ ldr(j_rarg2, result);
 324     Label is_long, is_float, is_double, exit;
 325     __ ldr(j_rarg1, result_type);
 326     __ cmp(j_rarg1, T_OBJECT);
 327     __ br(Assembler::EQ, is_long);
 328     __ cmp(j_rarg1, T_LONG);
 329     __ br(Assembler::EQ, is_long);
 330     __ cmp(j_rarg1, T_FLOAT);
 331     __ br(Assembler::EQ, is_float);
 332     __ cmp(j_rarg1, T_DOUBLE);
 333     __ br(Assembler::EQ, is_double);
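         // n.b. T_OBJECT shares the is_long path because an oop is stored
         // back to the result slot as a full 64-bit value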
 334 
 335     // handle T_INT case
 336     __ strw(r0, Address(j_rarg2));
 337 
 338     __ BIND(exit);
 339 
 340     // pop parameters
 341     __ sub(esp, rfp, -sp_after_call_off * wordSize);
 342 
 343 #ifdef ASSERT
 344     // verify that threads correspond
 345     {
 346       Label L, S;
 347       __ ldr(rscratch1, thread);
 348       __ cmp(rthread, rscratch1);
 349       __ br(Assembler::NE, S);
 350       __ get_thread(rscratch1);
 351       __ cmp(rthread, rscratch1);
 352       __ br(Assembler::EQ, L);
 353       __ BIND(S);
 354       __ stop("StubRoutines::call_stub: threads must correspond");
 355       __ BIND(L);
 356     }
 357 #endif
 358 
 359     // restore callee-save registers
 360     __ ldpd(v15, v14,  d15_save);
 361     __ ldpd(v13, v12,  d13_save);
 362     __ ldpd(v11, v10,  d11_save);
 363     __ ldpd(v9,  v8,   d9_save);
 364 
 365     __ ldp(r28, r27,   r28_save);
 366     __ ldp(r26, r25,   r26_save);
 367     __ ldp(r24, r23,   r24_save);
 368     __ ldp(r22, r21,   r22_save);
 369     __ ldp(r20, r19,   r20_save);
 370 
 371     __ ldp(c_rarg0, c_rarg1,  call_wrapper);
 372     __ ldrw(c_rarg2, result_type);
 373     __ ldr(c_rarg3,  method);
 374     __ ldp(c_rarg4, c_rarg5,  entry_point);
 375     __ ldp(c_rarg6, c_rarg7,  parameter_size);
 376 
 377 #ifndef PRODUCT
 378     // tell the simulator we are about to end Java execution
 379     if (NotifySimulator) {
 380       __ notify(Assembler::method_exit);
 381     }
 382 #endif
 383     // leave frame and return to caller
 384     __ leave();
 385     __ ret(lr);
 386 
 387     // handle return types different from T_INT
 388 
 389     __ BIND(is_long);
 390     __ str(r0, Address(j_rarg2, 0));
 391     __ br(Assembler::AL, exit);
 392 
 393     __ BIND(is_float);
 394     __ strs(j_farg0, Address(j_rarg2, 0));
 395     __ br(Assembler::AL, exit);
 396 
 397     __ BIND(is_double);
 398     __ strd(j_farg0, Address(j_rarg2, 0));
 399     __ br(Assembler::AL, exit);
 400 
 401     return start;
 402   }
 403 
 404   // Return point for a Java call if there's an exception thrown in
 405   // Java code.  The exception is caught and transformed into a
 406   // pending exception stored in JavaThread that can be tested from
 407   // within the VM.
 408   //
 409   // Note: Usually the parameters are removed by the callee. In case
 410   // of an exception crossing an activation frame boundary, that is
 411   // not the case if the callee is compiled code => need to set up
 412   // the stack pointer.
 413   //
 414   // r0: exception oop
 415 
 416   // NOTE: this is used as a target from the signal handler so it
 417   // needs an x86 prolog which returns into the current simulator
 418   // executing the generated catch_exception code. so the prolog
 419   // needs to install rax in a sim register and adjust the sim's
 420   // restart pc to enter the generated code at the start position
 421   // then return from native to simulated execution.
 422 
 423   address generate_catch_exception() {
 424     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 425     address start = __ pc();
 426 
 427     // same as in generate_call_stub():
 428     const Address sp_after_call(rfp, sp_after_call_off * wordSize);
 429     const Address thread        (rfp, thread_off         * wordSize);
 430 
 431 #ifdef ASSERT
 432     // verify that threads correspond
 433     {
 434       Label L, S;
 435       __ ldr(rscratch1, thread);
 436       __ cmp(rthread, rscratch1);
 437       __ br(Assembler::NE, S);
 438       __ get_thread(rscratch1);
 439       __ cmp(rthread, rscratch1);
 440       __ br(Assembler::EQ, L);
 441       __ bind(S);
 442       __ stop("StubRoutines::catch_exception: threads must correspond");
 443       __ bind(L);
 444     }
 445 #endif
 446 
 447     // set pending exception
 448     __ verify_oop(r0);
 449 
 450     __ str(r0, Address(rthread, Thread::pending_exception_offset()));
 451     __ mov(rscratch1, (address)__FILE__);
 452     __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
 453     __ movw(rscratch1, (int)__LINE__);
 454     __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));
 455 
 456     // complete return to VM
 457     assert(StubRoutines::_call_stub_return_address != NULL,
 458            "_call_stub_return_address must have been generated before");
 459     __ b(StubRoutines::_call_stub_return_address);
 460 
 461     return start;
 462   }
 463 
 464   // Continuation point for runtime calls returning with a pending
 465   // exception.  The pending exception check happened in the runtime
 466   // or native call stub.  The pending exception in Thread is
 467   // converted into a Java-level exception.
 468   //
 469   // Contract with Java-level exception handlers:
 470   // r0: exception
 471   // r3: throwing pc
 472   //
 473   // NOTE: At entry of this stub, exception-pc must be in LR !!
 474 
 475   // NOTE: this is always used as a jump target within generated code
 476   // so it just needs to be generated code with no x86 prolog
 477 
 478   address generate_forward_exception() {
 479     StubCodeMark mark(this, "StubRoutines", "forward exception");
 480     address start = __ pc();
 481 
 482     // Upon entry, LR points to the return address returning into
 483     // Java (interpreted or compiled) code; i.e., the return address
 484     // becomes the throwing pc.
 485     //
 486     // Arguments pushed before the runtime call are still on the stack
 487     // but the exception handler will reset the stack pointer ->
 488     // ignore them.  A potential result in registers can be ignored as
 489     // well.
 490 
 491 #ifdef ASSERT
 492     // make sure this code is only executed if there is a pending exception
 493     {
 494       Label L;
 495       __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
 496       __ cbnz(rscratch1, L);
 497       __ stop("StubRoutines::forward exception: no pending exception (1)");
 498       __ bind(L);
 499     }
 500 #endif
 501 
 502     // compute exception handler into r19
 503 
 504     // call the VM to find the handler address associated with the
 505     // caller address. pass thread in r0 and caller pc (ret address)
 506     // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
 507     // the stack.
 508     __ mov(c_rarg1, lr);
 509     // lr will be trashed by the VM call so we move it to R19
 510     // (callee-saved) because we also need to pass it to the handler
 511     // returned by this call.
 512     __ mov(r19, lr);
 513     BLOCK_COMMENT("call exception_handler_for_return_address");
 514     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 515                          SharedRuntime::exception_handler_for_return_address),
 516                     rthread, c_rarg1);
 517     // we should not really care that lr is no longer the callee
 518     // address. we saved the value the handler needs in r19 so we can
 519     // just copy it to r3. however, the C2 handler will push its own
 520     // frame and then call into the VM, and the VM code asserts that
 521     // the PC for the frame above the handler belongs to a compiled
 522     // Java method. So, we restore lr here to satisfy that assert.
 523     __ mov(lr, r19);
 524     // setup r0 & r3 & clear pending exception
 525     __ mov(r3, r19);
 526     __ mov(r19, r0);
 527     __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
 528     __ str(zr, Address(rthread, Thread::pending_exception_offset()));
 529 
 530 #ifdef ASSERT
 531     // make sure exception is set
 532     {
 533       Label L;
 534       __ cbnz(r0, L);
 535       __ stop("StubRoutines::forward exception: no pending exception (2)");
 536       __ bind(L);
 537     }
 538 #endif
 539 
 540     // continue at exception handler
 541     // r0: exception
 542     // r3: throwing pc
 543     // r19: exception handler
 544     __ verify_oop(r0);
 545     __ br(r19);
 546 
 547     return start;
 548   }
 549 
 550   // Shenandoah write barrier.
 551   //
 552   // Input:
 553   //   r0: OOP to evacuate.  Not null.
 554   //
 555   // Output:
 556   //   r0: Pointer to evacuated OOP.
 557   //
 558   // Trash rscratch1, rscratch2.  Preserve everything else.
 559 
 560   address generate_shenandoah_wb() {
 561     StubCodeMark mark(this, "StubRoutines", "shenandoah_wb");
 562 
 563     __ align(6);
 564     address start = __ pc();
 565 
 566     Label work, slow_case, lose, not_an_instance, is_array;
 567     Address evacuation_in_progress
 568       = Address(rthread, in_bytes(JavaThread::evacuation_in_progress_offset()));
 569 
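         // Fast path: index the in-collection-set byte map with the oop's
         // region number (address >> RegionSizeShift).  If the byte is
         // clear the object does not need evacuating, so return r0 as is.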
 570     __ mov(rscratch2, ShenandoahHeap::in_cset_fast_test_addr());
 571     __ lsr(rscratch1, r0, ShenandoahHeapRegion::RegionSizeShift);
 572     __ ldrb(rscratch2, Address(rscratch2, rscratch1));
 573     __ tbnz(rscratch2, 0, work);
 574     __ ret(lr);
 575 
 576     __ bind(work);
 577 
 578     RegSet saved = RegSet::range(r1, r4);
 579     __ push(saved, sp);
 580 
 581     Register obj = r0, size = r2, newobj = r3, newobj_end = rscratch2;
 582 
 583     __ ldr(newobj, Address(rthread, JavaThread::gclab_top_offset()));
 584     __ cbz(newobj, slow_case); // No GCLAB
 585 
 586     __ load_klass(r1, obj);
 587     __ ldrw(size, Address(r1, Klass::layout_helper_offset()));
 588     __ tbnz(size, BitsPerInt - 1, not_an_instance);  // make sure it's an instance (LH > 0)
 589     assert(Klass::_lh_neutral_value == 0, "must be");
 590     __ cbzw(size, slow_case);
 591     __ tbnz(size, exact_log2(Klass::_lh_instance_slow_path_bit), slow_case);
 592     __ bind(is_array);
 593 
 594     // size contains the size (in bytes) of the object.
 595 
 596     // Make sure it's not a really big object.
 597     // ??? Maybe this test is not necessary.
 598     __ cmp(size, 128 * HeapWordSize);
 599     __ br(Assembler::GE, slow_case);
 600 
 601     int oop_extra_words = Universe::heap()->oop_extra_words();
 602     __ add(newobj_end, newobj, oop_extra_words * HeapWordSize);
 603     __ add(newobj_end, newobj_end, size, ext::uxtw);
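         // n.b. oop_extra_words() accounts for the Brooks forwarding
         // pointer word Shenandoah keeps in front of every object (see
         // compile_prepare_oop below).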
 604 
 605     // Pointer to end of new object is in newobj_end.
 606 
 607     __ ldr(rscratch1, Address(rthread, JavaThread::gclab_end_offset()));
 608     __ cmp(newobj_end, rscratch1);
 609     __ br(Assembler::HS, slow_case); // No room in GCLAB
 610 
 611     // Store Brooks pointer and adjust start of newobj.
 612     Universe::heap()->compile_prepare_oop(_masm, newobj);
 613 
 614     // We can reuse newobj_end (rscratch2) to hold dst.
 615     Register src = r1, dst = newobj_end;
 616 
 617     // Copy the object from obj to newobj.  This loop is short and
 618     // sweet: the typical size of an object is about eight HeapWords
 619     // so it makes no sense to optimize for a large memory copy.
 620     // There might be some sense to calling generate_copy_longs from
 621     // here if the object to be copied is very large.
 622     Label loop, odd_count;
 623     {
 624       __ mov(src, obj);
 625       __ mov(dst, newobj);
 626       __ tbnz(size, exact_log2(HeapWordSize), odd_count);
 627 
 628       // Live registers: obj, newobj, size, src, dst.
 629 
 630       __ bind(loop);
 631       // Count is even.  Copy pairs of HeapWords.
 632       __ ldp(rscratch1, r4, __ post(src, 2 * HeapWordSize));
 633       __ stp(rscratch1, r4, __ post(dst, 2 * HeapWordSize));
 634       __ subs(size, size, 2 * HeapWordSize);
 635       __ br(Assembler::GT, loop);
 636     }
 637 
 638     // All copied.  Now try to CAS the Brooks pointer.
 639     Label succeed;
 640     __ lea(r2, Address(obj, BrooksPointer::byte_offset()));
 641     __ cmpxchgptr(obj, newobj, r2, rscratch1, succeed, NULL);
 642       // If we lose the CAS we are racing with someone who just beat
 643       // us evacuating the object.  This leaves the address of the
 644       // evacuated object in r0.
 645 
 646     // We lost.
 647     __ pop(saved, sp);
 648     __ ret(lr);
 649 
 650     // We won.
 651     __ bind(succeed);
 652     __ mov(obj, newobj);
 653     // dst points to end of newobj.
 654     __ str(dst, Address(rthread, JavaThread::gclab_top_offset()));
 655     __ pop(saved, sp);
 656     __ ret(lr);
 657 
 658     // Come here if the count of HeapWords is odd.
 659     {
 660       __ bind(odd_count);
 661       __ ldr(rscratch1, __ post(src, HeapWordSize));
 662       __ str(rscratch1, __ post(dst, HeapWordSize));
 663       __ subs(size, size, HeapWordSize);
 664       __ b(loop);
 665     }
 666 
 667     // Come here if obj is an array of some kind.
 668     {
 669       __ bind(not_an_instance);
 670 
 671       // It's an array.  Calculate the size in r4.
 672       __ ubfx(r4, size, Klass::_lh_header_size_shift,
 673               exact_log2(Klass::_lh_header_size_mask+1));
 674       __ ldrw(rscratch1, Address(obj, arrayOopDesc::length_offset_in_bytes()));
 675       __ ubfx(rscratch2, size, Klass::_lh_log2_element_size_shift,
 676               exact_log2(Klass::_lh_log2_element_size_mask+1));
 677       __ lslv(rscratch1, rscratch1, rscratch2);
 678       __ add(size, rscratch1, r4);
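           // size = header size + (array length << log2(element size)),
           // all decoded from the layout helper, in bytes.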
 679 
 680       // Round up the size.
 681       __ add(size, size, MinObjAlignmentInBytes-1);
 682       __ andr(size, size, -MinObjAlignmentInBytes);
 683 
 684       __ b(is_array);
 685     }
 686 
 687     {
 688       // Make a runtime call to evacuate the object.
 689       __ bind(slow_case);
 690       __ pop(saved, sp);
 691 
 692       __ enter(); // required for proper stackwalking of RuntimeStub frame
 693 
 694       __ push_call_clobbered_registers();
 695 
 696       __ mov(lr, CAST_FROM_FN_PTR(address, ShenandoahBarrierSet::write_barrier_c2));
 697       __ blrt(lr, 1, 0, MacroAssembler::ret_type_integral);
 698       __ mov(rscratch1, obj);
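           // the evacuated oop comes back in r0 (== obj); stash it in
           // rscratch1 so it survives the register restore below.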
 699 
 700       __ pop_call_clobbered_registers();
 701       __ mov(obj, rscratch1);
 702 
 703       __ leave(); // required for proper stackwalking of RuntimeStub frame
 704       __ ret(lr);
 705     }
 706 
 707     return start;
 708   }
 709 
 710   // Non-destructive plausibility checks for oops
 711   //
 712   // Arguments:
 713   //    r0: oop to verify
 714   //    rscratch1: error message
 715   //
 716   // Stack after saving c_rarg3:
 717   //    [tos + 0]: saved c_rarg3
 718   //    [tos + 1]: saved c_rarg2
 719   //    [tos + 2]: saved lr
 720   //    [tos + 3]: saved rscratch2
 721   //    [tos + 4]: saved r0
 722   //    [tos + 5]: saved rscratch1
 723   address generate_verify_oop() {
 724 
 725     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 726     address start = __ pc();
 727 
 728     Label exit, error;
 729 
 730     // save c_rarg2 and c_rarg3
 731     __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));
 732 
 733     // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 734     __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
 735     __ ldr(c_rarg3, Address(c_rarg2));
 736     __ add(c_rarg3, c_rarg3, 1);
 737     __ str(c_rarg3, Address(c_rarg2));
 738 
 739     // object is in r0
 740     // make sure object is 'reasonable'
 741     __ cbz(r0, exit); // if obj is NULL it is OK
 742 
 743     // Check if the oop is in the right area of memory
 744     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
 745     __ andr(c_rarg2, r0, c_rarg3);
 746     __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());
 747 
 748     // Compare c_rarg2 and c_rarg3.  We don't use a compare
 749     // instruction here because the flags register is live.
 750     __ eor(c_rarg2, c_rarg2, c_rarg3);
 751     __ cbnz(c_rarg2, error);
 752 
 753     // make sure klass is 'reasonable', which is not zero.
 754     __ load_klass(r0, r0);  // get klass
 755     __ cbz(r0, error);      // if klass is NULL it is broken
 756 
 757     // return if everything seems ok
 758     __ bind(exit);
 759 
 760     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 761     __ ret(lr);
 762 
 763     // handle errors
 764     __ bind(error);
 765     __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
 766 
 767     __ push(RegSet::range(r0, r29), sp);
 768     // debug(char* msg, int64_t pc, int64_t regs[])
 769     __ mov(c_rarg0, rscratch1);      // pass address of error message
 770     __ mov(c_rarg1, lr);             // pass return address
 771     __ mov(c_rarg2, sp);             // pass address of regs on stack
 772 #ifndef PRODUCT
 773     assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
 774 #endif
 775     BLOCK_COMMENT("call MacroAssembler::debug");
 776     __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
 777     __ blrt(rscratch1, 3, 0, 1);
 778 
 779     return start;
 780   }
 781 
 782   void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
 783 
 784   // Generate code for an array write pre barrier
 785   //
 786   //     addr    -  starting address
 787   //     count   -  element count
 788   //     dest_uninitialized - true => the destination is uninitialized, so the pre barrier can be skipped
 789   //
 790   //     Destroys no registers except rscratch1 and rscratch2
 791   //
 792   void  gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
 793     BarrierSet* bs = Universe::heap()->barrier_set();
 794     switch (bs->kind()) {
 795     case BarrierSet::G1SATBCTLogging:
 796     case BarrierSet::ShenandoahBarrierSet:
 797       // Don't generate the call if we statically know that the target is uninitialized
 798       if (!dest_uninitialized) {
 799         __ push_call_clobbered_registers();
 800         if (count == c_rarg0) {
 801           if (addr == c_rarg1) {
 802             // exactly backwards!!
 803             __ mov(rscratch1, c_rarg0);
 804             __ mov(c_rarg0, c_rarg1);
 805             __ mov(c_rarg1, rscratch1);
 806           } else {
 807             __ mov(c_rarg1, count);
 808             __ mov(c_rarg0, addr);
 809           }
 810         } else {
 811           __ mov(c_rarg0, addr);
 812           __ mov(c_rarg1, count);
 813         }
 814         __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2);
 815         __ pop_call_clobbered_registers();
 816         break;
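           // n.b. the case labels below sit inside the if above: for the
           // card-table and ModRef kinds the switch jumps straight to them
           // and no pre barrier call is generated.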
 817       case BarrierSet::CardTableForRS:
 818       case BarrierSet::CardTableExtension:
 819       case BarrierSet::ModRef:
 820         break;
 821       default:
 822         ShouldNotReachHere();
 823       }
 824     }
 825   }
 826 
 827 
 828   //
 829   // Generate code for an array write post barrier
 830   //
 831   //  Input:
 832   //     start    - register containing starting address of destination array
 833   //     end      - register containing ending address of destination array
 834   //     scratch  - scratch register
 835   //
 836   //  The input registers are overwritten.
 837   //  The ending address is inclusive.
 838   void gen_write_ref_array_post_barrier(Register start, Register end, Register scratch) {
 839     assert_different_registers(start, end, scratch);
 840     BarrierSet* bs = Universe::heap()->barrier_set();
 841     switch (bs->kind()) {
 842       case BarrierSet::G1SATBCTLogging:
 843       case BarrierSet::ShenandoahBarrierSet:
 844         {
 845           __ push_call_clobbered_registers();
 846           // must compute element count unless barrier set interface is changed (other platforms supply count)
 847           assert_different_registers(start, end, scratch);
 848           __ lea(scratch, Address(end, BytesPerHeapOop));
 849           __ sub(scratch, scratch, start);               // subtract start to get #bytes
 850           __ lsr(scratch, scratch, LogBytesPerHeapOop);  // convert to element count
 851           __ mov(c_rarg0, start);
 852           __ mov(c_rarg1, scratch);
 853           __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
 854           __ pop_call_clobbered_registers();
 855         }
 856         break;
 857       case BarrierSet::CardTableForRS:
 858       case BarrierSet::CardTableExtension:
 859         {
 860           CardTableModRefBS* ct = (CardTableModRefBS*)bs;
 861           assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
 862 
 863           Label L_loop;
 864 
 865            __ lsr(start, start, CardTableModRefBS::card_shift);
 866            __ lsr(end, end, CardTableModRefBS::card_shift);
 867            __ sub(end, end, start); // number of bytes to copy
 868 
 869           const Register count = end; // 'end' register contains bytes count now
 870           __ load_byte_map_base(scratch);
 871           __ add(start, start, scratch);
 872           if (UseConcMarkSweepGC) {
 873             __ membar(__ StoreStore);
 874           }
 875           __ BIND(L_loop);
 876           __ strb(zr, Address(start, count));
 877           __ subs(count, count, 1);
 878           __ br(Assembler::GE, L_loop);
 879         }
 880         break;
 881       default:
 882         ShouldNotReachHere();
 883 
 884     }
 885   }
 886 
 887   address generate_zero_longs(Register base, Register cnt) {
 888     Register tmp = rscratch1;
 889     Register tmp2 = rscratch2;
 890     int zva_length = VM_Version::zva_length();
 891     Label initial_table_end, loop_zva;
 892     Label fini;
 893 
 894     __ align(CodeEntryAlignment);
 895     StubCodeMark mark(this, "StubRoutines", "zero_longs");
 896     address start = __ pc();
 897 
 898     // Base must be 16 byte aligned. If not just return and let caller handle it
 899     __ tst(base, 0x0f);
 900     __ br(Assembler::NE, fini);
 901     // Align base with ZVA length.
 902     __ neg(tmp, base);
 903     __ andr(tmp, tmp, zva_length - 1);
 904 
 905     // tmp: the number of bytes to be filled to align the base with ZVA length.
 906     __ add(base, base, tmp);
 907     __ sub(cnt, cnt, tmp, Assembler::ASR, 3);
 908     __ adr(tmp2, initial_table_end);
 909     __ sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
 910     __ br(tmp2);
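         // Each stp in the table below zeroes 16 bytes and occupies 4
         // bytes of code, so branching back (tmp / 4) bytes from
         // initial_table_end runs just enough stores to clear the tmp
         // bytes below the (already advanced) base.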
 911 
 912     for (int i = -zva_length + 16; i < 0; i += 16)
 913       __ stp(zr, zr, Address(base, i));
 914     __ bind(initial_table_end);
 915 
 916     __ sub(cnt, cnt, zva_length >> 3);
 917     __ bind(loop_zva);
 918     __ dc(Assembler::ZVA, base);
 919     __ subs(cnt, cnt, zva_length >> 3);
 920     __ add(base, base, zva_length);
 921     __ br(Assembler::GE, loop_zva);
 922     __ add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
 923     __ bind(fini);
 924     __ ret(lr);
 925 
 926     return start;
 927   }
 928 
 929   typedef enum {
 930     copy_forwards = 1,
 931     copy_backwards = -1
 932   } copy_direction;
 933 
 934   // Bulk copy of blocks of 8 words.
 935   //
 936   // count is a count of words.
 937   //
 938   // Precondition: count >= 8
 939   //
 940   // Postconditions:
 941   //
 942   // The least significant bit of count contains the remaining count
 943   // of words to copy.  The rest of count is trash.
 944   //
 945   // s and d are adjusted to point to the remaining words to copy
 946   //
 947   void generate_copy_longs(Label &start, Register s, Register d, Register count,
 948                            copy_direction direction) {
 949     int unit = wordSize * direction;
 950     int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;
 951 
 952     int offset;
 953     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
 954       t4 = r7, t5 = r10, t6 = r11, t7 = r12;
 955     const Register stride = r13;
 956 
 957     assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
 958     assert_different_registers(s, d, count, rscratch1);
 959 
 960     Label again, drain;
 961     const char *stub_name;
 962     if (direction == copy_forwards)
 963       stub_name = "forward_copy_longs";
 964     else
 965       stub_name = "backward_copy_longs";
 966     StubCodeMark mark(this, "StubRoutines", stub_name);
 967     __ align(CodeEntryAlignment);
 968     __ bind(start);
 969 
 970     Label unaligned_copy_long;
 971     if (AvoidUnalignedAccesses) {
 972       __ tbnz(d, 3, unaligned_copy_long);
 973     }
 974 
 975     if (direction == copy_forwards) {
 976       __ sub(s, s, bias);
 977       __ sub(d, d, bias);
 978     }
 979 
 980 #ifdef ASSERT
 981     // Make sure we are never given < 8 words
 982     {
 983       Label L;
 984       __ cmp(count, 8);
 985       __ br(Assembler::GE, L);
 986       __ stop("generate_copy_longs called with < 8 words");
 987       __ bind(L);
 988     }
 989 #endif
 990 
 991     // Fill 8 registers
 992     if (UseSIMDForMemoryOps) {
 993       __ ldpq(v0, v1, Address(s, 4 * unit));
 994       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
 995     } else {
 996       __ ldp(t0, t1, Address(s, 2 * unit));
 997       __ ldp(t2, t3, Address(s, 4 * unit));
 998       __ ldp(t4, t5, Address(s, 6 * unit));
 999       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
1000     }
1001 
1002     __ subs(count, count, 16);
1003     __ br(Assembler::LO, drain);
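         // We already have 8 words in registers; if the total count is
         // less than 16 words skip the main loop and just store them.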
1004 
1005     int prefetch = PrefetchCopyIntervalInBytes;
1006     bool use_stride = false;
1007     if (direction == copy_backwards) {
1008        use_stride = prefetch > 256;
1009        prefetch = -prefetch;
1010        if (use_stride) __ mov(stride, prefetch);
1011     }
1012 
1013     __ bind(again);
1014 
1015     if (PrefetchCopyIntervalInBytes > 0)
1016       __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
1017 
1018     if (UseSIMDForMemoryOps) {
1019       __ stpq(v0, v1, Address(d, 4 * unit));
1020       __ ldpq(v0, v1, Address(s, 4 * unit));
1021       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
1022       __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
1023     } else {
1024       __ stp(t0, t1, Address(d, 2 * unit));
1025       __ ldp(t0, t1, Address(s, 2 * unit));
1026       __ stp(t2, t3, Address(d, 4 * unit));
1027       __ ldp(t2, t3, Address(s, 4 * unit));
1028       __ stp(t4, t5, Address(d, 6 * unit));
1029       __ ldp(t4, t5, Address(s, 6 * unit));
1030       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
1031       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
1032     }
1033 
1034     __ subs(count, count, 8);
1035     __ br(Assembler::HS, again);
1036 
1037     // Drain
1038     __ bind(drain);
1039     if (UseSIMDForMemoryOps) {
1040       __ stpq(v0, v1, Address(d, 4 * unit));
1041       __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
1042     } else {
1043       __ stp(t0, t1, Address(d, 2 * unit));
1044       __ stp(t2, t3, Address(d, 4 * unit));
1045       __ stp(t4, t5, Address(d, 6 * unit));
1046       __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
1047     }
1048 
1049     {
1050       Label L1, L2;
1051       __ tbz(count, exact_log2(4), L1);
1052       if (UseSIMDForMemoryOps) {
1053         __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
1054         __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
1055       } else {
1056         __ ldp(t0, t1, Address(s, 2 * unit));
1057         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
1058         __ stp(t0, t1, Address(d, 2 * unit));
1059         __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
1060       }
1061       __ bind(L1);
1062 
1063       if (direction == copy_forwards) {
1064         __ add(s, s, bias);
1065         __ add(d, d, bias);
1066       }
1067 
1068       __ tbz(count, 1, L2);
1069       __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
1070       __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
1071       __ bind(L2);
1072     }
1073 
1074     __ ret(lr);
1075 
1076     if (AvoidUnalignedAccesses) {
1077       Label drain, again;
1078       // Register order for storing. Order is different for backward copy.
1079 
1080       __ bind(unaligned_copy_long);
1081 
1082       // source address is even aligned, target odd aligned
1083       //
1084       // when forward copying word pairs we read long pairs at offsets
1085       // {0, 2, 4, 6} (in long words). when backwards copying we read
1086       // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
1087       // address by -2 in the forwards case so we can compute the
1088       // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
1089       // or -1.
1090       //
1091       // when forward copying we need to store 1 word, 3 pairs and
1092       // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
1093       // zero offset we adjust the destination by -1, which means we
1094       // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
1095       //
1096       // When backwards copying we need to store 1 word, 3 pairs and
1097       // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
1098       // offsets {1, 3, 5, 7, 8} * unit.
1099 
1100       if (direction == copy_forwards) {
1101         __ sub(s, s, 16);
1102         __ sub(d, d, 8);
1103       }
1104 
1105       // Fill 8 registers
1106       //
1107       // for forwards copy s was offset by -16 from the original input
1108       // value of s so the register contents are at these offsets
1109       // relative to the 64 byte block addressed by that original input
1110       // and so on for each successive 64 byte block when s is updated
1111       //
1112       // t0 at offset 0,  t1 at offset 8
1113       // t2 at offset 16, t3 at offset 24
1114       // t4 at offset 32, t5 at offset 40
1115       // t6 at offset 48, t7 at offset 56
1116 
1117       // for backwards copy s was not offset so the register contents
1118       // are at these offsets into the preceding 64 byte block
1119       // relative to that original input and so on for each successive
1120       // preceding 64 byte block when s is updated. this explains the
1121       // slightly counter-intuitive looking pattern of register usage
1122       // in the stp instructions for backwards copy.
1123       //
1124       // t0 at offset -16, t1 at offset -8
1125       // t2 at offset -32, t3 at offset -24
1126       // t4 at offset -48, t5 at offset -40
1127       // t6 at offset -64, t7 at offset -56
1128 
1129       __ ldp(t0, t1, Address(s, 2 * unit));
1130       __ ldp(t2, t3, Address(s, 4 * unit));
1131       __ ldp(t4, t5, Address(s, 6 * unit));
1132       __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
1133 
1134       __ subs(count, count, 16);
1135       __ br(Assembler::LO, drain);
1136 
1137       int prefetch = PrefetchCopyIntervalInBytes;
1138       bool use_stride = false;
1139       if (direction == copy_backwards) {
1140          use_stride = prefetch > 256;
1141          prefetch = -prefetch;
1142          if (use_stride) __ mov(stride, prefetch);
1143       }
1144 
1145       __ bind(again);
1146 
1147       if (PrefetchCopyIntervalInBytes > 0)
1148         __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);
1149 
1150       if (direction == copy_forwards) {
1151        // allowing for the offset of -8, the store instructions place
1152        // registers into the target 64 byte block at the following
1153        // offsets
1154        //
1155        // t0 at offset 0
1156        // t1 at offset 8,  t2 at offset 16
1157        // t3 at offset 24, t4 at offset 32
1158        // t5 at offset 40, t6 at offset 48
1159        // t7 at offset 56
1160 
1161         __ str(t0, Address(d, 1 * unit));
1162         __ stp(t1, t2, Address(d, 2 * unit));
1163         __ ldp(t0, t1, Address(s, 2 * unit));
1164         __ stp(t3, t4, Address(d, 4 * unit));
1165         __ ldp(t2, t3, Address(s, 4 * unit));
1166         __ stp(t5, t6, Address(d, 6 * unit));
1167         __ ldp(t4, t5, Address(s, 6 * unit));
1168         __ str(t7, Address(__ pre(d, 8 * unit)));
1169         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
1170       } else {
1171        // d was not offset when we started so the registers are
1172        // written into the 64 byte block preceding d with the following
1173        // offsets
1174        //
1175        // t1 at offset -8
1176        // t3 at offset -24, t0 at offset -16
1177        // t5 at offset -40, t2 at offset -32
1178        // t7 at offset -56, t4 at offset -48
1179        //                   t6 at offset -64
1180        //
1181        // note that this matches the offsets previously noted for the
1182        // loads
1183 
1184         __ str(t1, Address(d, 1 * unit));
1185         __ stp(t3, t0, Address(d, 3 * unit));
1186         __ ldp(t0, t1, Address(s, 2 * unit));
1187         __ stp(t5, t2, Address(d, 5 * unit));
1188         __ ldp(t2, t3, Address(s, 4 * unit));
1189         __ stp(t7, t4, Address(d, 7 * unit));
1190         __ ldp(t4, t5, Address(s, 6 * unit));
1191         __ str(t6, Address(__ pre(d, 8 * unit)));
1192         __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
1193       }
1194 
1195       __ subs(count, count, 8);
1196       __ br(Assembler::HS, again);
1197 
1198       // Drain
1199       //
1200       // this uses the same pattern of offsets and register arguments
1201       // as above
1202       __ bind(drain);
1203       if (direction == copy_forwards) {
1204         __ str(t0, Address(d, 1 * unit));
1205         __ stp(t1, t2, Address(d, 2 * unit));
1206         __ stp(t3, t4, Address(d, 4 * unit));
1207         __ stp(t5, t6, Address(d, 6 * unit));
1208         __ str(t7, Address(__ pre(d, 8 * unit)));
1209       } else {
1210         __ str(t1, Address(d, 1 * unit));
1211         __ stp(t3, t0, Address(d, 3 * unit));
1212         __ stp(t5, t2, Address(d, 5 * unit));
1213         __ stp(t7, t4, Address(d, 7 * unit));
1214         __ str(t6, Address(__ pre(d, 8 * unit)));
1215       }
1216       // now we need to copy any remaining part block which may
1217       // include a 4 word subblock and/or a 2 word subblock.
1218       // bits 2 and 1 in the count are the tell-tale for whether we
1219       // have each such subblock
1220       {
1221         Label L1, L2;
1222         __ tbz(count, exact_log2(4), L1);
1223        // this is the same as above but copying only 4 longs hence
1224        // with only one intervening stp between the str instructions
1225        // but note that the offsets and registers still follow the
1226        // same pattern
1227         __ ldp(t0, t1, Address(s, 2 * unit));
1228         __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
1229         if (direction == copy_forwards) {
1230           __ str(t0, Address(d, 1 * unit));
1231           __ stp(t1, t2, Address(d, 2 * unit));
1232           __ str(t3, Address(__ pre(d, 4 * unit)));
1233         } else {
1234           __ str(t1, Address(d, 1 * unit));
1235           __ stp(t3, t0, Address(d, 3 * unit));
1236           __ str(t2, Address(__ pre(d, 4 * unit)));
1237         }
1238         __ bind(L1);
1239 
1240         __ tbz(count, 1, L2);
1241        // this is the same as above but copying only 2 longs hence
1242        // there is no intervening stp between the str instructions
1243        // but note that the offset and register patterns are still
1244        // the same
1245         __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
1246         if (direction == copy_forwards) {
1247           __ str(t0, Address(d, 1 * unit));
1248           __ str(t1, Address(__ pre(d, 2 * unit)));
1249         } else {
1250           __ str(t1, Address(d, 1 * unit));
1251           __ str(t0, Address(__ pre(d, 2 * unit)));
1252         }
1253         __ bind(L2);
1254 
1255        // for forwards copy we need to re-adjust the offsets we
1256        // applied so that s and d follow the last words written
1257 
1258        if (direction == copy_forwards) {
1259          __ add(s, s, 16);
1260          __ add(d, d, 8);
1261        }
1262 
1263       }
1264 
1265       __ ret(lr);
1266       }
1267   }
1268 
1269   // Small copy: less than 16 bytes.
1270   //
1271   // NB: Ignores all of the bits of count which represent more than 15
1272   // bytes, so a caller doesn't have to mask them.
1273 
1274   void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
1275     bool is_backwards = step < 0;
1276     size_t granularity = uabs(step);
1277     int direction = is_backwards ? -1 : 1;
1278     int unit = wordSize * direction;
1279 
1280     Label Lpair, Lword, Lint, Lshort, Lbyte;
1281 
1282     assert(granularity
1283            && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");
1284 
1285     const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;
1286 
1287     // ??? I don't know if this bit-test-and-branch is the right thing
1288     // to do.  It does a lot of jumping, resulting in several
1289     // mispredicted branches.  It might make more sense to do this
1290     // with something like Duff's device with a single computed branch.
1291 
1292     __ tbz(count, 3 - exact_log2(granularity), Lword);
1293     __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
1294     __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
1295     __ bind(Lword);
1296 
1297     if (granularity <= sizeof (jint)) {
1298       __ tbz(count, 2 - exact_log2(granularity), Lint);
1299       __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
1300       __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
1301       __ bind(Lint);
1302     }
1303 
1304     if (granularity <= sizeof (jshort)) {
1305       __ tbz(count, 1 - exact_log2(granularity), Lshort);
1306       __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
1307       __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
1308       __ bind(Lshort);
1309     }
1310 
1311     if (granularity <= sizeof (jbyte)) {
1312       __ tbz(count, 0, Lbyte);
1313       __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
1314       __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
1315       __ bind(Lbyte);
1316     }
1317   }
1318 
1319   Label copy_f, copy_b;
1320 
1321   // All-singing all-dancing memory copy.
1322   //
1323   // Copy count units of memory from s to d.  The size of a unit is
1324   // step, which can be positive or negative depending on the direction
1325   // of copy.  If is_aligned is false, we align the source address.
1326   //
1327 
1328   void copy_memory(bool is_aligned, Register s, Register d,
1329                    Register count, Register tmp, int step) {
1330     copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
1331     bool is_backwards = step < 0;
1332     int granularity = uabs(step);
1333     const Register t0 = r3, t1 = r4;
1334 
1335     // <= 96 bytes do inline. Direction doesn't matter because we always
1336     // load all the data before writing anything
1337     Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
1338     const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
1339     const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
1340     const Register send = r17, dend = r18;
1341 
1342     if (PrefetchCopyIntervalInBytes > 0)
1343       __ prfm(Address(s, 0), PLDL1KEEP);
1344     __ cmp(count, (UseSIMDForMemoryOps ? 96:80)/granularity);
1345     __ br(Assembler::HI, copy_big);
1346 
1347     __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
1348     __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));
1349 
1350     __ cmp(count, 16/granularity);
1351     __ br(Assembler::LS, copy16);
1352 
1353     __ cmp(count, 64/granularity);
1354     __ br(Assembler::HI, copy80);
1355 
1356     __ cmp(count, 32/granularity);
1357     __ br(Assembler::LS, copy32);
1358 
1359     // 33..64 bytes
1360     if (UseSIMDForMemoryOps) {
1361       __ ldpq(v0, v1, Address(s, 0));
1362       __ ldpq(v2, v3, Address(send, -32));
1363       __ stpq(v0, v1, Address(d, 0));
1364       __ stpq(v2, v3, Address(dend, -32));
1365     } else {
1366       __ ldp(t0, t1, Address(s, 0));
1367       __ ldp(t2, t3, Address(s, 16));
1368       __ ldp(t4, t5, Address(send, -32));
1369       __ ldp(t6, t7, Address(send, -16));
1370 
1371       __ stp(t0, t1, Address(d, 0));
1372       __ stp(t2, t3, Address(d, 16));
1373       __ stp(t4, t5, Address(dend, -32));
1374       __ stp(t6, t7, Address(dend, -16));
1375     }
1376     __ b(finish);
1377 
1378     // 17..32 bytes
1379     __ bind(copy32);
1380     __ ldp(t0, t1, Address(s, 0));
1381     __ ldp(t2, t3, Address(send, -16));
1382     __ stp(t0, t1, Address(d, 0));
1383     __ stp(t2, t3, Address(dend, -16));
1384     __ b(finish);
1385 
1386     // 65..80/96 bytes
1387     // (96 bytes if SIMD because we do 32 bytes per instruction)
1388     __ bind(copy80);
1389     if (UseSIMDForMemoryOps) {
1390       __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
1391       __ ldpq(v4, v5, Address(send, -32));
1392       __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
1393       __ stpq(v4, v5, Address(dend, -32));
1394     } else {
1395       __ ldp(t0, t1, Address(s, 0));
1396       __ ldp(t2, t3, Address(s, 16));
1397       __ ldp(t4, t5, Address(s, 32));
1398       __ ldp(t6, t7, Address(s, 48));
1399       __ ldp(t8, t9, Address(send, -16));
1400 
1401       __ stp(t0, t1, Address(d, 0));
1402       __ stp(t2, t3, Address(d, 16));
1403       __ stp(t4, t5, Address(d, 32));
1404       __ stp(t6, t7, Address(d, 48));
1405       __ stp(t8, t9, Address(dend, -16));
1406     }
1407     __ b(finish);
1408 
1409     // 0..16 bytes
1410     __ bind(copy16);
1411     __ cmp(count, 8/granularity);
1412     __ br(Assembler::LO, copy8);
1413 
1414     // 8..16 bytes
1415     __ ldr(t0, Address(s, 0));
1416     __ ldr(t1, Address(send, -8));
1417     __ str(t0, Address(d, 0));
1418     __ str(t1, Address(dend, -8));
1419     __ b(finish);
1420 
1421     if (granularity < 8) {
1422       // 4..7 bytes
1423       __ bind(copy8);
1424       __ tbz(count, 2 - exact_log2(granularity), copy4);
1425       __ ldrw(t0, Address(s, 0));
1426       __ ldrw(t1, Address(send, -4));
1427       __ strw(t0, Address(d, 0));
1428       __ strw(t1, Address(dend, -4));
1429       __ b(finish);
1430       if (granularity < 4) {
1431         // 0..3 bytes
1432         __ bind(copy4);
1433         __ cbz(count, finish); // get rid of 0 case
1434         if (granularity == 2) {
1435           __ ldrh(t0, Address(s, 0));
1436           __ strh(t0, Address(d, 0));
1437         } else { // granularity == 1
1438           // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
1439           // the first and last byte.
1440           // Handle the 3 byte case by loading and storing base + count/2
1441           // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
1442           // This does mean that in the 1 byte case we load/store the same
1443           // byte 3 times.
1444           __ lsr(count, count, 1);
1445           __ ldrb(t0, Address(s, 0));
1446           __ ldrb(t1, Address(send, -1));
1447           __ ldrb(t2, Address(s, count));
1448           __ strb(t0, Address(d, 0));
1449           __ strb(t1, Address(dend, -1));
1450           __ strb(t2, Address(d, count));
1451         }
1452         __ b(finish);
1453       }
1454     }
1455 
1456     __ bind(copy_big);
1457     if (is_backwards) {
1458       __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
1459       __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
1460     }
1461 
1462     // Now that we've got the small case out of the way, we can align the
1463     // source address on a 2-word boundary.
1464 
1465     Label aligned;
1466 
1467     if (is_aligned) {
1468       // We may have to adjust by 1 word to get s 2-word-aligned.
1469       __ tbz(s, exact_log2(wordSize), aligned);
1470       __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
1471       __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
1472       __ sub(count, count, wordSize/granularity);
1473     } else {
1474       if (is_backwards) {
1475         __ andr(rscratch2, s, 2 * wordSize - 1);
1476       } else {
1477         __ neg(rscratch2, s);
1478         __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
1479       }
1480       // rscratch2 is the byte adjustment needed to align s.
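           // For a forward copy this is (-s) & 15, e.g. an s ending in ...9
           // needs 7 bytes (7/granularity elements) copied before the bulk
           // loop; for a backward copy (s now points past the end) it is
           // s & 15, the bytes above the previous 16-byte boundary.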
1481       __ cbz(rscratch2, aligned);
1482       int shift = exact_log2(granularity);
1483       if (shift)  __ lsr(rscratch2, rscratch2, shift);
1484       __ sub(count, count, rscratch2);
1485 
1486 #if 0
1487       // ?? This code is only correct for a disjoint copy.  It may or
1488       // may not make sense to use it in that case.
1489 
1490       // Copy the first pair; s and d may not be aligned.
1491       __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
1492       __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));
1493 
1494       // Align s and d, adjust count
1495       if (is_backwards) {
1496         __ sub(s, s, rscratch2);
1497         __ sub(d, d, rscratch2);
1498       } else {
1499         __ add(s, s, rscratch2);
1500         __ add(d, d, rscratch2);
1501       }
1502 #else
1503       copy_memory_small(s, d, rscratch2, rscratch1, step);
1504 #endif
1505     }
1506 
1507     __ bind(aligned);
1508 
1509     // s is now 2-word-aligned.
1510 
1511     // We have a count of units and some trailing bytes.  Adjust the
1512     // count and do a bulk copy of words.
1513     __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
1514     if (direction == copy_forwards)
1515       __ bl(copy_f);
1516     else
1517       __ bl(copy_b);
1518 
1519     // And the tail.
1520     copy_memory_small(s, d, count, tmp, step);
1521 
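         // copy8/copy4 are only unresolved here when the corresponding
         // small-copy code above was compiled out for this element size;
         // those branches can only be taken with a zero remaining count, so
         // binding the labels just before finish routes them to the end.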
1522     if (granularity >= 8) __ bind(copy8);
1523     if (granularity >= 4) __ bind(copy4);
1524     __ bind(finish);
1525   }
1526 
1527 
1528   void clobber_registers() {
1529 #ifdef ASSERT
1530     __ mov(rscratch1, (uint64_t)0xdeadbeef);
1531     __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
1532     for (Register r = r3; r <= r18; r++)
1533       if (r != rscratch1) __ mov(r, rscratch1);
1534 #endif
1535   }
1536 
1537   // Scan over array at a for count oops, verifying each one.
1538   // Preserves a and count, clobbers rscratch1 and rscratch2.
1539   void verify_oop_array (size_t size, Register a, Register count, Register temp) {
1540     Label loop, end;
1541     __ mov(rscratch1, a);
1542     __ mov(rscratch2, zr);
1543     __ bind(loop);
1544     __ cmp(rscratch2, count);
1545     __ br(Assembler::HS, end);
1546     if (size == (size_t)wordSize) {
1547       __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1548       __ verify_oop(temp);
1549     } else {
1550       __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
1551       __ decode_heap_oop(temp); // calls verify_oop
1552     }
1553     __ add(rscratch2, rscratch2, size);
1554     __ b(loop);
1555     __ bind(end);
1556   }
1557 
1558   // Arguments:
1559   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1560   //             ignored
1561   //   is_oop  - true => oop array, so generate store check code
1562   //   name    - stub name string
1563   //
1564   // Inputs:
1565   //   c_rarg0   - source array address
1566   //   c_rarg1   - destination array address
1567   //   c_rarg2   - element count, treated as ssize_t, can be zero
1568   //
1569   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1570   // the hardware handle it.  The two dwords within qwords that span
1571   // cache line boundaries will still be loaded and stored atomically.
1572   //
1573   // Side Effects:
1574   //   disjoint_int_copy_entry is set to the no-overlap entry point
1575   //   used by generate_conjoint_int_oop_copy().
1576   //
1577   address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
1578                                   const char *name, bool dest_uninitialized = false) {
1579     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1580     __ align(CodeEntryAlignment);
1581     StubCodeMark mark(this, "StubRoutines", name);
1582     address start = __ pc();
1583     __ enter();
1584 
1585     if (entry != NULL) {
1586       *entry = __ pc();
1587       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1588       BLOCK_COMMENT("Entry:");
1589     }
1590 
1591     if (is_oop) {
1592       __ push(RegSet::of(d, count), sp);
1593       // no registers are destroyed by this call
1594       gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
1595     }
1596     copy_memory(aligned, s, d, count, rscratch1, size);
1597     if (is_oop) {
1598       __ pop(RegSet::of(d, count), sp);
1599       if (VerifyOops)
1600         verify_oop_array(size, d, count, r16);
1601       __ sub(count, count, 1); // make an inclusive end pointer
1602       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1603       gen_write_ref_array_post_barrier(d, count, rscratch1);
1604     }
1605     __ leave();
1606     __ mov(r0, zr); // return 0
1607     __ ret(lr);
1608 #ifdef BUILTIN_SIM
1609     {
1610       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1611       sim->notifyCompile(const_cast<char*>(name), start);
1612     }
1613 #endif
1614     return start;
1615   }
1616 
1617   // Arguments:
1618   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1619   //             ignored
1620   //   is_oop  - true => oop array, so generate store check code
1621   //   name    - stub name string
1622   //
1623   // Inputs:
1624   //   c_rarg0   - source array address
1625   //   c_rarg1   - destination array address
1626   //   c_rarg2   - element count, treated as ssize_t, can be zero
1627   //
1628   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1629   // the hardware handle it.  The two dwords within qwords that span
1630   // cache line boundaries will still be loaded and stored atomically.
1631   //
1632   address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
1633                                  address *entry, const char *name,
1634                                  bool dest_uninitialized = false) {
1635     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
1636 
1637     StubCodeMark mark(this, "StubRoutines", name);
1638     address start = __ pc();
1639     __ enter();
1640 
1641     if (entry != NULL) {
1642       *entry = __ pc();
1643       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1644       BLOCK_COMMENT("Entry:");
1645     }
1646 
1647     // use fwd copy when (d-s) above_equal (count*size)
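         // The difference is compared unsigned, so if d is below s it wraps
         // to a large value and we still branch to the forward (disjoint)
         // stub; that is safe because a forward copy never clobbers source
         // bytes it has yet to read when dst precedes src.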
1648     __ sub(rscratch1, d, s);
1649     __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
1650     __ br(Assembler::HS, nooverlap_target);
1651 
1652     if (is_oop) {
1653       __ push(RegSet::of(d, count), sp);
1654       // no registers are destroyed by this call
1655       gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
1656     }
1657     copy_memory(aligned, s, d, count, rscratch1, -size);
1658     if (is_oop) {
1659       __ pop(RegSet::of(d, count), sp);
1660       if (VerifyOops)
1661         verify_oop_array(size, d, count, r16);
1662       __ sub(count, count, 1); // make an inclusive end pointer
1663       __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
1664       gen_write_ref_array_post_barrier(d, count, rscratch1);
1665     }
1666     __ leave();
1667     __ mov(r0, zr); // return 0
1668     __ ret(lr);
1669 #ifdef BUILTIN_SIM
1670     {
1671       AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
1672       sim->notifyCompile(const_cast<char*>(name), start);
1673     }
1674 #endif
1675     return start;
1676   }
1677 
1678   // Arguments:
1679   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1680   //             ignored
1681   //   name    - stub name string
1682   //
1683   // Inputs:
1684   //   c_rarg0   - source array address
1685   //   c_rarg1   - destination array address
1686   //   c_rarg2   - element count, treated as ssize_t, can be zero
1687   //
1688   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1689   // we let the hardware handle it.  The one to eight bytes within words,
1690   // dwords or qwords that span cache line boundaries will still be loaded
1691   // and stored atomically.
1692   //
1700   // Side Effects:
1701   //   disjoint_byte_copy_entry is set to the no-overlap entry point
1702   //   used by generate_conjoint_byte_copy().
1703   //
1704   address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
1705     const bool not_oop = false;
1706     return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
1707   }
1708 
1709   // Arguments:
1710   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1711   //             ignored
1712   //   name    - stub name string
1713   //
1714   // Inputs:
1715   //   c_rarg0   - source array address
1716   //   c_rarg1   - destination array address
1717   //   c_rarg2   - element count, treated as ssize_t, can be zero
1718   //
1719   // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
1720   // we let the hardware handle it.  The one to eight bytes within words,
1721   // dwords or qwords that span cache line boundaries will still be loaded
1722   // and stored atomically.
1723   //
1724   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1725                                       address* entry, const char *name) {
1726     const bool not_oop = false;
1727     return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
1728   }
1729 
1730   // Arguments:
1731   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1732   //             ignored
1733   //   name    - stub name string
1734   //
1735   // Inputs:
1736   //   c_rarg0   - source array address
1737   //   c_rarg1   - destination array address
1738   //   c_rarg2   - element count, treated as ssize_t, can be zero
1739   //
1740   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1741   // let the hardware handle it.  The two or four words within dwords
1742   // or qwords that span cache line boundaries will still be loaded
1743   // and stored atomically.
1744   //
1745   // Side Effects:
1746   //   disjoint_short_copy_entry is set to the no-overlap entry point
1747   //   used by generate_conjoint_short_copy().
1748   //
1749   address generate_disjoint_short_copy(bool aligned,
1750                                        address* entry, const char *name) {
1751     const bool not_oop = false;
1752     return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
1753   }
1754 
1755   // Arguments:
1756   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1757   //             ignored
1758   //   name    - stub name string
1759   //
1760   // Inputs:
1761   //   c_rarg0   - source array address
1762   //   c_rarg1   - destination array address
1763   //   c_rarg2   - element count, treated as ssize_t, can be zero
1764   //
1765   // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
1766   // let the hardware handle it.  The two or four words within dwords
1767   // or qwords that span cache line boundaries will still be loaded
1768   // and stored atomically.
1769   //
1770   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1771                                        address *entry, const char *name) {
1772     const bool not_oop = false;
1773     return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
1774 
1775   }
1776   // Arguments:
1777   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1778   //             ignored
1779   //   name    - stub name string
1780   //
1781   // Inputs:
1782   //   c_rarg0   - source array address
1783   //   c_rarg1   - destination array address
1784   //   c_rarg2   - element count, treated as ssize_t, can be zero
1785   //
1786   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1787   // the hardware handle it.  The two dwords within qwords that span
1788   // cache line boundaries will still be loaded and stored atomically.
1789   //
1790   // Side Effects:
1791   //   disjoint_int_copy_entry is set to the no-overlap entry point
1792   //   used by generate_conjoint_int_oop_copy().
1793   //
1794   address generate_disjoint_int_copy(bool aligned, address *entry,
1795                                          const char *name) {
1796     const bool not_oop = false;
1797     return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
1798   }
1799 
1800   // Arguments:
1801   //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
1802   //             ignored
1803   //   name    - stub name string
1804   //
1805   // Inputs:
1806   //   c_rarg0   - source array address
1807   //   c_rarg1   - destination array address
1808   //   c_rarg2   - element count, treated as ssize_t, can be zero
1809   //
1810   // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
1811   // the hardware handle it.  The two dwords within qwords that span
1812   // cache line boundaries will still be loaded and stored atomically.
1813   //
1814   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
1815                                      address *entry, const char *name,
1816                                      bool dest_uninitialized = false) {
1817     const bool not_oop = false;
1818     return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
1819   }
1820 
1821 
1822   // Arguments:
1823   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1824   //             ignored
1825   //   name    - stub name string
1826   //
1827   // Inputs:
1828   //   c_rarg0   - source array address
1829   //   c_rarg1   - destination array address
1830   //   c_rarg2   - element count, treated as size_t, can be zero
1831   //
1832   // Side Effects:
1833   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1834   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1835   //
1836   address generate_disjoint_long_copy(bool aligned, address *entry,
1837                                           const char *name, bool dest_uninitialized = false) {
1838     const bool not_oop = false;
1839     return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
1840   }
1841 
1842   // Arguments:
1843   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1844   //             ignored
1845   //   name    - stub name string
1846   //
1847   // Inputs:
1848   //   c_rarg0   - source array address
1849   //   c_rarg1   - destination array address
1850   //   c_rarg2   - element count, treated as size_t, can be zero
1851   //
1852   address generate_conjoint_long_copy(bool aligned,
1853                                       address nooverlap_target, address *entry,
1854                                       const char *name, bool dest_uninitialized = false) {
1855     const bool not_oop = false;
1856     return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
1857   }
1858 
1859   // Arguments:
1860   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1861   //             ignored
1862   //   name    - stub name string
1863   //
1864   // Inputs:
1865   //   c_rarg0   - source array address
1866   //   c_rarg1   - destination array address
1867   //   c_rarg2   - element count, treated as size_t, can be zero
1868   //
1869   // Side Effects:
1870   //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
1871   //   no-overlap entry point used by generate_conjoint_long_oop_copy().
1872   //
1873   address generate_disjoint_oop_copy(bool aligned, address *entry,
1874                                      const char *name, bool dest_uninitialized) {
1875     const bool is_oop = true;
1876     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1877     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1878   }
1879 
1880   // Arguments:
1881   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1882   //             ignored
1883   //   name    - stub name string
1884   //
1885   // Inputs:
1886   //   c_rarg0   - source array address
1887   //   c_rarg1   - destination array address
1888   //   c_rarg2   - element count, treated as size_t, can be zero
1889   //
1890   address generate_conjoint_oop_copy(bool aligned,
1891                                      address nooverlap_target, address *entry,
1892                                      const char *name, bool dest_uninitialized) {
1893     const bool is_oop = true;
1894     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1895     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1896                                   name, dest_uninitialized);
1897   }
1898 
1899 
1900   // Helper for generating a dynamic type check.
1901   // Smashes rscratch1.
1902   void generate_type_check(Register sub_klass,
1903                            Register super_check_offset,
1904                            Register super_klass,
1905                            Label& L_success) {
1906     assert_different_registers(sub_klass, super_check_offset, super_klass);
1907 
1908     BLOCK_COMMENT("type_check:");
1909 
1910     Label L_miss;
1911 
1912     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
1913                                      super_check_offset);
1914     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1915 
1916     // Fall through on failure!
1917     __ BIND(L_miss);
1918   }
1919 
1920   //
1921   //  Generate checkcasting array copy stub
1922   //
1923   //  Input:
1924   //    c_rarg0   - source array address
1925   //    c_rarg1   - destination array address
1926   //    c_rarg2   - element count, treated as ssize_t, can be zero
1927   //    c_rarg3   - size_t ckoff (super_check_offset)
1928   //    c_rarg4   - oop ckval (super_klass)
1929   //
1930   //  Output:
1931   //    r0 ==  0  -  success
1932   //    r0 == -1^K - failure, where K is partial transfer count
1933   //
1934   address generate_checkcast_copy(const char *name, address *entry,
1935                                   bool dest_uninitialized = false) {
1936 
1937     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1938 
1939     // Input registers (after setup_arg_regs)
1940     const Register from        = c_rarg0;   // source array address
1941     const Register to          = c_rarg1;   // destination array address
1942     const Register count       = c_rarg2;   // elements count
1943     const Register ckoff       = c_rarg3;   // super_check_offset
1944     const Register ckval       = c_rarg4;   // super_klass
1945 
1946     // Registers used as temps (r18, r19, r20 are save-on-entry)
1947     const Register count_save  = r21;       // orig elements count
1948     const Register start_to    = r20;       // destination array start address
1949     const Register copied_oop  = r18;       // actual oop copied
1950     const Register r19_klass   = r19;       // oop._klass
1951 
1952     //---------------------------------------------------------------
1953     // Assembler stub will be used for this call to arraycopy
1954     // if the two arrays are subtypes of Object[] but the
1955     // destination array type is not equal to or a supertype
1956     // of the source type.  Each element must be separately
1957     // checked.
1958 
1959     assert_different_registers(from, to, count, ckoff, ckval, start_to,
1960                                copied_oop, r19_klass, count_save);
1961 
1962     __ align(CodeEntryAlignment);
1963     StubCodeMark mark(this, "StubRoutines", name);
1964     address start = __ pc();
1965 
1966     __ enter(); // required for proper stackwalking of RuntimeStub frame
1967 
1968 #ifdef ASSERT
1969     // caller guarantees that the arrays really are different
1970     // otherwise, we would have to make conjoint checks
1971     { Label L;
1972       array_overlap_test(L, TIMES_OOP);
1973       __ stop("checkcast_copy within a single array");
1974       __ bind(L);
1975     }
1976 #endif //ASSERT
1977 
1978     // Caller of this entry point must set up the argument registers.
1979     if (entry != NULL) {
1980       *entry = __ pc();
1981       BLOCK_COMMENT("Entry:");
1982     }
1983 
1984      // Empty array:  Nothing to do.
1985     __ cbz(count, L_done);
1986 
1987     __ push(RegSet::of(r18, r19, r20, r21), sp);
1988 
1989 #ifdef ASSERT
1990     BLOCK_COMMENT("assert consistent ckoff/ckval");
1991     // The ckoff and ckval must be mutually consistent,
1992     // even though caller generates both.
1993     { Label L;
1994       int sco_offset = in_bytes(Klass::super_check_offset_offset());
1995       __ ldrw(start_to, Address(ckval, sco_offset));
1996       __ cmpw(ckoff, start_to);
1997       __ br(Assembler::EQ, L);
1998       __ stop("super_check_offset inconsistent");
1999       __ bind(L);
2000     }
2001 #endif //ASSERT
2002 
2003     gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);
2004 
2005     // save the original count
2006     __ mov(count_save, count);
2007 
2008     // Copy from low to high addresses
2009     __ mov(start_to, to);              // Save destination array start address
2010     __ b(L_load_element);
2011 
2012     // ======== begin loop ========
2013     // (Loop is rotated; its entry is L_load_element.)
2014     // Loop control:
2015     //   for (; count != 0; count--) {
2016     //     copied_oop = load_heap_oop(from++);
2017     //     ... generate_type_check ...;
2018     //     store_heap_oop(to++, copied_oop);
2019     //   }
2020     __ align(OptoLoopAlignment);
2021 
2022     __ BIND(L_store_element);
2023     __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop);  // store the oop
2024     __ sub(count, count, 1);
2025     __ cbz(count, L_do_card_marks);
2026 
2027     // ======== loop entry is here ========
2028     __ BIND(L_load_element);
2029     __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop
2030     __ cbz(copied_oop, L_store_element);
2031 
2032     __ load_klass(r19_klass, copied_oop);// query the object klass
2033     generate_type_check(r19_klass, ckoff, ckval, L_store_element);
2034     // ======== end loop ========
2035 
2036     // It was a real error; we must depend on the caller to finish the job.
2037     // Register count = remaining oops, count_save = total oops.
2038     // Emit GC store barriers for the oops we have copied and report
2039     // their number to the caller.
2040 
2041     __ subs(count, count_save, count);     // K = partially copied oop count
2042     __ eon(count, count, zr);                   // report (-1^K) to caller
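         // For example, if 3 oops were stored before the type check failed,
         // K == 3 and r0 becomes ~3 == -4 (i.e. -1^3); K == 0 yields -1 and
         // takes the branch below, skipping the card marks.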
2043     __ br(Assembler::EQ, L_done_pop);
2044 
2045     __ BIND(L_do_card_marks);
2046     __ add(to, to, -heapOopSize);         // make an inclusive end pointer
2047     gen_write_ref_array_post_barrier(start_to, to, rscratch1);
2048 
2049     __ bind(L_done_pop);
2050     __ pop(RegSet::of(r18, r19, r20, r21), sp);
2051     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
2052 
2053     __ bind(L_done);
2054     __ mov(r0, count);
2055     __ leave();
2056     __ ret(lr);
2057 
2058     return start;
2059   }
2060 
2061   // Perform range checks on the proposed arraycopy.
2062   // Kills temp, but nothing else.
2063   // Also, clean the sign bits of src_pos and dst_pos.
2064   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
2065                               Register src_pos, // source position (c_rarg1)
2066                               Register dst,     // destination array oop (c_rarg2)
2067                               Register dst_pos, // destination position (c_rarg3)
2068                               Register length,
2069                               Register temp,
2070                               Label& L_failed) {
2071     BLOCK_COMMENT("arraycopy_range_checks:");
2072 
2073     assert_different_registers(rscratch1, temp);
2074 
2075     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
2076     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
2077     __ addw(temp, length, src_pos);
2078     __ cmpw(temp, rscratch1);
2079     __ br(Assembler::HI, L_failed);
2080 
2081     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
2082     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
2083     __ addw(temp, length, dst_pos);
2084     __ cmpw(temp, rscratch1);
2085     __ br(Assembler::HI, L_failed);
2086 
2087     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
2088     __ movw(src_pos, src_pos);
2089     __ movw(dst_pos, dst_pos);
2090 
2091     BLOCK_COMMENT("arraycopy_range_checks done");
2092   }
2093 
2094   // These stubs get called from some dumb test routine.
2095   // I'll write them properly when they're called from
2096   // something that's actually doing something.
2097   static void fake_arraycopy_stub(address src, address dst, int count) {
2098     assert(count == 0, "huh?");
2099   }
2100 
2101 
2102   //
2103   //  Generate 'unsafe' array copy stub
2104   //  Though just as safe as the other stubs, it takes an unscaled
2105   //  size_t argument instead of an element count.
2106   //
2107   //  Input:
2108   //    c_rarg0   - source array address
2109   //    c_rarg1   - destination array address
2110   //    c_rarg2   - byte count, treated as ssize_t, can be zero
2111   //
2112   // Examines the alignment of the operands and dispatches
2113   // to a long, int, short, or byte copy loop.
2114   //
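       // For example, a 24-byte request with 8-byte-aligned source and
       // destination dispatches to the long loop with a count of 3, while
       // any odd byte in the addresses or size ends up in the byte copy loop.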
2115   address generate_unsafe_copy(const char *name,
2116                                address byte_copy_entry,
2117                                address short_copy_entry,
2118                                address int_copy_entry,
2119                                address long_copy_entry) {
2120     Label L_long_aligned, L_int_aligned, L_short_aligned;
2121     Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
2122 
2123     __ align(CodeEntryAlignment);
2124     StubCodeMark mark(this, "StubRoutines", name);
2125     address start = __ pc();
2126     __ enter(); // required for proper stackwalking of RuntimeStub frame
2127 
2128     // bump this on entry, not on exit:
2129     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);
2130 
2131     __ orr(rscratch1, s, d);
2132     __ orr(rscratch1, rscratch1, count);
2133 
2134     __ andr(rscratch1, rscratch1, BytesPerLong-1);
2135     __ cbz(rscratch1, L_long_aligned);
2136     __ andr(rscratch1, rscratch1, BytesPerInt-1);
2137     __ cbz(rscratch1, L_int_aligned);
2138     __ tbz(rscratch1, 0, L_short_aligned);
2139     __ b(RuntimeAddress(byte_copy_entry));
2140 
2141     __ BIND(L_short_aligned);
2142     __ lsr(count, count, LogBytesPerShort);  // size => short_count
2143     __ b(RuntimeAddress(short_copy_entry));
2144     __ BIND(L_int_aligned);
2145     __ lsr(count, count, LogBytesPerInt);    // size => int_count
2146     __ b(RuntimeAddress(int_copy_entry));
2147     __ BIND(L_long_aligned);
2148     __ lsr(count, count, LogBytesPerLong);   // size => long_count
2149     __ b(RuntimeAddress(long_copy_entry));
2150 
2151     return start;
2152   }
2153 
2154   //
2155   //  Generate generic array copy stubs
2156   //
2157   //  Input:
2158   //    c_rarg0    -  src oop
2159   //    c_rarg1    -  src_pos (32-bits)
2160   //    c_rarg2    -  dst oop
2161   //    c_rarg3    -  dst_pos (32-bits)
2162   //    c_rarg4    -  element count (32-bits)
2163   //
2164   //  Output:
2165   //    r0 ==  0  -  success
2166   //    r0 == -1^K - failure, where K is partial transfer count
2167   //
2168   address generate_generic_copy(const char *name,
2169                                 address byte_copy_entry, address short_copy_entry,
2170                                 address int_copy_entry, address oop_copy_entry,
2171                                 address long_copy_entry, address checkcast_copy_entry) {
2172 
2173     Label L_failed, L_failed_0, L_objArray;
2174     Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;
2175 
2176     // Input registers
2177     const Register src        = c_rarg0;  // source array oop
2178     const Register src_pos    = c_rarg1;  // source position
2179     const Register dst        = c_rarg2;  // destination array oop
2180     const Register dst_pos    = c_rarg3;  // destination position
2181     const Register length     = c_rarg4;
2182 
2183     StubCodeMark mark(this, "StubRoutines", name);
2184 
2185     __ align(CodeEntryAlignment);
2186     address start = __ pc();
2187 
2188     __ enter(); // required for proper stackwalking of RuntimeStub frame
2189 
2190     // bump this on entry, not on exit:
2191     inc_counter_np(SharedRuntime::_generic_array_copy_ctr);
2192 
2193     //-----------------------------------------------------------------------
2194     // Assembler stub will be used for this call to arraycopy
2195     // if the following conditions are met:
2196     //
2197     // (1) src and dst must not be null.
2198     // (2) src_pos must not be negative.
2199     // (3) dst_pos must not be negative.
2200     // (4) length  must not be negative.
2201     // (5) src klass and dst klass should be the same and not NULL.
2202     // (6) src and dst should be arrays.
2203     // (7) src_pos + length must not exceed length of src.
2204     // (8) dst_pos + length must not exceed length of dst.
2205     //
2206 
2207     //  if (src == NULL) return -1;
2208     __ cbz(src, L_failed);
2209 
2210     //  if (src_pos < 0) return -1;
2211     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2212 
2213     //  if (dst == NULL) return -1;
2214     __ cbz(dst, L_failed);
2215 
2216     //  if (dst_pos < 0) return -1;
2217     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2218 
2219     // registers used as temp
2220     const Register scratch_length    = r16; // elements count to copy
2221     const Register scratch_src_klass = r17; // array klass
2222     const Register lh                = r18; // layout helper
2223 
2224     //  if (length < 0) return -1;
2225     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2226     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2227 
2228     __ load_klass(scratch_src_klass, src);
2229 #ifdef ASSERT
2230     //  assert(src->klass() != NULL);
2231     {
2232       BLOCK_COMMENT("assert klasses not null {");
2233       Label L1, L2;
2234       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2235       __ bind(L1);
2236       __ stop("broken null klass");
2237       __ bind(L2);
2238       __ load_klass(rscratch1, dst);
2239       __ cbz(rscratch1, L1);     // this would be broken also
2240       BLOCK_COMMENT("} assert klasses not null done");
2241     }
2242 #endif
2243 
2244     // Load layout helper (32-bits)
2245     //
2246     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2247     // 32        30    24            16              8     2                 0
2248     //
2249     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2250     //
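         // For example, a primitive int[] has log2_element_size == 2 in the
         // low byte and the typeArray tag in the top bits; the header_size
         // field is the byte offset of element 0, which is what gets
         // extracted into rscratch1_offset further down.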
2251 
2252     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2253 
2254     // Handle objArrays completely differently...
2255     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2256     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2257     __ movw(rscratch1, objArray_lh);
2258     __ eorw(rscratch2, lh, rscratch1);
2259     __ cbzw(rscratch2, L_objArray);
2260 
2261     //  if (src->klass() != dst->klass()) return -1;
2262     __ load_klass(rscratch2, dst);
2263     __ eor(rscratch2, rscratch2, scratch_src_klass);
2264     __ cbnz(rscratch2, L_failed);
2265 
2266     //  if (!src->is_Array()) return -1;
2267     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2268 
2269     // At this point, it is known to be a typeArray (array_tag 0x3).
2270 #ifdef ASSERT
2271     {
2272       BLOCK_COMMENT("assert primitive array {");
2273       Label L;
2274       __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2275       __ cmpw(lh, rscratch2);
2276       __ br(Assembler::GE, L);
2277       __ stop("must be a primitive array");
2278       __ bind(L);
2279       BLOCK_COMMENT("} assert primitive array done");
2280     }
2281 #endif
2282 
2283     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2284                            rscratch2, L_failed);
2285 
2286     // TypeArrayKlass
2287     //
2288     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2289     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2290     //
2291 
2292     const Register rscratch1_offset = rscratch1;    // array offset
2293     const Register r18_elsize = lh; // element size
2294 
2295     __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
2296            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
2297     __ add(src, src, rscratch1_offset);           // src array offset
2298     __ add(dst, dst, rscratch1_offset);           // dst array offset
2299     BLOCK_COMMENT("choose copy loop based on element size");
2300 
2301     // next registers should be set before the jump to corresponding stub
2302     const Register from     = c_rarg0;  // source array address
2303     const Register to       = c_rarg1;  // destination array address
2304     const Register count    = c_rarg2;  // elements count
2305 
2306     // 'from', 'to' and 'count' must be set in this order, since they
2307     // alias 'src', 'src_pos' and 'dst' respectively.
2308 
2309     assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
2310 
2311     // The possible values of elsize are 0-3, i.e. exact_log2(element
2312     // size in bytes).  We do a simple bitwise binary search.
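         // elsize 0 (bytes):  bit 1 clear, bit 0 clear -> fall through below
         // elsize 1 (shorts): bit 1 clear, bit 0 set   -> L_copy_shorts
         // elsize 2 (ints):   bit 1 set,   bit 0 clear -> L_copy_ints
         // elsize 3 (longs):  bit 1 set,   bit 0 set   -> L_copy_ints, L_copy_longs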
2313   __ BIND(L_copy_bytes);
2314     __ tbnz(r18_elsize, 1, L_copy_ints);
2315     __ tbnz(r18_elsize, 0, L_copy_shorts);
2316     __ lea(from, Address(src, src_pos));// src_addr
2317     __ lea(to,   Address(dst, dst_pos));// dst_addr
2318     __ movw(count, scratch_length); // length
2319     __ b(RuntimeAddress(byte_copy_entry));
2320 
2321   __ BIND(L_copy_shorts);
2322     __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
2323     __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
2324     __ movw(count, scratch_length); // length
2325     __ b(RuntimeAddress(short_copy_entry));
2326 
2327   __ BIND(L_copy_ints);
2328     __ tbnz(r18_elsize, 0, L_copy_longs);
2329     __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
2330     __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
2331     __ movw(count, scratch_length); // length
2332     __ b(RuntimeAddress(int_copy_entry));
2333 
2334   __ BIND(L_copy_longs);
2335 #ifdef ASSERT
2336     {
2337       BLOCK_COMMENT("assert long copy {");
2338       Label L;
2339       __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
2340       __ cmpw(r18_elsize, LogBytesPerLong);
2341       __ br(Assembler::EQ, L);
2342       __ stop("must be long copy, but elsize is wrong");
2343       __ bind(L);
2344       BLOCK_COMMENT("} assert long copy done");
2345     }
2346 #endif
2347     __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
2348     __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
2349     __ movw(count, scratch_length); // length
2350     __ b(RuntimeAddress(long_copy_entry));
2351 
2352     // ObjArrayKlass
2353   __ BIND(L_objArray);
2354     // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
2355 
2356     Label L_plain_copy, L_checkcast_copy;
2357     //  test array classes for subtyping
2358     __ load_klass(r18, dst);
2359     __ cmp(scratch_src_klass, r18); // usual case is exact equality
2360     __ br(Assembler::NE, L_checkcast_copy);
2361 
2362     // Identically typed arrays can be copied without element-wise checks.
2363     arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2364                            rscratch2, L_failed);
2365 
2366     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2367     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2368     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2369     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2370     __ movw(count, scratch_length); // length
2371   __ BIND(L_plain_copy);
2372     __ b(RuntimeAddress(oop_copy_entry));
2373 
2374   __ BIND(L_checkcast_copy);
2375     // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
2376     {
2377       // Before looking at dst.length, make sure dst is also an objArray.
2378       __ ldrw(rscratch1, Address(r18, lh_offset));
2379       __ movw(rscratch2, objArray_lh);
2380       __ eorw(rscratch1, rscratch1, rscratch2);
2381       __ cbnzw(rscratch1, L_failed);
2382 
2383       // It is safe to examine both src.length and dst.length.
2384       arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
2385                              r18, L_failed);
2386 
2387       const Register rscratch2_dst_klass = rscratch2;
2388       __ load_klass(rscratch2_dst_klass, dst); // reload
2389 
2390       // Marshal the base address arguments now, freeing registers.
2391       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2392       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2393       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2394       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2395       __ movw(count, length);           // length (reloaded)
2396       Register sco_temp = c_rarg3;      // this register is free now
2397       assert_different_registers(from, to, count, sco_temp,
2398                                  rscratch2_dst_klass, scratch_src_klass);
2399       // assert_clean_int(count, sco_temp);
2400 
2401       // Generate the type check.
2402       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2403       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2404       // assert_clean_int(sco_temp, r18);
2405       generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy);
2406 
2407       // Fetch destination element klass from the ObjArrayKlass header.
2408       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2409       __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset));
2410       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2411 
2412       // the checkcast_copy loop needs two extra arguments:
2413       assert(c_rarg3 == sco_temp, "#3 already in place");
2414       // Set up arguments for checkcast_copy_entry.
2415       __ mov(c_rarg4, rscratch2_dst_klass);  // dst.klass.element_klass
2416       __ b(RuntimeAddress(checkcast_copy_entry));
2417     }
2418 
2419   __ BIND(L_failed);
2420     __ mov(r0, -1);
2421     __ leave();   // required for proper stackwalking of RuntimeStub frame
2422     __ ret(lr);
2423 
2424     return start;
2425   }
2426 
2427   //
2428   // Generate stub for array fill. If "aligned" is true, the
2429   // "to" address is assumed to be heapword aligned.
2430   //
2431   // Arguments for generated stub:
2432   //   to:    c_rarg0
2433   //   value: c_rarg1
2434   //   count: c_rarg2 treated as signed
2435   //
2436   address generate_fill(BasicType t, bool aligned, const char *name) {
2437     __ align(CodeEntryAlignment);
2438     StubCodeMark mark(this, "StubRoutines", name);
2439     address start = __ pc();
2440 
2441     BLOCK_COMMENT("Entry:");
2442 
2443     const Register to        = c_rarg0;  // destination array address
2444     const Register value     = c_rarg1;  // value
2445     const Register count     = c_rarg2;  // elements count
2446 
2447     const Register bz_base = r10;        // base for block_zero routine
2448     const Register cnt_words = r11;      // temp register
2449 
2450     __ enter();
2451 
2452     Label L_fill_elements, L_exit1;
2453 
2454     int shift = -1;
2455     switch (t) {
2456       case T_BYTE:
2457         shift = 0;
2458         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2459         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2460         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2461         __ br(Assembler::LO, L_fill_elements);
2462         break;
2463       case T_SHORT:
2464         shift = 1;
2465         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2466         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2467         __ br(Assembler::LO, L_fill_elements);
2468         break;
2469       case T_INT:
2470         shift = 2;
2471         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2472         __ br(Assembler::LO, L_fill_elements);
2473         break;
2474       default: ShouldNotReachHere();
2475     }
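         // Whichever case was taken, value now holds the fill pattern
         // replicated to 32 bits (e.g. a byte value of 0xAB has become
         // 0xABABABAB); it is widened to 64 bits just before the bulk fill.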
2476 
2477     // Align source address at 8 bytes address boundary.
2478     Label L_skip_align1, L_skip_align2, L_skip_align4;
2479     if (!aligned) {
2480       switch (t) {
2481         case T_BYTE:
2482           // One byte misalignment happens only for byte arrays.
2483           __ tbz(to, 0, L_skip_align1);
2484           __ strb(value, Address(__ post(to, 1)));
2485           __ subw(count, count, 1);
2486           __ bind(L_skip_align1);
2487           // Fallthrough
2488         case T_SHORT:
2489           // Two bytes misalignment happens only for byte and short (char) arrays.
2490           __ tbz(to, 1, L_skip_align2);
2491           __ strh(value, Address(__ post(to, 2)));
2492           __ subw(count, count, 2 >> shift);
2493           __ bind(L_skip_align2);
2494           // Fallthrough
2495         case T_INT:
2496           // Align to 8 bytes, we know we are 4 byte aligned to start.
2497           __ tbz(to, 2, L_skip_align4);
2498           __ strw(value, Address(__ post(to, 4)));
2499           __ subw(count, count, 4 >> shift);
2500           __ bind(L_skip_align4);
2501           break;
2502         default: ShouldNotReachHere();
2503       }
2504     }
2505 
2506     //
2507     //  Fill large chunks
2508     //
2509     __ lsrw(cnt_words, count, 3 - shift); // number of words
2510     __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2511     __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2512     if (UseBlockZeroing) {
2513       Label non_block_zeroing, rest;
2514       Register tmp = rscratch1;
2515       // count >= BlockZeroingLowLimit && value == 0
2516       __ subs(tmp, cnt_words, BlockZeroingLowLimit >> 3);
2517       __ ccmp(value, 0 /* comparing value */, 0 /* NZCV */, Assembler::GE);
2518       __ br(Assembler::NE, non_block_zeroing);
2519       __ mov(bz_base, to);
2520       __ block_zero(bz_base, cnt_words, true);
2521       __ mov(to, bz_base);
2522       __ b(rest);
2523       __ bind(non_block_zeroing);
2524       __ fill_words(to, cnt_words, value);
2525       __ bind(rest);
2526     }
2527     else {
2528       __ fill_words(to, cnt_words, value);
2529     }
2530 
2531     // Remaining count is less than 8 bytes. Fill it by a single store.
2532     // Note that the total length is no less than 8 bytes.
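         // Rough sketch, assuming fill_words leaves 'to' just past the words
         // it stored: for an aligned 11-byte fill, 8 bytes are filled above,
         // count is now 3, and the single str below lands at (end - 8),
         // rewriting the last 5 already-filled bytes along with the 3 new ones.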
2533     if (t == T_BYTE || t == T_SHORT) {
2534       Label L_exit1;
2535       __ cbzw(count, L_exit1);
2536       __ add(to, to, count, Assembler::LSL, shift); // points to the end
2537       __ str(value, Address(to, -8));    // overwrite some elements
2538       __ bind(L_exit1);
2539       __ leave();
2540       __ ret(lr);
2541     }
2542 
2543     // Handle copies less than 8 bytes.
2544     Label L_fill_2, L_fill_4, L_exit2;
2545     __ bind(L_fill_elements);
2546     switch (t) {
2547       case T_BYTE:
2548         __ tbz(count, 0, L_fill_2);
2549         __ strb(value, Address(__ post(to, 1)));
2550         __ bind(L_fill_2);
2551         __ tbz(count, 1, L_fill_4);
2552         __ strh(value, Address(__ post(to, 2)));
2553         __ bind(L_fill_4);
2554         __ tbz(count, 2, L_exit2);
2555         __ strw(value, Address(to));
2556         break;
2557       case T_SHORT:
2558         __ tbz(count, 0, L_fill_4);
2559         __ strh(value, Address(__ post(to, 2)));
2560         __ bind(L_fill_4);
2561         __ tbz(count, 1, L_exit2);
2562         __ strw(value, Address(to));
2563         break;
2564       case T_INT:
2565         __ cbzw(count, L_exit2);
2566         __ strw(value, Address(to));
2567         break;
2568       default: ShouldNotReachHere();
2569     }
2570     __ bind(L_exit2);
2571     __ leave();
2572     __ ret(lr);
2573     return start;
2574   }
2575 
2576   void generate_arraycopy_stubs() {
2577     address entry;
2578     address entry_jbyte_arraycopy;
2579     address entry_jshort_arraycopy;
2580     address entry_jint_arraycopy;
2581     address entry_oop_arraycopy;
2582     address entry_jlong_arraycopy;
2583     address entry_checkcast_arraycopy;
2584 
2585     generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2586     generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2587 
2588     StubRoutines::aarch64::_zero_longs = generate_zero_longs(r10, r11);
2589 
2590     //*** jbyte
2591     // Always need aligned and unaligned versions
2592     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2593                                                                                   "jbyte_disjoint_arraycopy");
2594     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2595                                                                                   &entry_jbyte_arraycopy,
2596                                                                                   "jbyte_arraycopy");
2597     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2598                                                                                   "arrayof_jbyte_disjoint_arraycopy");
2599     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2600                                                                                   "arrayof_jbyte_arraycopy");
2601 
2602     //*** jshort
2603     // Always need aligned and unaligned versions
2604     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2605                                                                                     "jshort_disjoint_arraycopy");
2606     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2607                                                                                     &entry_jshort_arraycopy,
2608                                                                                     "jshort_arraycopy");
2609     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2610                                                                                     "arrayof_jshort_disjoint_arraycopy");
2611     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2612                                                                                     "arrayof_jshort_arraycopy");
2613 
2614     //*** jint
2615     // Aligned versions
2616     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2617                                                                                 "arrayof_jint_disjoint_arraycopy");
2618     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2619                                                                                 "arrayof_jint_arraycopy");
2620     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2621     // entry_jint_arraycopy always points to the unaligned version
2622     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
2623                                                                                 "jint_disjoint_arraycopy");
2624     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
2625                                                                                 &entry_jint_arraycopy,
2626                                                                                 "jint_arraycopy");
2627 
2628     //*** jlong
2629     // It is always aligned
2630     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
2631                                                                                   "arrayof_jlong_disjoint_arraycopy");
2632     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
2633                                                                                   "arrayof_jlong_arraycopy");
2634     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
2635     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
2636 
2637     //*** oops
2638     {
2639       // With compressed oops we need unaligned versions; notice that
2640       // we overwrite entry_oop_arraycopy.
2641       bool aligned = !UseCompressedOops;
2642 
2643       StubRoutines::_arrayof_oop_disjoint_arraycopy
2644         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
2645                                      /*dest_uninitialized*/false);
2646       StubRoutines::_arrayof_oop_arraycopy
2647         = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
2648                                      /*dest_uninitialized*/false);
2649       // Aligned versions without pre-barriers
2650       StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
2651         = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
2652                                      /*dest_uninitialized*/true);
2653       StubRoutines::_arrayof_oop_arraycopy_uninit
2654         = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
2655                                      /*dest_uninitialized*/true);
2656     }
2657 
2658     StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
2659     StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
2660     StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
2661     StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
2662 
2663     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
2664     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
2665                                                                         /*dest_uninitialized*/true);
2666 
2667     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
2668                                                               entry_jbyte_arraycopy,
2669                                                               entry_jshort_arraycopy,
2670                                                               entry_jint_arraycopy,
2671                                                               entry_jlong_arraycopy);
2672 
2673     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
2674                                                                entry_jbyte_arraycopy,
2675                                                                entry_jshort_arraycopy,
2676                                                                entry_jint_arraycopy,
2677                                                                entry_oop_arraycopy,
2678                                                                entry_jlong_arraycopy,
2679                                                                entry_checkcast_arraycopy);
2680 
2681     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
2682     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
2683     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
2684     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
2685     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2686     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
2687   }
2688 
2689   void generate_math_stubs() { Unimplemented(); }
2690 
2691   // Arguments:
2692   //
2693   // Inputs:
2694   //   c_rarg0   - source byte array address
2695   //   c_rarg1   - destination byte array address
2696   //   c_rarg2   - K (key) in little endian int array
2697   //
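       // The expanded key length read below is in 32-bit words: 44, 52 or 60
       // for AES-128, AES-192 and AES-256 respectively, which is why the code
       // compares keylen against 44 and 52 to decide how many rounds remain.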
2698   address generate_aescrypt_encryptBlock() {
2699     __ align(CodeEntryAlignment);
2700     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2701 
2702     Label L_doLast;
2703 
2704     const Register from        = c_rarg0;  // source array address
2705     const Register to          = c_rarg1;  // destination array address
2706     const Register key         = c_rarg2;  // key array address
2707     const Register keylen      = rscratch1;
2708 
2709     address start = __ pc();
2710     __ enter();
2711 
2712     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2713 
2714     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2715 
2716     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2717     __ rev32(v1, __ T16B, v1);
2718     __ rev32(v2, __ T16B, v2);
2719     __ rev32(v3, __ T16B, v3);
2720     __ rev32(v4, __ T16B, v4);
2721     __ aese(v0, v1);
2722     __ aesmc(v0, v0);
2723     __ aese(v0, v2);
2724     __ aesmc(v0, v0);
2725     __ aese(v0, v3);
2726     __ aesmc(v0, v0);
2727     __ aese(v0, v4);
2728     __ aesmc(v0, v0);
2729 
2730     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2731     __ rev32(v1, __ T16B, v1);
2732     __ rev32(v2, __ T16B, v2);
2733     __ rev32(v3, __ T16B, v3);
2734     __ rev32(v4, __ T16B, v4);
2735     __ aese(v0, v1);
2736     __ aesmc(v0, v0);
2737     __ aese(v0, v2);
2738     __ aesmc(v0, v0);
2739     __ aese(v0, v3);
2740     __ aesmc(v0, v0);
2741     __ aese(v0, v4);
2742     __ aesmc(v0, v0);
2743 
2744     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2745     __ rev32(v1, __ T16B, v1);
2746     __ rev32(v2, __ T16B, v2);
2747 
2748     __ cmpw(keylen, 44);
2749     __ br(Assembler::EQ, L_doLast);
2750 
2751     __ aese(v0, v1);
2752     __ aesmc(v0, v0);
2753     __ aese(v0, v2);
2754     __ aesmc(v0, v0);
2755 
2756     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2757     __ rev32(v1, __ T16B, v1);
2758     __ rev32(v2, __ T16B, v2);
2759 
2760     __ cmpw(keylen, 52);
2761     __ br(Assembler::EQ, L_doLast);
2762 
2763     __ aese(v0, v1);
2764     __ aesmc(v0, v0);
2765     __ aese(v0, v2);
2766     __ aesmc(v0, v0);
2767 
2768     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2769     __ rev32(v1, __ T16B, v1);
2770     __ rev32(v2, __ T16B, v2);
2771 
2772     __ BIND(L_doLast);
2773 
2774     __ aese(v0, v1);
2775     __ aesmc(v0, v0);
2776     __ aese(v0, v2);
2777 
2778     __ ld1(v1, __ T16B, key);
2779     __ rev32(v1, __ T16B, v1);
2780     __ eor(v0, __ T16B, v0, v1);
2781 
2782     __ st1(v0, __ T16B, to);
2783 
2784     __ mov(r0, 0);
2785 
2786     __ leave();
2787     __ ret(lr);
2788 
2789     return start;
2790   }
2791 
2792   // Arguments:
2793   //
2794   // Inputs:
2795   //   c_rarg0   - source byte array address
2796   //   c_rarg1   - destination byte array address
2797   //   c_rarg2   - K (key) in little endian int array
2798   //
2799   address generate_aescrypt_decryptBlock() {
2800     assert(UseAES, "need AES instruction support");
2801     __ align(CodeEntryAlignment);
2802     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2803     Label L_doLast;
2804 
2805     const Register from        = c_rarg0;  // source array address
2806     const Register to          = c_rarg1;  // destination array address
2807     const Register key         = c_rarg2;  // key array address
2808     const Register keylen      = rscratch1;
2809 
2810     address start = __ pc();
2811     __ enter(); // required for proper stackwalking of RuntimeStub frame
2812 
2813     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2814 
2815     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2816 
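    // The first round-key quadword is kept aside in v5: it is consumed
    // only by the final eor, after all the other round keys have been
    // applied by the aesd/aesimc sequence below.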
2817     __ ld1(v5, __ T16B, __ post(key, 16));
2818     __ rev32(v5, __ T16B, v5);
2819 
2820     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2821     __ rev32(v1, __ T16B, v1);
2822     __ rev32(v2, __ T16B, v2);
2823     __ rev32(v3, __ T16B, v3);
2824     __ rev32(v4, __ T16B, v4);
2825     __ aesd(v0, v1);
2826     __ aesimc(v0, v0);
2827     __ aesd(v0, v2);
2828     __ aesimc(v0, v0);
2829     __ aesd(v0, v3);
2830     __ aesimc(v0, v0);
2831     __ aesd(v0, v4);
2832     __ aesimc(v0, v0);
2833 
2834     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2835     __ rev32(v1, __ T16B, v1);
2836     __ rev32(v2, __ T16B, v2);
2837     __ rev32(v3, __ T16B, v3);
2838     __ rev32(v4, __ T16B, v4);
2839     __ aesd(v0, v1);
2840     __ aesimc(v0, v0);
2841     __ aesd(v0, v2);
2842     __ aesimc(v0, v0);
2843     __ aesd(v0, v3);
2844     __ aesimc(v0, v0);
2845     __ aesd(v0, v4);
2846     __ aesimc(v0, v0);
2847 
2848     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2849     __ rev32(v1, __ T16B, v1);
2850     __ rev32(v2, __ T16B, v2);
2851 
2852     __ cmpw(keylen, 44);
2853     __ br(Assembler::EQ, L_doLast);
2854 
2855     __ aesd(v0, v1);
2856     __ aesimc(v0, v0);
2857     __ aesd(v0, v2);
2858     __ aesimc(v0, v0);
2859 
2860     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2861     __ rev32(v1, __ T16B, v1);
2862     __ rev32(v2, __ T16B, v2);
2863 
2864     __ cmpw(keylen, 52);
2865     __ br(Assembler::EQ, L_doLast);
2866 
2867     __ aesd(v0, v1);
2868     __ aesimc(v0, v0);
2869     __ aesd(v0, v2);
2870     __ aesimc(v0, v0);
2871 
2872     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2873     __ rev32(v1, __ T16B, v1);
2874     __ rev32(v2, __ T16B, v2);
2875 
2876     __ BIND(L_doLast);
2877 
2878     __ aesd(v0, v1);
2879     __ aesimc(v0, v0);
2880     __ aesd(v0, v2);
2881 
2882     __ eor(v0, __ T16B, v0, v5);
2883 
2884     __ st1(v0, __ T16B, to);
2885 
2886     __ mov(r0, 0);
2887 
2888     __ leave();
2889     __ ret(lr);
2890 
2891     return start;
2892   }
2893 
2894   // Arguments:
2895   //
2896   // Inputs:
2897   //   c_rarg0   - source byte array address
2898   //   c_rarg1   - destination byte array address
2899   //   c_rarg2   - K (key) in little endian int array
2900   //   c_rarg3   - r vector byte array address
2901   //   c_rarg4   - input length
2902   //
2903   // Output:
2904   //   r0        - input length
2905   //
2906   address generate_cipherBlockChaining_encryptAESCrypt() {
2907     assert(UseAES, "need AES instruction support");
2908     __ align(CodeEntryAlignment);
2909     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2910 
2911     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2912 
2913     const Register from        = c_rarg0;  // source array address
2914     const Register to          = c_rarg1;  // destination array address
2915     const Register key         = c_rarg2;  // key array address
2916     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector address,
2917                                            // and left holding the last encrypted block on return
2918     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
2919     const Register keylen      = rscratch1;
2920 
2921     address start = __ pc();
2922       __ enter();
2923 
2924       __ mov(rscratch2, len_reg);
2925       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2926 
2927       __ ld1(v0, __ T16B, rvec);
2928 
2929       __ cmpw(keylen, 52);
2930       __ br(Assembler::CC, L_loadkeys_44);
2931       __ br(Assembler::EQ, L_loadkeys_52);
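      // Note: the condition flags set by cmpw(keylen, 52) above remain
      // live throughout the stub -- nothing below modifies them -- so the
      // br(CC)/br(EQ) pair inside L_aes_loop reuses them to skip the extra
      // rounds for shorter keys without comparing again.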
2932 
2933       __ ld1(v17, v18, __ T16B, __ post(key, 32));
2934       __ rev32(v17, __ T16B, v17);
2935       __ rev32(v18, __ T16B, v18);
2936     __ BIND(L_loadkeys_52);
2937       __ ld1(v19, v20, __ T16B, __ post(key, 32));
2938       __ rev32(v19, __ T16B, v19);
2939       __ rev32(v20, __ T16B, v20);
2940     __ BIND(L_loadkeys_44);
2941       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2942       __ rev32(v21, __ T16B, v21);
2943       __ rev32(v22, __ T16B, v22);
2944       __ rev32(v23, __ T16B, v23);
2945       __ rev32(v24, __ T16B, v24);
2946       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2947       __ rev32(v25, __ T16B, v25);
2948       __ rev32(v26, __ T16B, v26);
2949       __ rev32(v27, __ T16B, v27);
2950       __ rev32(v28, __ T16B, v28);
2951       __ ld1(v29, v30, v31, __ T16B, key);
2952       __ rev32(v29, __ T16B, v29);
2953       __ rev32(v30, __ T16B, v30);
2954       __ rev32(v31, __ T16B, v31);
2955 
2956     __ BIND(L_aes_loop);
2957       __ ld1(v1, __ T16B, __ post(from, 16));
2958       __ eor(v0, __ T16B, v0, v1);
2959 
2960       __ br(Assembler::CC, L_rounds_44);
2961       __ br(Assembler::EQ, L_rounds_52);
2962 
2963       __ aese(v0, v17); __ aesmc(v0, v0);
2964       __ aese(v0, v18); __ aesmc(v0, v0);
2965     __ BIND(L_rounds_52);
2966       __ aese(v0, v19); __ aesmc(v0, v0);
2967       __ aese(v0, v20); __ aesmc(v0, v0);
2968     __ BIND(L_rounds_44);
2969       __ aese(v0, v21); __ aesmc(v0, v0);
2970       __ aese(v0, v22); __ aesmc(v0, v0);
2971       __ aese(v0, v23); __ aesmc(v0, v0);
2972       __ aese(v0, v24); __ aesmc(v0, v0);
2973       __ aese(v0, v25); __ aesmc(v0, v0);
2974       __ aese(v0, v26); __ aesmc(v0, v0);
2975       __ aese(v0, v27); __ aesmc(v0, v0);
2976       __ aese(v0, v28); __ aesmc(v0, v0);
2977       __ aese(v0, v29); __ aesmc(v0, v0);
2978       __ aese(v0, v30);
2979       __ eor(v0, __ T16B, v0, v31);
2980 
2981       __ st1(v0, __ T16B, __ post(to, 16));
2982       __ sub(len_reg, len_reg, 16);
2983       __ cbnz(len_reg, L_aes_loop);
2984 
2985       __ st1(v0, __ T16B, rvec);
2986 
2987       __ mov(r0, rscratch2);
2988 
2989       __ leave();
2990       __ ret(lr);
2991 
2992       return start;
2993   }
2994 
2995   // Arguments:
2996   //
2997   // Inputs:
2998   //   c_rarg0   - source byte array address
2999   //   c_rarg1   - destination byte array address
3000   //   c_rarg2   - K (key) in little endian int array
3001   //   c_rarg3   - r vector byte array address
3002   //   c_rarg4   - input length
3003   //
3004   // Output:
3005   //   r0        - input length
3006   //
3007   address generate_cipherBlockChaining_decryptAESCrypt() {
3008     assert(UseAES, "need AES instruction support");
3009     __ align(CodeEntryAlignment);
3010     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
3011 
3012     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
3013 
3014     const Register from        = c_rarg0;  // source array address
3015     const Register to          = c_rarg1;  // destination array address
3016     const Register key         = c_rarg2;  // key array address
3017     const Register rvec        = c_rarg3;  // r byte array, initialized from the init vector address,
3018                                            // and left holding the last input ciphertext block on return
3019     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
3020     const Register keylen      = rscratch1;
3021 
3022     address start = __ pc();
3023       __ enter();
3024 
3025       __ mov(rscratch2, len_reg);
3026       __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
3027 
3028       __ ld1(v2, __ T16B, rvec);
3029 
3030       __ ld1(v31, __ T16B, __ post(key, 16));
3031       __ rev32(v31, __ T16B, v31);
3032 
3033       __ cmpw(keylen, 52);
3034       __ br(Assembler::CC, L_loadkeys_44);
3035       __ br(Assembler::EQ, L_loadkeys_52);
3036 
3037       __ ld1(v17, v18, __ T16B, __ post(key, 32));
3038       __ rev32(v17, __ T16B, v17);
3039       __ rev32(v18, __ T16B, v18);
3040     __ BIND(L_loadkeys_52);
3041       __ ld1(v19, v20, __ T16B, __ post(key, 32));
3042       __ rev32(v19, __ T16B, v19);
3043       __ rev32(v20, __ T16B, v20);
3044     __ BIND(L_loadkeys_44);
3045       __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
3046       __ rev32(v21, __ T16B, v21);
3047       __ rev32(v22, __ T16B, v22);
3048       __ rev32(v23, __ T16B, v23);
3049       __ rev32(v24, __ T16B, v24);
3050       __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
3051       __ rev32(v25, __ T16B, v25);
3052       __ rev32(v26, __ T16B, v26);
3053       __ rev32(v27, __ T16B, v27);
3054       __ rev32(v28, __ T16B, v28);
3055       __ ld1(v29, v30, __ T16B, key);
3056       __ rev32(v29, __ T16B, v29);
3057       __ rev32(v30, __ T16B, v30);
3058 
3059     __ BIND(L_aes_loop);
3060       __ ld1(v0, __ T16B, __ post(from, 16));
3061       __ orr(v1, __ T16B, v0, v0);
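      // v1 keeps an untouched copy of the ciphertext block: once the block
      // has been decrypted and XORed with the previous chaining value in v2,
      // that copy becomes the new chaining value (see the orr into v2 below).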
3062 
3063       __ br(Assembler::CC, L_rounds_44);
3064       __ br(Assembler::EQ, L_rounds_52);
3065 
3066       __ aesd(v0, v17); __ aesimc(v0, v0);
3067       __ aesd(v0, v18); __ aesimc(v0, v0);
3068     __ BIND(L_rounds_52);
3069       __ aesd(v0, v19); __ aesimc(v0, v0);
3070       __ aesd(v0, v20); __ aesimc(v0, v0);
3071     __ BIND(L_rounds_44);
3072       __ aesd(v0, v21); __ aesimc(v0, v0);
3073       __ aesd(v0, v22); __ aesimc(v0, v0);
3074       __ aesd(v0, v23); __ aesimc(v0, v0);
3075       __ aesd(v0, v24); __ aesimc(v0, v0);
3076       __ aesd(v0, v25); __ aesimc(v0, v0);
3077       __ aesd(v0, v26); __ aesimc(v0, v0);
3078       __ aesd(v0, v27); __ aesimc(v0, v0);
3079       __ aesd(v0, v28); __ aesimc(v0, v0);
3080       __ aesd(v0, v29); __ aesimc(v0, v0);
3081       __ aesd(v0, v30);
3082       __ eor(v0, __ T16B, v0, v31);
3083       __ eor(v0, __ T16B, v0, v2);
3084 
3085       __ st1(v0, __ T16B, __ post(to, 16));
3086       __ orr(v2, __ T16B, v1, v1);
3087 
3088       __ sub(len_reg, len_reg, 16);
3089       __ cbnz(len_reg, L_aes_loop);
3090 
3091       __ st1(v2, __ T16B, rvec);
3092 
3093       __ mov(r0, rscratch2);
3094 
3095       __ leave();
3096       __ ret(lr);
3097 
3098     return start;
3099   }
3100 
3101   // Arguments:
3102   //
3103   // Inputs:
3104   //   c_rarg0   - byte[]  source+offset
3105   //   c_rarg1   - int[]   SHA.state
3106   //   c_rarg2   - int     offset
3107   //   c_rarg3   - int     limit
3108   //
3109   address generate_sha1_implCompress(bool multi_block, const char *name) {
3110     __ align(CodeEntryAlignment);
3111     StubCodeMark mark(this, "StubRoutines", name);
3112     address start = __ pc();
3113 
3114     Register buf   = c_rarg0;
3115     Register state = c_rarg1;
3116     Register ofs   = c_rarg2;
3117     Register limit = c_rarg3;
3118 
3119     Label keys;
3120     Label sha1_loop;
3121 
3122     // load the keys into v0..v3
3123     __ adr(rscratch1, keys);
3124     __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
3125     // load the 5-word SHA-1 state into v6, v7
3126     __ ldrq(v6, Address(state, 0));
3127     __ ldrs(v7, Address(state, 16));
3128 
3129 
3130     __ BIND(sha1_loop);
3131     // load 64 bytes of data into v16..v19
3132     __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
3133     __ rev32(v16, __ T16B, v16);
3134     __ rev32(v17, __ T16B, v17);
3135     __ rev32(v18, __ T16B, v18);
3136     __ rev32(v19, __ T16B, v19);
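    // SHA-1 treats the message as big-endian 32-bit words, so the
    // little-endian loads above are byte-swapped within each word by rev32.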
3137 
3138     // do the sha1
3139     __ addv(v4, __ T4S, v16, v0);
3140     __ orr(v20, __ T16B, v6, v6);
3141 
3142     FloatRegister d0 = v16;
3143     FloatRegister d1 = v17;
3144     FloatRegister d2 = v18;
3145     FloatRegister d3 = v19;
3146 
3147     for (int round = 0; round < 20; round++) {
3148       FloatRegister tmp1 = (round & 1) ? v4 : v5;
3149       FloatRegister tmp2 = (round & 1) ? v21 : v22;
3150       FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
3151       FloatRegister tmp4 = (round & 1) ? v5 : v4;
3152       FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));
3153 
3154       if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
3155       if (round < 19) __ addv(tmp1, __ T4S, d1, key);
3156       __ sha1h(tmp2, __ T4S, v20);
3157       if (round < 5)
3158         __ sha1c(v20, __ T4S, tmp3, tmp4);
3159       else if (round < 10 || round >= 15)
3160         __ sha1p(v20, __ T4S, tmp3, tmp4);
3161       else
3162         __ sha1m(v20, __ T4S, tmp3, tmp4);
3163       if (round < 16) __ sha1su1(d0, __ T4S, d3);
3164 
3165       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3166     }
3167 
3168     __ addv(v7, __ T2S, v7, v21);
3169     __ addv(v6, __ T4S, v6, v20);
3170 
3171     if (multi_block) {
3172       __ add(ofs, ofs, 64);
3173       __ cmp(ofs, limit);
3174       __ br(Assembler::LE, sha1_loop);
3175       __ mov(c_rarg0, ofs); // return ofs
3176     }
3177 
3178     __ strq(v6, Address(state, 0));
3179     __ strs(v7, Address(state, 16));
3180 
3181     __ ret(lr);
3182 
3183     __ bind(keys);
3184     __ emit_int32(0x5a827999);
3185     __ emit_int32(0x6ed9eba1);
3186     __ emit_int32(0x8f1bbcdc);
3187     __ emit_int32(0xca62c1d6);
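    // The four words emitted above are the standard SHA-1 round constants
    // for rounds 0-19, 20-39, 40-59 and 60-79; the ld4r at the top of the
    // stub replicates each one across a whole vector register (v0..v3).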
3188 
3189     return start;
3190   }
3191 
3192 
3193   // Arguments:
3194   //
3195   // Inputs:
3196   //   c_rarg0   - byte[]  source+offset
3197   //   c_rarg1   - int[]   SHA.state
3198   //   c_rarg2   - int     offset
3199   //   c_rarg3   - int     limit
3200   //
3201   address generate_sha256_implCompress(bool multi_block, const char *name) {
3202     static const uint32_t round_consts[64] = {
3203       0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
3204       0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
3205       0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
3206       0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
3207       0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
3208       0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
3209       0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
3210       0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
3211       0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
3212       0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
3213       0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
3214       0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
3215       0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
3216       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
3217       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
3218       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
3219     };
3220     __ align(CodeEntryAlignment);
3221     StubCodeMark mark(this, "StubRoutines", name);
3222     address start = __ pc();
3223 
3224     Register buf   = c_rarg0;
3225     Register state = c_rarg1;
3226     Register ofs   = c_rarg2;
3227     Register limit = c_rarg3;
3228 
3229     Label sha256_loop;
3230 
3231     __ stpd(v8, v9, __ pre(sp, -32));
3232     __ stpd(v10, v11, Address(sp, 16));
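    // v8..v11 are used for the message schedule below.  The low 64 bits of
    // v8..v15 are callee-saved under the AArch64 procedure call standard,
    // so they are spilled here and reloaded just before the return.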
3233 
3234 // dga == v0
3235 // dgb == v1
3236 // dg0 == v2
3237 // dg1 == v3
3238 // dg2 == v4
3239 // t0 == v6
3240 // t1 == v7
3241 
3242     // load the 64 round constants into v16..v31, four per register
3243     __ lea(rscratch1, ExternalAddress((address)round_consts));
3244     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
3245     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
3246     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
3247     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
3248 
3249     // load the 8-word (256-bit) state
3250     __ ldpq(v0, v1, state);
3251 
3252     __ BIND(sha256_loop);
3253     // load 64 bytes of data into v8..v11
3254     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
3255     __ rev32(v8, __ T16B, v8);
3256     __ rev32(v9, __ T16B, v9);
3257     __ rev32(v10, __ T16B, v10);
3258     __ rev32(v11, __ T16B, v11);
3259 
3260     __ addv(v6, __ T4S, v8, v16);
3261     __ orr(v2, __ T16B, v0, v0);
3262     __ orr(v3, __ T16B, v1, v1);
3263 
3264     FloatRegister d0 = v8;
3265     FloatRegister d1 = v9;
3266     FloatRegister d2 = v10;
3267     FloatRegister d3 = v11;
3268 
3269 
3270     for (int round = 0; round < 16; round++) {
3271       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3272       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3273       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3274       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3275 
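      // as_FloatRegister(round + 17) is this iteration's round-constant
      // vector: v16 was already consumed by the addv before the loop, so
      // rounds 0..14 pick up v17..v31 here.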
3276       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3277        __ orr(v4, __ T16B, v2, v2);
3278       if (round < 15)
3279         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3280       __ sha256h(v2, __ T4S, v3, tmp2);
3281       __ sha256h2(v3, __ T4S, v4, tmp2);
3282       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3283 
3284       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3285     }
3286 
3287     __ addv(v0, __ T4S, v0, v2);
3288     __ addv(v1, __ T4S, v1, v3);
3289 
3290     if (multi_block) {
3291       __ add(ofs, ofs, 64);
3292       __ cmp(ofs, limit);
3293       __ br(Assembler::LE, sha256_loop);
3294       __ mov(c_rarg0, ofs); // return ofs
3295     }
3296 
3297     __ ldpd(v10, v11, Address(sp, 16));
3298     __ ldpd(v8, v9, __ post(sp, 32));
3299 
3300     __ stpq(v0, v1, state);
3301 
3302     __ ret(lr);
3303 
3304     return start;
3305   }
3306 
3307 #ifndef BUILTIN_SIM
3308   // Safefetch stubs.
3309   void generate_safefetch(const char* name, int size, address* entry,
3310                           address* fault_pc, address* continuation_pc) {
3311     // safefetch signatures:
3312     //   int      SafeFetch32(int*      adr, int      errValue);
3313     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3314     //
3315     // arguments:
3316     //   c_rarg0 = adr
3317     //   c_rarg1 = errValue
3318     //
3319     // result:
3320     //   r0 = *adr or errValue
3321 
3322     StubCodeMark mark(this, "StubRoutines", name);
3323 
3324     // Entry point, pc or function descriptor.
3325     *entry = __ pc();
3326 
3327     // Load *adr into c_rarg1, may fault.
3328     *fault_pc = __ pc();
3329     switch (size) {
3330       case 4:
3331         // int32_t
3332         __ ldrw(c_rarg1, Address(c_rarg0, 0));
3333         break;
3334       case 8:
3335         // int64_t
3336         __ ldr(c_rarg1, Address(c_rarg0, 0));
3337         break;
3338       default:
3339         ShouldNotReachHere();
3340     }
3341 
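    // If the load above faults, the VM's signal handler resumes execution
    // at *continuation_pc with c_rarg1 still holding errValue, so the move
    // below returns either the loaded value or errValue.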
3342     // return errValue or *adr
3343     *continuation_pc = __ pc();
3344     __ mov(r0, c_rarg1);
3345     __ ret(lr);
3346   }
3347 #endif
3348 
3349   /**
3350    *  Arguments:
3351    *
3352    * Inputs:
3353    *   c_rarg0   - int crc
3354    *   c_rarg1   - byte* buf
3355    *   c_rarg2   - int length
3356    *
3357    * Output:
3358    *       r0   - int crc result
3359    */
3360   address generate_updateBytesCRC32() {
3361     assert(UseCRC32Intrinsics, "what are we doing here?");
3362 
3363     __ align(CodeEntryAlignment);
3364     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3365 
3366     address start = __ pc();
3367 
3368     const Register crc   = c_rarg0;  // crc
3369     const Register buf   = c_rarg1;  // source java byte array address
3370     const Register len   = c_rarg2;  // length
3371     const Register table0 = c_rarg3; // crc_table address
3372     const Register table1 = c_rarg4;
3373     const Register table2 = c_rarg5;
3374     const Register table3 = c_rarg6;
3375     const Register tmp3 = c_rarg7;
3376 
3377     BLOCK_COMMENT("Entry:");
3378     __ enter(); // required for proper stackwalking of RuntimeStub frame
3379 
3380     __ kernel_crc32(crc, buf, len,
3381               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3382 
3383     __ leave(); // required for proper stackwalking of RuntimeStub frame
3384     __ ret(lr);
3385 
3386     return start;
3387   }
3388 
3389   /**
3390    *  Arguments:
3391    *
3392    * Inputs:
3393    *   c_rarg0   - int crc
3394    *   c_rarg1   - byte* buf
3395    *   c_rarg2   - int length
3396    *   c_rarg3   - int* table
3397    *
3398    * Output:
3399    *       r0   - int crc result
3400    */
3401   address generate_updateBytesCRC32C() {
3402     assert(UseCRC32CIntrinsics, "what are we doing here?");
3403 
3404     __ align(CodeEntryAlignment);
3405     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3406 
3407     address start = __ pc();
3408 
3409     const Register crc   = c_rarg0;  // crc
3410     const Register buf   = c_rarg1;  // source java byte array address
3411     const Register len   = c_rarg2;  // length
3412     const Register table0 = c_rarg3; // crc_table address
3413     const Register table1 = c_rarg4;
3414     const Register table2 = c_rarg5;
3415     const Register table3 = c_rarg6;
3416     const Register tmp3 = c_rarg7;
3417 
3418     BLOCK_COMMENT("Entry:");
3419     __ enter(); // required for proper stackwalking of RuntimeStub frame
3420 
3421     __ kernel_crc32c(crc, buf, len,
3422               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3423 
3424     __ leave(); // required for proper stackwalking of RuntimeStub frame
3425     __ ret(lr);
3426 
3427     return start;
3428   }
3429 
3430   /**
3431    *  Arguments:
3432    *
3433    *  Inputs:
3434    *   c_rarg0   - int   adler
3435    *   c_rarg1   - byte* buff
3436    *   c_rarg2   - int   len
3437    *
3438    * Output:
3439    *   c_rarg0   - int adler result
3440    */
3441   address generate_updateBytesAdler32() {
3442     __ align(CodeEntryAlignment);
3443     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3444     address start = __ pc();
3445 
3446     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3447 
3448     // Aliases
3449     Register adler  = c_rarg0;
3450     Register s1     = c_rarg0;
3451     Register s2     = c_rarg3;
3452     Register buff   = c_rarg1;
3453     Register len    = c_rarg2;
3454     Register nmax  = r4;
3455     Register base = r5;
3456     Register count = r6;
3457     Register temp0 = rscratch1;
3458     Register temp1 = rscratch2;
3459     Register temp2 = r7;
3460 
3461     // Max number of bytes we can process before having to take the mod
3462     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3463     unsigned long BASE = 0xfff1;
3464     unsigned long NMAX = 0x15B0;
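    // The modular reductions below avoid division: since 2^16 mod BASE == 15,
    // a partial sum s can be folded as
    //   s = (s >> 16) * 15 + (s & 0xffff);   // the lsr/lsl/sub/add sequences
    // applied once or twice, and finished with a single conditional subtract
    // of BASE (the subs/csel pairs).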
3465 
3466     __ mov(base, BASE);
3467     __ mov(nmax, NMAX);
3468 
3469     // s1 is initialized to the lower 16 bits of adler
3470     // s2 is initialized to the upper 16 bits of adler
3471     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
3472     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
3473 
3474     // The pipelined loop needs at least 16 bytes per iteration; for shorter
3475     // inputs it is cheaper to skip straight to the byte-at-a-time cleanup loop
3476     __ cmp(len, 16);
3477     __ br(Assembler::HS, L_nmax);
3478     __ cbz(len, L_combine);
3479 
3480     __ bind(L_simple_by1_loop);
3481     __ ldrb(temp0, Address(__ post(buff, 1)));
3482     __ add(s1, s1, temp0);
3483     __ add(s2, s2, s1);
3484     __ subs(len, len, 1);
3485     __ br(Assembler::HI, L_simple_by1_loop);
3486 
3487     // s1 = s1 % BASE
3488     __ subs(temp0, s1, base);
3489     __ csel(s1, temp0, s1, Assembler::HS);
3490 
3491     // s2 = s2 % BASE
3492     __ lsr(temp0, s2, 16);
3493     __ lsl(temp1, temp0, 4);
3494     __ sub(temp1, temp1, temp0);
3495     __ add(s2, temp1, s2, ext::uxth);
3496 
3497     __ subs(temp0, s2, base);
3498     __ csel(s2, temp0, s2, Assembler::HS);
3499 
3500     __ b(L_combine);
3501 
3502     __ bind(L_nmax);
3503     __ subs(len, len, nmax);
3504     __ sub(count, nmax, 16);
3505     __ br(Assembler::LO, L_by16);
3506 
3507     __ bind(L_nmax_loop);
3508 
3509     __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3510 
3511     __ add(s1, s1, temp0, ext::uxtb);
3512     __ ubfx(temp2, temp0, 8, 8);
3513     __ add(s2, s2, s1);
3514     __ add(s1, s1, temp2);
3515     __ ubfx(temp2, temp0, 16, 8);
3516     __ add(s2, s2, s1);
3517     __ add(s1, s1, temp2);
3518     __ ubfx(temp2, temp0, 24, 8);
3519     __ add(s2, s2, s1);
3520     __ add(s1, s1, temp2);
3521     __ ubfx(temp2, temp0, 32, 8);
3522     __ add(s2, s2, s1);
3523     __ add(s1, s1, temp2);
3524     __ ubfx(temp2, temp0, 40, 8);
3525     __ add(s2, s2, s1);
3526     __ add(s1, s1, temp2);
3527     __ ubfx(temp2, temp0, 48, 8);
3528     __ add(s2, s2, s1);
3529     __ add(s1, s1, temp2);
3530     __ add(s2, s2, s1);
3531     __ add(s1, s1, temp0, Assembler::LSR, 56);
3532     __ add(s2, s2, s1);
3533 
3534     __ add(s1, s1, temp1, ext::uxtb);
3535     __ ubfx(temp2, temp1, 8, 8);
3536     __ add(s2, s2, s1);
3537     __ add(s1, s1, temp2);
3538     __ ubfx(temp2, temp1, 16, 8);
3539     __ add(s2, s2, s1);
3540     __ add(s1, s1, temp2);
3541     __ ubfx(temp2, temp1, 24, 8);
3542     __ add(s2, s2, s1);
3543     __ add(s1, s1, temp2);
3544     __ ubfx(temp2, temp1, 32, 8);
3545     __ add(s2, s2, s1);
3546     __ add(s1, s1, temp2);
3547     __ ubfx(temp2, temp1, 40, 8);
3548     __ add(s2, s2, s1);
3549     __ add(s1, s1, temp2);
3550     __ ubfx(temp2, temp1, 48, 8);
3551     __ add(s2, s2, s1);
3552     __ add(s1, s1, temp2);
3553     __ add(s2, s2, s1);
3554     __ add(s1, s1, temp1, Assembler::LSR, 56);
3555     __ add(s2, s2, s1);
3556 
3557     __ subs(count, count, 16);
3558     __ br(Assembler::HS, L_nmax_loop);
3559 
3560     // s1 = s1 % BASE
3561     __ lsr(temp0, s1, 16);
3562     __ lsl(temp1, temp0, 4);
3563     __ sub(temp1, temp1, temp0);
3564     __ add(temp1, temp1, s1, ext::uxth);
3565 
3566     __ lsr(temp0, temp1, 16);
3567     __ lsl(s1, temp0, 4);
3568     __ sub(s1, s1, temp0);
3569     __ add(s1, s1, temp1, ext::uxth);
3570 
3571     __ subs(temp0, s1, base);
3572     __ csel(s1, temp0, s1, Assembler::HS);
3573 
3574     // s2 = s2 % BASE
3575     __ lsr(temp0, s2, 16);
3576     __ lsl(temp1, temp0, 4);
3577     __ sub(temp1, temp1, temp0);
3578     __ add(temp1, temp1, s2, ext::uxth);
3579 
3580     __ lsr(temp0, temp1, 16);
3581     __ lsl(s2, temp0, 4);
3582     __ sub(s2, s2, temp0);
3583     __ add(s2, s2, temp1, ext::uxth);
3584 
3585     __ subs(temp0, s2, base);
3586     __ csel(s2, temp0, s2, Assembler::HS);
3587 
3588     __ subs(len, len, nmax);
3589     __ sub(count, nmax, 16);
3590     __ br(Assembler::HS, L_nmax_loop);
3591 
3592     __ bind(L_by16);
3593     __ adds(len, len, count);
3594     __ br(Assembler::LO, L_by1);
3595 
3596     __ bind(L_by16_loop);
3597 
3598     __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3599 
3600     __ add(s1, s1, temp0, ext::uxtb);
3601     __ ubfx(temp2, temp0, 8, 8);
3602     __ add(s2, s2, s1);
3603     __ add(s1, s1, temp2);
3604     __ ubfx(temp2, temp0, 16, 8);
3605     __ add(s2, s2, s1);
3606     __ add(s1, s1, temp2);
3607     __ ubfx(temp2, temp0, 24, 8);
3608     __ add(s2, s2, s1);
3609     __ add(s1, s1, temp2);
3610     __ ubfx(temp2, temp0, 32, 8);
3611     __ add(s2, s2, s1);
3612     __ add(s1, s1, temp2);
3613     __ ubfx(temp2, temp0, 40, 8);
3614     __ add(s2, s2, s1);
3615     __ add(s1, s1, temp2);
3616     __ ubfx(temp2, temp0, 48, 8);
3617     __ add(s2, s2, s1);
3618     __ add(s1, s1, temp2);
3619     __ add(s2, s2, s1);
3620     __ add(s1, s1, temp0, Assembler::LSR, 56);
3621     __ add(s2, s2, s1);
3622 
3623     __ add(s1, s1, temp1, ext::uxtb);
3624     __ ubfx(temp2, temp1, 8, 8);
3625     __ add(s2, s2, s1);
3626     __ add(s1, s1, temp2);
3627     __ ubfx(temp2, temp1, 16, 8);
3628     __ add(s2, s2, s1);
3629     __ add(s1, s1, temp2);
3630     __ ubfx(temp2, temp1, 24, 8);
3631     __ add(s2, s2, s1);
3632     __ add(s1, s1, temp2);
3633     __ ubfx(temp2, temp1, 32, 8);
3634     __ add(s2, s2, s1);
3635     __ add(s1, s1, temp2);
3636     __ ubfx(temp2, temp1, 40, 8);
3637     __ add(s2, s2, s1);
3638     __ add(s1, s1, temp2);
3639     __ ubfx(temp2, temp1, 48, 8);
3640     __ add(s2, s2, s1);
3641     __ add(s1, s1, temp2);
3642     __ add(s2, s2, s1);
3643     __ add(s1, s1, temp1, Assembler::LSR, 56);
3644     __ add(s2, s2, s1);
3645 
3646     __ subs(len, len, 16);
3647     __ br(Assembler::HS, L_by16_loop);
3648 
3649     __ bind(L_by1);
3650     __ adds(len, len, 15);
3651     __ br(Assembler::LO, L_do_mod);
3652 
3653     __ bind(L_by1_loop);
3654     __ ldrb(temp0, Address(__ post(buff, 1)));
3655     __ add(s1, temp0, s1);
3656     __ add(s2, s2, s1);
3657     __ subs(len, len, 1);
3658     __ br(Assembler::HS, L_by1_loop);
3659 
3660     __ bind(L_do_mod);
3661     // s1 = s1 % BASE
3662     __ lsr(temp0, s1, 16);
3663     __ lsl(temp1, temp0, 4);
3664     __ sub(temp1, temp1, temp0);
3665     __ add(temp1, temp1, s1, ext::uxth);
3666 
3667     __ lsr(temp0, temp1, 16);
3668     __ lsl(s1, temp0, 4);
3669     __ sub(s1, s1, temp0);
3670     __ add(s1, s1, temp1, ext::uxth);
3671 
3672     __ subs(temp0, s1, base);
3673     __ csel(s1, temp0, s1, Assembler::HS);
3674 
3675     // s2 = s2 % BASE
3676     __ lsr(temp0, s2, 16);
3677     __ lsl(temp1, temp0, 4);
3678     __ sub(temp1, temp1, temp0);
3679     __ add(temp1, temp1, s2, ext::uxth);
3680 
3681     __ lsr(temp0, temp1, 16);
3682     __ lsl(s2, temp0, 4);
3683     __ sub(s2, s2, temp0);
3684     __ add(s2, s2, temp1, ext::uxth);
3685 
3686     __ subs(temp0, s2, base);
3687     __ csel(s2, temp0, s2, Assembler::HS);
3688 
3689     // Combine lower bits and higher bits
3690     __ bind(L_combine);
3691     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3692 
3693     __ ret(lr);
3694 
3695     return start;
3696   }
3697 
3698   /**
3699    *  Arguments:
3700    *
3701    *  Input:
3702    *    c_rarg0   - x address
3703    *    c_rarg1   - x length
3704    *    c_rarg2   - y address
3705    *    c_rarg3   - y length
3706    *    c_rarg4   - z address
3707    *    c_rarg5   - z length
3708    */
3709   address generate_multiplyToLen() {
3710     __ align(CodeEntryAlignment);
3711     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3712 
3713     address start = __ pc();
3714     const Register x     = r0;
3715     const Register xlen  = r1;
3716     const Register y     = r2;
3717     const Register ylen  = r3;
3718     const Register z     = r4;
3719     const Register zlen  = r5;
3720 
3721     const Register tmp1  = r10;
3722     const Register tmp2  = r11;
3723     const Register tmp3  = r12;
3724     const Register tmp4  = r13;
3725     const Register tmp5  = r14;
3726     const Register tmp6  = r15;
3727     const Register tmp7  = r16;
3728 
3729     BLOCK_COMMENT("Entry:");
3730     __ enter(); // required for proper stackwalking of RuntimeStub frame
3731     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3732     __ leave(); // required for proper stackwalking of RuntimeStub frame
3733     __ ret(lr);
3734 
3735     return start;
3736   }
3737 
3738   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3739                       FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3740                       FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3741     // Karatsuba multiplication performs a 128*128 -> 256-bit
3742     // multiplication in three 128-bit multiplications and a few
3743     // additions.
3744     //
3745     // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3746     // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3747     //
3748     // Inputs:
3749     //
3750     // A0 in a.d[0]     (subkey)
3751     // A1 in a.d[1]
3752     // (A1+A0) in a1_xor_a0.d[0]
3753     //
3754     // B0 in b.d[0]     (state)
3755     // B1 in b.d[1]
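    //
    // Below, result_hi receives C = A1*B1 (pmull2), result_lo receives
    // D = A0*B0 (pmull), and tmp2 receives E = (A1+A0)(B1+B0); the ext/eor
    // sequence then assembles the two middle 64-bit words of the product
    // from C, D and E, and ins() splices them into result_hi:result_lo.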
3756 
3757     __ ext(tmp1, __ T16B, b, b, 0x08);
3758     __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
3759     __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
3760     __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
3761     __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3762 
3763     __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3764     __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3765     __ eor(tmp2, __ T16B, tmp2, tmp4);
3766     __ eor(tmp2, __ T16B, tmp2, tmp3);
3767 
3768     // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3769     __ ins(result_hi, __ D, tmp2, 0, 1);
3770     __ ins(result_lo, __ D, tmp2, 1, 0);
3771   }
3772 
3773   void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3774                     FloatRegister p, FloatRegister z, FloatRegister t1) {
3775     const FloatRegister t0 = result;
3776 
3777     // The GCM field polynomial f is z^128 + p(z), where p =
3778     // z^7+z^2+z+1.
3779     //
3780     //    z^128 === -p(z)  (mod (z^128 + p(z)))
3781     //
3782     // so, given that the product we're reducing is
3783     //    a == lo + hi * z^128
3784     // substituting,
3785     //      === lo - hi * p(z)  (mod (z^128 + p(z)))
3786     //
3787     // we reduce by multiplying hi by p(z) and subtracting the result
3788     // from (i.e. XORing it with) lo.  Because p has no nonzero high
3789     // bits we can do this with two 64-bit multiplications, lo*p and
3790     // hi*p.
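    //
    // Concretely: pmull2 multiplies the high 64 bits of hi by p; the two
    // ext/eor pairs fold that product into the 128 bits immediately below
    // it (the top of lo and the bottom of hi), and the final pmull/eor
    // folds what is left of hi into lo, leaving the reduced value in result.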
3791 
3792     __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3793     __ ext(t1, __ T16B, t0, z, 8);
3794     __ eor(hi, __ T16B, hi, t1);
3795     __ ext(t1, __ T16B, z, t0, 8);
3796     __ eor(lo, __ T16B, lo, t1);
3797     __ pmull(t0, __ T1Q, hi, p, __ T1D);
3798     __ eor(result, __ T16B, lo, t0);
3799   }
3800 
3801   /**
3802    *  Arguments:
3803    *
3804    *  Input:
3805    *  c_rarg0   - current state address
3806    *  c_rarg1   - H key address
3807    *  c_rarg2   - data address
3808    *  c_rarg3   - number of blocks
3809    *
3810    *  Output:
3811    *  Updated state at c_rarg0
3812    */
3813   address generate_ghash_processBlocks() {
3814     // Bafflingly, GCM uses little-endian for the byte order, but
3815     // big-endian for the bit order.  For example, the polynomial 1 is
3816     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
3817     //
3818     // So, we must either reverse the bytes in each word and do
3819     // everything big-endian or reverse the bits in each byte and do
3820     // it little-endian.  On AArch64 it's more idiomatic to reverse
3821     // the bits in each byte (we have an instruction, RBIT, to do
3822     // that) and keep the data in little-endian bit order throught the
3823     // that) and keep the data in little-endian bit order throughout the
3824 
3825     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
3826     __ align(wordSize * 2);
3827     address p = __ pc();
3828     __ emit_int64(0x87);  // The low-order bits of the field
3829                           // polynomial (i.e. p = z^7+z^2+z+1)
3830                           // repeated in the low and high parts of a
3831                           // 128-bit vector
3832     __ emit_int64(0x87);
3833 
3834     __ align(CodeEntryAlignment);
3835     address start = __ pc();
3836 
3837     Register state   = c_rarg0;
3838     Register subkeyH = c_rarg1;
3839     Register data    = c_rarg2;
3840     Register blocks  = c_rarg3;
3841 
3842     FloatRegister vzr = v30;
3843     __ eor(vzr, __ T16B, vzr, vzr); // zero register
3844 
3845     __ ldrq(v0, Address(state));
3846     __ ldrq(v1, Address(subkeyH));
3847 
3848     __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
3849     __ rbit(v0, __ T16B, v0);
3850     __ rev64(v1, __ T16B, v1);
3851     __ rbit(v1, __ T16B, v1);
3852 
3853     __ ldrq(v26, p);
3854 
3855     __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v16
3856     __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
3857 
3858     {
3859       Label L_ghash_loop;
3860       __ bind(L_ghash_loop);
3861 
3862       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
3863                                                  // reversing each byte
3864       __ rbit(v2, __ T16B, v2);
3865       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
3866 
3867       // Multiply state in v2 by subkey in v1
3868       ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
3869                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
3870                      /*temps*/v6, v20, v18, v21);
3871       // Reduce v7:v5 by the field polynomial
3872       ghash_reduce(v0, v5, v7, v26, vzr, v20);
3873 
3874       __ sub(blocks, blocks, 1);
3875       __ cbnz(blocks, L_ghash_loop);
3876     }
3877 
3878     // The bit-reversed result is at this point in v0
3879     __ rev64(v1, __ T16B, v0);
3880     __ rbit(v1, __ T16B, v1);
3881 
3882     __ st1(v1, __ T16B, state);
3883     __ ret(lr);
3884 
3885     return start;
3886   }
3887 
3888   // Continuation point for throwing of implicit exceptions that are
3889   // not handled in the current activation. Fabricates an exception
3890   // oop and initiates normal exception dispatching in this
3891   // frame. Since we need to preserve callee-saved values (currently
3892   // only for C2, but done for C1 as well) we need a callee-saved oop
3893   // map and therefore have to make these stubs into RuntimeStubs
3894   // rather than BufferBlobs.  If the compiler needs all registers to
3895   // be preserved between the fault point and the exception handler
3896   // then it must assume responsibility for that in
3897   // AbstractCompiler::continuation_for_implicit_null_exception or
3898   // continuation_for_implicit_division_by_zero_exception. All other
3899   // implicit exceptions (e.g., NullPointerException or
3900   // AbstractMethodError on entry) are either at call sites or
3901   // otherwise assume that stack unwinding will be initiated, so
3902   // caller saved registers were assumed volatile in the compiler.
3903 
3904 #undef __
3905 #define __ masm->
3906 
3907   address generate_throw_exception(const char* name,
3908                                    address runtime_entry,
3909                                    Register arg1 = noreg,
3910                                    Register arg2 = noreg) {
3911     // Information about frame layout at time of blocking runtime call.
3912     // Note that we only have to preserve callee-saved registers since
3913     // the compilers are responsible for supplying a continuation point
3914     // if they expect all registers to be preserved.
3915     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
3916     enum layout {
3917       rfp_off = 0,
3918       rfp_off2,
3919       return_off,
3920       return_off2,
3921       framesize // inclusive of return address
3922     };
3923 
3924     int insts_size = 512;
3925     int locs_size  = 64;
3926 
3927     CodeBuffer code(name, insts_size, locs_size);
3928     OopMapSet* oop_maps  = new OopMapSet();
3929     MacroAssembler* masm = new MacroAssembler(&code);
3930 
3931     address start = __ pc();
3932 
3933     // This is an inlined and slightly modified version of call_VM
3934     // which has the ability to fetch the return PC out of
3935     // thread-local storage and also sets up last_Java_sp slightly
3936     // differently than the real call_VM
3937 
3938     __ enter(); // Save FP and LR before call
3939 
3940     assert(is_even(framesize/2), "sp not 16-byte aligned");
3941 
3942     // lr and fp are already in place
3943     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
3944 
3945     int frame_complete = __ pc() - start;
3946 
3947     // Set up last_Java_sp and last_Java_fp
3948     address the_pc = __ pc();
3949     __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1);
3950 
3951     // Call runtime
3952     if (arg1 != noreg) {
3953       assert(arg2 != c_rarg1, "clobbered");
3954       __ mov(c_rarg1, arg1);
3955     }
3956     if (arg2 != noreg) {
3957       __ mov(c_rarg2, arg2);
3958     }
3959     __ mov(c_rarg0, rthread);
3960     BLOCK_COMMENT("call runtime_entry");
3961     __ mov(rscratch1, runtime_entry);
3962     __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);
3963 
3964     // Generate oop map
3965     OopMap* map = new OopMap(framesize, 0);
3966 
3967     oop_maps->add_gc_map(the_pc - start, map);
3968 
3969     __ reset_last_Java_frame(true);
3970     __ maybe_isb();
3971 
3972     __ leave();
3973 
3974     // check for pending exceptions
3975 #ifdef ASSERT
3976     Label L;
3977     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
3978     __ cbnz(rscratch1, L);
3979     __ should_not_reach_here();
3980     __ bind(L);
3981 #endif // ASSERT
3982     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3983 
3984 
3985     // codeBlob framesize is in words (not VMRegImpl::slot_size)
3986     RuntimeStub* stub =
3987       RuntimeStub::new_runtime_stub(name,
3988                                     &code,
3989                                     frame_complete,
3990                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3991                                     oop_maps, false);
3992     return stub->entry_point();
3993   }
3994 
3995   class MontgomeryMultiplyGenerator : public MacroAssembler {
3996 
3997     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
3998       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
3999 
4000     RegSet _toSave;
4001     bool _squaring;
4002 
4003   public:
4004     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
4005       : MacroAssembler(as->code()), _squaring(squaring) {
4006 
4007       // Register allocation
4008 
4009       Register reg = c_rarg0;
4010       Pa_base = reg;       // Argument registers
4011       if (squaring)
4012         Pb_base = Pa_base;
4013       else
4014         Pb_base = ++reg;
4015       Pn_base = ++reg;
4016       Rlen= ++reg;
4017       inv = ++reg;
4018       Pm_base = ++reg;
4019 
4020                           // Working registers:
4021       Ra =  ++reg;        // The current digit of a, b, n, and m.
4022       Rb =  ++reg;
4023       Rm =  ++reg;
4024       Rn =  ++reg;
4025 
4026       Pa =  ++reg;        // Pointers to the current/next digit of a, b, n, and m.
4027       Pb =  ++reg;
4028       Pm =  ++reg;
4029       Pn =  ++reg;
4030 
4031       t0 =  ++reg;        // Three registers which form a
4032       t1 =  ++reg;        // triple-precision accumulator.
4033       t2 =  ++reg;
4034 
4035       Ri =  ++reg;        // Inner and outer loop indexes.
4036       Rj =  ++reg;
4037 
4038       Rhi_ab = ++reg;     // Product registers: low and high parts
4039       Rlo_ab = ++reg;     // of a*b and m*n.
4040       Rhi_mn = ++reg;
4041       Rlo_mn = ++reg;
4042 
4043       // r19 and up are callee-saved.
4044       _toSave = RegSet::range(r19, reg) + Pm_base;
4045     }
4046 
4047   private:
4048     void save_regs() {
4049       push(_toSave, sp);
4050     }
4051 
4052     void restore_regs() {
4053       pop(_toSave, sp);
4054     }
4055 
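    // unroll_2 invokes `block` exactly `count` times.  The loop body is
    // unrolled two deep; an odd count is handled by branching straight to
    // the second copy for the first pass.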
4056     template <typename T>
4057     void unroll_2(Register count, T block) {
4058       Label loop, end, odd;
4059       tbnz(count, 0, odd);
4060       cbz(count, end);
4061       align(16);
4062       bind(loop);
4063       (this->*block)();
4064       bind(odd);
4065       (this->*block)();
4066       subs(count, count, 2);
4067       br(Assembler::GT, loop);
4068       bind(end);
4069     }
4070 
4071     template <typename T>
4072     void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
4073       Label loop, end, odd;
4074       tbnz(count, 0, odd);
4075       cbz(count, end);
4076       align(16);
4077       bind(loop);
4078       (this->*block)(d, s, tmp);
4079       bind(odd);
4080       (this->*block)(d, s, tmp);
4081       subs(count, count, 2);
4082       br(Assembler::GT, loop);
4083       bind(end);
4084     }
4085 
4086     void pre1(RegisterOrConstant i) {
4087       block_comment("pre1");
4088       // Pa = Pa_base;
4089       // Pb = Pb_base + i;
4090       // Pm = Pm_base;
4091       // Pn = Pn_base + i;
4092       // Ra = *Pa;
4093       // Rb = *Pb;
4094       // Rm = *Pm;
4095       // Rn = *Pn;
4096       ldr(Ra, Address(Pa_base));
4097       ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4098       ldr(Rm, Address(Pm_base));
4099       ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4100       lea(Pa, Address(Pa_base));
4101       lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4102       lea(Pm, Address(Pm_base));
4103       lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4104 
4105       // Zero the m*n result.
4106       mov(Rhi_mn, zr);
4107       mov(Rlo_mn, zr);
4108     }
4109 
4110     // The core multiply-accumulate step of a Montgomery
4111     // multiplication.  The idea is to schedule operations as a
4112     // pipeline so that instructions with long latencies (loads and
4113     // multiplies) have time to complete before their results are
4114     // used.  This mostly benefits in-order implementations of the
4115     // architecture, but out-of-order cores gain as well.
4116     void step() {
4117       block_comment("step");
4118       // MACC(Ra, Rb, t0, t1, t2);
4119       // Ra = *++Pa;
4120       // Rb = *--Pb;
4121       umulh(Rhi_ab, Ra, Rb);
4122       mul(Rlo_ab, Ra, Rb);
4123       ldr(Ra, pre(Pa, wordSize));
4124       ldr(Rb, pre(Pb, -wordSize));
4125       acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
4126                                        // previous iteration.
4127       // MACC(Rm, Rn, t0, t1, t2);
4128       // Rm = *++Pm;
4129       // Rn = *--Pn;
4130       umulh(Rhi_mn, Rm, Rn);
4131       mul(Rlo_mn, Rm, Rn);
4132       ldr(Rm, pre(Pm, wordSize));
4133       ldr(Rn, pre(Pn, -wordSize));
4134       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4135     }
4136 
4137     void post1() {
4138       block_comment("post1");
4139 
4140       // MACC(Ra, Rb, t0, t1, t2);
4141       // Ra = *++Pa;
4142       // Rb = *--Pb;
4143       umulh(Rhi_ab, Ra, Rb);
4144       mul(Rlo_ab, Ra, Rb);
4145       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
4146       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4147 
4148       // *Pm = Rm = t0 * inv;
4149       mul(Rm, t0, inv);
4150       str(Rm, Address(Pm));
4151 
4152       // MACC(Rm, Rn, t0, t1, t2);
4153       // t0 = t1; t1 = t2; t2 = 0;
4154       umulh(Rhi_mn, Rm, Rn);
4155 
4156 #ifndef PRODUCT
4157       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
4158       {
4159         mul(Rlo_mn, Rm, Rn);
4160         add(Rlo_mn, t0, Rlo_mn);
4161         Label ok;
4162         cbz(Rlo_mn, ok); {
4163           stop("broken Montgomery multiply");
4164         } bind(ok);
4165       }
4166 #endif
4167       // We have very carefully set things up so that
4168       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
4169       // the lower half of Rm * Rn because we know the result already:
4170       // it must be -t0.  t0 + (-t0) must generate a carry iff
4171       // t0 != 0.  So, rather than do a mul and an adds we just set
4172       // the carry flag iff t0 is nonzero.
4173       //
4174       // mul(Rlo_mn, Rm, Rn);
4175       // adds(zr, t0, Rlo_mn);
4176       subs(zr, t0, 1); // Set carry iff t0 is nonzero
4177       adcs(t0, t1, Rhi_mn);
4178       adc(t1, t2, zr);
4179       mov(t2, zr);
4180     }
4181 
4182     void pre2(RegisterOrConstant i, RegisterOrConstant len) {
4183       block_comment("pre2");
4184       // Pa = Pa_base + i-len;
4185       // Pb = Pb_base + len;
4186       // Pm = Pm_base + i-len;
4187       // Pn = Pn_base + len;
4188 
4189       if (i.is_register()) {
4190         sub(Rj, i.as_register(), len);
4191       } else {
4192         mov(Rj, i.as_constant());
4193         sub(Rj, Rj, len);
4194       }
4195       // Rj == i-len
4196 
4197       lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
4198       lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
4199       lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
4200       lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));
4201 
4202       // Ra = *++Pa;
4203       // Rb = *--Pb;
4204       // Rm = *++Pm;
4205       // Rn = *--Pn;
4206       ldr(Ra, pre(Pa, wordSize));
4207       ldr(Rb, pre(Pb, -wordSize));
4208       ldr(Rm, pre(Pm, wordSize));
4209       ldr(Rn, pre(Pn, -wordSize));
4210 
4211       mov(Rhi_mn, zr);
4212       mov(Rlo_mn, zr);
4213     }
4214 
4215     void post2(RegisterOrConstant i, RegisterOrConstant len) {
4216       block_comment("post2");
4217       if (i.is_constant()) {
4218         mov(Rj, i.as_constant()-len.as_constant());
4219       } else {
4220         sub(Rj, i.as_register(), len);
4221       }
4222 
4223       adds(t0, t0, Rlo_mn); // The pending m*n, low part
4224 
4225       // As soon as we know the least significant digit of our result,
4226       // store it.
4227       // Pm_base[i-len] = t0;
4228       str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
4229 
4230       // t0 = t1; t1 = t2; t2 = 0;
4231       adcs(t0, t1, Rhi_mn); // The pending m*n, high part
4232       adc(t1, t2, zr);
4233       mov(t2, zr);
4234     }
4235 
4236     // A carry in t0 after Montgomery multiplication means that we
4237     // should subtract multiples of n from our result in m.  We'll
4238     // keep doing that until there is no carry.
4239     void normalize(RegisterOrConstant len) {
4240       block_comment("normalize");
4241       // while (t0)
4242       //   t0 = sub(Pm_base, Pn_base, t0, len);
4243       Label loop, post, again;
4244       Register cnt = t1, i = t2; // Re-use registers; we're done with them now
4245       cbz(t0, post); {
4246         bind(again); {
4247           mov(i, zr);
4248           mov(cnt, len);
4249           ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
4250           ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4251           subs(zr, zr, zr); // set carry flag, i.e. no borrow
4252           align(16);
4253           bind(loop); {
4254             sbcs(Rm, Rm, Rn);
4255             str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
4256             add(i, i, 1);
4257             ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
4258             ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4259             sub(cnt, cnt, 1);
4260           } cbnz(cnt, loop);
4261           sbc(t0, t0, zr);
4262         } cbnz(t0, again);
4263       } bind(post);
4264     }
4265 
4266     // Move memory at s to d, reversing words.
4267     //    Increments d to end of copied memory
4268     //    Destroys tmp1, tmp2
4269     //    Preserves len
4270     //    Leaves s pointing to the address which was in d at start
4271     void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
4272       assert(tmp1 < r19 && tmp2 < r19, "register corruption");
4273 
4274       lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
4275       mov(tmp1, len);
4276       unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
4277       sub(s, d, len, ext::uxtw, LogBytesPerWord);
4278     }
4279     // where
4280     void reverse1(Register d, Register s, Register tmp) {
4281       ldr(tmp, pre(s, -wordSize));
4282       ror(tmp, tmp, 32);
4283       str(tmp, post(d, wordSize));
4284     }
4285 
4286     void step_squaring() {
4287       // An extra ACC
4288       step();
4289       acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4290     }
4291 
4292     void last_squaring(RegisterOrConstant i) {
4293       Label dont;
4294       // if ((i & 1) == 0) {
4295       tbnz(i.as_register(), 0, dont); {
4296         // MACC(Ra, Rb, t0, t1, t2);
4297         // Ra = *++Pa;
4298         // Rb = *--Pb;
4299         umulh(Rhi_ab, Ra, Rb);
4300         mul(Rlo_ab, Ra, Rb);
4301         acc(Rhi_ab, Rlo_ab, t0, t1, t2);
4302       } bind(dont);
4303     }
4304 
4305     void extra_step_squaring() {
4306       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
4307 
4308       // MACC(Rm, Rn, t0, t1, t2);
4309       // Rm = *++Pm;
4310       // Rn = *--Pn;
4311       umulh(Rhi_mn, Rm, Rn);
4312       mul(Rlo_mn, Rm, Rn);
4313       ldr(Rm, pre(Pm, wordSize));
4314       ldr(Rn, pre(Pn, -wordSize));
4315     }
4316 
4317     void post1_squaring() {
4318       acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
4319 
4320       // *Pm = Rm = t0 * inv;
4321       mul(Rm, t0, inv);
4322       str(Rm, Address(Pm));
4323 
4324       // MACC(Rm, Rn, t0, t1, t2);
4325       // t0 = t1; t1 = t2; t2 = 0;
4326       umulh(Rhi_mn, Rm, Rn);
4327 
4328 #ifndef PRODUCT
4329       // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
4330       {
4331         mul(Rlo_mn, Rm, Rn);
4332         add(Rlo_mn, t0, Rlo_mn);
4333         Label ok;
4334         cbz(Rlo_mn, ok); {
4335           stop("broken Montgomery multiply");
4336         } bind(ok);
4337       }
4338 #endif
4339       // We have very carefully set things up so that
4340       // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
4341       // the lower half of Rm * Rn because we know the result already:
4342       // it must be -t0.  t0 + (-t0) must generate a carry iff
4343       // t0 != 0.  So, rather than do a mul and an adds we just set
4344       // the carry flag iff t0 is nonzero.
4345       //
4346       // mul(Rlo_mn, Rm, Rn);
4347       // adds(zr, t0, Rlo_mn);
4348       subs(zr, t0, 1); // Set carry iff t0 is nonzero
4349       adcs(t0, t1, Rhi_mn);
4350       adc(t1, t2, zr);
4351       mov(t2, zr);
4352     }
4353 
4354     void acc(Register Rhi, Register Rlo,
4355              Register t0, Register t1, Register t2) {
4356       adds(t0, t0, Rlo);
4357       adcs(t1, t1, Rhi);
4358       adc(t2, t2, zr);
4359     }
4360 
4361   public:
4362     /**
4363      * Fast Montgomery multiplication.  The derivation of the
4364      * algorithm is in A Cryptographic Library for the Motorola
4365      * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
4366      *
4367      * Arguments:
4368      *
4369      * Inputs for multiplication:
4370      *   c_rarg0   - int array elements a
4371      *   c_rarg1   - int array elements b
4372      *   c_rarg2   - int array elements n (the modulus)
4373      *   c_rarg3   - int length
4374      *   c_rarg4   - int inv
4375      *   c_rarg5   - int array elements m (the result)
4376      *
4377      * Inputs for squaring:
4378      *   c_rarg0   - int array elements a
4379      *   c_rarg1   - int array elements n (the modulus)
4380      *   c_rarg2   - int length
4381      *   c_rarg3   - int inv
4382      *   c_rarg4   - int array elements m (the result)
4383      *
4384      */
4385     address generate_multiply() {
4386       Label argh, nothing;
4387       bind(argh);
4388       stop("MontgomeryMultiply total_allocation must be <= 8192");
4389 
4390       align(CodeEntryAlignment);
4391       address entry = pc();
4392 
4393       cbzw(Rlen, nothing);
4394 
4395       enter();
4396 
4397       // Make room.
4398       cmpw(Rlen, 512);
4399       br(Assembler::HI, argh);
4400       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
4401       andr(sp, Ra, -2 * wordSize);
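           // The sub reserves Rlen * 4 * sizeof(jint) = 16*Rlen bytes below
           // sp (room for the reversed copies of the operands plus the
           // result buffer) and the andr drops sp to that area, 16-byte
           // aligned; together with the cmpw(Rlen, 512) guard above this
           // caps the allocation at 8192 bytes, matching the stop() message
           // at "argh".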
4402 
4403       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
4404 
4405       {
4406         // Copy input args, reversing as we go.  We use Ra as a
4407         // temporary variable.
4408         reverse(Ra, Pa_base, Rlen, t0, t1);
4409         if (!_squaring)
4410           reverse(Ra, Pb_base, Rlen, t0, t1);
4411         reverse(Ra, Pn_base, Rlen, t0, t1);
4412       }
4413 
4414       // Push all call-saved registers, and also Pm_base, which we'll
4415       // need at the end.
4416       save_regs();
4417 
4418 #ifndef PRODUCT
4419       // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
4420       {
4421         ldr(Rn, Address(Pn_base, 0));
4422         mul(Rlo_mn, Rn, inv);
4423         cmp(Rlo_mn, -1);
4424         Label ok;
4425         br(EQ, ok); {
4426           stop("broken inverse in Montgomery multiply");
4427         } bind(ok);
4428       }
4429 #endif
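           // (Montgomery reduction relies on inv * n[0] == -1 (mod 2^64):
           //  it is what lets post1() / post1_squaring() choose
           //  m[i] = t0 * inv so that the low word of the accumulator
           //  cancels exactly, the property the non-PRODUCT check in
           //  post1_squaring() verifies.)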
4430 
4431       mov(Pm_base, Ra);
4432 
4433       mov(t0, zr);
4434       mov(t1, zr);
4435       mov(t2, zr);
4436 
4437       block_comment("for (int i = 0; i < len; i++) {");
4438       mov(Ri, zr); {
4439         Label loop, end;
4440         cmpw(Ri, Rlen);
4441         br(Assembler::GE, end);
4442 
4443         bind(loop);
4444         pre1(Ri);
4445 
4446         block_comment("  for (j = i; j; j--) {"); {
4447           movw(Rj, Ri);
4448           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
4449         } block_comment("  } // j");
4450 
4451         post1();
4452         addw(Ri, Ri, 1);
4453         cmpw(Ri, Rlen);
4454         br(Assembler::LT, loop);
4455         bind(end);
4456         block_comment("} // i");
4457       }
4458 
4459       block_comment("for (int i = len; i < 2*len; i++) {");
4460       mov(Ri, Rlen); {
4461         Label loop, end;
4462         cmpw(Ri, Rlen, Assembler::LSL, 1);
4463         br(Assembler::GE, end);
4464 
4465         bind(loop);
4466         pre2(Ri, Rlen);
4467 
4468         block_comment("  for (j = len*2-i-1; j; j--) {"); {
4469           lslw(Rj, Rlen, 1);
4470           subw(Rj, Rj, Ri);
4471           subw(Rj, Rj, 1);
4472           unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
4473         } block_comment("  } // j");
4474 
4475         post2(Ri, Rlen);
4476         addw(Ri, Ri, 1);
4477         cmpw(Ri, Rlen, Assembler::LSL, 1);
4478         br(Assembler::LT, loop);
4479         bind(end);
4480       }
4481       block_comment("} // i");
4482 
4483       normalize(Rlen);
4484 
4485       mov(Ra, Pm_base);  // Save Pm_base in Ra
4486       restore_regs();  // Restore caller's Pm_base
4487 
4488       // Copy our result into caller's Pm_base
4489       reverse(Pm_base, Ra, Rlen, t0, t1);
4490 
4491       leave();
4492       bind(nothing);
4493       ret(lr);
4494 
4495       return entry;
4496     }
4497     // In C, approximately:
4498 
4499     // void
4500     // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
4501     //                     unsigned long Pn_base[], unsigned long Pm_base[],
4502     //                     unsigned long inv, int len) {
4503     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4504     //   unsigned long *Pa, *Pb, *Pn, *Pm;
4505     //   unsigned long Ra, Rb, Rn, Rm;
4506 
4507     //   int i;
4508 
4509     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
4510 
4511     //   for (i = 0; i < len; i++) {
4512     //     int j;
4513 
4514     //     Pa = Pa_base;
4515     //     Pb = Pb_base + i;
4516     //     Pm = Pm_base;
4517     //     Pn = Pn_base + i;
4518 
4519     //     Ra = *Pa;
4520     //     Rb = *Pb;
4521     //     Rm = *Pm;
4522     //     Rn = *Pn;
4523 
4524     //     int iters = i;
4525     //     for (j = 0; iters--; j++) {
4526     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
4527     //       MACC(Ra, Rb, t0, t1, t2);
4528     //       Ra = *++Pa;
4529     //       Rb = *--Pb;
4530     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4531     //       MACC(Rm, Rn, t0, t1, t2);
4532     //       Rm = *++Pm;
4533     //       Rn = *--Pn;
4534     //     }
4535 
4536     //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
4537     //     MACC(Ra, Rb, t0, t1, t2);
4538     //     *Pm = Rm = t0 * inv;
4539     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
4540     //     MACC(Rm, Rn, t0, t1, t2);
4541 
4542     //     assert(t0 == 0, "broken Montgomery multiply");
4543 
4544     //     t0 = t1; t1 = t2; t2 = 0;
4545     //   }
4546 
4547     //   for (i = len; i < 2*len; i++) {
4548     //     int j;
4549 
4550     //     Pa = Pa_base + i-len;
4551     //     Pb = Pb_base + len;
4552     //     Pm = Pm_base + i-len;
4553     //     Pn = Pn_base + len;
4554 
4555     //     Ra = *++Pa;
4556     //     Rb = *--Pb;
4557     //     Rm = *++Pm;
4558     //     Rn = *--Pn;
4559 
4560     //     int iters = len*2-i-1;
4561     //     for (j = i-len+1; iters--; j++) {
4562     //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
4563     //       MACC(Ra, Rb, t0, t1, t2);
4564     //       Ra = *++Pa;
4565     //       Rb = *--Pb;
4566     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4567     //       MACC(Rm, Rn, t0, t1, t2);
4568     //       Rm = *++Pm;
4569     //       Rn = *--Pn;
4570     //     }
4571 
4572     //     Pm_base[i-len] = t0;
4573     //     t0 = t1; t1 = t2; t2 = 0;
4574     //   }
4575 
4576     //   while (t0)
4577     //     t0 = sub(Pm_base, Pn_base, t0, len);
4578     // }
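         //
         // (In the sketch above, MACC(A, B, T0, T1, T2) denotes the
         //  64x64->128-bit multiply-accumulate used throughout: the low
         //  half of A*B is added into T0, the high half plus the first
         //  carry into T1, and the final carry into T2, which is what the
         //  acc() helper emits.  The trailing while loop is the usual final
         //  Montgomery correction: while a carry remains in t0, subtract
         //  the modulus from the result once more; normalize(Rlen) plays
         //  that role in the generated code.)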
4579 
4580     /**
4581      * Fast Montgomery squaring.  This uses asymptotically 25% fewer
4582      * multiplies than Montgomery multiplication so it should be up to
4583      * 25% faster.  However, its loop control is more complex and it
4584      * may actually run slower on some machines.
4585      *
4586      * Arguments:
4587      *
4588      * Inputs:
4589      *   c_rarg0   - int array elements a
4590      *   c_rarg1   - int array elements n (the modulus)
4591      *   c_rarg2   - int length
4592      *   c_rarg3   - int inv
4593      *   c_rarg4   - int array elements m (the result)
4594      *
4595      */
4596     address generate_square() {
4597       Label argh;
4598       bind(argh);
4599       stop("MontgomeryMultiply total_allocation must be <= 8192");
4600 
4601       align(CodeEntryAlignment);
4602       address entry = pc();
4603 
4604       enter();
4605 
4606       // Make room.
4607       cmpw(Rlen, 512);
4608       br(Assembler::HI, argh);
4609       sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
4610       andr(sp, Ra, -2 * wordSize);
4611 
4612       lsrw(Rlen, Rlen, 1);  // length in longwords = len/2
4613 
4614       {
4615         // Copy input args, reversing as we go.  We use Ra as a
4616         // temporary variable.
4617         reverse(Ra, Pa_base, Rlen, t0, t1);
4618         reverse(Ra, Pn_base, Rlen, t0, t1);
4619       }
4620 
4621       // Push all call-saved registers, and also Pm_base, which we'll
4622       // need at the end.
4623       save_regs();
4624 
4625       mov(Pm_base, Ra);
4626 
4627       mov(t0, zr);
4628       mov(t1, zr);
4629       mov(t2, zr);
4630 
4631       block_comment("for (int i = 0; i < len; i++) {");
4632       mov(Ri, zr); {
4633         Label loop, end;
4634         bind(loop);
4635         cmp(Ri, Rlen);
4636         br(Assembler::GE, end);
4637 
4638         pre1(Ri);
4639 
4640         block_comment("for (j = (i+1)/2; j; j--) {"); {
4641           add(Rj, Ri, 1);
4642           lsr(Rj, Rj, 1);
4643           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4644         } block_comment("  } // j");
4645 
4646         last_squaring(Ri);
4647 
4648         block_comment("  for (j = i/2; j; j--) {"); {
4649           lsr(Rj, Ri, 1);
4650           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4651         } block_comment("  } // j");
4652 
4653         post1_squaring();
4654         add(Ri, Ri, 1);
4655         cmp(Ri, Rlen);
4656         br(Assembler::LT, loop);
4657 
4658         bind(end);
4659         block_comment("} // i");
4660       }
4661 
4662       block_comment("for (int i = len; i < 2*len; i++) {");
4663       mov(Ri, Rlen); {
4664         Label loop, end;
4665         bind(loop);
4666         cmp(Ri, Rlen, Assembler::LSL, 1);
4667         br(Assembler::GE, end);
4668 
4669         pre2(Ri, Rlen);
4670 
4671         block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
4672           lsl(Rj, Rlen, 1);
4673           sub(Rj, Rj, Ri);
4674           sub(Rj, Rj, 1);
4675           lsr(Rj, Rj, 1);
4676           unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
4677         } block_comment("  } // j");
4678 
4679         last_squaring(Ri);
4680 
4681         block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
4682           lsl(Rj, Rlen, 1);
4683           sub(Rj, Rj, Ri);
4684           lsr(Rj, Rj, 1);
4685           unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
4686         } block_comment("  } // j");
4687 
4688         post2(Ri, Rlen);
4689         add(Ri, Ri, 1);
4690         cmp(Ri, Rlen, Assembler::LSL, 1);
4691 
4692         br(Assembler::LT, loop);
4693         bind(end);
4694         block_comment("} // i");
4695       }
4696 
4697       normalize(Rlen);
4698 
4699       mov(Ra, Pm_base);  // Save Pm_base in Ra
4700       restore_regs();  // Restore caller's Pm_base
4701 
4702       // Copy our result into caller's Pm_base
4703       reverse(Pm_base, Ra, Rlen, t0, t1);
4704 
4705       leave();
4706       ret(lr);
4707 
4708       return entry;
4709     }
4710     // In C, approximately:
4711 
4712     // void
4713     // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
4714     //                   unsigned long Pm_base[], unsigned long inv, int len) {
4715     //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4716     //   unsigned long *Pa, *Pb, *Pn, *Pm;
4717     //   unsigned long Ra, Rb, Rn, Rm;
4718 
4719     //   int i;
4720 
4721     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
4722 
4723     //   for (i = 0; i < len; i++) {
4724     //     int j;
4725 
4726     //     Pa = Pa_base;
4727     //     Pb = Pa_base + i;
4728     //     Pm = Pm_base;
4729     //     Pn = Pn_base + i;
4730 
4731     //     Ra = *Pa;
4732     //     Rb = *Pb;
4733     //     Rm = *Pm;
4734     //     Rn = *Pn;
4735 
4736     //     int iters = (i+1)/2;
4737     //     for (j = 0; iters--; j++) {
4738     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4739     //       MACC2(Ra, Rb, t0, t1, t2);
4740     //       Ra = *++Pa;
4741     //       Rb = *--Pb;
4742     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4743     //       MACC(Rm, Rn, t0, t1, t2);
4744     //       Rm = *++Pm;
4745     //       Rn = *--Pn;
4746     //     }
4747     //     if ((i & 1) == 0) {
4748     //       assert(Ra == Pa_base[j], "must be");
4749     //       MACC(Ra, Ra, t0, t1, t2);
4750     //     }
4751     //     iters = i/2;
4752     //     assert(iters == i-j, "must be");
4753     //     for (; iters--; j++) {
4754     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4755     //       MACC(Rm, Rn, t0, t1, t2);
4756     //       Rm = *++Pm;
4757     //       Rn = *--Pn;
4758     //     }
4759 
4760     //     *Pm = Rm = t0 * inv;
4761     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
4762     //     MACC(Rm, Rn, t0, t1, t2);
4763 
4764     //     assert(t0 == 0, "broken Montgomery multiply");
4765 
4766     //     t0 = t1; t1 = t2; t2 = 0;
4767     //   }
4768 
4769     //   for (i = len; i < 2*len; i++) {
4770     //     int start = i-len+1;
4771     //     int end = start + (len - start)/2;
4772     //     int j;
4773 
4774     //     Pa = Pa_base + i-len;
4775     //     Pb = Pa_base + len;
4776     //     Pm = Pm_base + i-len;
4777     //     Pn = Pn_base + len;
4778 
4779     //     Ra = *++Pa;
4780     //     Rb = *--Pb;
4781     //     Rm = *++Pm;
4782     //     Rn = *--Pn;
4783 
4784     //     int iters = (2*len-i-1)/2;
4785     //     assert(iters == end-start, "must be");
4786     //     for (j = start; iters--; j++) {
4787     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4788     //       MACC2(Ra, Rb, t0, t1, t2);
4789     //       Ra = *++Pa;
4790     //       Rb = *--Pb;
4791     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4792     //       MACC(Rm, Rn, t0, t1, t2);
4793     //       Rm = *++Pm;
4794     //       Rn = *--Pn;
4795     //     }
4796     //     if ((i & 1) == 0) {
4797     //       assert(Ra == Pa_base[j], "must be");
4798     //       MACC(Ra, Ra, t0, t1, t2);
4799     //     }
4800     //     iters =  (2*len-i)/2;
4801     //     assert(iters == len-j, "must be");
4802     //     for (; iters--; j++) {
4803     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4804     //       MACC(Rm, Rn, t0, t1, t2);
4805     //       Rm = *++Pm;
4806     //       Rn = *--Pn;
4807     //     }
4808     //     Pm_base[i-len] = t0;
4809     //     t0 = t1; t1 = t2; t2 = 0;
4810     //   }
4811 
4812     //   while (t0)
4813     //     t0 = sub(Pm_base, Pn_base, t0, len);
4814     // }
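         //
         // (MACC2 is the doubled multiply-accumulate: it adds the A*B
         //  product twice, exploiting the symmetry a[j]*a[i-j] ==
         //  a[i-j]*a[j], so only about half of the a*a cross products need
         //  to be computed while the m*n products are unchanged.  That is
         //  where the "asymptotically 25% fewer multiplies" claimed above
         //  comes from; step_squaring() supplies the extra accumulate.)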
4815   };
4816 
4817   // Initialization
4818   void generate_initial() {
4819     // Generate initial stubs and initialize the entry points
4820 
4821     // Entry points that exist on all platforms.  Note: This is code
4822     // that could be shared among different platforms - however the
4823     // benefit seems to be smaller than the disadvantage of having a
4824     // much more complicated generator structure. See also comment in
4825     // stubRoutines.hpp.
4826 
4827     StubRoutines::_forward_exception_entry = generate_forward_exception();
4828 
4829     StubRoutines::_call_stub_entry =
4830       generate_call_stub(StubRoutines::_call_stub_return_address);
4831 
4832     // This entry is referenced by megamorphic calls.
4833     StubRoutines::_catch_exception_entry = generate_catch_exception();
4834 
4835     // Build this early so it's available for the interpreter.
4836     StubRoutines::_throw_StackOverflowError_entry =
4837       generate_throw_exception("StackOverflowError throw_exception",
4838                                CAST_FROM_FN_PTR(address,
4839                                                 SharedRuntime::
4840                                                 throw_StackOverflowError));
4841     if (UseCRC32Intrinsics) {
4842       // Set the table address before generating the stubs that use it
4843       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
4844       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
4845     }
4846   }
4847 
4848   void generate_all() {
4849     // support for verify_oop (must happen after universe_init)
4850     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
4851     StubRoutines::_throw_AbstractMethodError_entry =
4852       generate_throw_exception("AbstractMethodError throw_exception",
4853                                CAST_FROM_FN_PTR(address,
4854                                                 SharedRuntime::
4855                                                 throw_AbstractMethodError));
4856 
4857     StubRoutines::_throw_IncompatibleClassChangeError_entry =
4858       generate_throw_exception("IncompatibleClassChangeError throw_exception",
4859                                CAST_FROM_FN_PTR(address,
4860                                                 SharedRuntime::
4861                                                 throw_IncompatibleClassChangeError));
4862 
4863     StubRoutines::_throw_NullPointerException_at_call_entry =
4864       generate_throw_exception("NullPointerException at call throw_exception",
4865                                CAST_FROM_FN_PTR(address,
4866                                                 SharedRuntime::
4867                                                 throw_NullPointerException_at_call));
4868 
4869     // arraycopy stubs used by compilers
4870     generate_arraycopy_stubs();
4871 
4872     if (UseMultiplyToLenIntrinsic) {
4873       StubRoutines::_multiplyToLen = generate_multiplyToLen();
4874     }
4875 
4876     if (UseMontgomeryMultiplyIntrinsic) {
4877       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
4878       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
4879       StubRoutines::_montgomeryMultiply = g.generate_multiply();
4880     }
4881 
4882     if (UseMontgomerySquareIntrinsic) {
4883       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
4884       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
4885       // We use generate_multiply() rather than generate_square()
4886       // because it's faster for the sizes of modulus we care about.
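           // (When constructed with squaring == true, the generator aliases
           //  the b operand to a and skips its reverse(), so the multiply
           //  code path computes a*a.)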
4887       StubRoutines::_montgomerySquare = g.generate_multiply();
4888     }
4889 
4890     if (UseShenandoahGC) {
4891       StubRoutines::aarch64::_shenandoah_wb = generate_shenandoah_wb();
4892     }
4893 
4894 #ifndef BUILTIN_SIM
4895     // generate GHASH intrinsics code
4896     if (UseGHASHIntrinsics) {
4897       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
4898     }
4899 
4900     if (UseAESIntrinsics) {
4901       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
4902       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
4903       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
4904       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
4905     }
4906 
4907     if (UseSHA1Intrinsics) {
4908       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
4909       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
4910     }
4911     if (UseSHA256Intrinsics) {
4912       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
4913       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
4914     }
4915 
4916     if (UseCRC32CIntrinsics) {
4917       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
4918     }
4919 
4920     // generate Adler32 intrinsics code
4921     if (UseAdler32Intrinsics) {
4922       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
4923     }
4924 
4925     // Safefetch stubs.
4926     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
4927                                                        &StubRoutines::_safefetch32_fault_pc,
4928                                                        &StubRoutines::_safefetch32_continuation_pc);
4929     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
4930                                                        &StubRoutines::_safefetchN_fault_pc,
4931                                                        &StubRoutines::_safefetchN_continuation_pc);
4932 #endif
4933   }
4934 
4935  public:
4936   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
4937     if (all) {
4938       generate_all();
4939     } else {
4940       generate_initial();
4941     }
4942   }
4943 }; // end class declaration
4944 
4945 void StubGenerator_generate(CodeBuffer* code, bool all) {
4946   StubGenerator g(code, all);
4947 }