/*
 * Copyright (c) 2003, 2018, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
 * Copyright (c) 2015, Linaro Ltd. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_aarch32.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "vm_version_aarch32.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif


// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp.

#undef __
#define __ _masm->
#define TIMES_OOP lsl(exact_log2(4))

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(int& counter) {
    __ lea(rscratch2, ExternalAddress((address)&counter));
    __ ldr(rscratch1, Address(rscratch2));
    __ add(rscratch1, rscratch1, 1);
    __ str(rscratch1, Address(rscratch2));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C.
  //
  // There are only four registers available to house arguments and we're
  // expecting eight; the layout will be as follows:
  //
  // c_rarg0 = call wrapper address
  // c_rarg1 = result
  // c_rarg2 = result type
  // c_rarg3 = method
  // sp -> [ entry_point
  //         parameters -> java params
  //         parameter size (in words)
  //         thread ]                    (address increasing)
  //
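  // For reference, this stub implements (roughly) the CallStub signature
  // declared in stubRoutines.hpp; a hedged sketch of the C++-side call:
  //
  //   typedef void (*CallStub)(address   link,            // call wrapper
  //                            intptr_t* result,
  //                            BasicType result_type,
  //                            Method*   method,
  //                            address   entry_point,
  //                            intptr_t* parameters,
  //                            int       size_of_parameters,
  //                            TRAPS);
  //
  // so the first four arguments arrive in c_rarg0..c_rarg3 and the
  // remainder spill to the stack, as laid out above.
  //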
  // We use a NEW layout for aarch32 so that save and restore can be
  // collapsed into a single load/store.
  // The layout of saved registers now is:
  //   0 [ saved lr       ] <- rfp
  //  -1 [ saved fp       ]
  //  -2 [ r12/rthread    ] Thread passed in args
  //  -3 [ r10/rmethod    ] NOTE omitted rfp as restored automatically
  //  -4 [ r9/rscratch1   ] Platform register?
  //  -5 [ r8/thread      ]
  //  -6 [ r7/rcpool      ]
  //  -7 [ r6/rlocals     ]
  //  -8 [ r5/rbcp        ]
  //  -9 [ r4/rdispatch   ]
  // -10 [ r2/res type    ]
  // -11 [ r1/result      ]
  // -12 [ r0/call wrapper] <- sp (when restored from fp value)
  // -13 maybe alignment
  // -YY [ java arg0      ]
  //  ...
  // -xx [ java argn      ] <- sp on branch into java
  //
  // XXX Note we do not save floating point registers.  Only floating point
  // registers s16-31 / d8-15 need to be saved and these are never touched
  // by template interpreted code.  On a sequence such as C -> Java -> C,
  // the C functions will save them if used.

  address generate_call_stub(address& return_address) {
    /*assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");*/
    const int thread_off = -frame::get_frame_size(VMFrameAPCS) * wordSize;

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();
    __ reg_printf("entering call stub with { sp : %p, rfp : %p, lr : %p}\n", sp, rfp, lr);
    __ enter(VMFrameAPCS); // save rfp & lr and possibly another 2 words

    const int entry_point_arg_off = 1 * wordSize,
              params_arg_off      = 2 * wordSize,
              param_sz_arg_off    = 3 * wordSize,
              thread_arg_off      = 4 * wordSize;
    // r12 is a scratch register so we can clobber it to save thread,
    // which is needed at the end
    __ ldr(r12, Address(rfp, thread_arg_off));
    // r0, r1, r2, r4 - r10, r12
    // we save r0 as the call_wrapper_address is needed elsewhere
    // we save r1, r2 as they hold the result and its type,
    // which are needed on return
    // r12 holds the thread ptr
    unsigned c_save_regset = 0b0001011111110111;
    int nsaved = __ count_bits(c_save_regset);
    __ stmdb(sp, c_save_regset);
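    // A hedged decoding of the mask for reference: bit i selects register
    // ri, so 0b0001011111110111 == {r0, r1, r2, r4-r10, r12}, i.e. the 11
    // registers listed above, and count_bits() yields nsaved == 11.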
    // Offset from rfp to end of stack.
    const int rfp_tos_offset_bytes = frame::get_offset_from_rfp_bytes() + nsaved * wordSize;

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, r12);
    // And method
    __ mov(rmethod, c_rarg3);

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (unsigned)NULL_WORD);
      __ b(L, Assembler::EQ);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    __ ldr(rscratch2, Address(rfp, param_sz_arg_off));
    // align sp at the time we call java
    __ sub(sp, sp, rscratch2, lsl(LogBytesPerWord));
    __ align_stack();
    __ add(sp, sp, rscratch2, lsl(LogBytesPerWord));

    __ ldr(rscratch1, Address(rfp, params_arg_off));

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;

    __ reg_printf("call_stub param_off = %p, param_sz = %d\n", rscratch1, rscratch2);
    __ cmp(rscratch2, 0);
    __ b(parameters_done, Assembler::EQ);

    // r14 makes an ok temp as it's already saved in the frame header
    address loop = __ pc();
    __ ldr(r14, Address(__ post(rscratch1, wordSize)));
    __ subs(rscratch2, rscratch2, 1);

    // TODO remove
    __ reg_printf("\tARG SP[%d] : 0x%08x\n", rscratch2, r14);
    __ cmp(rscratch2, 0);
    // END TODO
    __ push(r14);
    __ b(loop, Assembler::GT);

    __ BIND(parameters_done);

#ifdef ASSERT
    __ verify_stack_alignment();
#endif

    BLOCK_COMMENT("call Java function");
    __ ldr(rscratch1, Address(rfp, entry_point_arg_off));
    __ reg_printf("Calling Java function with rfp = %p, sp = %p\n", rfp, sp);
    __ mov(r4, sp); // set sender sp
    __ bl(rscratch1);
    // save current address for use by exception handling code
    return_address = __ pc();

    __ reg_printf("Returned to call_stub with rfp = %p, sp = %p\n", rfp, sp);

    // At this point rfp should be restored to the value it was set to
    // before; use it to set the top of stack.
    __ sub(sp, rfp, rfp_tos_offset_bytes);

#ifdef ASSERT
    // verify that threads correspond
    __ ldr(r12, Address(rfp, thread_off));
    // rfp points to the register stored at the highest memory location -
    // the first one on the stack, which is the saved lr.  The saved thread
    // is just below that; it is held in r12 at this point.
    {
      Label L, S;
      __ cmp(rthread, r12);
      __ b(S, Assembler::NE);
      __ get_thread(r12);
      __ cmp(rthread, r12);
      __ b(L, Assembler::EQ);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    if (MacroAssembler::enable_debugging_static) {
      // FIXME Remove this hacky debugging code
      Label L;
      __ ldr(rscratch2, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch2, L);
      // If we're returning via an exception then we shouldn't report exit;
      // the exception handler will have already reported the exit, and
      // reporting via our progress through the call stub would result in
      // an extra method being reported as exited.
      __ print_method_exit();
      __ bind(L);
    }

    // NOTE Horrible tricks here.
    // We need to preserve the current r0 and r1 values as they contain the
    // return value.  First we discard the r0 saved to the stack, as it is
    // no longer needed.  We have saved the result and type as c_rarg1 and
    // c_rarg2, so now we alter the regset to load as follows:
    // c_rarg2 = result
    // c_rarg3 = result_type
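    // A hedged sketch of the trick: the ldmia below uses regset
    // {r2, r3, r4-r10, r12}.  After dropping the saved-r0 slot, the saved
    // r1 (result address) is the lowest word and so is loaded into c_rarg2,
    // and the saved r2 (result type) into c_rarg3, because ldmia fills
    // registers in ascending order from ascending addresses; r4-r10 and
    // r12 simply reload their own saved values.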
    assert((c_save_regset & 0xf) == 0b0111, "change me");
    __ add(sp, sp, wordSize);
    const int altered_saved_regset = (~0xf & c_save_regset) | 0xc;
    __ ldmia(sp, altered_saved_regset);

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0

    Label is_object, is_long, is_float, is_double, exit;
    __ cmp(c_rarg3, T_OBJECT);
    __ b(is_object, Assembler::EQ);
    __ cmp(c_rarg3, T_LONG);
    __ b(is_long, Assembler::EQ);
    if (hasFPU()) {
      // soft FP falls through to the T_INT case
      __ cmp(c_rarg3, T_FLOAT);
      __ b(is_float, Assembler::EQ);
    }
    __ cmp(c_rarg3, T_DOUBLE);
    if (hasFPU()) {
      __ b(is_double, Assembler::EQ);
    } else {
      __ b(is_long, Assembler::EQ);
    }

    // handle T_INT case
    __ str(r0, Address(c_rarg2));

    __ BIND(exit);
    __ leave(VMFrameAPCS); // restore rfp, sp, lr
    __ reg_printf("leaving call stub with { sp : %p, rfp : %p, lr : %p}\n", sp, rfp, lr);
    // Pop arguments from stack.
    //__ add(sp, sp, 4 * wordSize);

    __ b(lr);

    // handle return types different from T_INT
    __ BIND(is_object);
    __ mov(r1, 0);

    __ BIND(is_long);
    __ strd(r0, r1, Address(c_rarg2, 0));
    __ b(exit, Assembler::AL);

    if (hasFPU()) {
      __ BIND(is_float);
      __ vstr_f32(f0, Address(c_rarg2, 0));
      __ b(exit, Assembler::AL);

      __ BIND(is_double);
      __ vstr_f64(d0, Address(c_rarg2, 0));
      __ b(exit, Assembler::AL);
    }
    return start;
  }
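  // The result-storing tail of the call stub above, as a hedged C-like
  // sketch (soft-float: T_FLOAT falls into T_INT, T_DOUBLE into T_LONG):
  //
  //   switch (result_type) {
  //     case T_OBJECT: r1 = 0;                       // fall into T_LONG
  //     case T_LONG:   *(jlong*)result = r1:r0; break;
  //     case T_FLOAT:  *(jfloat*)result = f0;   break; // hasFPU() only
  //     case T_DOUBLE: *(jdouble*)result = d0;  break; // hasFPU() only
  //     default:       *(jint*)result = r0;     break; // T_INT and friends
  //   }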
  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee.  In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to set up the
  // rsp.
  //
  // r0: exception oop

  // NOTE: this is used as a target from the signal handler so it
  // needs an x86 prolog which returns into the current simulator
  // executing the generated catch_exception code.  So the prolog
  // needs to install rax in a sim register and adjust the sim's
  // restart pc to enter the generated code at the start position,
  // then return from native to simulated execution.

  address generate_catch_exception() {
    const int thread_off = -frame::get_frame_size(VMFrameAPCS) * wordSize;

    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address thread(rfp, thread_off);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ b(S, Assembler::NE);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ b(L, Assembler::EQ);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ mov(rscratch1, (int)__LINE__);
    __ str(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog

  address generate_forward_exception() {
    // FIXME NOTE ON ALTERATION TO ARM32: it was assumed that rmethod
    // won't be used anymore and is set on entry to the handler - is this true?

    Register spare = rmethod;

    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif
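    // What follows, as a hedged sketch of the contract:
    //
    //   handler = SharedRuntime::exception_handler_for_return_address(thread, lr);
    //   r0 = thread->pending_exception();  thread->clear_pending_exception();
    //   r3 = saved lr;                     // the throwing pc
    //   goto handler;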
    // compute exception handler into r2

    // call the VM to find the handler address associated with the
    // caller address.  Pass thread in r0 and caller pc (ret address)
    // in r1.  n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to a spare
    // (callee-saved) register, because we also need to pass it to the
    // handler returned by this call.
    __ mov(spare, lr); // note spare (rmethod) is a callee-saved register
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // we should not really care that lr is no longer the callee
    // address.  We saved the value the handler needs in spare so we can
    // just copy it to r3.  However, the C2 handler will push its own
    // frame and then calls into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method.  So, we restore lr here to satisfy that assert.
    __ mov(lr, spare);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, spare);
    __ mov(spare, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, 0);
    __ str(rscratch1, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif
    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // spare: exception handler

    __ verify_oop(r0);
    __ b(spare);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //   r0: oop to verify
  //   rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //   [tos + 0]: saved c_rarg3
  //   [tos + 1]: saved c_rarg2
  //   [tos + 2]: saved lr
  //   [tos + 3]: saved rscratch2
  //   [tos + 4]: saved r1
  //   [tos + 5]: saved r0
  //   [tos + 6]: saved rscratch1
  address generate_verify_oop() {
    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stmdb(sp, RegSet::of(c_rarg2, c_rarg3).bits());

    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is NULL it is OK

    // Check if the oop is in the right area of memory
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
    __ andr(c_rarg2, r0, c_rarg3);
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());

    // Compare c_rarg2 and c_rarg3.  We don't use a compare
    // instruction here because the flags register is live.
    __ eor(c_rarg2, c_rarg2, c_rarg3);
    __ cbnz(c_rarg2, error);
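    // i.e. a hedged C view of the check just emitted:
    //
    //   if ((obj & Universe::verify_oop_mask()) != Universe::verify_oop_bits())
    //     goto error;
    //
    // computed with eor/cbnz because cmp would clobber the live flags.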
    // make sure klass is 'reasonable', i.e. non-zero
    __ load_klass(r0, r0); // get klass
    __ cbz(r0, error);     // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);

    __ ldmia(sp, RegSet::of(c_rarg2, c_rarg3).bits());
    __ b(lr);

    // handle errors
    __ bind(error);
    __ ldmia(sp, RegSet::of(c_rarg2, c_rarg3).bits());

    __ pusha();
    // Save old sp
    __ add(c_rarg2, sp, 14 * wordSize);
    __ str(c_rarg2, Address(__ pre(sp, -wordSize)));
    __ mov(c_rarg0, rscratch1); // pass address of error message
    __ mov(c_rarg1, lr);        // pass return address
    __ mov(c_rarg2, sp);        // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug32));
    __ bl(rscratch1);
    __ hlt(0);

    return start;
  }

  // NOTE: very strange - this was changed, but it's not clear why the
  // Address (signed extend word) parameter was here in the first place.
  //void array_overlap_test(Label& L_no_overlap, Address sf) { __ b(L_no_overlap); }
  void array_overlap_test(Label& L_no_overlap) { __ b(L_no_overlap); }
  // no test being performed?

  //
  // Small copy: less than 4 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 3
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(Register s, Register d, Register count, Register tmp, bool is_aligned, int step) {
    const int granularity = uabs(step);
    const bool gen_always = !is_aligned || (-4 < step && step < 0);
    Label halfword, done;

    if ((granularity <= 1) || gen_always) {
      __ tst(count, 1);
      __ b(halfword, Assembler::EQ);
      __ ldrb(tmp, step < 0 ? __ pre(s, -1) : __ post(s, 1));
      __ strb(tmp, step < 0 ? __ pre(d, -1) : __ post(d, 1));
    }

    if ((granularity <= 2) || gen_always) {
      __ bind(halfword);
      __ tst(count, 2);
      __ b(done, Assembler::EQ);
      __ ldrh(tmp, step < 0 ? __ pre(s, -2) : __ post(s, 2));
      __ strh(tmp, step < 0 ? __ pre(d, -2) : __ post(d, 2));
    }

    __ bind(done);
  }

  void copy_memory_simd(Register s, Register d,
                        Register count, Register tmp, int step,
                        DoubleFloatRegSet tmp_set, size_t tmp_set_size) {
    assert(UseSIMDForMemoryOps, "should be available");
    Label simd_loop, simd_small;

    __ cmp(count, tmp_set_size);
    __ b(simd_small, Assembler::LT);

    __ mov(tmp, count, __ lsr(exact_log2(tmp_set_size)));
    __ sub(count, count, tmp, __ lsl(exact_log2(tmp_set_size)));

    __ bind(simd_loop);

    __ pld(Address(s, step < 0 ? -2 * tmp_set_size : tmp_set_size));

    if (step < 0) {
      __ vldmdb_f64(s, tmp_set.bits());
      __ vstmdb_f64(d, tmp_set.bits());
    } else {
      __ vldmia_f64(s, tmp_set.bits());
      __ vstmia_f64(d, tmp_set.bits());
    }

    __ subs(tmp, tmp, 1);
    __ b(simd_loop, Assembler::NE);

    __ bind(simd_small);
  }
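  // For reference, copy_memory_small above is (a hedged sketch, forward
  // case, byte pointers):
  //
  //   if (count & 1) { *d++ = *s++; }                        // ldrb/strb
  //   if (count & 2) { *(u2*)d = *(u2*)s; s += 2; d += 2; }  // ldrh/strh
  //
  // so only the low two bits of count matter, as the NB above says.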
  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //

  void copy_memory(bool is_aligned, Register s, Register d,
                   Register count, int step) {
    const int small_copy_size = 32; // one copy by ldm pays off the alignment efforts and the push/pop of the temp set
    const int granularity = uabs(step);
    const Register tmp2 = rscratch2;
    const Register t0 = r3;
    Label small;

    assert_different_registers(s, d, count, tmp2, t0);

    __ mov(count, count, __ lsl(exact_log2(granularity)));

    if (step < 0) {
      __ add(s, s, count);
      __ add(d, d, count);
    }

    __ cmp(count, small_copy_size);
    __ b(small, Assembler::LT);

    // aligning
    if (!is_aligned || (-4 < step && step < 0)) {
      assert(3 <= small_copy_size, "may copy number of bytes required for alignment");
      if (step < 0) {
        __ andr(tmp2, s, 3);
      } else {
        __ rsb(tmp2, s, 0);
        __ andr(tmp2, tmp2, 3);
      }
      __ sub(count, count, tmp2);
      copy_memory_small(s, d, tmp2, t0, is_aligned, step);
    }

#ifdef ASSERT
    Label src_aligned;
    __ tst(s, 3);
    __ b(src_aligned, Assembler::EQ);
    __ stop("src is not aligned");
    __ bind(src_aligned);
#endif

    // if destination is unaligned, copying by words is the only option
    __ tst(d, 3);
    __ b(small, Assembler::NE);
    if (UseSIMDForMemoryOps && (VM_Version::features() & FT_AdvSIMD)) {
      copy_memory_simd(s, d, count, tmp2, step, DoubleFloatRegSet::range(d0, d7), 64);
      copy_memory_simd(s, d, count, tmp2, step, DoubleFloatRegSet::range(d0, d1), 16);
    } else {
      const RegSet tmp_set = RegSet::range(r4, r7);
      const int tmp_set_size = 16;
      Label ldm_loop;

      assert_different_registers(s, d, count, tmp2, r4, r5, r6, r7);

      __ cmp(count, tmp_set_size);
      __ b(small, Assembler::LT);

      __ push(tmp_set, sp);

      __ mov(tmp2, count, __ lsr(exact_log2(tmp_set_size)));
      __ sub(count, count, tmp2, __ lsl(exact_log2(tmp_set_size)));

      __ bind(ldm_loop);

      __ pld(Address(s, step < 0 ? -2 * tmp_set_size : tmp_set_size));

      if (step < 0) {
        __ ldmdb(s, tmp_set.bits());
        __ stmdb(d, tmp_set.bits());
      } else {
        __ ldmia(s, tmp_set.bits());
        __ stmia(d, tmp_set.bits());
      }

      __ subs(tmp2, tmp2, 1);
      __ b(ldm_loop, Assembler::NE);

      __ pop(tmp_set, sp);
    }

    __ bind(small);

    Label words_loop, words_done;
    __ cmp(count, BytesPerWord);
    __ b(words_done, Assembler::LT);

    __ mov(tmp2, count, __ lsr(exact_log2(BytesPerWord)));
    __ sub(count, count, tmp2, __ lsl(exact_log2(BytesPerWord)));

    __ bind(words_loop);

    Address src = step < 0 ? __ pre(s, -BytesPerWord) : __ post(s, BytesPerWord);
    Address dst = step < 0 ? __ pre(d, -BytesPerWord) : __ post(d, BytesPerWord);

    __ pld(Address(s, step < 0 ? -2 * BytesPerWord : BytesPerWord));
    __ ldr(t0, src);
    __ str(t0, dst);
    __ subs(tmp2, tmp2, 1);

    __ b(words_loop, Assembler::NE);

    __ bind(words_done);
    copy_memory_small(s, d, count, t0, is_aligned, step);
  }
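  // The overall shape of copy_memory above, as a hedged sketch (forward
  // copy):
  //
  //   count *= granularity;               // count is now in bytes
  //   if (count >= small_copy_size) {
  //     copy up to 3 bytes so s becomes word-aligned; // copy_memory_small
  //     if (d is word-aligned)
  //       copy big blocks;                // 64/16-byte SIMD or 16-byte ldm/stm
  //   }
  //   copy remaining whole words;         // words_loop
  //   copy remaining 0-3 bytes;           // copy_memory_small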
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 4-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0 - source array address
  //   c_rarg1 - destination array address
  //   c_rarg2 - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
                                 const char *name, bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }
    __ enter(VMFrameAPCS);

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, d, count);

    if (is_oop) {
      __ push(RegSet::of(d, count), sp);
    }

    // copy_memory is free to use rscratch2 and r3
    copy_memory(aligned, s, d, count, size);

    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, lsl(exact_log2(size))));
    }

    // barriers are for oop arrays only, so don't worry about s, d and count being lost before
    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch2);

    __ leave(VMFrameAPCS);
    __ b(lr);
    return start;
  }
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 4-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0 - source array address
  //   c_rarg1 - destination array address
  //   c_rarg2 - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
                                 address *entry, const char *name,
                                 bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    __ cmp(d, s);
    __ b(nooverlap_target, Assembler::LS);

    __ enter(VMFrameAPCS);

    DecoratorSet decorators = IN_HEAP | IS_ARRAY;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, d, count);

    if (is_oop) {
      __ push(RegSet::of(d, count), sp);
    }

    // copy_memory is free to use rscratch2 and r3
    copy_memory(aligned, s, d, count, -size);

    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, lsl(exact_log2(size))));
    }

    // barriers are for oop arrays only, so don't worry about s, d and count being lost before
    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch2);

    __ leave(VMFrameAPCS);
    __ b(lr);
    return start;
  }

  // Helper for generating a dynamic type check.
  // Smashes rscratch1.
  void generate_type_check(Register sub_klass,
                           Register super_check_offset,
                           Register super_klass,
                           Label& L_success) {
    assert_different_registers(sub_klass, super_check_offset, super_klass);

    BLOCK_COMMENT("type_check:");

    Label L_miss;

    __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL,
                                     super_check_offset);
    __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);

    // Fall through on failure!
    __ BIND(L_miss);
  }
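  // generate_type_check above, roughly (a hedged sketch):
  //
  //   if (fast path hits: sub_klass == super_klass, or the check at
  //       super_check_offset inside sub_klass matches)     goto L_success;
  //   if (slow path: a scan of the secondary supers finds super_klass)
  //                                                        goto L_success;
  //   // fall through on failure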
  //
  // Generate checkcasting array copy stub
  //
  // Input:
  //   c_rarg0 - source array address
  //   c_rarg1 - destination array address
  //   c_rarg2 - element count, treated as ssize_t, can be zero
  //   c_rarg3 - size_t ckoff (super_check_offset)
  //   [sp]    - oop ckval (super_klass)
  //
  // Output:
  //   r0 ==  0   - success
  //   r0 == -1^K - failure, where K is partial transfer count
  //
  address generate_checkcast_copy(const char *name, address *entry,
                                  bool dest_uninitialized = false) {
    Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;

    // Input registers (after setup_arg_regs)
    const Register from  = c_rarg0; // source array address
    const Register to    = c_rarg1; // destination array address
    const Register count = c_rarg2; // element count
    const Register ckoff = c_rarg3; // super_check_offset

    // Registers used as temps
    const Register ckval      = r4; // super_klass
    const Register count_save = r5; // orig element count
    const Register copied_oop = r6; // actual oop copied
    const Register oop_klass  = r7; // oop._klass
    const Register start_to   = lr;

    //---------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the two arrays are subtypes of Object[] but the
    // destination array type is not equal to or a supertype
    // of the source type.  Each element must be separately
    // checked.

    assert_different_registers(from, to, count, ckoff, ckval,
                               copied_oop, oop_klass, count_save);

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    __ enter(VMFrameAPCS); // required for proper stackwalking of RuntimeStub frame

#ifdef ASSERT
    // caller guarantees that the arrays really are different
    // otherwise, we would have to make conjoint checks
    { Label L;
      array_overlap_test(L);//, TIMES_OOP);
      __ stop("checkcast_copy within a single array");
      __ bind(L);
    }
#endif //ASSERT

    // Caller of this entry point must set up the argument registers.
    if (entry != NULL) {
      *entry = __ pc();
      BLOCK_COMMENT("Entry:");
    }

    // Empty array: Nothing to do.
    __ cbz(count, L_done);

    // rscratch1 used as temp, rscratch2 can be killed by inc_counter_np
    __ push(RegSet::of(count_save, copied_oop, oop_klass, ckval, rscratch1, rscratch2), sp);
    __ ldr(ckval, Address(rfp, wordSize));

#ifdef ASSERT
    BLOCK_COMMENT("assert consistent ckoff/ckval");
    // The ckoff and ckval must be mutually consistent,
    // even though caller generates both.
    { Label L;
      int sco_offset = in_bytes(Klass::super_check_offset_offset());
      __ ldr(rscratch1, Address(ckval, sco_offset));
      __ cmp(ckoff, rscratch1);
      __ b(L, Assembler::EQ);
      __ stop("super_check_offset inconsistent");
      __ bind(L);
    }
#endif //ASSERT

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST;
    bool is_oop = true;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, to, count);

    // save the original count
    __ mov(count_save, count);

    // save destination array start address
    __ mov(start_to, to);

    // Copy from low to high addresses
    __ b(L_load_element);

    // ======== begin loop ========
    // (Loop is rotated; its entry is L_load_element.)
    // Loop control:
    //   for (; count != 0; count--) {
    //     copied_oop = load_heap_oop(from++);
    //     ... generate_type_check ...;
    //     store_heap_oop(to++, copied_oop);
    //   }
    __ align(OptoLoopAlignment);

    __ BIND(L_store_element);
    __ store_heap_oop(__ post(to, 4), copied_oop, noreg, noreg, AS_RAW); // store the oop
    __ sub(count, count, 1);
    __ cbz(count, L_do_card_marks);

    // ======== loop entry is here ========
    __ BIND(L_load_element);
    __ load_heap_oop(copied_oop, __ post(from, 4), noreg, noreg, AS_RAW); // load the oop
    __ cbz(copied_oop, L_store_element);

    __ load_klass(oop_klass, copied_oop); // query the object klass
    generate_type_check(oop_klass, ckoff, ckval, L_store_element);
    // ======== end loop ========
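    // Hedged note on the failure exit below: K = count_save - count is the
    // number of oops already copied; the caller gets r0 = ~K (the -1^K of
    // the contract above), and the EQ branch skips the card marks when
    // K == 0 since no stores were performed.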
    // It was a real error; we must depend on the caller to finish the job.
    // Register count = remaining oops, count_save = total oops.
    // Emit GC store barriers for the oops we have copied and report
    // their number to the caller.

    __ subs(count, count_save, count); // K = partially copied oop count
    __ inv(count, count);              // report (-1^K) to caller
    __ b(L_done_pop, Assembler::EQ);

    __ BIND(L_do_card_marks);
    __ add(to, to, -heapOopSize); // make an inclusive end pointer
    bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, to, rscratch1);

    __ bind(L_done_pop);
    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
    __ pop(RegSet::of(count_save, copied_oop, oop_klass, ckval, rscratch1, rscratch2), sp);

    __ bind(L_done);
    __ mov(r0, count);
    __ leave(VMFrameAPCS);
    __ b(lr);
    return start;
  }

  void generate_arraycopy_stubs() {
    address entry;

    // jbyte
    StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = generate_disjoint_copy(sizeof(jbyte), true, false, &entry, "arrayof_jbyte_disjoint_arraycopy");
    StubRoutines::_arrayof_jbyte_arraycopy           = generate_conjoint_copy(sizeof(jbyte), true, false, entry, NULL, "arrayof_jbyte_arraycopy");
    StubRoutines::_jbyte_disjoint_arraycopy          = generate_disjoint_copy(sizeof(jbyte), false, false, &entry, "jbyte_disjoint_arraycopy");
    StubRoutines::_jbyte_arraycopy                   = generate_conjoint_copy(sizeof(jbyte), false, false, entry, NULL, "jbyte_arraycopy");
    // jshort
    StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_copy(sizeof(jshort), true, false, &entry, "arrayof_jshort_disjoint_arraycopy");
    StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_copy(sizeof(jshort), true, false, entry, NULL, "arrayof_jshort_arraycopy");
    StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_copy(sizeof(jshort), false, false, &entry, "jshort_disjoint_arraycopy");
    StubRoutines::_jshort_arraycopy                  = generate_conjoint_copy(sizeof(jshort), false, false, entry, NULL, "jshort_arraycopy");
    // jint (always aligned)
    StubRoutines::_arrayof_jint_disjoint_arraycopy   = generate_disjoint_copy(sizeof(jint), true, false, &entry, "arrayof_jint_disjoint_arraycopy");
    StubRoutines::_arrayof_jint_arraycopy            = generate_conjoint_copy(sizeof(jint), true, false, entry, NULL, "arrayof_jint_arraycopy");
    StubRoutines::_jint_disjoint_arraycopy           = StubRoutines::_arrayof_jint_disjoint_arraycopy;
    StubRoutines::_jint_arraycopy                    = StubRoutines::_arrayof_jint_arraycopy;
    // jlong (always aligned)
    StubRoutines::_arrayof_jlong_disjoint_arraycopy  = generate_disjoint_copy(sizeof(jlong), true, false, &entry, "arrayof_jlong_disjoint_arraycopy");
    StubRoutines::_arrayof_jlong_arraycopy           = generate_conjoint_copy(sizeof(jlong), true, false, entry, NULL, "arrayof_jlong_arraycopy");
    StubRoutines::_jlong_disjoint_arraycopy          = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
    StubRoutines::_jlong_arraycopy                   = StubRoutines::_arrayof_jlong_arraycopy;
    // OOP (always aligned)
    StubRoutines::_arrayof_oop_disjoint_arraycopy    = generate_disjoint_copy(sizeof(jint), true, true, &entry, "arrayof_oop_disjoint_arraycopy");
    StubRoutines::_arrayof_oop_arraycopy             = generate_conjoint_copy(sizeof(jint), true, true, entry, NULL, "arrayof_oop_arraycopy");
    StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_copy(sizeof(jint), true, true, &entry, "arrayof_oop_disjoint_arraycopy_uninit", true);
    StubRoutines::_arrayof_oop_arraycopy_uninit      = generate_conjoint_copy(sizeof(jint), true, true, entry, NULL, "arrayof_oop_arraycopy_uninit", true);
    StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
    StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
    StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
    StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;

    StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", NULL);
    StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, true);
  }

  void generate_math_stubs() { Unimplemented(); }

  // Safefetch stubs.
  void generate_safefetch(const char* name, int size, address* entry,
                          address* fault_pc, address* continuation_pc) {
    // safefetch signatures:
    //   int      SafeFetch32(int*      adr, int      errValue);
    //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
    //
    // arguments:
    //   c_rarg0 = adr
    //   c_rarg1 = errValue
    //
    // result:
    //   r0 = *adr or errValue

    StubCodeMark mark(this, "StubRoutines", name);

    // Entry point, pc or function descriptor.
    *entry = __ pc();

    // Load *adr into c_rarg0, may fault.
    __ mov(c_rarg2, c_rarg0);
    *fault_pc = __ pc();
    switch (size) {
      case 4:
        // int32_t
        __ ldr(c_rarg0, Address(c_rarg2, 0));
        break;
      default:
        ShouldNotReachHere();
    }
    __ b(lr);
    // return errValue or *adr
    *continuation_pc = __ pc();
    __ mov(r0, c_rarg1);
    __ b(lr);
  }
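  // SafeFetch, as a hedged C sketch of the intended semantics:
  //
  //   int SafeFetch32(int* adr, int errValue) {
  //     return *adr;   // if this load faults, the signal handler resumes
  //   }                // execution at continuation_pc, yielding errValue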
  /**
   * Arguments:
   *
   * Inputs:
   *   c_rarg0   - int crc
   *   c_rarg1   - byte* buf
   *   c_rarg2   - int length
   *
   * Output:
   *   r0   - int crc result
   *
   * Preserves:
   *   r13
   *
   */
  address generate_updateBytesCRC32(int is_crc32c) {
    assert(!is_crc32c ? UseCRC32Intrinsics : UseCRC32CIntrinsics, "what are we doing here?");

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", !is_crc32c ? "updateBytesCRC32" : "updateBytesCRC32C");

    address start = __ pc();

    const Register crc    = c_rarg0; // crc
    const Register buf    = c_rarg1; // source java byte array address
    const Register len    = c_rarg2; // length
    const Register table0 = c_rarg3; // crc_table address
    const Register table1 = r4;
    const Register table2 = r5;
    const Register table3 = lr;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ push(RegSet::of(table1, table2, r6, r7, rscratch1, rscratch2), sp);

    __ kernel_crc32(crc, buf, len,
                    table0, table1, table2, table3, rscratch1, rscratch2, r6, is_crc32c);

    __ pop(RegSet::of(table1, table2, r6, r7, rscratch1, rscratch2), sp);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }

  /**
   * Arguments:
   *
   * Input:
   *   c_rarg0   - x address
   *   c_rarg1   - x length
   *   c_rarg2   - y address
   *   c_rarg3   - y length
   *   sp[0]     - z address
   *   sp[1]     - z length
   */
  address generate_multiplyToLen() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "multiplyToLen");

    address start = __ pc();
    const Register x    = r0;
    const Register xlen = r1;
    const Register y    = r2;
    const Register ylen = r3;

    const Register z    = r4;
    const Register zlen = r5;

    const Register tmp1 = r6;
    const Register tmp2 = r7;
    const Register tmp3 = r8;
    const Register tmp4 = r9;
    const Register tmp5 = r12;
    const Register tmp6 = r14;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ push(RegSet::of(z, zlen, tmp1, tmp2) + RegSet::of(tmp3, tmp4, tmp5, tmp6), sp);
    __ ldr(z, Address(rfp, 4));
    __ ldr(zlen, Address(rfp, 8));
    __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6);
    __ pop(RegSet::of(z, zlen, tmp1, tmp2) + RegSet::of(tmp3, tmp4, tmp5, tmp6), sp);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }

  /**
   * Arguments:
   *
   * Input:
   *   c_rarg0   - out
   *   c_rarg1   - in
   *   c_rarg2   - offset
   *   c_rarg3   - len
   *   sp[0]     - k
   */
  address generate_mulAdd() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "mulAdd");

    address start = __ pc();
    const Register out    = r0;
    const Register in     = r1;
    const Register offset = r2;
    const Register len    = r3;

    const Register k      = r4;

    const Register tmp1   = r6;
    const Register tmp2   = r7;
    const Register tmp3   = r8;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ push(RegSet::of(k, tmp1, tmp2, tmp3), sp);
    __ ldr(k, Address(rfp, 4));
    __ mul_add(out, in, offset, len, k, tmp1, tmp2, tmp3);
    __ pop(RegSet::of(k, tmp1, tmp2, tmp3), sp);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }
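  // For orientation, mul_add computes (a hedged sketch of the Java
  // intrinsic's semantics, cf. java.math.BigInteger::implMulAdd; unsigned
  // 32-bit digits, most significant first):
  //
  //   carry = 0;
  //   for (j = len - 1; j >= 0; j--) {
  //     p = (julong)in[j] * k + out[offset] + carry;
  //     out[offset--] = (jint)p;
  //     carry = p >> 32;
  //   }
  //   return (jint)carry;   // delivered in r0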
"updateBytesCRC32" : "updateBytesCRC32C"); 1112 1113 address start = __ pc(); 1114 1115 const Register crc = c_rarg0; // crc 1116 const Register buf = c_rarg1; // source java byte array address 1117 const Register len = c_rarg2; // length 1118 const Register table0 = c_rarg3; // crc_table address 1119 const Register table1 = r4; 1120 const Register table2 = r5; 1121 const Register table3 = lr; 1122 1123 BLOCK_COMMENT("Entry:"); 1124 __ enter(); // required for proper stackwalking of RuntimeStub frame 1125 __ push(RegSet::of(table1, table2, r6, r7, rscratch1, rscratch2), sp); 1126 1127 __ kernel_crc32(crc, buf, len, 1128 table0, table1, table2, table3, rscratch1, rscratch2, r6, is_crc32c); 1129 1130 __ pop(RegSet::of(table1, table2, r6, r7, rscratch1, rscratch2), sp); 1131 __ leave(); // required for proper stackwalking of RuntimeStub frame 1132 __ ret(lr); 1133 1134 return start; 1135 } 1136 1137 /** 1138 * Arguments: 1139 * 1140 * Input: 1141 * c_rarg0 - x address 1142 * c_rarg1 - x length 1143 * c_rarg2 - y address 1144 * c_rarg3 - y lenth 1145 * sp[0] - z address 1146 * sp[1] - z length 1147 */ 1148 address generate_multiplyToLen() { 1149 __ align(CodeEntryAlignment); 1150 StubCodeMark mark(this, "StubRoutines", "multiplyToLen"); 1151 1152 address start = __ pc(); 1153 const Register x = r0; 1154 const Register xlen = r1; 1155 const Register y = r2; 1156 const Register ylen = r3; 1157 1158 const Register z = r4; 1159 const Register zlen = r5; 1160 1161 const Register tmp1 = r6; 1162 const Register tmp2 = r7; 1163 const Register tmp3 = r8; 1164 const Register tmp4 = r9; 1165 const Register tmp5 = r12; 1166 const Register tmp6 = r14; 1167 1168 BLOCK_COMMENT("Entry:"); 1169 __ enter(); // required for proper stackwalking of RuntimeStub frame 1170 __ push(RegSet::of(z, zlen, tmp1, tmp2)+RegSet::of(tmp3, tmp4, tmp5, tmp6), sp); 1171 __ ldr(z, Address(rfp, 4)); 1172 __ ldr(zlen, Address(rfp, 8)); 1173 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6); 1174 __ pop(RegSet::of(z, zlen, tmp1, tmp2)+RegSet::of(tmp3, tmp4, tmp5, tmp6), sp); 1175 __ leave(); // required for proper stackwalking of RuntimeStub frame 1176 __ ret(lr); 1177 1178 return start; 1179 } 1180 1181 /** 1182 * Arguments: 1183 * 1184 * Input: 1185 * c_rarg0 - out 1186 * c_rarg1 - int 1187 * c_rarg2 - offset 1188 * c_rarg3 - len 1189 * sp[0] - k 1190 */ 1191 address generate_mulAdd() { 1192 __ align(CodeEntryAlignment); 1193 StubCodeMark mark(this, "StubRoutines", "mulAdd"); 1194 1195 address start = __ pc(); 1196 const Register out = r0; 1197 const Register in = r1; 1198 const Register offset = r2; 1199 const Register len = r3; 1200 1201 const Register k = r4; 1202 1203 const Register tmp1 = r6; 1204 const Register tmp2 = r7; 1205 const Register tmp3 = r8; 1206 1207 BLOCK_COMMENT("Entry:"); 1208 __ enter(); // required for proper stackwalking of RuntimeStub frame 1209 __ push(RegSet::of(k, tmp1, tmp2, tmp3), sp); 1210 __ ldr(k, Address(rfp, 4)); 1211 __ mul_add(out, in, offset, len, k, tmp1, tmp2, tmp3); 1212 __ pop(RegSet::of(k, tmp1, tmp2, tmp3), sp); 1213 __ leave(); // required for proper stackwalking of RuntimeStub frame 1214 __ ret(lr); 1215 1216 return start; 1217 } 1218 1219 1220 // Arguments: 1221 // 1222 // Inputs: 1223 // c_rarg0 - source byte array address 1224 // c_rarg1 - destination byte array address 1225 // c_rarg2 - K (key) in little endian int array 1226 // 1227 1228 address generate_aescrypt_encryptBlock() { 1229 assert(UseAESIntrinsics, "what are we doing here?"); 1230 __ 
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //   c_rarg3   - r vector byte array address
  //   c_rarg4   - input length
  //
  // Output:
  //   r0        - input length
  //

  address generate_cipherBlockChaining_encryptAESCrypt(bool len_on_stack) {
    assert(UseAESIntrinsics && UseNeon, "what are we doing here?");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");

    address start = __ pc();

    const Register from   = c_rarg0; // source array address
    const Register to     = c_rarg1; // destination array address
    const Register key    = c_rarg2; // key array address
    const Register rvec   = c_rarg3; // r byte array initialized from initvector array address
                                     // and left with the results of the last encryption block
    const Register len    = r4;      // src len (must be multiple of blocksize 16)
    const Register keylen = r5;
    const Register table  = r6;
    const Register t0     = r7;
    const Register t1     = r8;
    const Register t2     = r9;
    const Register t3     = r10;
    const Register t4     = r11;
    const Register t5     = r12;
    const Register t6     = lr;

    BLOCK_COMMENT("Entry:");
    __ enter();

    __ push(RegSet::of(r4, r5, r6, r7, r8), sp);
    __ push(RegSet::of(r9, r10, r11, r12), sp);
    __ vstmdb_f64(sp, 0xff00); // d8-d15 are callee save registers

    if (len_on_stack)
      __ ldr(len, Address(rfp, wordSize));
    __ kernel_aescrypt_encrypt(from, to, key, rvec, len, keylen, table,
                               t0, t1, t2, t3, t4, t5, t6);

    __ vldmia_f64(sp, 0xff00);
    __ pop(RegSet::of(r9, r10, r11, r12), sp);
    __ pop(RegSet::of(r4, r5, r6, r7, r8), sp);

    __ leave();
    __ ret(lr);

    return start;
  }
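  // For reference, the CBC recurrences these kernels implement (a hedged
  // sketch; blocksize is 16 bytes, rvec carries the chaining value):
  //
  //   encrypt:  for each block P:  rvec = AES_encrypt(P ^ rvec); emit rvec;
  //   decrypt:  for each block C:  emit AES_decrypt(C) ^ rvec; rvec = C;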
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //   c_rarg3   - r vector byte array address
  //   c_rarg4   - input length
  //
  // Output:
  //   r0        - input length
  //

  address generate_cipherBlockChaining_decryptAESCrypt(bool len_on_stack) {
    assert(UseAESIntrinsics && UseNeon, "what are we doing here?");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");

    address start = __ pc();

    const Register from   = c_rarg0; // source array address
    const Register to     = c_rarg1; // destination array address
    const Register key    = c_rarg2; // key array address
    const Register rvec   = c_rarg3; // r byte array initialized from initvector array address
                                     // and left with the results of the last encryption block
    const Register len    = r4;      // src len (must be multiple of blocksize 16)
    const Register keylen = r5;
    const Register table  = r6;
    const Register t0     = r7;
    const Register t1     = r8;
    const Register t2     = r9;
    const Register t3     = r10;
    const Register t4     = r11;
    const Register t5     = r12;
    const Register t6     = lr;

    BLOCK_COMMENT("Entry:");
    __ enter();

    __ push(RegSet::of(r4, r5, r6, r7, r8), sp);
    __ push(RegSet::of(r9, r10, r11, r12), sp);
    __ vstmdb_f64(sp, 0xff00); // d8-d15 are callee save registers

    if (len_on_stack)
      __ ldr(len, Address(rfp, wordSize));
    __ kernel_aescrypt_decrypt(from, to, key, rvec, len, keylen, table,
                               t0, t1, t2, t3, t4, t5, t6);

    __ vldmia_f64(sp, 0xff00);
    __ pop(RegSet::of(r9, r10, r11, r12), sp);
    __ pop(RegSet::of(r4, r5, r6, r7, r8), sp);

    __ leave();
    __ ret(lr);

    return start;
  }
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - state array

  address generate_sha_implCompress() {
    assert(UseSHA1Intrinsics, "what are we doing here?");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "sha_implCompress");
    address start = __ pc();

    const Register from  = c_rarg0; // source array address
    const Register state = c_rarg1; // state array address
    const Register t0    = c_rarg2;
    const Register t1    = c_rarg3;
    const Register t2    = r4;
    const Register t3    = r5;
    const Register t4    = r6;
    const Register t5    = r7;
    const Register t6    = r8;
    const Register t7    = r9;
    const Register t8    = r10;
    const Register t9    = r11;
    const Register t10   = r12;
    DoubleFloatRegSet _fToSave = DoubleFloatRegSet::range(d0, d15);

    BLOCK_COMMENT("Entry:");
    __ enter();

    __ push(RegSet::of(r4, r5, r6, r7, r8), sp);
    __ push(RegSet::of(r9, r10, r11, r12), sp);
    __ vstmdb_f64(sp, _fToSave.bits());

    __ kernel_sha_implCompress(from, state, t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10);

    __ vldmia_f64(sp, _fToSave.bits(), true);
    __ pop(RegSet::of(r9, r10, r11, r12), sp);
    __ pop(RegSet::of(r4, r5, r6, r7, r8), sp);

    __ leave();
    __ ret(lr);

    return start;
  }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - state array

  address generate_sha256_implCompress() {
    assert(UseSHA256Intrinsics, "what are we doing here?");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "sha256_implCompress");
    address start = __ pc();

    const Register from  = c_rarg0; // source array address
    const Register state = c_rarg1; // state array address
    const Register t0    = c_rarg2;
    const Register t1    = c_rarg3;
    const Register t2    = r4;
    const Register t3    = r5;
    const Register t4    = r6;
    const Register t5    = r7;
    const Register t6    = r8;
    const Register t7    = r9;
    const Register t8    = r10;
    const Register t9    = r11;
    const Register t10   = r12;
    const Register t11   = lr;
    DoubleFloatRegSet _fToSave1 = DoubleFloatRegSet::range(d0, d15);
    DoubleFloatRegSet _fToSave2 = DoubleFloatRegSet::range(d16, d31);

    BLOCK_COMMENT("Entry:");
    __ enter();

    __ push(RegSet::of(r4, r5, r6, r7, r8), sp);
    __ push(RegSet::of(r9, r10, r11, r12, lr), sp);
    __ vstmdb_f64(sp, _fToSave1.bits());
    __ vstmdb_f64(sp, _fToSave2.bits());

    __ kernel_sha256_implCompress(from, state, t0, t1,
                                  t2, t3, t4, t5, t6, t7, t8, t9, t10, t11);

    __ vldmia_f64(sp, _fToSave2.bits(), true);
    __ vldmia_f64(sp, _fToSave1.bits(), true);
    __ pop(RegSet::of(r9, r10, r11, r12, lr), sp);
    __ pop(RegSet::of(r4, r5, r6, r7, r8), sp);

    __ leave();
    __ ret(lr);

    return start;
  }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - state array

  address generate_sha512_implCompress() {
    assert(UseSHA512Intrinsics, "what are we doing here?");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "sha512_implCompress");
    address start = __ pc();

    const Register from  = c_rarg0; // source array address
    const Register state = c_rarg1; // state array address
    const Register t0    = c_rarg2;
    const Register t1    = c_rarg3;
    DoubleFloatRegSet _fToSave1 = DoubleFloatRegSet::range(d0, d15);
    DoubleFloatRegSet _fToSave2 = DoubleFloatRegSet::range(d16, d31);


    BLOCK_COMMENT("Entry:");
    __ enter();

    __ vstmdb_f64(sp, _fToSave1.bits());
    __ vstmdb_f64(sp, _fToSave2.bits());

    __ kernel_sha512_implCompress(from, state, t0, t1);

    __ vldmia_f64(sp, _fToSave2.bits(), true);
    __ vldmia_f64(sp, _fToSave1.bits(), true);

    __ leave();
    __ ret(lr);

    return start;
  }
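  // Hedged note on the saves above: the AAPCS only requires d8-d15 to be
  // preserved across calls, but the SHA kernels may clobber the entire
  // SIMD file, so d0-d15 (and d16-d31 where present) are saved as well.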
  // Continuation point for throwing of implicit exceptions that are
  // not handled in the current activation.  Fabricates an exception
  // oop and initiates normal exception dispatching in this
  // frame.  Since we need to preserve callee-saved values (currently
  // only for C2, but done for C1 as well) we need a callee-saved oop
  // map and therefore have to make these stubs into RuntimeStubs
  // rather than BufferBlobs.  If the compiler needs all registers to
  // be preserved between the fault point and the exception handler
  // then it must assume responsibility for that in
  // AbstractCompiler::continuation_for_implicit_null_exception or
  // continuation_for_implicit_division_by_zero_exception.  All other
  // implicit exceptions (e.g., NullPointerException or
  // AbstractMethodError on entry) are either at call sites or
  // otherwise assume that stack unwinding will be initiated, so
  // caller saved registers were assumed volatile in the compiler.

#undef __
#define __ masm->

  address generate_throw_exception(const char* name,
                                   address runtime_entry,
                                   Register arg1 = noreg,
                                   Register arg2 = noreg) {
    // Information about frame layout at time of blocking runtime call.
    // Note that we only have to preserve callee-saved registers since
    // the compilers are responsible for supplying a continuation point
    // if they expect all registers to be preserved.
    // n.b. aarch32 asserts that frame::arg_reg_save_area_bytes == 0
    const int framesize = frame::get_frame_size();
    const int insts_size = 512;
    const int locs_size = 64;

    CodeBuffer code(name, insts_size, locs_size);
    OopMapSet* oop_maps = new OopMapSet();
    MacroAssembler* masm = new MacroAssembler(&code);

    address start = __ pc();

    // This is an inlined and slightly modified version of call_VM
    // which has the ability to fetch the return PC out of
    // thread-local storage and also sets up last_Java_sp slightly
    // differently than the real call_VM

    __ enter(); // Save at least FP and LR before call

    assert(is_even(framesize), "sp not 8-byte aligned");

    int frame_complete = __ pc() - start;

    // Set up last_Java_sp and last_Java_fp
    address the_pc = __ pc();
    __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1);

    // Call runtime
    if (arg1 != noreg) {
      assert(arg2 != c_rarg1, "clobbered");
      __ mov(c_rarg1, arg1);
    }
    if (arg2 != noreg) {
      __ mov(c_rarg2, arg2);
    }
    __ mov(c_rarg0, rthread);
    BLOCK_COMMENT("call runtime_entry");
    __ align_stack();
    __ mov(rscratch1, runtime_entry);
    __ bl(rscratch1);

    // Generate oop map
    OopMap* map = new OopMap(framesize, 0);

    oop_maps->add_gc_map(the_pc - start, map);

    __ reset_last_Java_frame(true);
    __ maybe_isb();

    __ leave();

    // check for pending exceptions
#ifdef ASSERT
    Label L;
    __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
    __ cbnz(rscratch1, L);
    __ should_not_reach_here();
    __ bind(L);
#endif // ASSERT
    __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));


    // codeBlob framesize is in words (not VMRegImpl::slot_size)
    RuntimeStub* stub =
      RuntimeStub::new_runtime_stub(name,
                                    &code,
                                    frame_complete,
                                    framesize,
                                    oop_maps, false);
    return stub->entry_point();
  }
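  // Typical use, as a hedged example (the actual set of throw stubs is
  // wired up elsewhere, outside this section):
  //
  //   StubRoutines::_throw_StackOverflowError_entry =
  //     generate_throw_exception("StackOverflowError throw_exception",
  //                              CAST_FROM_FN_PTR(address,
  //                                SharedRuntime::throw_StackOverflowError));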

  class MontgomeryMultiplyGenerator : public MacroAssembler {

    Register Pa_base, Pb_base, Pn_base, Pm_base, Rlen, Ri, Rj, Pa, Pb, Pn, Pm;
    FloatRegister inv, Ra, Rb, Rm, Rn, RabAB, RaBAb, s0, s1, s2, tmp;

    RegSet _toSave;
    DoubleFloatRegSet _fToSave;
    bool _squaring;

  public:
    MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
      : MacroAssembler(as->code()), _squaring(squaring) {

      // Register allocation

      Register reg = c_rarg0;

      Pa_base = reg++; // Argument registers
      if (squaring)
        Pb_base = Pa_base;
      else
        Pb_base = reg++;
      Pn_base = reg++;
      Rlen = reg++;
      Pm_base = r4;

      Ri = r5; // Inner and outer loop indexes.
      Rj = r6;

      Pa = r7; // Pointers to the current/next digit of a, b, n, and m.
      Pb = r8;
      Pm = r9;
      Pn = r12;

      _toSave = RegSet::range(r4, r8) + RegSet::of(r9, r12);

      // Now NEON registers

      // Working registers:
      Ra = d0; // The current digit of a, b, n, and m.
      Rb = d1; // The values are stored as read, that is high and
      Rm = d2; // low 32-bit parts are exchanged.
      Rn = d3;

      // Three registers which form a triple-precision accumulator.
      // For sake of performance these are 128-bit and are overlapping
      // (hence the name is s, not t). The schema is the following:
      //   w4|w3|w2|w1|w0| (32-bit words)
      //   s0 lo: |**|**|
      //   s0 hi: |**|**|
      //   s1 lo: |**|**|
      //   s1 hi: |**|**|
      //   s2 lo: |**|**|
      //   s2 hi: |**|**|
      // The idea is that each 64-bit s register accumulates only 32-bit
      // values and hence never needs a carry operation.

      s0 = q2;
      s1 = q3;
      s2 = q4;

      RabAB = q5; // Product registers: low, high and middle parts
      RaBAb = q6; // of a*b and m*n. hi(A)*hi(B) is the same quad as lo(a)*lo(b).

      inv = d14;
      tmp = d15;

      _fToSave = DoubleFloatRegSet::range(d8, tmp);
    }

  private:
    void save_regs() {
      vstmdb_f64(sp, _fToSave.bits());
      push(_toSave, sp);
    }

    void restore_regs() {
      pop(_toSave, sp);
      vldmia_f64(sp, _fToSave.bits(), true);
    }

    template <typename T>
    void unroll_2(Register count, T block) {
      Label loop, end, odd;
      tbnz(count, 0, odd);
      cbz(count, end);
      align(16);
      bind(loop);
      (this->*block)();
      bind(odd);
      (this->*block)();
      subs(count, count, 2);
      b(loop, Assembler::GT);
      bind(end);
    }

    void pre1(Register i) {
      block_comment("pre1");
      // Pa = Pa_base;
      // Pb = Pb_base + i;
      // Pm = Pm_base;
      // Pn = Pn_base + i;
      // Ra = *Pa;
      // Rb = *Pb;
      // Rm = *Pm;
      // Rn = *Pn;
      lea(Pa, Address(Pa_base));
      lea(Pb, Address(Pb_base, i, lsl(LogBytesPerLong), Address::SUB));
      lea(Pm, Address(Pm_base));
      lea(Pn, Address(Pn_base, i, lsl(LogBytesPerLong), Address::SUB));

      vld1_64(Ra, Address(Pa), Assembler::ALIGN_STD);
      vld1_64(Rb, Address(Pb), Assembler::ALIGN_STD);
      vld1_64(Rm, Address(Pm), Assembler::ALIGN_STD);
      vld1_64(Rn, Address(Pn), Assembler::ALIGN_STD);
    }

    // The core multiply-accumulate step of a Montgomery
    // multiplication. The idea is to schedule operations as a
    // pipeline so that instructions with long latencies (loads and
    // multiplies) have time to complete before their results are
    // used. This benefits in-order implementations of the
    // architecture the most, but out-of-order ones also benefit.
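    //
    // In the commented pseudocode throughout this class, MACC(A, B, T0, T1,
    // T2) is the usual multiply-accumulate into the triple-precision
    // accumulator t2:t1:t0. A hedged C sketch (not part of this port;
    // assumes a compiler with unsigned __int128 support):
    //
    //   #define MACC(A, B, T0, T1, T2)                                  \
    //   do {                                                            \
    //     unsigned __int128 p = (unsigned __int128)(A) * (B);           \
    //     unsigned long lo = (unsigned long)p;                          \
    //     unsigned long hi = (unsigned long)(p >> 64);                  \
    //     T0 += lo; if (T0 < lo) hi++;  /* carry out of t0 */           \
    //     T1 += hi; if (T1 < hi) T2++;  /* carry out of t1 */           \
    //   } while (0)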
    void step() {
      block_comment("step");
      // MACC(Rm, Rn, t0, t1, t2);
      // Rm = *++Pm;
      // Rn = *--Pn;
      sub(Pm, Pm, BytesPerLong);
      add(Pn, Pn, BytesPerLong);
      vmul_acc1(Rm, Rn, tmp, RabAB, RaBAb);
      vld1_64(Rm, Address(Pm), Assembler::ALIGN_STD);
      vld1_64(Rn, Address(Pn), Assembler::ALIGN_STD);
      vmul_acc2(tmp, RabAB, RaBAb);

      // MACC(Ra, Rb, t0, t1, t2);
      // Ra = *++Pa;
      // Rb = *--Pb;
      sub(Pa, Pa, BytesPerLong);
      add(Pb, Pb, BytesPerLong);
      vmul_acc1(Ra, Rb, tmp, RabAB, RaBAb);
      vld1_64(Ra, Address(Pa), Assembler::ALIGN_STD);
      vld1_64(Rb, Address(Pb), Assembler::ALIGN_STD);
      vmul_acc2(tmp, RabAB, RaBAb);
    }

    void post1() {
      FloatRegister t0 = RabAB;

      block_comment("post1");

      // MACC(Ra, Rb, t0, t1, t2);
      vmul_acc1(Ra, Rb, tmp, RabAB, RaBAb);
      vmul_acc2(tmp, RabAB, RaBAb);

      // *Pm = Rm = t0 * inv;
      vmul_fin(t0, tmp);
      vmul_simple(Rm, t0, inv, RaBAb); // RaBAb is tmp
      vrev64_64_32(Rm, Rm);            // write in reversed, big-endian format
      vst1_64(Rm, Address(Pm), ALIGN_STD);

      // MACC(Rm, Rn, t0, t1, t2);
      vmul_acc1(Rm, Rn, tmp, RabAB, RaBAb);
      vmul_acc2(tmp, RabAB, RaBAb);

#ifndef PRODUCT
      // assert(t0 == 0, "broken Montgomery multiply");
      {
        vmul_fin(t0, tmp);
        Label ok;
        push(RegSet::of(Ri, Rj), sp);
        vmov_f64(Ri, Rj, t0);
        orr(Ri, Ri, Rj);
        cbz(Ri, ok); {
          stop("broken Montgomery multiply");
        } bind(ok);
        pop(RegSet::of(Ri, Rj), sp);
      }
#endif

      // t0 = t1; t1 = t2; t2 = 0;
      shift_t(RabAB);
    }

    void pre2(Register i, Register len) {
      block_comment("pre2");
      // Pa = Pa_base + i-len;
      // Pb = Pb_base + len;
      // Pm = Pm_base + i-len;
      // Pn = Pn_base + len;

      // Rj == i-len
      sub(Rj, i, len);

      lea(Pa, Address(Pa_base, Rj, lsl(LogBytesPerLong), Address::SUB));
      lea(Pb, Address(Pb_base, len, lsl(LogBytesPerLong), Address::SUB));
      lea(Pm, Address(Pm_base, Rj, lsl(LogBytesPerLong), Address::SUB));
      lea(Pn, Address(Pn_base, len, lsl(LogBytesPerLong), Address::SUB));

      // Ra = *++Pa;
      // Rb = *--Pb;
      // Rm = *++Pm;
      // Rn = *--Pn;
      sub(Pa, Pa, BytesPerLong);
      add(Pb, Pb, BytesPerLong);
      sub(Pm, Pm, BytesPerLong);
      add(Pn, Pn, BytesPerLong);

      vld1_64(Ra, Address(Pa), ALIGN_STD);
      vld1_64(Rb, Address(Pb), ALIGN_STD);
      vld1_64(Rm, Address(Pm), ALIGN_STD);
      vld1_64(Rn, Address(Pn), ALIGN_STD);
    }

    void post2(Register i, Register len) {
      FloatRegister t0 = RabAB;

      block_comment("post2");

      vmul_fin(t0, tmp);

      // As soon as we know the least significant digit of our result,
      // store it.
      // Pm_base[i-len] = t0;
      sub(Rj, i, len);
      lea(Rj, Address(Pm_base, Rj, lsl(LogBytesPerLong), Address::SUB));
      vrev64_64_32(t0, t0);
      vst1_64(t0, Address(Rj), ALIGN_STD);

      // t0 = t1; t1 = t2; t2 = 0;
      shift_t(RabAB);
    }

    // A carry in t0 after Montgomery multiplication means that we
    // should subtract multiples of n from our result in m. We'll
    // keep doing that until there is no carry. ARM core registers are
    // used for this operation, as this is faster than using NEON.
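    //
    // The sub() referenced in the pseudocode at the end of this class is,
    // approximately, a multi-word subtract-with-borrow (a hedged sketch;
    // the helper is not spelled out in this file):
    //
    //   unsigned long sub(unsigned long Pm_base[], unsigned long Pn_base[],
    //                     unsigned long t0, int len) {
    //     unsigned long borrow = 0;
    //     for (int i = 0; i < len; i++) {   // m -= n, word by word
    //       unsigned long m = Pm_base[i], n = Pn_base[i];
    //       unsigned long d = m - n;
    //       Pm_base[i] = d - borrow;
    //       borrow = (m < n) || (d < borrow);
    //     }
    //     return t0 - borrow;               // borrow out of the top word
    //   }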
    void normalize(Register len, Register t0lo, Register t0hi,
                   Register tmp1, Register tmp2, Register tmp3, Register tmp4, Register tmp5) {
      block_comment("normalize");
      // while (t0)
      //   t0 = sub(Pm_base, Pn_base, t0, len);
      Label loop, post, again;
      Register cnt = tmp1, i = tmp2, m = tmp3, n = tmp4, flags = tmp5;
      // let them point to the last 32-bit element now
      add(Pn_base, Pn_base, BytesPerInt);
      add(Pm_base, Pm_base, BytesPerInt);
      orrs(n, t0lo, t0hi);
      b(post, EQ); {
        bind(again); {
          mov(i, 0);
          mov(cnt, len); // each loop iteration processes 64 bits
          ldr(m, Address(Pm_base));
          ldr(n, Address(Pn_base));
          cmp(n, n); // set carry flag, i.e. no borrow
          mrs(flags);
          align(16);
          bind(loop); {
            msr(flags, true, false);
            sbcs(m, m, n);
            str(m, Address(Pm_base, i, lsl(LogBytesPerWord), Address::SUB));
            add(i, i, 1);
            ldr(n, Address(Pn_base, i, lsl(LogBytesPerWord), Address::SUB));
            ldr(m, Address(Pm_base, i, lsl(LogBytesPerWord), Address::SUB));
            sbcs(m, m, n);
            mrs(flags);
            str(m, Address(Pm_base, i, lsl(LogBytesPerWord), Address::SUB));
            add(i, i, 1);
            ldr(n, Address(Pn_base, i, lsl(LogBytesPerWord), Address::SUB));
            ldr(m, Address(Pm_base, i, lsl(LogBytesPerWord), Address::SUB));
            sub(cnt, cnt, 1);
          } cbnz(cnt, loop);
          msr(flags, true, false);
          sbcs(t0lo, t0lo, 0);
          sbc(t0hi, t0hi, 0);
          orrs(n, t0lo, t0hi);
        } b(again, NE);
      } bind(post);
    }

    void step_squaring() {
      // An extra ACC for A*B
      step();
      vmul_acc2(tmp, RabAB, RaBAb, false);
    }

    void last_squaring(Register i) {
      Label dont;
      // if ((i & 1) == 0) {
      tbnz(i, 0, dont); {
        // MACC(Ra, Rb, t0, t1, t2);
        // Ra = *++Pa;
        // Rb = *--Pb;
        sub(Pa, Pa, BytesPerLong);
        add(Pb, Pb, BytesPerLong);
        vmul_acc1(Ra, Rb, tmp, RabAB, RaBAb);
        vmul_acc2(tmp, RabAB, RaBAb);
      } bind(dont);
    }

    void extra_step_squaring() {
      // MACC(Rm, Rn, t0, t1, t2);
      // Rm = *++Pm;
      // Rn = *--Pn;
      sub(Pm, Pm, BytesPerLong);
      add(Pn, Pn, BytesPerLong);
      vmul_acc1(Rm, Rn, tmp, RabAB, RaBAb);
      vld1_64(Rm, Address(Pm), Assembler::ALIGN_STD);
      vld1_64(Rn, Address(Pn), Assembler::ALIGN_STD);
      vmul_acc2(tmp, RabAB, RaBAb);
    }

    void post1_squaring() {
      FloatRegister t0 = RabAB;

      // *Pm = Rm = t0 * inv;
      vmul_fin(t0, tmp);
      vmul_simple(Rm, t0, inv, RaBAb); // RaBAb is tmp
      vrev64_64_32(Rm, Rm);
      vst1_64(Rm, Address(Pm), ALIGN_STD);

      // MACC(Rm, Rn, t0, t1, t2);
      vmul_acc1(Rm, Rn, tmp, RabAB, RaBAb);
      vmul_acc2(tmp, RabAB, RaBAb);

#ifndef PRODUCT
      // assert(t0 == 0, "broken Montgomery multiply");
      {
        vmul_fin(t0, tmp);
        Label ok;
        push(RegSet::of(Ri, Rj), sp);
        vmov_f64(Ri, Rj, t0);
        orr(Ri, Ri, Rj);
        cbz(Ri, ok); {
          stop("broken Montgomery square");
        } bind(ok);
        pop(RegSet::of(Ri, Rj), sp);
      }
#endif

      // t0 = t1; t1 = t2; t2 = 0;
      shift_t(RabAB);
    }

    /**
     * Initializes the accumulators.
     */
    void vmul_init() {
      vmov_128_32(s0, 0);
      vmov_128_32(s1, 0);
      vmov_128_32(s2, 0);
    }
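
    // (Hedged overview, not in the original comments:) with each 64-bit
    // digit split into 32-bit halves, one vmull.u32 on 64-bit inputs
    // produces two 32x32->64 partial products at once: RabAB receives
    // lo(a)*lo(b) and hi(a)*hi(b), while RaBAb (after the vrev of one
    // operand) receives the two cross products. vmul_acc2 then adds the
    // partial products into s0..s2 at their respective 32-bit weights, so
    // no carries have to be propagated until vmul_fin converges the lanes.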
    /**
     * Multiplies unsigned 64-bit a by unsigned 64-bit b, accumulating the
     * result into the temp array (s0-s2). The temp array is not converged
     * into the resulting number; see vmul_fin.
     * Performance-critical part.
     * @param a first operand
     * @param b second operand
     */
    void vmul_acc1(FloatRegister a, FloatRegister b, FloatRegister tmp, FloatRegister RabAB, FloatRegister RaBAb) {
      vrev64_64_32(tmp, b);
      vmull_32u(RabAB, a, b);
      vmull_32u(RaBAb, a, tmp);
    }

    void vmul_acc2(FloatRegister tmp, FloatRegister RabAB, FloatRegister RaBAb, bool trn_aBAb = true) {
      // words 2-0 of accumulator
      vaddw_32u(s0, s0, RabAB->successor(FloatRegisterImpl::DOUBLE));
      if (trn_aBAb) {
        // words 3-1 of accumulator, phase 1
        vtrn_64_32(RaBAb, RaBAb->successor(FloatRegisterImpl::DOUBLE));
      }
      // words 4-2 of accumulator
      vaddw_32u(s2, s2, RabAB);
      // words 3-1 of accumulator, phase 2
      vpadal_128_u32(s1, RaBAb);
    }

    /**
     * Simple unsigned 64-bit multiply of a by b.
     * The least significant 64 bits of the result are written into register
     * res; the rest are discarded.
     * @param res 64-bit result
     * @param a   64-bit operand
     * @param b   64-bit operand
     * @param tmp 128-bit temporary register
     */
    void vmul_simple(FloatRegister res, FloatRegister a, FloatRegister b, FloatRegister tmp) {
      FloatRegister tmp2 = tmp->successor(FloatRegisterImpl::DOUBLE);
      vmull_32u(tmp, a, b);
      vrev64_64_32(tmp2, b);
      vmul_64_32(tmp2, a, tmp2);
      vpaddl_64_u32(tmp2, tmp2);
      vshl_64_64(tmp2, tmp2, 32);
      vadd_64_64(res, tmp, tmp2);
    }

    /**
     * Converges the temp array and returns the least significant 64 bits of
     * the result.
     * @param t0   the register to receive the least significant 64 bits
     * @param tmp1 64-bit temporary register
     */
    void vmul_fin(FloatRegister t0, FloatRegister tmp1) {
      FloatRegister abLow   = s0;
      FloatRegister abHigh  = s0->successor(FloatRegisterImpl::DOUBLE);
      FloatRegister aBAbLow = s1;

      // words 0 and 1
      vshr_64_u64(tmp1, abLow, 32);
      vadd_64_64(tmp1, tmp1, abHigh);
      vadd_64_64(tmp1, tmp1, aBAbLow);
      vmov_64(t0, abLow);
      vsli_64_64(t0, tmp1, 32);
    }

    /**
     * Performs t0 = t1; t1 = t2; t2 = 0; on the accumulator represented
     * as s0-s2.
     * @param tmp 128-bit register
     */
    void shift_t(FloatRegister tmp) {
      FloatRegister s0hi  = s0->successor(FloatRegisterImpl::DOUBLE);
      FloatRegister s1hi  = s1->successor(FloatRegisterImpl::DOUBLE);
      FloatRegister s2hi  = s2->successor(FloatRegisterImpl::DOUBLE);
      FloatRegister tmphi = tmp->successor(FloatRegisterImpl::DOUBLE);
      vshr_64_u64(s0, s0, 32);
      vaddl_32u(tmp, s1, s0hi);
      vadd_64_64(s0, s0, tmp);
      vshr_64_u64(s0, s0, 32);
      vadd_64_64(tmphi, s0, tmphi);
      vaddl_32u(s0, s1hi, s2);
      vadd_64_64(s0, s0, tmphi);
      vmov_64(s1, s2hi);
      vmov_64_32(s1hi, 0);
      vmov_128_32(s2, 0);
    }

  public:
    /**
     * Fast Montgomery multiplication. The derivation of the
     * algorithm is in "A Cryptographic Library for the Motorola
     * DSP56000", Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
     *
     * Arguments:
     *
     * Inputs for multiplication:
     *   c_rarg0 - int64 array elements a
     *   c_rarg1 - int64 array elements b
     *   c_rarg2 - int64 array elements n (the modulus)
     *   c_rarg3 - int64 length
     *   [sp]    - int64 inv
     *   [sp+8]  - int64 array elements m (the result)
     *
     */
    address generate_multiply() {
      Label nothing;
      align(CodeEntryAlignment);
      address entry = pc();

      cbz(Rlen, nothing);

      enter();

      // Push all call-saved registers
      save_regs();

      // load inv and the m array pointer
      add(Ri, rfp, 4);
      vld1_64(inv, Address(Ri), ALIGN_STD);
      ldr(Pm_base, Address(Ri, BytesPerLong));

      lsr(Rlen, Rlen, 1); // length in longwords = len/2

      // let Px_base point to the last 64-bit element of each array
      add(Pa_base, Pa_base, Rlen, lsl(LogBytesPerLong));
      sub(Pa_base, Pa_base, BytesPerLong);
      if (!_squaring) {
        add(Pb_base, Pb_base, Rlen, lsl(LogBytesPerLong));
        sub(Pb_base, Pb_base, BytesPerLong);
      }
      add(Pn_base, Pn_base, Rlen, lsl(LogBytesPerLong));
      sub(Pn_base, Pn_base, BytesPerLong);
      add(Pm_base, Pm_base, Rlen, lsl(LogBytesPerLong));
      sub(Pm_base, Pm_base, BytesPerLong);

#ifndef PRODUCT
      // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
      {
        // Pn, Pm and s0 are used as temporaries
        vld1_64(Rn, Address(Pn_base), Assembler::ALIGN_STD);
        vrev64_64_32(Rn, Rn);
        vmul_simple(tmp, Rn, inv, s0);
        vmov_f64(Pm, Pn, tmp);
        andr(Pm, Pm, Pn);
        cmn(Pm, 1);
        Label ok;
        b(ok, EQ); {
          stop("broken inverse in Montgomery multiply");
        } bind(ok);
      }
#endif

      vmul_init();

      block_comment("for (int i = 0; i < len; i++) {");
      mov(Ri, 0); {
        Label loop, end;
        cmp(Ri, Rlen);
        b(end, Assembler::GE);

        bind(loop);
        pre1(Ri);

        block_comment(" for (j = i; j; j--) {"); {
          mov(Rj, Ri);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
        } block_comment(" } // j");

        post1();
        add(Ri, Ri, 1);
        cmp(Ri, Rlen);
        b(loop, Assembler::LT);
        bind(end);
        block_comment("} // i");
      }

      block_comment("for (int i = len; i < 2*len; i++) {");
      mov(Ri, Rlen); {
        Label loop, end;
        cmp(Ri, Rlen, lsl(1));
        b(end, Assembler::GE);

        bind(loop);
        pre2(Ri, Rlen);

        block_comment(" for (j = len*2-i-1; j; j--) {"); {
          lsl(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          sub(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
        } block_comment(" } // j");

        post2(Ri, Rlen);
        add(Ri, Ri, 1);
        cmp(Ri, Rlen, lsl(1));
        b(loop, Assembler::LT);
        bind(end);
      }
      block_comment("} // i");

      FloatRegister t0 = RabAB; // use as temporary
      vmul_fin(t0, tmp);
      vmov_f64(Pa, Pb, t0);
      normalize(Rlen, Pa, Pb, Pm, Pn, Ri, Rj, Pa_base);

      restore_regs();
      leave();
      bind(nothing);
      ret(lr);

      return entry;
    }
    // In C, approximately:

    // void
    // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
    //                     unsigned long Pn_base[], unsigned long Pm_base[],
    //                     unsigned long inv, int len) {
    //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
    //   unsigned long *Pa, *Pb, *Pn, *Pm;
    //   unsigned long Ra, Rb, Rn, Rm;

    //   int i;

    //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
multiply"); 2234 2235 // for (i = 0; i < len; i++) { 2236 // int j; 2237 2238 // Pa = Pa_base; 2239 // Pb = Pb_base + i; 2240 // Pm = Pm_base; 2241 // Pn = Pn_base + i; 2242 2243 // Ra = *Pa; 2244 // Rb = *Pb; 2245 // Rm = *Pm; 2246 // Rn = *Pn; 2247 2248 // int iters = i; 2249 // for (j = 0; iters--; j++) { 2250 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 2251 // MACC(Ra, Rb, t0, t1, t2); 2252 // Ra = *++Pa; 2253 // Rb = *--Pb; 2254 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 2255 // MACC(Rm, Rn, t0, t1, t2); 2256 // Rm = *++Pm; 2257 // Rn = *--Pn; 2258 // } 2259 2260 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 2261 // MACC(Ra, Rb, t0, t1, t2); 2262 // *Pm = Rm = t0 * inv; 2263 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 2264 // MACC(Rm, Rn, t0, t1, t2); 2265 2266 // assert(t0 == 0, "broken Montgomery multiply"); 2267 2268 // t0 = t1; t1 = t2; t2 = 0; 2269 // } 2270 2271 // for (i = len; i < 2*len; i++) { 2272 // int j; 2273 2274 // Pa = Pa_base + i-len; 2275 // Pb = Pb_base + len; 2276 // Pm = Pm_base + i-len; 2277 // Pn = Pn_base + len; 2278 2279 // Ra = *++Pa; 2280 // Rb = *--Pb; 2281 // Rm = *++Pm; 2282 // Rn = *--Pn; 2283 2284 // int iters = len*2-i-1; 2285 // for (j = i-len+1; iters--; j++) { 2286 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 2287 // MACC(Ra, Rb, t0, t1, t2); 2288 // Ra = *++Pa; 2289 // Rb = *--Pb; 2290 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 2291 // MACC(Rm, Rn, t0, t1, t2); 2292 // Rm = *++Pm; 2293 // Rn = *--Pn; 2294 // } 2295 2296 // Pm_base[i-len] = t0; 2297 // t0 = t1; t1 = t2; t2 = 0; 2298 // } 2299 2300 // while (t0) 2301 // t0 = sub(Pm_base, Pn_base, t0, len); 2302 // } 2303 2304 /** 2305 * Fast Montgomery squaring. This uses asymptotically 25% fewer 2306 * multiplies than Montgomery multiplication so it should be up to 2307 * 25% faster. However, its loop control is more complex and it 2308 * may actually run slower on some machines. 
    /**
     * Fast Montgomery squaring. This uses asymptotically 25% fewer
     * multiplies than Montgomery multiplication so it should be up to
     * 25% faster. However, its loop control is more complex and it
     * may actually run slower on some machines.
     *
     * Arguments:
     *
     * Inputs:
     *   c_rarg0 - int64 array elements a
     *   c_rarg1 - int64 array elements n (the modulus)
     *   c_rarg2 - int64 length
     *   [sp]    - int64 inv
     *   [sp+8]  - int64 array elements m (the result)
     *
     */
    address generate_square() {
      align(CodeEntryAlignment);
      address entry = pc();

      enter();

      save_regs();

      // load inv and the m array pointer
      add(Ri, rfp, 4);
      vld1_64(inv, Address(Ri), ALIGN_STD);
      ldr(Pm_base, Address(Ri, BytesPerLong));

      lsr(Rlen, Rlen, 1); // length in longwords = len/2

      // let Px_base point to the last 64-bit element of each array
      add(Pa_base, Pa_base, Rlen, lsl(LogBytesPerLong));
      sub(Pa_base, Pa_base, BytesPerLong);
      add(Pn_base, Pn_base, Rlen, lsl(LogBytesPerLong));
      sub(Pn_base, Pn_base, BytesPerLong);
      add(Pm_base, Pm_base, Rlen, lsl(LogBytesPerLong));
      sub(Pm_base, Pm_base, BytesPerLong);

#ifndef PRODUCT
      // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
      {
        // Pn, Pm and s0 are used as temporaries
        vld1_64(Rn, Address(Pn_base), Assembler::ALIGN_STD);
        vrev64_64_32(Rn, Rn);
        vmul_simple(tmp, Rn, inv, s0);
        vmov_f64(Pm, Pn, tmp);
        andr(Pm, Pm, Pn);
        cmn(Pm, 1);
        Label ok;
        b(ok, EQ); {
          stop("broken inverse in Montgomery square");
        } bind(ok);
      }
#endif

      vmul_init();

      block_comment("for (int i = 0; i < len; i++) {");
      mov(Ri, 0); {
        Label loop, end;
        bind(loop);
        cmp(Ri, Rlen);
        b(end, GE);

        pre1(Ri);

        block_comment("for (j = (i+1)/2; j; j--) {"); {
          add(Rj, Ri, 1);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
        } block_comment(" } // j");

        last_squaring(Ri);

        block_comment(" for (j = i/2; j; j--) {"); {
          lsr(Rj, Ri, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
        } block_comment(" } // j");

        post1_squaring();
        add(Ri, Ri, 1);
        cmp(Ri, Rlen);
        b(loop, LT);

        bind(end);
        block_comment("} // i");
      }

      block_comment("for (int i = len; i < 2*len; i++) {");
      mov(Ri, Rlen); {
        Label loop, end;
        bind(loop);
        cmp(Ri, Rlen, lsl(1));
        b(end, GE);

        pre2(Ri, Rlen);

        block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); {
          lsl(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          sub(Rj, Rj, 1);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
        } block_comment(" } // j");

        last_squaring(Ri);

        block_comment(" for (j = (2*len-i)/2; j; j--) {"); {
          lsl(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
        } block_comment(" } // j");

        post2(Ri, Rlen);
        add(Ri, Ri, 1);
        cmp(Ri, Rlen, lsl(1));

        b(loop, LT);
        bind(end);
        block_comment("} // i");
      }

      FloatRegister t0 = RabAB; // use as temporary
      vmul_fin(t0, tmp);
      vmov_f64(Pa, Pb, t0);
      normalize(Rlen, Pa, Pb, Pm, Pn, Ri, Rj, Pa_base);

      restore_regs();
      leave();
      ret(lr);

      return entry;
    }
    // In C, approximately:

    // void
    // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
    //                   unsigned long Pm_base[], unsigned long inv, int len) {
    //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
    //   unsigned long *Pa, *Pb, *Pn, *Pm;
    //   unsigned long Ra, Rb, Rn, Rm;

    //   int i;

    //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");

    //   for (i = 0; i < len; i++) {
    //     int j;

    //     Pa = Pa_base;
    //     Pb = Pa_base + i;
    //     Pm = Pm_base;
    //     Pn = Pn_base + i;

    //     Ra = *Pa;
    //     Rb = *Pb;
    //     Rm = *Pm;
    //     Rn = *Pn;

    //     int iters = (i+1)/2;
    //     for (j = 0; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
    //       MACC2(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     if ((i & 1) == 0) {
    //       assert(Ra == Pa_base[j], "must be");
    //       MACC(Ra, Ra, t0, t1, t2);
    //     }
    //     iters = i/2;
    //     assert(iters == i-j, "must be");
    //     for (; iters--; j++) {
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }

    //     *Pm = Rm = t0 * inv;
    //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
    //     MACC(Rm, Rn, t0, t1, t2);

    //     assert(t0 == 0, "broken Montgomery multiply");

    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   for (i = len; i < 2*len; i++) {
    //     int start = i-len+1;
    //     int end = start + (len - start)/2;
    //     int j;

    //     Pa = Pa_base + i-len;
    //     Pb = Pa_base + len;
    //     Pm = Pm_base + i-len;
    //     Pn = Pn_base + len;

    //     Ra = *++Pa;
    //     Rb = *--Pb;
    //     Rm = *++Pm;
    //     Rn = *--Pn;

    //     int iters = (2*len-i-1)/2;
    //     assert(iters == end-start, "must be");
    //     for (j = start; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
    //       MACC2(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     if ((i & 1) == 0) {
    //       assert(Ra == Pa_base[j], "must be");
    //       MACC(Ra, Ra, t0, t1, t2);
    //     }
    //     iters = (2*len-i)/2;
    //     assert(iters == len-j, "must be");
    //     for (; iters--; j++) {
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     Pm_base[i-len] = t0;
    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   while (t0)
    //     t0 = sub(Pm_base, Pn_base, t0, len);
    // }
  };

  // Initialization
  void generate_initial() {
    // Generate initial stubs and initialize the entry points

    // Entry points that exist on all platforms. Note: this is code
    // that could be shared among different platforms - however the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also comment in
    // stubRoutines.hpp.

    StubRoutines::_forward_exception_entry = generate_forward_exception();

    StubRoutines::_call_stub_entry =
      generate_call_stub(StubRoutines::_call_stub_return_address);

    // is referenced by megamorphic call
    StubRoutines::_catch_exception_entry = generate_catch_exception();

    // Build this early so it's available for the interpreter.
    StubRoutines::_throw_StackOverflowError_entry =
      generate_throw_exception("StackOverflowError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::throw_StackOverflowError));
    StubRoutines::_throw_delayed_StackOverflowError_entry =
      generate_throw_exception("delayed StackOverflowError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::throw_delayed_StackOverflowError));
    if (UseCRC32Intrinsics) {
      // set the table address before generating the stubs that use it
      StubRoutines::_crc_table_adr = (address)StubRoutines::aarch32::_crc_table;
      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32(false);
    }

    if (UseCRC32CIntrinsics) {
      // set the table address before generating the stubs that use it
      StubRoutines::_crc32c_table_addr = (address)StubRoutines::aarch32::_crc32c_table;
      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32(true);
    }

    if (UseAESIntrinsics) {
      // set the table addresses before generating the stubs that use them
      StubRoutines::_aes_table_te_addr = (address)StubRoutines::aarch32::_aes_te_table;
      StubRoutines::_aes_table_td_addr = (address)StubRoutines::aarch32::_aes_td_table;

      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();

      if (UseNeon) {
        // The AES CBC implementation uses NEON instructions
        StubRoutines::_cipherBlockChaining_encryptAESCrypt_special = generate_cipherBlockChaining_encryptAESCrypt(false);
        StubRoutines::_cipherBlockChaining_decryptAESCrypt_special = generate_cipherBlockChaining_decryptAESCrypt(false);
        StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(true);
        StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(true);
      }
    }

    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_table_addr = (address)StubRoutines::aarch32::_sha1_table;
      StubRoutines::_sha1_implCompress = generate_sha_implCompress();
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::_sha256_table_addr = (address)StubRoutines::aarch32::_sha256_table;
      StubRoutines::_sha256_implCompress = generate_sha256_implCompress();
    }
    if (UseSHA512Intrinsics) {
      StubRoutines::_sha512_table_addr = (address)StubRoutines::aarch32::_sha512_table;
      StubRoutines::_sha512_implCompress = generate_sha512_implCompress();
    }

    NativeCall::init();
  }
#undef __
#define __ _masm->

#ifdef COMPILER2
  address generate_idiv_irem_stub(const char *name, bool want_mod) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);

    address start = __ pc();

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    // C2 knows this kills rscratch1 and rscratch2, so we do not save them

    __ divide(r0, r1, r2, 32, want_mod);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }

  // Support for uint StubRoutine::Arm::partial_subtype_check(Klass sub, Klass super);
  // Arguments:
  //
  //   ret    : R0, returned
  //   icc/xcc: set as R0 (depending on wordSize)
  //   sub    : R1, argument, not changed
  //   super  : R2, argument, not changed
  //   raddr  : LR, blown by call
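  //
  // In C, approximately (a hedged sketch of the slow path emitted below,
  // not a literal copy of any shared-runtime routine):
  //
  //   uint partial_subtype_check(Klass* sub, Klass* super) {
  //     Array<Klass*>* secondary_supers = sub->secondary_supers();
  //     for (int i = 0; i < secondary_supers->length(); i++) {
  //       if (secondary_supers->at(i) == super) {
  //         sub->set_secondary_super_cache(super); // cache the hit
  //         return 0;                              // success: R0 == 0, flags EQ
  //       }
  //     }
  //     return 1;                                  // failure: R0 != 0, flags NE
  //   }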
  address generate_partial_subtype_check() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "partial_subtype_check");
    address start = __ pc();

    // based on SPARC check_klass_subtype_[fast|slow]_path (without CompressedOops)

    // R0 is used as a tmp_reg (in addition to the return reg)
    Register sub_klass   = r1;
    Register super_klass = r2;
    Register tmp_reg2    = r3;
    Register tmp_reg3    = r4;

    // inc_counter_np kills rscratch1 and rscratch2
#define saved_set RegSet::of(tmp_reg2, tmp_reg3, rscratch1, rscratch2)

    Label L_loop, L_fail;

    int sc_offset = in_bytes(Klass::secondary_super_cache_offset());

    // the fast check should be redundant here

    // slow check
    {
      __ push(saved_set, sp);

      // a couple of useful fields in sub_klass:
      int ss_offset = in_bytes(Klass::secondary_supers_offset());

      // Do a linear scan of the secondary super-klass chain.
      // This code is rarely used, so simplicity is a virtue here.

      inc_counter_np(SharedRuntime::_partial_subtype_ctr);

      Register scan_temp  = tmp_reg2;
      Register count_temp = tmp_reg3;

      // We will consult the secondary-super array.
      __ ldr(scan_temp, Address(sub_klass, ss_offset));

      Register search_key = super_klass;

      // Load the array length.
      __ ldr(count_temp, Address(scan_temp, Array<Klass*>::length_offset_in_bytes()));
      __ add(scan_temp, scan_temp, Array<Klass*>::base_offset_in_bytes());

      __ add(count_temp, count_temp, 1);

      // Top of search loop
      __ bind(L_loop);
      // Notes:
      //   scan_temp starts at the array elements
      //   count_temp is 1+size
      __ subs(count_temp, count_temp, 1);
      __ b(L_fail, Assembler::EQ); // not found in the array

      // Load the next super to check.
      // In the array of super classes elements are pointer sized.
      int element_size = wordSize;
      __ ldr(r0, __ post(scan_temp, element_size));

      // Look for Rsuper_klass on Rsub_klass's secondary super-class-overflow list
      __ subs(r0, r0, search_key); // set R0 to 0 on success (and flags to eq)

      // A miss means we are NOT a subtype and need to keep looping
      __ b(L_loop, Assembler::NE);

      // Falling out the bottom means we found a hit; we ARE a subtype

      // Success. Cache the super we found and proceed in triumph.
      __ str(super_klass, Address(sub_klass, sc_offset));

      // Return success
      // R0 is already 0 and flags are already set to eq
      __ pop(saved_set, sp);
      __ ret(lr);

      // Return failure
      __ bind(L_fail);
      __ movs_i(r0, 1); // sets the flags
      __ pop(saved_set, sp);
      __ ret(lr);
    }
    return start;
  }
#undef saved_set

  address generate_string_compress_neon() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "string_compress_neon");
    address start = __ pc();

    Register src = r2;
    Register dst = r1;
    Register len = r3;
    Register t   = r9;
    Register t2  = r12;
    FloatRegister a1 = d0;
    FloatRegister a2 = d1;
    FloatRegister b1 = d2;
    FloatRegister b2 = d3;
    Register result = r0;

    Label Lloop2, Lset_result;

    __ sub(len, len, 8+16);
    __ vld1_64(a1, a2, __ post(src, 16), Assembler::ALIGN_STD);
    __ bind(Lloop2); {
      __ vld1_64(b1, __ post(src, 8), Assembler::ALIGN_STD);
      __ vuzp_64_8(a1, a2); // a1 now has the lower bytes, a2 the upper
      __ vld1_64(b2, __ post(src, 8), Assembler::ALIGN_STD);
      __ vmov_f64(t, t2, a2);
      __ vst1_64(a1, __ post(dst, 8), Assembler::ALIGN_STD);
      __ orrs(t, t, t2);
      __ b(Lset_result, Assembler::NE);

      __ vld1_64(a1, __ post(src, 8), Assembler::ALIGN_STD);
      __ vuzp_64_8(b1, b2); // b1 now has the lower bytes, b2 the upper
      __ vld1_64(a2, __ post(src, 8), Assembler::ALIGN_STD);
      __ vmov_f64(t, t2, b2);
      __ vst1_64(b1, __ post(dst, 8), Assembler::ALIGN_STD);
      __ orrs(t, t, t2);
      __ b(Lset_result, Assembler::NE);
      __ subs(len, len, 16);
      __ b(Lloop2, Assembler::GE);
    }

    __ vuzp_64_8(a1, a2); // a1 now has the lower bytes, a2 the upper
    __ vmov_f64(t, t2, a2);
    __ vst1_64(a1, __ post(dst, 8), Assembler::ALIGN_STD);
    __ orrs(t, t, t2);
    __ b(Lset_result, Assembler::NE);
    __ adds(len, len, 16);
    __ ret(lr); // leaves the Z flag set for the per-char slow case check

    __ bind(Lset_result);
    __ movs_i(result, 0, Assembler::NE); // sets the Z flag
    __ ret(lr);

    return start;
  }

  address generate_string_inflate_neon() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "string_inflate_neon");
    address start = __ pc();

    Register src = r0;
    Register dst = r1;
    Register len = r2;
    FloatRegister a1 = d0;

    Label Lloop2;

    __ sub(len, len, 16);
    __ bind(Lloop2); {
      __ vld1_64(d0, __ post(src, 8), Assembler::ALIGN_STD);
      __ vmovl_8u(q0, d0);
      __ vst1_64(d0, d1, __ post(dst, 16), Assembler::ALIGN_STD);
      __ vld1_64(d0, __ post(src, 8), Assembler::ALIGN_STD);
      __ vmovl_8u(q0, d0);
      __ vst1_64(d0, d1, __ post(dst, 16), Assembler::ALIGN_STD);
      __ subs(len, len, 16);
      __ b(Lloop2, Assembler::HS);
    }

    __ adds(len, len, 16); // sets the Z flag for the check in the intrinsic
    __ ret(lr);

    return start;
  }
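
  // In C, approximately (a hedged sketch of the inflate loop above; the
  // intrinsic caller is assumed to guarantee at least 16 characters and to
  // handle the remainder indicated by the Z flag):
  //
  //   len -= 16;
  //   do {
  //     for (int k = 0; k < 16; k++)       // widen 16 bytes per iteration
  //       dst[k] = (jchar)(unsigned char)src[k];
  //     src += 16; dst += 16;
  //   } while ((len -= 16) >= 0);
  //   len += 16;                           // chars left for the slow path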

  void generate_c2_stubs() {
    StubRoutines::aarch32::_idiv_entry =
      generate_idiv_irem_stub("idiv_c2_stub", false);
    StubRoutines::aarch32::_irem_entry =
      generate_idiv_irem_stub("irem_c2_stub", true);
    StubRoutines::aarch32::_partial_subtype_check =
      generate_partial_subtype_check();
    if (VM_Version::features() & FT_AdvSIMD) {
      StubRoutines::aarch32::_string_compress_neon =
        generate_string_compress_neon();
      StubRoutines::aarch32::_string_inflate_neon =
        generate_string_inflate_neon();
    }
  }
#endif

  void generate_all() {
    // support for verify_oop (must happen after universe_init)
    StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
    StubRoutines::_throw_AbstractMethodError_entry =
      generate_throw_exception("AbstractMethodError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_AbstractMethodError));

    StubRoutines::_throw_IncompatibleClassChangeError_entry =
      generate_throw_exception("IncompatibleClassChangeError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_IncompatibleClassChangeError));

    StubRoutines::_throw_NullPointerException_at_call_entry =
      generate_throw_exception("NullPointerException at call throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_NullPointerException_at_call));

    // arraycopy stubs used by the compilers
    generate_arraycopy_stubs();

#ifdef COMPILER2
    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
      StubRoutines::_mulAdd = generate_mulAdd();
    }
#endif

    if (UseMontgomeryMultiplyIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }

    if (UseMontgomerySquareIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      StubRoutines::_montgomerySquare = g.generate_square();
    }

    // Safefetch stubs.
    generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
                       &StubRoutines::_safefetch32_fault_pc,
                       &StubRoutines::_safefetch32_continuation_pc);
    generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
                       &StubRoutines::_safefetchN_fault_pc,
                       &StubRoutines::_safefetchN_continuation_pc);

#ifdef COMPILER2
    generate_c2_stubs();
#endif
  }

 public:
  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
    if (all) {
      generate_all();
    } else {
      generate_initial();
    }
  }
}; // end class declaration

void StubGenerator_generate(CodeBuffer* code, bool all) {
  StubGenerator g(code, all);
}