1 /*
   2  * Copyright (c) 2008, 2016, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "assembler_arm.inline.hpp"
  28 #include "interpreter/interpreter.hpp"
  29 #include "nativeInst_arm.hpp"
  30 #include "oops/instanceOop.hpp"
  31 #include "oops/method.hpp"
  32 #include "oops/objArrayKlass.hpp"
  33 #include "oops/oop.inline.hpp"
  34 #include "prims/methodHandles.hpp"
  35 #include "runtime/frame.inline.hpp"
  36 #include "runtime/handles.inline.hpp"
  37 #include "runtime/sharedRuntime.hpp"
  38 #include "runtime/stubCodeGenerator.hpp"
  39 #include "runtime/stubRoutines.hpp"
  40 #ifdef COMPILER2
  41 #include "opto/runtime.hpp"
  42 #endif
  43 
  44 // Declaration and definition of StubGenerator (no .hpp file).
  45 // For a more detailed description of the stub routine structure
  46 // see the comment in stubRoutines.hpp
  47 
  48 #define __ _masm->
  49 
  50 #ifdef PRODUCT
  51 #define BLOCK_COMMENT(str) /* nothing */
  52 #else
  53 #define BLOCK_COMMENT(str) __ block_comment(str)
  54 #endif
  55 
  56 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  57 
  58 // -------------------------------------------------------------------------------------------------------------------------
  59 // Stub Code definitions
  60 
  61 // Platform dependent parameters for array copy stubs
  62 
// Note: we have observed large differences in microbenchmark behavior
// from platform to platform, depending on the configuration.

// Instead of adding a series of command line options (which would
// unfortunately have to be declared in shared files and could not live
// only in the ARM port), the tested results are hard-coded here in a set
// of configurations, selected by specifying 'ArmCopyPlatform'.
  70 
  71 // Currently, this 'platform' is hardcoded to a value that is a good
  72 // enough trade-off.  However, one can easily modify this file to test
  73 // the hard-coded configurations or create new ones. If the gain is
  74 // significant, we could decide to either add command line options or
  75 // add code to automatically choose a configuration.
  76 
  77 // see comments below for the various configurations created
  78 #define DEFAULT_ARRAYCOPY_CONFIG 0
  79 #define TEGRA2_ARRAYCOPY_CONFIG 1
  80 #define IMX515_ARRAYCOPY_CONFIG 2
  81 
  82 // Hard coded choices (XXX: could be changed to a command line option)
  83 #define ArmCopyPlatform DEFAULT_ARRAYCOPY_CONFIG
  84 
  85 #ifdef AARCH64
  86 #define ArmCopyCacheLineSize 64
  87 #else
  88 #define ArmCopyCacheLineSize 32 // not worth optimizing to 64 according to measured gains
  89 #endif // AARCH64
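// ArmCopyCacheLineSize is used below as the stride between the successive
// prefetches issued when priming the prefetcher ahead of the arraycopy loops.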
  90 
  91 // TODO-AARCH64: tune and revise AArch64 arraycopy optimizations
  92 
  93 // configuration for each kind of loop
  94 typedef struct {
  95   int pld_distance;       // prefetch distance (0 => no prefetch, <0: prefetch_before);
  96 #ifndef AARCH64
  bool split_ldm;         // if true, split each LDM into LDMs with fewer registers
  bool split_stm;         // if true, split each STM into STMs with fewer registers
  99 #endif // !AARCH64
 100 } arraycopy_loop_config;
 101 
 102 // configuration for all loops
 103 typedef struct {
 104   // const char *description;
 105   arraycopy_loop_config forward_aligned;
 106   arraycopy_loop_config backward_aligned;
 107   arraycopy_loop_config forward_shifted;
 108   arraycopy_loop_config backward_shifted;
 109 } arraycopy_platform_config;
 110 
 111 // configured platforms
 112 static arraycopy_platform_config arraycopy_configurations[] = {
 113   // configuration parameters for arraycopy loops
 114 #ifdef AARCH64
 115   {
 116     {-256 }, // forward aligned
 117     {-128 }, // backward aligned
 118     {-256 }, // forward shifted
 119     {-128 }  // backward shifted
 120   }
 121 #else
 122 
 123   // Configurations were chosen based on manual analysis of benchmark
 124   // results, minimizing overhead with respect to best results on the
 125   // different test cases.
 126 
 127   // Prefetch before is always favored since it avoids dirtying the
 128   // cache uselessly for small copies. Code for prefetch after has
 129   // been kept in case the difference is significant for some
 130   // platforms but we might consider dropping it.
 131 
 132   // distance, ldm, stm
 133   {
 134     // default: tradeoff tegra2/imx515/nv-tegra2,
 135     // Notes on benchmarking:
 136     // - not far from optimal configuration on nv-tegra2
 137     // - within 5% of optimal configuration except for backward aligned on IMX
    // - up to 40% from the optimal configuration for backward shifted and backward aligned on tegra2
 139     //   but still on par with the operating system copy
 140     {-256, true,  true  }, // forward aligned
 141     {-256, true,  true  }, // backward aligned
 142     {-256, false, false }, // forward shifted
 143     {-256, true,  true  } // backward shifted
 144   },
 145   {
 146     // configuration tuned on tegra2-4.
 147     // Warning: should not be used on nv-tegra2 !
 148     // Notes:
 149     // - prefetch after gives 40% gain on backward copies on tegra2-4,
    //   resulting in better numbers than the operating system
 151     //   copy. However, this can lead to a 300% loss on nv-tegra and has
    //   more impact on the cache (fetches further than what is
 153     //   copied). Use this configuration with care, in case it improves
 154     //   reference benchmarks.
 155     {-256, true,  true  }, // forward aligned
 156     {96,   false, false }, // backward aligned
 157     {-256, false, false }, // forward shifted
 158     {96,   false, false } // backward shifted
 159   },
 160   {
 161     // configuration tuned on imx515
 162     // Notes:
 163     // - smaller prefetch distance is sufficient to get good result and might be more stable
 164     // - refined backward aligned options within 5% of optimal configuration except for
    //   tests where the arrays fit in the cache
 166     {-160, false, false }, // forward aligned
 167     {-160, false, false }, // backward aligned
 168     {-160, false, false }, // forward shifted
 169     {-160, true,  true  } // backward shifted
 170   }
 171 #endif // AARCH64
 172 };
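// Each loop generator below reads its parameters with, for example,
//   arraycopy_loop_config *config = &arraycopy_configurations[ArmCopyPlatform].forward_aligned;
// so changing ArmCopyPlatform above switches all four loop flavors at once.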
 173 
 174 class StubGenerator: public StubCodeGenerator {
 175 
 176 #ifdef PRODUCT
 177 #define inc_counter_np(a,b,c) ((void)0)
 178 #else
 179 #define inc_counter_np(counter, t1, t2) \
 180   BLOCK_COMMENT("inc_counter " #counter); \
 181   __ inc_counter(&counter, t1, t2);
 182 #endif
 183 
 184  private:
 185 
 186   address generate_call_stub(address& return_address) {
 187     StubCodeMark mark(this, "StubRoutines", "call_stub");
 188     address start = __ pc();
 189 
 190 #ifdef AARCH64
 191     const int saved_regs_size = 192;
 192 
 193     __ stp(FP, LR, Address(SP, -saved_regs_size, pre_indexed));
 194     __ mov(FP, SP);
 195 
 196     int sp_offset = 16;
 197     assert(frame::entry_frame_call_wrapper_offset * wordSize == sp_offset, "adjust this code");
 198     __ stp(R0,  ZR,  Address(SP, sp_offset)); sp_offset += 16;
 199 
 200     const int saved_result_and_result_type_offset = sp_offset;
 201     __ stp(R1,  R2,  Address(SP, sp_offset)); sp_offset += 16;
 202     __ stp(R19, R20, Address(SP, sp_offset)); sp_offset += 16;
 203     __ stp(R21, R22, Address(SP, sp_offset)); sp_offset += 16;
 204     __ stp(R23, R24, Address(SP, sp_offset)); sp_offset += 16;
 205     __ stp(R25, R26, Address(SP, sp_offset)); sp_offset += 16;
 206     __ stp(R27, R28, Address(SP, sp_offset)); sp_offset += 16;
 207 
 208     __ stp_d(V8,  V9,  Address(SP, sp_offset)); sp_offset += 16;
 209     __ stp_d(V10, V11, Address(SP, sp_offset)); sp_offset += 16;
 210     __ stp_d(V12, V13, Address(SP, sp_offset)); sp_offset += 16;
 211     __ stp_d(V14, V15, Address(SP, sp_offset)); sp_offset += 16;
 212     assert (sp_offset == saved_regs_size, "adjust this code");
 213 
 214     __ mov(Rmethod, R3);
 215     __ mov(Rthread, R7);
 216     __ reinit_heapbase();
 217 
 218     { // Pass parameters
 219       Label done_parameters, pass_parameters;
 220 
 221       __ mov(Rparams, SP);
 222       __ cbz_w(R6, done_parameters);
 223 
 224       __ sub(Rtemp, SP, R6, ex_uxtw, LogBytesPerWord);
 225       __ align_reg(SP, Rtemp, StackAlignmentInBytes);
 226       __ add(Rparams, SP, R6, ex_uxtw, LogBytesPerWord);
 227 
 228       __ bind(pass_parameters);
 229       __ subs_w(R6, R6, 1);
 230       __ ldr(Rtemp, Address(R5, wordSize, post_indexed));
 231       __ str(Rtemp, Address(Rparams, -wordSize, pre_indexed));
 232       __ b(pass_parameters, ne);
 233 
 234       __ bind(done_parameters);
 235 
 236 #ifdef ASSERT
 237       {
 238         Label L;
 239         __ cmp(SP, Rparams);
 240         __ b(L, eq);
 241         __ stop("SP does not match Rparams");
 242         __ bind(L);
 243       }
 244 #endif
 245     }
 246 
 247     __ mov(Rsender_sp, SP);
 248     __ blr(R4);
 249     return_address = __ pc();
 250 
 251     __ mov(SP, FP);
 252 
 253     __ ldp(R1, R2, Address(SP, saved_result_and_result_type_offset));
 254 
 255     { // Handle return value
 256       Label cont;
 257       __ str(R0, Address(R1));
 258 
 259       __ cmp_w(R2, T_DOUBLE);
 260       __ ccmp_w(R2, T_FLOAT, Assembler::flags_for_condition(eq), ne);
 261       __ b(cont, ne);
 262 
 263       __ str_d(V0, Address(R1));
 264       __ bind(cont);
 265     }
 266 
 267     sp_offset = saved_result_and_result_type_offset + 16;
 268     __ ldp(R19, R20, Address(SP, sp_offset)); sp_offset += 16;
 269     __ ldp(R21, R22, Address(SP, sp_offset)); sp_offset += 16;
 270     __ ldp(R23, R24, Address(SP, sp_offset)); sp_offset += 16;
 271     __ ldp(R25, R26, Address(SP, sp_offset)); sp_offset += 16;
 272     __ ldp(R27, R28, Address(SP, sp_offset)); sp_offset += 16;
 273 
 274     __ ldp_d(V8,  V9,  Address(SP, sp_offset)); sp_offset += 16;
 275     __ ldp_d(V10, V11, Address(SP, sp_offset)); sp_offset += 16;
 276     __ ldp_d(V12, V13, Address(SP, sp_offset)); sp_offset += 16;
 277     __ ldp_d(V14, V15, Address(SP, sp_offset)); sp_offset += 16;
 278     assert (sp_offset == saved_regs_size, "adjust this code");
 279 
 280     __ ldp(FP, LR, Address(SP, saved_regs_size, post_indexed));
 281     __ ret();
 282 
 283 #else // AARCH64
 284 
 285     assert(frame::entry_frame_call_wrapper_offset == 0, "adjust this code");
 286 
 287     __ mov(Rtemp, SP);
 288     __ push(RegisterSet(FP) | RegisterSet(LR));
 289 #ifndef __SOFTFP__
 290     __ fstmdbd(SP, FloatRegisterSet(D8, 8), writeback);
 291 #endif
 292     __ stmdb(SP, RegisterSet(R0, R2) | RegisterSet(R4, R6) | RegisterSet(R8, R10) | altFP_7_11, writeback);
 293     __ mov(Rmethod, R3);
 294     __ ldmia(Rtemp, RegisterSet(R1, R3) | Rthread); // stacked arguments
 295 
 296     // XXX: TODO
 297     // Would be better with respect to native tools if the following
 298     // setting of FP was changed to conform to the native ABI, with FP
 299     // pointing to the saved FP slot (and the corresponding modifications
 300     // for entry_frame_call_wrapper_offset and frame::real_fp).
 301     __ mov(FP, SP);
 302 
 303     {
 304       Label no_parameters, pass_parameters;
 305       __ cmp(R3, 0);
 306       __ b(no_parameters, eq);
 307 
 308       __ bind(pass_parameters);
 309       __ ldr(Rtemp, Address(R2, wordSize, post_indexed)); // Rtemp OK, unused and scratchable
 310       __ subs(R3, R3, 1);
 311       __ push(Rtemp);
 312       __ b(pass_parameters, ne);
 313       __ bind(no_parameters);
 314     }
 315 
 316     __ mov(Rsender_sp, SP);
 317     __ blx(R1);
 318     return_address = __ pc();
 319 
 320     __ add(SP, FP, wordSize); // Skip link to JavaCallWrapper
 321     __ pop(RegisterSet(R2, R3));
 322 #ifndef __ABI_HARD__
 323     __ cmp(R3, T_LONG);
 324     __ cmp(R3, T_DOUBLE, ne);
 325     __ str(R0, Address(R2));
 326     __ str(R1, Address(R2, wordSize), eq);
 327 #else
 328     Label cont, l_float, l_double;
 329 
 330     __ cmp(R3, T_DOUBLE);
 331     __ b(l_double, eq);
 332 
 333     __ cmp(R3, T_FLOAT);
 334     __ b(l_float, eq);
 335 
 336     __ cmp(R3, T_LONG);
 337     __ str(R0, Address(R2));
 338     __ str(R1, Address(R2, wordSize), eq);
 339     __ b(cont);
 340 
 341 
 342     __ bind(l_double);
 343     __ fstd(D0, Address(R2));
 344     __ b(cont);
 345 
 346     __ bind(l_float);
 347     __ fsts(S0, Address(R2));
 348 
 349     __ bind(cont);
 350 #endif
 351 
 352     __ pop(RegisterSet(R4, R6) | RegisterSet(R8, R10) | altFP_7_11);
 353 #ifndef __SOFTFP__
 354     __ fldmiad(SP, FloatRegisterSet(D8, 8), writeback);
 355 #endif
 356     __ pop(RegisterSet(FP) | RegisterSet(PC));
 357 
 358 #endif // AARCH64
 359     return start;
 360   }
 361 
 362 
 363   // (in) Rexception_obj: exception oop
 364   address generate_catch_exception() {
 365     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 366     address start = __ pc();
 367 
 368     __ str(Rexception_obj, Address(Rthread, Thread::pending_exception_offset()));
 369     __ b(StubRoutines::_call_stub_return_address);
 370 
 371     return start;
 372   }
 373 
 374 
 375   // (in) Rexception_pc: return address
 376   address generate_forward_exception() {
 377     StubCodeMark mark(this, "StubRoutines", "forward exception");
 378     address start = __ pc();
 379 
 380     __ mov(c_rarg0, Rthread);
 381     __ mov(c_rarg1, Rexception_pc);
 382     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 383                          SharedRuntime::exception_handler_for_return_address),
 384                          c_rarg0, c_rarg1);
 385     __ ldr(Rexception_obj, Address(Rthread, Thread::pending_exception_offset()));
 386     const Register Rzero = __ zero_register(Rtemp); // Rtemp OK (cleared by above call)
 387     __ str(Rzero, Address(Rthread, Thread::pending_exception_offset()));
 388 
 389 #ifdef ASSERT
 390     // make sure exception is set
 391     { Label L;
 392       __ cbnz(Rexception_obj, L);
 393       __ stop("StubRoutines::forward exception: no pending exception (2)");
 394       __ bind(L);
 395     }
 396 #endif
 397 
    // Verify that there is really a valid exception in Rexception_obj.
 399     __ verify_oop(Rexception_obj);
 400 
 401     __ jump(R0); // handler is returned in R0 by runtime function
 402     return start;
 403   }
 404 
 405 
 406 #ifndef AARCH64
 407 
 408   // Integer division shared routine
 409   //   Input:
 410   //     R0  - dividend
 411   //     R2  - divisor
 412   //   Output:
 413   //     R0  - remainder
 414   //     R1  - quotient
 415   //   Destroys:
 416   //     R2
 417   //     LR
 418   address generate_idiv_irem() {
 419     Label positive_arguments, negative_or_zero, call_slow_path;
 420     Register dividend  = R0;
 421     Register divisor   = R2;
 422     Register remainder = R0;
 423     Register quotient  = R1;
 424     Register tmp       = LR;
 425     assert(dividend == remainder, "must be");
 426 
 427     address start = __ pc();
 428 
 429     // Check for special cases: divisor <= 0 or dividend < 0
 430     __ cmp(divisor, 0);
 431     __ orrs(quotient, dividend, divisor, ne);
 432     __ b(negative_or_zero, le);
 433 
 434     __ bind(positive_arguments);
 435     // Save return address on stack to free one extra register
 436     __ push(LR);
    // Approximate the maximum order of the quotient
 438     __ clz(tmp, dividend);
 439     __ clz(quotient, divisor);
 440     __ subs(tmp, quotient, tmp);
 441     __ mov(quotient, 0);
 442     // Jump to the appropriate place in the unrolled loop below
 443     __ ldr(PC, Address(PC, tmp, lsl, 2), pl);
 444     // If divisor is greater than dividend, return immediately
 445     __ pop(PC);
 446 
 447     // Offset table
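    // The conditional 'ldr PC, [PC, tmp, lsl #2]' above reads PC as the address
    // of that instruction + 8, which is exactly where this table starts. Entry i
    // (tmp = clz(divisor) - clz(dividend)) holds the address of the unrolled-loop
    // step that compares and subtracts (divisor << i) first.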
 448     Label offset_table[32];
 449     int i;
 450     for (i = 0; i <= 31; i++) {
 451       __ emit_address(offset_table[i]);
 452     }
 453 
 454     // Unrolled loop of 32 division steps
 455     for (i = 31; i >= 0; i--) {
 456       __ bind(offset_table[i]);
 457       __ cmp(remainder, AsmOperand(divisor, lsl, i));
 458       __ sub(remainder, remainder, AsmOperand(divisor, lsl, i), hs);
 459       __ add(quotient, quotient, 1 << i, hs);
 460     }
 461     __ pop(PC);
 462 
 463     __ bind(negative_or_zero);
 464     // Find the combination of argument signs and jump to corresponding handler
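    // quotient gets bit 31 = sign(dividend) and bit 0 = sign(divisor); rotating it
    // right by 26 yields 32, 64 or 96, which, added to PC (read as the address of
    // the add + 8, i.e. the start of the first 32-byte FixedSizeCodeBlock below),
    // dispatches to the handler for that sign combination. When divisor == 0 the
    // flags are eq, the three conditional instructions are skipped, and execution
    // falls through (saving LR as the exception pc) into the divide-by-zero block.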
 465     __ andr(quotient, dividend, 0x80000000, ne);
 466     __ orr(quotient, quotient, AsmOperand(divisor, lsr, 31), ne);
 467     __ add(PC, PC, AsmOperand(quotient, ror, 26), ne);
 468     __ str(LR, Address(Rthread, JavaThread::saved_exception_pc_offset()));
 469 
    // The leaf runtime function can destroy R0-R3 and R12, which are still live here
 471     RegisterSet saved_registers = RegisterSet(R3) | RegisterSet(R12);
 472 #if R9_IS_SCRATCHED
 473     // Safer to save R9 here since callers may have been written
 474     // assuming R9 survives. This is suboptimal but may not be worth
 475     // revisiting for this slow case.
 476 
 477     // save also R10 for alignment
 478     saved_registers = saved_registers | RegisterSet(R9, R10);
 479 #endif
 480     {
 481       // divisor == 0
 482       FixedSizeCodeBlock zero_divisor(_masm, 8, true);
 483       __ push(saved_registers);
 484       __ mov(R0, Rthread);
 485       __ mov(R1, LR);
 486       __ mov(R2, SharedRuntime::IMPLICIT_DIVIDE_BY_ZERO);
 487       __ b(call_slow_path);
 488     }
 489 
 490     {
 491       // divisor > 0 && dividend < 0
 492       FixedSizeCodeBlock positive_divisor_negative_dividend(_masm, 8, true);
 493       __ push(LR);
 494       __ rsb(dividend, dividend, 0);
 495       __ bl(positive_arguments);
 496       __ rsb(remainder, remainder, 0);
 497       __ rsb(quotient, quotient, 0);
 498       __ pop(PC);
 499     }
 500 
 501     {
 502       // divisor < 0 && dividend > 0
 503       FixedSizeCodeBlock negative_divisor_positive_dividend(_masm, 8, true);
 504       __ push(LR);
 505       __ rsb(divisor, divisor, 0);
 506       __ bl(positive_arguments);
 507       __ rsb(quotient, quotient, 0);
 508       __ pop(PC);
 509     }
 510 
 511     {
 512       // divisor < 0 && dividend < 0
 513       FixedSizeCodeBlock negative_divisor_negative_dividend(_masm, 8, true);
 514       __ push(LR);
 515       __ rsb(dividend, dividend, 0);
 516       __ rsb(divisor, divisor, 0);
 517       __ bl(positive_arguments);
 518       __ rsb(remainder, remainder, 0);
 519       __ pop(PC);
 520     }
 521 
 522     __ bind(call_slow_path);
 523     __ call(CAST_FROM_FN_PTR(address, SharedRuntime::continuation_for_implicit_exception));
 524     __ pop(saved_registers);
 525     __ bx(R0);
 526 
 527     return start;
 528   }
 529 
 530 
 531  // As per atomic.hpp the Atomic read-modify-write operations must be logically implemented as:
 532  //  <fence>; <op>; <membar StoreLoad|StoreStore>
 533  // But for load-linked/store-conditional based systems a fence here simply means
 534  // no load/store can be reordered with respect to the initial load-linked, so we have:
 535  // <membar storeload|loadload> ; load-linked; <op>; store-conditional; <membar storeload|storestore>
 536  // There are no memory actions in <op> so nothing further is needed.
 537  //
 538  // So we define the following for convenience:
 539 #define MEMBAR_ATOMIC_OP_PRE \
 540     MacroAssembler::Membar_mask_bits(MacroAssembler::StoreLoad|MacroAssembler::LoadLoad)
 541 #define MEMBAR_ATOMIC_OP_POST \
 542     MacroAssembler::Membar_mask_bits(MacroAssembler::StoreLoad|MacroAssembler::StoreStore)
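  // Typical use in the atomic stubs below:
  //   __ membar(MEMBAR_ATOMIC_OP_PRE, tmp);    // before the ldrex/strex retry loop
  //   ... ldrex; <op>; strex; retry on failure ...
  //   __ membar(MEMBAR_ATOMIC_OP_POST, tmp);   // once the store-conditional has succeeded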
 543 
 544   // Note: JDK 9 only supports ARMv7+ so we always have ldrexd available even though the
 545   // code below allows for it to be otherwise. The else clause indicates an ARMv5 system
 546   // for which we do not support MP and so membars are not necessary. This ARMv5 code will
 547   // be removed in the future.
 548 
 549   // Support for jint Atomic::add(jint add_value, volatile jint *dest)
 550   //
 551   // Arguments :
 552   //
 553   //      add_value:      R0
 554   //      dest:           R1
 555   //
 556   // Results:
 557   //
  //     R0: the new value stored in dest
 559   //
 560   // Overwrites:
 561   //
 562   //     R1, R2, R3
 563   //
 564   address generate_atomic_add() {
 565     address start;
 566 
 567     StubCodeMark mark(this, "StubRoutines", "atomic_add");
 568     Label retry;
 569     start = __ pc();
 570     Register addval    = R0;
 571     Register dest      = R1;
 572     Register prev      = R2;
 573     Register ok        = R2;
 574     Register newval    = R3;
 575 
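    // The retry loop below is essentially the LL/SC form of:
    //   do { old = *dest; } while (!CAS(dest, old, old + add_value));
    //   return old + add_value;   // i.e. the new value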
 576     if (VM_Version::supports_ldrex()) {
 577       __ membar(MEMBAR_ATOMIC_OP_PRE, prev);
 578       __ bind(retry);
 579       __ ldrex(newval, Address(dest));
 580       __ add(newval, addval, newval);
 581       __ strex(ok, newval, Address(dest));
 582       __ cmp(ok, 0);
 583       __ b(retry, ne);
 584       __ mov (R0, newval);
 585       __ membar(MEMBAR_ATOMIC_OP_POST, prev);
 586     } else {
 587       __ bind(retry);
 588       __ ldr (prev, Address(dest));
 589       __ add(newval, addval, prev);
 590       __ atomic_cas_bool(prev, newval, dest, 0, noreg/*ignored*/);
 591       __ b(retry, ne);
 592       __ mov (R0, newval);
 593     }
 594     __ bx(LR);
 595 
 596     return start;
 597   }
 598 
 599   // Support for jint Atomic::xchg(jint exchange_value, volatile jint *dest)
 600   //
 601   // Arguments :
 602   //
 603   //      exchange_value: R0
 604   //      dest:           R1
 605   //
 606   // Results:
 607   //
 608   //     R0: the value previously stored in dest
 609   //
 610   // Overwrites:
 611   //
 612   //     R1, R2, R3
 613   //
 614   address generate_atomic_xchg() {
 615     address start;
 616 
 617     StubCodeMark mark(this, "StubRoutines", "atomic_xchg");
 618     start = __ pc();
 619     Register newval    = R0;
 620     Register dest      = R1;
 621     Register prev      = R2;
 622 
 623     Label retry;
 624 
 625     if (VM_Version::supports_ldrex()) {
 626       Register ok=R3;
 627       __ membar(MEMBAR_ATOMIC_OP_PRE, prev);
 628       __ bind(retry);
 629       __ ldrex(prev, Address(dest));
 630       __ strex(ok, newval, Address(dest));
 631       __ cmp(ok, 0);
 632       __ b(retry, ne);
 633       __ mov (R0, prev);
 634       __ membar(MEMBAR_ATOMIC_OP_POST, prev);
 635     } else {
 636       __ bind(retry);
 637       __ ldr (prev, Address(dest));
 638       __ atomic_cas_bool(prev, newval, dest, 0, noreg/*ignored*/);
 639       __ b(retry, ne);
 640       __ mov (R0, prev);
 641     }
 642     __ bx(LR);
 643 
 644     return start;
 645   }
 646 
 647   // Support for jint Atomic::cmpxchg(jint exchange_value, volatile jint *dest, jint compare_value)
 648   //
 649   // Arguments :
 650   //
 651   //      compare_value:  R0
 652   //      exchange_value: R1
 653   //      dest:           R2
 654   //
 655   // Results:
 656   //
 657   //     R0: the value previously stored in dest
 658   //
 659   // Overwrites:
 660   //
 661   //     R0, R1, R2, R3, Rtemp
 662   //
 663   address generate_atomic_cmpxchg() {
 664     address start;
 665 
 666     StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg");
 667     start = __ pc();
 668     Register cmp       = R0;
 669     Register newval    = R1;
 670     Register dest      = R2;
 671     Register temp1     = R3;
 672     Register temp2     = Rtemp; // Rtemp free (native ABI)
 673 
 674     __ membar(MEMBAR_ATOMIC_OP_PRE, temp1);
 675 
 676     // atomic_cas returns previous value in R0
 677     __ atomic_cas(temp1, temp2, cmp, newval, dest, 0);
 678 
 679     __ membar(MEMBAR_ATOMIC_OP_POST, temp1);
 680 
 681     __ bx(LR);
 682 
 683     return start;
 684   }
 685 
 686   // Support for jlong Atomic::cmpxchg(jlong exchange_value, volatile jlong *dest, jlong compare_value)
  // with the arguments reordered beforehand by a wrapper into (jlong compare_value, jlong exchange_value, volatile jlong *dest)
 688   //
 689   // Arguments :
 690   //
 691   //      compare_value:  R1 (High), R0 (Low)
 692   //      exchange_value: R3 (High), R2 (Low)
 693   //      dest:           SP+0
 694   //
 695   // Results:
 696   //
 697   //     R0:R1: the value previously stored in dest
 698   //
 699   // Overwrites:
 700   //
 701   address generate_atomic_cmpxchg_long() {
 702     address start;
 703 
 704     StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_long");
 705     start = __ pc();
 706     Register cmp_lo      = R0;
 707     Register cmp_hi      = R1;
 708     Register newval_lo   = R2;
 709     Register newval_hi   = R3;
 710     Register addr        = Rtemp;  /* After load from stack */
 711     Register temp_lo     = R4;
 712     Register temp_hi     = R5;
 713     Register temp_result = R8;
 714     assert_different_registers(cmp_lo, newval_lo, temp_lo, addr, temp_result, R7);
 715     assert_different_registers(cmp_hi, newval_hi, temp_hi, addr, temp_result, R7);
 716 
 717     __ membar(MEMBAR_ATOMIC_OP_PRE, Rtemp); // Rtemp free (native ABI)
 718 
    // The stack is unaligned here; maintain double-word alignment by
    // pushing an odd number of registers.
 721     __ push(RegisterSet(temp_result) | RegisterSet(temp_lo, temp_hi));
 722     __ ldr(addr, Address(SP, 12));
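    // The three registers just pushed occupy 12 bytes, so the stacked 'dest'
    // argument (at SP + 0 on entry) is now found at SP + 12.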
 723 
 724     // atomic_cas64 returns previous value in temp_lo, temp_hi
 725     __ atomic_cas64(temp_lo, temp_hi, temp_result, cmp_lo, cmp_hi,
 726                     newval_lo, newval_hi, addr, 0);
 727     __ mov(R0, temp_lo);
 728     __ mov(R1, temp_hi);
 729 
 730     __ pop(RegisterSet(temp_result) | RegisterSet(temp_lo, temp_hi));
 731 
 732     __ membar(MEMBAR_ATOMIC_OP_POST, Rtemp); // Rtemp free (native ABI)
 733     __ bx(LR);
 734 
 735     return start;
 736   }
 737 
 738   address generate_atomic_load_long() {
 739     address start;
 740 
 741     StubCodeMark mark(this, "StubRoutines", "atomic_load_long");
 742     start = __ pc();
 743     Register result_lo = R0;
 744     Register result_hi = R1;
 745     Register src       = R0;
 746 
 747     if (!os::is_MP()) {
 748       __ ldmia(src, RegisterSet(result_lo, result_hi));
 749       __ bx(LR);
 750     } else if (VM_Version::supports_ldrexd()) {
 751       __ ldrexd(result_lo, Address(src));
 752       __ clrex(); // FIXME: safe to remove?
 753       __ bx(LR);
 754     } else {
 755       __ stop("Atomic load(jlong) unsupported on this platform");
 756       __ bx(LR);
 757     }
 758 
 759     return start;
 760   }
 761 
 762   address generate_atomic_store_long() {
 763     address start;
 764 
 765     StubCodeMark mark(this, "StubRoutines", "atomic_store_long");
 766     start = __ pc();
 767     Register newval_lo = R0;
 768     Register newval_hi = R1;
 769     Register dest      = R2;
 770     Register scratch_lo    = R2;
 771     Register scratch_hi    = R3;  /* After load from stack */
 772     Register result    = R3;
 773 
 774     if (!os::is_MP()) {
 775       __ stmia(dest, RegisterSet(newval_lo, newval_hi));
 776       __ bx(LR);
 777     } else if (VM_Version::supports_ldrexd()) {
 778       __ mov(Rtemp, dest);  // get dest to Rtemp
 779       Label retry;
 780       __ bind(retry);
 781       __ ldrexd(scratch_lo, Address(Rtemp));
 782       __ strexd(result, R0, Address(Rtemp));
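      // strexd writes 0 to 'result' on success and 1 on failure; 'rsbs result, result, 1'
      // therefore sets Z (eq) exactly when the store failed, so the branch below retries.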
 783       __ rsbs(result, result, 1);
 784       __ b(retry, eq);
 785       __ bx(LR);
 786     } else {
 787       __ stop("Atomic store(jlong) unsupported on this platform");
 788       __ bx(LR);
 789     }
 790 
 791     return start;
 792   }
 793 
 794 
 795 #endif // AARCH64
 796 
 797 #ifdef COMPILER2
 798   // Support for uint StubRoutine::Arm::partial_subtype_check( Klass sub, Klass super );
 799   // Arguments :
 800   //
 801   //      ret  : R0, returned
  //      condition flags: set according to the result in R0 (eq = is a subtype)
 803   //      sub  : R1, argument, not changed
 804   //      super: R2, argument, not changed
 805   //      raddr: LR, blown by call
 806   address generate_partial_subtype_check() {
 807     __ align(CodeEntryAlignment);
 808     StubCodeMark mark(this, "StubRoutines", "partial_subtype_check");
 809     address start = __ pc();
 810 
 811     // based on SPARC check_klass_subtype_[fast|slow]_path (without CompressedOops)
 812 
 813     // R0 used as tmp_reg (in addition to return reg)
 814     Register sub_klass = R1;
 815     Register super_klass = R2;
 816     Register tmp_reg2 = R3;
 817     Register tmp_reg3 = R4;
 818 #define saved_set tmp_reg2, tmp_reg3
 819 
 820     Label L_loop, L_fail;
 821 
 822     int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
 823 
 824     // fast check should be redundant
 825 
 826     // slow check
 827     {
 828       __ raw_push(saved_set);
 829 
 830       // a couple of useful fields in sub_klass:
 831       int ss_offset = in_bytes(Klass::secondary_supers_offset());
 832 
 833       // Do a linear scan of the secondary super-klass chain.
 834       // This code is rarely used, so simplicity is a virtue here.
 835 
 836       inc_counter_np(SharedRuntime::_partial_subtype_ctr, tmp_reg2, tmp_reg3);
 837 
 838       Register scan_temp = tmp_reg2;
 839       Register count_temp = tmp_reg3;
 840 
 841       // We will consult the secondary-super array.
 842       __ ldr(scan_temp, Address(sub_klass, ss_offset));
 843 
 844       Register search_key = super_klass;
 845 
 846       // Load the array length.
 847       __ ldr_s32(count_temp, Address(scan_temp, Array<Klass*>::length_offset_in_bytes()));
 848       __ add(scan_temp, scan_temp, Array<Klass*>::base_offset_in_bytes());
 849 
 850       __ add(count_temp, count_temp, 1);
 851 
 852       // Top of search loop
 853       __ bind(L_loop);
 854       // Notes:
 855       //  scan_temp starts at the array elements
 856       //  count_temp is 1+size
 857       __ subs(count_temp, count_temp, 1);
 858       __ b(L_fail, eq); // not found in the array
 859 
 860       // Load next super to check
 861       // In the array of super classes elements are pointer sized.
 862       int element_size = wordSize;
 863       __ ldr(R0, Address(scan_temp, element_size, post_indexed));
 864 
 865       // Look for Rsuper_klass on Rsub_klass's secondary super-class-overflow list
 866       __ subs(R0, R0, search_key); // set R0 to 0 on success (and flags to eq)
 867 
 868       // A miss means we are NOT a subtype and need to keep looping
 869       __ b(L_loop, ne);
 870 
 871       // Falling out the bottom means we found a hit; we ARE a subtype
 872 
 873       // Success.  Cache the super we found and proceed in triumph.
 874       __ str(super_klass, Address(sub_klass, sc_offset));
 875 
 876       // Return success
 877       // R0 is already 0 and flags are already set to eq
 878       __ raw_pop(saved_set);
 879       __ ret();
 880 
 881       // Return failure
 882       __ bind(L_fail);
 883 #ifdef AARCH64
 884       // count_temp is 0, can't use ZR here
 885       __ adds(R0, count_temp, 1); // sets the flags
 886 #else
 887       __ movs(R0, 1); // sets the flags
 888 #endif
 889       __ raw_pop(saved_set);
 890       __ ret();
 891     }
 892     return start;
 893   }
 894 #undef saved_set
 895 #endif // COMPILER2
 896 
 897 
 898   //----------------------------------------------------------------------------------------------------
 899   // Non-destructive plausibility checks for oops
 900 
 901   address generate_verify_oop() {
 902     StubCodeMark mark(this, "StubRoutines", "verify_oop");
 903     address start = __ pc();
 904 
 905     // Incoming arguments:
 906     //
 907     // R0: error message (char* )
 908     // R1: address of register save area
 909     // R2: oop to verify
 910     //
 911     // All registers are saved before calling this stub. However, condition flags should be saved here.
 912 
 913     const Register oop   = R2;
 914     const Register klass = R3;
 915     const Register tmp1  = R6;
 916     const Register tmp2  = R8;
 917 
 918     const Register flags     = Rtmp_save0; // R4/R19
 919     const Register ret_addr  = Rtmp_save1; // R5/R20
 920     assert_different_registers(oop, klass, tmp1, tmp2, flags, ret_addr, R7);
 921 
 922     Label exit, error;
 923     InlinedAddress verify_oop_count((address) StubRoutines::verify_oop_count_addr());
 924 
 925 #ifdef AARCH64
 926     __ mrs(flags, Assembler::SysReg_NZCV);
 927 #else
 928     __ mrs(Assembler::CPSR, flags);
 929 #endif // AARCH64
 930 
 931     __ ldr_literal(tmp1, verify_oop_count);
 932     __ ldr_s32(tmp2, Address(tmp1));
 933     __ add(tmp2, tmp2, 1);
 934     __ str_32(tmp2, Address(tmp1));
 935 
 936     // make sure object is 'reasonable'
 937     __ cbz(oop, exit);                           // if obj is NULL it is ok
 938 
 939     // Check if the oop is in the right area of memory
 940     // Note: oop_mask and oop_bits must be updated if the code is saved/reused
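    // i.e. check that (oop & oop_mask) == oop_bits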
 941     const address oop_mask = (address) Universe::verify_oop_mask();
 942     const address oop_bits = (address) Universe::verify_oop_bits();
 943     __ mov_address(tmp1, oop_mask, symbolic_Relocation::oop_mask_reference);
 944     __ andr(tmp2, oop, tmp1);
 945     __ mov_address(tmp1, oop_bits, symbolic_Relocation::oop_bits_reference);
 946     __ cmp(tmp2, tmp1);
 947     __ b(error, ne);
 948 
 949     // make sure klass is 'reasonable'
 950     __ load_klass(klass, oop);                   // get klass
 951     __ cbz(klass, error);                        // if klass is NULL it is broken
 952 
 953     // return if everything seems ok
 954     __ bind(exit);
 955 
 956 #ifdef AARCH64
 957     __ msr(Assembler::SysReg_NZCV, flags);
 958 #else
 959     __ msr(Assembler::CPSR_f, flags);
 960 #endif // AARCH64
 961 
 962     __ ret();
 963 
 964     // handle errors
 965     __ bind(error);
 966 
 967     __ mov(ret_addr, LR);                      // save return address
 968 
 969     // R0: error message
 970     // R1: register save area
 971     __ call(CAST_FROM_FN_PTR(address, MacroAssembler::debug));
 972 
 973     __ mov(LR, ret_addr);
 974     __ b(exit);
 975 
 976     __ bind_literal(verify_oop_count);
 977 
 978     return start;
 979   }
 980 
 981   //----------------------------------------------------------------------------------------------------
 982   // Array copy stubs
 983 
 984   //
 985   //  Generate overlap test for array copy stubs
 986   //
 987   //  Input:
 988   //    R0    -  array1
 989   //    R1    -  array2
 990   //    R2    -  element count, 32-bit int
 991   //
 992   //  input registers are preserved
 993   //
 994   void array_overlap_test(address no_overlap_target, int log2_elem_size, Register tmp1, Register tmp2) {
 995     assert(no_overlap_target != NULL, "must be generated");
 996     array_overlap_test(no_overlap_target, NULL, log2_elem_size, tmp1, tmp2);
 997   }
 998   void array_overlap_test(Label& L_no_overlap, int log2_elem_size, Register tmp1, Register tmp2) {
 999     array_overlap_test(NULL, &L_no_overlap, log2_elem_size, tmp1, tmp2);
1000   }
1001   void array_overlap_test(address no_overlap_target, Label* NOLp, int log2_elem_size, Register tmp1, Register tmp2) {
1002     const Register from       = R0;
1003     const Register to         = R1;
1004     const Register count      = R2;
1005     const Register to_from    = tmp1; // to - from
1006 #ifndef AARCH64
1007     const Register byte_count = (log2_elem_size == 0) ? count : tmp2; // count << log2_elem_size
1008 #endif // AARCH64
1009     assert_different_registers(from, to, count, tmp1, tmp2);
1010 
    // The no-overlap version works if 'to' is lower (unsigned) than 'from',
    // or if 'to' is at least (count*size) bytes above 'from'.
1013 
1014     BLOCK_COMMENT("Array Overlap Test:");
1015     __ subs(to_from, to, from);
1016 #ifndef AARCH64
1017     if (log2_elem_size != 0) {
1018       __ mov(byte_count, AsmOperand(count, lsl, log2_elem_size));
1019     }
1020 #endif // !AARCH64
1021     if (NOLp == NULL)
1022       __ b(no_overlap_target,lo);
1023     else
1024       __ b((*NOLp), lo);
1025 #ifdef AARCH64
1026     __ subs(ZR, to_from, count, ex_sxtw, log2_elem_size);
1027 #else
1028     __ cmp(to_from, byte_count);
1029 #endif // AARCH64
1030     if (NOLp == NULL)
1031       __ b(no_overlap_target, ge);
1032     else
1033       __ b((*NOLp), ge);
1034   }
1035 
1036 #ifdef AARCH64
  // TODO-AARCH64: revise usages of bulk_* methods (probably ldp's and stp's should be interleaved)
1038 
1039   // Loads [from, from + count*wordSize) into regs[0], regs[1], ..., regs[count-1]
1040   // and increases 'from' by count*wordSize.
1041   void bulk_load_forward(Register from, const Register regs[], int count) {
1042     assert (count > 0 && count % 2 == 0, "count must be positive even number");
1043     int bytes = count * wordSize;
1044 
1045     int offset = 0;
1046     __ ldp(regs[0], regs[1], Address(from, bytes, post_indexed));
1047     offset += 2*wordSize;
1048 
1049     for (int i = 2; i < count; i += 2) {
1050       __ ldp(regs[i], regs[i+1], Address(from, -bytes + offset));
1051       offset += 2*wordSize;
1052     }
1053 
1054     assert (offset == bytes, "must be");
1055   }
1056 
1057   // Stores regs[0], regs[1], ..., regs[count-1] to [to, to + count*wordSize)
1058   // and increases 'to' by count*wordSize.
1059   void bulk_store_forward(Register to, const Register regs[], int count) {
1060     assert (count > 0 && count % 2 == 0, "count must be positive even number");
1061     int bytes = count * wordSize;
1062 
1063     int offset = 0;
1064     __ stp(regs[0], regs[1], Address(to, bytes, post_indexed));
1065     offset += 2*wordSize;
1066 
1067     for (int i = 2; i < count; i += 2) {
1068       __ stp(regs[i], regs[i+1], Address(to, -bytes + offset));
1069       offset += 2*wordSize;
1070     }
1071 
1072     assert (offset == bytes, "must be");
1073   }
1074 
1075   // Loads [from - count*wordSize, from) into regs[0], regs[1], ..., regs[count-1]
1076   // and decreases 'from' by count*wordSize.
1077   // Note that the word with lowest address goes to regs[0].
1078   void bulk_load_backward(Register from, const Register regs[], int count) {
1079     assert (count > 0 && count % 2 == 0, "count must be positive even number");
1080     int bytes = count * wordSize;
1081 
1082     int offset = 0;
1083 
1084     for (int i = count - 2; i > 0; i -= 2) {
1085       offset += 2*wordSize;
1086       __ ldp(regs[i], regs[i+1], Address(from, -offset));
1087     }
1088 
1089     offset += 2*wordSize;
1090     __ ldp(regs[0], regs[1], Address(from, -bytes, pre_indexed));
1091 
1092     assert (offset == bytes, "must be");
1093   }
1094 
1095   // Stores regs[0], regs[1], ..., regs[count-1] into [to - count*wordSize, to)
1096   // and decreases 'to' by count*wordSize.
1097   // Note that regs[0] value goes into the memory with lowest address.
1098   void bulk_store_backward(Register to, const Register regs[], int count) {
1099     assert (count > 0 && count % 2 == 0, "count must be positive even number");
1100     int bytes = count * wordSize;
1101 
1102     int offset = 0;
1103 
1104     for (int i = count - 2; i > 0; i -= 2) {
1105       offset += 2*wordSize;
1106       __ stp(regs[i], regs[i+1], Address(to, -offset));
1107     }
1108 
1109     offset += 2*wordSize;
1110     __ stp(regs[0], regs[1], Address(to, -bytes, pre_indexed));
1111 
1112     assert (offset == bytes, "must be");
1113   }
1114 #endif // AARCH64
1115 
1116   // TODO-AARCH64: rearrange in-loop prefetches:
1117   //   probably we should choose between "prefetch-store before or after store", not "before or after load".
1118   void prefetch(Register from, Register to, int offset, int to_delta = 0) {
1119     __ prefetch_read(Address(from, offset));
1120 #ifdef AARCH64
1121   // Next line commented out to avoid significant loss of performance in memory copy - JDK-8078120
1122   // __ prfm(pstl1keep, Address(to, offset + to_delta));
1123 #endif // AARCH64
1124   }
1125 
1126   // Generate the inner loop for forward aligned array copy
1127   //
1128   // Arguments
1129   //      from:      src address, 64 bits  aligned
1130   //      to:        dst address, wordSize aligned
1131   //      count:     number of elements (32-bit int)
1132   //      bytes_per_count: number of bytes for each unit of 'count'
1133   //
1134   // Return the minimum initial value for count
1135   //
1136   // Notes:
1137   // - 'from' aligned on 64-bit (recommended for 32-bit ARM in case this speeds up LDMIA, required for AArch64)
1138   // - 'to' aligned on wordSize
  // - 'count' must be greater than or equal to the returned value
1140   //
1141   // Increases 'from' and 'to' by count*bytes_per_count.
1142   //
1143   // Scratches 'count', R3.
1144   // On AArch64 also scratches R4-R10; on 32-bit ARM R4-R10 are preserved (saved/restored).
1145   //
1146   int generate_forward_aligned_copy_loop(Register from, Register to, Register count, int bytes_per_count) {
1147     assert (from == R0 && to == R1 && count == R2, "adjust the implementation below");
1148 
1149     const int bytes_per_loop = 8*wordSize; // 8 registers are read and written on every loop iteration
1150     arraycopy_loop_config *config=&arraycopy_configurations[ArmCopyPlatform].forward_aligned;
1151     int pld_offset = config->pld_distance;
1152     const int count_per_loop = bytes_per_loop / bytes_per_count;
1153 
1154 #ifndef AARCH64
1155     bool split_read= config->split_ldm;
1156     bool split_write= config->split_stm;
1157 
1158     // XXX optim: use VLDM/VSTM when available (Neon) with PLD
1159     //  NEONCopyPLD
1160     //      PLD [r1, #0xC0]
1161     //      VLDM r1!,{d0-d7}
1162     //      VSTM r0!,{d0-d7}
1163     //      SUBS r2,r2,#0x40
1164     //      BGE NEONCopyPLD
1165 
1166     __ push(RegisterSet(R4,R10));
1167 #endif // !AARCH64
1168 
1169     const bool prefetch_before = pld_offset < 0;
1170     const bool prefetch_after = pld_offset > 0;
1171 
1172     Label L_skip_pld;
1173 
    // predecrease so the loop exits when fewer than count_per_loop elements remain
1175     __ sub_32(count, count, count_per_loop);
1176 
1177     if (pld_offset != 0) {
1178       pld_offset = (pld_offset < 0) ? -pld_offset : pld_offset;
1179 
1180       prefetch(from, to, 0);
1181 
1182       if (prefetch_before) {
1183         // If prefetch is done ahead, final PLDs that overflow the
1184         // copied area can be easily avoided. 'count' is predecreased
1185         // by the prefetch distance to optimize the inner loop and the
1186         // outer loop skips the PLD.
1187         __ subs_32(count, count, (bytes_per_loop+pld_offset)/bytes_per_count);
1188 
1189         // skip prefetch for small copies
1190         __ b(L_skip_pld, lt);
1191       }
1192 
1193       int offset = ArmCopyCacheLineSize;
1194       while (offset <= pld_offset) {
1195         prefetch(from, to, offset);
1196         offset += ArmCopyCacheLineSize;
1197       };
1198     }
1199 
1200 #ifdef AARCH64
1201     const Register data_regs[8] = {R3, R4, R5, R6, R7, R8, R9, R10};
1202 #endif // AARCH64
1203     {
1204       // LDM (32-bit ARM) / LDP (AArch64) copy of 'bytes_per_loop' bytes
1205 
1206       // 32-bit ARM note: we have tried implementing loop unrolling to skip one
      // PLD with a 64-byte cache line, but the gain was not significant.
1208 
1209       Label L_copy_loop;
1210       __ align(OptoLoopAlignment);
1211       __ BIND(L_copy_loop);
1212 
1213       if (prefetch_before) {
1214         prefetch(from, to, bytes_per_loop + pld_offset);
1215         __ BIND(L_skip_pld);
1216       }
1217 
1218 #ifdef AARCH64
1219       bulk_load_forward(from, data_regs, 8);
1220 #else
1221       if (split_read) {
1222         // Split the register set in two sets so that there is less
1223         // latency between LDM and STM (R3-R6 available while R7-R10
        // still loading) and fewer register locking issues when iterating
1225         // on the first LDM.
1226         __ ldmia(from, RegisterSet(R3, R6), writeback);
1227         __ ldmia(from, RegisterSet(R7, R10), writeback);
1228       } else {
1229         __ ldmia(from, RegisterSet(R3, R10), writeback);
1230       }
1231 #endif // AARCH64
1232 
1233       __ subs_32(count, count, count_per_loop);
1234 
1235       if (prefetch_after) {
1236         prefetch(from, to, pld_offset, bytes_per_loop);
1237       }
1238 
1239 #ifdef AARCH64
1240       bulk_store_forward(to, data_regs, 8);
1241 #else
1242       if (split_write) {
1243         __ stmia(to, RegisterSet(R3, R6), writeback);
1244         __ stmia(to, RegisterSet(R7, R10), writeback);
1245       } else {
1246         __ stmia(to, RegisterSet(R3, R10), writeback);
1247       }
1248 #endif // AARCH64
1249 
1250       __ b(L_copy_loop, ge);
1251 
1252       if (prefetch_before) {
        // the inner loop may end earlier, making it possible to skip the PLD for the last iterations
1254         __ cmn_32(count, (bytes_per_loop + pld_offset)/bytes_per_count);
1255         __ b(L_skip_pld, ge);
1256       }
1257     }
1258     BLOCK_COMMENT("Remaining bytes:");
1259     // still 0..bytes_per_loop-1 aligned bytes to copy, count already decreased by (at least) bytes_per_loop bytes
1260 
1261     // __ add(count, count, ...); // addition useless for the bit tests
1262     assert (pld_offset % bytes_per_loop == 0, "decreasing count by pld_offset before loop must not change tested bits");
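    // All decrements of 'count' above were multiples of count_per_loop, so its low
    // bits still encode the residual element count; the tests below use them to
    // peel off the remaining 16-, 8-, 4-, 2- and 1-byte chunks (and a 32-byte
    // chunk on AArch64).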
1263 
1264 #ifdef AARCH64
1265     assert (bytes_per_loop == 64, "adjust the code below");
1266     assert (bytes_per_count <= 8, "adjust the code below");
1267 
1268     {
1269       Label L;
1270       __ tbz(count, exact_log2(32/bytes_per_count), L);
1271 
1272       bulk_load_forward(from, data_regs, 4);
1273       bulk_store_forward(to, data_regs, 4);
1274 
1275       __ bind(L);
1276     }
1277 
1278     {
1279       Label L;
1280       __ tbz(count, exact_log2(16/bytes_per_count), L);
1281 
1282       bulk_load_forward(from, data_regs, 2);
1283       bulk_store_forward(to, data_regs, 2);
1284 
1285       __ bind(L);
1286     }
1287 
1288     {
1289       Label L;
1290       __ tbz(count, exact_log2(8/bytes_per_count), L);
1291 
1292       __ ldr(R3, Address(from, 8, post_indexed));
1293       __ str(R3, Address(to,   8, post_indexed));
1294 
1295       __ bind(L);
1296     }
1297 
1298     if (bytes_per_count <= 4) {
1299       Label L;
1300       __ tbz(count, exact_log2(4/bytes_per_count), L);
1301 
1302       __ ldr_w(R3, Address(from, 4, post_indexed));
1303       __ str_w(R3, Address(to,   4, post_indexed));
1304 
1305       __ bind(L);
1306     }
1307 
1308     if (bytes_per_count <= 2) {
1309       Label L;
1310       __ tbz(count, exact_log2(2/bytes_per_count), L);
1311 
1312       __ ldrh(R3, Address(from, 2, post_indexed));
1313       __ strh(R3, Address(to,   2, post_indexed));
1314 
1315       __ bind(L);
1316     }
1317 
1318     if (bytes_per_count <= 1) {
1319       Label L;
1320       __ tbz(count, 0, L);
1321 
1322       __ ldrb(R3, Address(from, 1, post_indexed));
1323       __ strb(R3, Address(to,   1, post_indexed));
1324 
1325       __ bind(L);
1326     }
1327 #else
1328     __ tst(count, 16 / bytes_per_count);
1329     __ ldmia(from, RegisterSet(R3, R6), writeback, ne); // copy 16 bytes
1330     __ stmia(to, RegisterSet(R3, R6), writeback, ne);
1331 
1332     __ tst(count, 8 / bytes_per_count);
1333     __ ldmia(from, RegisterSet(R3, R4), writeback, ne); // copy 8 bytes
1334     __ stmia(to, RegisterSet(R3, R4), writeback, ne);
1335 
1336     if (bytes_per_count <= 4) {
1337       __ tst(count, 4 / bytes_per_count);
1338       __ ldr(R3, Address(from, 4, post_indexed), ne); // copy 4 bytes
1339       __ str(R3, Address(to, 4, post_indexed), ne);
1340     }
1341 
1342     if (bytes_per_count <= 2) {
1343       __ tst(count, 2 / bytes_per_count);
1344       __ ldrh(R3, Address(from, 2, post_indexed), ne); // copy 2 bytes
1345       __ strh(R3, Address(to, 2, post_indexed), ne);
1346     }
1347 
1348     if (bytes_per_count == 1) {
1349       __ tst(count, 1);
1350       __ ldrb(R3, Address(from, 1, post_indexed), ne);
1351       __ strb(R3, Address(to, 1, post_indexed), ne);
1352     }
1353 
1354     __ pop(RegisterSet(R4,R10));
1355 #endif // AARCH64
1356 
1357     return count_per_loop;
1358   }
1359 
1360 
1361   // Generate the inner loop for backward aligned array copy
1362   //
1363   // Arguments
1364   //      end_from:      src end address, 64 bits  aligned
1365   //      end_to:        dst end address, wordSize aligned
1366   //      count:         number of elements (32-bit int)
1367   //      bytes_per_count: number of bytes for each unit of 'count'
1368   //
1369   // Return the minimum initial value for count
1370   //
1371   // Notes:
1372   // - 'end_from' aligned on 64-bit (recommended for 32-bit ARM in case this speeds up LDMIA, required for AArch64)
1373   // - 'end_to' aligned on wordSize
  // - 'count' must be greater than or equal to the returned value
1375   //
1376   // Decreases 'end_from' and 'end_to' by count*bytes_per_count.
1377   //
1378   // Scratches 'count', R3.
1379   // On AArch64 also scratches R4-R10; on 32-bit ARM R4-R10 are preserved (saved/restored).
1380   //
1381   int generate_backward_aligned_copy_loop(Register end_from, Register end_to, Register count, int bytes_per_count) {
1382     assert (end_from == R0 && end_to == R1 && count == R2, "adjust the implementation below");
1383 
1384     const int bytes_per_loop = 8*wordSize; // 8 registers are read and written on every loop iteration
1385     const int count_per_loop = bytes_per_loop / bytes_per_count;
1386 
1387     arraycopy_loop_config *config=&arraycopy_configurations[ArmCopyPlatform].backward_aligned;
1388     int pld_offset = config->pld_distance;
1389 
1390 #ifndef AARCH64
1391     bool split_read= config->split_ldm;
1392     bool split_write= config->split_stm;
1393 
1394     // See the forward copy variant for additional comments.
1395 
1396     __ push(RegisterSet(R4,R10));
1397 #endif // !AARCH64
1398 
1399     __ sub_32(count, count, count_per_loop);
1400 
1401     const bool prefetch_before = pld_offset < 0;
1402     const bool prefetch_after = pld_offset > 0;
1403 
1404     Label L_skip_pld;
1405 
1406     if (pld_offset != 0) {
1407       pld_offset = (pld_offset < 0) ? -pld_offset : pld_offset;
1408 
1409       prefetch(end_from, end_to, -wordSize);
1410 
1411       if (prefetch_before) {
1412         __ subs_32(count, count, (bytes_per_loop + pld_offset) / bytes_per_count);
1413         __ b(L_skip_pld, lt);
1414       }
1415 
1416       int offset = ArmCopyCacheLineSize;
1417       while (offset <= pld_offset) {
1418         prefetch(end_from, end_to, -(wordSize + offset));
1419         offset += ArmCopyCacheLineSize;
1420       };
1421     }
1422 
1423 #ifdef AARCH64
1424     const Register data_regs[8] = {R3, R4, R5, R6, R7, R8, R9, R10};
1425 #endif // AARCH64
1426     {
1427       // LDM (32-bit ARM) / LDP (AArch64) copy of 'bytes_per_loop' bytes
1428 
1429       // 32-bit ARM note: we have tried implementing loop unrolling to skip one
      // PLD with a 64-byte cache line, but the gain was not significant.
1431 
1432       Label L_copy_loop;
1433       __ align(OptoLoopAlignment);
1434       __ BIND(L_copy_loop);
1435 
1436       if (prefetch_before) {
1437         prefetch(end_from, end_to, -(wordSize + bytes_per_loop + pld_offset));
1438         __ BIND(L_skip_pld);
1439       }
1440 
1441 #ifdef AARCH64
1442       bulk_load_backward(end_from, data_regs, 8);
1443 #else
1444       if (split_read) {
1445         __ ldmdb(end_from, RegisterSet(R7, R10), writeback);
1446         __ ldmdb(end_from, RegisterSet(R3, R6), writeback);
1447       } else {
1448         __ ldmdb(end_from, RegisterSet(R3, R10), writeback);
1449       }
1450 #endif // AARCH64
1451 
1452       __ subs_32(count, count, count_per_loop);
1453 
1454       if (prefetch_after) {
1455         prefetch(end_from, end_to, -(wordSize + pld_offset), -bytes_per_loop);
1456       }
1457 
1458 #ifdef AARCH64
1459       bulk_store_backward(end_to, data_regs, 8);
1460 #else
1461       if (split_write) {
1462         __ stmdb(end_to, RegisterSet(R7, R10), writeback);
1463         __ stmdb(end_to, RegisterSet(R3, R6), writeback);
1464       } else {
1465         __ stmdb(end_to, RegisterSet(R3, R10), writeback);
1466       }
1467 #endif // AARCH64
1468 
1469       __ b(L_copy_loop, ge);
1470 
1471       if (prefetch_before) {
1472         __ cmn_32(count, (bytes_per_loop + pld_offset)/bytes_per_count);
1473         __ b(L_skip_pld, ge);
1474       }
1475     }
1476     BLOCK_COMMENT("Remaining bytes:");
1477     // still 0..bytes_per_loop-1 aligned bytes to copy, count already decreased by (at least) bytes_per_loop bytes
1478 
1479     // __ add(count, count, ...); // addition useless for the bit tests
1480     assert (pld_offset % bytes_per_loop == 0, "decreasing count by pld_offset before loop must not change tested bits");
1481 
1482 #ifdef AARCH64
1483     assert (bytes_per_loop == 64, "adjust the code below");
1484     assert (bytes_per_count <= 8, "adjust the code below");
1485 
1486     {
1487       Label L;
1488       __ tbz(count, exact_log2(32/bytes_per_count), L);
1489 
1490       bulk_load_backward(end_from, data_regs, 4);
1491       bulk_store_backward(end_to, data_regs, 4);
1492 
1493       __ bind(L);
1494     }
1495 
1496     {
1497       Label L;
1498       __ tbz(count, exact_log2(16/bytes_per_count), L);
1499 
1500       bulk_load_backward(end_from, data_regs, 2);
1501       bulk_store_backward(end_to, data_regs, 2);
1502 
1503       __ bind(L);
1504     }
1505 
1506     {
1507       Label L;
1508       __ tbz(count, exact_log2(8/bytes_per_count), L);
1509 
1510       __ ldr(R3, Address(end_from, -8, pre_indexed));
1511       __ str(R3, Address(end_to,   -8, pre_indexed));
1512 
1513       __ bind(L);
1514     }
1515 
1516     if (bytes_per_count <= 4) {
1517       Label L;
1518       __ tbz(count, exact_log2(4/bytes_per_count), L);
1519 
1520       __ ldr_w(R3, Address(end_from, -4, pre_indexed));
1521       __ str_w(R3, Address(end_to,   -4, pre_indexed));
1522 
1523       __ bind(L);
1524     }
1525 
1526     if (bytes_per_count <= 2) {
1527       Label L;
1528       __ tbz(count, exact_log2(2/bytes_per_count), L);
1529 
1530       __ ldrh(R3, Address(end_from, -2, pre_indexed));
1531       __ strh(R3, Address(end_to,   -2, pre_indexed));
1532 
1533       __ bind(L);
1534     }
1535 
1536     if (bytes_per_count <= 1) {
1537       Label L;
1538       __ tbz(count, 0, L);
1539 
1540       __ ldrb(R3, Address(end_from, -1, pre_indexed));
1541       __ strb(R3, Address(end_to,   -1, pre_indexed));
1542 
1543       __ bind(L);
1544     }
1545 #else
1546     __ tst(count, 16 / bytes_per_count);
1547     __ ldmdb(end_from, RegisterSet(R3, R6), writeback, ne); // copy 16 bytes
1548     __ stmdb(end_to, RegisterSet(R3, R6), writeback, ne);
1549 
1550     __ tst(count, 8 / bytes_per_count);
1551     __ ldmdb(end_from, RegisterSet(R3, R4), writeback, ne); // copy 8 bytes
1552     __ stmdb(end_to, RegisterSet(R3, R4), writeback, ne);
1553 
1554     if (bytes_per_count <= 4) {
1555       __ tst(count, 4 / bytes_per_count);
1556       __ ldr(R3, Address(end_from, -4, pre_indexed), ne); // copy 4 bytes
1557       __ str(R3, Address(end_to, -4, pre_indexed), ne);
1558     }
1559 
1560     if (bytes_per_count <= 2) {
1561       __ tst(count, 2 / bytes_per_count);
1562       __ ldrh(R3, Address(end_from, -2, pre_indexed), ne); // copy 2 bytes
1563       __ strh(R3, Address(end_to, -2, pre_indexed), ne);
1564     }
1565 
1566     if (bytes_per_count == 1) {
1567       __ tst(count, 1);
1568       __ ldrb(R3, Address(end_from, -1, pre_indexed), ne);
1569       __ strb(R3, Address(end_to, -1, pre_indexed), ne);
1570     }
1571 
1572     __ pop(RegisterSet(R4,R10));
1573 #endif // AARCH64
1574 
1575     return count_per_loop;
1576   }
1577 
1578 
1579   // Generate the inner loop for shifted forward array copy (unaligned copy).
1580   // It can be used when bytes_per_count < wordSize, i.e.
1581   //  byte/short copy on 32-bit ARM, byte/short/int/compressed-oop copy on AArch64.
1582   //
1583   // Arguments
1584   //      from:      start src address, 64 bits aligned
1585   //      to:        start dst address, (now) wordSize aligned
1586   //      count:     number of elements (32-bit int)
1587   //      bytes_per_count: number of bytes for each unit of 'count'
1588   //      lsr_shift: shift applied to 'old' value to skipped already written bytes
1589   //      lsl_shift: shift applied to 'new' value to set the high bytes of the next write
1590   //
1591   // Return the minimum initial value for count
1592   //
1593   // Notes:
1594   // - 'from' aligned on 64-bit (recommended for 32-bit ARM in case this speeds up LDMIA, required for AArch64)
1595   // - 'to' aligned on wordSize
  // - 'count' must be greater than or equal to the returned value
1597   // - 'lsr_shift' + 'lsl_shift' = BitsPerWord
1598   // - 'bytes_per_count' is 1 or 2 on 32-bit ARM; 1, 2 or 4 on AArch64
1599   //
1600   // Increases 'to' by count*bytes_per_count.
1601   //
1602   // Scratches 'from' and 'count', R3-R10, R12
1603   //
1604   // On entry:
1605   // - R12 is preloaded with the first 'BitsPerWord' bits read just before 'from'
1606   // - (R12 >> lsr_shift) is the part not yet written (just before 'to')
  // --> (*to) = (R12 >> lsr_shift) | ((*from) << lsl_shift); ...
1608   //
1609   // This implementation may read more bytes than required.
  // In fact, it always reads exactly the data of the copied region, with the upper bound aligned up to wordSize,
  // so the excess read does not cross a word boundary and is thus harmless.
1612   //
1613   int generate_forward_shifted_copy_loop(Register from, Register to, Register count, int bytes_per_count, int lsr_shift, int lsl_shift) {
1614     assert (from == R0 && to == R1 && count == R2, "adjust the implementation below");
1615 
1616     const int bytes_per_loop = 8*wordSize; // 8 registers are read and written on every loop iter
1617     const int count_per_loop = bytes_per_loop / bytes_per_count;
1618 
1619     arraycopy_loop_config *config=&arraycopy_configurations[ArmCopyPlatform].forward_shifted;
1620     int pld_offset = config->pld_distance;
1621 
1622 #ifndef AARCH64
1623     bool split_read= config->split_ldm;
1624     bool split_write= config->split_stm;
1625 #endif // !AARCH64
1626 
1627     const bool prefetch_before = pld_offset < 0;
1628     const bool prefetch_after = pld_offset > 0;
1629     Label L_skip_pld, L_last_read, L_done;
1630     if (pld_offset != 0) {
1631 
1632       pld_offset = (pld_offset < 0) ? -pld_offset : pld_offset;
1633 
1634       prefetch(from, to, 0);
1635 
1636       if (prefetch_before) {
1637         __ cmp_32(count, count_per_loop);
1638         __ b(L_last_read, lt);
1639         // skip prefetch for small copies
1640         // warning: count is predecreased by the prefetch distance to optimize the inner loop
1641         __ subs_32(count, count, ((bytes_per_loop + pld_offset) / bytes_per_count) + count_per_loop);
1642         __ b(L_skip_pld, lt);
1643       }
1644 
1645       int offset = ArmCopyCacheLineSize;
1646       while (offset <= pld_offset) {
1647         prefetch(from, to, offset);
1648         offset += ArmCopyCacheLineSize;
1649       };
1650     }
1651 
1652     Label L_shifted_loop;
1653 
1654     __ align(OptoLoopAlignment);
1655     __ BIND(L_shifted_loop);
1656 
1657     if (prefetch_before) {
1658       // do it early if there might be register locking issues
1659       prefetch(from, to, bytes_per_loop + pld_offset);
1660       __ BIND(L_skip_pld);
1661     } else {
1662       __ cmp_32(count, count_per_loop);
1663       __ b(L_last_read, lt);
1664     }
1665 
1666 #ifdef AARCH64
1667     const Register data_regs[9] = {R3, R4, R5, R6, R7, R8, R9, R10, R12};
1668     __ logical_shift_right(R3, R12, lsr_shift); // part of R12 not yet written
1669     __ subs_32(count, count, count_per_loop);
1670     bulk_load_forward(from, &data_regs[1], 8);
1671 #else
1672     // read 32 bytes
1673     if (split_read) {
      // if write is not split, use fewer registers in the first set to reduce locking
1675       RegisterSet set1 = split_write ? RegisterSet(R4, R7) : RegisterSet(R4, R5);
1676       RegisterSet set2 = (split_write ? RegisterSet(R8, R10) : RegisterSet(R6, R10)) | R12;
1677       __ ldmia(from, set1, writeback);
1678       __ mov(R3, AsmOperand(R12, lsr, lsr_shift)); // part of R12 not yet written
1679       __ ldmia(from, set2, writeback);
1680       __ subs(count, count, count_per_loop); // XXX: should it be before the 2nd LDM ? (latency vs locking)
1681     } else {
1682       __ mov(R3, AsmOperand(R12, lsr, lsr_shift)); // part of R12 not yet written
1683       __ ldmia(from, RegisterSet(R4, R10) | R12, writeback); // Note: small latency on R4
1684       __ subs(count, count, count_per_loop);
1685     }
1686 #endif // AARCH64
1687 
1688     if (prefetch_after) {
1689       // do it after the 1st ldm/ldp anyway  (no locking issues with early STM/STP)
1690       prefetch(from, to, pld_offset, bytes_per_loop);
1691     }
1692 
1693     // prepare (shift) the values in R3..R10
1694     __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift)); // merged below low bytes of next val
1695     __ logical_shift_right(R4, R4, lsr_shift); // unused part of next val
1696     __ orr(R4, R4, AsmOperand(R5, lsl, lsl_shift)); // ...
1697     __ logical_shift_right(R5, R5, lsr_shift);
1698     __ orr(R5, R5, AsmOperand(R6, lsl, lsl_shift));
1699     __ logical_shift_right(R6, R6, lsr_shift);
1700     __ orr(R6, R6, AsmOperand(R7, lsl, lsl_shift));
1701 #ifndef AARCH64
1702     if (split_write) {
1703       // write the first half as soon as possible to reduce stm locking
1704       __ stmia(to, RegisterSet(R3, R6), writeback, prefetch_before ? gt : ge);
1705     }
1706 #endif // !AARCH64
1707     __ logical_shift_right(R7, R7, lsr_shift);
1708     __ orr(R7, R7, AsmOperand(R8, lsl, lsl_shift));
1709     __ logical_shift_right(R8, R8, lsr_shift);
1710     __ orr(R8, R8, AsmOperand(R9, lsl, lsl_shift));
1711     __ logical_shift_right(R9, R9, lsr_shift);
1712     __ orr(R9, R9, AsmOperand(R10, lsl, lsl_shift));
1713     __ logical_shift_right(R10, R10, lsr_shift);
1714     __ orr(R10, R10, AsmOperand(R12, lsl, lsl_shift));
1715 
1716 #ifdef AARCH64
1717     bulk_store_forward(to, data_regs, 8);
1718 #else
1719     if (split_write) {
1720       __ stmia(to, RegisterSet(R7, R10), writeback, prefetch_before ? gt : ge);
1721     } else {
1722       __ stmia(to, RegisterSet(R3, R10), writeback, prefetch_before ? gt : ge);
1723     }
1724 #endif // AARCH64
1725     __ b(L_shifted_loop, gt); // no need to loop if 0 (when count need not be precise modulo bytes_per_loop)
1726 
1727     if (prefetch_before) {
      // the first loop may end earlier, allowing us to skip the pld at the end
1729       __ cmn_32(count, (bytes_per_loop + pld_offset)/bytes_per_count);
1730 #ifndef AARCH64
1731       __ stmia(to, RegisterSet(R3, R10), writeback); // stmia was skipped
1732 #endif // !AARCH64
1733       __ b(L_skip_pld, ge);
1734       __ adds_32(count, count, ((bytes_per_loop + pld_offset) / bytes_per_count) + count_per_loop);
1735     }
1736 
1737     __ BIND(L_last_read);
1738     __ b(L_done, eq);
1739 
1740 #ifdef AARCH64
1741     assert(bytes_per_count < 8, "adjust the code below");
1742 
1743     __ logical_shift_right(R3, R12, lsr_shift);
1744 
1745     {
1746       Label L;
1747       __ tbz(count, exact_log2(32/bytes_per_count), L);
1748       bulk_load_forward(from, &data_regs[1], 4);
1749       __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift));
1750       __ logical_shift_right(R4, R4, lsr_shift);
1751       __ orr(R4, R4, AsmOperand(R5, lsl, lsl_shift));
1752       __ logical_shift_right(R5, R5, lsr_shift);
1753       __ orr(R5, R5, AsmOperand(R6, lsl, lsl_shift));
1754       __ logical_shift_right(R6, R6, lsr_shift);
1755       __ orr(R6, R6, AsmOperand(R7, lsl, lsl_shift));
1756       bulk_store_forward(to, data_regs, 4);
1757       __ logical_shift_right(R3, R7, lsr_shift);
1758       __ bind(L);
1759     }
1760 
1761     {
1762       Label L;
1763       __ tbz(count, exact_log2(16/bytes_per_count), L);
1764       bulk_load_forward(from, &data_regs[1], 2);
1765       __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift));
1766       __ logical_shift_right(R4, R4, lsr_shift);
1767       __ orr(R4, R4, AsmOperand(R5, lsl, lsl_shift));
1768       bulk_store_forward(to, data_regs, 2);
1769       __ logical_shift_right(R3, R5, lsr_shift);
1770       __ bind(L);
1771     }
1772 
1773     {
1774       Label L;
1775       __ tbz(count, exact_log2(8/bytes_per_count), L);
1776       __ ldr(R4, Address(from, 8, post_indexed));
1777       __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift));
1778       __ str(R3, Address(to, 8, post_indexed));
1779       __ logical_shift_right(R3, R4, lsr_shift);
1780       __ bind(L);
1781     }
1782 
1783     const int have_bytes = lsl_shift/BitsPerByte; // number of already read bytes in R3
1784 
1785     // It remains less than wordSize to write.
1786     // Do not check count if R3 already has maximal number of loaded elements (one less than wordSize).
1787     if (have_bytes < wordSize - bytes_per_count) {
1788       Label L;
1789       __ andr(count, count, (uintx)(8/bytes_per_count-1)); // make count exact
1790       __ cmp_32(count, have_bytes/bytes_per_count); // do we have enough bytes to store?
1791       __ b(L, le);
1792       __ ldr(R4, Address(from, 8, post_indexed));
1793       __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift));
1794       __ bind(L);
1795     }
1796 
1797     {
1798       Label L;
1799       __ tbz(count, exact_log2(4/bytes_per_count), L);
1800       __ str_w(R3, Address(to, 4, post_indexed));
1801       if (bytes_per_count < 4) {
1802         __ logical_shift_right(R3, R3, 4*BitsPerByte);
1803       }
1804       __ bind(L);
1805     }
1806 
1807     if (bytes_per_count <= 2) {
1808       Label L;
1809       __ tbz(count, exact_log2(2/bytes_per_count), L);
1810       __ strh(R3, Address(to, 2, post_indexed));
1811       if (bytes_per_count < 2) {
1812         __ logical_shift_right(R3, R3, 2*BitsPerByte);
1813       }
1814       __ bind(L);
1815     }
1816 
1817     if (bytes_per_count <= 1) {
1818       Label L;
1819       __ tbz(count, exact_log2(1/bytes_per_count), L);
1820       __ strb(R3, Address(to, 1, post_indexed));
1821       __ bind(L);
1822     }
1823 #else
1824     switch (bytes_per_count) {
1825     case 2:
1826       __ mov(R3, AsmOperand(R12, lsr, lsr_shift));
1827       __ tst(count, 8);
1828       __ ldmia(from, RegisterSet(R4, R7), writeback, ne);
1829       __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), ne); // merged below low bytes of next val
1830       __ mov(R4, AsmOperand(R4, lsr, lsr_shift), ne); // unused part of next val
1831       __ orr(R4, R4, AsmOperand(R5, lsl, lsl_shift), ne); // ...
1832       __ mov(R5, AsmOperand(R5, lsr, lsr_shift), ne);
1833       __ orr(R5, R5, AsmOperand(R6, lsl, lsl_shift), ne);
1834       __ mov(R6, AsmOperand(R6, lsr, lsr_shift), ne);
1835       __ orr(R6, R6, AsmOperand(R7, lsl, lsl_shift), ne);
1836       __ stmia(to, RegisterSet(R3, R6), writeback, ne);
1837       __ mov(R3, AsmOperand(R7, lsr, lsr_shift), ne);
1838 
1839       __ tst(count, 4);
1840       __ ldmia(from, RegisterSet(R4, R5), writeback, ne);
1841       __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), ne); // merged below low bytes of next val
1842       __ mov(R4, AsmOperand(R4, lsr, lsr_shift), ne); // unused part of next val
1843       __ orr(R4, R4, AsmOperand(R5, lsl, lsl_shift), ne); // ...
1844       __ stmia(to, RegisterSet(R3, R4), writeback, ne);
1845       __ mov(R3, AsmOperand(R5, lsr, lsr_shift), ne);
1846 
1847       __ tst(count, 2);
1848       __ ldr(R4, Address(from, 4, post_indexed), ne);
1849       __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), ne);
1850       __ str(R3, Address(to, 4, post_indexed), ne);
1851       __ mov(R3, AsmOperand(R4, lsr, lsr_shift), ne);
1852 
1853       __ tst(count, 1);
1854       __ strh(R3, Address(to, 2, post_indexed), ne); // one last short
1855       break;
1856 
1857     case 1:
1858       __ mov(R3, AsmOperand(R12, lsr, lsr_shift));
1859       __ tst(count, 16);
1860       __ ldmia(from, RegisterSet(R4, R7), writeback, ne);
1861       __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), ne); // merged below low bytes of next val
1862       __ mov(R4, AsmOperand(R4, lsr, lsr_shift), ne); // unused part of next val
1863       __ orr(R4, R4, AsmOperand(R5, lsl, lsl_shift), ne); // ...
1864       __ mov(R5, AsmOperand(R5, lsr, lsr_shift), ne);
1865       __ orr(R5, R5, AsmOperand(R6, lsl, lsl_shift), ne);
1866       __ mov(R6, AsmOperand(R6, lsr, lsr_shift), ne);
1867       __ orr(R6, R6, AsmOperand(R7, lsl, lsl_shift), ne);
1868       __ stmia(to, RegisterSet(R3, R6), writeback, ne);
1869       __ mov(R3, AsmOperand(R7, lsr, lsr_shift), ne);
1870 
1871       __ tst(count, 8);
1872       __ ldmia(from, RegisterSet(R4, R5), writeback, ne);
1873       __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), ne); // merged below low bytes of next val
1874       __ mov(R4, AsmOperand(R4, lsr, lsr_shift), ne); // unused part of next val
1875       __ orr(R4, R4, AsmOperand(R5, lsl, lsl_shift), ne); // ...
1876       __ stmia(to, RegisterSet(R3, R4), writeback, ne);
1877       __ mov(R3, AsmOperand(R5, lsr, lsr_shift), ne);
1878 
1879       __ tst(count, 4);
1880       __ ldr(R4, Address(from, 4, post_indexed), ne);
1881       __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), ne);
1882       __ str(R3, Address(to, 4, post_indexed), ne);
1883       __ mov(R3, AsmOperand(R4, lsr, lsr_shift), ne);
1884 
1885       __ andr(count, count, 3);
1886       __ cmp(count, 2);
1887 
1888       // Note: R3 might contain enough bytes ready to write (3 needed at most),
1889       // thus load on lsl_shift==24 is not needed (in fact forces reading
1890       // beyond source buffer end boundary)
1891       if (lsl_shift == 8) {
1892         __ ldr(R4, Address(from, 4, post_indexed), ge);
1893         __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), ge);
1894       } else if (lsl_shift == 16) {
1895         __ ldr(R4, Address(from, 4, post_indexed), gt);
1896         __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), gt);
1897       }
1898 
1899       __ strh(R3, Address(to, 2, post_indexed), ge); // two last bytes
1900       __ mov(R3, AsmOperand(R3, lsr, 16), gt);
1901 
1902       __ tst(count, 1);
1903       __ strb(R3, Address(to, 1, post_indexed), ne); // one last byte
1904       break;
1905     }
1906 #endif // AARCH64
1907 
1908     __ BIND(L_done);
1909     return 0; // no minimum
1910   }
1911 
1912   // Generate the inner loop for shifted backward array copy (unaligned copy).
1913   // It can be used when bytes_per_count < wordSize, i.e.
1914   //  byte/short copy on 32-bit ARM, byte/short/int/compressed-oop copy on AArch64.
1915   //
1916   // Arguments
1917   //      end_from:  end src address, 64 bits aligned
1918   //      end_to:    end dst address, (now) wordSize aligned
1919   //      count:     number of elements (32-bit int)
1920   //      bytes_per_count: number of bytes for each unit of 'count'
1921   //      lsl_shift: shift applied to 'old' value to skipped already written bytes
1922   //      lsr_shift: shift applied to 'new' value to set the low bytes of the next write
1923   //
1924   // Return the minimum initial value for count
1925   //
1926   // Notes:
1927   // - 'end_from' aligned on 64-bit (recommended for 32-bit ARM in case this speeds up LDMIA, required for AArch64)
1928   // - 'end_to' aligned on wordSize
  // - 'count' must be greater than or equal to the returned value
1930   // - 'lsr_shift' + 'lsl_shift' = 'BitsPerWord'
1931   // - 'bytes_per_count' is 1 or 2 on 32-bit ARM; 1, 2 or 4 on AArch64
1932   //
1933   // Decreases 'end_to' by count*bytes_per_count.
1934   //
1935   // Scratches 'end_from', 'count', R3-R10, R12
1936   //
1937   // On entry:
1938   // - R3 is preloaded with the first 'BitsPerWord' bits read just after 'from'
1939   // - (R3 << lsl_shift) is the part not yet written
  // --> (*--to) = (R3 << lsl_shift) | ((*--from) >> lsr_shift); ...
1941   //
1942   // This implementation may read more bytes than required.
  // In fact, it always reads exactly the data of the copied region, with the beginning aligned down to wordSize,
  // so the excess read does not cross a word boundary and is thus harmless.
1945   //
1946   int generate_backward_shifted_copy_loop(Register end_from, Register end_to, Register count, int bytes_per_count, int lsr_shift, int lsl_shift) {
1947     assert (end_from == R0 && end_to == R1 && count == R2, "adjust the implementation below");
1948 
1949     const int bytes_per_loop = 8*wordSize; // 8 registers are read and written on every loop iter
1950     const int count_per_loop = bytes_per_loop / bytes_per_count;
1951 
1952     arraycopy_loop_config *config=&arraycopy_configurations[ArmCopyPlatform].backward_shifted;
1953     int pld_offset = config->pld_distance;
1954 
1955 #ifndef AARCH64
1956     bool split_read= config->split_ldm;
1957     bool split_write= config->split_stm;
1958 #endif // !AARCH64
1959 
1960 
1961     const bool prefetch_before = pld_offset < 0;
1962     const bool prefetch_after = pld_offset > 0;
1963 
1964     Label L_skip_pld, L_done, L_last_read;
1965     if (pld_offset != 0) {
1966 
1967       pld_offset = (pld_offset < 0) ? -pld_offset : pld_offset;
1968 
1969       prefetch(end_from, end_to, -wordSize);
1970 
1971       if (prefetch_before) {
1972         __ cmp_32(count, count_per_loop);
1973         __ b(L_last_read, lt);
1974 
1975         // skip prefetch for small copies
1976         // warning: count is predecreased by the prefetch distance to optimize the inner loop
1977         __ subs_32(count, count, ((bytes_per_loop + pld_offset)/bytes_per_count) + count_per_loop);
1978         __ b(L_skip_pld, lt);
1979       }
1980 
1981       int offset = ArmCopyCacheLineSize;
1982       while (offset <= pld_offset) {
1983         prefetch(end_from, end_to, -(wordSize + offset));
1984         offset += ArmCopyCacheLineSize;
1985       };
1986     }
1987 
1988     Label L_shifted_loop;
1989     __ align(OptoLoopAlignment);
1990     __ BIND(L_shifted_loop);
1991 
1992     if (prefetch_before) {
1993       // do the 1st ldm/ldp first anyway (no locking issues with early STM/STP)
1994       prefetch(end_from, end_to, -(wordSize + bytes_per_loop + pld_offset));
1995       __ BIND(L_skip_pld);
1996     } else {
1997       __ cmp_32(count, count_per_loop);
1998       __ b(L_last_read, lt);
1999     }
2000 
2001 #ifdef AARCH64
2002     __ logical_shift_left(R12, R3, lsl_shift);
2003     const Register data_regs[9] = {R3, R4, R5, R6, R7, R8, R9, R10, R12};
2004     bulk_load_backward(end_from, data_regs, 8);
2005 #else
2006     if (split_read) {
2007       __ ldmdb(end_from, RegisterSet(R7, R10), writeback);
2008       __ mov(R12, AsmOperand(R3, lsl, lsl_shift)); // part of R3 not yet written
2009       __ ldmdb(end_from, RegisterSet(R3, R6), writeback);
2010     } else {
2011       __ mov(R12, AsmOperand(R3, lsl, lsl_shift)); // part of R3 not yet written
2012       __ ldmdb(end_from, RegisterSet(R3, R10), writeback);
2013     }
2014 #endif // AARCH64
2015 
2016     __ subs_32(count, count, count_per_loop);
2017 
2018     if (prefetch_after) { // do prefetch during ldm/ldp latency
2019       prefetch(end_from, end_to, -(wordSize + pld_offset), -bytes_per_loop);
2020     }
2021 
2022     // prepare the values in R4..R10,R12
2023     __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift)); // merged above high  bytes of prev val
2024     __ logical_shift_left(R10, R10, lsl_shift); // unused part of prev val
2025     __ orr(R10, R10, AsmOperand(R9, lsr, lsr_shift)); // ...
2026     __ logical_shift_left(R9, R9, lsl_shift);
2027     __ orr(R9, R9, AsmOperand(R8, lsr, lsr_shift));
2028     __ logical_shift_left(R8, R8, lsl_shift);
2029     __ orr(R8, R8, AsmOperand(R7, lsr, lsr_shift));
2030     __ logical_shift_left(R7, R7, lsl_shift);
2031     __ orr(R7, R7, AsmOperand(R6, lsr, lsr_shift));
2032     __ logical_shift_left(R6, R6, lsl_shift);
2033     __ orr(R6, R6, AsmOperand(R5, lsr, lsr_shift));
2034 #ifndef AARCH64
2035     if (split_write) {
2036       // store early to reduce locking issues
2037       __ stmdb(end_to, RegisterSet(R6, R10) | R12, writeback, prefetch_before ? gt : ge);
2038     }
2039 #endif // !AARCH64
2040     __ logical_shift_left(R5, R5, lsl_shift);
2041     __ orr(R5, R5, AsmOperand(R4, lsr, lsr_shift));
2042     __ logical_shift_left(R4, R4, lsl_shift);
2043     __ orr(R4, R4, AsmOperand(R3, lsr, lsr_shift));
2044 
2045 #ifdef AARCH64
2046     bulk_store_backward(end_to, &data_regs[1], 8);
2047 #else
2048     if (split_write) {
2049       __ stmdb(end_to, RegisterSet(R4, R5), writeback, prefetch_before ? gt : ge);
2050     } else {
2051       __ stmdb(end_to, RegisterSet(R4, R10) | R12, writeback, prefetch_before ? gt : ge);
2052     }
2053 #endif // AARCH64
2054 
2055     __ b(L_shifted_loop, gt); // no need to loop if 0 (when count need not be precise modulo bytes_per_loop)
2056 
2057     if (prefetch_before) {
      // the first loop may end earlier, allowing us to skip the pld at the end
2059       __ cmn_32(count, ((bytes_per_loop + pld_offset)/bytes_per_count));
2060 #ifndef AARCH64
2061       __ stmdb(end_to, RegisterSet(R4, R10) | R12, writeback); // stmdb was skipped
2062 #endif // !AARCH64
2063       __ b(L_skip_pld, ge);
2064       __ adds_32(count, count, ((bytes_per_loop + pld_offset) / bytes_per_count) + count_per_loop);
2065     }
2066 
2067     __ BIND(L_last_read);
2068     __ b(L_done, eq);
2069 
2070 #ifdef AARCH64
2071     assert(bytes_per_count < 8, "adjust the code below");
2072 
2073     __ logical_shift_left(R12, R3, lsl_shift);
2074 
2075     {
2076       Label L;
2077       __ tbz(count, exact_log2(32/bytes_per_count), L);
2078       bulk_load_backward(end_from, &data_regs[4], 4);
2079 
2080       __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift));
2081       __ logical_shift_left(R10, R10, lsl_shift);
2082       __ orr(R10, R10, AsmOperand(R9, lsr, lsr_shift));
2083       __ logical_shift_left(R9, R9, lsl_shift);
2084       __ orr(R9, R9, AsmOperand(R8, lsr, lsr_shift));
2085       __ logical_shift_left(R8, R8, lsl_shift);
2086       __ orr(R8, R8, AsmOperand(R7, lsr, lsr_shift));
2087 
2088       bulk_store_backward(end_to, &data_regs[5], 4);
2089       __ logical_shift_left(R12, R7, lsl_shift);
2090       __ bind(L);
2091     }
2092 
2093     {
2094       Label L;
2095       __ tbz(count, exact_log2(16/bytes_per_count), L);
2096       bulk_load_backward(end_from, &data_regs[6], 2);
2097 
2098       __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift));
2099       __ logical_shift_left(R10, R10, lsl_shift);
2100       __ orr(R10, R10, AsmOperand(R9, lsr, lsr_shift));
2101 
2102       bulk_store_backward(end_to, &data_regs[7], 2);
2103       __ logical_shift_left(R12, R9, lsl_shift);
2104       __ bind(L);
2105     }
2106 
2107     {
2108       Label L;
2109       __ tbz(count, exact_log2(8/bytes_per_count), L);
2110       __ ldr(R10, Address(end_from, -8, pre_indexed));
2111       __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift));
2112       __ str(R12, Address(end_to, -8, pre_indexed));
2113       __ logical_shift_left(R12, R10, lsl_shift);
2114       __ bind(L);
2115     }
2116 
2117     const int have_bytes = lsr_shift/BitsPerByte; // number of already read bytes in R12
2118 
2119     // It remains less than wordSize to write.
2120     // Do not check count if R12 already has maximal number of loaded elements (one less than wordSize).
2121     if (have_bytes < wordSize - bytes_per_count) {
2122       Label L;
2123       __ andr(count, count, (uintx)(8/bytes_per_count-1)); // make count exact
2124       __ cmp_32(count, have_bytes/bytes_per_count); // do we have enough bytes to store?
2125       __ b(L, le);
2126       __ ldr(R10, Address(end_from, -8, pre_indexed));
2127       __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift));
2128       __ bind(L);
2129     }
2130 
2131     assert (bytes_per_count <= 4, "must be");
2132 
2133     {
2134       Label L;
2135       __ tbz(count, exact_log2(4/bytes_per_count), L);
2136       __ logical_shift_right(R9, R12, (wordSize-4)*BitsPerByte);
2137       __ str_w(R9, Address(end_to, -4, pre_indexed)); // Write 4 MSB
2138       if (bytes_per_count < 4) {
2139         __ logical_shift_left(R12, R12, 4*BitsPerByte); // Promote remaining bytes to MSB
2140       }
2141       __ bind(L);
2142     }
2143 
2144     if (bytes_per_count <= 2) {
2145       Label L;
2146       __ tbz(count, exact_log2(2/bytes_per_count), L);
2147       __ logical_shift_right(R9, R12, (wordSize-2)*BitsPerByte);
2148       __ strh(R9, Address(end_to, -2, pre_indexed)); // Write 2 MSB
2149       if (bytes_per_count < 2) {
2150         __ logical_shift_left(R12, R12, 2*BitsPerByte); // Promote remaining bytes to MSB
2151       }
2152       __ bind(L);
2153     }
2154 
2155     if (bytes_per_count <= 1) {
2156       Label L;
2157       __ tbz(count, exact_log2(1/bytes_per_count), L);
2158       __ logical_shift_right(R9, R12, (wordSize-1)*BitsPerByte);
2159       __ strb(R9, Address(end_to, -1, pre_indexed)); // Write 1 MSB
2160       __ bind(L);
2161     }
2162 #else
    switch (bytes_per_count) {
    case 2:
2165       __ mov(R12, AsmOperand(R3, lsl, lsl_shift)); // part of R3 not yet written
2166       __ tst(count, 8);
2167       __ ldmdb(end_from, RegisterSet(R7,R10), writeback, ne);
2168       __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift), ne);
2169       __ mov(R10, AsmOperand(R10, lsl, lsl_shift),ne); // unused part of prev val
2170       __ orr(R10, R10, AsmOperand(R9, lsr, lsr_shift),ne); // ...
2171       __ mov(R9, AsmOperand(R9, lsl, lsl_shift),ne);
2172       __ orr(R9, R9, AsmOperand(R8, lsr, lsr_shift),ne);
2173       __ mov(R8, AsmOperand(R8, lsl, lsl_shift),ne);
2174       __ orr(R8, R8, AsmOperand(R7, lsr, lsr_shift),ne);
2175       __ stmdb(end_to, RegisterSet(R8,R10)|R12, writeback, ne);
2176       __ mov(R12, AsmOperand(R7, lsl, lsl_shift), ne);
2177 
2178       __ tst(count, 4);
2179       __ ldmdb(end_from, RegisterSet(R9, R10), writeback, ne);
2180       __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift), ne);
2181       __ mov(R10, AsmOperand(R10, lsl, lsl_shift),ne); // unused part of prev val
2182       __ orr(R10, R10, AsmOperand(R9, lsr,lsr_shift),ne); // ...
2183       __ stmdb(end_to, RegisterSet(R10)|R12, writeback, ne);
2184       __ mov(R12, AsmOperand(R9, lsl, lsl_shift), ne);
2185 
2186       __ tst(count, 2);
2187       __ ldr(R10, Address(end_from, -4, pre_indexed), ne);
2188       __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift), ne);
2189       __ str(R12, Address(end_to, -4, pre_indexed), ne);
2190       __ mov(R12, AsmOperand(R10, lsl, lsl_shift), ne);
2191 
2192       __ tst(count, 1);
2193       __ mov(R12, AsmOperand(R12, lsr, lsr_shift),ne);
2194       __ strh(R12, Address(end_to, -2, pre_indexed), ne); // one last short
2195       break;
2196 
    case 1:
2198       __ mov(R12, AsmOperand(R3, lsl, lsl_shift)); // part of R3 not yet written
2199       __ tst(count, 16);
2200       __ ldmdb(end_from, RegisterSet(R7,R10), writeback, ne);
2201       __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift), ne);
2202       __ mov(R10, AsmOperand(R10, lsl, lsl_shift),ne); // unused part of prev val
2203       __ orr(R10, R10, AsmOperand(R9, lsr, lsr_shift),ne); // ...
2204       __ mov(R9, AsmOperand(R9, lsl, lsl_shift),ne);
2205       __ orr(R9, R9, AsmOperand(R8, lsr, lsr_shift),ne);
2206       __ mov(R8, AsmOperand(R8, lsl, lsl_shift),ne);
2207       __ orr(R8, R8, AsmOperand(R7, lsr, lsr_shift),ne);
2208       __ stmdb(end_to, RegisterSet(R8,R10)|R12, writeback, ne);
2209       __ mov(R12, AsmOperand(R7, lsl, lsl_shift), ne);
2210 
2211       __ tst(count, 8);
2212       __ ldmdb(end_from, RegisterSet(R9,R10), writeback, ne);
2213       __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift), ne);
2214       __ mov(R10, AsmOperand(R10, lsl, lsl_shift),ne); // unused part of prev val
2215       __ orr(R10, R10, AsmOperand(R9, lsr, lsr_shift),ne); // ...
2216       __ stmdb(end_to, RegisterSet(R10)|R12, writeback, ne);
2217       __ mov(R12, AsmOperand(R9, lsl, lsl_shift), ne);
2218 
2219       __ tst(count, 4);
2220       __ ldr(R10, Address(end_from, -4, pre_indexed), ne);
2221       __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift), ne);
2222       __ str(R12, Address(end_to, -4, pre_indexed), ne);
2223       __ mov(R12, AsmOperand(R10, lsl, lsl_shift), ne);
2224 
2225       __ tst(count, 2);
2226       if (lsr_shift != 24) {
2227         // avoid useless reading R10 when we already have 3 bytes ready in R12
2228         __ ldr(R10, Address(end_from, -4, pre_indexed), ne);
2229         __ orr(R12, R12, AsmOperand(R10, lsr,lsr_shift), ne);
2230       }
2231 
2232       // Note: R12 contains enough bytes ready to write (3 needed at most)
2233       // write the 2 MSBs
2234       __ mov(R9, AsmOperand(R12, lsr, 16), ne);
2235       __ strh(R9, Address(end_to, -2, pre_indexed), ne);
2236       // promote remaining to MSB
2237       __ mov(R12, AsmOperand(R12, lsl, 16), ne);
2238 
2239       __ tst(count, 1);
2240       // write the MSB of R12
2241       __ mov(R12, AsmOperand(R12, lsr, 24), ne);
2242       __ strb(R12, Address(end_to, -1, pre_indexed), ne);
2243 
2244       break;
2245       }
2246 #endif // AARCH64
2247 
2248     __ BIND(L_done);
2249     return 0; // no minimum
2250   }
2251 
  // Helper shared by the forward and backward copy implementations
2253   Address get_addr_with_indexing(Register base, int delta, bool forward) {
2254     if (forward) {
2255       return Address(base, delta, post_indexed);
2256     } else {
2257       return Address(base, -delta, pre_indexed);
2258     }
2259   }
2260 
2261 #ifdef AARCH64
2262   // Loads one 'size_in_bytes'-sized value from 'from' in given direction, i.e.
2263   //   if forward:  loads value at from and increases from by size
2264   //   if !forward: loads value at from-size_in_bytes and decreases from by size
2265   void load_one(Register rd, Register from, int size_in_bytes, bool forward) {
2266     assert_different_registers(from, rd);
2267     Address addr = get_addr_with_indexing(from, size_in_bytes, forward);
2268     __ load_sized_value(rd, addr, size_in_bytes, false);
2269   }
2270 
2271   // Stores one 'size_in_bytes'-sized value to 'to' in given direction (see load_one)
2272   void store_one(Register rd, Register to, int size_in_bytes, bool forward) {
2273     assert_different_registers(to, rd);
2274     Address addr = get_addr_with_indexing(to, size_in_bytes, forward);
2275     __ store_sized_value(rd, addr, size_in_bytes);
2276   }
2277 #else
2278   // load_one and store_one are the same as for AArch64 except for
2279   //   *) Support for condition execution
2280   //   *) Second value register argument for 8-byte values
2281 
2282   void load_one(Register rd, Register from, int size_in_bytes, bool forward, AsmCondition cond = al, Register rd2 = noreg) {
2283     assert_different_registers(from, rd, rd2);
2284     if (size_in_bytes < 8) {
2285       Address addr = get_addr_with_indexing(from, size_in_bytes, forward);
2286       __ load_sized_value(rd, addr, size_in_bytes, false, cond);
2287     } else {
2288       assert (rd2 != noreg, "second value register must be specified");
2289       assert (rd->encoding() < rd2->encoding(), "wrong value register set");
2290 
2291       if (forward) {
2292         __ ldmia(from, RegisterSet(rd) | rd2, writeback, cond);
2293       } else {
2294         __ ldmdb(from, RegisterSet(rd) | rd2, writeback, cond);
2295       }
2296     }
2297   }
2298 
2299   void store_one(Register rd, Register to, int size_in_bytes, bool forward, AsmCondition cond = al, Register rd2 = noreg) {
2300     assert_different_registers(to, rd, rd2);
2301     if (size_in_bytes < 8) {
2302       Address addr = get_addr_with_indexing(to, size_in_bytes, forward);
2303       __ store_sized_value(rd, addr, size_in_bytes, cond);
2304     } else {
2305       assert (rd2 != noreg, "second value register must be specified");
2306       assert (rd->encoding() < rd2->encoding(), "wrong value register set");
2307 
2308       if (forward) {
2309         __ stmia(to, RegisterSet(rd) | rd2, writeback, cond);
2310       } else {
2311         __ stmdb(to, RegisterSet(rd) | rd2, writeback, cond);
2312       }
2313     }
2314   }
2315 #endif // AARCH64
2316 
2317   // Copies data from 'from' to 'to' in specified direction to align 'from' by 64 bits.
2318   // (on 32-bit ARM 64-bit alignment is better for LDM).
2319   //
2320   // Arguments:
2321   //     from:              beginning (if forward) or upper bound (if !forward) of the region to be read
2322   //     to:                beginning (if forward) or upper bound (if !forward) of the region to be written
2323   //     count:             32-bit int, maximum number of elements which can be copied
2324   //     bytes_per_count:   size of an element
2325   //     forward:           specifies copy direction
2326   //
2327   // Notes:
2328   //   'from' and 'to' must be aligned by 'bytes_per_count'
2329   //   'count' must not be less than the returned value
2330   //   shifts 'from' and 'to' by the number of copied bytes in corresponding direction
2331   //   decreases 'count' by the number of elements copied
2332   //
2333   // Returns maximum number of bytes which may be copied.
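  //
  // For example, with bytes_per_count == 2 and 'from' % 8 == 2 in a forward copy, three halfwords
  // are copied one at a time until 'from' becomes 64-bit aligned; at most 7/bytes_per_count == 3
  // elements can be consumed this way, which is the value returned.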
2334   int align_src(Register from, Register to, Register count, Register tmp, int bytes_per_count, bool forward) {
2335     assert_different_registers(from, to, count, tmp);
2336 #ifdef AARCH64
2337     // TODO-AARCH64: replace by simple loop?
2338     Label Laligned_by_2, Laligned_by_4, Laligned_by_8;
2339 
2340     if (bytes_per_count == 1) {
2341       __ tbz(from, 0, Laligned_by_2);
2342       __ sub_32(count, count, 1);
2343       load_one(tmp, from, 1, forward);
2344       store_one(tmp, to, 1, forward);
2345     }
2346 
2347     __ BIND(Laligned_by_2);
2348 
2349     if (bytes_per_count <= 2) {
2350       __ tbz(from, 1, Laligned_by_4);
2351       __ sub_32(count, count, 2/bytes_per_count);
2352       load_one(tmp, from, 2, forward);
2353       store_one(tmp, to, 2, forward);
2354     }
2355 
2356     __ BIND(Laligned_by_4);
2357 
2358     if (bytes_per_count <= 4) {
2359       __ tbz(from, 2, Laligned_by_8);
2360       __ sub_32(count, count, 4/bytes_per_count);
2361       load_one(tmp, from, 4, forward);
2362       store_one(tmp, to, 4, forward);
2363     }
2364     __ BIND(Laligned_by_8);
2365 #else // AARCH64
2366     if (bytes_per_count < 8) {
2367       Label L_align_src;
2368       __ BIND(L_align_src);
2369       __ tst(from, 7);
2370       // ne => not aligned: copy one element and (if bytes_per_count < 4) loop
2371       __ sub(count, count, 1, ne);
2372       load_one(tmp, from, bytes_per_count, forward, ne);
2373       store_one(tmp, to, bytes_per_count, forward, ne);
2374       if (bytes_per_count < 4) {
2375         __ b(L_align_src, ne); // if bytes_per_count == 4, then 0 or 1 loop iterations are enough
2376       }
2377     }
2378 #endif // AARCH64
2379     return 7/bytes_per_count;
2380   }
2381 
2382   // Copies 'count' of 'bytes_per_count'-sized elements in the specified direction.
2383   //
2384   // Arguments:
2385   //     from:              beginning (if forward) or upper bound (if !forward) of the region to be read
2386   //     to:                beginning (if forward) or upper bound (if !forward) of the region to be written
2387   //     count:             32-bit int, number of elements to be copied
2388   //     entry:             copy loop entry point
2389   //     bytes_per_count:   size of an element
2390   //     forward:           specifies copy direction
2391   //
2392   // Notes:
2393   //     shifts 'from' and 'to'
2394   void copy_small_array(Register from, Register to, Register count, Register tmp, Register tmp2, int bytes_per_count, bool forward, Label & entry) {
2395     assert_different_registers(from, to, count, tmp);
2396 
2397     __ align(OptoLoopAlignment);
2398 #ifdef AARCH64
2399     Label L_small_array_done, L_small_array_loop;
2400     __ BIND(entry);
2401     __ cbz_32(count, L_small_array_done);
2402 
2403     __ BIND(L_small_array_loop);
2404     __ subs_32(count, count, 1);
2405     load_one(tmp, from, bytes_per_count, forward);
2406     store_one(tmp, to, bytes_per_count, forward);
2407     __ b(L_small_array_loop, gt);
2408 
2409     __ BIND(L_small_array_done);
2410 #else
2411     Label L_small_loop;
2412     __ BIND(L_small_loop);
2413     store_one(tmp, to, bytes_per_count, forward, al, tmp2);
2414     __ BIND(entry); // entry point
2415     __ subs(count, count, 1);
2416     load_one(tmp, from, bytes_per_count, forward, ge, tmp2);
2417     __ b(L_small_loop, ge);
2418 #endif // AARCH64
2419   }
2420 
  // Aligns 'to' by reading one word from 'from' and writing its part to 'to'.
2422   //
2423   // Arguments:
2424   //     to:                beginning (if forward) or upper bound (if !forward) of the region to be written
2425   //     count:             32-bit int, number of elements allowed to be copied
2426   //     to_remainder:      remainder of dividing 'to' by wordSize
2427   //     bytes_per_count:   size of an element
2428   //     forward:           specifies copy direction
2429   //     Rval:              contains an already read but not yet written word;
2430   //                        its' LSBs (if forward) or MSBs (if !forward) are to be written to align 'to'.
2431   //
2432   // Notes:
2433   //     'count' must not be less then the returned value
2434   //     'to' must be aligned by bytes_per_count but must not be aligned by wordSize
2435   //     shifts 'to' by the number of written bytes (so that it becomes the bound of memory to be written)
2436   //     decreases 'count' by the the number of elements written
2437   //     Rval's MSBs or LSBs remain to be written further by generate_{forward,backward}_shifted_copy_loop
2438   int align_dst(Register to, Register count, Register Rval, Register tmp,
2439                                         int to_remainder, int bytes_per_count, bool forward) {
2440     assert_different_registers(to, count, tmp, Rval);
2441 
2442     assert (0 < to_remainder && to_remainder < wordSize, "to_remainder is not valid");
2443     assert (to_remainder % bytes_per_count == 0, "to must be aligned by bytes_per_count");
2444 
2445     int bytes_to_write = forward ? (wordSize - to_remainder) : to_remainder;
2446 
2447     int offset = 0;
2448 
2449     for (int l = 0; l < LogBytesPerWord; ++l) {
2450       int s = (1 << l);
2451       if (bytes_to_write & s) {
2452         int new_offset = offset + s*BitsPerByte;
2453         if (forward) {
2454           if (offset == 0) {
2455             store_one(Rval, to, s, forward);
2456           } else {
2457             __ logical_shift_right(tmp, Rval, offset);
2458             store_one(tmp, to, s, forward);
2459           }
2460         } else {
2461           __ logical_shift_right(tmp, Rval, BitsPerWord - new_offset);
2462           store_one(tmp, to, s, forward);
2463         }
2464 
2465         offset = new_offset;
2466       }
2467     }
2468 
2469     assert (offset == bytes_to_write * BitsPerByte, "all bytes must be copied");
2470 
2471     __ sub_32(count, count, bytes_to_write/bytes_per_count);
2472 
2473     return bytes_to_write / bytes_per_count;
2474   }
2475 
2476   // Copies 'count' of elements using shifted copy loop
2477   //
2478   // Arguments:
2479   //     from:              beginning (if forward) or upper bound (if !forward) of the region to be read
2480   //     to:                beginning (if forward) or upper bound (if !forward) of the region to be written
2481   //     count:             32-bit int, number of elements to be copied
2482   //     to_remainder:      remainder of dividing 'to' by wordSize
2483   //     bytes_per_count:   size of an element
2484   //     forward:           specifies copy direction
2485   //     Rval:              contains an already read but not yet written word
2486   //
2487   //
2488   // Notes:
2489   //     'count' must not be less then the returned value
2490   //     'from' must be aligned by wordSize
2491   //     'to' must be aligned by bytes_per_count but must not be aligned by wordSize
2492   //     shifts 'to' by the number of copied bytes
2493   //
2494   // Scratches R3-R10, R12
2495   int align_dst_and_generate_shifted_copy_loop(Register from, Register to, Register count, Register Rval,
2496                                                         int to_remainder, int bytes_per_count, bool forward) {
2497 
2498     assert (0 < to_remainder && to_remainder < wordSize, "to_remainder is invalid");
2499 
    const Register tmp  = forward ? R3 : R12; // TODO-AARCH64: on conjoint_short R4 was used for tmp
2501     assert_different_registers(from, to, count, Rval, tmp);
2502 
2503     int required_to_align = align_dst(to, count, Rval, tmp, to_remainder, bytes_per_count, forward);
2504 
2505     int lsr_shift = (wordSize - to_remainder) * BitsPerByte;
2506     int lsl_shift = to_remainder * BitsPerByte;
2507 
2508     int min_copy;
2509     if (forward) {
2510       min_copy = generate_forward_shifted_copy_loop(from, to, count, bytes_per_count, lsr_shift, lsl_shift);
2511     } else {
2512       min_copy = generate_backward_shifted_copy_loop(from, to, count, bytes_per_count, lsr_shift, lsl_shift);
2513     }
2514 
2515     return min_copy + required_to_align;
2516   }
2517 
2518   // Copies 'count' of elements using shifted copy loop
2519   //
2520   // Arguments:
2521   //     from:              beginning (if forward) or upper bound (if !forward) of the region to be read
2522   //     to:                beginning (if forward) or upper bound (if !forward) of the region to be written
2523   //     count:             32-bit int, number of elements to be copied
2524   //     bytes_per_count:   size of an element
2525   //     forward:           specifies copy direction
2526   //
2527   // Notes:
2528   //     'count' must not be less then the returned value
2529   //     'from' must be aligned by wordSize
2530   //     'to' must be aligned by bytes_per_count but must not be aligned by wordSize
2531   //     shifts 'to' by the number of copied bytes
2532   //
2533   // Scratches 'from', 'count', R3 and R12.
  // On AArch64 also scratches R4-R10; on 32-bit ARM it saves and restores them around their use.
2535   int align_dst_and_generate_shifted_copy_loop(Register from, Register to, Register count, int bytes_per_count, bool forward) {
2536 
2537     const Register Rval = forward ? R12 : R3; // as generate_{forward,backward}_shifted_copy_loop expect
2538 
2539     int min_copy = 0;
2540 
2541     // Note: if {seq} is a sequence of numbers, L{seq} means that if the execution reaches this point,
    // then the remainder of 'to' divided by wordSize is one of the elements of {seq}.
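    // E.g. on AArch64 (wordSize == 8) reaching L246 below means 'to' % 8 is 2, 4 or 6
    // (0 is excluded because 'to' is known not to be aligned by wordSize).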
2543 
2544 #ifdef AARCH64
2545     // TODO-AARCH64: simplify, tune
2546 
2547     load_one(Rval, from, wordSize, forward);
2548 
2549     Label L_loop_finished;
2550 
2551     switch (bytes_per_count) {
2552       case 4:
2553         min_copy = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 4, bytes_per_count, forward);
2554         break;
2555       case 2:
2556       {
2557         Label L2, L4, L6;
2558 
2559         __ tbz(to, 1, L4);
2560         __ tbz(to, 2, L2);
2561 
2562         __ BIND(L6);
2563         int min_copy6 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 6, bytes_per_count, forward);
2564         __ b(L_loop_finished);
2565 
2566         __ BIND(L2);
2567         int min_copy2 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 2, bytes_per_count, forward);
2568         __ b(L_loop_finished);
2569 
2570         __ BIND(L4);
2571         int min_copy4 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 4, bytes_per_count, forward);
2572 
2573         min_copy = MAX2(MAX2(min_copy2, min_copy4), min_copy6);
2574         break;
2575       }
2576       case 1:
2577       {
2578         Label L1, L2, L3, L4, L5, L6, L7;
2579         Label L15, L26;
2580         Label L246;
2581 
2582         __ tbz(to, 0, L246);
2583         __ tbz(to, 1, L15);
2584         __ tbz(to, 2, L3);
2585 
2586         __ BIND(L7);
2587         int min_copy7 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 7, bytes_per_count, forward);
2588         __ b(L_loop_finished);
2589 
2590         __ BIND(L246);
2591         __ tbnz(to, 1, L26);
2592 
2593         __ BIND(L4);
2594         int min_copy4 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 4, bytes_per_count, forward);
2595         __ b(L_loop_finished);
2596 
2597         __ BIND(L15);
2598         __ tbz(to, 2, L1);
2599 
2600         __ BIND(L5);
2601         int min_copy5 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 5, bytes_per_count, forward);
2602         __ b(L_loop_finished);
2603 
2604         __ BIND(L3);
2605         int min_copy3 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 3, bytes_per_count, forward);
2606         __ b(L_loop_finished);
2607 
2608         __ BIND(L26);
2609         __ tbz(to, 2, L2);
2610 
2611         __ BIND(L6);
2612         int min_copy6 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 6, bytes_per_count, forward);
2613         __ b(L_loop_finished);
2614 
2615         __ BIND(L1);
2616         int min_copy1 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 1, bytes_per_count, forward);
2617         __ b(L_loop_finished);
2618 
2619         __ BIND(L2);
2620         int min_copy2 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 2, bytes_per_count, forward);
2621 
2622 
2623         min_copy = MAX2(min_copy1, min_copy2);
2624         min_copy = MAX2(min_copy,  min_copy3);
2625         min_copy = MAX2(min_copy,  min_copy4);
2626         min_copy = MAX2(min_copy,  min_copy5);
2627         min_copy = MAX2(min_copy,  min_copy6);
2628         min_copy = MAX2(min_copy,  min_copy7);
2629         break;
2630       }
2631       default:
2632         ShouldNotReachHere();
2633         break;
2634     }
2635     __ BIND(L_loop_finished);
2636 
2637 #else
2638     __ push(RegisterSet(R4,R10));
2639     load_one(Rval, from, wordSize, forward);
2640 
2641     switch (bytes_per_count) {
2642       case 2:
2643         min_copy = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 2, bytes_per_count, forward);
2644         break;
2645       case 1:
2646       {
2647         Label L1, L2, L3;
2648         int min_copy1, min_copy2, min_copy3;
2649 
2650         Label L_loop_finished;
2651 
2652         if (forward) {
2653             __ tbz(to, 0, L2);
2654             __ tbz(to, 1, L1);
2655 
2656             __ BIND(L3);
2657             min_copy3 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 3, bytes_per_count, forward);
2658             __ b(L_loop_finished);
2659 
2660             __ BIND(L1);
2661             min_copy1 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 1, bytes_per_count, forward);
2662             __ b(L_loop_finished);
2663 
2664             __ BIND(L2);
2665             min_copy2 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 2, bytes_per_count, forward);
2666         } else {
2667             __ tbz(to, 0, L2);
2668             __ tbnz(to, 1, L3);
2669 
2670             __ BIND(L1);
2671             min_copy1 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 1, bytes_per_count, forward);
2672             __ b(L_loop_finished);
2673 
            __ BIND(L3);
2675             min_copy3 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 3, bytes_per_count, forward);
2676             __ b(L_loop_finished);
2677 
            __ BIND(L2);
2679             min_copy2 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 2, bytes_per_count, forward);
2680         }
2681 
2682         min_copy = MAX2(MAX2(min_copy1, min_copy2), min_copy3);
2683 
2684         __ BIND(L_loop_finished);
2685 
2686         break;
2687       }
2688       default:
2689         ShouldNotReachHere();
2690         break;
2691     }
2692 
2693     __ pop(RegisterSet(R4,R10));
2694 #endif // AARCH64
2695 
2696     return min_copy;
2697   }
2698 
2699 #ifndef PRODUCT
2700   int * get_arraycopy_counter(int bytes_per_count) {
2701     switch (bytes_per_count) {
2702       case 1:
2703         return &SharedRuntime::_jbyte_array_copy_ctr;
2704       case 2:
2705         return &SharedRuntime::_jshort_array_copy_ctr;
2706       case 4:
2707         return &SharedRuntime::_jint_array_copy_ctr;
2708       case 8:
2709         return &SharedRuntime::_jlong_array_copy_ctr;
2710       default:
2711         ShouldNotReachHere();
2712         return NULL;
2713     }
2714   }
2715 #endif // !PRODUCT
2716 
2717   //
2718   //  Generate stub for primitive array copy.  If "aligned" is true, the
2719   //  "from" and "to" addresses are assumed to be heapword aligned.
2720   //
2721   //  If "disjoint" is true, arrays are assumed to be disjoint, otherwise they may overlap and
2722   //  "nooverlap_target" must be specified as the address to jump if they don't.
2723   //
2724   // Arguments for generated stub:
2725   //      from:  R0
2726   //      to:    R1
2727   //      count: R2 treated as signed 32-bit int
2728   //
2729   address generate_primitive_copy(bool aligned, const char * name, bool status, int bytes_per_count, bool disjoint, address nooverlap_target = NULL) {
2730     __ align(CodeEntryAlignment);
2731     StubCodeMark mark(this, "StubRoutines", name);
2732     address start = __ pc();
2733 
2734     const Register from  = R0;   // source array address
2735     const Register to    = R1;   // destination array address
2736     const Register count = R2;   // elements count
2737     const Register tmp1  = R3;
2738     const Register tmp2  = R12;
2739 
2740     if (!aligned)  {
2741       BLOCK_COMMENT("Entry:");
2742     }
2743 
2744     __ zap_high_non_significant_bits(R2);
2745 
2746     if (!disjoint) {
2747       assert (nooverlap_target != NULL, "must be specified for conjoint case");
2748       array_overlap_test(nooverlap_target, exact_log2(bytes_per_count), tmp1, tmp2);
2749     }
2750 
2751     inc_counter_np(*get_arraycopy_counter(bytes_per_count), tmp1, tmp2);
2752 
    // Conjoint case: since execution reaches this point, the arrays overlap, so perform a backward copy.
    // Disjoint case: perform a forward copy.
2755     bool forward = disjoint;
2756 
2757 
2758     if (!forward) {
2759       // Set 'from' and 'to' to upper bounds
2760       int log_bytes_per_count = exact_log2(bytes_per_count);
2761       __ add_ptr_scaled_int32(to,   to,   count, log_bytes_per_count);
2762       __ add_ptr_scaled_int32(from, from, count, log_bytes_per_count);
2763     }
2764 
2765     // There are two main copy loop implementations:
2766     //  *) The huge and complex one applicable only for large enough arrays
2767     //  *) The small and simple one applicable for any array (but not efficient for large arrays).
2768     // Currently "small" implementation is used if and only if the "large" one could not be used.
2769     // XXX optim: tune the limit higher ?
2770     // Large implementation lower applicability bound is actually determined by
2771     // aligned copy loop which require <=7 bytes for src alignment, and 8 words for aligned copy loop.
2772     const int small_copy_limit = (8*wordSize + 7) / bytes_per_count;
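    // For example, with bytes_per_count == 2 on 32-bit ARM (wordSize == 4) this gives
    // (8*4 + 7) / 2 == 19 elements as the threshold at or below which the small loop is used.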
2773 
2774     Label L_small_array;
2775     __ cmp_32(count, small_copy_limit);
2776     __ b(L_small_array, le); // TODO-AARCH64: le vs lt
2777 
2778     // Otherwise proceed with large implementation.
2779 
2780     bool from_is_aligned = (bytes_per_count >= 8);
2781     if (aligned && forward && (HeapWordSize % 8 == 0)) {
2782         // if 'from' is heapword aligned and HeapWordSize is divisible by 8,
2783         //  then from is aligned by 8
2784         from_is_aligned = true;
2785     }
2786 
2787     int count_required_to_align = from_is_aligned ? 0 : align_src(from, to, count, tmp1, bytes_per_count, forward);
2788     assert (small_copy_limit >= count_required_to_align, "alignment could exhaust count");
2789 
2790     // now 'from' is aligned
2791 
2792     bool to_is_aligned = false;
2793 
2794     if (bytes_per_count >= wordSize) {
2795       // 'to' is aligned by bytes_per_count, so it is aligned by wordSize
2796       to_is_aligned = true;
2797     } else {
2798       if (aligned && (8 % HeapWordSize == 0) && (HeapWordSize % wordSize == 0)) {
2799         // Originally 'from' and 'to' were heapword aligned;
        // (from - to) has not changed, and since 'from' is now 8-byte aligned it is also heapword aligned,
        //  so 'to' is heapword aligned as well and thus aligned by wordSize.
2802         to_is_aligned = true;
2803       }
2804     }
2805 
2806     Label L_unaligned_dst;
2807 
2808     if (!to_is_aligned) {
2809       BLOCK_COMMENT("Check dst alignment:");
2810       __ tst(to, wordSize - 1);
2811       __ b(L_unaligned_dst, ne); // 'to' is not aligned
2812     }
2813 
2814     // 'from' and 'to' are properly aligned
2815 
2816     int min_copy;
2817     if (forward) {
2818       min_copy = generate_forward_aligned_copy_loop (from, to, count, bytes_per_count);
2819     } else {
2820       min_copy = generate_backward_aligned_copy_loop(from, to, count, bytes_per_count);
2821     }
2822     assert(small_copy_limit >= count_required_to_align + min_copy, "first loop might exhaust count");
2823 
2824     if (status) {
2825       __ mov(R0, 0); // OK
2826     }
2827 
2828     __ ret();
2829 
2830     {
2831       copy_small_array(from, to, count, tmp1, tmp2, bytes_per_count, forward, L_small_array /* entry */);
2832 
2833       if (status) {
2834         __ mov(R0, 0); // OK
2835       }
2836 
2837       __ ret();
2838     }
2839 
    if (!to_is_aligned) {
2841       __ BIND(L_unaligned_dst);
2842       int min_copy_shifted = align_dst_and_generate_shifted_copy_loop(from, to, count, bytes_per_count, forward);
2843       assert (small_copy_limit >= count_required_to_align + min_copy_shifted, "first loop might exhaust count");
2844 
2845       if (status) {
2846         __ mov(R0, 0); // OK
2847       }
2848 
2849       __ ret();
2850     }
2851 
2852     return start;
2853   }
2854 
2855 #if INCLUDE_ALL_GCS
2856   //
2857   //  Generate pre-write barrier for array.
2858   //
2859   //  Input:
2860   //     addr     - register containing starting address
2861   //     count    - register containing element count, 32-bit int
2862   //     callee_saved_regs -
2863   //                the call must preserve this number of registers: R0, R1, ..., R[callee_saved_regs-1]
2864   //
2865   //  callee_saved_regs must include addr and count
2866   //  Blows all volatile registers (R0-R3 on 32-bit ARM, R0-R18 on AArch64, Rtemp, LR) except for callee_saved_regs.
2867   void gen_write_ref_array_pre_barrier(Register addr, Register count, int callee_saved_regs) {
2868     BarrierSet* bs = Universe::heap()->barrier_set();
2869     if (bs->has_write_ref_pre_barrier()) {
2870       assert(bs->has_write_ref_array_pre_opt(),
2871              "Else unsupported barrier set.");
2872 
2873       assert( addr->encoding() < callee_saved_regs, "addr must be saved");
2874       assert(count->encoding() < callee_saved_regs, "count must be saved");
2875 
2876       BLOCK_COMMENT("PreBarrier");
2877 
2878 #ifdef AARCH64
2879       callee_saved_regs = round_to(callee_saved_regs, 2);
2880       for (int i = 0; i < callee_saved_regs; i += 2) {
2881         __ raw_push(as_Register(i), as_Register(i+1));
2882       }
2883 #else
2884       RegisterSet saved_regs = RegisterSet(R0, as_Register(callee_saved_regs-1));
2885       __ push(saved_regs | R9ifScratched);
2886 #endif // AARCH64
2887 
2888       if (addr != R0) {
2889         assert_different_registers(count, R0);
2890         __ mov(R0, addr);
2891       }
2892 #ifdef AARCH64
2893       __ zero_extend(R1, count, 32); // BarrierSet::static_write_ref_array_pre takes size_t
2894 #else
2895       if (count != R1) {
2896         __ mov(R1, count);
2897       }
2898 #endif // AARCH64
2899 
2900       __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre));
2901 
2902 #ifdef AARCH64
2903       for (int i = callee_saved_regs - 2; i >= 0; i -= 2) {
2904         __ raw_pop(as_Register(i), as_Register(i+1));
2905       }
2906 #else
2907       __ pop(saved_regs | R9ifScratched);
2908 #endif // AARCH64
2909     }
2910   }
2911 #endif // INCLUDE_ALL_GCS
2912 
2913   //
2914   //  Generate post-write barrier for array.
2915   //
2916   //  Input:
2917   //     addr     - register containing starting address (can be scratched)
2918   //     count    - register containing element count, 32-bit int (can be scratched)
2919   //     tmp      - scratch register
2920   //
2921   //  Note: LR can be scratched but might be equal to addr, count or tmp
2922   //  Blows all volatile registers (R0-R3 on 32-bit ARM, R0-R18 on AArch64, Rtemp, LR).
2923   void gen_write_ref_array_post_barrier(Register addr, Register count, Register tmp) {
2924     assert_different_registers(addr, count, tmp);
2925     BarrierSet* bs = Universe::heap()->barrier_set();
2926 
2927     switch (bs->kind()) {
2928     case BarrierSet::G1SATBCTLogging:
2929       {
2930         BLOCK_COMMENT("G1PostBarrier");
2931         if (addr != R0) {
2932           assert_different_registers(count, R0);
2933           __ mov(R0, addr);
2934         }
2935 #ifdef AARCH64
2936         __ zero_extend(R1, count, 32); // BarrierSet::static_write_ref_array_post takes size_t
2937 #else
2938         if (count != R1) {
2939           __ mov(R1, count);
2940         }
2941 #if R9_IS_SCRATCHED
2942         // Safer to save R9 here since callers may have been written
2943         // assuming R9 survives. This is suboptimal but not in
2944         // general worth optimizing for the few platforms where R9
2945         // is scratched. Note that the optimization might not be too
2946         // difficult for this particular call site.
2947         __ push(R9);
2948 #endif
2949 #endif // !AARCH64
2950         __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post));
2951 #ifndef AARCH64
2952 #if R9_IS_SCRATCHED
2953         __ pop(R9);
2954 #endif
2955 #endif // !AARCH64
2956       }
2957       break;
2958     case BarrierSet::CardTableForRS:
2959     case BarrierSet::CardTableExtension:
2960       {
2961         BLOCK_COMMENT("CardTablePostBarrier");
2962         CardTableModRefBS* ct = barrier_set_cast<CardTableModRefBS>(bs);
2963         assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
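             // Informal sketch of the code emitted below (illustrative C, not the stub itself):
             //
             //   jbyte* first = byte_map_base + ((uintptr_t)addr >> card_shift);
             //   jbyte* last  = byte_map_base + (((uintptr_t)addr + count*BytesPerHeapOop - BytesPerHeapOop) >> card_shift);
             //   for (jbyte* p = first; p <= last; p++) *p = 0;   // 0 == dirty card value
             //
             // i.e. every card spanned by the updated oop range is dirtied; Rthread is not required.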
2964 
2965         Label L_cardtable_loop;
2966 
2967         __ add_ptr_scaled_int32(count, addr, count, LogBytesPerHeapOop);
2968         __ sub(count, count, BytesPerHeapOop);                            // last addr
2969 
2970         __ logical_shift_right(addr, addr, CardTableModRefBS::card_shift);
2971         __ logical_shift_right(count, count, CardTableModRefBS::card_shift);
2972         __ sub(count, count, addr); // nb of cards
2973 
2974         // warning: Rthread has not been preserved
2975         __ mov_address(tmp, (address) ct->byte_map_base, symbolic_Relocation::card_table_reference);
2976         __ add(addr, tmp, addr);
2977 
2978         Register zero = __ zero_register(tmp);
2979 
2980         __ BIND(L_cardtable_loop);
2981         __ strb(zero, Address(addr, 1, post_indexed));
2982         __ subs(count, count, 1);
2983         __ b(L_cardtable_loop, ge);
2984       }
2985       break;
2986     case BarrierSet::ModRef:
2987       break;
2988     default:
2989       ShouldNotReachHere();
2990     }
2991   }
2992 
2993   // Generates pattern of code to be placed after raw data copying in generate_oop_copy
2994   // Includes return from arraycopy stub.
2995   //
2996   // Arguments:
2997   //     to:       destination pointer after copying.
2998   //               if 'forward' then 'to' == upper bound, else 'to' == beginning of the modified region
2999   //     count:    total number of copied elements, 32-bit int
3000   //
3001   // Blows all volatile (R0-R3 on 32-bit ARM, R0-R18 on AArch64, Rtemp, LR) and 'to', 'count', 'tmp' registers.
3002   void oop_arraycopy_stub_epilogue_helper(Register to, Register count, Register tmp, bool status, bool forward) {
3003     assert_different_registers(to, count, tmp);
3004 
3005     if (forward) {
3006       // 'to' is upper bound of the modified region
3007       // restore initial dst:
3008       __ sub_ptr_scaled_int32(to, to, count, LogBytesPerHeapOop);
3009     }
3010 
3011     // 'to' is the beginning of the region
3012 
3013     gen_write_ref_array_post_barrier(to, count, tmp);
3014 
3015     if (status) {
3016       __ mov(R0, 0); // OK
3017     }
3018 
3019 #ifdef AARCH64
3020     __ raw_pop(LR, ZR);
3021     __ ret();
3022 #else
3023     __ pop(PC);
3024 #endif // AARCH64
3025   }
3026 
3027 
3028   //  Generate stub for assign-compatible oop copy.  If "aligned" is true, the
3029   //  "from" and "to" addresses are assumed to be heapword aligned.
3030   //
3031   //  If "disjoint" is true, the arrays are assumed to be disjoint; otherwise they may overlap and
3032   //  "nooverlap_target" must be specified as the address to jump to if they do not overlap.
3033   //
3034   // Arguments for generated stub:
3035   //      from:  R0
3036   //      to:    R1
3037   //      count: R2 treated as signed 32-bit int
3038   //
3039   address generate_oop_copy(bool aligned, const char * name, bool status, bool disjoint, address nooverlap_target = NULL) {
3040     __ align(CodeEntryAlignment);
3041     StubCodeMark mark(this, "StubRoutines", name);
3042     address start = __ pc();
3043 
3044     Register from  = R0;
3045     Register to    = R1;
3046     Register count = R2;
3047     Register tmp1  = R3;
3048     Register tmp2  = R12;
3049 
3050 
3051     if (!aligned) {
3052       BLOCK_COMMENT("Entry:");
3053     }
3054 
3055     __ zap_high_non_significant_bits(R2);
3056 
3057     if (!disjoint) {
3058       assert (nooverlap_target != NULL, "must be specified for conjoint case");
3059       array_overlap_test(nooverlap_target, LogBytesPerHeapOop, tmp1, tmp2);
3060     }
3061 
3062     inc_counter_np(SharedRuntime::_oop_array_copy_ctr, tmp1, tmp2);
3063 
3064     // Conjoint case: since execution reaches this point, the arrays overlap, so perform a backward copy
3065     // Disjoint case: perform forward copy
3066     bool forward = disjoint;
3067 
3068     const int bytes_per_count = BytesPerHeapOop;
3069     const int log_bytes_per_count = LogBytesPerHeapOop;
3070 
3071     const Register saved_count = LR;
3072     const int callee_saved_regs = 3; // R0-R2
3073 
3074     // LR is used later to save barrier args
3075 #ifdef AARCH64
3076     __ raw_push(LR, ZR);
3077 #else
3078     __ push(LR);
3079 #endif // AARCH64
3080 
3081 #if INCLUDE_ALL_GCS
3082     gen_write_ref_array_pre_barrier(to, count, callee_saved_regs);
3083 #endif // INCLUDE_ALL_GCS
3084 
3085     // save arguments for barrier generation (after the pre barrier)
3086     __ mov(saved_count, count);
3087 
3088     if (!forward) {
3089       __ add_ptr_scaled_int32(to,   to,   count, log_bytes_per_count);
3090       __ add_ptr_scaled_int32(from, from, count, log_bytes_per_count);
3091     }
3092 
3093     // for short arrays, just do single element copy
3094     Label L_small_array;
3095     const int small_copy_limit = (8*wordSize + 7)/bytes_per_count; // XXX optim: tune the limit higher ?
3096     __ cmp_32(count, small_copy_limit);
3097     __ b(L_small_array, le);
3098 
3099     bool from_is_aligned = (bytes_per_count >= 8);
3100     if (aligned && forward && (HeapWordSize % 8 == 0)) {
3101         // if 'from' is heapword aligned and HeapWordSize is divisible by 8,
3102         //  then from is aligned by 8
3103         from_is_aligned = true;
3104     }
3105 
3106     int count_required_to_align = from_is_aligned ? 0 : align_src(from, to, count, tmp1, bytes_per_count, forward);
3107     assert (small_copy_limit >= count_required_to_align, "alignment could exhaust count");
3108 
3109     // now 'from' is aligned
3110 
3111     bool to_is_aligned = false;
3112 
3113     if (bytes_per_count >= wordSize) {
3114       // 'to' is aligned by bytes_per_count, so it is aligned by wordSize
3115       to_is_aligned = true;
3116     } else {
3117       if (aligned && (8 % HeapWordSize == 0) && (HeapWordSize % wordSize == 0)) {
3118         // Originally 'from' and 'to' were heapword aligned.
3119         // (from - to) is unchanged, so now that 'from' is 8-byte aligned it is still heapword aligned,
3120         //  and therefore 'to' is also heapword aligned and thus aligned by wordSize.
3121         to_is_aligned = true;
3122       }
3123     }
3124 
3125     Label L_unaligned_dst;
3126 
3127     if (!to_is_aligned) {
3128       BLOCK_COMMENT("Check dst alignment:");
3129       __ tst(to, wordSize - 1);
3130       __ b(L_unaligned_dst, ne); // 'to' is not aligned
3131     }
3132 
3133     int min_copy;
3134     if (forward) {
3135       min_copy = generate_forward_aligned_copy_loop(from, to, count, bytes_per_count);
3136     } else {
3137       min_copy = generate_backward_aligned_copy_loop(from, to, count, bytes_per_count);
3138     }
3139     assert(small_copy_limit >= count_required_to_align + min_copy, "first loop might exhaust count");
3140 
3141     oop_arraycopy_stub_epilogue_helper(to, saved_count, /* tmp */ tmp1, status, forward);
3142 
3143     {
3144       copy_small_array(from, to, count, tmp1, noreg, bytes_per_count, forward, L_small_array);
3145 
3146       oop_arraycopy_stub_epilogue_helper(to, saved_count, /* tmp */ tmp1, status, forward);
3147     }
3148 
3149     if (!to_is_aligned) {
3150       // !to_is_aligned <=> UseCompressedOops && AArch64
3151       __ BIND(L_unaligned_dst);
3152 #ifdef AARCH64
3153       assert (UseCompressedOops, "unaligned oop array copy may be requested only with UseCompressedOops");
3154 #else
3155       ShouldNotReachHere();
3156 #endif // AARCH64
3157       int min_copy_shifted = align_dst_and_generate_shifted_copy_loop(from, to, count, bytes_per_count, forward);
3158       assert (small_copy_limit >= count_required_to_align + min_copy_shifted, "first loop might exhaust count");
3159 
3160       oop_arraycopy_stub_epilogue_helper(to, saved_count, /* tmp */ tmp1, status, forward);
3161     }
3162 
3163     return start;
3164   }
3165 
3166   //  Generate 'unsafe' array copy stub
3167   //  Though just as safe as the other stubs, it takes an unscaled
3168   //  size_t argument instead of an element count.
3169   //
3170   // Arguments for generated stub:
3171   //      from:  R0
3172   //      to:    R1
3173   //      count: R2 byte count, treated as ssize_t, can be zero
3174   //
3175   // Examines the alignment of the operands and dispatches
3176   // to a long, int, short, or byte copy loop.
3177   //
3178   address generate_unsafe_copy(const char* name) {
3179 
3180     const Register R0_from   = R0;      // source array address
3181     const Register R1_to     = R1;      // destination array address
3182     const Register R2_count  = R2;      // elements count
3183 
3184     const Register R3_bits   = R3;      // test copy of low bits
3185 
3186     __ align(CodeEntryAlignment);
3187     StubCodeMark mark(this, "StubRoutines", name);
3188     address start = __ pc();
3189 #ifdef AARCH64
3190     __ NOT_IMPLEMENTED();
3191     start = NULL;
3192 #else
3193     const Register tmp = Rtemp;
3194 
3195     // bump this on entry, not on exit:
3196     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr, R3, tmp);
3197 
3198     __ orr(R3_bits, R0_from, R1_to);
3199     __ orr(R3_bits, R2_count, R3_bits);
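         // Dispatch sketch (illustrative only): R3_bits is the OR of 'from', 'to' and the byte
         // count, so one test of its low bits checks the alignment of all three operands at once:
         //
         //   bits = from | to | count;
         //   if ((bits & (BytesPerLong  - 1)) == 0) return jlong_arraycopy (from, to, count >> LogBytesPerLong);
         //   if ((bits & (BytesPerInt   - 1)) == 0) return jint_arraycopy  (from, to, count >> LogBytesPerInt);
         //   if ((bits & (BytesPerShort - 1)) == 0) return jshort_arraycopy(from, to, count >> LogBytesPerShort);
         //   return jbyte_arraycopy(from, to, count);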
3200 
3201     __ tst(R3_bits, BytesPerLong-1);
3202     __ mov(R2_count,AsmOperand(R2_count,asr,LogBytesPerLong), eq);
3203     __ jump(StubRoutines::_jlong_arraycopy, relocInfo::runtime_call_type, tmp, eq);
3204 
3205     __ tst(R3_bits, BytesPerInt-1);
3206     __ mov(R2_count,AsmOperand(R2_count,asr,LogBytesPerInt), eq);
3207     __ jump(StubRoutines::_jint_arraycopy, relocInfo::runtime_call_type, tmp, eq);
3208 
3209     __ tst(R3_bits, BytesPerShort-1);
3210     __ mov(R2_count,AsmOperand(R2_count,asr,LogBytesPerShort), eq);
3211     __ jump(StubRoutines::_jshort_arraycopy, relocInfo::runtime_call_type, tmp, eq);
3212 
3213     __ jump(StubRoutines::_jbyte_arraycopy, relocInfo::runtime_call_type, tmp);
3214 #endif
3215     return start;
3216   }
3217 
3218   // Helper for generating a dynamic type check.
3219   // Smashes only the given temp registers.
3220   void generate_type_check(Register sub_klass,
3221                            Register super_check_offset,
3222                            Register super_klass,
3223                            Register tmp1,
3224                            Register tmp2,
3225                            Register tmp3,
3226                            Label& L_success) {
3227     assert_different_registers(sub_klass, super_check_offset, super_klass, tmp1, tmp2, tmp3);
3228 
3229     BLOCK_COMMENT("type_check:");
3230 
3231     // If the pointers are equal, we are done (e.g., String[] elements).
3232 
3233     __ cmp(super_klass, sub_klass);
3234     __ b(L_success, eq); // fast success
3235 
3236 
3237     Label L_loop, L_fail;
3238 
3239     int sc_offset = in_bytes(Klass::secondary_super_cache_offset());
3240 
3241     // Check the supertype display:
3242     __ ldr(tmp1, Address(sub_klass, super_check_offset));
3243     __ cmp(tmp1, super_klass);
3244     __ b(L_success, eq);
3245 
3246     __ cmp(super_check_offset, sc_offset);
3247     __ b(L_fail, ne); // failure
3248 
3249     BLOCK_COMMENT("type_check_slow_path:");
3250 
3251     // a couple of useful fields in sub_klass:
3252     int ss_offset = in_bytes(Klass::secondary_supers_offset());
3253 
3254     // Do a linear scan of the secondary super-klass chain.
3255 
3256 #ifndef PRODUCT
3257     int* pst_counter = &SharedRuntime::_partial_subtype_ctr;
3258     __ inc_counter((address) pst_counter, tmp1, tmp2);
3259 #endif
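         // Informal equivalent of the scan emitted below (illustrative C; accessor names are
         // abridged and only meant to describe the data structures involved):
         //
         //   Array<Klass*>* ss = sub_klass->secondary_supers();
         //   for (int i = 0; i < ss->length(); i++) {
         //     if (ss->at(i) == super_klass) {
         //       sub_klass->set_secondary_super_cache(super_klass);   // cache the hit
         //       goto L_success;
         //     }
         //   }
         //   goto L_fail;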
3260 
3261     Register scan_temp = tmp1;
3262     Register count_temp = tmp2;
3263 
3264     // We will consult the secondary-super array.
3265     __ ldr(scan_temp, Address(sub_klass, ss_offset));
3266 
3267     Register search_key = super_klass;
3268 
3269     // Load the array length.
3270     __ ldr_s32(count_temp, Address(scan_temp, Array<Klass*>::length_offset_in_bytes()));
3271     __ add(scan_temp, scan_temp, Array<Klass*>::base_offset_in_bytes());
3272 
3273     __ add(count_temp, count_temp, 1);
3274 
3275     // Top of search loop
3276     __ bind(L_loop);
3277     // Notes:
3278     //  scan_temp starts at the array elements
3279     //  count_temp is 1+size
3280 
3281     __ subs(count_temp, count_temp, 1);
3282     __ b(L_fail, eq); // not found
3283 
3284     // Load next super to check
3285     // In the array of super classes, elements are pointer-sized.
3286     int element_size = wordSize;
3287     __ ldr(tmp3, Address(scan_temp, element_size, post_indexed));
3288 
3289     // Look for Rsuper_klass on Rsub_klass's secondary super-class-overflow list
3290     __ cmp(tmp3, search_key);
3291 
3292     // A miss means we are NOT a subtype and need to keep looping
3293     __ b(L_loop, ne);
3294 
3295     // Falling out the bottom means we found a hit; we ARE a subtype
3296 
3297     // Success.  Cache the super we found and proceed in triumph.
3298     __ str(super_klass, Address(sub_klass, sc_offset));
3299 
3300     // Jump to success
3301     __ b(L_success);
3302 
3303     // Fall through on failure!
3304     __ bind(L_fail);
3305   }
3306 
3307   //  Generate stub for checked oop copy.
3308   //
3309   // Arguments for generated stub:
3310   //      from:  R0
3311   //      to:    R1
3312   //      count: R2 treated as signed 32-bit int
3313   //      ckoff: R3 (super_check_offset)
3314   //      ckval: R4 (AArch64) / SP[0] (32-bit ARM) (super_klass)
3315   //      ret:   R0 zero for success; (-1^K) where K is partial transfer count (32-bit)
3316   //
3317   address generate_checkcast_copy(const char * name) {
3318     __ align(CodeEntryAlignment);
3319     StubCodeMark mark(this, "StubRoutines", name);
3320     address start = __ pc();
3321 
3322     const Register from  = R0;  // source array address
3323     const Register to    = R1;  // destination array address
3324     const Register count = R2;  // elements count
3325 
3326     const Register R3_ckoff  = R3;      // super_check_offset
3327     const Register R4_ckval  = R4;      // super_klass
3328 
3329     const int callee_saved_regs = AARCH64_ONLY(5) NOT_AARCH64(4); // LR saved differently
3330 
3331     Label load_element, store_element, do_card_marks, fail;
3332 
3333     BLOCK_COMMENT("Entry:");
3334 
3335     __ zap_high_non_significant_bits(R2);
3336 
3337 #ifdef AARCH64
3338     __ raw_push(LR, ZR);
3339     __ raw_push(R19, R20);
3340 #else
3341     int pushed = 0;
3342     __ push(LR);
3343     pushed+=1;
3344 #endif // AARCH64
3345 
3346 #if INCLUDE_ALL_GCS
3347     gen_write_ref_array_pre_barrier(to, count, callee_saved_regs);
3348 #endif // INCLUDE_ALL_GCS
3349 
3350 #ifndef AARCH64
3351     const RegisterSet caller_saved_regs = RegisterSet(R4,R6) | RegisterSet(R8,R9) | altFP_7_11;
3352     __ push(caller_saved_regs);
3353     assert(caller_saved_regs.size() == 6, "check the count");
3354     pushed+=6;
3355 
3356     __ ldr(R4_ckval,Address(SP, wordSize*pushed)); // read the argument that was on the stack
3357 #endif // !AARCH64
3358 
3359     // Save arguments for barrier generation (after the pre barrier):
3360     // - must be a caller saved register and not LR
3361     // - ARM32: avoid R10 in case RThread is needed
3362     const Register saved_count = AARCH64_ONLY(R19) NOT_AARCH64(altFP_7_11);
3363 #ifdef AARCH64
3364     __ mov_w(saved_count, count);
3365     __ cbnz_w(count, load_element); // and test count
3366 #else
3367     __ movs(saved_count, count); // and test count
3368     __ b(load_element,ne);
3369 #endif // AARCH64
3370 
3371     // nothing to copy
3372     __ mov(R0, 0);
3373 
3374 #ifdef AARCH64
3375     __ raw_pop(R19, R20);
3376     __ raw_pop(LR, ZR);
3377     __ ret();
3378 #else
3379     __ pop(caller_saved_regs);
3380     __ pop(PC);
3381 #endif // AARCH64
3382 
3383     // ======== begin loop ========
3384     // (Loop is rotated; its entry is load_element.)
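         // Rotated-loop sketch (illustrative): the store sits at the top of the loop and the
         // load at the entry point, so the per-element flow is roughly
         //
         //   goto load_element;
         //   store_element: *to++ = R5; if (--count == 0) goto do_card_marks;
         //   load_element:  R5 = *from++;
         //                  if (R5 == NULL) goto store_element;        // nulls need no type check
         //                  if (type_check(klass(R5))) goto store_element; else fall through to fail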
3385     __ align(OptoLoopAlignment);
3386     __ BIND(store_element);
3387     if (UseCompressedOops) {
3388       __ store_heap_oop(R5, Address(to, BytesPerHeapOop, post_indexed));  // store the oop, changes flags
3389       __ subs_32(count,count,1);
3390     } else {
3391       __ subs_32(count,count,1);
3392       __ str(R5, Address(to, BytesPerHeapOop, post_indexed));             // store the oop
3393     }
3394     __ b(do_card_marks, eq); // count exhausted
3395 
3396     // ======== loop entry is here ========
3397     __ BIND(load_element);
3398     __ load_heap_oop(R5, Address(from, BytesPerHeapOop, post_indexed));  // load the oop
3399     __ cbz(R5, store_element); // NULL
3400 
3401     __ load_klass(R6, R5);
3402 
3403     generate_type_check(R6, R3_ckoff, R4_ckval, /*tmps*/ R12, R8, R9,
3404                         // branch to this on success:
3405                         store_element);
3406     // ======== end loop ========
3407 
3408     // It was a real error; we must depend on the caller to finish the job.
3409     // Register count has number of *remaining* oops, saved_count number of *total* oops.
3410     // Emit GC store barriers for the oops we have copied
3411     // and report their number to the caller (0 or (-1^n))
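         // Return-value sketch (illustrative): with copied = saved_count - count,
         //
         //   return (copied == saved_count) ? 0 : ~copied;   // ~copied == -1 ^ copied
         //
         // so a complete copy reports 0 and a partial copy reports the bitwise NOT of the
         // number of elements already transferred.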
3412     __ BIND(fail);
3413 
3414     // Note: failure is indicated by count differing from saved_count
3415 
3416     __ BIND(do_card_marks);
3417 
3418     Register copied = AARCH64_ONLY(R20) NOT_AARCH64(R4); // saved
3419     Label L_not_copied;
3420 
3421     __ subs_32(copied, saved_count, count); // copied count (in saved reg)
3422     __ b(L_not_copied, eq); // nothing was copied, skip post barrier
3423     __ sub(to, to, AsmOperand(copied, lsl, LogBytesPerHeapOop)); // initial to value
3424     __ mov(R12, copied); // count arg scratched by post barrier
3425 
3426     gen_write_ref_array_post_barrier(to, R12, R3);
3427 
3428     assert_different_registers(R3,R12,LR,copied,saved_count);
3429     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, R3, R12);
3430 
3431     __ BIND(L_not_copied);
3432     __ cmp_32(copied, saved_count); // values preserved in saved registers
3433 
3434 #ifdef AARCH64
3435     __ csinv(R0, ZR, copied, eq); // 0 if all copied else NOT(copied)
3436     __ raw_pop(R19, R20);
3437     __ raw_pop(LR, ZR);
3438     __ ret();
3439 #else
3440     __ mov(R0, 0, eq); // 0 if all copied
3441     __ mvn(R0, copied, ne); // else NOT(copied)
3442     __ pop(caller_saved_regs);
3443     __ pop(PC);
3444 #endif // AARCH64
3445 
3446     return start;
3447   }
3448 
3449   // Perform range checks on the proposed arraycopy.
3450   // Kills the two temps, but nothing else.
3451   void arraycopy_range_checks(Register src,     // source array oop
3452                               Register src_pos, // source position (32-bit int)
3453                               Register dst,     // destination array oop
3454                               Register dst_pos, // destination position (32-bit int)
3455                               Register length,  // length of copy (32-bit int)
3456                               Register temp1, Register temp2,
3457                               Label& L_failed) {
3458 
3459     BLOCK_COMMENT("arraycopy_range_checks:");
3460 
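         // Overview (illustrative C): both checks below use 'hi', i.e. unsigned compares, so a
         // sum that overflows into the sign bit is rejected as well:
         //
         //   if ((juint)(src_pos + length) > (juint)arrayOop(src)->length()) goto L_failed;
         //   if ((juint)(dst_pos + length) > (juint)arrayOop(dst)->length()) goto L_failed;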
3461     //  if (src_pos + length > arrayOop(src)->length() ) FAIL;
3462 
3463     const Register array_length = temp1;  // scratch
3464     const Register end_pos      = temp2;  // scratch
3465 
3466     __ add_32(end_pos, length, src_pos);  // src_pos + length
3467     __ ldr_s32(array_length, Address(src, arrayOopDesc::length_offset_in_bytes()));
3468     __ cmp_32(end_pos, array_length);
3469     __ b(L_failed, hi);
3470 
3471     //  if (dst_pos + length > arrayOop(dst)->length() ) FAIL;
3472     __ add_32(end_pos, length, dst_pos); // dst_pos + length
3473     __ ldr_s32(array_length, Address(dst, arrayOopDesc::length_offset_in_bytes()));
3474     __ cmp_32(end_pos, array_length);
3475     __ b(L_failed, hi);
3476 
3477     BLOCK_COMMENT("arraycopy_range_checks done");
3478   }
3479 
3480   //
3481   //  Generate generic array copy stubs
3482   //
3483   //  Input:
3484   //    R0    -  src oop
3485   //    R1    -  src_pos (32-bit int)
3486   //    R2    -  dst oop
3487   //    R3    -  dst_pos (32-bit int)
3488   //    R4 (AArch64) / SP[0] (32-bit ARM) -  element count (32-bit int)
3489   //
3490   //  Output: (32-bit int)
3491   //    R0 ==  0  -  success
3492   //    R0 <   0  -  need to call System.arraycopy
3493   //
3494   address generate_generic_copy(const char *name) {
3495     Label L_failed, L_objArray;
3496 
3497     // Input registers
3498     const Register src      = R0;  // source array oop
3499     const Register src_pos  = R1;  // source position
3500     const Register dst      = R2;  // destination array oop
3501     const Register dst_pos  = R3;  // destination position
3502 
3503     // registers used as temp
3504     const Register R5_src_klass = R5; // source array klass
3505     const Register R6_dst_klass = R6; // destination array klass
3506     const Register R_lh         = AARCH64_ONLY(R7) NOT_AARCH64(altFP_7_11); // layout handler
3507     const Register R8_temp      = R8;
3508 
3509     __ align(CodeEntryAlignment);
3510     StubCodeMark mark(this, "StubRoutines", name);
3511     address start = __ pc();
3512 
3513     __ zap_high_non_significant_bits(R1);
3514     __ zap_high_non_significant_bits(R3);
3515     __ zap_high_non_significant_bits(R4);
3516 
3517 #ifndef AARCH64
3518     int pushed = 0;
3519     const RegisterSet saved_regs = RegisterSet(R4,R6) | RegisterSet(R8,R9) | altFP_7_11;
3520     __ push(saved_regs);
3521     assert(saved_regs.size() == 6, "check the count");
3522     pushed+=6;
3523 #endif // !AARCH64
3524 
3525     // bump this on entry, not on exit:
3526     inc_counter_np(SharedRuntime::_generic_array_copy_ctr, R5, R12);
3527 
3528     const Register length   = R4;  // elements count
3529 #ifndef AARCH64
3530     __ ldr(length, Address(SP,4*pushed));
3531 #endif // !AARCH64
3532 
3533 
3534     //-----------------------------------------------------------------------
3535     // Assembler stubs will be used for this call to arraycopy
3536     // if the following conditions are met:
3537     //
3538     // (1) src and dst must not be null.
3539     // (2) src_pos must not be negative.
3540     // (3) dst_pos must not be negative.
3541     // (4) length  must not be negative.
3542     // (5) src klass and dst klass should be the same and not NULL.
3543     // (6) src and dst should be arrays.
3544     // (7) src_pos + length must not exceed length of src.
3545     // (8) dst_pos + length must not exceed length of dst.
3546     BLOCK_COMMENT("arraycopy initial argument checks");
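         // Gate sketch (illustrative C summary of conditions (1)-(8) above; on failure the stub
         // returns -1 so the caller falls back to the shared runtime path):
         //
         //   if (src == NULL || dst == NULL)               return -1;   // (1)
         //   if (src_pos < 0 || dst_pos < 0 || length < 0) return -1;   // (2) (3) (4)
         //   // (5) and (6) are the klass/layout-helper checks below;
         //   // (7) and (8) are performed by arraycopy_range_checks().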
3547 
3548     //  if (src == NULL) return -1;
3549     __ cbz(src, L_failed);
3550 
3551     //  if (src_pos < 0) return -1;
3552     __ cmp_32(src_pos, 0);
3553     __ b(L_failed, lt);
3554 
3555     //  if (dst == NULL) return -1;
3556     __ cbz(dst, L_failed);
3557 
3558     //  if (dst_pos < 0) return -1;
3559     __ cmp_32(dst_pos, 0);
3560     __ b(L_failed, lt);
3561 
3562     //  if (length < 0) return -1;
3563     __ cmp_32(length, 0);
3564     __ b(L_failed, lt);
3565 
3566     BLOCK_COMMENT("arraycopy argument klass checks");
3567     //  get src->klass()
3568     __ load_klass(R5_src_klass, src);
3569 
3570     // Load layout helper
3571     //
3572     //  |array_tag|     | header_size | element_type |     |log2_element_size|
3573     // 32        30    24            16              8     2                 0
3574     //
3575     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
3576     //
3577 
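         // Decoding sketch (illustrative; mirrors the Klass::_lh_* constants used below):
         //
         //   int tag   = lh >> _lh_array_tag_shift;                                    // 0x3 typeArray, 0x2 objArray
         //   int hsize = (lh >> _lh_header_size_shift) & _lh_header_size_mask;         // header size in bytes
         //   int l2esz = (lh >> _lh_log2_element_size_shift) & _lh_log2_element_size_mask;
         //
         // so a single 32-bit load yields array-ness, header size and log2 element size at once.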
3578     int lh_offset = in_bytes(Klass::layout_helper_offset());
3579     __ ldr_u32(R_lh, Address(R5_src_klass, lh_offset));
3580 
3581     __ load_klass(R6_dst_klass, dst);
3582 
3583     // Handle objArrays completely differently...
3584     juint objArray_lh = Klass::array_layout_helper(T_OBJECT);
3585     __ mov_slow(R8_temp, objArray_lh);
3586     __ cmp_32(R_lh, R8_temp);
3587     __ b(L_objArray,eq);
3588 
3589     //  if (src->klass() != dst->klass()) return -1;
3590     __ cmp(R5_src_klass, R6_dst_klass);
3591     __ b(L_failed, ne);
3592 
3593     //  if (!src->is_Array()) return -1;
3594     __ cmp_32(R_lh, Klass::_lh_neutral_value); // < 0
3595     __ b(L_failed, ge);
3596 
3597     arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
3598                            R8_temp, R6_dst_klass, L_failed);
3599 
3600     {
3601       // TypeArrayKlass
3602       //
3603       // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
3604       // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
3605       //
3606 
3607       const Register R6_offset = R6_dst_klass;    // array offset
3608       const Register R12_elsize = R12;            // log2 element size
3609 
3610       __ logical_shift_right(R6_offset, R_lh, Klass::_lh_header_size_shift);
3611       __ andr(R6_offset, R6_offset, (unsigned int)Klass::_lh_header_size_mask); // array_offset
3612       __ add(src, src, R6_offset);       // src array offset
3613       __ add(dst, dst, R6_offset);       // dst array offset
3614       __ andr(R12_elsize, R_lh, (unsigned int)Klass::_lh_log2_element_size_mask); // log2 element size
3615 
3616       // next registers should be set before the jump to corresponding stub
3617       const Register from     = R0;  // source array address
3618       const Register to       = R1;  // destination array address
3619       const Register count    = R2;  // elements count
3620 
3621       // 'from', 'to', 'count' registers should be set in this order
3622       // since they are the same as 'src', 'src_pos', 'dst'.
3623 
3624 #ifdef AARCH64
3625 
3626       BLOCK_COMMENT("choose copy loop based on element size and scale indexes");
3627       Label Lbyte, Lshort, Lint, Llong;
3628 
3629       __ cbz(R12_elsize, Lbyte);
3630 
3631       assert (LogBytesPerShort < LogBytesPerInt && LogBytesPerInt < LogBytesPerLong, "must be");
3632       __ cmp(R12_elsize, LogBytesPerInt);
3633       __ b(Lint,  eq);
3634       __ b(Llong, gt);
3635 
3636       __ BIND(Lshort);
3637       __ add_ptr_scaled_int32(from, src, src_pos, LogBytesPerShort);
3638       __ add_ptr_scaled_int32(to,   dst, dst_pos, LogBytesPerShort);
3639       __ mov(count, length);
3640       __ b(StubRoutines::_jshort_arraycopy);
3641 
3642       __ BIND(Lint);
3643       __ add_ptr_scaled_int32(from, src, src_pos, LogBytesPerInt);
3644       __ add_ptr_scaled_int32(to,   dst, dst_pos, LogBytesPerInt);
3645       __ mov(count, length);
3646       __ b(StubRoutines::_jint_arraycopy);
3647 
3648       __ BIND(Lbyte);
3649       __ add_ptr_scaled_int32(from, src, src_pos, 0);
3650       __ add_ptr_scaled_int32(to,   dst, dst_pos, 0);
3651       __ mov(count, length);
3652       __ b(StubRoutines::_jbyte_arraycopy);
3653 
3654       __ BIND(Llong);
3655       __ add_ptr_scaled_int32(from, src, src_pos, LogBytesPerLong);
3656       __ add_ptr_scaled_int32(to,   dst, dst_pos, LogBytesPerLong);
3657       __ mov(count, length);
3658       __ b(StubRoutines::_jlong_arraycopy);
3659 
3660 #else // AARCH64
3661 
3662       BLOCK_COMMENT("scale indexes to element size");
3663       __ add(from, src, AsmOperand(src_pos, lsl, R12_elsize));       // src_addr
3664       __ add(to, dst, AsmOperand(dst_pos, lsl, R12_elsize));         // dst_addr
3665 
3666       __ mov(count, length);  // length
3667 
3668       // XXX optim: avoid later push in arraycopy variants ?
3669 
3670       __ pop(saved_regs);
3671 
3672       BLOCK_COMMENT("choose copy loop based on element size");
3673       __ cmp(R12_elsize, 0);
3674       __ b(StubRoutines::_jbyte_arraycopy,eq);
3675 
3676       __ cmp(R12_elsize, LogBytesPerShort);
3677       __ b(StubRoutines::_jshort_arraycopy,eq);
3678 
3679       __ cmp(R12_elsize, LogBytesPerInt);
3680       __ b(StubRoutines::_jint_arraycopy,eq);
3681 
3682       __ b(StubRoutines::_jlong_arraycopy);
3683 
3684 #endif // AARCH64
3685     }
3686 
3687     // ObjArrayKlass
3688     __ BIND(L_objArray);
3689     // live at this point:  R5_src_klass, R6_dst_klass, src[_pos], dst[_pos], length
3690 
3691     Label L_plain_copy, L_checkcast_copy;
3692     //  test array classes for subtyping
3693     __ cmp(R5_src_klass, R6_dst_klass);         // usual case is exact equality
3694     __ b(L_checkcast_copy, ne);
3695 
3696     BLOCK_COMMENT("Identically typed arrays");
3697     {
3698       // Identically typed arrays can be copied without element-wise checks.
3699       arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
3700                              R8_temp, R_lh, L_failed);
3701 
3702       // next registers should be set before the jump to corresponding stub
3703       const Register from     = R0;  // source array address
3704       const Register to       = R1;  // destination array address
3705       const Register count    = R2;  // elements count
3706 
3707       __ add(src, src, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //src offset
3708       __ add(dst, dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //dst offset
3709       __ add_ptr_scaled_int32(from, src, src_pos, LogBytesPerHeapOop);         // src_addr
3710       __ add_ptr_scaled_int32(to, dst, dst_pos, LogBytesPerHeapOop);           // dst_addr
3711       __ BIND(L_plain_copy);
3712       __ mov(count, length);
3713 
3714 #ifndef AARCH64
3715       __ pop(saved_regs); // XXX optim: avoid later push in oop_arraycopy ?
3716 #endif // !AARCH64
3717       __ b(StubRoutines::_oop_arraycopy);
3718     }
3719 
3720     {
3721       __ BIND(L_checkcast_copy);
3722       // live at this point:  R5_src_klass, R6_dst_klass
3723 
3724       // Before looking at dst.length, make sure dst is also an objArray.
3725       __ ldr_u32(R8_temp, Address(R6_dst_klass, lh_offset));
3726       __ cmp_32(R_lh, R8_temp);
3727       __ b(L_failed, ne);
3728 
3729       // It is safe to examine both src.length and dst.length.
3730 
3731       arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
3732                              R8_temp, R_lh, L_failed);
3733 
3734       // next registers should be set before the jump to corresponding stub
3735       const Register from     = R0;  // source array address
3736       const Register to       = R1;  // destination array address
3737       const Register count    = R2;  // elements count
3738 
3739       // Marshal the base address arguments now, freeing registers.
3740       __ add(src, src, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //src offset
3741       __ add(dst, dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //dst offset
3742       __ add_ptr_scaled_int32(from, src, src_pos, LogBytesPerHeapOop);         // src_addr
3743       __ add_ptr_scaled_int32(to, dst, dst_pos, LogBytesPerHeapOop);           // dst_addr
3744 
3745       __ mov(count, length); // length (reloaded)
3746 
3747       Register sco_temp = R3;                   // this register is free now
3748       assert_different_registers(from, to, count, sco_temp,
3749                                  R6_dst_klass, R5_src_klass);
3750 
3751       // Generate the type check.
3752       int sco_offset = in_bytes(Klass::super_check_offset_offset());
3753       __ ldr_u32(sco_temp, Address(R6_dst_klass, sco_offset));
3754       generate_type_check(R5_src_klass, sco_temp, R6_dst_klass,
3755                           R8_temp, R9,
3756                           AARCH64_ONLY(R10) NOT_AARCH64(R12),
3757                           L_plain_copy);
3758 
3759       // Fetch destination element klass from the ObjArrayKlass header.
3760       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
3761 
3762       // the checkcast_copy loop needs two extra arguments:
3763       const Register Rdst_elem_klass = AARCH64_ONLY(R4) NOT_AARCH64(R3);
3764       __ ldr(Rdst_elem_klass, Address(R6_dst_klass, ek_offset));   // dest elem klass
3765 #ifndef AARCH64
3766       __ pop(saved_regs); // XXX optim: avoid later push in oop_arraycopy ?
3767       __ str(Rdst_elem_klass, Address(SP,0));    // dest elem klass argument
3768 #endif // !AARCH64
3769       __ ldr_u32(R3, Address(Rdst_elem_klass, sco_offset));  // sco of elem klass
3770       __ b(StubRoutines::_checkcast_arraycopy);
3771     }
3772 
3773     __ BIND(L_failed);
3774 
3775 #ifndef AARCH64
3776     __ pop(saved_regs);
3777 #endif // !AARCH64
3778     __ mvn(R0, 0); // failure, with 0 copied
3779     __ ret();
3780 
3781     return start;
3782   }
3783 
3784   // Safefetch stubs.
3785   void generate_safefetch(const char* name, int size, address* entry, address* fault_pc, address* continuation_pc) {
3786     // safefetch signatures:
3787     //   int      SafeFetch32(int*      adr, int      errValue);
3788     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3789     //
3790     // arguments:
3791     //   R0 = adr
3792     //   R1 = errValue
3793     //
3794     // result:
3795     //   R0  = *adr or errValue
3796 
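         // Usage sketch (illustrative): SafeFetch lets VM code probe memory that may be unmapped.
         // If the load below faults, the signal handler resumes at continuation_pc, where the
         // errValue already held in R1 is returned instead of *adr:
         //
         //   int v = SafeFetch32(adr, 0xBAD);
         //   if (v == 0xBAD) { /* adr was unreadable (or genuinely contained 0xBAD) */ }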
3797     StubCodeMark mark(this, "StubRoutines", name);
3798 
3799     // Entry point, pc or function descriptor.
3800     *entry = __ pc();
3801 
3802     // Load *adr into R1, may fault.
3803     *fault_pc = __ pc();
3804 
3805     switch (size) {
3806       case 4: // int32_t
3807         __ ldr_s32(R1, Address(R0));
3808         break;
3809 
3810       case 8: // int64_t
3811 #ifdef AARCH64
3812         __ ldr(R1, Address(R0));
3813 #else
3814         Unimplemented();
3815 #endif // AARCH64
3816         break;
3817 
3818       default:
3819         ShouldNotReachHere();
3820     }
3821 
3822     // return errValue or *adr
3823     *continuation_pc = __ pc();
3824     __ mov(R0, R1);
3825     __ ret();
3826   }
3827 
3828   void generate_arraycopy_stubs() {
3829 
3830     // Note:  the disjoint stubs must be generated first, some of
3831     //        the conjoint stubs use them.
3832 
3833     bool status = false; // non failing C2 stubs need not return a status in R0
3834 
3835 #ifdef TEST_C2_GENERIC_ARRAYCOPY /* Internal development flag */
3836     // With this flag, the C2 stubs are tested by generating calls to
3837     // generic_arraycopy instead of Runtime1::arraycopy
3838 
3839     // Runtime1::arraycopy returns a status in R0 (0 if OK, else ~copied)
3840     // and the result is tested to see whether the arraycopy stub should
3841     // be called.
3842 
3843     // When we test arraycopy this way, we must generate extra code in the
3844     // arraycopy methods callable from C2 generic_arraycopy to set the
3845     // status to 0 for those that always succeed (calling the slow path stub might
3846     // lead to errors since the copy has already been performed).
3847 
3848     status = true; // generate a status compatible with C1 calls
3849 #endif
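         // Status convention sketch (illustrative): stubs generated with status == true return in
         // R0 either 0 (all elements copied) or ~K where K elements were copied before a failure,
         // which is how a caller distinguishes full from partial transfers:
         //
         //   int r = (*stub)(from, to, count);
         //   if (r != 0) { int copied = ~r; /* handle the remaining count - copied elements */ }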
3850 
3851     // these always need a status in case they are called from generic_arraycopy
3852     StubRoutines::_jbyte_disjoint_arraycopy  = generate_primitive_copy(false, "jbyte_disjoint_arraycopy",  true, 1, true);
3853     StubRoutines::_jshort_disjoint_arraycopy = generate_primitive_copy(false, "jshort_disjoint_arraycopy", true, 2, true);
3854     StubRoutines::_jint_disjoint_arraycopy   = generate_primitive_copy(false, "jint_disjoint_arraycopy",   true, 4, true);
3855     StubRoutines::_jlong_disjoint_arraycopy  = generate_primitive_copy(false, "jlong_disjoint_arraycopy",  true, 8, true);
3856     StubRoutines::_oop_disjoint_arraycopy    = generate_oop_copy      (false, "oop_disjoint_arraycopy",    true,    true);
3857 
3858     StubRoutines::_arrayof_jbyte_disjoint_arraycopy  = generate_primitive_copy(true, "arrayof_jbyte_disjoint_arraycopy", status, 1, true);
3859     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_primitive_copy(true, "arrayof_jshort_disjoint_arraycopy", status, 2, true);
3860     StubRoutines::_arrayof_jint_disjoint_arraycopy   = generate_primitive_copy(true, "arrayof_jint_disjoint_arraycopy",  status, 4, true);
3861     StubRoutines::_arrayof_jlong_disjoint_arraycopy  = generate_primitive_copy(true, "arrayof_jlong_disjoint_arraycopy", status, 8, true);
3862     StubRoutines::_arrayof_oop_disjoint_arraycopy    = generate_oop_copy      (true, "arrayof_oop_disjoint_arraycopy",   status,    true);
3863 
3864     // these always need a status in case they are called from generic_arraycopy
3865     StubRoutines::_jbyte_arraycopy  = generate_primitive_copy(false, "jbyte_arraycopy",  true, 1, false, StubRoutines::_jbyte_disjoint_arraycopy);
3866     StubRoutines::_jshort_arraycopy = generate_primitive_copy(false, "jshort_arraycopy", true, 2, false, StubRoutines::_jshort_disjoint_arraycopy);
3867     StubRoutines::_jint_arraycopy   = generate_primitive_copy(false, "jint_arraycopy",   true, 4, false, StubRoutines::_jint_disjoint_arraycopy);
3868     StubRoutines::_jlong_arraycopy  = generate_primitive_copy(false, "jlong_arraycopy",  true, 8, false, StubRoutines::_jlong_disjoint_arraycopy);
3869     StubRoutines::_oop_arraycopy    = generate_oop_copy      (false, "oop_arraycopy",    true,    false, StubRoutines::_oop_disjoint_arraycopy);
3870 
3871     StubRoutines::_arrayof_jbyte_arraycopy    = generate_primitive_copy(true, "arrayof_jbyte_arraycopy",  status, 1, false, StubRoutines::_arrayof_jbyte_disjoint_arraycopy);
3872     StubRoutines::_arrayof_jshort_arraycopy   = generate_primitive_copy(true, "arrayof_jshort_arraycopy", status, 2, false, StubRoutines::_arrayof_jshort_disjoint_arraycopy);
3873 #ifdef _LP64
3874     // since sizeof(jint) < sizeof(HeapWord), there's a different flavor:
3875     StubRoutines::_arrayof_jint_arraycopy     = generate_primitive_copy(true, "arrayof_jint_arraycopy",   status, 4, false, StubRoutines::_arrayof_jint_disjoint_arraycopy);
3876 #else
3877     StubRoutines::_arrayof_jint_arraycopy     = StubRoutines::_jint_arraycopy;
3878 #endif
3879     if (BytesPerHeapOop < HeapWordSize) {
3880       StubRoutines::_arrayof_oop_arraycopy    = generate_oop_copy      (true, "arrayof_oop_arraycopy",    status,    false, StubRoutines::_arrayof_oop_disjoint_arraycopy);
3881     } else {
3882       StubRoutines::_arrayof_oop_arraycopy    = StubRoutines::_oop_arraycopy;
3883     }
3884     StubRoutines::_arrayof_jlong_arraycopy    = StubRoutines::_jlong_arraycopy;
3885 
3886     StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy");
3887     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy");
3888     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy");
3889 
3890 
3891   }
3892 
3893 #ifndef AARCH64
3894 #define COMPILE_CRYPTO
3895 #include "stubRoutinesCrypto_arm.cpp"
3896 #else
3897 
3898 #ifdef COMPILER2
3899   // Arguments:
3900   //
3901   // Inputs:
3902   //   c_rarg0   - source byte array address
3903   //   c_rarg1   - destination byte array address
3904   //   c_rarg2   - K (key) in little endian int array
3905   //
3906   address generate_aescrypt_encryptBlock() {
3907     __ align(CodeEntryAlignment);
3908     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
3909 
3910     Label L_doLast;
3911 
3912     const Register from        = c_rarg0;  // source array address
3913     const Register to          = c_rarg1;  // destination array address
3914     const Register key         = c_rarg2;  // key array address
3915     const Register keylen      = R8;
3916 
3917     address start = __ pc();
3918     __ stp(FP, LR, Address(SP, -2 * wordSize, pre_indexed));
3919     __ mov(FP, SP);
3920 
3921     __ ldr_w(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
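         // Note (informational): 'keylen' is the expanded key length in 32-bit words as produced by
         // the AES key schedule: 44, 52 or 60 words for AES-128/192/256, i.e. 10, 12 or 14 rounds.
         // The comparisons against 44 and 52 below decide how many additional round-key pairs to apply.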
3922 
3923     __ vld1(V0, Address(from), MacroAssembler::VELEM_SIZE_8, 128); // get 16 bytes of input
3924 
3925     __ vld1(V1, V2, V3, V4, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
3926 
3927     int quad = 1;
3928     __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad);
3929     __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad);
3930     __ rev32(V3, V3, MacroAssembler::VELEM_SIZE_8, quad);
3931     __ rev32(V4, V4, MacroAssembler::VELEM_SIZE_8, quad);
3932     __ aese(V0, V1);
3933     __ aesmc(V0, V0);
3934     __ aese(V0, V2);
3935     __ aesmc(V0, V0);
3936     __ aese(V0, V3);
3937     __ aesmc(V0, V0);
3938     __ aese(V0, V4);
3939     __ aesmc(V0, V0);
3940 
3941     __ vld1(V1, V2, V3, V4, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
3942     __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad);
3943     __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad);
3944     __ rev32(V3, V3, MacroAssembler::VELEM_SIZE_8, quad);
3945     __ rev32(V4, V4, MacroAssembler::VELEM_SIZE_8, quad);
3946     __ aese(V0, V1);
3947     __ aesmc(V0, V0);
3948     __ aese(V0, V2);
3949     __ aesmc(V0, V0);
3950     __ aese(V0, V3);
3951     __ aesmc(V0, V0);
3952     __ aese(V0, V4);
3953     __ aesmc(V0, V0);
3954 
3955     __ vld1(V1, V2, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
3956     __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad);
3957     __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad);
3958 
3959     __ cmp_w(keylen, 44);
3960     __ b(L_doLast, eq);
3961 
3962     __ aese(V0, V1);
3963     __ aesmc(V0, V0);
3964     __ aese(V0, V2);
3965     __ aesmc(V0, V0);
3966 
3967     __ vld1(V1, V2, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
3968     __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad);
3969     __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad);
3970 
3971     __ cmp_w(keylen, 52);
3972     __ b(L_doLast, eq);
3973 
3974     __ aese(V0, V1);
3975     __ aesmc(V0, V0);
3976     __ aese(V0, V2);
3977     __ aesmc(V0, V0);
3978 
3979     __ vld1(V1, V2, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
3980     __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad);
3981     __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad);
3982 
3983     __ BIND(L_doLast);
3984 
3985     __ aese(V0, V1);
3986     __ aesmc(V0, V0);
3987     __ aese(V0, V2);
3988 
3989     __ vld1(V1, Address(key), MacroAssembler::VELEM_SIZE_8, 128);
3990     __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad);
3991     __ eor(V0, V0, V1, MacroAssembler::VELEM_SIZE_8, quad);
3992 
3993     __ vst1(V0, Address(to), MacroAssembler::VELEM_SIZE_8, 128);
3994 
3995     __ mov(R0, 0);
3996 
3997     __ mov(SP, FP);
3998     __ ldp(FP, LR, Address(SP, 2 * wordSize, post_indexed));
3999     __ ret(LR);
4000 
4001     return start;
4002   }
4003 
4004   // Arguments:
4005   //
4006   // Inputs:
4007   //   c_rarg0   - source byte array address
4008   //   c_rarg1   - destination byte array address
4009   //   c_rarg2   - K (key) in little endian int array
4010   //
4011   address generate_aescrypt_decryptBlock() {
4012     assert(UseAES, "need AES instructions");
4013     __ align(CodeEntryAlignment);
4014     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
4015     Label L_doLast;
4016 
4017     const Register from        = c_rarg0;  // source array address
4018     const Register to          = c_rarg1;  // destination array address
4019     const Register key         = c_rarg2;  // key array address
4020     const Register keylen      = R8;
4021 
4022     address start = __ pc();
4023     __ stp(FP, LR, Address(SP, -2 * wordSize, pre_indexed));
4024     __ mov(FP, SP);
4025 
4026     __ ldr_w(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
4027 
4028     __ vld1(V0, Address(from), MacroAssembler::VELEM_SIZE_8, 128); // get 16 bytes of input
4029 
4030     __ vld1(V5, Address(key, 16, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
4031 
4032     int quad = 1;
4033     __ rev32(V5, V5, MacroAssembler::VELEM_SIZE_8, quad);
4034 
4035     __ vld1(V1, V2, V3, V4, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
4036     __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad);
4037     __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad);
4038     __ rev32(V3, V3, MacroAssembler::VELEM_SIZE_8, quad);
4039     __ rev32(V4, V4, MacroAssembler::VELEM_SIZE_8, quad);
4040     __ aesd(V0, V1);
4041     __ aesimc(V0, V0);
4042     __ aesd(V0, V2);
4043     __ aesimc(V0, V0);
4044     __ aesd(V0, V3);
4045     __ aesimc(V0, V0);
4046     __ aesd(V0, V4);
4047     __ aesimc(V0, V0);
4048 
4049     __ vld1(V1, V2, V3, V4, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
4050     __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad);
4051     __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad);
4052     __ rev32(V3, V3, MacroAssembler::VELEM_SIZE_8, quad);
4053     __ rev32(V4, V4, MacroAssembler::VELEM_SIZE_8, quad);
4054     __ aesd(V0, V1);
4055     __ aesimc(V0, V0);
4056     __ aesd(V0, V2);
4057     __ aesimc(V0, V0);
4058     __ aesd(V0, V3);
4059     __ aesimc(V0, V0);
4060     __ aesd(V0, V4);
4061     __ aesimc(V0, V0);
4062 
4063     __ vld1(V1, V2, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
4064     __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad);
4065     __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad);
4066 
4067     __ cmp_w(keylen, 44);
4068     __ b(L_doLast, eq);
4069 
4070     __ aesd(V0, V1);
4071     __ aesimc(V0, V0);
4072     __ aesd(V0, V2);
4073     __ aesimc(V0, V0);
4074 
4075     __ vld1(V1, V2, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
4076     __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad);
4077     __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad);
4078 
4079     __ cmp_w(keylen, 52);
4080     __ b(L_doLast, eq);
4081 
4082     __ aesd(V0, V1);
4083     __ aesimc(V0, V0);
4084     __ aesd(V0, V2);
4085     __ aesimc(V0, V0);
4086 
4087     __ vld1(V1, V2, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
4088     __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad);
4089     __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad);
4090 
4091     __ BIND(L_doLast);
4092 
4093     __ aesd(V0, V1);
4094     __ aesimc(V0, V0);
4095     __ aesd(V0, V2);
4096 
4097     __ eor(V0, V0, V5, MacroAssembler::VELEM_SIZE_8, quad);
4098 
4099     __ vst1(V0, Address(to), MacroAssembler::VELEM_SIZE_8, 128);
4100 
4101     __ mov(R0, 0);
4102 
4103     __ mov(SP, FP);
4104     __ ldp(FP, LR, Address(SP, 2 * wordSize, post_indexed));
4105     __ ret(LR);
4106 
4107 
4108     return start;
4109   }
4110 
4111   // Arguments:
4112   //
4113   // Inputs:
4114   //   c_rarg0   - source byte array address
4115   //   c_rarg1   - destination byte array address
4116   //   c_rarg2   - K (key) in little endian int array
4117   //   c_rarg3   - r vector byte array address
4118   //   c_rarg4   - input length
4119   //
4120   // Output:
4121   //   R0        - input length
4122   //
4123   address generate_cipherBlockChaining_encryptAESCrypt() {
4124     assert(UseAES, "need AES instructions");
4125     __ align(CodeEntryAlignment);
4126     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
4127 
4128     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
4129 
4130     const Register from        = c_rarg0;  // source array address
4131     const Register to          = c_rarg1;  // destination array address
4132     const Register key         = c_rarg2;  // key array address
4133     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
4134                                            // and left with the results of the last encryption block
4135     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
4136     const Register keylen      = R8;
4137 
4138     address start = __ pc();
4139     __ stp(FP, LR, Address(SP, -2 * wordSize, pre_indexed));
4140     __ mov(FP, SP);
4141 
4142     __ mov(R9, len_reg);
4143     __ ldr_w(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
4144 
4145     __ vld1(V0, Address(rvec), MacroAssembler::VELEM_SIZE_8, 128);
4146 
4147     __ cmp_w(keylen, 52);
4148     __ b(L_loadkeys_44, cc);
4149     __ b(L_loadkeys_52, eq);
4150 
4151     __ vld1(V17, V18, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
4152 
4153     int quad = 1;
4154     __ rev32(V17, V17, MacroAssembler::VELEM_SIZE_8, quad);
4155     __ rev32(V18, V18, MacroAssembler::VELEM_SIZE_8, quad);
4156     __ BIND(L_loadkeys_52);
4157     __ vld1(V19, V20, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
4158     __ rev32(V19, V19, MacroAssembler::VELEM_SIZE_8, quad);
4159     __ rev32(V20, V20, MacroAssembler::VELEM_SIZE_8, quad);
4160     __ BIND(L_loadkeys_44);
4161     __ vld1(V21, V22, V23, V24, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
4162     __ rev32(V21, V21, MacroAssembler::VELEM_SIZE_8, quad);
4163     __ rev32(V22, V22, MacroAssembler::VELEM_SIZE_8, quad);
4164     __ rev32(V23, V23, MacroAssembler::VELEM_SIZE_8, quad);
4165     __ rev32(V24, V24, MacroAssembler::VELEM_SIZE_8, quad);
4166     __ vld1(V25, V26, V27, V28, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
4167     __ rev32(V25, V25, MacroAssembler::VELEM_SIZE_8, quad);
4168     __ rev32(V26, V26, MacroAssembler::VELEM_SIZE_8, quad);
4169     __ rev32(V27, V27, MacroAssembler::VELEM_SIZE_8, quad);
4170     __ rev32(V28, V28, MacroAssembler::VELEM_SIZE_8, quad);
4171     __ vld1(V29, V30, V31, Address(key), MacroAssembler::VELEM_SIZE_8, 128);
4172     __ rev32(V29, V29, MacroAssembler::VELEM_SIZE_8, quad);
4173     __ rev32(V30, V30, MacroAssembler::VELEM_SIZE_8, quad);
4174     __ rev32(V31, V31, MacroAssembler::VELEM_SIZE_8, quad);
4175 
4176     __ BIND(L_aes_loop);
4177     __ vld1(V1, Address(from, 16, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
4178     __ eor(V0, V0, V1, MacroAssembler::VELEM_SIZE_8, quad);
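         // CBC chaining (informational): V0 still holds the previous ciphertext block (or the IV
         // loaded from 'rvec' on the first iteration), so the eor above forms P[i] ^ C[i-1] before
         // the AES rounds, giving C[i] = E_K(P[i] ^ C[i-1]).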
4179 
4180     __ b(L_rounds_44, cc);
4181     __ b(L_rounds_52, eq);
4182 
4183     __ aese(V0, V17);
4184     __ aesmc(V0, V0);
4185     __ aese(V0, V18);
4186     __ aesmc(V0, V0);
4187     __ BIND(L_rounds_52);
4188     __ aese(V0, V19);
4189     __ aesmc(V0, V0);
4190     __ aese(V0, V20);
4191     __ aesmc(V0, V0);
4192     __ BIND(L_rounds_44);
4193     __ aese(V0, V21);
4194     __ aesmc(V0, V0);
4195     __ aese(V0, V22);
4196     __ aesmc(V0, V0);
4197     __ aese(V0, V23);
4198     __ aesmc(V0, V0);
4199     __ aese(V0, V24);
4200     __ aesmc(V0, V0);
4201     __ aese(V0, V25);
4202     __ aesmc(V0, V0);
4203     __ aese(V0, V26);
4204     __ aesmc(V0, V0);
4205     __ aese(V0, V27);
4206     __ aesmc(V0, V0);
4207     __ aese(V0, V28);
4208     __ aesmc(V0, V0);
4209     __ aese(V0, V29);
4210     __ aesmc(V0, V0);
4211     __ aese(V0, V30);
4212     __ eor(V0, V0, V31, MacroAssembler::VELEM_SIZE_8, quad);
4213 
4214     __ vst1(V0, Address(to, 16, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
4215     __ sub(len_reg, len_reg, 16);
4216     __ cbnz(len_reg, L_aes_loop);
4217 
4218     __ vst1(V0, Address(rvec), MacroAssembler::VELEM_SIZE_8, 128);
4219 
4220     __ mov(R0, R9);
4221 
4222     __ mov(SP, FP);
4223     __ ldp(FP, LR, Address(SP, 2 * wordSize, post_indexed));
4224     __ ret(LR);
4225 
4226     return start;
4227   }
4228 
4229   // Arguments:
4230   //
4231   // Inputs:
4232   //   c_rarg0   - source byte array address
4233   //   c_rarg1   - destination byte array address
4234   //   c_rarg2   - K (key) in little endian int array
4235   //   c_rarg3   - r vector byte array address
4236   //   c_rarg4   - input length
4237   //
4238   // Output:
4239   //   R0        - input length
4240   //
4241   address generate_cipherBlockChaining_decryptAESCrypt() {
4242     assert(UseAES, "need AES instructions");
4243     __ align(CodeEntryAlignment);
4244     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
4245 
4246     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
4247 
4248     const Register from        = c_rarg0;  // source array address
4249     const Register to          = c_rarg1;  // destination array address
4250     const Register key         = c_rarg2;  // key array address
4251     const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
4252                                            // and left with the results of the last encryption block
4253     const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
4254     const Register keylen      = R8;
4255 
4256     address start = __ pc();
4257     __ stp(FP, LR, Address(SP, -2 * wordSize, pre_indexed));
4258     __ mov(FP, SP);
4259 
4260     __ mov(R9, len_reg);
4261     __ ldr_w(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
4262 
4263     __ vld1(V2, Address(rvec), MacroAssembler::VELEM_SIZE_8, 128);
4264 
4265     __ vld1(V31, Address(key, 16, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
4266 
4267     int quad = 1;
4268     __ rev32(V31, V31, MacroAssembler::VELEM_SIZE_8, quad);
4269 
4270     __ cmp_w(keylen, 52);
4271     __ b(L_loadkeys_44, cc);
4272     __ b(L_loadkeys_52, eq);
4273 
4274     __ vld1(V17, V18, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
4275     __ rev32(V17, V17, MacroAssembler::VELEM_SIZE_8, quad);
4276     __ rev32(V18, V18, MacroAssembler::VELEM_SIZE_8, quad);
4277     __ BIND(L_loadkeys_52);
4278     __ vld1(V19, V20, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
4279     __ rev32(V19, V19, MacroAssembler::VELEM_SIZE_8, quad);
4280     __ rev32(V20, V20, MacroAssembler::VELEM_SIZE_8, quad);
4281     __ BIND(L_loadkeys_44);
4282     __ vld1(V21, V22, V23, V24, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
4283     __ rev32(V21, V21, MacroAssembler::VELEM_SIZE_8, quad);
4284     __ rev32(V22, V22, MacroAssembler::VELEM_SIZE_8, quad);
4285     __ rev32(V23, V23, MacroAssembler::VELEM_SIZE_8, quad);
4286     __ rev32(V24, V24, MacroAssembler::VELEM_SIZE_8, quad);
4287     __ vld1(V25, V26, V27, V28, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
4288     __ rev32(V25, V25, MacroAssembler::VELEM_SIZE_8, quad);
4289     __ rev32(V26, V26, MacroAssembler::VELEM_SIZE_8, quad);
4290     __ rev32(V27, V27, MacroAssembler::VELEM_SIZE_8, quad);
4291     __ rev32(V28, V28, MacroAssembler::VELEM_SIZE_8, quad);
4292     __ vld1(V29, V30, Address(key), MacroAssembler::VELEM_SIZE_8, 128);
4293     __ rev32(V29, V29, MacroAssembler::VELEM_SIZE_8, quad);
4294     __ rev32(V30, V30, MacroAssembler::VELEM_SIZE_8, quad);
4295 
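         // Main loop: decrypt one 16-byte block per iteration. V1 keeps a copy of the ciphertext so it can become the next chaining value.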
4296     __ BIND(L_aes_loop);
4297     __ vld1(V0, Address(from, 16, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
4298     __ orr(V1, V0, V0, MacroAssembler::VELEM_SIZE_8, quad);
4299 
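         // The condition flags from the keylen comparison above are still intact; skip the extra rounds for shorter keys.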
4300     __ b(L_rounds_44, cc);
4301     __ b(L_rounds_52, eq);
4302 
4303     __ aesd(V0, V17);
4304     __ aesimc(V0, V0);
4305     __ aesd(V0, V18);  // second of the two AES-256-only round keys (V17, V18)
4306     __ aesimc(V0, V0);
4307     __ BIND(L_rounds_52);
4308     __ aesd(V0, V19);
4309     __ aesimc(V0, V0);
4310     __ aesd(V0, V20);
4311     __ aesimc(V0, V0);
4312     __ BIND(L_rounds_44);
4313     __ aesd(V0, V21);
4314     __ aesimc(V0, V0);
4315     __ aesd(V0, V22);
4316     __ aesimc(V0, V0);
4317     __ aesd(V0, V23);
4318     __ aesimc(V0, V0);
4319     __ aesd(V0, V24);
4320     __ aesimc(V0, V0);
4321     __ aesd(V0, V25);
4322     __ aesimc(V0, V0);
4323     __ aesd(V0, V26);
4324     __ aesimc(V0, V0);
4325     __ aesd(V0, V27);
4326     __ aesimc(V0, V0);
4327     __ aesd(V0, V28);
4328     __ aesimc(V0, V0);
4329     __ aesd(V0, V29);
4330     __ aesimc(V0, V0);
4331     __ aesd(V0, V30);
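         // Last round omits aesimc; apply the final round key (V31) and xor with the previous ciphertext block (V2) to undo the CBC chaining.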
4332     __ eor(V0, V0, V31, MacroAssembler::VELEM_SIZE_8, quad);
4333     __ eor(V0, V0, V2, MacroAssembler::VELEM_SIZE_8, quad);
4334 
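         // Store the plaintext block; the saved ciphertext copy (V1) becomes the new chaining value in V2.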
4335     __ vst1(V0, Address(to, 16, post_indexed), MacroAssembler::VELEM_SIZE_8, 128);
4336     __ orr(V2, V1, V1, MacroAssembler::VELEM_SIZE_8, quad);
4337 
4338     __ sub(len_reg, len_reg, 16);
4339     __ cbnz(len_reg, L_aes_loop);
4340 
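         // Write the last ciphertext block back to rvec and return the processed length (saved in R9) in R0.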
4341     __ vst1(V2, Address(rvec), MacroAssembler::VELEM_SIZE_8, 128);
4342 
4343     __ mov(R0, R9);
4344 
4345     __ mov(SP, FP);
4346     __ ldp(FP, LR, Address(SP, 2 * wordSize, post_indexed));
4347     __ ret(LR);
4348 
4349     return start;
4350   }
4351 
4352 #endif // COMPILER2
4353 #endif // AARCH64
4354 
4355  private:
4356 
4357 #undef  __
4358 #define __ masm->
4359 
4360   //------------------------------------------------------------------------------------------------------------------------
4361   // Continuation point for throwing of implicit exceptions that are not handled in
4362   // the current activation. Fabricates an exception oop and initiates normal
4363   // exception dispatching in this frame.
4364   address generate_throw_exception(const char* name, address runtime_entry) {
4365     int insts_size = 128;
4366     int locs_size  = 32;
4367     CodeBuffer code(name, insts_size, locs_size);
4368     OopMapSet* oop_maps;
4369     int frame_size;
4370     int frame_complete;
4371 
4372     oop_maps = new OopMapSet();
4373     MacroAssembler* masm = new MacroAssembler(&code);
4374 
4375     address start = __ pc();
4376 
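         // Build a minimal two-word frame (FP, LR); LR is also preserved in Rexception_pc as the exception pc.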
4377     frame_size = 2;
4378     __ mov(Rexception_pc, LR);
4379     __ raw_push(FP, LR);
4380 
4381     frame_complete = __ pc() - start;
4382 
4383     // Any extra arguments are expected to be already in R1 and R2
4384     __ mov(R0, Rthread);
4385 
4386     int pc_offset = __ set_last_Java_frame(SP, FP, false, Rtemp);
4387     assert(((__ pc()) - start) == __ offset(), "warning: start differs from code_begin");
4388     __ call(runtime_entry);
4389     if (pc_offset == -1) {
4390       pc_offset = __ offset();
4391     }
4392 
4393     // Generate oop map
4394     OopMap* map =  new OopMap(frame_size*VMRegImpl::slots_per_word, 0);
4395     oop_maps->add_gc_map(pc_offset, map);
4396     __ reset_last_Java_frame(Rtemp); // Rtemp is free here: it was scratched by the far call above
4397 
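         // The runtime call has set the pending exception; pop the frame and dispatch through the shared forward_exception stub.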
4398     __ raw_pop(FP, LR);
4399     __ jump(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type, Rtemp);
4400 
4401     RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete,
4402                                                       frame_size, oop_maps, false);
4403     return stub->entry_point();
4404   }
4405 
4406   //---------------------------------------------------------------------------
4407   // Initialization
4408 
4409   void generate_initial() {
4410     // Generates the initial set of stubs needed early during VM startup and initializes their entry points
4411 
4412     //------------------------------------------------------------------------------------------------------------------------
4413     // entry points that exist in all platforms
4414     // Note: This is code that could be shared among different platforms - however the benefit seems to be smaller than
4415     //       the disadvantage of having a much more complicated generator structure. See also comment in stubRoutines.hpp.
4416     StubRoutines::_forward_exception_entry      = generate_forward_exception();
4417 
4418     StubRoutines::_call_stub_entry              =
4419       generate_call_stub(StubRoutines::_call_stub_return_address);
4420     // the catch_exception entry is referenced by megamorphic calls
4421     StubRoutines::_catch_exception_entry        = generate_catch_exception();
4422 
4423     // stub for throwing stack overflow error used both by interpreter and compiler
4424     StubRoutines::_throw_StackOverflowError_entry  = generate_throw_exception("StackOverflowError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError));
4425 
4426 #ifndef AARCH64
4427     // integer division used both by interpreter and compiler
4428     StubRoutines::Arm::_idiv_irem_entry = generate_idiv_irem();
4429 
4430     StubRoutines::_atomic_add_entry = generate_atomic_add();
4431     StubRoutines::_atomic_xchg_entry = generate_atomic_xchg();
4432     StubRoutines::_atomic_cmpxchg_entry = generate_atomic_cmpxchg();
4433     StubRoutines::_atomic_cmpxchg_long_entry = generate_atomic_cmpxchg_long();
4434     StubRoutines::_atomic_load_long_entry = generate_atomic_load_long();
4435     StubRoutines::_atomic_store_long_entry = generate_atomic_store_long();
4436 #endif // !AARCH64
4437   }
4438 
4439   void generate_all() {
4440     // Generates the remaining stubs and initializes their entry points
4441 
4442 #ifdef COMPILER2
4443     // Generate partial_subtype_check first here since its code depends on
4444     // UseZeroBaseCompressedOops which is defined after heap initialization.
4445     StubRoutines::Arm::_partial_subtype_check                = generate_partial_subtype_check();
4446 #endif
4447     // These entry points require SharedInfo::stack0 to be set up in non-core builds
4448     // and need to be relocatable, so they each fabricate a RuntimeStub internally.
4449     StubRoutines::_throw_AbstractMethodError_entry         = generate_throw_exception("AbstractMethodError throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError));
4450     StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError));
4451     StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call));
4452 
4453     //------------------------------------------------------------------------------------------------------------------------
4454     // entry points that are platform specific
4455 
4456     // support for verify_oop (must happen after universe_init)
4457     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
4458 
4459     // arraycopy stubs used by compilers
4460     generate_arraycopy_stubs();
4461 
4462     // Safefetch stubs.
4463     generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
4464                                                    &StubRoutines::_safefetch32_fault_pc,
4465                                                    &StubRoutines::_safefetch32_continuation_pc);
4466 #ifdef AARCH64
4467     generate_safefetch("SafeFetchN", wordSize, &StubRoutines::_safefetchN_entry,
4468                                                &StubRoutines::_safefetchN_fault_pc,
4469                                                &StubRoutines::_safefetchN_continuation_pc);
4470 #ifdef COMPILER2
4471     if (UseAESIntrinsics) {
4472       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
4473       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
4474       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
4475       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
4476     }
4477 #endif
4478 #else
4479     assert(sizeof(int) == wordSize, "32-bit architecture");
4480     StubRoutines::_safefetchN_entry           = StubRoutines::_safefetch32_entry;
4481     StubRoutines::_safefetchN_fault_pc        = StubRoutines::_safefetch32_fault_pc;
4482     StubRoutines::_safefetchN_continuation_pc = StubRoutines::_safefetch32_continuation_pc;
4483 #endif // AARCH64
4484 
4485 #ifdef COMPILE_CRYPTO
4486     // generate AES intrinsics code
4487     if (UseAESIntrinsics) {
4488       aes_init();
4489       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
4490       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
4491       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
4492       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
4493     }
4494 #endif // COMPILE_CRYPTO
4495   }
4496 
4497 
4498  public:
4499   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
4500     if (all) {
4501       generate_all();
4502     } else {
4503       generate_initial();
4504     }
4505   }
4506 }; // end class declaration
4507 
4508 void StubGenerator_generate(CodeBuffer* code, bool all) {
4509   StubGenerator g(code, all);
4510 }