1 /* 2 * Copyright (c) 2008, 2018, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include "precompiled.hpp" 26 #include "asm/assembler.hpp" 27 #include "assembler_arm.inline.hpp" 28 #include "gc/shared/cardTable.hpp" 29 #include "gc/shared/cardTableBarrierSet.hpp" 30 #include "interpreter/interpreter.hpp" 31 #include "nativeInst_arm.hpp" 32 #include "oops/instanceOop.hpp" 33 #include "oops/method.hpp" 34 #include "oops/objArrayKlass.hpp" 35 #include "oops/oop.inline.hpp" 36 #include "prims/methodHandles.hpp" 37 #include "runtime/frame.inline.hpp" 38 #include "runtime/handles.inline.hpp" 39 #include "runtime/sharedRuntime.hpp" 40 #include "runtime/stubCodeGenerator.hpp" 41 #include "runtime/stubRoutines.hpp" 42 #include "utilities/align.hpp" 43 #ifdef COMPILER2 44 #include "opto/runtime.hpp" 45 #endif 46 47 // Declaration and definition of StubGenerator (no .hpp file). 48 // For a more detailed description of the stub routine structure 49 // see the comment in stubRoutines.hpp 50 51 #define __ _masm-> 52 53 #ifdef PRODUCT 54 #define BLOCK_COMMENT(str) /* nothing */ 55 #else 56 #define BLOCK_COMMENT(str) __ block_comment(str) 57 #endif 58 59 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 60 61 // ------------------------------------------------------------------------------------------------------------------------- 62 // Stub Code definitions 63 64 // Platform dependent parameters for array copy stubs 65 66 // Note: we have noticed a huge change in behavior on a microbenchmark 67 // from platform to platform depending on the configuration. 68 69 // Instead of adding a series of command line options (which 70 // unfortunately have to be done in the shared file and cannot appear 71 // only in the ARM port), the tested result are hard-coded here in a set 72 // of options, selected by specifying 'ArmCopyPlatform' 73 74 // Currently, this 'platform' is hardcoded to a value that is a good 75 // enough trade-off. However, one can easily modify this file to test 76 // the hard-coded configurations or create new ones. If the gain is 77 // significant, we could decide to either add command line options or 78 // add code to automatically choose a configuration. 
79 80 // see comments below for the various configurations created 81 #define DEFAULT_ARRAYCOPY_CONFIG 0 82 #define TEGRA2_ARRAYCOPY_CONFIG 1 83 #define IMX515_ARRAYCOPY_CONFIG 2 84 85 // Hard coded choices (XXX: could be changed to a command line option) 86 #define ArmCopyPlatform DEFAULT_ARRAYCOPY_CONFIG 87 88 #ifdef AARCH64 89 #define ArmCopyCacheLineSize 64 90 #else 91 #define ArmCopyCacheLineSize 32 // not worth optimizing to 64 according to measured gains 92 #endif // AARCH64 93 94 // TODO-AARCH64: tune and revise AArch64 arraycopy optimizations 95 96 // configuration for each kind of loop 97 typedef struct { 98 int pld_distance; // prefetch distance (0 => no prefetch, <0: prefetch_before); 99 #ifndef AARCH64 100 bool split_ldm; // if true, split each STM in STMs with fewer registers 101 bool split_stm; // if true, split each LTM in LTMs with fewer registers 102 #endif // !AARCH64 103 } arraycopy_loop_config; 104 105 // configuration for all loops 106 typedef struct { 107 // const char *description; 108 arraycopy_loop_config forward_aligned; 109 arraycopy_loop_config backward_aligned; 110 arraycopy_loop_config forward_shifted; 111 arraycopy_loop_config backward_shifted; 112 } arraycopy_platform_config; 113 114 // configured platforms 115 static arraycopy_platform_config arraycopy_configurations[] = { 116 // configuration parameters for arraycopy loops 117 #ifdef AARCH64 118 { 119 {-256 }, // forward aligned 120 {-128 }, // backward aligned 121 {-256 }, // forward shifted 122 {-128 } // backward shifted 123 } 124 #else 125 126 // Configurations were chosen based on manual analysis of benchmark 127 // results, minimizing overhead with respect to best results on the 128 // different test cases. 129 130 // Prefetch before is always favored since it avoids dirtying the 131 // cache uselessly for small copies. Code for prefetch after has 132 // been kept in case the difference is significant for some 133 // platforms but we might consider dropping it. 134 135 // distance, ldm, stm 136 { 137 // default: tradeoff tegra2/imx515/nv-tegra2, 138 // Notes on benchmarking: 139 // - not far from optimal configuration on nv-tegra2 140 // - within 5% of optimal configuration except for backward aligned on IMX 141 // - up to 40% from optimal configuration for backward shifted and backward align for tegra2 142 // but still on par with the operating system copy 143 {-256, true, true }, // forward aligned 144 {-256, true, true }, // backward aligned 145 {-256, false, false }, // forward shifted 146 {-256, true, true } // backward shifted 147 }, 148 { 149 // configuration tuned on tegra2-4. 150 // Warning: should not be used on nv-tegra2 ! 151 // Notes: 152 // - prefetch after gives 40% gain on backward copies on tegra2-4, 153 // resulting in better number than the operating system 154 // copy. However, this can lead to a 300% loss on nv-tegra and has 155 // more impact on the cache (fetches futher than what is 156 // copied). Use this configuration with care, in case it improves 157 // reference benchmarks. 
158 {-256, true, true }, // forward aligned 159 {96, false, false }, // backward aligned 160 {-256, false, false }, // forward shifted 161 {96, false, false } // backward shifted 162 }, 163 { 164 // configuration tuned on imx515 165 // Notes: 166 // - smaller prefetch distance is sufficient to get good result and might be more stable 167 // - refined backward aligned options within 5% of optimal configuration except for 168 // tests were the arrays fit in the cache 169 {-160, false, false }, // forward aligned 170 {-160, false, false }, // backward aligned 171 {-160, false, false }, // forward shifted 172 {-160, true, true } // backward shifted 173 } 174 #endif // AARCH64 175 }; 176 177 class StubGenerator: public StubCodeGenerator { 178 179 #ifdef PRODUCT 180 #define inc_counter_np(a,b,c) ((void)0) 181 #else 182 #define inc_counter_np(counter, t1, t2) \ 183 BLOCK_COMMENT("inc_counter " #counter); \ 184 __ inc_counter(&counter, t1, t2); 185 #endif 186 187 private: 188 189 address generate_call_stub(address& return_address) { 190 StubCodeMark mark(this, "StubRoutines", "call_stub"); 191 address start = __ pc(); 192 193 #ifdef AARCH64 194 const int saved_regs_size = 192; 195 196 __ stp(FP, LR, Address(SP, -saved_regs_size, pre_indexed)); 197 __ mov(FP, SP); 198 199 int sp_offset = 16; 200 assert(frame::entry_frame_call_wrapper_offset * wordSize == sp_offset, "adjust this code"); 201 __ stp(R0, ZR, Address(SP, sp_offset)); sp_offset += 16; 202 203 const int saved_result_and_result_type_offset = sp_offset; 204 __ stp(R1, R2, Address(SP, sp_offset)); sp_offset += 16; 205 __ stp(R19, R20, Address(SP, sp_offset)); sp_offset += 16; 206 __ stp(R21, R22, Address(SP, sp_offset)); sp_offset += 16; 207 __ stp(R23, R24, Address(SP, sp_offset)); sp_offset += 16; 208 __ stp(R25, R26, Address(SP, sp_offset)); sp_offset += 16; 209 __ stp(R27, R28, Address(SP, sp_offset)); sp_offset += 16; 210 211 __ stp_d(V8, V9, Address(SP, sp_offset)); sp_offset += 16; 212 __ stp_d(V10, V11, Address(SP, sp_offset)); sp_offset += 16; 213 __ stp_d(V12, V13, Address(SP, sp_offset)); sp_offset += 16; 214 __ stp_d(V14, V15, Address(SP, sp_offset)); sp_offset += 16; 215 assert (sp_offset == saved_regs_size, "adjust this code"); 216 217 __ mov(Rmethod, R3); 218 __ mov(Rthread, R7); 219 __ reinit_heapbase(); 220 221 { // Pass parameters 222 Label done_parameters, pass_parameters; 223 224 __ mov(Rparams, SP); 225 __ cbz_w(R6, done_parameters); 226 227 __ sub(Rtemp, SP, R6, ex_uxtw, LogBytesPerWord); 228 __ align_reg(SP, Rtemp, StackAlignmentInBytes); 229 __ add(Rparams, SP, R6, ex_uxtw, LogBytesPerWord); 230 231 __ bind(pass_parameters); 232 __ subs_w(R6, R6, 1); 233 __ ldr(Rtemp, Address(R5, wordSize, post_indexed)); 234 __ str(Rtemp, Address(Rparams, -wordSize, pre_indexed)); 235 __ b(pass_parameters, ne); 236 237 __ bind(done_parameters); 238 239 #ifdef ASSERT 240 { 241 Label L; 242 __ cmp(SP, Rparams); 243 __ b(L, eq); 244 __ stop("SP does not match Rparams"); 245 __ bind(L); 246 } 247 #endif 248 } 249 250 __ mov(Rsender_sp, SP); 251 __ blr(R4); 252 return_address = __ pc(); 253 254 __ mov(SP, FP); 255 256 __ ldp(R1, R2, Address(SP, saved_result_and_result_type_offset)); 257 258 { // Handle return value 259 Label cont; 260 __ str(R0, Address(R1)); 261 262 __ cmp_w(R2, T_DOUBLE); 263 __ ccmp_w(R2, T_FLOAT, Assembler::flags_for_condition(eq), ne); 264 __ b(cont, ne); 265 266 __ str_d(V0, Address(R1)); 267 __ bind(cont); 268 } 269 270 sp_offset = saved_result_and_result_type_offset + 16; 271 __ ldp(R19, R20, Address(SP, 
sp_offset)); sp_offset += 16; 272 __ ldp(R21, R22, Address(SP, sp_offset)); sp_offset += 16; 273 __ ldp(R23, R24, Address(SP, sp_offset)); sp_offset += 16; 274 __ ldp(R25, R26, Address(SP, sp_offset)); sp_offset += 16; 275 __ ldp(R27, R28, Address(SP, sp_offset)); sp_offset += 16; 276 277 __ ldp_d(V8, V9, Address(SP, sp_offset)); sp_offset += 16; 278 __ ldp_d(V10, V11, Address(SP, sp_offset)); sp_offset += 16; 279 __ ldp_d(V12, V13, Address(SP, sp_offset)); sp_offset += 16; 280 __ ldp_d(V14, V15, Address(SP, sp_offset)); sp_offset += 16; 281 assert (sp_offset == saved_regs_size, "adjust this code"); 282 283 __ ldp(FP, LR, Address(SP, saved_regs_size, post_indexed)); 284 __ ret(); 285 286 #else // AARCH64 287 288 assert(frame::entry_frame_call_wrapper_offset == 0, "adjust this code"); 289 290 __ mov(Rtemp, SP); 291 __ push(RegisterSet(FP) | RegisterSet(LR)); 292 #ifndef __SOFTFP__ 293 __ fstmdbd(SP, FloatRegisterSet(D8, 8), writeback); 294 #endif 295 __ stmdb(SP, RegisterSet(R0, R2) | RegisterSet(R4, R6) | RegisterSet(R8, R10) | altFP_7_11, writeback); 296 __ mov(Rmethod, R3); 297 __ ldmia(Rtemp, RegisterSet(R1, R3) | Rthread); // stacked arguments 298 299 // XXX: TODO 300 // Would be better with respect to native tools if the following 301 // setting of FP was changed to conform to the native ABI, with FP 302 // pointing to the saved FP slot (and the corresponding modifications 303 // for entry_frame_call_wrapper_offset and frame::real_fp). 304 __ mov(FP, SP); 305 306 { 307 Label no_parameters, pass_parameters; 308 __ cmp(R3, 0); 309 __ b(no_parameters, eq); 310 311 __ bind(pass_parameters); 312 __ ldr(Rtemp, Address(R2, wordSize, post_indexed)); // Rtemp OK, unused and scratchable 313 __ subs(R3, R3, 1); 314 __ push(Rtemp); 315 __ b(pass_parameters, ne); 316 __ bind(no_parameters); 317 } 318 319 __ mov(Rsender_sp, SP); 320 __ blx(R1); 321 return_address = __ pc(); 322 323 __ add(SP, FP, wordSize); // Skip link to JavaCallWrapper 324 __ pop(RegisterSet(R2, R3)); 325 #ifndef __ABI_HARD__ 326 __ cmp(R3, T_LONG); 327 __ cmp(R3, T_DOUBLE, ne); 328 __ str(R0, Address(R2)); 329 __ str(R1, Address(R2, wordSize), eq); 330 #else 331 Label cont, l_float, l_double; 332 333 __ cmp(R3, T_DOUBLE); 334 __ b(l_double, eq); 335 336 __ cmp(R3, T_FLOAT); 337 __ b(l_float, eq); 338 339 __ cmp(R3, T_LONG); 340 __ str(R0, Address(R2)); 341 __ str(R1, Address(R2, wordSize), eq); 342 __ b(cont); 343 344 345 __ bind(l_double); 346 __ fstd(D0, Address(R2)); 347 __ b(cont); 348 349 __ bind(l_float); 350 __ fsts(S0, Address(R2)); 351 352 __ bind(cont); 353 #endif 354 355 __ pop(RegisterSet(R4, R6) | RegisterSet(R8, R10) | altFP_7_11); 356 #ifndef __SOFTFP__ 357 __ fldmiad(SP, FloatRegisterSet(D8, 8), writeback); 358 #endif 359 __ pop(RegisterSet(FP) | RegisterSet(PC)); 360 361 #endif // AARCH64 362 return start; 363 } 364 365 366 // (in) Rexception_obj: exception oop 367 address generate_catch_exception() { 368 StubCodeMark mark(this, "StubRoutines", "catch_exception"); 369 address start = __ pc(); 370 371 __ str(Rexception_obj, Address(Rthread, Thread::pending_exception_offset())); 372 __ b(StubRoutines::_call_stub_return_address); 373 374 return start; 375 } 376 377 378 // (in) Rexception_pc: return address 379 address generate_forward_exception() { 380 StubCodeMark mark(this, "StubRoutines", "forward exception"); 381 address start = __ pc(); 382 383 __ mov(c_rarg0, Rthread); 384 __ mov(c_rarg1, Rexception_pc); 385 __ call_VM_leaf(CAST_FROM_FN_PTR(address, 386 SharedRuntime::exception_handler_for_return_address), 
387 c_rarg0, c_rarg1); 388 __ ldr(Rexception_obj, Address(Rthread, Thread::pending_exception_offset())); 389 const Register Rzero = __ zero_register(Rtemp); // Rtemp OK (cleared by above call) 390 __ str(Rzero, Address(Rthread, Thread::pending_exception_offset())); 391 392 #ifdef ASSERT 393 // make sure exception is set 394 { Label L; 395 __ cbnz(Rexception_obj, L); 396 __ stop("StubRoutines::forward exception: no pending exception (2)"); 397 __ bind(L); 398 } 399 #endif 400 401 // Verify that there is really a valid exception in RAX. 402 __ verify_oop(Rexception_obj); 403 404 __ jump(R0); // handler is returned in R0 by runtime function 405 return start; 406 } 407 408 409 #ifndef AARCH64 410 411 // Integer division shared routine 412 // Input: 413 // R0 - dividend 414 // R2 - divisor 415 // Output: 416 // R0 - remainder 417 // R1 - quotient 418 // Destroys: 419 // R2 420 // LR 421 address generate_idiv_irem() { 422 Label positive_arguments, negative_or_zero, call_slow_path; 423 Register dividend = R0; 424 Register divisor = R2; 425 Register remainder = R0; 426 Register quotient = R1; 427 Register tmp = LR; 428 assert(dividend == remainder, "must be"); 429 430 address start = __ pc(); 431 432 // Check for special cases: divisor <= 0 or dividend < 0 433 __ cmp(divisor, 0); 434 __ orrs(quotient, dividend, divisor, ne); 435 __ b(negative_or_zero, le); 436 437 __ bind(positive_arguments); 438 // Save return address on stack to free one extra register 439 __ push(LR); 440 // Approximate the mamximum order of the quotient 441 __ clz(tmp, dividend); 442 __ clz(quotient, divisor); 443 __ subs(tmp, quotient, tmp); 444 __ mov(quotient, 0); 445 // Jump to the appropriate place in the unrolled loop below 446 __ ldr(PC, Address(PC, tmp, lsl, 2), pl); 447 // If divisor is greater than dividend, return immediately 448 __ pop(PC); 449 450 // Offset table 451 Label offset_table[32]; 452 int i; 453 for (i = 0; i <= 31; i++) { 454 __ emit_address(offset_table[i]); 455 } 456 457 // Unrolled loop of 32 division steps 458 for (i = 31; i >= 0; i--) { 459 __ bind(offset_table[i]); 460 __ cmp(remainder, AsmOperand(divisor, lsl, i)); 461 __ sub(remainder, remainder, AsmOperand(divisor, lsl, i), hs); 462 __ add(quotient, quotient, 1 << i, hs); 463 } 464 __ pop(PC); 465 466 __ bind(negative_or_zero); 467 // Find the combination of argument signs and jump to corresponding handler 468 __ andr(quotient, dividend, 0x80000000, ne); 469 __ orr(quotient, quotient, AsmOperand(divisor, lsr, 31), ne); 470 __ add(PC, PC, AsmOperand(quotient, ror, 26), ne); 471 __ str(LR, Address(Rthread, JavaThread::saved_exception_pc_offset())); 472 473 // The leaf runtime function can destroy R0-R3 and R12 registers which are still alive 474 RegisterSet saved_registers = RegisterSet(R3) | RegisterSet(R12); 475 #if R9_IS_SCRATCHED 476 // Safer to save R9 here since callers may have been written 477 // assuming R9 survives. This is suboptimal but may not be worth 478 // revisiting for this slow case. 
479 480 // save also R10 for alignment 481 saved_registers = saved_registers | RegisterSet(R9, R10); 482 #endif 483 { 484 // divisor == 0 485 FixedSizeCodeBlock zero_divisor(_masm, 8, true); 486 __ push(saved_registers); 487 __ mov(R0, Rthread); 488 __ mov(R1, LR); 489 __ mov(R2, SharedRuntime::IMPLICIT_DIVIDE_BY_ZERO); 490 __ b(call_slow_path); 491 } 492 493 { 494 // divisor > 0 && dividend < 0 495 FixedSizeCodeBlock positive_divisor_negative_dividend(_masm, 8, true); 496 __ push(LR); 497 __ rsb(dividend, dividend, 0); 498 __ bl(positive_arguments); 499 __ rsb(remainder, remainder, 0); 500 __ rsb(quotient, quotient, 0); 501 __ pop(PC); 502 } 503 504 { 505 // divisor < 0 && dividend > 0 506 FixedSizeCodeBlock negative_divisor_positive_dividend(_masm, 8, true); 507 __ push(LR); 508 __ rsb(divisor, divisor, 0); 509 __ bl(positive_arguments); 510 __ rsb(quotient, quotient, 0); 511 __ pop(PC); 512 } 513 514 { 515 // divisor < 0 && dividend < 0 516 FixedSizeCodeBlock negative_divisor_negative_dividend(_masm, 8, true); 517 __ push(LR); 518 __ rsb(dividend, dividend, 0); 519 __ rsb(divisor, divisor, 0); 520 __ bl(positive_arguments); 521 __ rsb(remainder, remainder, 0); 522 __ pop(PC); 523 } 524 525 __ bind(call_slow_path); 526 __ call(CAST_FROM_FN_PTR(address, SharedRuntime::continuation_for_implicit_exception)); 527 __ pop(saved_registers); 528 __ bx(R0); 529 530 return start; 531 } 532 533 534 // As per atomic.hpp the Atomic read-modify-write operations must be logically implemented as: 535 // <fence>; <op>; <membar StoreLoad|StoreStore> 536 // But for load-linked/store-conditional based systems a fence here simply means 537 // no load/store can be reordered with respect to the initial load-linked, so we have: 538 // <membar storeload|loadload> ; load-linked; <op>; store-conditional; <membar storeload|storestore> 539 // There are no memory actions in <op> so nothing further is needed. 540 // 541 // So we define the following for convenience: 542 #define MEMBAR_ATOMIC_OP_PRE \ 543 MacroAssembler::Membar_mask_bits(MacroAssembler::StoreLoad|MacroAssembler::LoadLoad) 544 #define MEMBAR_ATOMIC_OP_POST \ 545 MacroAssembler::Membar_mask_bits(MacroAssembler::StoreLoad|MacroAssembler::StoreStore) 546 547 // Note: JDK 9 only supports ARMv7+ so we always have ldrexd available even though the 548 // code below allows for it to be otherwise. The else clause indicates an ARMv5 system 549 // for which we do not support MP and so membars are not necessary. This ARMv5 code will 550 // be removed in the future. 
551 552 // Support for jint Atomic::add(jint add_value, volatile jint *dest) 553 // 554 // Arguments : 555 // 556 // add_value: R0 557 // dest: R1 558 // 559 // Results: 560 // 561 // R0: the new stored in dest 562 // 563 // Overwrites: 564 // 565 // R1, R2, R3 566 // 567 address generate_atomic_add() { 568 address start; 569 570 StubCodeMark mark(this, "StubRoutines", "atomic_add"); 571 Label retry; 572 start = __ pc(); 573 Register addval = R0; 574 Register dest = R1; 575 Register prev = R2; 576 Register ok = R2; 577 Register newval = R3; 578 579 if (VM_Version::supports_ldrex()) { 580 __ membar(MEMBAR_ATOMIC_OP_PRE, prev); 581 __ bind(retry); 582 __ ldrex(newval, Address(dest)); 583 __ add(newval, addval, newval); 584 __ strex(ok, newval, Address(dest)); 585 __ cmp(ok, 0); 586 __ b(retry, ne); 587 __ mov (R0, newval); 588 __ membar(MEMBAR_ATOMIC_OP_POST, prev); 589 } else { 590 __ bind(retry); 591 __ ldr (prev, Address(dest)); 592 __ add(newval, addval, prev); 593 __ atomic_cas_bool(prev, newval, dest, 0, noreg/*ignored*/); 594 __ b(retry, ne); 595 __ mov (R0, newval); 596 } 597 __ bx(LR); 598 599 return start; 600 } 601 602 // Support for jint Atomic::xchg(jint exchange_value, volatile jint *dest) 603 // 604 // Arguments : 605 // 606 // exchange_value: R0 607 // dest: R1 608 // 609 // Results: 610 // 611 // R0: the value previously stored in dest 612 // 613 // Overwrites: 614 // 615 // R1, R2, R3 616 // 617 address generate_atomic_xchg() { 618 address start; 619 620 StubCodeMark mark(this, "StubRoutines", "atomic_xchg"); 621 start = __ pc(); 622 Register newval = R0; 623 Register dest = R1; 624 Register prev = R2; 625 626 Label retry; 627 628 if (VM_Version::supports_ldrex()) { 629 Register ok=R3; 630 __ membar(MEMBAR_ATOMIC_OP_PRE, prev); 631 __ bind(retry); 632 __ ldrex(prev, Address(dest)); 633 __ strex(ok, newval, Address(dest)); 634 __ cmp(ok, 0); 635 __ b(retry, ne); 636 __ mov (R0, prev); 637 __ membar(MEMBAR_ATOMIC_OP_POST, prev); 638 } else { 639 __ bind(retry); 640 __ ldr (prev, Address(dest)); 641 __ atomic_cas_bool(prev, newval, dest, 0, noreg/*ignored*/); 642 __ b(retry, ne); 643 __ mov (R0, prev); 644 } 645 __ bx(LR); 646 647 return start; 648 } 649 650 // Support for jint Atomic::cmpxchg(jint exchange_value, volatile jint *dest, jint compare_value) 651 // 652 // Arguments : 653 // 654 // compare_value: R0 655 // exchange_value: R1 656 // dest: R2 657 // 658 // Results: 659 // 660 // R0: the value previously stored in dest 661 // 662 // Overwrites: 663 // 664 // R0, R1, R2, R3, Rtemp 665 // 666 address generate_atomic_cmpxchg() { 667 address start; 668 669 StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg"); 670 start = __ pc(); 671 Register cmp = R0; 672 Register newval = R1; 673 Register dest = R2; 674 Register temp1 = R3; 675 Register temp2 = Rtemp; // Rtemp free (native ABI) 676 677 __ membar(MEMBAR_ATOMIC_OP_PRE, temp1); 678 679 // atomic_cas returns previous value in R0 680 __ atomic_cas(temp1, temp2, cmp, newval, dest, 0); 681 682 __ membar(MEMBAR_ATOMIC_OP_POST, temp1); 683 684 __ bx(LR); 685 686 return start; 687 } 688 689 // Support for jlong Atomic::cmpxchg(jlong exchange_value, volatile jlong *dest, jlong compare_value) 690 // reordered before by a wrapper to (jlong compare_value, jlong exchange_value, volatile jlong *dest) 691 // 692 // Arguments : 693 // 694 // compare_value: R1 (High), R0 (Low) 695 // exchange_value: R3 (High), R2 (Low) 696 // dest: SP+0 697 // 698 // Results: 699 // 700 // R0:R1: the value previously stored in dest 701 // 702 // 
Overwrites: 703 // 704 address generate_atomic_cmpxchg_long() { 705 address start; 706 707 StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_long"); 708 start = __ pc(); 709 Register cmp_lo = R0; 710 Register cmp_hi = R1; 711 Register newval_lo = R2; 712 Register newval_hi = R3; 713 Register addr = Rtemp; /* After load from stack */ 714 Register temp_lo = R4; 715 Register temp_hi = R5; 716 Register temp_result = R8; 717 assert_different_registers(cmp_lo, newval_lo, temp_lo, addr, temp_result, R7); 718 assert_different_registers(cmp_hi, newval_hi, temp_hi, addr, temp_result, R7); 719 720 __ membar(MEMBAR_ATOMIC_OP_PRE, Rtemp); // Rtemp free (native ABI) 721 722 // Stack is unaligned, maintain double word alignment by pushing 723 // odd number of regs. 724 __ push(RegisterSet(temp_result) | RegisterSet(temp_lo, temp_hi)); 725 __ ldr(addr, Address(SP, 12)); 726 727 // atomic_cas64 returns previous value in temp_lo, temp_hi 728 __ atomic_cas64(temp_lo, temp_hi, temp_result, cmp_lo, cmp_hi, 729 newval_lo, newval_hi, addr, 0); 730 __ mov(R0, temp_lo); 731 __ mov(R1, temp_hi); 732 733 __ pop(RegisterSet(temp_result) | RegisterSet(temp_lo, temp_hi)); 734 735 __ membar(MEMBAR_ATOMIC_OP_POST, Rtemp); // Rtemp free (native ABI) 736 __ bx(LR); 737 738 return start; 739 } 740 741 address generate_atomic_load_long() { 742 address start; 743 744 StubCodeMark mark(this, "StubRoutines", "atomic_load_long"); 745 start = __ pc(); 746 Register result_lo = R0; 747 Register result_hi = R1; 748 Register src = R0; 749 750 if (!os::is_MP()) { 751 __ ldmia(src, RegisterSet(result_lo, result_hi)); 752 __ bx(LR); 753 } else if (VM_Version::supports_ldrexd()) { 754 __ ldrexd(result_lo, Address(src)); 755 __ clrex(); // FIXME: safe to remove? 756 __ bx(LR); 757 } else { 758 __ stop("Atomic load(jlong) unsupported on this platform"); 759 __ bx(LR); 760 } 761 762 return start; 763 } 764 765 address generate_atomic_store_long() { 766 address start; 767 768 StubCodeMark mark(this, "StubRoutines", "atomic_store_long"); 769 start = __ pc(); 770 Register newval_lo = R0; 771 Register newval_hi = R1; 772 Register dest = R2; 773 Register scratch_lo = R2; 774 Register scratch_hi = R3; /* After load from stack */ 775 Register result = R3; 776 777 if (!os::is_MP()) { 778 __ stmia(dest, RegisterSet(newval_lo, newval_hi)); 779 __ bx(LR); 780 } else if (VM_Version::supports_ldrexd()) { 781 __ mov(Rtemp, dest); // get dest to Rtemp 782 Label retry; 783 __ bind(retry); 784 __ ldrexd(scratch_lo, Address(Rtemp)); 785 __ strexd(result, R0, Address(Rtemp)); 786 __ rsbs(result, result, 1); 787 __ b(retry, eq); 788 __ bx(LR); 789 } else { 790 __ stop("Atomic store(jlong) unsupported on this platform"); 791 __ bx(LR); 792 } 793 794 return start; 795 } 796 797 798 #endif // AARCH64 799 800 #ifdef COMPILER2 801 // Support for uint StubRoutine::Arm::partial_subtype_check( Klass sub, Klass super ); 802 // Arguments : 803 // 804 // ret : R0, returned 805 // icc/xcc: set as R0 (depending on wordSize) 806 // sub : R1, argument, not changed 807 // super: R2, argument, not changed 808 // raddr: LR, blown by call 809 address generate_partial_subtype_check() { 810 __ align(CodeEntryAlignment); 811 StubCodeMark mark(this, "StubRoutines", "partial_subtype_check"); 812 address start = __ pc(); 813 814 // based on SPARC check_klass_subtype_[fast|slow]_path (without CompressedOops) 815 816 // R0 used as tmp_reg (in addition to return reg) 817 Register sub_klass = R1; 818 Register super_klass = R2; 819 Register tmp_reg2 = R3; 820 Register tmp_reg3 = R4; 
821 #define saved_set tmp_reg2, tmp_reg3 822 823 Label L_loop, L_fail; 824 825 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 826 827 // fast check should be redundant 828 829 // slow check 830 { 831 __ raw_push(saved_set); 832 833 // a couple of useful fields in sub_klass: 834 int ss_offset = in_bytes(Klass::secondary_supers_offset()); 835 836 // Do a linear scan of the secondary super-klass chain. 837 // This code is rarely used, so simplicity is a virtue here. 838 839 inc_counter_np(SharedRuntime::_partial_subtype_ctr, tmp_reg2, tmp_reg3); 840 841 Register scan_temp = tmp_reg2; 842 Register count_temp = tmp_reg3; 843 844 // We will consult the secondary-super array. 845 __ ldr(scan_temp, Address(sub_klass, ss_offset)); 846 847 Register search_key = super_klass; 848 849 // Load the array length. 850 __ ldr_s32(count_temp, Address(scan_temp, Array<Klass*>::length_offset_in_bytes())); 851 __ add(scan_temp, scan_temp, Array<Klass*>::base_offset_in_bytes()); 852 853 __ add(count_temp, count_temp, 1); 854 855 // Top of search loop 856 __ bind(L_loop); 857 // Notes: 858 // scan_temp starts at the array elements 859 // count_temp is 1+size 860 __ subs(count_temp, count_temp, 1); 861 __ b(L_fail, eq); // not found in the array 862 863 // Load next super to check 864 // In the array of super classes elements are pointer sized. 865 int element_size = wordSize; 866 __ ldr(R0, Address(scan_temp, element_size, post_indexed)); 867 868 // Look for Rsuper_klass on Rsub_klass's secondary super-class-overflow list 869 __ subs(R0, R0, search_key); // set R0 to 0 on success (and flags to eq) 870 871 // A miss means we are NOT a subtype and need to keep looping 872 __ b(L_loop, ne); 873 874 // Falling out the bottom means we found a hit; we ARE a subtype 875 876 // Success. Cache the super we found and proceed in triumph. 877 __ str(super_klass, Address(sub_klass, sc_offset)); 878 879 // Return success 880 // R0 is already 0 and flags are already set to eq 881 __ raw_pop(saved_set); 882 __ ret(); 883 884 // Return failure 885 __ bind(L_fail); 886 #ifdef AARCH64 887 // count_temp is 0, can't use ZR here 888 __ adds(R0, count_temp, 1); // sets the flags 889 #else 890 __ movs(R0, 1); // sets the flags 891 #endif 892 __ raw_pop(saved_set); 893 __ ret(); 894 } 895 return start; 896 } 897 #undef saved_set 898 #endif // COMPILER2 899 900 901 //---------------------------------------------------------------------------------------------------- 902 // Non-destructive plausibility checks for oops 903 904 address generate_verify_oop() { 905 StubCodeMark mark(this, "StubRoutines", "verify_oop"); 906 address start = __ pc(); 907 908 // Incoming arguments: 909 // 910 // R0: error message (char* ) 911 // R1: address of register save area 912 // R2: oop to verify 913 // 914 // All registers are saved before calling this stub. However, condition flags should be saved here. 
915 916 const Register oop = R2; 917 const Register klass = R3; 918 const Register tmp1 = R6; 919 const Register tmp2 = R8; 920 921 const Register flags = Rtmp_save0; // R4/R19 922 const Register ret_addr = Rtmp_save1; // R5/R20 923 assert_different_registers(oop, klass, tmp1, tmp2, flags, ret_addr, R7); 924 925 Label exit, error; 926 InlinedAddress verify_oop_count((address) StubRoutines::verify_oop_count_addr()); 927 928 #ifdef AARCH64 929 __ mrs(flags, Assembler::SysReg_NZCV); 930 #else 931 __ mrs(Assembler::CPSR, flags); 932 #endif // AARCH64 933 934 __ ldr_literal(tmp1, verify_oop_count); 935 __ ldr_s32(tmp2, Address(tmp1)); 936 __ add(tmp2, tmp2, 1); 937 __ str_32(tmp2, Address(tmp1)); 938 939 // make sure object is 'reasonable' 940 __ cbz(oop, exit); // if obj is NULL it is ok 941 942 // Check if the oop is in the right area of memory 943 // Note: oop_mask and oop_bits must be updated if the code is saved/reused 944 const address oop_mask = (address) Universe::verify_oop_mask(); 945 const address oop_bits = (address) Universe::verify_oop_bits(); 946 __ mov_address(tmp1, oop_mask, symbolic_Relocation::oop_mask_reference); 947 __ andr(tmp2, oop, tmp1); 948 __ mov_address(tmp1, oop_bits, symbolic_Relocation::oop_bits_reference); 949 __ cmp(tmp2, tmp1); 950 __ b(error, ne); 951 952 // make sure klass is 'reasonable' 953 __ load_klass(klass, oop); // get klass 954 __ cbz(klass, error); // if klass is NULL it is broken 955 956 // return if everything seems ok 957 __ bind(exit); 958 959 #ifdef AARCH64 960 __ msr(Assembler::SysReg_NZCV, flags); 961 #else 962 __ msr(Assembler::CPSR_f, flags); 963 #endif // AARCH64 964 965 __ ret(); 966 967 // handle errors 968 __ bind(error); 969 970 __ mov(ret_addr, LR); // save return address 971 972 // R0: error message 973 // R1: register save area 974 __ call(CAST_FROM_FN_PTR(address, MacroAssembler::debug)); 975 976 __ mov(LR, ret_addr); 977 __ b(exit); 978 979 __ bind_literal(verify_oop_count); 980 981 return start; 982 } 983 984 //---------------------------------------------------------------------------------------------------- 985 // Array copy stubs 986 987 // 988 // Generate overlap test for array copy stubs 989 // 990 // Input: 991 // R0 - array1 992 // R1 - array2 993 // R2 - element count, 32-bit int 994 // 995 // input registers are preserved 996 // 997 void array_overlap_test(address no_overlap_target, int log2_elem_size, Register tmp1, Register tmp2) { 998 assert(no_overlap_target != NULL, "must be generated"); 999 array_overlap_test(no_overlap_target, NULL, log2_elem_size, tmp1, tmp2); 1000 } 1001 void array_overlap_test(Label& L_no_overlap, int log2_elem_size, Register tmp1, Register tmp2) { 1002 array_overlap_test(NULL, &L_no_overlap, log2_elem_size, tmp1, tmp2); 1003 } 1004 void array_overlap_test(address no_overlap_target, Label* NOLp, int log2_elem_size, Register tmp1, Register tmp2) { 1005 const Register from = R0; 1006 const Register to = R1; 1007 const Register count = R2; 1008 const Register to_from = tmp1; // to - from 1009 #ifndef AARCH64 1010 const Register byte_count = (log2_elem_size == 0) ? 
count : tmp2; // count << log2_elem_size 1011 #endif // AARCH64 1012 assert_different_registers(from, to, count, tmp1, tmp2); 1013 1014 // no_overlap version works if 'to' lower (unsigned) than 'from' 1015 // and or 'to' more than (count*size) from 'from' 1016 1017 BLOCK_COMMENT("Array Overlap Test:"); 1018 __ subs(to_from, to, from); 1019 #ifndef AARCH64 1020 if (log2_elem_size != 0) { 1021 __ mov(byte_count, AsmOperand(count, lsl, log2_elem_size)); 1022 } 1023 #endif // !AARCH64 1024 if (NOLp == NULL) 1025 __ b(no_overlap_target,lo); 1026 else 1027 __ b((*NOLp), lo); 1028 #ifdef AARCH64 1029 __ subs(ZR, to_from, count, ex_sxtw, log2_elem_size); 1030 #else 1031 __ cmp(to_from, byte_count); 1032 #endif // AARCH64 1033 if (NOLp == NULL) 1034 __ b(no_overlap_target, ge); 1035 else 1036 __ b((*NOLp), ge); 1037 } 1038 1039 #ifdef AARCH64 1040 // TODO-AARCH64: revise usages of bulk_* methods (probably ldp`s and stp`s should interlace) 1041 1042 // Loads [from, from + count*wordSize) into regs[0], regs[1], ..., regs[count-1] 1043 // and increases 'from' by count*wordSize. 1044 void bulk_load_forward(Register from, const Register regs[], int count) { 1045 assert (count > 0 && count % 2 == 0, "count must be positive even number"); 1046 int bytes = count * wordSize; 1047 1048 int offset = 0; 1049 __ ldp(regs[0], regs[1], Address(from, bytes, post_indexed)); 1050 offset += 2*wordSize; 1051 1052 for (int i = 2; i < count; i += 2) { 1053 __ ldp(regs[i], regs[i+1], Address(from, -bytes + offset)); 1054 offset += 2*wordSize; 1055 } 1056 1057 assert (offset == bytes, "must be"); 1058 } 1059 1060 // Stores regs[0], regs[1], ..., regs[count-1] to [to, to + count*wordSize) 1061 // and increases 'to' by count*wordSize. 1062 void bulk_store_forward(Register to, const Register regs[], int count) { 1063 assert (count > 0 && count % 2 == 0, "count must be positive even number"); 1064 int bytes = count * wordSize; 1065 1066 int offset = 0; 1067 __ stp(regs[0], regs[1], Address(to, bytes, post_indexed)); 1068 offset += 2*wordSize; 1069 1070 for (int i = 2; i < count; i += 2) { 1071 __ stp(regs[i], regs[i+1], Address(to, -bytes + offset)); 1072 offset += 2*wordSize; 1073 } 1074 1075 assert (offset == bytes, "must be"); 1076 } 1077 1078 // Loads [from - count*wordSize, from) into regs[0], regs[1], ..., regs[count-1] 1079 // and decreases 'from' by count*wordSize. 1080 // Note that the word with lowest address goes to regs[0]. 1081 void bulk_load_backward(Register from, const Register regs[], int count) { 1082 assert (count > 0 && count % 2 == 0, "count must be positive even number"); 1083 int bytes = count * wordSize; 1084 1085 int offset = 0; 1086 1087 for (int i = count - 2; i > 0; i -= 2) { 1088 offset += 2*wordSize; 1089 __ ldp(regs[i], regs[i+1], Address(from, -offset)); 1090 } 1091 1092 offset += 2*wordSize; 1093 __ ldp(regs[0], regs[1], Address(from, -bytes, pre_indexed)); 1094 1095 assert (offset == bytes, "must be"); 1096 } 1097 1098 // Stores regs[0], regs[1], ..., regs[count-1] into [to - count*wordSize, to) 1099 // and decreases 'to' by count*wordSize. 1100 // Note that regs[0] value goes into the memory with lowest address. 
1101 void bulk_store_backward(Register to, const Register regs[], int count) { 1102 assert (count > 0 && count % 2 == 0, "count must be positive even number"); 1103 int bytes = count * wordSize; 1104 1105 int offset = 0; 1106 1107 for (int i = count - 2; i > 0; i -= 2) { 1108 offset += 2*wordSize; 1109 __ stp(regs[i], regs[i+1], Address(to, -offset)); 1110 } 1111 1112 offset += 2*wordSize; 1113 __ stp(regs[0], regs[1], Address(to, -bytes, pre_indexed)); 1114 1115 assert (offset == bytes, "must be"); 1116 } 1117 #endif // AARCH64 1118 1119 // TODO-AARCH64: rearrange in-loop prefetches: 1120 // probably we should choose between "prefetch-store before or after store", not "before or after load". 1121 void prefetch(Register from, Register to, int offset, int to_delta = 0) { 1122 __ prefetch_read(Address(from, offset)); 1123 #ifdef AARCH64 1124 // Next line commented out to avoid significant loss of performance in memory copy - JDK-8078120 1125 // __ prfm(pstl1keep, Address(to, offset + to_delta)); 1126 #endif // AARCH64 1127 } 1128 1129 // Generate the inner loop for forward aligned array copy 1130 // 1131 // Arguments 1132 // from: src address, 64 bits aligned 1133 // to: dst address, wordSize aligned 1134 // count: number of elements (32-bit int) 1135 // bytes_per_count: number of bytes for each unit of 'count' 1136 // 1137 // Return the minimum initial value for count 1138 // 1139 // Notes: 1140 // - 'from' aligned on 64-bit (recommended for 32-bit ARM in case this speeds up LDMIA, required for AArch64) 1141 // - 'to' aligned on wordSize 1142 // - 'count' must be greater or equal than the returned value 1143 // 1144 // Increases 'from' and 'to' by count*bytes_per_count. 1145 // 1146 // Scratches 'count', R3. 1147 // On AArch64 also scratches R4-R10; on 32-bit ARM R4-R10 are preserved (saved/restored). 1148 // 1149 int generate_forward_aligned_copy_loop(Register from, Register to, Register count, int bytes_per_count) { 1150 assert (from == R0 && to == R1 && count == R2, "adjust the implementation below"); 1151 1152 const int bytes_per_loop = 8*wordSize; // 8 registers are read and written on every loop iteration 1153 arraycopy_loop_config *config=&arraycopy_configurations[ArmCopyPlatform].forward_aligned; 1154 int pld_offset = config->pld_distance; 1155 const int count_per_loop = bytes_per_loop / bytes_per_count; 1156 1157 #ifndef AARCH64 1158 bool split_read= config->split_ldm; 1159 bool split_write= config->split_stm; 1160 1161 // XXX optim: use VLDM/VSTM when available (Neon) with PLD 1162 // NEONCopyPLD 1163 // PLD [r1, #0xC0] 1164 // VLDM r1!,{d0-d7} 1165 // VSTM r0!,{d0-d7} 1166 // SUBS r2,r2,#0x40 1167 // BGE NEONCopyPLD 1168 1169 __ push(RegisterSet(R4,R10)); 1170 #endif // !AARCH64 1171 1172 const bool prefetch_before = pld_offset < 0; 1173 const bool prefetch_after = pld_offset > 0; 1174 1175 Label L_skip_pld; 1176 1177 // predecrease to exit when there is less than count_per_loop 1178 __ sub_32(count, count, count_per_loop); 1179 1180 if (pld_offset != 0) { 1181 pld_offset = (pld_offset < 0) ? -pld_offset : pld_offset; 1182 1183 prefetch(from, to, 0); 1184 1185 if (prefetch_before) { 1186 // If prefetch is done ahead, final PLDs that overflow the 1187 // copied area can be easily avoided. 'count' is predecreased 1188 // by the prefetch distance to optimize the inner loop and the 1189 // outer loop skips the PLD. 
1190 __ subs_32(count, count, (bytes_per_loop+pld_offset)/bytes_per_count); 1191 1192 // skip prefetch for small copies 1193 __ b(L_skip_pld, lt); 1194 } 1195 1196 int offset = ArmCopyCacheLineSize; 1197 while (offset <= pld_offset) { 1198 prefetch(from, to, offset); 1199 offset += ArmCopyCacheLineSize; 1200 }; 1201 } 1202 1203 #ifdef AARCH64 1204 const Register data_regs[8] = {R3, R4, R5, R6, R7, R8, R9, R10}; 1205 #endif // AARCH64 1206 { 1207 // LDM (32-bit ARM) / LDP (AArch64) copy of 'bytes_per_loop' bytes 1208 1209 // 32-bit ARM note: we have tried implementing loop unrolling to skip one 1210 // PLD with 64 bytes cache line but the gain was not significant. 1211 1212 Label L_copy_loop; 1213 __ align(OptoLoopAlignment); 1214 __ BIND(L_copy_loop); 1215 1216 if (prefetch_before) { 1217 prefetch(from, to, bytes_per_loop + pld_offset); 1218 __ BIND(L_skip_pld); 1219 } 1220 1221 #ifdef AARCH64 1222 bulk_load_forward(from, data_regs, 8); 1223 #else 1224 if (split_read) { 1225 // Split the register set in two sets so that there is less 1226 // latency between LDM and STM (R3-R6 available while R7-R10 1227 // still loading) and less register locking issue when iterating 1228 // on the first LDM. 1229 __ ldmia(from, RegisterSet(R3, R6), writeback); 1230 __ ldmia(from, RegisterSet(R7, R10), writeback); 1231 } else { 1232 __ ldmia(from, RegisterSet(R3, R10), writeback); 1233 } 1234 #endif // AARCH64 1235 1236 __ subs_32(count, count, count_per_loop); 1237 1238 if (prefetch_after) { 1239 prefetch(from, to, pld_offset, bytes_per_loop); 1240 } 1241 1242 #ifdef AARCH64 1243 bulk_store_forward(to, data_regs, 8); 1244 #else 1245 if (split_write) { 1246 __ stmia(to, RegisterSet(R3, R6), writeback); 1247 __ stmia(to, RegisterSet(R7, R10), writeback); 1248 } else { 1249 __ stmia(to, RegisterSet(R3, R10), writeback); 1250 } 1251 #endif // AARCH64 1252 1253 __ b(L_copy_loop, ge); 1254 1255 if (prefetch_before) { 1256 // the inner loop may end earlier, allowing to skip PLD for the last iterations 1257 __ cmn_32(count, (bytes_per_loop + pld_offset)/bytes_per_count); 1258 __ b(L_skip_pld, ge); 1259 } 1260 } 1261 BLOCK_COMMENT("Remaining bytes:"); 1262 // still 0..bytes_per_loop-1 aligned bytes to copy, count already decreased by (at least) bytes_per_loop bytes 1263 1264 // __ add(count, count, ...); // addition useless for the bit tests 1265 assert (pld_offset % bytes_per_loop == 0, "decreasing count by pld_offset before loop must not change tested bits"); 1266 1267 #ifdef AARCH64 1268 assert (bytes_per_loop == 64, "adjust the code below"); 1269 assert (bytes_per_count <= 8, "adjust the code below"); 1270 1271 { 1272 Label L; 1273 __ tbz(count, exact_log2(32/bytes_per_count), L); 1274 1275 bulk_load_forward(from, data_regs, 4); 1276 bulk_store_forward(to, data_regs, 4); 1277 1278 __ bind(L); 1279 } 1280 1281 { 1282 Label L; 1283 __ tbz(count, exact_log2(16/bytes_per_count), L); 1284 1285 bulk_load_forward(from, data_regs, 2); 1286 bulk_store_forward(to, data_regs, 2); 1287 1288 __ bind(L); 1289 } 1290 1291 { 1292 Label L; 1293 __ tbz(count, exact_log2(8/bytes_per_count), L); 1294 1295 __ ldr(R3, Address(from, 8, post_indexed)); 1296 __ str(R3, Address(to, 8, post_indexed)); 1297 1298 __ bind(L); 1299 } 1300 1301 if (bytes_per_count <= 4) { 1302 Label L; 1303 __ tbz(count, exact_log2(4/bytes_per_count), L); 1304 1305 __ ldr_w(R3, Address(from, 4, post_indexed)); 1306 __ str_w(R3, Address(to, 4, post_indexed)); 1307 1308 __ bind(L); 1309 } 1310 1311 if (bytes_per_count <= 2) { 1312 Label L; 1313 __ tbz(count, 
exact_log2(2/bytes_per_count), L); 1314 1315 __ ldrh(R3, Address(from, 2, post_indexed)); 1316 __ strh(R3, Address(to, 2, post_indexed)); 1317 1318 __ bind(L); 1319 } 1320 1321 if (bytes_per_count <= 1) { 1322 Label L; 1323 __ tbz(count, 0, L); 1324 1325 __ ldrb(R3, Address(from, 1, post_indexed)); 1326 __ strb(R3, Address(to, 1, post_indexed)); 1327 1328 __ bind(L); 1329 } 1330 #else 1331 __ tst(count, 16 / bytes_per_count); 1332 __ ldmia(from, RegisterSet(R3, R6), writeback, ne); // copy 16 bytes 1333 __ stmia(to, RegisterSet(R3, R6), writeback, ne); 1334 1335 __ tst(count, 8 / bytes_per_count); 1336 __ ldmia(from, RegisterSet(R3, R4), writeback, ne); // copy 8 bytes 1337 __ stmia(to, RegisterSet(R3, R4), writeback, ne); 1338 1339 if (bytes_per_count <= 4) { 1340 __ tst(count, 4 / bytes_per_count); 1341 __ ldr(R3, Address(from, 4, post_indexed), ne); // copy 4 bytes 1342 __ str(R3, Address(to, 4, post_indexed), ne); 1343 } 1344 1345 if (bytes_per_count <= 2) { 1346 __ tst(count, 2 / bytes_per_count); 1347 __ ldrh(R3, Address(from, 2, post_indexed), ne); // copy 2 bytes 1348 __ strh(R3, Address(to, 2, post_indexed), ne); 1349 } 1350 1351 if (bytes_per_count == 1) { 1352 __ tst(count, 1); 1353 __ ldrb(R3, Address(from, 1, post_indexed), ne); 1354 __ strb(R3, Address(to, 1, post_indexed), ne); 1355 } 1356 1357 __ pop(RegisterSet(R4,R10)); 1358 #endif // AARCH64 1359 1360 return count_per_loop; 1361 } 1362 1363 1364 // Generate the inner loop for backward aligned array copy 1365 // 1366 // Arguments 1367 // end_from: src end address, 64 bits aligned 1368 // end_to: dst end address, wordSize aligned 1369 // count: number of elements (32-bit int) 1370 // bytes_per_count: number of bytes for each unit of 'count' 1371 // 1372 // Return the minimum initial value for count 1373 // 1374 // Notes: 1375 // - 'end_from' aligned on 64-bit (recommended for 32-bit ARM in case this speeds up LDMIA, required for AArch64) 1376 // - 'end_to' aligned on wordSize 1377 // - 'count' must be greater or equal than the returned value 1378 // 1379 // Decreases 'end_from' and 'end_to' by count*bytes_per_count. 1380 // 1381 // Scratches 'count', R3. 1382 // On AArch64 also scratches R4-R10; on 32-bit ARM R4-R10 are preserved (saved/restored). 1383 // 1384 int generate_backward_aligned_copy_loop(Register end_from, Register end_to, Register count, int bytes_per_count) { 1385 assert (end_from == R0 && end_to == R1 && count == R2, "adjust the implementation below"); 1386 1387 const int bytes_per_loop = 8*wordSize; // 8 registers are read and written on every loop iteration 1388 const int count_per_loop = bytes_per_loop / bytes_per_count; 1389 1390 arraycopy_loop_config *config=&arraycopy_configurations[ArmCopyPlatform].backward_aligned; 1391 int pld_offset = config->pld_distance; 1392 1393 #ifndef AARCH64 1394 bool split_read= config->split_ldm; 1395 bool split_write= config->split_stm; 1396 1397 // See the forward copy variant for additional comments. 1398 1399 __ push(RegisterSet(R4,R10)); 1400 #endif // !AARCH64 1401 1402 __ sub_32(count, count, count_per_loop); 1403 1404 const bool prefetch_before = pld_offset < 0; 1405 const bool prefetch_after = pld_offset > 0; 1406 1407 Label L_skip_pld; 1408 1409 if (pld_offset != 0) { 1410 pld_offset = (pld_offset < 0) ? 
-pld_offset : pld_offset; 1411 1412 prefetch(end_from, end_to, -wordSize); 1413 1414 if (prefetch_before) { 1415 __ subs_32(count, count, (bytes_per_loop + pld_offset) / bytes_per_count); 1416 __ b(L_skip_pld, lt); 1417 } 1418 1419 int offset = ArmCopyCacheLineSize; 1420 while (offset <= pld_offset) { 1421 prefetch(end_from, end_to, -(wordSize + offset)); 1422 offset += ArmCopyCacheLineSize; 1423 }; 1424 } 1425 1426 #ifdef AARCH64 1427 const Register data_regs[8] = {R3, R4, R5, R6, R7, R8, R9, R10}; 1428 #endif // AARCH64 1429 { 1430 // LDM (32-bit ARM) / LDP (AArch64) copy of 'bytes_per_loop' bytes 1431 1432 // 32-bit ARM note: we have tried implementing loop unrolling to skip one 1433 // PLD with 64 bytes cache line but the gain was not significant. 1434 1435 Label L_copy_loop; 1436 __ align(OptoLoopAlignment); 1437 __ BIND(L_copy_loop); 1438 1439 if (prefetch_before) { 1440 prefetch(end_from, end_to, -(wordSize + bytes_per_loop + pld_offset)); 1441 __ BIND(L_skip_pld); 1442 } 1443 1444 #ifdef AARCH64 1445 bulk_load_backward(end_from, data_regs, 8); 1446 #else 1447 if (split_read) { 1448 __ ldmdb(end_from, RegisterSet(R7, R10), writeback); 1449 __ ldmdb(end_from, RegisterSet(R3, R6), writeback); 1450 } else { 1451 __ ldmdb(end_from, RegisterSet(R3, R10), writeback); 1452 } 1453 #endif // AARCH64 1454 1455 __ subs_32(count, count, count_per_loop); 1456 1457 if (prefetch_after) { 1458 prefetch(end_from, end_to, -(wordSize + pld_offset), -bytes_per_loop); 1459 } 1460 1461 #ifdef AARCH64 1462 bulk_store_backward(end_to, data_regs, 8); 1463 #else 1464 if (split_write) { 1465 __ stmdb(end_to, RegisterSet(R7, R10), writeback); 1466 __ stmdb(end_to, RegisterSet(R3, R6), writeback); 1467 } else { 1468 __ stmdb(end_to, RegisterSet(R3, R10), writeback); 1469 } 1470 #endif // AARCH64 1471 1472 __ b(L_copy_loop, ge); 1473 1474 if (prefetch_before) { 1475 __ cmn_32(count, (bytes_per_loop + pld_offset)/bytes_per_count); 1476 __ b(L_skip_pld, ge); 1477 } 1478 } 1479 BLOCK_COMMENT("Remaining bytes:"); 1480 // still 0..bytes_per_loop-1 aligned bytes to copy, count already decreased by (at least) bytes_per_loop bytes 1481 1482 // __ add(count, count, ...); // addition useless for the bit tests 1483 assert (pld_offset % bytes_per_loop == 0, "decreasing count by pld_offset before loop must not change tested bits"); 1484 1485 #ifdef AARCH64 1486 assert (bytes_per_loop == 64, "adjust the code below"); 1487 assert (bytes_per_count <= 8, "adjust the code below"); 1488 1489 { 1490 Label L; 1491 __ tbz(count, exact_log2(32/bytes_per_count), L); 1492 1493 bulk_load_backward(end_from, data_regs, 4); 1494 bulk_store_backward(end_to, data_regs, 4); 1495 1496 __ bind(L); 1497 } 1498 1499 { 1500 Label L; 1501 __ tbz(count, exact_log2(16/bytes_per_count), L); 1502 1503 bulk_load_backward(end_from, data_regs, 2); 1504 bulk_store_backward(end_to, data_regs, 2); 1505 1506 __ bind(L); 1507 } 1508 1509 { 1510 Label L; 1511 __ tbz(count, exact_log2(8/bytes_per_count), L); 1512 1513 __ ldr(R3, Address(end_from, -8, pre_indexed)); 1514 __ str(R3, Address(end_to, -8, pre_indexed)); 1515 1516 __ bind(L); 1517 } 1518 1519 if (bytes_per_count <= 4) { 1520 Label L; 1521 __ tbz(count, exact_log2(4/bytes_per_count), L); 1522 1523 __ ldr_w(R3, Address(end_from, -4, pre_indexed)); 1524 __ str_w(R3, Address(end_to, -4, pre_indexed)); 1525 1526 __ bind(L); 1527 } 1528 1529 if (bytes_per_count <= 2) { 1530 Label L; 1531 __ tbz(count, exact_log2(2/bytes_per_count), L); 1532 1533 __ ldrh(R3, Address(end_from, -2, pre_indexed)); 1534 __ 
strh(R3, Address(end_to, -2, pre_indexed)); 1535 1536 __ bind(L); 1537 } 1538 1539 if (bytes_per_count <= 1) { 1540 Label L; 1541 __ tbz(count, 0, L); 1542 1543 __ ldrb(R3, Address(end_from, -1, pre_indexed)); 1544 __ strb(R3, Address(end_to, -1, pre_indexed)); 1545 1546 __ bind(L); 1547 } 1548 #else 1549 __ tst(count, 16 / bytes_per_count); 1550 __ ldmdb(end_from, RegisterSet(R3, R6), writeback, ne); // copy 16 bytes 1551 __ stmdb(end_to, RegisterSet(R3, R6), writeback, ne); 1552 1553 __ tst(count, 8 / bytes_per_count); 1554 __ ldmdb(end_from, RegisterSet(R3, R4), writeback, ne); // copy 8 bytes 1555 __ stmdb(end_to, RegisterSet(R3, R4), writeback, ne); 1556 1557 if (bytes_per_count <= 4) { 1558 __ tst(count, 4 / bytes_per_count); 1559 __ ldr(R3, Address(end_from, -4, pre_indexed), ne); // copy 4 bytes 1560 __ str(R3, Address(end_to, -4, pre_indexed), ne); 1561 } 1562 1563 if (bytes_per_count <= 2) { 1564 __ tst(count, 2 / bytes_per_count); 1565 __ ldrh(R3, Address(end_from, -2, pre_indexed), ne); // copy 2 bytes 1566 __ strh(R3, Address(end_to, -2, pre_indexed), ne); 1567 } 1568 1569 if (bytes_per_count == 1) { 1570 __ tst(count, 1); 1571 __ ldrb(R3, Address(end_from, -1, pre_indexed), ne); 1572 __ strb(R3, Address(end_to, -1, pre_indexed), ne); 1573 } 1574 1575 __ pop(RegisterSet(R4,R10)); 1576 #endif // AARCH64 1577 1578 return count_per_loop; 1579 } 1580 1581 1582 // Generate the inner loop for shifted forward array copy (unaligned copy). 1583 // It can be used when bytes_per_count < wordSize, i.e. 1584 // byte/short copy on 32-bit ARM, byte/short/int/compressed-oop copy on AArch64. 1585 // 1586 // Arguments 1587 // from: start src address, 64 bits aligned 1588 // to: start dst address, (now) wordSize aligned 1589 // count: number of elements (32-bit int) 1590 // bytes_per_count: number of bytes for each unit of 'count' 1591 // lsr_shift: shift applied to 'old' value to skipped already written bytes 1592 // lsl_shift: shift applied to 'new' value to set the high bytes of the next write 1593 // 1594 // Return the minimum initial value for count 1595 // 1596 // Notes: 1597 // - 'from' aligned on 64-bit (recommended for 32-bit ARM in case this speeds up LDMIA, required for AArch64) 1598 // - 'to' aligned on wordSize 1599 // - 'count' must be greater or equal than the returned value 1600 // - 'lsr_shift' + 'lsl_shift' = BitsPerWord 1601 // - 'bytes_per_count' is 1 or 2 on 32-bit ARM; 1, 2 or 4 on AArch64 1602 // 1603 // Increases 'to' by count*bytes_per_count. 1604 // 1605 // Scratches 'from' and 'count', R3-R10, R12 1606 // 1607 // On entry: 1608 // - R12 is preloaded with the first 'BitsPerWord' bits read just before 'from' 1609 // - (R12 >> lsr_shift) is the part not yet written (just before 'to') 1610 // --> (*to) = (R12 >> lsr_shift) | (*from) << lsl_shift); ... 1611 // 1612 // This implementation may read more bytes than required. 1613 // Actually, it always reads exactly all data from the copied region with upper bound aligned up by wordSize, 1614 // so excessive read do not cross a word bound and is thus harmless. 
1615 // 1616 int generate_forward_shifted_copy_loop(Register from, Register to, Register count, int bytes_per_count, int lsr_shift, int lsl_shift) { 1617 assert (from == R0 && to == R1 && count == R2, "adjust the implementation below"); 1618 1619 const int bytes_per_loop = 8*wordSize; // 8 registers are read and written on every loop iter 1620 const int count_per_loop = bytes_per_loop / bytes_per_count; 1621 1622 arraycopy_loop_config *config=&arraycopy_configurations[ArmCopyPlatform].forward_shifted; 1623 int pld_offset = config->pld_distance; 1624 1625 #ifndef AARCH64 1626 bool split_read= config->split_ldm; 1627 bool split_write= config->split_stm; 1628 #endif // !AARCH64 1629 1630 const bool prefetch_before = pld_offset < 0; 1631 const bool prefetch_after = pld_offset > 0; 1632 Label L_skip_pld, L_last_read, L_done; 1633 if (pld_offset != 0) { 1634 1635 pld_offset = (pld_offset < 0) ? -pld_offset : pld_offset; 1636 1637 prefetch(from, to, 0); 1638 1639 if (prefetch_before) { 1640 __ cmp_32(count, count_per_loop); 1641 __ b(L_last_read, lt); 1642 // skip prefetch for small copies 1643 // warning: count is predecreased by the prefetch distance to optimize the inner loop 1644 __ subs_32(count, count, ((bytes_per_loop + pld_offset) / bytes_per_count) + count_per_loop); 1645 __ b(L_skip_pld, lt); 1646 } 1647 1648 int offset = ArmCopyCacheLineSize; 1649 while (offset <= pld_offset) { 1650 prefetch(from, to, offset); 1651 offset += ArmCopyCacheLineSize; 1652 }; 1653 } 1654 1655 Label L_shifted_loop; 1656 1657 __ align(OptoLoopAlignment); 1658 __ BIND(L_shifted_loop); 1659 1660 if (prefetch_before) { 1661 // do it early if there might be register locking issues 1662 prefetch(from, to, bytes_per_loop + pld_offset); 1663 __ BIND(L_skip_pld); 1664 } else { 1665 __ cmp_32(count, count_per_loop); 1666 __ b(L_last_read, lt); 1667 } 1668 1669 #ifdef AARCH64 1670 const Register data_regs[9] = {R3, R4, R5, R6, R7, R8, R9, R10, R12}; 1671 __ logical_shift_right(R3, R12, lsr_shift); // part of R12 not yet written 1672 __ subs_32(count, count, count_per_loop); 1673 bulk_load_forward(from, &data_regs[1], 8); 1674 #else 1675 // read 32 bytes 1676 if (split_read) { 1677 // if write is not split, use less registers in first set to reduce locking 1678 RegisterSet set1 = split_write ? RegisterSet(R4, R7) : RegisterSet(R4, R5); 1679 RegisterSet set2 = (split_write ? RegisterSet(R8, R10) : RegisterSet(R6, R10)) | R12; 1680 __ ldmia(from, set1, writeback); 1681 __ mov(R3, AsmOperand(R12, lsr, lsr_shift)); // part of R12 not yet written 1682 __ ldmia(from, set2, writeback); 1683 __ subs(count, count, count_per_loop); // XXX: should it be before the 2nd LDM ? (latency vs locking) 1684 } else { 1685 __ mov(R3, AsmOperand(R12, lsr, lsr_shift)); // part of R12 not yet written 1686 __ ldmia(from, RegisterSet(R4, R10) | R12, writeback); // Note: small latency on R4 1687 __ subs(count, count, count_per_loop); 1688 } 1689 #endif // AARCH64 1690 1691 if (prefetch_after) { 1692 // do it after the 1st ldm/ldp anyway (no locking issues with early STM/STP) 1693 prefetch(from, to, pld_offset, bytes_per_loop); 1694 } 1695 1696 // prepare (shift) the values in R3..R10 1697 __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift)); // merged below low bytes of next val 1698 __ logical_shift_right(R4, R4, lsr_shift); // unused part of next val 1699 __ orr(R4, R4, AsmOperand(R5, lsl, lsl_shift)); // ... 
1700 __ logical_shift_right(R5, R5, lsr_shift); 1701 __ orr(R5, R5, AsmOperand(R6, lsl, lsl_shift)); 1702 __ logical_shift_right(R6, R6, lsr_shift); 1703 __ orr(R6, R6, AsmOperand(R7, lsl, lsl_shift)); 1704 #ifndef AARCH64 1705 if (split_write) { 1706 // write the first half as soon as possible to reduce stm locking 1707 __ stmia(to, RegisterSet(R3, R6), writeback, prefetch_before ? gt : ge); 1708 } 1709 #endif // !AARCH64 1710 __ logical_shift_right(R7, R7, lsr_shift); 1711 __ orr(R7, R7, AsmOperand(R8, lsl, lsl_shift)); 1712 __ logical_shift_right(R8, R8, lsr_shift); 1713 __ orr(R8, R8, AsmOperand(R9, lsl, lsl_shift)); 1714 __ logical_shift_right(R9, R9, lsr_shift); 1715 __ orr(R9, R9, AsmOperand(R10, lsl, lsl_shift)); 1716 __ logical_shift_right(R10, R10, lsr_shift); 1717 __ orr(R10, R10, AsmOperand(R12, lsl, lsl_shift)); 1718 1719 #ifdef AARCH64 1720 bulk_store_forward(to, data_regs, 8); 1721 #else 1722 if (split_write) { 1723 __ stmia(to, RegisterSet(R7, R10), writeback, prefetch_before ? gt : ge); 1724 } else { 1725 __ stmia(to, RegisterSet(R3, R10), writeback, prefetch_before ? gt : ge); 1726 } 1727 #endif // AARCH64 1728 __ b(L_shifted_loop, gt); // no need to loop if 0 (when count need not be precise modulo bytes_per_loop) 1729 1730 if (prefetch_before) { 1731 // the first loop may end earlier, allowing to skip pld at the end 1732 __ cmn_32(count, (bytes_per_loop + pld_offset)/bytes_per_count); 1733 #ifndef AARCH64 1734 __ stmia(to, RegisterSet(R3, R10), writeback); // stmia was skipped 1735 #endif // !AARCH64 1736 __ b(L_skip_pld, ge); 1737 __ adds_32(count, count, ((bytes_per_loop + pld_offset) / bytes_per_count) + count_per_loop); 1738 } 1739 1740 __ BIND(L_last_read); 1741 __ b(L_done, eq); 1742 1743 #ifdef AARCH64 1744 assert(bytes_per_count < 8, "adjust the code below"); 1745 1746 __ logical_shift_right(R3, R12, lsr_shift); 1747 1748 { 1749 Label L; 1750 __ tbz(count, exact_log2(32/bytes_per_count), L); 1751 bulk_load_forward(from, &data_regs[1], 4); 1752 __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift)); 1753 __ logical_shift_right(R4, R4, lsr_shift); 1754 __ orr(R4, R4, AsmOperand(R5, lsl, lsl_shift)); 1755 __ logical_shift_right(R5, R5, lsr_shift); 1756 __ orr(R5, R5, AsmOperand(R6, lsl, lsl_shift)); 1757 __ logical_shift_right(R6, R6, lsr_shift); 1758 __ orr(R6, R6, AsmOperand(R7, lsl, lsl_shift)); 1759 bulk_store_forward(to, data_regs, 4); 1760 __ logical_shift_right(R3, R7, lsr_shift); 1761 __ bind(L); 1762 } 1763 1764 { 1765 Label L; 1766 __ tbz(count, exact_log2(16/bytes_per_count), L); 1767 bulk_load_forward(from, &data_regs[1], 2); 1768 __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift)); 1769 __ logical_shift_right(R4, R4, lsr_shift); 1770 __ orr(R4, R4, AsmOperand(R5, lsl, lsl_shift)); 1771 bulk_store_forward(to, data_regs, 2); 1772 __ logical_shift_right(R3, R5, lsr_shift); 1773 __ bind(L); 1774 } 1775 1776 { 1777 Label L; 1778 __ tbz(count, exact_log2(8/bytes_per_count), L); 1779 __ ldr(R4, Address(from, 8, post_indexed)); 1780 __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift)); 1781 __ str(R3, Address(to, 8, post_indexed)); 1782 __ logical_shift_right(R3, R4, lsr_shift); 1783 __ bind(L); 1784 } 1785 1786 const int have_bytes = lsl_shift/BitsPerByte; // number of already read bytes in R3 1787 1788 // It remains less than wordSize to write. 1789 // Do not check count if R3 already has maximal number of loaded elements (one less than wordSize). 
1790 if (have_bytes < wordSize - bytes_per_count) { 1791 Label L; 1792 __ andr(count, count, (uintx)(8/bytes_per_count-1)); // make count exact 1793 __ cmp_32(count, have_bytes/bytes_per_count); // do we have enough bytes to store? 1794 __ b(L, le); 1795 __ ldr(R4, Address(from, 8, post_indexed)); 1796 __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift)); 1797 __ bind(L); 1798 } 1799 1800 { 1801 Label L; 1802 __ tbz(count, exact_log2(4/bytes_per_count), L); 1803 __ str_w(R3, Address(to, 4, post_indexed)); 1804 if (bytes_per_count < 4) { 1805 __ logical_shift_right(R3, R3, 4*BitsPerByte); 1806 } 1807 __ bind(L); 1808 } 1809 1810 if (bytes_per_count <= 2) { 1811 Label L; 1812 __ tbz(count, exact_log2(2/bytes_per_count), L); 1813 __ strh(R3, Address(to, 2, post_indexed)); 1814 if (bytes_per_count < 2) { 1815 __ logical_shift_right(R3, R3, 2*BitsPerByte); 1816 } 1817 __ bind(L); 1818 } 1819 1820 if (bytes_per_count <= 1) { 1821 Label L; 1822 __ tbz(count, exact_log2(1/bytes_per_count), L); 1823 __ strb(R3, Address(to, 1, post_indexed)); 1824 __ bind(L); 1825 } 1826 #else 1827 switch (bytes_per_count) { 1828 case 2: 1829 __ mov(R3, AsmOperand(R12, lsr, lsr_shift)); 1830 __ tst(count, 8); 1831 __ ldmia(from, RegisterSet(R4, R7), writeback, ne); 1832 __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), ne); // merged below low bytes of next val 1833 __ mov(R4, AsmOperand(R4, lsr, lsr_shift), ne); // unused part of next val 1834 __ orr(R4, R4, AsmOperand(R5, lsl, lsl_shift), ne); // ... 1835 __ mov(R5, AsmOperand(R5, lsr, lsr_shift), ne); 1836 __ orr(R5, R5, AsmOperand(R6, lsl, lsl_shift), ne); 1837 __ mov(R6, AsmOperand(R6, lsr, lsr_shift), ne); 1838 __ orr(R6, R6, AsmOperand(R7, lsl, lsl_shift), ne); 1839 __ stmia(to, RegisterSet(R3, R6), writeback, ne); 1840 __ mov(R3, AsmOperand(R7, lsr, lsr_shift), ne); 1841 1842 __ tst(count, 4); 1843 __ ldmia(from, RegisterSet(R4, R5), writeback, ne); 1844 __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), ne); // merged below low bytes of next val 1845 __ mov(R4, AsmOperand(R4, lsr, lsr_shift), ne); // unused part of next val 1846 __ orr(R4, R4, AsmOperand(R5, lsl, lsl_shift), ne); // ... 1847 __ stmia(to, RegisterSet(R3, R4), writeback, ne); 1848 __ mov(R3, AsmOperand(R5, lsr, lsr_shift), ne); 1849 1850 __ tst(count, 2); 1851 __ ldr(R4, Address(from, 4, post_indexed), ne); 1852 __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), ne); 1853 __ str(R3, Address(to, 4, post_indexed), ne); 1854 __ mov(R3, AsmOperand(R4, lsr, lsr_shift), ne); 1855 1856 __ tst(count, 1); 1857 __ strh(R3, Address(to, 2, post_indexed), ne); // one last short 1858 break; 1859 1860 case 1: 1861 __ mov(R3, AsmOperand(R12, lsr, lsr_shift)); 1862 __ tst(count, 16); 1863 __ ldmia(from, RegisterSet(R4, R7), writeback, ne); 1864 __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), ne); // merged below low bytes of next val 1865 __ mov(R4, AsmOperand(R4, lsr, lsr_shift), ne); // unused part of next val 1866 __ orr(R4, R4, AsmOperand(R5, lsl, lsl_shift), ne); // ... 
1867 __ mov(R5, AsmOperand(R5, lsr, lsr_shift), ne); 1868 __ orr(R5, R5, AsmOperand(R6, lsl, lsl_shift), ne); 1869 __ mov(R6, AsmOperand(R6, lsr, lsr_shift), ne); 1870 __ orr(R6, R6, AsmOperand(R7, lsl, lsl_shift), ne); 1871 __ stmia(to, RegisterSet(R3, R6), writeback, ne); 1872 __ mov(R3, AsmOperand(R7, lsr, lsr_shift), ne); 1873 1874 __ tst(count, 8); 1875 __ ldmia(from, RegisterSet(R4, R5), writeback, ne); 1876 __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), ne); // merged below low bytes of next val 1877 __ mov(R4, AsmOperand(R4, lsr, lsr_shift), ne); // unused part of next val 1878 __ orr(R4, R4, AsmOperand(R5, lsl, lsl_shift), ne); // ... 1879 __ stmia(to, RegisterSet(R3, R4), writeback, ne); 1880 __ mov(R3, AsmOperand(R5, lsr, lsr_shift), ne); 1881 1882 __ tst(count, 4); 1883 __ ldr(R4, Address(from, 4, post_indexed), ne); 1884 __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), ne); 1885 __ str(R3, Address(to, 4, post_indexed), ne); 1886 __ mov(R3, AsmOperand(R4, lsr, lsr_shift), ne); 1887 1888 __ andr(count, count, 3); 1889 __ cmp(count, 2); 1890 1891 // Note: R3 might contain enough bytes ready to write (3 needed at most), 1892 // thus load on lsl_shift==24 is not needed (in fact forces reading 1893 // beyond source buffer end boundary) 1894 if (lsl_shift == 8) { 1895 __ ldr(R4, Address(from, 4, post_indexed), ge); 1896 __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), ge); 1897 } else if (lsl_shift == 16) { 1898 __ ldr(R4, Address(from, 4, post_indexed), gt); 1899 __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), gt); 1900 } 1901 1902 __ strh(R3, Address(to, 2, post_indexed), ge); // two last bytes 1903 __ mov(R3, AsmOperand(R3, lsr, 16), gt); 1904 1905 __ tst(count, 1); 1906 __ strb(R3, Address(to, 1, post_indexed), ne); // one last byte 1907 break; 1908 } 1909 #endif // AARCH64 1910 1911 __ BIND(L_done); 1912 return 0; // no minimum 1913 } 1914 1915 // Generate the inner loop for shifted backward array copy (unaligned copy). 1916 // It can be used when bytes_per_count < wordSize, i.e. 1917 // byte/short copy on 32-bit ARM, byte/short/int/compressed-oop copy on AArch64. 1918 // 1919 // Arguments 1920 // end_from: end src address, 64 bits aligned 1921 // end_to: end dst address, (now) wordSize aligned 1922 // count: number of elements (32-bit int) 1923 // bytes_per_count: number of bytes for each unit of 'count' 1924 // lsl_shift: shift applied to 'old' value to skipped already written bytes 1925 // lsr_shift: shift applied to 'new' value to set the low bytes of the next write 1926 // 1927 // Return the minimum initial value for count 1928 // 1929 // Notes: 1930 // - 'end_from' aligned on 64-bit (recommended for 32-bit ARM in case this speeds up LDMIA, required for AArch64) 1931 // - 'end_to' aligned on wordSize 1932 // - 'count' must be greater or equal than the returned value 1933 // - 'lsr_shift' + 'lsl_shift' = 'BitsPerWord' 1934 // - 'bytes_per_count' is 1 or 2 on 32-bit ARM; 1, 2 or 4 on AArch64 1935 // 1936 // Decreases 'end_to' by count*bytes_per_count. 1937 // 1938 // Scratches 'end_from', 'count', R3-R10, R12 1939 // 1940 // On entry: 1941 // - R3 is preloaded with the first 'BitsPerWord' bits read just after 'from' 1942 // - (R3 << lsl_shift) is the part not yet written 1943 // --> (*--to) = (R3 << lsl_shift) | (*--from) >> lsr_shift); ... 1944 // 1945 // This implementation may read more bytes than required. 
1946 // Actually, it always reads exactly all data from the copied region with beginning aligned down by wordSize, 1947 // so excessive read do not cross a word bound and is thus harmless. 1948 // 1949 int generate_backward_shifted_copy_loop(Register end_from, Register end_to, Register count, int bytes_per_count, int lsr_shift, int lsl_shift) { 1950 assert (end_from == R0 && end_to == R1 && count == R2, "adjust the implementation below"); 1951 1952 const int bytes_per_loop = 8*wordSize; // 8 registers are read and written on every loop iter 1953 const int count_per_loop = bytes_per_loop / bytes_per_count; 1954 1955 arraycopy_loop_config *config=&arraycopy_configurations[ArmCopyPlatform].backward_shifted; 1956 int pld_offset = config->pld_distance; 1957 1958 #ifndef AARCH64 1959 bool split_read= config->split_ldm; 1960 bool split_write= config->split_stm; 1961 #endif // !AARCH64 1962 1963 1964 const bool prefetch_before = pld_offset < 0; 1965 const bool prefetch_after = pld_offset > 0; 1966 1967 Label L_skip_pld, L_done, L_last_read; 1968 if (pld_offset != 0) { 1969 1970 pld_offset = (pld_offset < 0) ? -pld_offset : pld_offset; 1971 1972 prefetch(end_from, end_to, -wordSize); 1973 1974 if (prefetch_before) { 1975 __ cmp_32(count, count_per_loop); 1976 __ b(L_last_read, lt); 1977 1978 // skip prefetch for small copies 1979 // warning: count is predecreased by the prefetch distance to optimize the inner loop 1980 __ subs_32(count, count, ((bytes_per_loop + pld_offset)/bytes_per_count) + count_per_loop); 1981 __ b(L_skip_pld, lt); 1982 } 1983 1984 int offset = ArmCopyCacheLineSize; 1985 while (offset <= pld_offset) { 1986 prefetch(end_from, end_to, -(wordSize + offset)); 1987 offset += ArmCopyCacheLineSize; 1988 }; 1989 } 1990 1991 Label L_shifted_loop; 1992 __ align(OptoLoopAlignment); 1993 __ BIND(L_shifted_loop); 1994 1995 if (prefetch_before) { 1996 // do the 1st ldm/ldp first anyway (no locking issues with early STM/STP) 1997 prefetch(end_from, end_to, -(wordSize + bytes_per_loop + pld_offset)); 1998 __ BIND(L_skip_pld); 1999 } else { 2000 __ cmp_32(count, count_per_loop); 2001 __ b(L_last_read, lt); 2002 } 2003 2004 #ifdef AARCH64 2005 __ logical_shift_left(R12, R3, lsl_shift); 2006 const Register data_regs[9] = {R3, R4, R5, R6, R7, R8, R9, R10, R12}; 2007 bulk_load_backward(end_from, data_regs, 8); 2008 #else 2009 if (split_read) { 2010 __ ldmdb(end_from, RegisterSet(R7, R10), writeback); 2011 __ mov(R12, AsmOperand(R3, lsl, lsl_shift)); // part of R3 not yet written 2012 __ ldmdb(end_from, RegisterSet(R3, R6), writeback); 2013 } else { 2014 __ mov(R12, AsmOperand(R3, lsl, lsl_shift)); // part of R3 not yet written 2015 __ ldmdb(end_from, RegisterSet(R3, R10), writeback); 2016 } 2017 #endif // AARCH64 2018 2019 __ subs_32(count, count, count_per_loop); 2020 2021 if (prefetch_after) { // do prefetch during ldm/ldp latency 2022 prefetch(end_from, end_to, -(wordSize + pld_offset), -bytes_per_loop); 2023 } 2024 2025 // prepare the values in R4..R10,R12 2026 __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift)); // merged above high bytes of prev val 2027 __ logical_shift_left(R10, R10, lsl_shift); // unused part of prev val 2028 __ orr(R10, R10, AsmOperand(R9, lsr, lsr_shift)); // ... 
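// (Informal note, not generated code: this is the mirror image of the forward shifted
//  loop. Assuming lsr_shift + lsl_shift == BitsPerWord and a little-endian layout, each
//  word stored while walking the region downwards is roughly
//      dst_word = (carry << lsl_shift) | (next_lower_src_word >> lsr_shift);
//  where 'carry' holds the not-yet-written low bytes of the previously processed,
//  higher-addressed word; the pattern continues for R9 down to R4 below.)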
2029 __ logical_shift_left(R9, R9, lsl_shift); 2030 __ orr(R9, R9, AsmOperand(R8, lsr, lsr_shift)); 2031 __ logical_shift_left(R8, R8, lsl_shift); 2032 __ orr(R8, R8, AsmOperand(R7, lsr, lsr_shift)); 2033 __ logical_shift_left(R7, R7, lsl_shift); 2034 __ orr(R7, R7, AsmOperand(R6, lsr, lsr_shift)); 2035 __ logical_shift_left(R6, R6, lsl_shift); 2036 __ orr(R6, R6, AsmOperand(R5, lsr, lsr_shift)); 2037 #ifndef AARCH64 2038 if (split_write) { 2039 // store early to reduce locking issues 2040 __ stmdb(end_to, RegisterSet(R6, R10) | R12, writeback, prefetch_before ? gt : ge); 2041 } 2042 #endif // !AARCH64 2043 __ logical_shift_left(R5, R5, lsl_shift); 2044 __ orr(R5, R5, AsmOperand(R4, lsr, lsr_shift)); 2045 __ logical_shift_left(R4, R4, lsl_shift); 2046 __ orr(R4, R4, AsmOperand(R3, lsr, lsr_shift)); 2047 2048 #ifdef AARCH64 2049 bulk_store_backward(end_to, &data_regs[1], 8); 2050 #else 2051 if (split_write) { 2052 __ stmdb(end_to, RegisterSet(R4, R5), writeback, prefetch_before ? gt : ge); 2053 } else { 2054 __ stmdb(end_to, RegisterSet(R4, R10) | R12, writeback, prefetch_before ? gt : ge); 2055 } 2056 #endif // AARCH64 2057 2058 __ b(L_shifted_loop, gt); // no need to loop if 0 (when count need not be precise modulo bytes_per_loop) 2059 2060 if (prefetch_before) { 2061 // the first loop may end earlier, allowing to skip pld at the end 2062 __ cmn_32(count, ((bytes_per_loop + pld_offset)/bytes_per_count)); 2063 #ifndef AARCH64 2064 __ stmdb(end_to, RegisterSet(R4, R10) | R12, writeback); // stmdb was skipped 2065 #endif // !AARCH64 2066 __ b(L_skip_pld, ge); 2067 __ adds_32(count, count, ((bytes_per_loop + pld_offset) / bytes_per_count) + count_per_loop); 2068 } 2069 2070 __ BIND(L_last_read); 2071 __ b(L_done, eq); 2072 2073 #ifdef AARCH64 2074 assert(bytes_per_count < 8, "adjust the code below"); 2075 2076 __ logical_shift_left(R12, R3, lsl_shift); 2077 2078 { 2079 Label L; 2080 __ tbz(count, exact_log2(32/bytes_per_count), L); 2081 bulk_load_backward(end_from, &data_regs[4], 4); 2082 2083 __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift)); 2084 __ logical_shift_left(R10, R10, lsl_shift); 2085 __ orr(R10, R10, AsmOperand(R9, lsr, lsr_shift)); 2086 __ logical_shift_left(R9, R9, lsl_shift); 2087 __ orr(R9, R9, AsmOperand(R8, lsr, lsr_shift)); 2088 __ logical_shift_left(R8, R8, lsl_shift); 2089 __ orr(R8, R8, AsmOperand(R7, lsr, lsr_shift)); 2090 2091 bulk_store_backward(end_to, &data_regs[5], 4); 2092 __ logical_shift_left(R12, R7, lsl_shift); 2093 __ bind(L); 2094 } 2095 2096 { 2097 Label L; 2098 __ tbz(count, exact_log2(16/bytes_per_count), L); 2099 bulk_load_backward(end_from, &data_regs[6], 2); 2100 2101 __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift)); 2102 __ logical_shift_left(R10, R10, lsl_shift); 2103 __ orr(R10, R10, AsmOperand(R9, lsr, lsr_shift)); 2104 2105 bulk_store_backward(end_to, &data_regs[7], 2); 2106 __ logical_shift_left(R12, R9, lsl_shift); 2107 __ bind(L); 2108 } 2109 2110 { 2111 Label L; 2112 __ tbz(count, exact_log2(8/bytes_per_count), L); 2113 __ ldr(R10, Address(end_from, -8, pre_indexed)); 2114 __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift)); 2115 __ str(R12, Address(end_to, -8, pre_indexed)); 2116 __ logical_shift_left(R12, R10, lsl_shift); 2117 __ bind(L); 2118 } 2119 2120 const int have_bytes = lsr_shift/BitsPerByte; // number of already read bytes in R12 2121 2122 // It remains less than wordSize to write. 2123 // Do not check count if R12 already has maximal number of loaded elements (one less than wordSize). 
2124 if (have_bytes < wordSize - bytes_per_count) { 2125 Label L; 2126 __ andr(count, count, (uintx)(8/bytes_per_count-1)); // make count exact 2127 __ cmp_32(count, have_bytes/bytes_per_count); // do we have enough bytes to store? 2128 __ b(L, le); 2129 __ ldr(R10, Address(end_from, -8, pre_indexed)); 2130 __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift)); 2131 __ bind(L); 2132 } 2133 2134 assert (bytes_per_count <= 4, "must be"); 2135 2136 { 2137 Label L; 2138 __ tbz(count, exact_log2(4/bytes_per_count), L); 2139 __ logical_shift_right(R9, R12, (wordSize-4)*BitsPerByte); 2140 __ str_w(R9, Address(end_to, -4, pre_indexed)); // Write 4 MSB 2141 if (bytes_per_count < 4) { 2142 __ logical_shift_left(R12, R12, 4*BitsPerByte); // Promote remaining bytes to MSB 2143 } 2144 __ bind(L); 2145 } 2146 2147 if (bytes_per_count <= 2) { 2148 Label L; 2149 __ tbz(count, exact_log2(2/bytes_per_count), L); 2150 __ logical_shift_right(R9, R12, (wordSize-2)*BitsPerByte); 2151 __ strh(R9, Address(end_to, -2, pre_indexed)); // Write 2 MSB 2152 if (bytes_per_count < 2) { 2153 __ logical_shift_left(R12, R12, 2*BitsPerByte); // Promote remaining bytes to MSB 2154 } 2155 __ bind(L); 2156 } 2157 2158 if (bytes_per_count <= 1) { 2159 Label L; 2160 __ tbz(count, exact_log2(1/bytes_per_count), L); 2161 __ logical_shift_right(R9, R12, (wordSize-1)*BitsPerByte); 2162 __ strb(R9, Address(end_to, -1, pre_indexed)); // Write 1 MSB 2163 __ bind(L); 2164 } 2165 #else 2166 switch(bytes_per_count) { 2167 case 2: 2168 __ mov(R12, AsmOperand(R3, lsl, lsl_shift)); // part of R3 not yet written 2169 __ tst(count, 8); 2170 __ ldmdb(end_from, RegisterSet(R7,R10), writeback, ne); 2171 __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift), ne); 2172 __ mov(R10, AsmOperand(R10, lsl, lsl_shift),ne); // unused part of prev val 2173 __ orr(R10, R10, AsmOperand(R9, lsr, lsr_shift),ne); // ... 2174 __ mov(R9, AsmOperand(R9, lsl, lsl_shift),ne); 2175 __ orr(R9, R9, AsmOperand(R8, lsr, lsr_shift),ne); 2176 __ mov(R8, AsmOperand(R8, lsl, lsl_shift),ne); 2177 __ orr(R8, R8, AsmOperand(R7, lsr, lsr_shift),ne); 2178 __ stmdb(end_to, RegisterSet(R8,R10)|R12, writeback, ne); 2179 __ mov(R12, AsmOperand(R7, lsl, lsl_shift), ne); 2180 2181 __ tst(count, 4); 2182 __ ldmdb(end_from, RegisterSet(R9, R10), writeback, ne); 2183 __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift), ne); 2184 __ mov(R10, AsmOperand(R10, lsl, lsl_shift),ne); // unused part of prev val 2185 __ orr(R10, R10, AsmOperand(R9, lsr,lsr_shift),ne); // ... 2186 __ stmdb(end_to, RegisterSet(R10)|R12, writeback, ne); 2187 __ mov(R12, AsmOperand(R9, lsl, lsl_shift), ne); 2188 2189 __ tst(count, 2); 2190 __ ldr(R10, Address(end_from, -4, pre_indexed), ne); 2191 __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift), ne); 2192 __ str(R12, Address(end_to, -4, pre_indexed), ne); 2193 __ mov(R12, AsmOperand(R10, lsl, lsl_shift), ne); 2194 2195 __ tst(count, 1); 2196 __ mov(R12, AsmOperand(R12, lsr, lsr_shift),ne); 2197 __ strh(R12, Address(end_to, -2, pre_indexed), ne); // one last short 2198 break; 2199 2200 case 1: 2201 __ mov(R12, AsmOperand(R3, lsl, lsl_shift)); // part of R3 not yet written 2202 __ tst(count, 16); 2203 __ ldmdb(end_from, RegisterSet(R7,R10), writeback, ne); 2204 __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift), ne); 2205 __ mov(R10, AsmOperand(R10, lsl, lsl_shift),ne); // unused part of prev val 2206 __ orr(R10, R10, AsmOperand(R9, lsr, lsr_shift),ne); // ... 
2207 __ mov(R9, AsmOperand(R9, lsl, lsl_shift),ne); 2208 __ orr(R9, R9, AsmOperand(R8, lsr, lsr_shift),ne); 2209 __ mov(R8, AsmOperand(R8, lsl, lsl_shift),ne); 2210 __ orr(R8, R8, AsmOperand(R7, lsr, lsr_shift),ne); 2211 __ stmdb(end_to, RegisterSet(R8,R10)|R12, writeback, ne); 2212 __ mov(R12, AsmOperand(R7, lsl, lsl_shift), ne); 2213 2214 __ tst(count, 8); 2215 __ ldmdb(end_from, RegisterSet(R9,R10), writeback, ne); 2216 __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift), ne); 2217 __ mov(R10, AsmOperand(R10, lsl, lsl_shift),ne); // unused part of prev val 2218 __ orr(R10, R10, AsmOperand(R9, lsr, lsr_shift),ne); // ... 2219 __ stmdb(end_to, RegisterSet(R10)|R12, writeback, ne); 2220 __ mov(R12, AsmOperand(R9, lsl, lsl_shift), ne); 2221 2222 __ tst(count, 4); 2223 __ ldr(R10, Address(end_from, -4, pre_indexed), ne); 2224 __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift), ne); 2225 __ str(R12, Address(end_to, -4, pre_indexed), ne); 2226 __ mov(R12, AsmOperand(R10, lsl, lsl_shift), ne); 2227 2228 __ tst(count, 2); 2229 if (lsr_shift != 24) { 2230 // avoid useless reading R10 when we already have 3 bytes ready in R12 2231 __ ldr(R10, Address(end_from, -4, pre_indexed), ne); 2232 __ orr(R12, R12, AsmOperand(R10, lsr,lsr_shift), ne); 2233 } 2234 2235 // Note: R12 contains enough bytes ready to write (3 needed at most) 2236 // write the 2 MSBs 2237 __ mov(R9, AsmOperand(R12, lsr, 16), ne); 2238 __ strh(R9, Address(end_to, -2, pre_indexed), ne); 2239 // promote remaining to MSB 2240 __ mov(R12, AsmOperand(R12, lsl, 16), ne); 2241 2242 __ tst(count, 1); 2243 // write the MSB of R12 2244 __ mov(R12, AsmOperand(R12, lsr, 24), ne); 2245 __ strb(R12, Address(end_to, -1, pre_indexed), ne); 2246 2247 break; 2248 } 2249 #endif // AARCH64 2250 2251 __ BIND(L_done); 2252 return 0; // no minimum 2253 } 2254 2255 // This method is very useful for merging forward/backward implementations 2256 Address get_addr_with_indexing(Register base, int delta, bool forward) { 2257 if (forward) { 2258 return Address(base, delta, post_indexed); 2259 } else { 2260 return Address(base, -delta, pre_indexed); 2261 } 2262 } 2263 2264 #ifdef AARCH64 2265 // Loads one 'size_in_bytes'-sized value from 'from' in given direction, i.e. 
2266 // if forward: loads value at from and increases from by size 2267 // if !forward: loads value at from-size_in_bytes and decreases from by size 2268 void load_one(Register rd, Register from, int size_in_bytes, bool forward) { 2269 assert_different_registers(from, rd); 2270 Address addr = get_addr_with_indexing(from, size_in_bytes, forward); 2271 __ load_sized_value(rd, addr, size_in_bytes, false); 2272 } 2273 2274 // Stores one 'size_in_bytes'-sized value to 'to' in given direction (see load_one) 2275 void store_one(Register rd, Register to, int size_in_bytes, bool forward) { 2276 assert_different_registers(to, rd); 2277 Address addr = get_addr_with_indexing(to, size_in_bytes, forward); 2278 __ store_sized_value(rd, addr, size_in_bytes); 2279 } 2280 #else 2281 // load_one and store_one are the same as for AArch64 except for 2282 // *) Support for condition execution 2283 // *) Second value register argument for 8-byte values 2284 2285 void load_one(Register rd, Register from, int size_in_bytes, bool forward, AsmCondition cond = al, Register rd2 = noreg) { 2286 assert_different_registers(from, rd, rd2); 2287 if (size_in_bytes < 8) { 2288 Address addr = get_addr_with_indexing(from, size_in_bytes, forward); 2289 __ load_sized_value(rd, addr, size_in_bytes, false, cond); 2290 } else { 2291 assert (rd2 != noreg, "second value register must be specified"); 2292 assert (rd->encoding() < rd2->encoding(), "wrong value register set"); 2293 2294 if (forward) { 2295 __ ldmia(from, RegisterSet(rd) | rd2, writeback, cond); 2296 } else { 2297 __ ldmdb(from, RegisterSet(rd) | rd2, writeback, cond); 2298 } 2299 } 2300 } 2301 2302 void store_one(Register rd, Register to, int size_in_bytes, bool forward, AsmCondition cond = al, Register rd2 = noreg) { 2303 assert_different_registers(to, rd, rd2); 2304 if (size_in_bytes < 8) { 2305 Address addr = get_addr_with_indexing(to, size_in_bytes, forward); 2306 __ store_sized_value(rd, addr, size_in_bytes, cond); 2307 } else { 2308 assert (rd2 != noreg, "second value register must be specified"); 2309 assert (rd->encoding() < rd2->encoding(), "wrong value register set"); 2310 2311 if (forward) { 2312 __ stmia(to, RegisterSet(rd) | rd2, writeback, cond); 2313 } else { 2314 __ stmdb(to, RegisterSet(rd) | rd2, writeback, cond); 2315 } 2316 } 2317 } 2318 #endif // AARCH64 2319 2320 // Copies data from 'from' to 'to' in specified direction to align 'from' by 64 bits. 2321 // (on 32-bit ARM 64-bit alignment is better for LDM). 2322 // 2323 // Arguments: 2324 // from: beginning (if forward) or upper bound (if !forward) of the region to be read 2325 // to: beginning (if forward) or upper bound (if !forward) of the region to be written 2326 // count: 32-bit int, maximum number of elements which can be copied 2327 // bytes_per_count: size of an element 2328 // forward: specifies copy direction 2329 // 2330 // Notes: 2331 // 'from' and 'to' must be aligned by 'bytes_per_count' 2332 // 'count' must not be less than the returned value 2333 // shifts 'from' and 'to' by the number of copied bytes in corresponding direction 2334 // decreases 'count' by the number of elements copied 2335 // 2336 // Returns maximum number of bytes which may be copied. 2337 int align_src(Register from, Register to, Register count, Register tmp, int bytes_per_count, bool forward) { 2338 assert_different_registers(from, to, count, tmp); 2339 #ifdef AARCH64 2340 // TODO-AARCH64: replace by simple loop? 
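// (Informal note: the alignment ladder below peels off at most one 1-byte, one 2-byte and
//  one 4-byte copy, whichever apply for the element size, until 'from' is 8-byte aligned;
//  each step is skipped via tbz when the corresponding low address bit of 'from' is
//  already clear, and 'count' is decremented by the number of elements moved.)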
2341 Label Laligned_by_2, Laligned_by_4, Laligned_by_8; 2342 2343 if (bytes_per_count == 1) { 2344 __ tbz(from, 0, Laligned_by_2); 2345 __ sub_32(count, count, 1); 2346 load_one(tmp, from, 1, forward); 2347 store_one(tmp, to, 1, forward); 2348 } 2349 2350 __ BIND(Laligned_by_2); 2351 2352 if (bytes_per_count <= 2) { 2353 __ tbz(from, 1, Laligned_by_4); 2354 __ sub_32(count, count, 2/bytes_per_count); 2355 load_one(tmp, from, 2, forward); 2356 store_one(tmp, to, 2, forward); 2357 } 2358 2359 __ BIND(Laligned_by_4); 2360 2361 if (bytes_per_count <= 4) { 2362 __ tbz(from, 2, Laligned_by_8); 2363 __ sub_32(count, count, 4/bytes_per_count); 2364 load_one(tmp, from, 4, forward); 2365 store_one(tmp, to, 4, forward); 2366 } 2367 __ BIND(Laligned_by_8); 2368 #else // AARCH64 2369 if (bytes_per_count < 8) { 2370 Label L_align_src; 2371 __ BIND(L_align_src); 2372 __ tst(from, 7); 2373 // ne => not aligned: copy one element and (if bytes_per_count < 4) loop 2374 __ sub(count, count, 1, ne); 2375 load_one(tmp, from, bytes_per_count, forward, ne); 2376 store_one(tmp, to, bytes_per_count, forward, ne); 2377 if (bytes_per_count < 4) { 2378 __ b(L_align_src, ne); // if bytes_per_count == 4, then 0 or 1 loop iterations are enough 2379 } 2380 } 2381 #endif // AARCH64 2382 return 7/bytes_per_count; 2383 } 2384 2385 // Copies 'count' of 'bytes_per_count'-sized elements in the specified direction. 2386 // 2387 // Arguments: 2388 // from: beginning (if forward) or upper bound (if !forward) of the region to be read 2389 // to: beginning (if forward) or upper bound (if !forward) of the region to be written 2390 // count: 32-bit int, number of elements to be copied 2391 // entry: copy loop entry point 2392 // bytes_per_count: size of an element 2393 // forward: specifies copy direction 2394 // 2395 // Notes: 2396 // shifts 'from' and 'to' 2397 void copy_small_array(Register from, Register to, Register count, Register tmp, Register tmp2, int bytes_per_count, bool forward, Label & entry) { 2398 assert_different_registers(from, to, count, tmp); 2399 2400 __ align(OptoLoopAlignment); 2401 #ifdef AARCH64 2402 Label L_small_array_done, L_small_array_loop; 2403 __ BIND(entry); 2404 __ cbz_32(count, L_small_array_done); 2405 2406 __ BIND(L_small_array_loop); 2407 __ subs_32(count, count, 1); 2408 load_one(tmp, from, bytes_per_count, forward); 2409 store_one(tmp, to, bytes_per_count, forward); 2410 __ b(L_small_array_loop, gt); 2411 2412 __ BIND(L_small_array_done); 2413 #else 2414 Label L_small_loop; 2415 __ BIND(L_small_loop); 2416 store_one(tmp, to, bytes_per_count, forward, al, tmp2); 2417 __ BIND(entry); // entry point 2418 __ subs(count, count, 1); 2419 load_one(tmp, from, bytes_per_count, forward, ge, tmp2); 2420 __ b(L_small_loop, ge); 2421 #endif // AARCH64 2422 } 2423 2424 // Aligns 'to' by reading one word from 'from' and writting its part to 'to'. 2425 // 2426 // Arguments: 2427 // to: beginning (if forward) or upper bound (if !forward) of the region to be written 2428 // count: 32-bit int, number of elements allowed to be copied 2429 // to_remainder: remainder of dividing 'to' by wordSize 2430 // bytes_per_count: size of an element 2431 // forward: specifies copy direction 2432 // Rval: contains an already read but not yet written word; 2433 // its' LSBs (if forward) or MSBs (if !forward) are to be written to align 'to'. 
2434 //
2435 // Notes:
2436 // 'count' must not be less than the returned value
2437 // 'to' must be aligned by bytes_per_count but must not be aligned by wordSize
2438 // shifts 'to' by the number of written bytes (so that it becomes the bound of memory to be written)
2439 // decreases 'count' by the number of elements written
2440 // Rval's MSBs or LSBs remain to be written further by generate_{forward,backward}_shifted_copy_loop
2441 int align_dst(Register to, Register count, Register Rval, Register tmp,
2442 int to_remainder, int bytes_per_count, bool forward) {
2443 assert_different_registers(to, count, tmp, Rval);
2444
2445 assert (0 < to_remainder && to_remainder < wordSize, "to_remainder is not valid");
2446 assert (to_remainder % bytes_per_count == 0, "to must be aligned by bytes_per_count");
2447
2448 int bytes_to_write = forward ? (wordSize - to_remainder) : to_remainder;
2449
2450 int offset = 0;
2451
2452 for (int l = 0; l < LogBytesPerWord; ++l) {
2453 int s = (1 << l);
2454 if (bytes_to_write & s) {
2455 int new_offset = offset + s*BitsPerByte;
2456 if (forward) {
2457 if (offset == 0) {
2458 store_one(Rval, to, s, forward);
2459 } else {
2460 __ logical_shift_right(tmp, Rval, offset);
2461 store_one(tmp, to, s, forward);
2462 }
2463 } else {
2464 __ logical_shift_right(tmp, Rval, BitsPerWord - new_offset);
2465 store_one(tmp, to, s, forward);
2466 }
2467
2468 offset = new_offset;
2469 }
2470 }
2471
2472 assert (offset == bytes_to_write * BitsPerByte, "all bytes must be copied");
2473
2474 __ sub_32(count, count, bytes_to_write/bytes_per_count);
2475
2476 return bytes_to_write / bytes_per_count;
2477 }
2478
2479 // Copies 'count' of elements using shifted copy loop
2480 //
2481 // Arguments:
2482 // from: beginning (if forward) or upper bound (if !forward) of the region to be read
2483 // to: beginning (if forward) or upper bound (if !forward) of the region to be written
2484 // count: 32-bit int, number of elements to be copied
2485 // to_remainder: remainder of dividing 'to' by wordSize
2486 // bytes_per_count: size of an element
2487 // forward: specifies copy direction
2488 // Rval: contains an already read but not yet written word
2489 //
2490 //
2491 // Notes:
2492 // 'count' must not be less than the returned value
2493 // 'from' must be aligned by wordSize
2494 // 'to' must be aligned by bytes_per_count but must not be aligned by wordSize
2495 // shifts 'to' by the number of copied bytes
2496 //
2497 // Scratches R3-R10, R12
2498 int align_dst_and_generate_shifted_copy_loop(Register from, Register to, Register count, Register Rval,
2499 int to_remainder, int bytes_per_count, bool forward) {
2500
2501 assert (0 < to_remainder && to_remainder < wordSize, "to_remainder is invalid");
2502
2503 const Register tmp = forward ?
R3 : R12; // TODO-AARCH64: on conjoint_short R4 was used for tmp
2504 assert_different_registers(from, to, count, Rval, tmp);
2505
2506 int required_to_align = align_dst(to, count, Rval, tmp, to_remainder, bytes_per_count, forward);
2507
2508 int lsr_shift = (wordSize - to_remainder) * BitsPerByte;
2509 int lsl_shift = to_remainder * BitsPerByte;
2510
2511 int min_copy;
2512 if (forward) {
2513 min_copy = generate_forward_shifted_copy_loop(from, to, count, bytes_per_count, lsr_shift, lsl_shift);
2514 } else {
2515 min_copy = generate_backward_shifted_copy_loop(from, to, count, bytes_per_count, lsr_shift, lsl_shift);
2516 }
2517
2518 return min_copy + required_to_align;
2519 }
2520
2521 // Copies 'count' of elements using shifted copy loop
2522 //
2523 // Arguments:
2524 // from: beginning (if forward) or upper bound (if !forward) of the region to be read
2525 // to: beginning (if forward) or upper bound (if !forward) of the region to be written
2526 // count: 32-bit int, number of elements to be copied
2527 // bytes_per_count: size of an element
2528 // forward: specifies copy direction
2529 //
2530 // Notes:
2531 // 'count' must not be less than the returned value
2532 // 'from' must be aligned by wordSize
2533 // 'to' must be aligned by bytes_per_count but must not be aligned by wordSize
2534 // shifts 'to' by the number of copied bytes
2535 //
2536 // Scratches 'from', 'count', R3 and R12.
2537 // On AArch64 also scratches R4-R10; on 32-bit ARM saves/restores them around their use.
2538 int align_dst_and_generate_shifted_copy_loop(Register from, Register to, Register count, int bytes_per_count, bool forward) {
2539
2540 const Register Rval = forward ? R12 : R3; // as generate_{forward,backward}_shifted_copy_loop expect
2541
2542 int min_copy = 0;
2543
2544 // Note: if {seq} is a sequence of numbers, L{seq} means that if the execution reaches this point,
2545 // then the remainder of 'to' divided by wordSize is one of the elements of {seq}.
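// (Informal sketch, not generated code: the label nest below just dispatches, at run time,
//  to one statically generated shifted loop per possible destination misalignment. For
//  instance, with bytes_per_count == 1 on AArch64 (wordSize == 8) it behaves roughly like
//      switch (to & (wordSize - 1)) {
//        case 1: min_copy1 = /* loop specialized for to_remainder == 1 */; break;
//        // ...
//        case 7: min_copy7 = /* loop specialized for to_remainder == 7 */; break;
//      }
//  implemented with tbz/tbnz tests on the low bits of 'to'; case 0 cannot occur because
//  'to' is known not to be word aligned here. The 32-bit ARM path below does the same
//  over 'to & 3'.)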
2546 2547 #ifdef AARCH64 2548 // TODO-AARCH64: simplify, tune 2549 2550 load_one(Rval, from, wordSize, forward); 2551 2552 Label L_loop_finished; 2553 2554 switch (bytes_per_count) { 2555 case 4: 2556 min_copy = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 4, bytes_per_count, forward); 2557 break; 2558 case 2: 2559 { 2560 Label L2, L4, L6; 2561 2562 __ tbz(to, 1, L4); 2563 __ tbz(to, 2, L2); 2564 2565 __ BIND(L6); 2566 int min_copy6 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 6, bytes_per_count, forward); 2567 __ b(L_loop_finished); 2568 2569 __ BIND(L2); 2570 int min_copy2 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 2, bytes_per_count, forward); 2571 __ b(L_loop_finished); 2572 2573 __ BIND(L4); 2574 int min_copy4 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 4, bytes_per_count, forward); 2575 2576 min_copy = MAX2(MAX2(min_copy2, min_copy4), min_copy6); 2577 break; 2578 } 2579 case 1: 2580 { 2581 Label L1, L2, L3, L4, L5, L6, L7; 2582 Label L15, L26; 2583 Label L246; 2584 2585 __ tbz(to, 0, L246); 2586 __ tbz(to, 1, L15); 2587 __ tbz(to, 2, L3); 2588 2589 __ BIND(L7); 2590 int min_copy7 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 7, bytes_per_count, forward); 2591 __ b(L_loop_finished); 2592 2593 __ BIND(L246); 2594 __ tbnz(to, 1, L26); 2595 2596 __ BIND(L4); 2597 int min_copy4 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 4, bytes_per_count, forward); 2598 __ b(L_loop_finished); 2599 2600 __ BIND(L15); 2601 __ tbz(to, 2, L1); 2602 2603 __ BIND(L5); 2604 int min_copy5 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 5, bytes_per_count, forward); 2605 __ b(L_loop_finished); 2606 2607 __ BIND(L3); 2608 int min_copy3 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 3, bytes_per_count, forward); 2609 __ b(L_loop_finished); 2610 2611 __ BIND(L26); 2612 __ tbz(to, 2, L2); 2613 2614 __ BIND(L6); 2615 int min_copy6 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 6, bytes_per_count, forward); 2616 __ b(L_loop_finished); 2617 2618 __ BIND(L1); 2619 int min_copy1 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 1, bytes_per_count, forward); 2620 __ b(L_loop_finished); 2621 2622 __ BIND(L2); 2623 int min_copy2 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 2, bytes_per_count, forward); 2624 2625 2626 min_copy = MAX2(min_copy1, min_copy2); 2627 min_copy = MAX2(min_copy, min_copy3); 2628 min_copy = MAX2(min_copy, min_copy4); 2629 min_copy = MAX2(min_copy, min_copy5); 2630 min_copy = MAX2(min_copy, min_copy6); 2631 min_copy = MAX2(min_copy, min_copy7); 2632 break; 2633 } 2634 default: 2635 ShouldNotReachHere(); 2636 break; 2637 } 2638 __ BIND(L_loop_finished); 2639 2640 #else 2641 __ push(RegisterSet(R4,R10)); 2642 load_one(Rval, from, wordSize, forward); 2643 2644 switch (bytes_per_count) { 2645 case 2: 2646 min_copy = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 2, bytes_per_count, forward); 2647 break; 2648 case 1: 2649 { 2650 Label L1, L2, L3; 2651 int min_copy1, min_copy2, min_copy3; 2652 2653 Label L_loop_finished; 2654 2655 if (forward) { 2656 __ tbz(to, 0, L2); 2657 __ tbz(to, 1, L1); 2658 2659 __ BIND(L3); 2660 min_copy3 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 3, bytes_per_count, forward); 2661 __ b(L_loop_finished); 2662 2663 __ BIND(L1); 2664 min_copy1 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 1, bytes_per_count, 
forward); 2665 __ b(L_loop_finished); 2666 2667 __ BIND(L2); 2668 min_copy2 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 2, bytes_per_count, forward); 2669 } else { 2670 __ tbz(to, 0, L2); 2671 __ tbnz(to, 1, L3); 2672 2673 __ BIND(L1); 2674 min_copy1 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 1, bytes_per_count, forward); 2675 __ b(L_loop_finished); 2676 2677 __ BIND(L3); 2678 min_copy3 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 3, bytes_per_count, forward); 2679 __ b(L_loop_finished); 2680 2681 __ BIND(L2); 2682 min_copy2 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 2, bytes_per_count, forward); 2683 } 2684 2685 min_copy = MAX2(MAX2(min_copy1, min_copy2), min_copy3); 2686 2687 __ BIND(L_loop_finished); 2688 2689 break; 2690 } 2691 default: 2692 ShouldNotReachHere(); 2693 break; 2694 } 2695 2696 __ pop(RegisterSet(R4,R10)); 2697 #endif // AARCH64 2698 2699 return min_copy; 2700 } 2701 2702 #ifndef PRODUCT 2703 int * get_arraycopy_counter(int bytes_per_count) { 2704 switch (bytes_per_count) { 2705 case 1: 2706 return &SharedRuntime::_jbyte_array_copy_ctr; 2707 case 2: 2708 return &SharedRuntime::_jshort_array_copy_ctr; 2709 case 4: 2710 return &SharedRuntime::_jint_array_copy_ctr; 2711 case 8: 2712 return &SharedRuntime::_jlong_array_copy_ctr; 2713 default: 2714 ShouldNotReachHere(); 2715 return NULL; 2716 } 2717 } 2718 #endif // !PRODUCT 2719 2720 // 2721 // Generate stub for primitive array copy. If "aligned" is true, the 2722 // "from" and "to" addresses are assumed to be heapword aligned. 2723 // 2724 // If "disjoint" is true, arrays are assumed to be disjoint, otherwise they may overlap and 2725 // "nooverlap_target" must be specified as the address to jump if they don't. 2726 // 2727 // Arguments for generated stub: 2728 // from: R0 2729 // to: R1 2730 // count: R2 treated as signed 32-bit int 2731 // 2732 address generate_primitive_copy(bool aligned, const char * name, bool status, int bytes_per_count, bool disjoint, address nooverlap_target = NULL) { 2733 __ align(CodeEntryAlignment); 2734 StubCodeMark mark(this, "StubRoutines", name); 2735 address start = __ pc(); 2736 2737 const Register from = R0; // source array address 2738 const Register to = R1; // destination array address 2739 const Register count = R2; // elements count 2740 const Register tmp1 = R3; 2741 const Register tmp2 = R12; 2742 2743 if (!aligned) { 2744 BLOCK_COMMENT("Entry:"); 2745 } 2746 2747 __ zap_high_non_significant_bits(R2); 2748 2749 if (!disjoint) { 2750 assert (nooverlap_target != NULL, "must be specified for conjoint case"); 2751 array_overlap_test(nooverlap_target, exact_log2(bytes_per_count), tmp1, tmp2); 2752 } 2753 2754 inc_counter_np(*get_arraycopy_counter(bytes_per_count), tmp1, tmp2); 2755 2756 // Conjoint case: since execution reaches this point, the arrays overlap, so performing backward copy 2757 // Disjoint case: perform forward copy 2758 bool forward = disjoint; 2759 2760 2761 if (!forward) { 2762 // Set 'from' and 'to' to upper bounds 2763 int log_bytes_per_count = exact_log2(bytes_per_count); 2764 __ add_ptr_scaled_int32(to, to, count, log_bytes_per_count); 2765 __ add_ptr_scaled_int32(from, from, count, log_bytes_per_count); 2766 } 2767 2768 // There are two main copy loop implementations: 2769 // *) The huge and complex one applicable only for large enough arrays 2770 // *) The small and simple one applicable for any array (but not efficient for large arrays). 
2771 // Currently "small" implementation is used if and only if the "large" one could not be used. 2772 // XXX optim: tune the limit higher ? 2773 // Large implementation lower applicability bound is actually determined by 2774 // aligned copy loop which require <=7 bytes for src alignment, and 8 words for aligned copy loop. 2775 const int small_copy_limit = (8*wordSize + 7) / bytes_per_count; 2776 2777 Label L_small_array; 2778 __ cmp_32(count, small_copy_limit); 2779 __ b(L_small_array, le); // TODO-AARCH64: le vs lt 2780 2781 // Otherwise proceed with large implementation. 2782 2783 bool from_is_aligned = (bytes_per_count >= 8); 2784 if (aligned && forward && (HeapWordSize % 8 == 0)) { 2785 // if 'from' is heapword aligned and HeapWordSize is divisible by 8, 2786 // then from is aligned by 8 2787 from_is_aligned = true; 2788 } 2789 2790 int count_required_to_align = from_is_aligned ? 0 : align_src(from, to, count, tmp1, bytes_per_count, forward); 2791 assert (small_copy_limit >= count_required_to_align, "alignment could exhaust count"); 2792 2793 // now 'from' is aligned 2794 2795 bool to_is_aligned = false; 2796 2797 if (bytes_per_count >= wordSize) { 2798 // 'to' is aligned by bytes_per_count, so it is aligned by wordSize 2799 to_is_aligned = true; 2800 } else { 2801 if (aligned && (8 % HeapWordSize == 0) && (HeapWordSize % wordSize == 0)) { 2802 // Originally 'from' and 'to' were heapword aligned; 2803 // (from - to) has not been changed, so since now 'from' is 8-byte aligned, then it is also heapword aligned, 2804 // so 'to' is also heapword aligned and thus aligned by wordSize. 2805 to_is_aligned = true; 2806 } 2807 } 2808 2809 Label L_unaligned_dst; 2810 2811 if (!to_is_aligned) { 2812 BLOCK_COMMENT("Check dst alignment:"); 2813 __ tst(to, wordSize - 1); 2814 __ b(L_unaligned_dst, ne); // 'to' is not aligned 2815 } 2816 2817 // 'from' and 'to' are properly aligned 2818 2819 int min_copy; 2820 if (forward) { 2821 min_copy = generate_forward_aligned_copy_loop (from, to, count, bytes_per_count); 2822 } else { 2823 min_copy = generate_backward_aligned_copy_loop(from, to, count, bytes_per_count); 2824 } 2825 assert(small_copy_limit >= count_required_to_align + min_copy, "first loop might exhaust count"); 2826 2827 if (status) { 2828 __ mov(R0, 0); // OK 2829 } 2830 2831 __ ret(); 2832 2833 { 2834 copy_small_array(from, to, count, tmp1, tmp2, bytes_per_count, forward, L_small_array /* entry */); 2835 2836 if (status) { 2837 __ mov(R0, 0); // OK 2838 } 2839 2840 __ ret(); 2841 } 2842 2843 if (! to_is_aligned) { 2844 __ BIND(L_unaligned_dst); 2845 int min_copy_shifted = align_dst_and_generate_shifted_copy_loop(from, to, count, bytes_per_count, forward); 2846 assert (small_copy_limit >= count_required_to_align + min_copy_shifted, "first loop might exhaust count"); 2847 2848 if (status) { 2849 __ mov(R0, 0); // OK 2850 } 2851 2852 __ ret(); 2853 } 2854 2855 return start; 2856 } 2857 2858 #if INCLUDE_ALL_GCS 2859 // 2860 // Generate pre-write barrier for array. 2861 // 2862 // Input: 2863 // addr - register containing starting address 2864 // count - register containing element count, 32-bit int 2865 // callee_saved_regs - 2866 // the call must preserve this number of registers: R0, R1, ..., R[callee_saved_regs-1] 2867 // 2868 // callee_saved_regs must include addr and count 2869 // Blows all volatile registers (R0-R3 on 32-bit ARM, R0-R18 on AArch64, Rtemp, LR) except for callee_saved_regs. 
2870 void gen_write_ref_array_pre_barrier(Register addr, Register count, int callee_saved_regs) { 2871 BarrierSet* bs = Universe::heap()->barrier_set(); 2872 switch (bs->kind()) { 2873 case BarrierSet::G1BarrierSet: 2874 { 2875 assert( addr->encoding() < callee_saved_regs, "addr must be saved"); 2876 assert(count->encoding() < callee_saved_regs, "count must be saved"); 2877 2878 BLOCK_COMMENT("PreBarrier"); 2879 2880 #ifdef AARCH64 2881 callee_saved_regs = align_up(callee_saved_regs, 2); 2882 for (int i = 0; i < callee_saved_regs; i += 2) { 2883 __ raw_push(as_Register(i), as_Register(i+1)); 2884 } 2885 #else 2886 RegisterSet saved_regs = RegisterSet(R0, as_Register(callee_saved_regs-1)); 2887 __ push(saved_regs | R9ifScratched); 2888 #endif // AARCH64 2889 2890 if (addr != R0) { 2891 assert_different_registers(count, R0); 2892 __ mov(R0, addr); 2893 } 2894 #ifdef AARCH64 2895 __ zero_extend(R1, count, 32); // BarrierSet::static_write_ref_array_pre takes size_t 2896 #else 2897 if (count != R1) { 2898 __ mov(R1, count); 2899 } 2900 #endif // AARCH64 2901 2902 __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre)); 2903 2904 #ifdef AARCH64 2905 for (int i = callee_saved_regs - 2; i >= 0; i -= 2) { 2906 __ raw_pop(as_Register(i), as_Register(i+1)); 2907 } 2908 #else 2909 __ pop(saved_regs | R9ifScratched); 2910 #endif // AARCH64 2911 } 2912 case BarrierSet::CardTableBarrierSet: 2913 break; 2914 default: 2915 ShouldNotReachHere(); 2916 } 2917 } 2918 #endif // INCLUDE_ALL_GCS 2919 2920 // 2921 // Generate post-write barrier for array. 2922 // 2923 // Input: 2924 // addr - register containing starting address (can be scratched) 2925 // count - register containing element count, 32-bit int (can be scratched) 2926 // tmp - scratch register 2927 // 2928 // Note: LR can be scratched but might be equal to addr, count or tmp 2929 // Blows all volatile registers (R0-R3 on 32-bit ARM, R0-R18 on AArch64, Rtemp, LR). 2930 void gen_write_ref_array_post_barrier(Register addr, Register count, Register tmp) { 2931 assert_different_registers(addr, count, tmp); 2932 BarrierSet* bs = Universe::heap()->barrier_set(); 2933 2934 switch (bs->kind()) { 2935 case BarrierSet::G1BarrierSet: 2936 { 2937 BLOCK_COMMENT("G1PostBarrier"); 2938 if (addr != R0) { 2939 assert_different_registers(count, R0); 2940 __ mov(R0, addr); 2941 } 2942 #ifdef AARCH64 2943 __ zero_extend(R1, count, 32); // BarrierSet::static_write_ref_array_post takes size_t 2944 #else 2945 if (count != R1) { 2946 __ mov(R1, count); 2947 } 2948 #if R9_IS_SCRATCHED 2949 // Safer to save R9 here since callers may have been written 2950 // assuming R9 survives. This is suboptimal but is not in 2951 // general worth optimizing for the few platforms where R9 2952 // is scratched. Note that the optimization might not be to 2953 // difficult for this particular call site. 
2954 __ push(R9); 2955 #endif 2956 #endif // !AARCH64 2957 __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post)); 2958 #ifndef AARCH64 2959 #if R9_IS_SCRATCHED 2960 __ pop(R9); 2961 #endif 2962 #endif // !AARCH64 2963 } 2964 break; 2965 case BarrierSet::CardTableBarrierSet: 2966 { 2967 BLOCK_COMMENT("CardTablePostBarrier"); 2968 CardTableBarrierSet* ctbs = barrier_set_cast<CardTableBarrierSet>(bs); 2969 CardTable* ct = ctbs->card_table(); 2970 assert(sizeof(*ct->byte_map_base()) == sizeof(jbyte), "adjust this code"); 2971 2972 Label L_cardtable_loop, L_done; 2973 2974 __ cbz_32(count, L_done); // zero count - nothing to do 2975 2976 __ add_ptr_scaled_int32(count, addr, count, LogBytesPerHeapOop); 2977 __ sub(count, count, BytesPerHeapOop); // last addr 2978 2979 __ logical_shift_right(addr, addr, CardTable::card_shift); 2980 __ logical_shift_right(count, count, CardTable::card_shift); 2981 __ sub(count, count, addr); // nb of cards 2982 2983 // warning: Rthread has not been preserved 2984 __ mov_address(tmp, (address) ct->byte_map_base(), symbolic_Relocation::card_table_reference); 2985 __ add(addr,tmp, addr); 2986 2987 Register zero = __ zero_register(tmp); 2988 2989 __ BIND(L_cardtable_loop); 2990 __ strb(zero, Address(addr, 1, post_indexed)); 2991 __ subs(count, count, 1); 2992 __ b(L_cardtable_loop, ge); 2993 __ BIND(L_done); 2994 } 2995 break; 2996 default: 2997 ShouldNotReachHere(); 2998 } 2999 } 3000 3001 // Generates pattern of code to be placed after raw data copying in generate_oop_copy 3002 // Includes return from arraycopy stub. 3003 // 3004 // Arguments: 3005 // to: destination pointer after copying. 3006 // if 'forward' then 'to' == upper bound, else 'to' == beginning of the modified region 3007 // count: total number of copied elements, 32-bit int 3008 // 3009 // Blows all volatile (R0-R3 on 32-bit ARM, R0-R18 on AArch64, Rtemp, LR) and 'to', 'count', 'tmp' registers. 3010 void oop_arraycopy_stub_epilogue_helper(Register to, Register count, Register tmp, bool status, bool forward) { 3011 assert_different_registers(to, count, tmp); 3012 3013 if (forward) { 3014 // 'to' is upper bound of the modified region 3015 // restore initial dst: 3016 __ sub_ptr_scaled_int32(to, to, count, LogBytesPerHeapOop); 3017 } 3018 3019 // 'to' is the beginning of the region 3020 3021 gen_write_ref_array_post_barrier(to, count, tmp); 3022 3023 if (status) { 3024 __ mov(R0, 0); // OK 3025 } 3026 3027 #ifdef AARCH64 3028 __ raw_pop(LR, ZR); 3029 __ ret(); 3030 #else 3031 __ pop(PC); 3032 #endif // AARCH64 3033 } 3034 3035 3036 // Generate stub for assign-compatible oop copy. If "aligned" is true, the 3037 // "from" and "to" addresses are assumed to be heapword aligned. 3038 // 3039 // If "disjoint" is true, arrays are assumed to be disjoint, otherwise they may overlap and 3040 // "nooverlap_target" must be specified as the address to jump if they don't. 
3041 // 3042 // Arguments for generated stub: 3043 // from: R0 3044 // to: R1 3045 // count: R2 treated as signed 32-bit int 3046 // 3047 address generate_oop_copy(bool aligned, const char * name, bool status, bool disjoint, address nooverlap_target = NULL) { 3048 __ align(CodeEntryAlignment); 3049 StubCodeMark mark(this, "StubRoutines", name); 3050 address start = __ pc(); 3051 3052 Register from = R0; 3053 Register to = R1; 3054 Register count = R2; 3055 Register tmp1 = R3; 3056 Register tmp2 = R12; 3057 3058 3059 if (!aligned) { 3060 BLOCK_COMMENT("Entry:"); 3061 } 3062 3063 __ zap_high_non_significant_bits(R2); 3064 3065 if (!disjoint) { 3066 assert (nooverlap_target != NULL, "must be specified for conjoint case"); 3067 array_overlap_test(nooverlap_target, LogBytesPerHeapOop, tmp1, tmp2); 3068 } 3069 3070 inc_counter_np(SharedRuntime::_oop_array_copy_ctr, tmp1, tmp2); 3071 3072 // Conjoint case: since execution reaches this point, the arrays overlap, so performing backward copy 3073 // Disjoint case: perform forward copy 3074 bool forward = disjoint; 3075 3076 const int bytes_per_count = BytesPerHeapOop; 3077 const int log_bytes_per_count = LogBytesPerHeapOop; 3078 3079 const Register saved_count = LR; 3080 const int callee_saved_regs = 3; // R0-R2 3081 3082 // LR is used later to save barrier args 3083 #ifdef AARCH64 3084 __ raw_push(LR, ZR); 3085 #else 3086 __ push(LR); 3087 #endif // AARCH64 3088 3089 #if INCLUDE_ALL_GCS 3090 gen_write_ref_array_pre_barrier(to, count, callee_saved_regs); 3091 #endif // INCLUDE_ALL_GCS 3092 3093 // save arguments for barrier generation (after the pre barrier) 3094 __ mov(saved_count, count); 3095 3096 if (!forward) { 3097 __ add_ptr_scaled_int32(to, to, count, log_bytes_per_count); 3098 __ add_ptr_scaled_int32(from, from, count, log_bytes_per_count); 3099 } 3100 3101 // for short arrays, just do single element copy 3102 Label L_small_array; 3103 const int small_copy_limit = (8*wordSize + 7)/bytes_per_count; // XXX optim: tune the limit higher ? 3104 __ cmp_32(count, small_copy_limit); 3105 __ b(L_small_array, le); 3106 3107 bool from_is_aligned = (bytes_per_count >= 8); 3108 if (aligned && forward && (HeapWordSize % 8 == 0)) { 3109 // if 'from' is heapword aligned and HeapWordSize is divisible by 8, 3110 // then from is aligned by 8 3111 from_is_aligned = true; 3112 } 3113 3114 int count_required_to_align = from_is_aligned ? 0 : align_src(from, to, count, tmp1, bytes_per_count, forward); 3115 assert (small_copy_limit >= count_required_to_align, "alignment could exhaust count"); 3116 3117 // now 'from' is aligned 3118 3119 bool to_is_aligned = false; 3120 3121 if (bytes_per_count >= wordSize) { 3122 // 'to' is aligned by bytes_per_count, so it is aligned by wordSize 3123 to_is_aligned = true; 3124 } else { 3125 if (aligned && (8 % HeapWordSize == 0) && (HeapWordSize % wordSize == 0)) { 3126 // Originally 'from' and 'to' were heapword aligned; 3127 // (from - to) has not been changed, so since now 'from' is 8-byte aligned, then it is also heapword aligned, 3128 // so 'to' is also heapword aligned and thus aligned by wordSize. 
3129 to_is_aligned = true; 3130 } 3131 } 3132 3133 Label L_unaligned_dst; 3134 3135 if (!to_is_aligned) { 3136 BLOCK_COMMENT("Check dst alignment:"); 3137 __ tst(to, wordSize - 1); 3138 __ b(L_unaligned_dst, ne); // 'to' is not aligned 3139 } 3140 3141 int min_copy; 3142 if (forward) { 3143 min_copy = generate_forward_aligned_copy_loop(from, to, count, bytes_per_count); 3144 } else { 3145 min_copy = generate_backward_aligned_copy_loop(from, to, count, bytes_per_count); 3146 } 3147 assert(small_copy_limit >= count_required_to_align + min_copy, "first loop might exhaust count"); 3148 3149 oop_arraycopy_stub_epilogue_helper(to, saved_count, /* tmp */ tmp1, status, forward); 3150 3151 { 3152 copy_small_array(from, to, count, tmp1, noreg, bytes_per_count, forward, L_small_array); 3153 3154 oop_arraycopy_stub_epilogue_helper(to, saved_count, /* tmp */ tmp1, status, forward); 3155 } 3156 3157 if (!to_is_aligned) { 3158 // !to_is_aligned <=> UseCompressedOops && AArch64 3159 __ BIND(L_unaligned_dst); 3160 #ifdef AARCH64 3161 assert (UseCompressedOops, "unaligned oop array copy may be requested only with UseCompressedOops"); 3162 #else 3163 ShouldNotReachHere(); 3164 #endif // AARCH64 3165 int min_copy_shifted = align_dst_and_generate_shifted_copy_loop(from, to, count, bytes_per_count, forward); 3166 assert (small_copy_limit >= count_required_to_align + min_copy_shifted, "first loop might exhaust count"); 3167 3168 oop_arraycopy_stub_epilogue_helper(to, saved_count, /* tmp */ tmp1, status, forward); 3169 } 3170 3171 return start; 3172 } 3173 3174 // Generate 'unsafe' array copy stub 3175 // Though just as safe as the other stubs, it takes an unscaled 3176 // size_t argument instead of an element count. 3177 // 3178 // Arguments for generated stub: 3179 // from: R0 3180 // to: R1 3181 // count: R2 byte count, treated as ssize_t, can be zero 3182 // 3183 // Examines the alignment of the operands and dispatches 3184 // to a long, int, short, or byte copy loop. 3185 // 3186 address generate_unsafe_copy(const char* name) { 3187 3188 const Register R0_from = R0; // source array address 3189 const Register R1_to = R1; // destination array address 3190 const Register R2_count = R2; // elements count 3191 3192 const Register R3_bits = R3; // test copy of low bits 3193 3194 __ align(CodeEntryAlignment); 3195 StubCodeMark mark(this, "StubRoutines", name); 3196 address start = __ pc(); 3197 #ifdef AARCH64 3198 __ NOT_IMPLEMENTED(); 3199 start = NULL; 3200 #else 3201 const Register tmp = Rtemp; 3202 3203 // bump this on entry, not on exit: 3204 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr, R3, tmp); 3205 3206 __ orr(R3_bits, R0_from, R1_to); 3207 __ orr(R3_bits, R2_count, R3_bits); 3208 3209 __ tst(R3_bits, BytesPerLong-1); 3210 __ mov(R2_count,AsmOperand(R2_count,asr,LogBytesPerLong), eq); 3211 __ jump(StubRoutines::_jlong_arraycopy, relocInfo::runtime_call_type, tmp, eq); 3212 3213 __ tst(R3_bits, BytesPerInt-1); 3214 __ mov(R2_count,AsmOperand(R2_count,asr,LogBytesPerInt), eq); 3215 __ jump(StubRoutines::_jint_arraycopy, relocInfo::runtime_call_type, tmp, eq); 3216 3217 __ tst(R3_bits, BytesPerShort-1); 3218 __ mov(R2_count,AsmOperand(R2_count,asr,LogBytesPerShort), eq); 3219 __ jump(StubRoutines::_jshort_arraycopy, relocInfo::runtime_call_type, tmp, eq); 3220 3221 __ jump(StubRoutines::_jbyte_arraycopy, relocInfo::runtime_call_type, tmp); 3222 #endif 3223 return start; 3224 } 3225 3226 // Helper for generating a dynamic type check. 3227 // Smashes only the given temp registers. 
3228 void generate_type_check(Register sub_klass, 3229 Register super_check_offset, 3230 Register super_klass, 3231 Register tmp1, 3232 Register tmp2, 3233 Register tmp3, 3234 Label& L_success) { 3235 assert_different_registers(sub_klass, super_check_offset, super_klass, tmp1, tmp2, tmp3); 3236 3237 BLOCK_COMMENT("type_check:"); 3238 3239 // If the pointers are equal, we are done (e.g., String[] elements). 3240 3241 __ cmp(super_klass, sub_klass); 3242 __ b(L_success, eq); // fast success 3243 3244 3245 Label L_loop, L_fail; 3246 3247 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 3248 3249 // Check the supertype display: 3250 __ ldr(tmp1, Address(sub_klass, super_check_offset)); 3251 __ cmp(tmp1, super_klass); 3252 __ b(L_success, eq); 3253 3254 __ cmp(super_check_offset, sc_offset); 3255 __ b(L_fail, ne); // failure 3256 3257 BLOCK_COMMENT("type_check_slow_path:"); 3258 3259 // a couple of useful fields in sub_klass: 3260 int ss_offset = in_bytes(Klass::secondary_supers_offset()); 3261 3262 // Do a linear scan of the secondary super-klass chain. 3263 3264 #ifndef PRODUCT 3265 int* pst_counter = &SharedRuntime::_partial_subtype_ctr; 3266 __ inc_counter((address) pst_counter, tmp1, tmp2); 3267 #endif 3268 3269 Register scan_temp = tmp1; 3270 Register count_temp = tmp2; 3271 3272 // We will consult the secondary-super array. 3273 __ ldr(scan_temp, Address(sub_klass, ss_offset)); 3274 3275 Register search_key = super_klass; 3276 3277 // Load the array length. 3278 __ ldr_s32(count_temp, Address(scan_temp, Array<Klass*>::length_offset_in_bytes())); 3279 __ add(scan_temp, scan_temp, Array<Klass*>::base_offset_in_bytes()); 3280 3281 __ add(count_temp, count_temp, 1); 3282 3283 // Top of search loop 3284 __ bind(L_loop); 3285 // Notes: 3286 // scan_temp starts at the array elements 3287 // count_temp is 1+size 3288 3289 __ subs(count_temp, count_temp, 1); 3290 __ b(L_fail, eq); // not found 3291 3292 // Load next super to check 3293 // In the array of super classes elements are pointer sized. 3294 int element_size = wordSize; 3295 __ ldr(tmp3, Address(scan_temp, element_size, post_indexed)); 3296 3297 // Look for Rsuper_klass on Rsub_klass's secondary super-class-overflow list 3298 __ cmp(tmp3, search_key); 3299 3300 // A miss means we are NOT a subtype and need to keep looping 3301 __ b(L_loop, ne); 3302 3303 // Falling out the bottom means we found a hit; we ARE a subtype 3304 3305 // Success. Cache the super we found and proceed in triumph. 3306 __ str(super_klass, Address(sub_klass, sc_offset)); 3307 3308 // Jump to success 3309 __ b(L_success); 3310 3311 // Fall through on failure! 3312 __ bind(L_fail); 3313 } 3314 3315 // Generate stub for checked oop copy. 
3316 // 3317 // Arguments for generated stub: 3318 // from: R0 3319 // to: R1 3320 // count: R2 treated as signed 32-bit int 3321 // ckoff: R3 (super_check_offset) 3322 // ckval: R4 (AArch64) / SP[0] (32-bit ARM) (super_klass) 3323 // ret: R0 zero for success; (-1^K) where K is partial transfer count (32-bit) 3324 // 3325 address generate_checkcast_copy(const char * name) { 3326 __ align(CodeEntryAlignment); 3327 StubCodeMark mark(this, "StubRoutines", name); 3328 address start = __ pc(); 3329 3330 const Register from = R0; // source array address 3331 const Register to = R1; // destination array address 3332 const Register count = R2; // elements count 3333 3334 const Register R3_ckoff = R3; // super_check_offset 3335 const Register R4_ckval = R4; // super_klass 3336 3337 const int callee_saved_regs = AARCH64_ONLY(5) NOT_AARCH64(4); // LR saved differently 3338 3339 Label load_element, store_element, do_card_marks, fail; 3340 3341 BLOCK_COMMENT("Entry:"); 3342 3343 __ zap_high_non_significant_bits(R2); 3344 3345 #ifdef AARCH64 3346 __ raw_push(LR, ZR); 3347 __ raw_push(R19, R20); 3348 #else 3349 int pushed = 0; 3350 __ push(LR); 3351 pushed+=1; 3352 #endif // AARCH64 3353 3354 #if INCLUDE_ALL_GCS 3355 gen_write_ref_array_pre_barrier(to, count, callee_saved_regs); 3356 #endif // INCLUDE_ALL_GCS 3357 3358 #ifndef AARCH64 3359 const RegisterSet caller_saved_regs = RegisterSet(R4,R6) | RegisterSet(R8,R9) | altFP_7_11; 3360 __ push(caller_saved_regs); 3361 assert(caller_saved_regs.size() == 6, "check the count"); 3362 pushed+=6; 3363 3364 __ ldr(R4_ckval,Address(SP, wordSize*pushed)); // read the argument that was on the stack 3365 #endif // !AARCH64 3366 3367 // Save arguments for barrier generation (after the pre barrier): 3368 // - must be a caller saved register and not LR 3369 // - ARM32: avoid R10 in case RThread is needed 3370 const Register saved_count = AARCH64_ONLY(R19) NOT_AARCH64(altFP_7_11); 3371 #ifdef AARCH64 3372 __ mov_w(saved_count, count); 3373 __ cbnz_w(count, load_element); // and test count 3374 #else 3375 __ movs(saved_count, count); // and test count 3376 __ b(load_element,ne); 3377 #endif // AARCH64 3378 3379 // nothing to copy 3380 __ mov(R0, 0); 3381 3382 #ifdef AARCH64 3383 __ raw_pop(R19, R20); 3384 __ raw_pop(LR, ZR); 3385 __ ret(); 3386 #else 3387 __ pop(caller_saved_regs); 3388 __ pop(PC); 3389 #endif // AARCH64 3390 3391 // ======== begin loop ======== 3392 // (Loop is rotated; its entry is load_element.) 3393 __ align(OptoLoopAlignment); 3394 __ BIND(store_element); 3395 if (UseCompressedOops) { 3396 __ store_heap_oop(R5, Address(to, BytesPerHeapOop, post_indexed)); // store the oop, changes flags 3397 __ subs_32(count,count,1); 3398 } else { 3399 __ subs_32(count,count,1); 3400 __ str(R5, Address(to, BytesPerHeapOop, post_indexed)); // store the oop 3401 } 3402 __ b(do_card_marks, eq); // count exhausted 3403 3404 // ======== loop entry is here ======== 3405 __ BIND(load_element); 3406 __ load_heap_oop(R5, Address(from, BytesPerHeapOop, post_indexed)); // load the oop 3407 __ cbz(R5, store_element); // NULL 3408 3409 __ load_klass(R6, R5); 3410 3411 generate_type_check(R6, R3_ckoff, R4_ckval, /*tmps*/ R12, R8, R9, 3412 // branch to this on success: 3413 store_element); 3414 // ======== end loop ======== 3415 3416 // It was a real error; we must depend on the caller to finish the job. 3417 // Register count has number of *remaining* oops, saved_count number of *total* oops. 
3418 // Emit GC store barriers for the oops we have copied 3419 // and report their number to the caller (0 or (-1^n)) 3420 __ BIND(fail); 3421 3422 // Note: fail marked by the fact that count differs from saved_count 3423 3424 __ BIND(do_card_marks); 3425 3426 Register copied = AARCH64_ONLY(R20) NOT_AARCH64(R4); // saved 3427 Label L_not_copied; 3428 3429 __ subs_32(copied, saved_count, count); // copied count (in saved reg) 3430 __ b(L_not_copied, eq); // nothing was copied, skip post barrier 3431 __ sub(to, to, AsmOperand(copied, lsl, LogBytesPerHeapOop)); // initial to value 3432 __ mov(R12, copied); // count arg scratched by post barrier 3433 3434 gen_write_ref_array_post_barrier(to, R12, R3); 3435 3436 assert_different_registers(R3,R12,LR,copied,saved_count); 3437 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, R3, R12); 3438 3439 __ BIND(L_not_copied); 3440 __ cmp_32(copied, saved_count); // values preserved in saved registers 3441 3442 #ifdef AARCH64 3443 __ csinv(R0, ZR, copied, eq); // 0 if all copied else NOT(copied) 3444 __ raw_pop(R19, R20); 3445 __ raw_pop(LR, ZR); 3446 __ ret(); 3447 #else 3448 __ mov(R0, 0, eq); // 0 if all copied 3449 __ mvn(R0, copied, ne); // else NOT(copied) 3450 __ pop(caller_saved_regs); 3451 __ pop(PC); 3452 #endif // AARCH64 3453 3454 return start; 3455 } 3456 3457 // Perform range checks on the proposed arraycopy. 3458 // Kills the two temps, but nothing else. 3459 void arraycopy_range_checks(Register src, // source array oop 3460 Register src_pos, // source position (32-bit int) 3461 Register dst, // destination array oop 3462 Register dst_pos, // destination position (32-bit int) 3463 Register length, // length of copy (32-bit int) 3464 Register temp1, Register temp2, 3465 Label& L_failed) { 3466 3467 BLOCK_COMMENT("arraycopy_range_checks:"); 3468 3469 // if (src_pos + length > arrayOop(src)->length() ) FAIL; 3470 3471 const Register array_length = temp1; // scratch 3472 const Register end_pos = temp2; // scratch 3473 3474 __ add_32(end_pos, length, src_pos); // src_pos + length 3475 __ ldr_s32(array_length, Address(src, arrayOopDesc::length_offset_in_bytes())); 3476 __ cmp_32(end_pos, array_length); 3477 __ b(L_failed, hi); 3478 3479 // if (dst_pos + length > arrayOop(dst)->length() ) FAIL; 3480 __ add_32(end_pos, length, dst_pos); // dst_pos + length 3481 __ ldr_s32(array_length, Address(dst, arrayOopDesc::length_offset_in_bytes())); 3482 __ cmp_32(end_pos, array_length); 3483 __ b(L_failed, hi); 3484 3485 BLOCK_COMMENT("arraycopy_range_checks done"); 3486 } 3487 3488 // 3489 // Generate generic array copy stubs 3490 // 3491 // Input: 3492 // R0 - src oop 3493 // R1 - src_pos (32-bit int) 3494 // R2 - dst oop 3495 // R3 - dst_pos (32-bit int) 3496 // R4 (AArch64) / SP[0] (32-bit ARM) - element count (32-bit int) 3497 // 3498 // Output: (32-bit int) 3499 // R0 == 0 - success 3500 // R0 < 0 - need to call System.arraycopy 3501 // 3502 address generate_generic_copy(const char *name) { 3503 Label L_failed, L_objArray; 3504 3505 // Input registers 3506 const Register src = R0; // source array oop 3507 const Register src_pos = R1; // source position 3508 const Register dst = R2; // destination array oop 3509 const Register dst_pos = R3; // destination position 3510 3511 // registers used as temp 3512 const Register R5_src_klass = R5; // source array klass 3513 const Register R6_dst_klass = R6; // destination array klass 3514 const Register R_lh = AARCH64_ONLY(R7) NOT_AARCH64(altFP_7_11); // layout handler 3515 const Register R8_temp = R8; 
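// Informal note: a compiled caller is expected to test the returned value
// and fall back to the ordinary System.arraycopy slow path whenever it is
// negative; only a zero result means the copy has been fully performed,
// e.g. roughly:
//   if (generic_arraycopy(src, src_pos, dst, dst_pos, length) != 0) {
//     // take the slow path (System.arraycopy / runtime call)
//   }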
3516 3517 __ align(CodeEntryAlignment); 3518 StubCodeMark mark(this, "StubRoutines", name); 3519 address start = __ pc(); 3520 3521 __ zap_high_non_significant_bits(R1); 3522 __ zap_high_non_significant_bits(R3); 3523 __ zap_high_non_significant_bits(R4); 3524 3525 #ifndef AARCH64 3526 int pushed = 0; 3527 const RegisterSet saved_regs = RegisterSet(R4,R6) | RegisterSet(R8,R9) | altFP_7_11; 3528 __ push(saved_regs); 3529 assert(saved_regs.size() == 6, "check the count"); 3530 pushed+=6; 3531 #endif // !AARCH64 3532 3533 // bump this on entry, not on exit: 3534 inc_counter_np(SharedRuntime::_generic_array_copy_ctr, R5, R12); 3535 3536 const Register length = R4; // elements count 3537 #ifndef AARCH64 3538 __ ldr(length, Address(SP,4*pushed)); 3539 #endif // !AARCH64 3540 3541 3542 //----------------------------------------------------------------------- 3543 // Assembler stubs will be used for this call to arraycopy 3544 // if the following conditions are met: 3545 // 3546 // (1) src and dst must not be null. 3547 // (2) src_pos must not be negative. 3548 // (3) dst_pos must not be negative. 3549 // (4) length must not be negative. 3550 // (5) src klass and dst klass should be the same and not NULL. 3551 // (6) src and dst should be arrays. 3552 // (7) src_pos + length must not exceed length of src. 3553 // (8) dst_pos + length must not exceed length of dst. 3554 BLOCK_COMMENT("arraycopy initial argument checks"); 3555 3556 // if (src == NULL) return -1; 3557 __ cbz(src, L_failed); 3558 3559 // if (src_pos < 0) return -1; 3560 __ cmp_32(src_pos, 0); 3561 __ b(L_failed, lt); 3562 3563 // if (dst == NULL) return -1; 3564 __ cbz(dst, L_failed); 3565 3566 // if (dst_pos < 0) return -1; 3567 __ cmp_32(dst_pos, 0); 3568 __ b(L_failed, lt); 3569 3570 // if (length < 0) return -1; 3571 __ cmp_32(length, 0); 3572 __ b(L_failed, lt); 3573 3574 BLOCK_COMMENT("arraycopy argument klass checks"); 3575 // get src->klass() 3576 __ load_klass(R5_src_klass, src); 3577 3578 // Load layout helper 3579 // 3580 // |array_tag| | header_size | element_type | |log2_element_size| 3581 // 32 30 24 16 8 2 0 3582 // 3583 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 3584 // 3585 3586 int lh_offset = in_bytes(Klass::layout_helper_offset()); 3587 __ ldr_u32(R_lh, Address(R5_src_klass, lh_offset)); 3588 3589 __ load_klass(R6_dst_klass, dst); 3590 3591 // Handle objArrays completely differently... 
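// Informal summary: Klass::array_layout_helper(T_OBJECT) is the single
// canonical layout helper value shared by all object-array klasses, so the
// 32-bit compare below distinguishes objArrays from typeArrays (different
// array_tag / log2_element_size) and from non-array klasses (whose layout
// helper is >= _lh_neutral_value).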
3592 juint objArray_lh = Klass::array_layout_helper(T_OBJECT); 3593 __ mov_slow(R8_temp, objArray_lh); 3594 __ cmp_32(R_lh, R8_temp); 3595 __ b(L_objArray,eq); 3596 3597 // if (src->klass() != dst->klass()) return -1; 3598 __ cmp(R5_src_klass, R6_dst_klass); 3599 __ b(L_failed, ne); 3600 3601 // if (!src->is_Array()) return -1; 3602 __ cmp_32(R_lh, Klass::_lh_neutral_value); // < 0 3603 __ b(L_failed, ge); 3604 3605 arraycopy_range_checks(src, src_pos, dst, dst_pos, length, 3606 R8_temp, R6_dst_klass, L_failed); 3607 3608 { 3609 // TypeArrayKlass 3610 // 3611 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 3612 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 3613 // 3614 3615 const Register R6_offset = R6_dst_klass; // array offset 3616 const Register R12_elsize = R12; // log2 element size 3617 3618 __ logical_shift_right(R6_offset, R_lh, Klass::_lh_header_size_shift); 3619 __ andr(R6_offset, R6_offset, (unsigned int)Klass::_lh_header_size_mask); // array_offset 3620 __ add(src, src, R6_offset); // src array offset 3621 __ add(dst, dst, R6_offset); // dst array offset 3622 __ andr(R12_elsize, R_lh, (unsigned int)Klass::_lh_log2_element_size_mask); // log2 element size 3623 3624 // next registers should be set before the jump to corresponding stub 3625 const Register from = R0; // source array address 3626 const Register to = R1; // destination array address 3627 const Register count = R2; // elements count 3628 3629 // 'from', 'to', 'count' registers should be set in this order 3630 // since they are the same as 'src', 'src_pos', 'dst'. 3631 3632 #ifdef AARCH64 3633 3634 BLOCK_COMMENT("choose copy loop based on element size and scale indexes"); 3635 Label Lbyte, Lshort, Lint, Llong; 3636 3637 __ cbz(R12_elsize, Lbyte); 3638 3639 assert (LogBytesPerShort < LogBytesPerInt && LogBytesPerInt < LogBytesPerLong, "must be"); 3640 __ cmp(R12_elsize, LogBytesPerInt); 3641 __ b(Lint, eq); 3642 __ b(Llong, gt); 3643 3644 __ BIND(Lshort); 3645 __ add_ptr_scaled_int32(from, src, src_pos, LogBytesPerShort); 3646 __ add_ptr_scaled_int32(to, dst, dst_pos, LogBytesPerShort); 3647 __ mov(count, length); 3648 __ b(StubRoutines::_jshort_arraycopy); 3649 3650 __ BIND(Lint); 3651 __ add_ptr_scaled_int32(from, src, src_pos, LogBytesPerInt); 3652 __ add_ptr_scaled_int32(to, dst, dst_pos, LogBytesPerInt); 3653 __ mov(count, length); 3654 __ b(StubRoutines::_jint_arraycopy); 3655 3656 __ BIND(Lbyte); 3657 __ add_ptr_scaled_int32(from, src, src_pos, 0); 3658 __ add_ptr_scaled_int32(to, dst, dst_pos, 0); 3659 __ mov(count, length); 3660 __ b(StubRoutines::_jbyte_arraycopy); 3661 3662 __ BIND(Llong); 3663 __ add_ptr_scaled_int32(from, src, src_pos, LogBytesPerLong); 3664 __ add_ptr_scaled_int32(to, dst, dst_pos, LogBytesPerLong); 3665 __ mov(count, length); 3666 __ b(StubRoutines::_jlong_arraycopy); 3667 3668 #else // AARCH64 3669 3670 BLOCK_COMMENT("scale indexes to element size"); 3671 __ add(from, src, AsmOperand(src_pos, lsl, R12_elsize)); // src_addr 3672 __ add(to, dst, AsmOperand(dst_pos, lsl, R12_elsize)); // dst_addr 3673 3674 __ mov(count, length); // length 3675 3676 // XXX optim: avoid later push in arraycopy variants ? 
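// Dispatch sketch for the 32-bit path below (informal): the log2 element
// size selects one of the primitive copy stubs, which is entered with a
// plain branch so that it returns (with its status) directly to our caller:
//   switch (elsize) {
//     case 0:  goto _jbyte_arraycopy;
//     case 1:  goto _jshort_arraycopy;
//     case 2:  goto _jint_arraycopy;
//     default: goto _jlong_arraycopy;
//   }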
3677 3678 __ pop(saved_regs); 3679 3680 BLOCK_COMMENT("choose copy loop based on element size"); 3681 __ cmp(R12_elsize, 0); 3682 __ b(StubRoutines::_jbyte_arraycopy,eq); 3683 3684 __ cmp(R12_elsize, LogBytesPerShort); 3685 __ b(StubRoutines::_jshort_arraycopy,eq); 3686 3687 __ cmp(R12_elsize, LogBytesPerInt); 3688 __ b(StubRoutines::_jint_arraycopy,eq); 3689 3690 __ b(StubRoutines::_jlong_arraycopy); 3691 3692 #endif // AARCH64 3693 } 3694 3695 // ObjArrayKlass 3696 __ BIND(L_objArray); 3697 // live at this point: R5_src_klass, R6_dst_klass, src[_pos], dst[_pos], length 3698 3699 Label L_plain_copy, L_checkcast_copy; 3700 // test array classes for subtyping 3701 __ cmp(R5_src_klass, R6_dst_klass); // usual case is exact equality 3702 __ b(L_checkcast_copy, ne); 3703 3704 BLOCK_COMMENT("Identically typed arrays"); 3705 { 3706 // Identically typed arrays can be copied without element-wise checks. 3707 arraycopy_range_checks(src, src_pos, dst, dst_pos, length, 3708 R8_temp, R_lh, L_failed); 3709 3710 // next registers should be set before the jump to corresponding stub 3711 const Register from = R0; // source array address 3712 const Register to = R1; // destination array address 3713 const Register count = R2; // elements count 3714 3715 __ add(src, src, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //src offset 3716 __ add(dst, dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //dst offset 3717 __ add_ptr_scaled_int32(from, src, src_pos, LogBytesPerHeapOop); // src_addr 3718 __ add_ptr_scaled_int32(to, dst, dst_pos, LogBytesPerHeapOop); // dst_addr 3719 __ BIND(L_plain_copy); 3720 __ mov(count, length); 3721 3722 #ifndef AARCH64 3723 __ pop(saved_regs); // XXX optim: avoid later push in oop_arraycopy ? 3724 #endif // !AARCH64 3725 __ b(StubRoutines::_oop_arraycopy); 3726 } 3727 3728 { 3729 __ BIND(L_checkcast_copy); 3730 // live at this point: R5_src_klass, R6_dst_klass 3731 3732 // Before looking at dst.length, make sure dst is also an objArray. 3733 __ ldr_u32(R8_temp, Address(R6_dst_klass, lh_offset)); 3734 __ cmp_32(R_lh, R8_temp); 3735 __ b(L_failed, ne); 3736 3737 // It is safe to examine both src.length and dst.length. 3738 3739 arraycopy_range_checks(src, src_pos, dst, dst_pos, length, 3740 R8_temp, R_lh, L_failed); 3741 3742 // next registers should be set before the jump to corresponding stub 3743 const Register from = R0; // source array address 3744 const Register to = R1; // destination array address 3745 const Register count = R2; // elements count 3746 3747 // Marshal the base address arguments now, freeing registers. 3748 __ add(src, src, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //src offset 3749 __ add(dst, dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //dst offset 3750 __ add_ptr_scaled_int32(from, src, src_pos, LogBytesPerHeapOop); // src_addr 3751 __ add_ptr_scaled_int32(to, dst, dst_pos, LogBytesPerHeapOop); // dst_addr 3752 3753 __ mov(count, length); // length (reloaded) 3754 3755 Register sco_temp = R3; // this register is free now 3756 assert_different_registers(from, to, count, sco_temp, 3757 R6_dst_klass, R5_src_klass); 3758 3759 // Generate the type check. 3760 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 3761 __ ldr_u32(sco_temp, Address(R6_dst_klass, sco_offset)); 3762 generate_type_check(R5_src_klass, sco_temp, R6_dst_klass, 3763 R8_temp, R9, 3764 AARCH64_ONLY(R10) NOT_AARCH64(R12), 3765 L_plain_copy); 3766 3767 // Fetch destination element klass from the ObjArrayKlass header. 
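// Informal note: checkcast_arraycopy expects two extra arguments beyond
// from/to/count -- ckval, the destination element klass, and ckoff, its
// super_check_offset -- which is what the loads below set up, roughly:
//   ckval = ((ObjArrayKlass*)dst_klass)->element_klass();
//   ckoff = ckval->super_check_offset();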
3768 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 3769 3770 // the checkcast_copy loop needs two extra arguments: 3771 const Register Rdst_elem_klass = AARCH64_ONLY(R4) NOT_AARCH64(R3); 3772 __ ldr(Rdst_elem_klass, Address(R6_dst_klass, ek_offset)); // dest elem klass 3773 #ifndef AARCH64 3774 __ pop(saved_regs); // XXX optim: avoid later push in oop_arraycopy ? 3775 __ str(Rdst_elem_klass, Address(SP,0)); // dest elem klass argument 3776 #endif // !AARCH64 3777 __ ldr_u32(R3, Address(Rdst_elem_klass, sco_offset)); // sco of elem klass 3778 __ b(StubRoutines::_checkcast_arraycopy); 3779 } 3780 3781 __ BIND(L_failed); 3782 3783 #ifndef AARCH64 3784 __ pop(saved_regs); 3785 #endif // !AARCH64 3786 __ mvn(R0, 0); // failure, with 0 copied 3787 __ ret(); 3788 3789 return start; 3790 } 3791 3792 // Safefetch stubs. 3793 void generate_safefetch(const char* name, int size, address* entry, address* fault_pc, address* continuation_pc) { 3794 // safefetch signatures: 3795 // int SafeFetch32(int* adr, int errValue); 3796 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue); 3797 // 3798 // arguments: 3799 // R0 = adr 3800 // R1 = errValue 3801 // 3802 // result: 3803 // R0 = *adr or errValue 3804 3805 StubCodeMark mark(this, "StubRoutines", name); 3806 3807 // Entry point, pc or function descriptor. 3808 *entry = __ pc(); 3809 3810 // Load *adr into c_rarg2, may fault. 3811 *fault_pc = __ pc(); 3812 3813 switch (size) { 3814 case 4: // int32_t 3815 __ ldr_s32(R1, Address(R0)); 3816 break; 3817 3818 case 8: // int64_t 3819 #ifdef AARCH64 3820 __ ldr(R1, Address(R0)); 3821 #else 3822 Unimplemented(); 3823 #endif // AARCH64 3824 break; 3825 3826 default: 3827 ShouldNotReachHere(); 3828 } 3829 3830 // return errValue or *adr 3831 *continuation_pc = __ pc(); 3832 __ mov(R0, R1); 3833 __ ret(); 3834 } 3835 3836 void generate_arraycopy_stubs() { 3837 3838 // Note: the disjoint stubs must be generated first, some of 3839 // the conjoint stubs use them. 3840 3841 bool status = false; // non failing C2 stubs need not return a status in R0 3842 3843 #ifdef TEST_C2_GENERIC_ARRAYCOPY /* Internal development flag */ 3844 // With this flag, the C2 stubs are tested by generating calls to 3845 // generic_arraycopy instead of Runtime1::arraycopy 3846 3847 // Runtime1::arraycopy return a status in R0 (0 if OK, else ~copied) 3848 // and the result is tested to see whether the arraycopy stub should 3849 // be called. 3850 3851 // When we test arraycopy this way, we must generate extra code in the 3852 // arraycopy methods callable from C2 generic_arraycopy to set the 3853 // status to 0 for those who always succeed (calling the slow path stub might 3854 // lead to errors since the copy has already been performed). 
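// (Informal summary: stubs generated with 'status' == true follow the
// Runtime1::arraycopy convention above -- R0 == 0 on full success, else
// R0 == ~copied -- whereas 'status' == false stubs may leave R0 undefined,
// which is fine for C2 callers that never inspect it.)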
3855 3856 status = true; // generate a status compatible with C1 calls 3857 #endif 3858 3859 // these need always status in case they are called from generic_arraycopy 3860 StubRoutines::_jbyte_disjoint_arraycopy = generate_primitive_copy(false, "jbyte_disjoint_arraycopy", true, 1, true); 3861 StubRoutines::_jshort_disjoint_arraycopy = generate_primitive_copy(false, "jshort_disjoint_arraycopy", true, 2, true); 3862 StubRoutines::_jint_disjoint_arraycopy = generate_primitive_copy(false, "jint_disjoint_arraycopy", true, 4, true); 3863 StubRoutines::_jlong_disjoint_arraycopy = generate_primitive_copy(false, "jlong_disjoint_arraycopy", true, 8, true); 3864 StubRoutines::_oop_disjoint_arraycopy = generate_oop_copy (false, "oop_disjoint_arraycopy", true, true); 3865 3866 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_primitive_copy(true, "arrayof_jbyte_disjoint_arraycopy", status, 1, true); 3867 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_primitive_copy(true, "arrayof_jshort_disjoint_arraycopy",status, 2, true); 3868 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_primitive_copy(true, "arrayof_jint_disjoint_arraycopy", status, 4, true); 3869 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_primitive_copy(true, "arrayof_jlong_disjoint_arraycopy", status, 8, true); 3870 StubRoutines::_arrayof_oop_disjoint_arraycopy = generate_oop_copy (true, "arrayof_oop_disjoint_arraycopy", status, true); 3871 3872 // these need always status in case they are called from generic_arraycopy 3873 StubRoutines::_jbyte_arraycopy = generate_primitive_copy(false, "jbyte_arraycopy", true, 1, false, StubRoutines::_jbyte_disjoint_arraycopy); 3874 StubRoutines::_jshort_arraycopy = generate_primitive_copy(false, "jshort_arraycopy", true, 2, false, StubRoutines::_jshort_disjoint_arraycopy); 3875 StubRoutines::_jint_arraycopy = generate_primitive_copy(false, "jint_arraycopy", true, 4, false, StubRoutines::_jint_disjoint_arraycopy); 3876 StubRoutines::_jlong_arraycopy = generate_primitive_copy(false, "jlong_arraycopy", true, 8, false, StubRoutines::_jlong_disjoint_arraycopy); 3877 StubRoutines::_oop_arraycopy = generate_oop_copy (false, "oop_arraycopy", true, false, StubRoutines::_oop_disjoint_arraycopy); 3878 3879 StubRoutines::_arrayof_jbyte_arraycopy = generate_primitive_copy(true, "arrayof_jbyte_arraycopy", status, 1, false, StubRoutines::_arrayof_jbyte_disjoint_arraycopy); 3880 StubRoutines::_arrayof_jshort_arraycopy = generate_primitive_copy(true, "arrayof_jshort_arraycopy", status, 2, false, StubRoutines::_arrayof_jshort_disjoint_arraycopy); 3881 #ifdef _LP64 3882 // since sizeof(jint) < sizeof(HeapWord), there's a different flavor: 3883 StubRoutines::_arrayof_jint_arraycopy = generate_primitive_copy(true, "arrayof_jint_arraycopy", status, 4, false, StubRoutines::_arrayof_jint_disjoint_arraycopy); 3884 #else 3885 StubRoutines::_arrayof_jint_arraycopy = StubRoutines::_jint_arraycopy; 3886 #endif 3887 if (BytesPerHeapOop < HeapWordSize) { 3888 StubRoutines::_arrayof_oop_arraycopy = generate_oop_copy (true, "arrayof_oop_arraycopy", status, false, StubRoutines::_arrayof_oop_disjoint_arraycopy); 3889 } else { 3890 StubRoutines::_arrayof_oop_arraycopy = StubRoutines::_oop_arraycopy; 3891 } 3892 StubRoutines::_arrayof_jlong_arraycopy = StubRoutines::_jlong_arraycopy; 3893 3894 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy"); 3895 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy"); 3896 StubRoutines::_generic_arraycopy = 
generate_generic_copy("generic_arraycopy"); 3897 3898 3899 } 3900 3901 #ifndef AARCH64 3902 #define COMPILE_CRYPTO 3903 #include "stubRoutinesCrypto_arm.cpp" 3904 #else 3905 3906 #ifdef COMPILER2 3907 // Arguments: 3908 // 3909 // Inputs: 3910 // c_rarg0 - source byte array address 3911 // c_rarg1 - destination byte array address 3912 // c_rarg2 - K (key) in little endian int array 3913 // 3914 address generate_aescrypt_encryptBlock() { 3915 __ align(CodeEntryAlignment); 3916 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 3917 3918 Label L_doLast; 3919 3920 const Register from = c_rarg0; // source array address 3921 const Register to = c_rarg1; // destination array address 3922 const Register key = c_rarg2; // key array address 3923 const Register keylen = R8; 3924 3925 address start = __ pc(); 3926 __ stp(FP, LR, Address(SP, -2 * wordSize, pre_indexed)); 3927 __ mov(FP, SP); 3928 3929 __ ldr_w(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3930 3931 __ vld1(V0, Address(from), MacroAssembler::VELEM_SIZE_8, 128); // get 16 bytes of input 3932 3933 __ vld1(V1, V2, V3, V4, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); 3934 3935 int quad = 1; 3936 __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad); 3937 __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad); 3938 __ rev32(V3, V3, MacroAssembler::VELEM_SIZE_8, quad); 3939 __ rev32(V4, V4, MacroAssembler::VELEM_SIZE_8, quad); 3940 __ aese(V0, V1); 3941 __ aesmc(V0, V0); 3942 __ aese(V0, V2); 3943 __ aesmc(V0, V0); 3944 __ aese(V0, V3); 3945 __ aesmc(V0, V0); 3946 __ aese(V0, V4); 3947 __ aesmc(V0, V0); 3948 3949 __ vld1(V1, V2, V3, V4, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); 3950 __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad); 3951 __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad); 3952 __ rev32(V3, V3, MacroAssembler::VELEM_SIZE_8, quad); 3953 __ rev32(V4, V4, MacroAssembler::VELEM_SIZE_8, quad); 3954 __ aese(V0, V1); 3955 __ aesmc(V0, V0); 3956 __ aese(V0, V2); 3957 __ aesmc(V0, V0); 3958 __ aese(V0, V3); 3959 __ aesmc(V0, V0); 3960 __ aese(V0, V4); 3961 __ aesmc(V0, V0); 3962 3963 __ vld1(V1, V2, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); 3964 __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad); 3965 __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad); 3966 3967 __ cmp_w(keylen, 44); 3968 __ b(L_doLast, eq); 3969 3970 __ aese(V0, V1); 3971 __ aesmc(V0, V0); 3972 __ aese(V0, V2); 3973 __ aesmc(V0, V0); 3974 3975 __ vld1(V1, V2, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); 3976 __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad); 3977 __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad); 3978 3979 __ cmp_w(keylen, 52); 3980 __ b(L_doLast, eq); 3981 3982 __ aese(V0, V1); 3983 __ aesmc(V0, V0); 3984 __ aese(V0, V2); 3985 __ aesmc(V0, V0); 3986 3987 __ vld1(V1, V2, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); 3988 __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad); 3989 __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad); 3990 3991 __ BIND(L_doLast); 3992 3993 __ aese(V0, V1); 3994 __ aesmc(V0, V0); 3995 __ aese(V0, V2); 3996 3997 __ vld1(V1, Address(key), MacroAssembler::VELEM_SIZE_8, 128); 3998 __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad); 3999 __ eor(V0, V0, V1, MacroAssembler::VELEM_SIZE_8, quad); 4000 4001 __ vst1(V0, Address(to), MacroAssembler::VELEM_SIZE_8, 128); 4002 4003 __ mov(R0, 0); 4004 4005 __ mov(SP, FP); 4006 __ 
ldp(FP, LR, Address(SP, 2 * wordSize, post_indexed)); 4007 __ ret(LR); 4008 4009 return start; 4010 } 4011 4012 // Arguments: 4013 // 4014 // Inputs: 4015 // c_rarg0 - source byte array address 4016 // c_rarg1 - destination byte array address 4017 // c_rarg2 - K (key) in little endian int array 4018 // 4019 address generate_aescrypt_decryptBlock() { 4020 assert(UseAES, "need AES instructions and misaligned SSE support"); 4021 __ align(CodeEntryAlignment); 4022 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 4023 Label L_doLast; 4024 4025 const Register from = c_rarg0; // source array address 4026 const Register to = c_rarg1; // destination array address 4027 const Register key = c_rarg2; // key array address 4028 const Register keylen = R8; 4029 4030 address start = __ pc(); 4031 __ stp(FP, LR, Address(SP, -2 * wordSize, pre_indexed)); 4032 __ mov(FP, SP); 4033 4034 __ ldr_w(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 4035 4036 __ vld1(V0, Address(from), MacroAssembler::VELEM_SIZE_8, 128); // get 16 bytes of input 4037 4038 __ vld1(V5, Address(key, 16, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); 4039 4040 int quad = 1; 4041 __ rev32(V5, V5, MacroAssembler::VELEM_SIZE_8, quad); 4042 4043 __ vld1(V1, V2, V3, V4, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); 4044 __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad); 4045 __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad); 4046 __ rev32(V3, V3, MacroAssembler::VELEM_SIZE_8, quad); 4047 __ rev32(V4, V4, MacroAssembler::VELEM_SIZE_8, quad); 4048 __ aesd(V0, V1); 4049 __ aesimc(V0, V0); 4050 __ aesd(V0, V2); 4051 __ aesimc(V0, V0); 4052 __ aesd(V0, V3); 4053 __ aesimc(V0, V0); 4054 __ aesd(V0, V4); 4055 __ aesimc(V0, V0); 4056 4057 __ vld1(V1, V2, V3, V4, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); 4058 __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad); 4059 __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad); 4060 __ rev32(V3, V3, MacroAssembler::VELEM_SIZE_8, quad); 4061 __ rev32(V4, V4, MacroAssembler::VELEM_SIZE_8, quad); 4062 __ aesd(V0, V1); 4063 __ aesimc(V0, V0); 4064 __ aesd(V0, V2); 4065 __ aesimc(V0, V0); 4066 __ aesd(V0, V3); 4067 __ aesimc(V0, V0); 4068 __ aesd(V0, V4); 4069 __ aesimc(V0, V0); 4070 4071 __ vld1(V1, V2, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); 4072 __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad); 4073 __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad); 4074 4075 __ cmp_w(keylen, 44); 4076 __ b(L_doLast, eq); 4077 4078 __ aesd(V0, V1); 4079 __ aesimc(V0, V0); 4080 __ aesd(V0, V2); 4081 __ aesimc(V0, V0); 4082 4083 __ vld1(V1, V2, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); 4084 __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad); 4085 __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad); 4086 4087 __ cmp_w(keylen, 52); 4088 __ b(L_doLast, eq); 4089 4090 __ aesd(V0, V1); 4091 __ aesimc(V0, V0); 4092 __ aesd(V0, V2); 4093 __ aesimc(V0, V0); 4094 4095 __ vld1(V1, V2, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); 4096 __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad); 4097 __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad); 4098 4099 __ BIND(L_doLast); 4100 4101 __ aesd(V0, V1); 4102 __ aesimc(V0, V0); 4103 __ aesd(V0, V2); 4104 4105 __ eor(V0, V0, V5, MacroAssembler::VELEM_SIZE_8, quad); 4106 4107 __ vst1(V0, Address(to), MacroAssembler::VELEM_SIZE_8, 128); 4108 4109 __ mov(R0, 0); 4110 4111 __ 
mov(SP, FP); 4112 __ ldp(FP, LR, Address(SP, 2 * wordSize, post_indexed)); 4113 __ ret(LR); 4114 4115 4116 return start; 4117 } 4118 4119 // Arguments: 4120 // 4121 // Inputs: 4122 // c_rarg0 - source byte array address 4123 // c_rarg1 - destination byte array address 4124 // c_rarg2 - K (key) in little endian int array 4125 // c_rarg3 - r vector byte array address 4126 // c_rarg4 - input length 4127 // 4128 // Output: 4129 // x0 - input length 4130 // 4131 address generate_cipherBlockChaining_encryptAESCrypt() { 4132 assert(UseAES, "need AES instructions and misaligned SSE support"); 4133 __ align(CodeEntryAlignment); 4134 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 4135 4136 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 4137 4138 const Register from = c_rarg0; // source array address 4139 const Register to = c_rarg1; // destination array address 4140 const Register key = c_rarg2; // key array address 4141 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 4142 // and left with the results of the last encryption block 4143 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 4144 const Register keylen = R8; 4145 4146 address start = __ pc(); 4147 __ stp(FP, LR, Address(SP, -2 * wordSize, pre_indexed)); 4148 __ mov(FP, SP); 4149 4150 __ mov(R9, len_reg); 4151 __ ldr_w(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 4152 4153 __ vld1(V0, Address(rvec), MacroAssembler::VELEM_SIZE_8, 128); 4154 4155 __ cmp_w(keylen, 52); 4156 __ b(L_loadkeys_44, cc); 4157 __ b(L_loadkeys_52, eq); 4158 4159 __ vld1(V17, V18, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); 4160 4161 int quad = 1; 4162 __ rev32(V17, V17, MacroAssembler::VELEM_SIZE_8, quad); 4163 __ rev32(V18, V18, MacroAssembler::VELEM_SIZE_8, quad); 4164 __ BIND(L_loadkeys_52); 4165 __ vld1(V19, V20, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); 4166 __ rev32(V19, V19, MacroAssembler::VELEM_SIZE_8, quad); 4167 __ rev32(V20, V20, MacroAssembler::VELEM_SIZE_8, quad); 4168 __ BIND(L_loadkeys_44); 4169 __ vld1(V21, V22, V23, V24, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); 4170 __ rev32(V21, V21, MacroAssembler::VELEM_SIZE_8, quad); 4171 __ rev32(V22, V22, MacroAssembler::VELEM_SIZE_8, quad); 4172 __ rev32(V23, V23, MacroAssembler::VELEM_SIZE_8, quad); 4173 __ rev32(V24, V24, MacroAssembler::VELEM_SIZE_8, quad); 4174 __ vld1(V25, V26, V27, V28, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); 4175 __ rev32(V25, V25, MacroAssembler::VELEM_SIZE_8, quad); 4176 __ rev32(V26, V26, MacroAssembler::VELEM_SIZE_8, quad); 4177 __ rev32(V27, V27, MacroAssembler::VELEM_SIZE_8, quad); 4178 __ rev32(V28, V28, MacroAssembler::VELEM_SIZE_8, quad); 4179 __ vld1(V29, V30, V31, Address(key), MacroAssembler::VELEM_SIZE_8, 128); 4180 __ rev32(V29, V29, MacroAssembler::VELEM_SIZE_8, quad); 4181 __ rev32(V30, V30, MacroAssembler::VELEM_SIZE_8, quad); 4182 __ rev32(V31, V31, MacroAssembler::VELEM_SIZE_8, quad); 4183 4184 __ BIND(L_aes_loop); 4185 __ vld1(V1, Address(from, 16, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); 4186 __ eor(V0, V0, V1, MacroAssembler::VELEM_SIZE_8, quad); 4187 4188 __ b(L_rounds_44, cc); 4189 __ b(L_rounds_52, eq); 4190 4191 __ aese(V0, V17); 4192 __ aesmc(V0, V0); 4193 __ aese(V0, V18); 4194 __ aesmc(V0, V0); 4195 __ BIND(L_rounds_52); 4196 __ aese(V0, V19); 4197 __ 
aesmc(V0, V0); 4198 __ aese(V0, V20); 4199 __ aesmc(V0, V0); 4200 __ BIND(L_rounds_44); 4201 __ aese(V0, V21); 4202 __ aesmc(V0, V0); 4203 __ aese(V0, V22); 4204 __ aesmc(V0, V0); 4205 __ aese(V0, V23); 4206 __ aesmc(V0, V0); 4207 __ aese(V0, V24); 4208 __ aesmc(V0, V0); 4209 __ aese(V0, V25); 4210 __ aesmc(V0, V0); 4211 __ aese(V0, V26); 4212 __ aesmc(V0, V0); 4213 __ aese(V0, V27); 4214 __ aesmc(V0, V0); 4215 __ aese(V0, V28); 4216 __ aesmc(V0, V0); 4217 __ aese(V0, V29); 4218 __ aesmc(V0, V0); 4219 __ aese(V0, V30); 4220 __ eor(V0, V0, V31, MacroAssembler::VELEM_SIZE_8, quad); 4221 4222 __ vst1(V0, Address(to, 16, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); 4223 __ sub(len_reg, len_reg, 16); 4224 __ cbnz(len_reg, L_aes_loop); 4225 4226 __ vst1(V0, Address(rvec), MacroAssembler::VELEM_SIZE_8, 128); 4227 4228 __ mov(R0, R9); 4229 4230 __ mov(SP, FP); 4231 __ ldp(FP, LR, Address(SP, 2 * wordSize, post_indexed)); 4232 __ ret(LR); 4233 4234 return start; 4235 } 4236 4237 // Arguments: 4238 // 4239 // Inputs: 4240 // c_rarg0 - source byte array address 4241 // c_rarg1 - destination byte array address 4242 // c_rarg2 - K (key) in little endian int array 4243 // c_rarg3 - r vector byte array address 4244 // c_rarg4 - input length 4245 // 4246 // Output: 4247 // R0 - input length 4248 // 4249 address generate_cipherBlockChaining_decryptAESCrypt() { 4250 assert(UseAES, "need AES instructions and misaligned SSE support"); 4251 __ align(CodeEntryAlignment); 4252 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 4253 4254 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 4255 4256 const Register from = c_rarg0; // source array address 4257 const Register to = c_rarg1; // destination array address 4258 const Register key = c_rarg2; // key array address 4259 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 4260 // and left with the results of the last encryption block 4261 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 4262 const Register keylen = R8; 4263 4264 address start = __ pc(); 4265 __ stp(FP, LR, Address(SP, -2 * wordSize, pre_indexed)); 4266 __ mov(FP, SP); 4267 4268 __ mov(R9, len_reg); 4269 __ ldr_w(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 4270 4271 __ vld1(V2, Address(rvec), MacroAssembler::VELEM_SIZE_8, 128); 4272 4273 __ vld1(V31, Address(key, 16, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); 4274 4275 int quad = 1; 4276 __ rev32(V31, V31, MacroAssembler::VELEM_SIZE_8, quad); 4277 4278 __ cmp_w(keylen, 52); 4279 __ b(L_loadkeys_44, cc); 4280 __ b(L_loadkeys_52, eq); 4281 4282 __ vld1(V17, V18, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); 4283 __ rev32(V17, V17, MacroAssembler::VELEM_SIZE_8, quad); 4284 __ rev32(V18, V18, MacroAssembler::VELEM_SIZE_8, quad); 4285 __ BIND(L_loadkeys_52); 4286 __ vld1(V19, V20, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); 4287 __ rev32(V19, V19, MacroAssembler::VELEM_SIZE_8, quad); 4288 __ rev32(V20, V20, MacroAssembler::VELEM_SIZE_8, quad); 4289 __ BIND(L_loadkeys_44); 4290 __ vld1(V21, V22, V23, V24, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); 4291 __ rev32(V21, V21, MacroAssembler::VELEM_SIZE_8, quad); 4292 __ rev32(V22, V22, MacroAssembler::VELEM_SIZE_8, quad); 4293 __ rev32(V23, V23, MacroAssembler::VELEM_SIZE_8, quad); 4294 __ rev32(V24, V24, MacroAssembler::VELEM_SIZE_8, quad);
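// Informal reminder: 'keylen' is the length of the expanded key array in
// 32-bit words -- 44, 52 or 60 for AES-128, AES-192 and AES-256 -- so the
// cmp/cc/eq branches above skip loading the round keys that shorter keys
// do not have.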
4295 __ vld1(V25, V26, V27, V28, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); 4296 __ rev32(V25, V25, MacroAssembler::VELEM_SIZE_8, quad); 4297 __ rev32(V26, V26, MacroAssembler::VELEM_SIZE_8, quad); 4298 __ rev32(V27, V27, MacroAssembler::VELEM_SIZE_8, quad); 4299 __ rev32(V28, V28, MacroAssembler::VELEM_SIZE_8, quad); 4300 __ vld1(V29, V30, Address(key), MacroAssembler::VELEM_SIZE_8, 128); 4301 __ rev32(V29, V29, MacroAssembler::VELEM_SIZE_8, quad); 4302 __ rev32(V30, V30, MacroAssembler::VELEM_SIZE_8, quad); 4303 4304 __ BIND(L_aes_loop); 4305 __ vld1(V0, Address(from, 16, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); 4306 __ orr(V1, V0, V0, MacroAssembler::VELEM_SIZE_8, quad); 4307 4308 __ b(L_rounds_44, cc); 4309 __ b(L_rounds_52, eq); 4310 4311 __ aesd(V0, V17); 4312 __ aesimc(V0, V0); 4313 __ aesd(V0, V18); 4314 __ aesimc(V0, V0); 4315 __ BIND(L_rounds_52); 4316 __ aesd(V0, V19); 4317 __ aesimc(V0, V0); 4318 __ aesd(V0, V20); 4319 __ aesimc(V0, V0); 4320 __ BIND(L_rounds_44); 4321 __ aesd(V0, V21); 4322 __ aesimc(V0, V0); 4323 __ aesd(V0, V22); 4324 __ aesimc(V0, V0); 4325 __ aesd(V0, V23); 4326 __ aesimc(V0, V0); 4327 __ aesd(V0, V24); 4328 __ aesimc(V0, V0); 4329 __ aesd(V0, V25); 4330 __ aesimc(V0, V0); 4331 __ aesd(V0, V26); 4332 __ aesimc(V0, V0); 4333 __ aesd(V0, V27); 4334 __ aesimc(V0, V0); 4335 __ aesd(V0, V28); 4336 __ aesimc(V0, V0); 4337 __ aesd(V0, V29); 4338 __ aesimc(V0, V0); 4339 __ aesd(V0, V30); 4340 __ eor(V0, V0, V31, MacroAssembler::VELEM_SIZE_8, quad); 4341 __ eor(V0, V0, V2, MacroAssembler::VELEM_SIZE_8, quad); 4342 4343 __ vst1(V0, Address(to, 16, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); 4344 __ orr(V2, V1, V1, MacroAssembler::VELEM_SIZE_8, quad); 4345 4346 __ sub(len_reg, len_reg, 16); 4347 __ cbnz(len_reg, L_aes_loop); 4348 4349 __ vst1(V2, Address(rvec), MacroAssembler::VELEM_SIZE_8, 128); 4350 4351 __ mov(R0, R9); 4352 4353 __ mov(SP, FP); 4354 __ ldp(FP, LR, Address(SP, 2 * wordSize, post_indexed)); 4355 __ ret(LR); 4356 4357 return start; 4358 } 4359 4360 #endif // COMPILER2 4361 #endif // AARCH64 4362 4363 private: 4364 4365 #undef __ 4366 #define __ masm-> 4367 4368 //------------------------------------------------------------------------------------------------------------------------ 4369 // Continuation point for throwing of implicit exceptions that are not handled in 4370 // the current activation. Fabricates an exception oop and initiates normal 4371 // exception dispatching in this frame.
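//
// Rough shape of the generated code (informal sketch; the body below is
// authoritative):
//   Rexception_pc = LR;                      // PC of the faulting site
//   push(FP, LR);                            // minimal frame, covered by an OopMap
//   set_last_Java_frame(...);
//   runtime_entry(Rthread /*, R1, R2 */);    // allocates and posts the exception
//   reset_last_Java_frame(...);
//   pop(FP, LR);
//   jump StubRoutines::forward_exception_entry();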
4372 address generate_throw_exception(const char* name, address runtime_entry) { 4373 int insts_size = 128; 4374 int locs_size = 32; 4375 CodeBuffer code(name, insts_size, locs_size); 4376 OopMapSet* oop_maps; 4377 int frame_size; 4378 int frame_complete; 4379 4380 oop_maps = new OopMapSet(); 4381 MacroAssembler* masm = new MacroAssembler(&code); 4382 4383 address start = __ pc(); 4384 4385 frame_size = 2; 4386 __ mov(Rexception_pc, LR); 4387 __ raw_push(FP, LR); 4388 4389 frame_complete = __ pc() - start; 4390 4391 // Any extra arguments are already supposed to be R1 and R2 4392 __ mov(R0, Rthread); 4393 4394 int pc_offset = __ set_last_Java_frame(SP, FP, false, Rtemp); 4395 assert(((__ pc()) - start) == __ offset(), "warning: start differs from code_begin"); 4396 __ call(runtime_entry); 4397 if (pc_offset == -1) { 4398 pc_offset = __ offset(); 4399 } 4400 4401 // Generate oop map 4402 OopMap* map = new OopMap(frame_size*VMRegImpl::slots_per_word, 0); 4403 oop_maps->add_gc_map(pc_offset, map); 4404 __ reset_last_Java_frame(Rtemp); // Rtemp free since scratched by far call 4405 4406 __ raw_pop(FP, LR); 4407 __ jump(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type, Rtemp); 4408 4409 RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, 4410 frame_size, oop_maps, false); 4411 return stub->entry_point(); 4412 } 4413 4414 //--------------------------------------------------------------------------- 4415 // Initialization 4416 4417 void generate_initial() { 4418 // Generates all stubs and initializes the entry points 4419 4420 //------------------------------------------------------------------------------------------------------------------------ 4421 // entry points that exist in all platforms 4422 // Note: This is code that could be shared among different platforms - however the benefit seems to be smaller than 4423 // the disadvantage of having a much more complicated generator structure. See also comment in stubRoutines.hpp. 4424 StubRoutines::_forward_exception_entry = generate_forward_exception(); 4425 4426 StubRoutines::_call_stub_entry = 4427 generate_call_stub(StubRoutines::_call_stub_return_address); 4428 // is referenced by megamorphic call 4429 StubRoutines::_catch_exception_entry = generate_catch_exception(); 4430 4431 // stub for throwing stack overflow error used both by interpreter and compiler 4432 StubRoutines::_throw_StackOverflowError_entry = generate_throw_exception("StackOverflowError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError)); 4433 4434 #ifndef AARCH64 4435 // integer division used both by interpreter and compiler 4436 StubRoutines::Arm::_idiv_irem_entry = generate_idiv_irem(); 4437 4438 StubRoutines::_atomic_add_entry = generate_atomic_add(); 4439 StubRoutines::_atomic_xchg_entry = generate_atomic_xchg(); 4440 StubRoutines::_atomic_cmpxchg_entry = generate_atomic_cmpxchg(); 4441 StubRoutines::_atomic_cmpxchg_long_entry = generate_atomic_cmpxchg_long(); 4442 StubRoutines::_atomic_load_long_entry = generate_atomic_load_long(); 4443 StubRoutines::_atomic_store_long_entry = generate_atomic_store_long(); 4444 #endif // !AARCH64 4445 } 4446 4447 void generate_all() { 4448 // Generates all stubs and initializes the entry points 4449 4450 #ifdef COMPILER2 4451 // Generate partial_subtype_check first here since its code depends on 4452 // UseZeroBaseCompressedOops which is defined after heap initialization. 
4453 StubRoutines::Arm::_partial_subtype_check = generate_partial_subtype_check(); 4454 #endif 4455 // These entry points require SharedInfo::stack0 to be set up in non-core builds 4456 // and need to be relocatable, so they each fabricate a RuntimeStub internally. 4457 StubRoutines::_throw_AbstractMethodError_entry = generate_throw_exception("AbstractMethodError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError)); 4458 StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError)); 4459 StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call)); 4460 4461 //------------------------------------------------------------------------------------------------------------------------ 4462 // entry points that are platform specific 4463 4464 // support for verify_oop (must happen after universe_init) 4465 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); 4466 4467 // arraycopy stubs used by compilers 4468 generate_arraycopy_stubs(); 4469 4470 // Safefetch stubs. 4471 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry, 4472 &StubRoutines::_safefetch32_fault_pc, 4473 &StubRoutines::_safefetch32_continuation_pc); 4474 #ifdef AARCH64 4475 generate_safefetch("SafeFetchN", wordSize, &StubRoutines::_safefetchN_entry, 4476 &StubRoutines::_safefetchN_fault_pc, 4477 &StubRoutines::_safefetchN_continuation_pc); 4478 #ifdef COMPILER2 4479 if (UseAESIntrinsics) { 4480 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 4481 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 4482 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 4483 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 4484 } 4485 #endif 4486 #else 4487 assert (sizeof(int) == wordSize, "32-bit architecture"); 4488 StubRoutines::_safefetchN_entry = StubRoutines::_safefetch32_entry; 4489 StubRoutines::_safefetchN_fault_pc = StubRoutines::_safefetch32_fault_pc; 4490 StubRoutines::_safefetchN_continuation_pc = StubRoutines::_safefetch32_continuation_pc; 4491 #endif // AARCH64 4492 4493 #ifdef COMPILE_CRYPTO 4494 // generate AES intrinsics code 4495 if (UseAESIntrinsics) { 4496 aes_init(); 4497 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 4498 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 4499 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 4500 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 4501 } 4502 #endif // COMPILE_CRYPTO 4503 } 4504 4505 4506 public: 4507 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) { 4508 if (all) { 4509 generate_all(); 4510 } else { 4511 generate_initial(); 4512 } 4513 } 4514 }; // end class declaration 4515 4516 void StubGenerator_generate(CodeBuffer* code, bool all) { 4517 StubGenerator g(code, all); 4518 }