/*
 * Copyright (c) 2008, 2016, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "assembler_arm.inline.hpp"
#include "code/codeCacheExtensions.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_arm.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#define __ _masm->

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// -------------------------------------------------------------------------------------------------------------------------
// Stub Code definitions

// Platform dependent parameters for array copy stubs

// Note: we have noticed a huge change in behavior on a microbenchmark
// from platform to platform depending on the configuration.

// Instead of adding a series of command line options (which
// unfortunately have to be done in the shared file and cannot appear
// only in the ARM port), the tested results are hard-coded here in a set
// of options, selected by specifying 'ArmCopyPlatform'.

// Currently, this 'platform' is hardcoded to a value that is a good
// enough trade-off. However, one can easily modify this file to test
// the hard-coded configurations or create new ones. If the gain is
// significant, we could decide to either add command line options or
// add code to automatically choose a configuration.
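// For illustration (a sketch, not part of the original sources): a configuration is
// consumed by the copy-loop generators further down in this file roughly as
//   arraycopy_loop_config* config = &arraycopy_configurations[ArmCopyPlatform].forward_aligned;
//   int pld_offset = config->pld_distance;  // < 0: prefetch before, > 0: prefetch after, 0: none
// so switching 'ArmCopyPlatform' below selects a whole set of loop tunings at once.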
// see comments below for the various configurations created
#define DEFAULT_ARRAYCOPY_CONFIG 0
#define TEGRA2_ARRAYCOPY_CONFIG 1
#define IMX515_ARRAYCOPY_CONFIG 2

// Hard coded choices (XXX: could be changed to a command line option)
#define ArmCopyPlatform DEFAULT_ARRAYCOPY_CONFIG

#ifdef AARCH64
#define ArmCopyCacheLineSize 64
#else
#define ArmCopyCacheLineSize 32 // not worth optimizing to 64 according to measured gains
#endif // AARCH64

// TODO-AARCH64: tune and revise AArch64 arraycopy optimizations

// configuration for each kind of loop
typedef struct {
  int pld_distance;       // prefetch distance (0 => no prefetch, <0: prefetch_before);
#ifndef AARCH64
  bool split_ldm;         // if true, split each LDM into LDMs with fewer registers
  bool split_stm;         // if true, split each STM into STMs with fewer registers
#endif // !AARCH64
} arraycopy_loop_config;

// configuration for all loops
typedef struct {
  // const char *description;
  arraycopy_loop_config forward_aligned;
  arraycopy_loop_config backward_aligned;
  arraycopy_loop_config forward_shifted;
  arraycopy_loop_config backward_shifted;
} arraycopy_platform_config;

// configured platforms
static arraycopy_platform_config arraycopy_configurations[] = {
  // configuration parameters for arraycopy loops
#ifdef AARCH64
  {
    {-256 },   // forward aligned
    {-128 },   // backward aligned
    {-256 },   // forward shifted
    {-128 }    // backward shifted
  }
#else

  // Configurations were chosen based on manual analysis of benchmark
  // results, minimizing overhead with respect to best results on the
  // different test cases.

  // Prefetch before is always favored since it avoids dirtying the
  // cache uselessly for small copies. Code for prefetch after has
  // been kept in case the difference is significant for some
  // platforms but we might consider dropping it.

  // distance, ldm, stm
  {
    // default: tradeoff tegra2/imx515/nv-tegra2,
    // Notes on benchmarking:
    // - not far from optimal configuration on nv-tegra2
    // - within 5% of optimal configuration except for backward aligned on IMX
    // - up to 40% from optimal configuration for backward shifted and backward aligned for tegra2,
    //   but still on par with the operating system copy
    {-256, true,  true  }, // forward aligned
    {-256, true,  true  }, // backward aligned
    {-256, false, false }, // forward shifted
    {-256, true,  true  }  // backward shifted
  },
  {
    // configuration tuned on tegra2-4.
    // Warning: should not be used on nv-tegra2 !
    // Notes:
    // - prefetch after gives 40% gain on backward copies on tegra2-4,
    //   resulting in better numbers than the operating system
    //   copy. However, this can lead to a 300% loss on nv-tegra and has
    //   more impact on the cache (fetches further than what is
    //   copied). Use this configuration with care, in case it improves
    //   reference benchmarks.
156 {-256, true, true }, // forward aligned 157 {96, false, false }, // backward aligned 158 {-256, false, false }, // forward shifted 159 {96, false, false } // backward shifted 160 }, 161 { 162 // configuration tuned on imx515 163 // Notes: 164 // - smaller prefetch distance is sufficient to get good result and might be more stable 165 // - refined backward aligned options within 5% of optimal configuration except for 166 // tests were the arrays fit in the cache 167 {-160, false, false }, // forward aligned 168 {-160, false, false }, // backward aligned 169 {-160, false, false }, // forward shifted 170 {-160, true, true } // backward shifted 171 } 172 #endif // AARCH64 173 }; 174 175 class StubGenerator: public StubCodeGenerator { 176 177 #ifdef PRODUCT 178 #define inc_counter_np(a,b,c) ((void)0) 179 #else 180 #define inc_counter_np(counter, t1, t2) \ 181 BLOCK_COMMENT("inc_counter " #counter); \ 182 __ inc_counter(&counter, t1, t2); 183 #endif 184 185 private: 186 187 address generate_call_stub(address& return_address) { 188 StubCodeMark mark(this, "StubRoutines", "call_stub"); 189 address start = __ pc(); 190 191 #ifdef AARCH64 192 const int saved_regs_size = 192; 193 194 __ stp(FP, LR, Address(SP, -saved_regs_size, pre_indexed)); 195 __ mov(FP, SP); 196 197 int sp_offset = 16; 198 assert(frame::entry_frame_call_wrapper_offset * wordSize == sp_offset, "adjust this code"); 199 __ stp(R0, ZR, Address(SP, sp_offset)); sp_offset += 16; 200 201 const int saved_result_and_result_type_offset = sp_offset; 202 __ stp(R1, R2, Address(SP, sp_offset)); sp_offset += 16; 203 __ stp(R19, R20, Address(SP, sp_offset)); sp_offset += 16; 204 __ stp(R21, R22, Address(SP, sp_offset)); sp_offset += 16; 205 __ stp(R23, R24, Address(SP, sp_offset)); sp_offset += 16; 206 __ stp(R25, R26, Address(SP, sp_offset)); sp_offset += 16; 207 __ stp(R27, R28, Address(SP, sp_offset)); sp_offset += 16; 208 209 __ stp_d(V8, V9, Address(SP, sp_offset)); sp_offset += 16; 210 __ stp_d(V10, V11, Address(SP, sp_offset)); sp_offset += 16; 211 __ stp_d(V12, V13, Address(SP, sp_offset)); sp_offset += 16; 212 __ stp_d(V14, V15, Address(SP, sp_offset)); sp_offset += 16; 213 assert (sp_offset == saved_regs_size, "adjust this code"); 214 215 __ mov(Rmethod, R3); 216 __ mov(Rthread, R7); 217 __ reinit_heapbase(); 218 219 { // Pass parameters 220 Label done_parameters, pass_parameters; 221 222 __ mov(Rparams, SP); 223 __ cbz_w(R6, done_parameters); 224 225 __ sub(Rtemp, SP, R6, ex_uxtw, LogBytesPerWord); 226 __ align_reg(SP, Rtemp, StackAlignmentInBytes); 227 __ add(Rparams, SP, R6, ex_uxtw, LogBytesPerWord); 228 229 __ bind(pass_parameters); 230 __ subs_w(R6, R6, 1); 231 __ ldr(Rtemp, Address(R5, wordSize, post_indexed)); 232 __ str(Rtemp, Address(Rparams, -wordSize, pre_indexed)); 233 __ b(pass_parameters, ne); 234 235 __ bind(done_parameters); 236 237 #ifdef ASSERT 238 { 239 Label L; 240 __ cmp(SP, Rparams); 241 __ b(L, eq); 242 __ stop("SP does not match Rparams"); 243 __ bind(L); 244 } 245 #endif 246 } 247 248 __ mov(Rsender_sp, SP); 249 __ blr(R4); 250 return_address = __ pc(); 251 252 __ mov(SP, FP); 253 254 __ ldp(R1, R2, Address(SP, saved_result_and_result_type_offset)); 255 256 { // Handle return value 257 Label cont; 258 __ str(R0, Address(R1)); 259 260 __ cmp_w(R2, T_DOUBLE); 261 __ ccmp_w(R2, T_FLOAT, Assembler::flags_for_condition(eq), ne); 262 __ b(cont, ne); 263 264 __ str_d(V0, Address(R1)); 265 __ bind(cont); 266 } 267 268 sp_offset = saved_result_and_result_type_offset + 16; 269 __ ldp(R19, R20, Address(SP, 
sp_offset)); sp_offset += 16; 270 __ ldp(R21, R22, Address(SP, sp_offset)); sp_offset += 16; 271 __ ldp(R23, R24, Address(SP, sp_offset)); sp_offset += 16; 272 __ ldp(R25, R26, Address(SP, sp_offset)); sp_offset += 16; 273 __ ldp(R27, R28, Address(SP, sp_offset)); sp_offset += 16; 274 275 __ ldp_d(V8, V9, Address(SP, sp_offset)); sp_offset += 16; 276 __ ldp_d(V10, V11, Address(SP, sp_offset)); sp_offset += 16; 277 __ ldp_d(V12, V13, Address(SP, sp_offset)); sp_offset += 16; 278 __ ldp_d(V14, V15, Address(SP, sp_offset)); sp_offset += 16; 279 assert (sp_offset == saved_regs_size, "adjust this code"); 280 281 __ ldp(FP, LR, Address(SP, saved_regs_size, post_indexed)); 282 __ ret(); 283 284 #else // AARCH64 285 286 assert(frame::entry_frame_call_wrapper_offset == 0, "adjust this code"); 287 288 __ mov(Rtemp, SP); 289 __ push(RegisterSet(FP) | RegisterSet(LR)); 290 #ifndef __SOFTFP__ 291 __ fstmdbd(SP, FloatRegisterSet(D8, 8), writeback); 292 #endif 293 __ stmdb(SP, RegisterSet(R0, R2) | RegisterSet(R4, R6) | RegisterSet(R8, R10) | altFP_7_11, writeback); 294 __ mov(Rmethod, R3); 295 __ ldmia(Rtemp, RegisterSet(R1, R3) | Rthread); // stacked arguments 296 297 // XXX: TODO 298 // Would be better with respect to native tools if the following 299 // setting of FP was changed to conform to the native ABI, with FP 300 // pointing to the saved FP slot (and the corresponding modifications 301 // for entry_frame_call_wrapper_offset and frame::real_fp). 302 __ mov(FP, SP); 303 304 { 305 Label no_parameters, pass_parameters; 306 __ cmp(R3, 0); 307 __ b(no_parameters, eq); 308 309 __ bind(pass_parameters); 310 __ ldr(Rtemp, Address(R2, wordSize, post_indexed)); // Rtemp OK, unused and scratchable 311 __ subs(R3, R3, 1); 312 __ push(Rtemp); 313 __ b(pass_parameters, ne); 314 __ bind(no_parameters); 315 } 316 317 __ mov(Rsender_sp, SP); 318 __ blx(R1); 319 return_address = __ pc(); 320 321 __ add(SP, FP, wordSize); // Skip link to JavaCallWrapper 322 __ pop(RegisterSet(R2, R3)); 323 #ifndef __ABI_HARD__ 324 __ cmp(R3, T_LONG); 325 __ cmp(R3, T_DOUBLE, ne); 326 __ str(R0, Address(R2)); 327 __ str(R1, Address(R2, wordSize), eq); 328 #else 329 Label cont, l_float, l_double; 330 331 __ cmp(R3, T_DOUBLE); 332 __ b(l_double, eq); 333 334 __ cmp(R3, T_FLOAT); 335 __ b(l_float, eq); 336 337 __ cmp(R3, T_LONG); 338 __ str(R0, Address(R2)); 339 __ str(R1, Address(R2, wordSize), eq); 340 __ b(cont); 341 342 343 __ bind(l_double); 344 __ fstd(D0, Address(R2)); 345 __ b(cont); 346 347 __ bind(l_float); 348 __ fsts(S0, Address(R2)); 349 350 __ bind(cont); 351 #endif 352 353 __ pop(RegisterSet(R4, R6) | RegisterSet(R8, R10) | altFP_7_11); 354 #ifndef __SOFTFP__ 355 __ fldmiad(SP, FloatRegisterSet(D8, 8), writeback); 356 #endif 357 __ pop(RegisterSet(FP) | RegisterSet(PC)); 358 359 #endif // AARCH64 360 return start; 361 } 362 363 364 // (in) Rexception_obj: exception oop 365 address generate_catch_exception() { 366 StubCodeMark mark(this, "StubRoutines", "catch_exception"); 367 address start = __ pc(); 368 369 __ str(Rexception_obj, Address(Rthread, Thread::pending_exception_offset())); 370 __ b(StubRoutines::_call_stub_return_address); 371 372 return start; 373 } 374 375 376 // (in) Rexception_pc: return address 377 address generate_forward_exception() { 378 StubCodeMark mark(this, "StubRoutines", "forward exception"); 379 address start = __ pc(); 380 381 __ mov(c_rarg0, Rthread); 382 __ mov(c_rarg1, Rexception_pc); 383 __ call_VM_leaf(CAST_FROM_FN_PTR(address, 384 SharedRuntime::exception_handler_for_return_address), 
                    c_rarg0, c_rarg1);
    __ ldr(Rexception_obj, Address(Rthread, Thread::pending_exception_offset()));
    const Register Rzero = __ zero_register(Rtemp); // Rtemp OK (cleared by above call)
    __ str(Rzero, Address(Rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    { Label L;
      __ cbnz(Rexception_obj, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // Verify that there is really a valid exception in Rexception_obj.
    __ verify_oop(Rexception_obj);

    __ jump(R0); // handler is returned in R0 by runtime function
    return start;
  }


#ifndef AARCH64

  // Integer division shared routine
  // Input:
  //   R0  - dividend
  //   R2  - divisor
  // Output:
  //   R0  - remainder
  //   R1  - quotient
  // Destroys:
  //   R2
  //   LR
  address generate_idiv_irem() {
    Label positive_arguments, negative_or_zero, call_slow_path;
    Register dividend  = R0;
    Register divisor   = R2;
    Register remainder = R0;
    Register quotient  = R1;
    Register tmp       = LR;
    assert(dividend == remainder, "must be");

    address start = __ pc();

    // Check for special cases: divisor <= 0 or dividend < 0
    __ cmp(divisor, 0);
    __ orrs(quotient, dividend, divisor, ne);
    __ b(negative_or_zero, le);

    __ bind(positive_arguments);
    // Save return address on stack to free one extra register
    __ push(LR);
    // Approximate the maximum order of the quotient
    __ clz(tmp, dividend);
    __ clz(quotient, divisor);
    __ subs(tmp, quotient, tmp);
    __ mov(quotient, 0);
    // Jump to the appropriate place in the unrolled loop below
    __ ldr(PC, Address(PC, tmp, lsl, 2), pl);
    // If divisor is greater than dividend, return immediately
    __ pop(PC);

    // Offset table
    Label offset_table[32];
    int i;
    for (i = 0; i <= 31; i++) {
      __ emit_address(offset_table[i]);
    }

    // Unrolled loop of 32 division steps
    for (i = 31; i >= 0; i--) {
      __ bind(offset_table[i]);
      __ cmp(remainder, AsmOperand(divisor, lsl, i));
      __ sub(remainder, remainder, AsmOperand(divisor, lsl, i), hs);
      __ add(quotient, quotient, 1 << i, hs);
    }
    __ pop(PC);

    __ bind(negative_or_zero);
    // Find the combination of argument signs and jump to corresponding handler
    __ andr(quotient, dividend, 0x80000000, ne);
    __ orr(quotient, quotient, AsmOperand(divisor, lsr, 31), ne);
    __ add(PC, PC, AsmOperand(quotient, ror, 26), ne);
    __ str(LR, Address(Rthread, JavaThread::saved_exception_pc_offset()));

    // The leaf runtime function can destroy R0-R3 and R12 registers which are still alive
    RegisterSet saved_registers = RegisterSet(R3) | RegisterSet(R12);
#if R9_IS_SCRATCHED
    // Safer to save R9 here since callers may have been written
    // assuming R9 survives. This is suboptimal but may not be worth
    // revisiting for this slow case.
477 478 // save also R10 for alignment 479 saved_registers = saved_registers | RegisterSet(R9, R10); 480 #endif 481 { 482 // divisor == 0 483 FixedSizeCodeBlock zero_divisor(_masm, 8, true); 484 __ push(saved_registers); 485 __ mov(R0, Rthread); 486 __ mov(R1, LR); 487 __ mov(R2, SharedRuntime::IMPLICIT_DIVIDE_BY_ZERO); 488 __ b(call_slow_path); 489 } 490 491 { 492 // divisor > 0 && dividend < 0 493 FixedSizeCodeBlock positive_divisor_negative_dividend(_masm, 8, true); 494 __ push(LR); 495 __ rsb(dividend, dividend, 0); 496 __ bl(positive_arguments); 497 __ rsb(remainder, remainder, 0); 498 __ rsb(quotient, quotient, 0); 499 __ pop(PC); 500 } 501 502 { 503 // divisor < 0 && dividend > 0 504 FixedSizeCodeBlock negative_divisor_positive_dividend(_masm, 8, true); 505 __ push(LR); 506 __ rsb(divisor, divisor, 0); 507 __ bl(positive_arguments); 508 __ rsb(quotient, quotient, 0); 509 __ pop(PC); 510 } 511 512 { 513 // divisor < 0 && dividend < 0 514 FixedSizeCodeBlock negative_divisor_negative_dividend(_masm, 8, true); 515 __ push(LR); 516 __ rsb(dividend, dividend, 0); 517 __ rsb(divisor, divisor, 0); 518 __ bl(positive_arguments); 519 __ rsb(remainder, remainder, 0); 520 __ pop(PC); 521 } 522 523 __ bind(call_slow_path); 524 __ call(CAST_FROM_FN_PTR(address, SharedRuntime::continuation_for_implicit_exception)); 525 __ pop(saved_registers); 526 __ bx(R0); 527 528 return start; 529 } 530 531 532 // As per atomic.hpp the Atomic read-modify-write operations must be logically implemented as: 533 // <fence>; <op>; <membar StoreLoad|StoreStore> 534 // But for load-linked/store-conditional based systems a fence here simply means 535 // no load/store can be reordered with respect to the initial load-linked, so we have: 536 // <membar storeload|loadload> ; load-linked; <op>; store-conditional; <membar storeload|storestore> 537 // There are no memory actions in <op> so nothing further is needed. 538 // 539 // So we define the following for convenience: 540 #define MEMBAR_ATOMIC_OP_PRE \ 541 MacroAssembler::Membar_mask_bits(MacroAssembler::StoreLoad|MacroAssembler::LoadLoad) 542 #define MEMBAR_ATOMIC_OP_POST \ 543 MacroAssembler::Membar_mask_bits(MacroAssembler::StoreLoad|MacroAssembler::StoreStore) 544 545 // Note: JDK 9 only supports ARMv7+ so we always have ldrexd available even though the 546 // code below allows for it to be otherwise. The else clause indicates an ARMv5 system 547 // for which we do not support MP and so membars are not necessary. This ARMv5 code will 548 // be removed in the future. 
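  // For illustration (a sketch, not part of the original sources): the retry loop that
  // generate_atomic_add() emits below has the same logical shape as the following C++,
  // written with GCC/Clang __atomic builtins (the real stub uses LDREX/STREX and the
  // membar masks defined above, which may be weaker than the seq_cst fences shown here):
  //
  //   int illustrative_atomic_add(int add_value, volatile int* dest) {
  //     __atomic_thread_fence(__ATOMIC_SEQ_CST);            // MEMBAR_ATOMIC_OP_PRE
  //     int old_value = __atomic_load_n(dest, __ATOMIC_RELAXED);
  //     int new_value;
  //     do {
  //       new_value = old_value + add_value;                // <op>
  //     } while (!__atomic_compare_exchange_n(dest, &old_value, new_value,
  //                                           /*weak*/ true,
  //                                           __ATOMIC_RELAXED, __ATOMIC_RELAXED));
  //     __atomic_thread_fence(__ATOMIC_SEQ_CST);            // MEMBAR_ATOMIC_OP_POST
  //     return new_value;                                   // the stub returns the new value in R0
  //   }
  //
  // The xchg and cmpxchg stubs that follow differ only in the <op> performed between
  // the load-linked and the store-conditional.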
549 550 // Support for jint Atomic::add(jint add_value, volatile jint *dest) 551 // 552 // Arguments : 553 // 554 // add_value: R0 555 // dest: R1 556 // 557 // Results: 558 // 559 // R0: the new stored in dest 560 // 561 // Overwrites: 562 // 563 // R1, R2, R3 564 // 565 address generate_atomic_add() { 566 address start; 567 568 StubCodeMark mark(this, "StubRoutines", "atomic_add"); 569 Label retry; 570 start = __ pc(); 571 Register addval = R0; 572 Register dest = R1; 573 Register prev = R2; 574 Register ok = R2; 575 Register newval = R3; 576 577 if (VM_Version::supports_ldrex()) { 578 __ membar(MEMBAR_ATOMIC_OP_PRE, prev); 579 __ bind(retry); 580 __ ldrex(newval, Address(dest)); 581 __ add(newval, addval, newval); 582 __ strex(ok, newval, Address(dest)); 583 __ cmp(ok, 0); 584 __ b(retry, ne); 585 __ mov (R0, newval); 586 __ membar(MEMBAR_ATOMIC_OP_POST, prev); 587 } else { 588 __ bind(retry); 589 __ ldr (prev, Address(dest)); 590 __ add(newval, addval, prev); 591 __ atomic_cas_bool(prev, newval, dest, 0, noreg/*ignored*/); 592 __ b(retry, ne); 593 __ mov (R0, newval); 594 } 595 __ bx(LR); 596 597 return start; 598 } 599 600 // Support for jint Atomic::xchg(jint exchange_value, volatile jint *dest) 601 // 602 // Arguments : 603 // 604 // exchange_value: R0 605 // dest: R1 606 // 607 // Results: 608 // 609 // R0: the value previously stored in dest 610 // 611 // Overwrites: 612 // 613 // R1, R2, R3 614 // 615 address generate_atomic_xchg() { 616 address start; 617 618 StubCodeMark mark(this, "StubRoutines", "atomic_xchg"); 619 start = __ pc(); 620 Register newval = R0; 621 Register dest = R1; 622 Register prev = R2; 623 624 Label retry; 625 626 if (VM_Version::supports_ldrex()) { 627 Register ok=R3; 628 __ membar(MEMBAR_ATOMIC_OP_PRE, prev); 629 __ bind(retry); 630 __ ldrex(prev, Address(dest)); 631 __ strex(ok, newval, Address(dest)); 632 __ cmp(ok, 0); 633 __ b(retry, ne); 634 __ mov (R0, prev); 635 __ membar(MEMBAR_ATOMIC_OP_POST, prev); 636 } else { 637 __ bind(retry); 638 __ ldr (prev, Address(dest)); 639 __ atomic_cas_bool(prev, newval, dest, 0, noreg/*ignored*/); 640 __ b(retry, ne); 641 __ mov (R0, prev); 642 } 643 __ bx(LR); 644 645 return start; 646 } 647 648 // Support for jint Atomic::cmpxchg(jint exchange_value, volatile jint *dest, jint compare_value) 649 // 650 // Arguments : 651 // 652 // compare_value: R0 653 // exchange_value: R1 654 // dest: R2 655 // 656 // Results: 657 // 658 // R0: the value previously stored in dest 659 // 660 // Overwrites: 661 // 662 // R0, R1, R2, R3, Rtemp 663 // 664 address generate_atomic_cmpxchg() { 665 address start; 666 667 StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg"); 668 start = __ pc(); 669 Register cmp = R0; 670 Register newval = R1; 671 Register dest = R2; 672 Register temp1 = R3; 673 Register temp2 = Rtemp; // Rtemp free (native ABI) 674 675 __ membar(MEMBAR_ATOMIC_OP_PRE, temp1); 676 677 // atomic_cas returns previous value in R0 678 __ atomic_cas(temp1, temp2, cmp, newval, dest, 0); 679 680 __ membar(MEMBAR_ATOMIC_OP_POST, temp1); 681 682 __ bx(LR); 683 684 return start; 685 } 686 687 // Support for jlong Atomic::cmpxchg(jlong exchange_value, volatile jlong *dest, jlong compare_value) 688 // reordered before by a wrapper to (jlong compare_value, jlong exchange_value, volatile jlong *dest) 689 // 690 // Arguments : 691 // 692 // compare_value: R1 (High), R0 (Low) 693 // exchange_value: R3 (High), R2 (Low) 694 // dest: SP+0 695 // 696 // Results: 697 // 698 // R0:R1: the value previously stored in dest 699 // 700 // 
Overwrites: 701 // 702 address generate_atomic_cmpxchg_long() { 703 address start; 704 705 StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_long"); 706 start = __ pc(); 707 Register cmp_lo = R0; 708 Register cmp_hi = R1; 709 Register newval_lo = R2; 710 Register newval_hi = R3; 711 Register addr = Rtemp; /* After load from stack */ 712 Register temp_lo = R4; 713 Register temp_hi = R5; 714 Register temp_result = R8; 715 assert_different_registers(cmp_lo, newval_lo, temp_lo, addr, temp_result, R7); 716 assert_different_registers(cmp_hi, newval_hi, temp_hi, addr, temp_result, R7); 717 718 __ membar(MEMBAR_ATOMIC_OP_PRE, Rtemp); // Rtemp free (native ABI) 719 720 // Stack is unaligned, maintain double word alignment by pushing 721 // odd number of regs. 722 __ push(RegisterSet(temp_result) | RegisterSet(temp_lo, temp_hi)); 723 __ ldr(addr, Address(SP, 12)); 724 725 // atomic_cas64 returns previous value in temp_lo, temp_hi 726 __ atomic_cas64(temp_lo, temp_hi, temp_result, cmp_lo, cmp_hi, 727 newval_lo, newval_hi, addr, 0); 728 __ mov(R0, temp_lo); 729 __ mov(R1, temp_hi); 730 731 __ pop(RegisterSet(temp_result) | RegisterSet(temp_lo, temp_hi)); 732 733 __ membar(MEMBAR_ATOMIC_OP_POST, Rtemp); // Rtemp free (native ABI) 734 __ bx(LR); 735 736 return start; 737 } 738 739 address generate_atomic_load_long() { 740 address start; 741 742 StubCodeMark mark(this, "StubRoutines", "atomic_load_long"); 743 start = __ pc(); 744 Register result_lo = R0; 745 Register result_hi = R1; 746 Register src = R0; 747 748 if (!os::is_MP()) { 749 __ ldmia(src, RegisterSet(result_lo, result_hi)); 750 __ bx(LR); 751 } else if (VM_Version::supports_ldrexd()) { 752 __ ldrexd(result_lo, Address(src)); 753 __ clrex(); // FIXME: safe to remove? 754 __ bx(LR); 755 } else { 756 __ stop("Atomic load(jlong) unsupported on this platform"); 757 __ bx(LR); 758 } 759 760 return start; 761 } 762 763 address generate_atomic_store_long() { 764 address start; 765 766 StubCodeMark mark(this, "StubRoutines", "atomic_store_long"); 767 start = __ pc(); 768 Register newval_lo = R0; 769 Register newval_hi = R1; 770 Register dest = R2; 771 Register scratch_lo = R2; 772 Register scratch_hi = R3; /* After load from stack */ 773 Register result = R3; 774 775 if (!os::is_MP()) { 776 __ stmia(dest, RegisterSet(newval_lo, newval_hi)); 777 __ bx(LR); 778 } else if (VM_Version::supports_ldrexd()) { 779 __ mov(Rtemp, dest); // get dest to Rtemp 780 Label retry; 781 __ bind(retry); 782 __ ldrexd(scratch_lo, Address(Rtemp)); 783 __ strexd(result, R0, Address(Rtemp)); 784 __ rsbs(result, result, 1); 785 __ b(retry, eq); 786 __ bx(LR); 787 } else { 788 __ stop("Atomic store(jlong) unsupported on this platform"); 789 __ bx(LR); 790 } 791 792 return start; 793 } 794 795 796 #endif // AARCH64 797 798 #ifdef COMPILER2 799 // Support for uint StubRoutine::Arm::partial_subtype_check( Klass sub, Klass super ); 800 // Arguments : 801 // 802 // ret : R0, returned 803 // icc/xcc: set as R0 (depending on wordSize) 804 // sub : R1, argument, not changed 805 // super: R2, argument, not changed 806 // raddr: LR, blown by call 807 address generate_partial_subtype_check() { 808 __ align(CodeEntryAlignment); 809 StubCodeMark mark(this, "StubRoutines", "partial_subtype_check"); 810 address start = __ pc(); 811 812 // based on SPARC check_klass_subtype_[fast|slow]_path (without CompressedOops) 813 814 // R0 used as tmp_reg (in addition to return reg) 815 Register sub_klass = R1; 816 Register super_klass = R2; 817 Register tmp_reg2 = R3; 818 Register tmp_reg3 = R4; 
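    // For illustration (a sketch, not part of the original sources; names are schematic,
    // not real Klass accessors): the slow check generated below is equivalent to
    //   for (int i = 0; i < length of sub_klass's secondary supers array; i++) {
    //     if (secondary_supers[i] == super_klass) {
    //       sub_klass's secondary super cache = super_klass;  // remember the hit
    //       return 0;                                         // success: R0 == 0, flags 'eq'
    //     }
    //   }
    //   return non-zero;                                      // failure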
819 #define saved_set tmp_reg2, tmp_reg3 820 821 Label L_loop, L_fail; 822 823 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 824 825 // fast check should be redundant 826 827 // slow check 828 { 829 __ raw_push(saved_set); 830 831 // a couple of useful fields in sub_klass: 832 int ss_offset = in_bytes(Klass::secondary_supers_offset()); 833 834 // Do a linear scan of the secondary super-klass chain. 835 // This code is rarely used, so simplicity is a virtue here. 836 837 inc_counter_np(SharedRuntime::_partial_subtype_ctr, tmp_reg2, tmp_reg3); 838 839 Register scan_temp = tmp_reg2; 840 Register count_temp = tmp_reg3; 841 842 // We will consult the secondary-super array. 843 __ ldr(scan_temp, Address(sub_klass, ss_offset)); 844 845 Register search_key = super_klass; 846 847 // Load the array length. 848 __ ldr_s32(count_temp, Address(scan_temp, Array<Klass*>::length_offset_in_bytes())); 849 __ add(scan_temp, scan_temp, Array<Klass*>::base_offset_in_bytes()); 850 851 __ add(count_temp, count_temp, 1); 852 853 // Top of search loop 854 __ bind(L_loop); 855 // Notes: 856 // scan_temp starts at the array elements 857 // count_temp is 1+size 858 __ subs(count_temp, count_temp, 1); 859 __ b(L_fail, eq); // not found in the array 860 861 // Load next super to check 862 // In the array of super classes elements are pointer sized. 863 int element_size = wordSize; 864 __ ldr(R0, Address(scan_temp, element_size, post_indexed)); 865 866 // Look for Rsuper_klass on Rsub_klass's secondary super-class-overflow list 867 __ subs(R0, R0, search_key); // set R0 to 0 on success (and flags to eq) 868 869 // A miss means we are NOT a subtype and need to keep looping 870 __ b(L_loop, ne); 871 872 // Falling out the bottom means we found a hit; we ARE a subtype 873 874 // Success. Cache the super we found and proceed in triumph. 875 __ str(super_klass, Address(sub_klass, sc_offset)); 876 877 // Return success 878 // R0 is already 0 and flags are already set to eq 879 __ raw_pop(saved_set); 880 __ ret(); 881 882 // Return failure 883 __ bind(L_fail); 884 #ifdef AARCH64 885 // count_temp is 0, can't use ZR here 886 __ adds(R0, count_temp, 1); // sets the flags 887 #else 888 __ movs(R0, 1); // sets the flags 889 #endif 890 __ raw_pop(saved_set); 891 __ ret(); 892 } 893 return start; 894 } 895 #undef saved_set 896 #endif // COMPILER2 897 898 899 //---------------------------------------------------------------------------------------------------- 900 // Non-destructive plausibility checks for oops 901 902 address generate_verify_oop() { 903 StubCodeMark mark(this, "StubRoutines", "verify_oop"); 904 address start = __ pc(); 905 906 // Incoming arguments: 907 // 908 // R0: error message (char* ) 909 // R1: address of register save area 910 // R2: oop to verify 911 // 912 // All registers are saved before calling this stub. However, condition flags should be saved here. 
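    // For illustration (a sketch, not part of the original sources): the checks emitted
    // below are essentially
    //   if (oop == NULL) return;                                                  // NULL is ok
    //   if ((oop & Universe::verify_oop_mask()) != Universe::verify_oop_bits()) error();
    //   if (klass of oop == NULL) error();                                        // broken klass pointer
    // with the incoming condition flags preserved around the whole stub.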
913 914 const Register oop = R2; 915 const Register klass = R3; 916 const Register tmp1 = R6; 917 const Register tmp2 = R8; 918 919 const Register flags = Rtmp_save0; // R4/R19 920 const Register ret_addr = Rtmp_save1; // R5/R20 921 assert_different_registers(oop, klass, tmp1, tmp2, flags, ret_addr, R7); 922 923 Label exit, error; 924 InlinedAddress verify_oop_count((address) StubRoutines::verify_oop_count_addr()); 925 926 #ifdef AARCH64 927 __ mrs(flags, Assembler::SysReg_NZCV); 928 #else 929 __ mrs(Assembler::CPSR, flags); 930 #endif // AARCH64 931 932 __ ldr_literal(tmp1, verify_oop_count); 933 __ ldr_s32(tmp2, Address(tmp1)); 934 __ add(tmp2, tmp2, 1); 935 __ str_32(tmp2, Address(tmp1)); 936 937 // make sure object is 'reasonable' 938 __ cbz(oop, exit); // if obj is NULL it is ok 939 940 // Check if the oop is in the right area of memory 941 // Note: oop_mask and oop_bits must be updated if the code is saved/reused 942 const address oop_mask = (address) Universe::verify_oop_mask(); 943 const address oop_bits = (address) Universe::verify_oop_bits(); 944 __ mov_address(tmp1, oop_mask, symbolic_Relocation::oop_mask_reference); 945 __ andr(tmp2, oop, tmp1); 946 __ mov_address(tmp1, oop_bits, symbolic_Relocation::oop_bits_reference); 947 __ cmp(tmp2, tmp1); 948 __ b(error, ne); 949 950 // make sure klass is 'reasonable' 951 __ load_klass(klass, oop); // get klass 952 __ cbz(klass, error); // if klass is NULL it is broken 953 954 // return if everything seems ok 955 __ bind(exit); 956 957 #ifdef AARCH64 958 __ msr(Assembler::SysReg_NZCV, flags); 959 #else 960 __ msr(Assembler::CPSR_f, flags); 961 #endif // AARCH64 962 963 __ ret(); 964 965 // handle errors 966 __ bind(error); 967 968 __ mov(ret_addr, LR); // save return address 969 970 // R0: error message 971 // R1: register save area 972 __ call(CAST_FROM_FN_PTR(address, MacroAssembler::debug)); 973 974 __ mov(LR, ret_addr); 975 __ b(exit); 976 977 __ bind_literal(verify_oop_count); 978 979 return start; 980 } 981 982 //---------------------------------------------------------------------------------------------------- 983 // Array copy stubs 984 985 // 986 // Generate overlap test for array copy stubs 987 // 988 // Input: 989 // R0 - array1 990 // R1 - array2 991 // R2 - element count, 32-bit int 992 // 993 // input registers are preserved 994 // 995 void array_overlap_test(address no_overlap_target, int log2_elem_size, Register tmp1, Register tmp2) { 996 assert(no_overlap_target != NULL, "must be generated"); 997 array_overlap_test(no_overlap_target, NULL, log2_elem_size, tmp1, tmp2); 998 } 999 void array_overlap_test(Label& L_no_overlap, int log2_elem_size, Register tmp1, Register tmp2) { 1000 array_overlap_test(NULL, &L_no_overlap, log2_elem_size, tmp1, tmp2); 1001 } 1002 void array_overlap_test(address no_overlap_target, Label* NOLp, int log2_elem_size, Register tmp1, Register tmp2) { 1003 const Register from = R0; 1004 const Register to = R1; 1005 const Register count = R2; 1006 const Register to_from = tmp1; // to - from 1007 #ifndef AARCH64 1008 const Register byte_count = (log2_elem_size == 0) ? 
count : tmp2; // count << log2_elem_size 1009 #endif // AARCH64 1010 assert_different_registers(from, to, count, tmp1, tmp2); 1011 1012 // no_overlap version works if 'to' lower (unsigned) than 'from' 1013 // and or 'to' more than (count*size) from 'from' 1014 1015 BLOCK_COMMENT("Array Overlap Test:"); 1016 __ subs(to_from, to, from); 1017 #ifndef AARCH64 1018 if (log2_elem_size != 0) { 1019 __ mov(byte_count, AsmOperand(count, lsl, log2_elem_size)); 1020 } 1021 #endif // !AARCH64 1022 if (NOLp == NULL) 1023 __ b(no_overlap_target,lo); 1024 else 1025 __ b((*NOLp), lo); 1026 #ifdef AARCH64 1027 __ subs(ZR, to_from, count, ex_sxtw, log2_elem_size); 1028 #else 1029 __ cmp(to_from, byte_count); 1030 #endif // AARCH64 1031 if (NOLp == NULL) 1032 __ b(no_overlap_target, ge); 1033 else 1034 __ b((*NOLp), ge); 1035 } 1036 1037 #ifdef AARCH64 1038 // TODO-AARCH64: revise usages of bulk_* methods (probably ldp`s and stp`s should interlace) 1039 1040 // Loads [from, from + count*wordSize) into regs[0], regs[1], ..., regs[count-1] 1041 // and increases 'from' by count*wordSize. 1042 void bulk_load_forward(Register from, const Register regs[], int count) { 1043 assert (count > 0 && count % 2 == 0, "count must be positive even number"); 1044 int bytes = count * wordSize; 1045 1046 int offset = 0; 1047 __ ldp(regs[0], regs[1], Address(from, bytes, post_indexed)); 1048 offset += 2*wordSize; 1049 1050 for (int i = 2; i < count; i += 2) { 1051 __ ldp(regs[i], regs[i+1], Address(from, -bytes + offset)); 1052 offset += 2*wordSize; 1053 } 1054 1055 assert (offset == bytes, "must be"); 1056 } 1057 1058 // Stores regs[0], regs[1], ..., regs[count-1] to [to, to + count*wordSize) 1059 // and increases 'to' by count*wordSize. 1060 void bulk_store_forward(Register to, const Register regs[], int count) { 1061 assert (count > 0 && count % 2 == 0, "count must be positive even number"); 1062 int bytes = count * wordSize; 1063 1064 int offset = 0; 1065 __ stp(regs[0], regs[1], Address(to, bytes, post_indexed)); 1066 offset += 2*wordSize; 1067 1068 for (int i = 2; i < count; i += 2) { 1069 __ stp(regs[i], regs[i+1], Address(to, -bytes + offset)); 1070 offset += 2*wordSize; 1071 } 1072 1073 assert (offset == bytes, "must be"); 1074 } 1075 1076 // Loads [from - count*wordSize, from) into regs[0], regs[1], ..., regs[count-1] 1077 // and decreases 'from' by count*wordSize. 1078 // Note that the word with lowest address goes to regs[0]. 1079 void bulk_load_backward(Register from, const Register regs[], int count) { 1080 assert (count > 0 && count % 2 == 0, "count must be positive even number"); 1081 int bytes = count * wordSize; 1082 1083 int offset = 0; 1084 1085 for (int i = count - 2; i > 0; i -= 2) { 1086 offset += 2*wordSize; 1087 __ ldp(regs[i], regs[i+1], Address(from, -offset)); 1088 } 1089 1090 offset += 2*wordSize; 1091 __ ldp(regs[0], regs[1], Address(from, -bytes, pre_indexed)); 1092 1093 assert (offset == bytes, "must be"); 1094 } 1095 1096 // Stores regs[0], regs[1], ..., regs[count-1] into [to - count*wordSize, to) 1097 // and decreases 'to' by count*wordSize. 1098 // Note that regs[0] value goes into the memory with lowest address. 
1099 void bulk_store_backward(Register to, const Register regs[], int count) { 1100 assert (count > 0 && count % 2 == 0, "count must be positive even number"); 1101 int bytes = count * wordSize; 1102 1103 int offset = 0; 1104 1105 for (int i = count - 2; i > 0; i -= 2) { 1106 offset += 2*wordSize; 1107 __ stp(regs[i], regs[i+1], Address(to, -offset)); 1108 } 1109 1110 offset += 2*wordSize; 1111 __ stp(regs[0], regs[1], Address(to, -bytes, pre_indexed)); 1112 1113 assert (offset == bytes, "must be"); 1114 } 1115 #endif // AARCH64 1116 1117 // TODO-AARCH64: rearrange in-loop prefetches: 1118 // probably we should choose between "prefetch-store before or after store", not "before or after load". 1119 void prefetch(Register from, Register to, int offset, int to_delta = 0) { 1120 __ prefetch_read(Address(from, offset)); 1121 #ifdef AARCH64 1122 // Next line commented out to avoid significant loss of performance in memory copy - JDK-8078120 1123 // __ prfm(pstl1keep, Address(to, offset + to_delta)); 1124 #endif // AARCH64 1125 } 1126 1127 // Generate the inner loop for forward aligned array copy 1128 // 1129 // Arguments 1130 // from: src address, 64 bits aligned 1131 // to: dst address, wordSize aligned 1132 // count: number of elements (32-bit int) 1133 // bytes_per_count: number of bytes for each unit of 'count' 1134 // 1135 // Return the minimum initial value for count 1136 // 1137 // Notes: 1138 // - 'from' aligned on 64-bit (recommended for 32-bit ARM in case this speeds up LDMIA, required for AArch64) 1139 // - 'to' aligned on wordSize 1140 // - 'count' must be greater or equal than the returned value 1141 // 1142 // Increases 'from' and 'to' by count*bytes_per_count. 1143 // 1144 // Scratches 'count', R3. 1145 // On AArch64 also scratches R4-R10; on 32-bit ARM R4-R10 are preserved (saved/restored). 1146 // 1147 int generate_forward_aligned_copy_loop(Register from, Register to, Register count, int bytes_per_count) { 1148 assert (from == R0 && to == R1 && count == R2, "adjust the implementation below"); 1149 1150 const int bytes_per_loop = 8*wordSize; // 8 registers are read and written on every loop iteration 1151 arraycopy_loop_config *config=&arraycopy_configurations[ArmCopyPlatform].forward_aligned; 1152 int pld_offset = config->pld_distance; 1153 const int count_per_loop = bytes_per_loop / bytes_per_count; 1154 1155 #ifndef AARCH64 1156 bool split_read= config->split_ldm; 1157 bool split_write= config->split_stm; 1158 1159 // XXX optim: use VLDM/VSTM when available (Neon) with PLD 1160 // NEONCopyPLD 1161 // PLD [r1, #0xC0] 1162 // VLDM r1!,{d0-d7} 1163 // VSTM r0!,{d0-d7} 1164 // SUBS r2,r2,#0x40 1165 // BGE NEONCopyPLD 1166 1167 __ push(RegisterSet(R4,R10)); 1168 #endif // !AARCH64 1169 1170 const bool prefetch_before = pld_offset < 0; 1171 const bool prefetch_after = pld_offset > 0; 1172 1173 Label L_skip_pld; 1174 1175 // predecrease to exit when there is less than count_per_loop 1176 __ sub_32(count, count, count_per_loop); 1177 1178 if (pld_offset != 0) { 1179 pld_offset = (pld_offset < 0) ? -pld_offset : pld_offset; 1180 1181 prefetch(from, to, 0); 1182 1183 if (prefetch_before) { 1184 // If prefetch is done ahead, final PLDs that overflow the 1185 // copied area can be easily avoided. 'count' is predecreased 1186 // by the prefetch distance to optimize the inner loop and the 1187 // outer loop skips the PLD. 
1188 __ subs_32(count, count, (bytes_per_loop+pld_offset)/bytes_per_count); 1189 1190 // skip prefetch for small copies 1191 __ b(L_skip_pld, lt); 1192 } 1193 1194 int offset = ArmCopyCacheLineSize; 1195 while (offset <= pld_offset) { 1196 prefetch(from, to, offset); 1197 offset += ArmCopyCacheLineSize; 1198 }; 1199 } 1200 1201 #ifdef AARCH64 1202 const Register data_regs[8] = {R3, R4, R5, R6, R7, R8, R9, R10}; 1203 #endif // AARCH64 1204 { 1205 // LDM (32-bit ARM) / LDP (AArch64) copy of 'bytes_per_loop' bytes 1206 1207 // 32-bit ARM note: we have tried implementing loop unrolling to skip one 1208 // PLD with 64 bytes cache line but the gain was not significant. 1209 1210 Label L_copy_loop; 1211 __ align(OptoLoopAlignment); 1212 __ BIND(L_copy_loop); 1213 1214 if (prefetch_before) { 1215 prefetch(from, to, bytes_per_loop + pld_offset); 1216 __ BIND(L_skip_pld); 1217 } 1218 1219 #ifdef AARCH64 1220 bulk_load_forward(from, data_regs, 8); 1221 #else 1222 if (split_read) { 1223 // Split the register set in two sets so that there is less 1224 // latency between LDM and STM (R3-R6 available while R7-R10 1225 // still loading) and less register locking issue when iterating 1226 // on the first LDM. 1227 __ ldmia(from, RegisterSet(R3, R6), writeback); 1228 __ ldmia(from, RegisterSet(R7, R10), writeback); 1229 } else { 1230 __ ldmia(from, RegisterSet(R3, R10), writeback); 1231 } 1232 #endif // AARCH64 1233 1234 __ subs_32(count, count, count_per_loop); 1235 1236 if (prefetch_after) { 1237 prefetch(from, to, pld_offset, bytes_per_loop); 1238 } 1239 1240 #ifdef AARCH64 1241 bulk_store_forward(to, data_regs, 8); 1242 #else 1243 if (split_write) { 1244 __ stmia(to, RegisterSet(R3, R6), writeback); 1245 __ stmia(to, RegisterSet(R7, R10), writeback); 1246 } else { 1247 __ stmia(to, RegisterSet(R3, R10), writeback); 1248 } 1249 #endif // AARCH64 1250 1251 __ b(L_copy_loop, ge); 1252 1253 if (prefetch_before) { 1254 // the inner loop may end earlier, allowing to skip PLD for the last iterations 1255 __ cmn_32(count, (bytes_per_loop + pld_offset)/bytes_per_count); 1256 __ b(L_skip_pld, ge); 1257 } 1258 } 1259 BLOCK_COMMENT("Remaining bytes:"); 1260 // still 0..bytes_per_loop-1 aligned bytes to copy, count already decreased by (at least) bytes_per_loop bytes 1261 1262 // __ add(count, count, ...); // addition useless for the bit tests 1263 assert (pld_offset % bytes_per_loop == 0, "decreasing count by pld_offset before loop must not change tested bits"); 1264 1265 #ifdef AARCH64 1266 assert (bytes_per_loop == 64, "adjust the code below"); 1267 assert (bytes_per_count <= 8, "adjust the code below"); 1268 1269 { 1270 Label L; 1271 __ tbz(count, exact_log2(32/bytes_per_count), L); 1272 1273 bulk_load_forward(from, data_regs, 4); 1274 bulk_store_forward(to, data_regs, 4); 1275 1276 __ bind(L); 1277 } 1278 1279 { 1280 Label L; 1281 __ tbz(count, exact_log2(16/bytes_per_count), L); 1282 1283 bulk_load_forward(from, data_regs, 2); 1284 bulk_store_forward(to, data_regs, 2); 1285 1286 __ bind(L); 1287 } 1288 1289 { 1290 Label L; 1291 __ tbz(count, exact_log2(8/bytes_per_count), L); 1292 1293 __ ldr(R3, Address(from, 8, post_indexed)); 1294 __ str(R3, Address(to, 8, post_indexed)); 1295 1296 __ bind(L); 1297 } 1298 1299 if (bytes_per_count <= 4) { 1300 Label L; 1301 __ tbz(count, exact_log2(4/bytes_per_count), L); 1302 1303 __ ldr_w(R3, Address(from, 4, post_indexed)); 1304 __ str_w(R3, Address(to, 4, post_indexed)); 1305 1306 __ bind(L); 1307 } 1308 1309 if (bytes_per_count <= 2) { 1310 Label L; 1311 __ tbz(count, 
exact_log2(2/bytes_per_count), L); 1312 1313 __ ldrh(R3, Address(from, 2, post_indexed)); 1314 __ strh(R3, Address(to, 2, post_indexed)); 1315 1316 __ bind(L); 1317 } 1318 1319 if (bytes_per_count <= 1) { 1320 Label L; 1321 __ tbz(count, 0, L); 1322 1323 __ ldrb(R3, Address(from, 1, post_indexed)); 1324 __ strb(R3, Address(to, 1, post_indexed)); 1325 1326 __ bind(L); 1327 } 1328 #else 1329 __ tst(count, 16 / bytes_per_count); 1330 __ ldmia(from, RegisterSet(R3, R6), writeback, ne); // copy 16 bytes 1331 __ stmia(to, RegisterSet(R3, R6), writeback, ne); 1332 1333 __ tst(count, 8 / bytes_per_count); 1334 __ ldmia(from, RegisterSet(R3, R4), writeback, ne); // copy 8 bytes 1335 __ stmia(to, RegisterSet(R3, R4), writeback, ne); 1336 1337 if (bytes_per_count <= 4) { 1338 __ tst(count, 4 / bytes_per_count); 1339 __ ldr(R3, Address(from, 4, post_indexed), ne); // copy 4 bytes 1340 __ str(R3, Address(to, 4, post_indexed), ne); 1341 } 1342 1343 if (bytes_per_count <= 2) { 1344 __ tst(count, 2 / bytes_per_count); 1345 __ ldrh(R3, Address(from, 2, post_indexed), ne); // copy 2 bytes 1346 __ strh(R3, Address(to, 2, post_indexed), ne); 1347 } 1348 1349 if (bytes_per_count == 1) { 1350 __ tst(count, 1); 1351 __ ldrb(R3, Address(from, 1, post_indexed), ne); 1352 __ strb(R3, Address(to, 1, post_indexed), ne); 1353 } 1354 1355 __ pop(RegisterSet(R4,R10)); 1356 #endif // AARCH64 1357 1358 return count_per_loop; 1359 } 1360 1361 1362 // Generate the inner loop for backward aligned array copy 1363 // 1364 // Arguments 1365 // end_from: src end address, 64 bits aligned 1366 // end_to: dst end address, wordSize aligned 1367 // count: number of elements (32-bit int) 1368 // bytes_per_count: number of bytes for each unit of 'count' 1369 // 1370 // Return the minimum initial value for count 1371 // 1372 // Notes: 1373 // - 'end_from' aligned on 64-bit (recommended for 32-bit ARM in case this speeds up LDMIA, required for AArch64) 1374 // - 'end_to' aligned on wordSize 1375 // - 'count' must be greater or equal than the returned value 1376 // 1377 // Decreases 'end_from' and 'end_to' by count*bytes_per_count. 1378 // 1379 // Scratches 'count', R3. 1380 // On AArch64 also scratches R4-R10; on 32-bit ARM R4-R10 are preserved (saved/restored). 1381 // 1382 int generate_backward_aligned_copy_loop(Register end_from, Register end_to, Register count, int bytes_per_count) { 1383 assert (end_from == R0 && end_to == R1 && count == R2, "adjust the implementation below"); 1384 1385 const int bytes_per_loop = 8*wordSize; // 8 registers are read and written on every loop iteration 1386 const int count_per_loop = bytes_per_loop / bytes_per_count; 1387 1388 arraycopy_loop_config *config=&arraycopy_configurations[ArmCopyPlatform].backward_aligned; 1389 int pld_offset = config->pld_distance; 1390 1391 #ifndef AARCH64 1392 bool split_read= config->split_ldm; 1393 bool split_write= config->split_stm; 1394 1395 // See the forward copy variant for additional comments. 1396 1397 __ push(RegisterSet(R4,R10)); 1398 #endif // !AARCH64 1399 1400 __ sub_32(count, count, count_per_loop); 1401 1402 const bool prefetch_before = pld_offset < 0; 1403 const bool prefetch_after = pld_offset > 0; 1404 1405 Label L_skip_pld; 1406 1407 if (pld_offset != 0) { 1408 pld_offset = (pld_offset < 0) ? 
-pld_offset : pld_offset; 1409 1410 prefetch(end_from, end_to, -wordSize); 1411 1412 if (prefetch_before) { 1413 __ subs_32(count, count, (bytes_per_loop + pld_offset) / bytes_per_count); 1414 __ b(L_skip_pld, lt); 1415 } 1416 1417 int offset = ArmCopyCacheLineSize; 1418 while (offset <= pld_offset) { 1419 prefetch(end_from, end_to, -(wordSize + offset)); 1420 offset += ArmCopyCacheLineSize; 1421 }; 1422 } 1423 1424 #ifdef AARCH64 1425 const Register data_regs[8] = {R3, R4, R5, R6, R7, R8, R9, R10}; 1426 #endif // AARCH64 1427 { 1428 // LDM (32-bit ARM) / LDP (AArch64) copy of 'bytes_per_loop' bytes 1429 1430 // 32-bit ARM note: we have tried implementing loop unrolling to skip one 1431 // PLD with 64 bytes cache line but the gain was not significant. 1432 1433 Label L_copy_loop; 1434 __ align(OptoLoopAlignment); 1435 __ BIND(L_copy_loop); 1436 1437 if (prefetch_before) { 1438 prefetch(end_from, end_to, -(wordSize + bytes_per_loop + pld_offset)); 1439 __ BIND(L_skip_pld); 1440 } 1441 1442 #ifdef AARCH64 1443 bulk_load_backward(end_from, data_regs, 8); 1444 #else 1445 if (split_read) { 1446 __ ldmdb(end_from, RegisterSet(R7, R10), writeback); 1447 __ ldmdb(end_from, RegisterSet(R3, R6), writeback); 1448 } else { 1449 __ ldmdb(end_from, RegisterSet(R3, R10), writeback); 1450 } 1451 #endif // AARCH64 1452 1453 __ subs_32(count, count, count_per_loop); 1454 1455 if (prefetch_after) { 1456 prefetch(end_from, end_to, -(wordSize + pld_offset), -bytes_per_loop); 1457 } 1458 1459 #ifdef AARCH64 1460 bulk_store_backward(end_to, data_regs, 8); 1461 #else 1462 if (split_write) { 1463 __ stmdb(end_to, RegisterSet(R7, R10), writeback); 1464 __ stmdb(end_to, RegisterSet(R3, R6), writeback); 1465 } else { 1466 __ stmdb(end_to, RegisterSet(R3, R10), writeback); 1467 } 1468 #endif // AARCH64 1469 1470 __ b(L_copy_loop, ge); 1471 1472 if (prefetch_before) { 1473 __ cmn_32(count, (bytes_per_loop + pld_offset)/bytes_per_count); 1474 __ b(L_skip_pld, ge); 1475 } 1476 } 1477 BLOCK_COMMENT("Remaining bytes:"); 1478 // still 0..bytes_per_loop-1 aligned bytes to copy, count already decreased by (at least) bytes_per_loop bytes 1479 1480 // __ add(count, count, ...); // addition useless for the bit tests 1481 assert (pld_offset % bytes_per_loop == 0, "decreasing count by pld_offset before loop must not change tested bits"); 1482 1483 #ifdef AARCH64 1484 assert (bytes_per_loop == 64, "adjust the code below"); 1485 assert (bytes_per_count <= 8, "adjust the code below"); 1486 1487 { 1488 Label L; 1489 __ tbz(count, exact_log2(32/bytes_per_count), L); 1490 1491 bulk_load_backward(end_from, data_regs, 4); 1492 bulk_store_backward(end_to, data_regs, 4); 1493 1494 __ bind(L); 1495 } 1496 1497 { 1498 Label L; 1499 __ tbz(count, exact_log2(16/bytes_per_count), L); 1500 1501 bulk_load_backward(end_from, data_regs, 2); 1502 bulk_store_backward(end_to, data_regs, 2); 1503 1504 __ bind(L); 1505 } 1506 1507 { 1508 Label L; 1509 __ tbz(count, exact_log2(8/bytes_per_count), L); 1510 1511 __ ldr(R3, Address(end_from, -8, pre_indexed)); 1512 __ str(R3, Address(end_to, -8, pre_indexed)); 1513 1514 __ bind(L); 1515 } 1516 1517 if (bytes_per_count <= 4) { 1518 Label L; 1519 __ tbz(count, exact_log2(4/bytes_per_count), L); 1520 1521 __ ldr_w(R3, Address(end_from, -4, pre_indexed)); 1522 __ str_w(R3, Address(end_to, -4, pre_indexed)); 1523 1524 __ bind(L); 1525 } 1526 1527 if (bytes_per_count <= 2) { 1528 Label L; 1529 __ tbz(count, exact_log2(2/bytes_per_count), L); 1530 1531 __ ldrh(R3, Address(end_from, -2, pre_indexed)); 1532 __ 
strh(R3, Address(end_to, -2, pre_indexed));

      __ bind(L);
    }

    if (bytes_per_count <= 1) {
      Label L;
      __ tbz(count, 0, L);

      __ ldrb(R3, Address(end_from, -1, pre_indexed));
      __ strb(R3, Address(end_to, -1, pre_indexed));

      __ bind(L);
    }
#else
    __ tst(count, 16 / bytes_per_count);
    __ ldmdb(end_from, RegisterSet(R3, R6), writeback, ne); // copy 16 bytes
    __ stmdb(end_to, RegisterSet(R3, R6), writeback, ne);

    __ tst(count, 8 / bytes_per_count);
    __ ldmdb(end_from, RegisterSet(R3, R4), writeback, ne); // copy 8 bytes
    __ stmdb(end_to, RegisterSet(R3, R4), writeback, ne);

    if (bytes_per_count <= 4) {
      __ tst(count, 4 / bytes_per_count);
      __ ldr(R3, Address(end_from, -4, pre_indexed), ne); // copy 4 bytes
      __ str(R3, Address(end_to, -4, pre_indexed), ne);
    }

    if (bytes_per_count <= 2) {
      __ tst(count, 2 / bytes_per_count);
      __ ldrh(R3, Address(end_from, -2, pre_indexed), ne); // copy 2 bytes
      __ strh(R3, Address(end_to, -2, pre_indexed), ne);
    }

    if (bytes_per_count == 1) {
      __ tst(count, 1);
      __ ldrb(R3, Address(end_from, -1, pre_indexed), ne);
      __ strb(R3, Address(end_to, -1, pre_indexed), ne);
    }

    __ pop(RegisterSet(R4,R10));
#endif // AARCH64

    return count_per_loop;
  }


  // Generate the inner loop for shifted forward array copy (unaligned copy).
  // It can be used when bytes_per_count < wordSize, i.e.
  // byte/short copy on 32-bit ARM, byte/short/int/compressed-oop copy on AArch64.
  //
  // Arguments
  //   from:            start src address, 64 bits aligned
  //   to:              start dst address, (now) wordSize aligned
  //   count:           number of elements (32-bit int)
  //   bytes_per_count: number of bytes for each unit of 'count'
  //   lsr_shift:       shift applied to the 'old' value to skip the bytes already written
  //   lsl_shift:       shift applied to the 'new' value to set the high bytes of the next write
  //
  // Return the minimum initial value for count
  //
  // Notes:
  // - 'from' aligned on 64-bit (recommended for 32-bit ARM in case this speeds up LDMIA, required for AArch64)
  // - 'to' aligned on wordSize
  // - 'count' must be greater than or equal to the returned value
  // - 'lsr_shift' + 'lsl_shift' = BitsPerWord
  // - 'bytes_per_count' is 1 or 2 on 32-bit ARM; 1, 2 or 4 on AArch64
  //
  // Increases 'to' by count*bytes_per_count.
  //
  // Scratches 'from' and 'count', R3-R10, R12
  //
  // On entry:
  // - R12 is preloaded with the first 'BitsPerWord' bits read just before 'from'
  // - (R12 >> lsr_shift) is the part not yet written (just before 'to')
  // --> (*to) = (R12 >> lsr_shift) | ((*from) << lsl_shift); ...
  //
  // This implementation may read more bytes than required.
  // Actually, it always reads exactly all data from the copied region with the upper bound aligned up to wordSize,
  // so the excess reads do not cross a word boundary and are thus harmless.
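  // For illustration (a sketch, not part of the original sources; variable names are
  // made up): per destination word the shifted loop below behaves like
  //     uintptr_t carried = R12;                        // preloaded before entry
  //     while (enough elements remain) {
  //       uintptr_t next = *aligned_from++;             // one aligned word read
  //       *aligned_to++  = (carried >> lsr_shift) | (next << lsl_shift);
  //       carried = next;                               // its high bytes are still pending
  //     }
  // i.e. every store merges the not-yet-written bytes of the previous aligned read with
  // the low bytes of the following one.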
1613 // 1614 int generate_forward_shifted_copy_loop(Register from, Register to, Register count, int bytes_per_count, int lsr_shift, int lsl_shift) { 1615 assert (from == R0 && to == R1 && count == R2, "adjust the implementation below"); 1616 1617 const int bytes_per_loop = 8*wordSize; // 8 registers are read and written on every loop iter 1618 const int count_per_loop = bytes_per_loop / bytes_per_count; 1619 1620 arraycopy_loop_config *config=&arraycopy_configurations[ArmCopyPlatform].forward_shifted; 1621 int pld_offset = config->pld_distance; 1622 1623 #ifndef AARCH64 1624 bool split_read= config->split_ldm; 1625 bool split_write= config->split_stm; 1626 #endif // !AARCH64 1627 1628 const bool prefetch_before = pld_offset < 0; 1629 const bool prefetch_after = pld_offset > 0; 1630 Label L_skip_pld, L_last_read, L_done; 1631 if (pld_offset != 0) { 1632 1633 pld_offset = (pld_offset < 0) ? -pld_offset : pld_offset; 1634 1635 prefetch(from, to, 0); 1636 1637 if (prefetch_before) { 1638 __ cmp_32(count, count_per_loop); 1639 __ b(L_last_read, lt); 1640 // skip prefetch for small copies 1641 // warning: count is predecreased by the prefetch distance to optimize the inner loop 1642 __ subs_32(count, count, ((bytes_per_loop + pld_offset) / bytes_per_count) + count_per_loop); 1643 __ b(L_skip_pld, lt); 1644 } 1645 1646 int offset = ArmCopyCacheLineSize; 1647 while (offset <= pld_offset) { 1648 prefetch(from, to, offset); 1649 offset += ArmCopyCacheLineSize; 1650 }; 1651 } 1652 1653 Label L_shifted_loop; 1654 1655 __ align(OptoLoopAlignment); 1656 __ BIND(L_shifted_loop); 1657 1658 if (prefetch_before) { 1659 // do it early if there might be register locking issues 1660 prefetch(from, to, bytes_per_loop + pld_offset); 1661 __ BIND(L_skip_pld); 1662 } else { 1663 __ cmp_32(count, count_per_loop); 1664 __ b(L_last_read, lt); 1665 } 1666 1667 #ifdef AARCH64 1668 const Register data_regs[9] = {R3, R4, R5, R6, R7, R8, R9, R10, R12}; 1669 __ logical_shift_right(R3, R12, lsr_shift); // part of R12 not yet written 1670 __ subs_32(count, count, count_per_loop); 1671 bulk_load_forward(from, &data_regs[1], 8); 1672 #else 1673 // read 32 bytes 1674 if (split_read) { 1675 // if write is not split, use less registers in first set to reduce locking 1676 RegisterSet set1 = split_write ? RegisterSet(R4, R7) : RegisterSet(R4, R5); 1677 RegisterSet set2 = (split_write ? RegisterSet(R8, R10) : RegisterSet(R6, R10)) | R12; 1678 __ ldmia(from, set1, writeback); 1679 __ mov(R3, AsmOperand(R12, lsr, lsr_shift)); // part of R12 not yet written 1680 __ ldmia(from, set2, writeback); 1681 __ subs(count, count, count_per_loop); // XXX: should it be before the 2nd LDM ? (latency vs locking) 1682 } else { 1683 __ mov(R3, AsmOperand(R12, lsr, lsr_shift)); // part of R12 not yet written 1684 __ ldmia(from, RegisterSet(R4, R10) | R12, writeback); // Note: small latency on R4 1685 __ subs(count, count, count_per_loop); 1686 } 1687 #endif // AARCH64 1688 1689 if (prefetch_after) { 1690 // do it after the 1st ldm/ldp anyway (no locking issues with early STM/STP) 1691 prefetch(from, to, pld_offset, bytes_per_loop); 1692 } 1693 1694 // prepare (shift) the values in R3..R10 1695 __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift)); // merged below low bytes of next val 1696 __ logical_shift_right(R4, R4, lsr_shift); // unused part of next val 1697 __ orr(R4, R4, AsmOperand(R5, lsl, lsl_shift)); // ... 
1698 __ logical_shift_right(R5, R5, lsr_shift); 1699 __ orr(R5, R5, AsmOperand(R6, lsl, lsl_shift)); 1700 __ logical_shift_right(R6, R6, lsr_shift); 1701 __ orr(R6, R6, AsmOperand(R7, lsl, lsl_shift)); 1702 #ifndef AARCH64 1703 if (split_write) { 1704 // write the first half as soon as possible to reduce stm locking 1705 __ stmia(to, RegisterSet(R3, R6), writeback, prefetch_before ? gt : ge); 1706 } 1707 #endif // !AARCH64 1708 __ logical_shift_right(R7, R7, lsr_shift); 1709 __ orr(R7, R7, AsmOperand(R8, lsl, lsl_shift)); 1710 __ logical_shift_right(R8, R8, lsr_shift); 1711 __ orr(R8, R8, AsmOperand(R9, lsl, lsl_shift)); 1712 __ logical_shift_right(R9, R9, lsr_shift); 1713 __ orr(R9, R9, AsmOperand(R10, lsl, lsl_shift)); 1714 __ logical_shift_right(R10, R10, lsr_shift); 1715 __ orr(R10, R10, AsmOperand(R12, lsl, lsl_shift)); 1716 1717 #ifdef AARCH64 1718 bulk_store_forward(to, data_regs, 8); 1719 #else 1720 if (split_write) { 1721 __ stmia(to, RegisterSet(R7, R10), writeback, prefetch_before ? gt : ge); 1722 } else { 1723 __ stmia(to, RegisterSet(R3, R10), writeback, prefetch_before ? gt : ge); 1724 } 1725 #endif // AARCH64 1726 __ b(L_shifted_loop, gt); // no need to loop if 0 (when count need not be precise modulo bytes_per_loop) 1727 1728 if (prefetch_before) { 1729 // the first loop may end earlier, allowing to skip pld at the end 1730 __ cmn_32(count, (bytes_per_loop + pld_offset)/bytes_per_count); 1731 #ifndef AARCH64 1732 __ stmia(to, RegisterSet(R3, R10), writeback); // stmia was skipped 1733 #endif // !AARCH64 1734 __ b(L_skip_pld, ge); 1735 __ adds_32(count, count, ((bytes_per_loop + pld_offset) / bytes_per_count) + count_per_loop); 1736 } 1737 1738 __ BIND(L_last_read); 1739 __ b(L_done, eq); 1740 1741 #ifdef AARCH64 1742 assert(bytes_per_count < 8, "adjust the code below"); 1743 1744 __ logical_shift_right(R3, R12, lsr_shift); 1745 1746 { 1747 Label L; 1748 __ tbz(count, exact_log2(32/bytes_per_count), L); 1749 bulk_load_forward(from, &data_regs[1], 4); 1750 __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift)); 1751 __ logical_shift_right(R4, R4, lsr_shift); 1752 __ orr(R4, R4, AsmOperand(R5, lsl, lsl_shift)); 1753 __ logical_shift_right(R5, R5, lsr_shift); 1754 __ orr(R5, R5, AsmOperand(R6, lsl, lsl_shift)); 1755 __ logical_shift_right(R6, R6, lsr_shift); 1756 __ orr(R6, R6, AsmOperand(R7, lsl, lsl_shift)); 1757 bulk_store_forward(to, data_regs, 4); 1758 __ logical_shift_right(R3, R7, lsr_shift); 1759 __ bind(L); 1760 } 1761 1762 { 1763 Label L; 1764 __ tbz(count, exact_log2(16/bytes_per_count), L); 1765 bulk_load_forward(from, &data_regs[1], 2); 1766 __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift)); 1767 __ logical_shift_right(R4, R4, lsr_shift); 1768 __ orr(R4, R4, AsmOperand(R5, lsl, lsl_shift)); 1769 bulk_store_forward(to, data_regs, 2); 1770 __ logical_shift_right(R3, R5, lsr_shift); 1771 __ bind(L); 1772 } 1773 1774 { 1775 Label L; 1776 __ tbz(count, exact_log2(8/bytes_per_count), L); 1777 __ ldr(R4, Address(from, 8, post_indexed)); 1778 __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift)); 1779 __ str(R3, Address(to, 8, post_indexed)); 1780 __ logical_shift_right(R3, R4, lsr_shift); 1781 __ bind(L); 1782 } 1783 1784 const int have_bytes = lsl_shift/BitsPerByte; // number of already read bytes in R3 1785 1786 // It remains less than wordSize to write. 1787 // Do not check count if R3 already has maximal number of loaded elements (one less than wordSize). 
1788 if (have_bytes < wordSize - bytes_per_count) { 1789 Label L; 1790 __ andr(count, count, (uintx)(8/bytes_per_count-1)); // make count exact 1791 __ cmp_32(count, have_bytes/bytes_per_count); // do we have enough bytes to store? 1792 __ b(L, le); 1793 __ ldr(R4, Address(from, 8, post_indexed)); 1794 __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift)); 1795 __ bind(L); 1796 } 1797 1798 { 1799 Label L; 1800 __ tbz(count, exact_log2(4/bytes_per_count), L); 1801 __ str_w(R3, Address(to, 4, post_indexed)); 1802 if (bytes_per_count < 4) { 1803 __ logical_shift_right(R3, R3, 4*BitsPerByte); 1804 } 1805 __ bind(L); 1806 } 1807 1808 if (bytes_per_count <= 2) { 1809 Label L; 1810 __ tbz(count, exact_log2(2/bytes_per_count), L); 1811 __ strh(R3, Address(to, 2, post_indexed)); 1812 if (bytes_per_count < 2) { 1813 __ logical_shift_right(R3, R3, 2*BitsPerByte); 1814 } 1815 __ bind(L); 1816 } 1817 1818 if (bytes_per_count <= 1) { 1819 Label L; 1820 __ tbz(count, exact_log2(1/bytes_per_count), L); 1821 __ strb(R3, Address(to, 1, post_indexed)); 1822 __ bind(L); 1823 } 1824 #else 1825 switch (bytes_per_count) { 1826 case 2: 1827 __ mov(R3, AsmOperand(R12, lsr, lsr_shift)); 1828 __ tst(count, 8); 1829 __ ldmia(from, RegisterSet(R4, R7), writeback, ne); 1830 __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), ne); // merged below low bytes of next val 1831 __ mov(R4, AsmOperand(R4, lsr, lsr_shift), ne); // unused part of next val 1832 __ orr(R4, R4, AsmOperand(R5, lsl, lsl_shift), ne); // ... 1833 __ mov(R5, AsmOperand(R5, lsr, lsr_shift), ne); 1834 __ orr(R5, R5, AsmOperand(R6, lsl, lsl_shift), ne); 1835 __ mov(R6, AsmOperand(R6, lsr, lsr_shift), ne); 1836 __ orr(R6, R6, AsmOperand(R7, lsl, lsl_shift), ne); 1837 __ stmia(to, RegisterSet(R3, R6), writeback, ne); 1838 __ mov(R3, AsmOperand(R7, lsr, lsr_shift), ne); 1839 1840 __ tst(count, 4); 1841 __ ldmia(from, RegisterSet(R4, R5), writeback, ne); 1842 __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), ne); // merged below low bytes of next val 1843 __ mov(R4, AsmOperand(R4, lsr, lsr_shift), ne); // unused part of next val 1844 __ orr(R4, R4, AsmOperand(R5, lsl, lsl_shift), ne); // ... 1845 __ stmia(to, RegisterSet(R3, R4), writeback, ne); 1846 __ mov(R3, AsmOperand(R5, lsr, lsr_shift), ne); 1847 1848 __ tst(count, 2); 1849 __ ldr(R4, Address(from, 4, post_indexed), ne); 1850 __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), ne); 1851 __ str(R3, Address(to, 4, post_indexed), ne); 1852 __ mov(R3, AsmOperand(R4, lsr, lsr_shift), ne); 1853 1854 __ tst(count, 1); 1855 __ strh(R3, Address(to, 2, post_indexed), ne); // one last short 1856 break; 1857 1858 case 1: 1859 __ mov(R3, AsmOperand(R12, lsr, lsr_shift)); 1860 __ tst(count, 16); 1861 __ ldmia(from, RegisterSet(R4, R7), writeback, ne); 1862 __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), ne); // merged below low bytes of next val 1863 __ mov(R4, AsmOperand(R4, lsr, lsr_shift), ne); // unused part of next val 1864 __ orr(R4, R4, AsmOperand(R5, lsl, lsl_shift), ne); // ... 
1865 __ mov(R5, AsmOperand(R5, lsr, lsr_shift), ne);
1866 __ orr(R5, R5, AsmOperand(R6, lsl, lsl_shift), ne);
1867 __ mov(R6, AsmOperand(R6, lsr, lsr_shift), ne);
1868 __ orr(R6, R6, AsmOperand(R7, lsl, lsl_shift), ne);
1869 __ stmia(to, RegisterSet(R3, R6), writeback, ne);
1870 __ mov(R3, AsmOperand(R7, lsr, lsr_shift), ne);
1871
1872 __ tst(count, 8);
1873 __ ldmia(from, RegisterSet(R4, R5), writeback, ne);
1874 __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), ne); // merged below low bytes of next val
1875 __ mov(R4, AsmOperand(R4, lsr, lsr_shift), ne); // unused part of next val
1876 __ orr(R4, R4, AsmOperand(R5, lsl, lsl_shift), ne); // ...
1877 __ stmia(to, RegisterSet(R3, R4), writeback, ne);
1878 __ mov(R3, AsmOperand(R5, lsr, lsr_shift), ne);
1879
1880 __ tst(count, 4);
1881 __ ldr(R4, Address(from, 4, post_indexed), ne);
1882 __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), ne);
1883 __ str(R3, Address(to, 4, post_indexed), ne);
1884 __ mov(R3, AsmOperand(R4, lsr, lsr_shift), ne);
1885
1886 __ andr(count, count, 3);
1887 __ cmp(count, 2);
1888
1889 // Note: R3 might contain enough bytes ready to write (3 needed at most),
1890 // thus load on lsl_shift==24 is not needed (in fact forces reading
1891 // beyond source buffer end boundary)
1892 if (lsl_shift == 8) {
1893 __ ldr(R4, Address(from, 4, post_indexed), ge);
1894 __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), ge);
1895 } else if (lsl_shift == 16) {
1896 __ ldr(R4, Address(from, 4, post_indexed), gt);
1897 __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), gt);
1898 }
1899
1900 __ strh(R3, Address(to, 2, post_indexed), ge); // two last bytes
1901 __ mov(R3, AsmOperand(R3, lsr, 16), gt);
1902
1903 __ tst(count, 1);
1904 __ strb(R3, Address(to, 1, post_indexed), ne); // one last byte
1905 break;
1906 }
1907 #endif // AARCH64
1908
1909 __ BIND(L_done);
1910 return 0; // no minimum
1911 }
1912
1913 // Generate the inner loop for shifted backward array copy (unaligned copy).
1914 // It can be used when bytes_per_count < wordSize, i.e.
1915 // byte/short copy on 32-bit ARM, byte/short/int/compressed-oop copy on AArch64.
1916 //
1917 // Arguments
1918 // end_from: end src address, 64 bits aligned
1919 // end_to: end dst address, (now) wordSize aligned
1920 // count: number of elements (32-bit int)
1921 // bytes_per_count: number of bytes for each unit of 'count'
1922 // lsl_shift: shift applied to 'old' value to skip already written bytes
1923 // lsr_shift: shift applied to 'new' value to set the low bytes of the next write
1924 //
1925 // Return the minimum initial value for count
1926 //
1927 // Notes:
1928 // - 'end_from' aligned on 64-bit (recommended for 32-bit ARM in case this speeds up LDMIA, required for AArch64)
1929 // - 'end_to' aligned on wordSize
1930 // - 'count' must be greater than or equal to the returned value
1931 // - 'lsr_shift' + 'lsl_shift' = 'BitsPerWord'
1932 // - 'bytes_per_count' is 1 or 2 on 32-bit ARM; 1, 2 or 4 on AArch64
1933 //
1934 // Decreases 'end_to' by count*bytes_per_count.
1935 //
1936 // Scratches 'end_from', 'count', R3-R10, R12
1937 //
1938 // On entry:
1939 // - R3 is preloaded with the first 'BitsPerWord' bits read just after 'from'
1940 // - (R3 << lsl_shift) is the part not yet written
1941 // --> (*--to) = (R3 << lsl_shift) | ((*--from) >> lsr_shift); ...
1942 //
1943 // This implementation may read more bytes than required.
1944 // Actually, it always reads exactly all data from the copied region with the beginning aligned down by wordSize,
1945 // so the excess read does not cross a word boundary and is thus harmless.
1946 //
1947 int generate_backward_shifted_copy_loop(Register end_from, Register end_to, Register count, int bytes_per_count, int lsr_shift, int lsl_shift) {
1948 assert (end_from == R0 && end_to == R1 && count == R2, "adjust the implementation below");
1949
1950 const int bytes_per_loop = 8*wordSize; // 8 registers are read and written on every loop iter
1951 const int count_per_loop = bytes_per_loop / bytes_per_count;
1952
1953 arraycopy_loop_config *config=&arraycopy_configurations[ArmCopyPlatform].backward_shifted;
1954 int pld_offset = config->pld_distance;
1955
1956 #ifndef AARCH64
1957 bool split_read= config->split_ldm;
1958 bool split_write= config->split_stm;
1959 #endif // !AARCH64
1960
1961
1962 const bool prefetch_before = pld_offset < 0;
1963 const bool prefetch_after = pld_offset > 0;
1964
1965 Label L_skip_pld, L_done, L_last_read;
1966 if (pld_offset != 0) {
1967
1968 pld_offset = (pld_offset < 0) ? -pld_offset : pld_offset;
1969
1970 prefetch(end_from, end_to, -wordSize);
1971
1972 if (prefetch_before) {
1973 __ cmp_32(count, count_per_loop);
1974 __ b(L_last_read, lt);
1975
1976 // skip prefetch for small copies
1977 // warning: count is predecreased by the prefetch distance to optimize the inner loop
1978 __ subs_32(count, count, ((bytes_per_loop + pld_offset)/bytes_per_count) + count_per_loop);
1979 __ b(L_skip_pld, lt);
1980 }
1981
1982 int offset = ArmCopyCacheLineSize;
1983 while (offset <= pld_offset) {
1984 prefetch(end_from, end_to, -(wordSize + offset));
1985 offset += ArmCopyCacheLineSize;
1986 };
1987 }
1988
1989 Label L_shifted_loop;
1990 __ align(OptoLoopAlignment);
1991 __ BIND(L_shifted_loop);
1992
1993 if (prefetch_before) {
1994 // do the 1st ldm/ldp first anyway (no locking issues with early STM/STP)
1995 prefetch(end_from, end_to, -(wordSize + bytes_per_loop + pld_offset));
1996 __ BIND(L_skip_pld);
1997 } else {
1998 __ cmp_32(count, count_per_loop);
1999 __ b(L_last_read, lt);
2000 }
2001
2002 #ifdef AARCH64
2003 __ logical_shift_left(R12, R3, lsl_shift);
2004 const Register data_regs[9] = {R3, R4, R5, R6, R7, R8, R9, R10, R12};
2005 bulk_load_backward(end_from, data_regs, 8);
2006 #else
2007 if (split_read) {
2008 __ ldmdb(end_from, RegisterSet(R7, R10), writeback);
2009 __ mov(R12, AsmOperand(R3, lsl, lsl_shift)); // part of R3 not yet written
2010 __ ldmdb(end_from, RegisterSet(R3, R6), writeback);
2011 } else {
2012 __ mov(R12, AsmOperand(R3, lsl, lsl_shift)); // part of R3 not yet written
2013 __ ldmdb(end_from, RegisterSet(R3, R10), writeback);
2014 }
2015 #endif // AARCH64
2016
2017 __ subs_32(count, count, count_per_loop);
2018
2019 if (prefetch_after) { // do prefetch during ldm/ldp latency
2020 prefetch(end_from, end_to, -(wordSize + pld_offset), -bytes_per_loop);
2021 }
2022
2023 // prepare the values in R4..R10,R12
2024 __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift)); // merged above high bytes of prev val
2025 __ logical_shift_left(R10, R10, lsl_shift); // unused part of prev val
2026 __ orr(R10, R10, AsmOperand(R9, lsr, lsr_shift)); // ...
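// (Illustration of the merge being performed here, assuming a little-endian 32-bit ARM
// byte copy with lsl_shift == 8 and lsr_shift == 24: each destination word is assembled
// as (higher_src_word << 8) | (lower_src_word >> 24); the merge/shift pairs around this
// point walk from R10 down to R3, pairing each word with the one read just below it.)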
2027 __ logical_shift_left(R9, R9, lsl_shift); 2028 __ orr(R9, R9, AsmOperand(R8, lsr, lsr_shift)); 2029 __ logical_shift_left(R8, R8, lsl_shift); 2030 __ orr(R8, R8, AsmOperand(R7, lsr, lsr_shift)); 2031 __ logical_shift_left(R7, R7, lsl_shift); 2032 __ orr(R7, R7, AsmOperand(R6, lsr, lsr_shift)); 2033 __ logical_shift_left(R6, R6, lsl_shift); 2034 __ orr(R6, R6, AsmOperand(R5, lsr, lsr_shift)); 2035 #ifndef AARCH64 2036 if (split_write) { 2037 // store early to reduce locking issues 2038 __ stmdb(end_to, RegisterSet(R6, R10) | R12, writeback, prefetch_before ? gt : ge); 2039 } 2040 #endif // !AARCH64 2041 __ logical_shift_left(R5, R5, lsl_shift); 2042 __ orr(R5, R5, AsmOperand(R4, lsr, lsr_shift)); 2043 __ logical_shift_left(R4, R4, lsl_shift); 2044 __ orr(R4, R4, AsmOperand(R3, lsr, lsr_shift)); 2045 2046 #ifdef AARCH64 2047 bulk_store_backward(end_to, &data_regs[1], 8); 2048 #else 2049 if (split_write) { 2050 __ stmdb(end_to, RegisterSet(R4, R5), writeback, prefetch_before ? gt : ge); 2051 } else { 2052 __ stmdb(end_to, RegisterSet(R4, R10) | R12, writeback, prefetch_before ? gt : ge); 2053 } 2054 #endif // AARCH64 2055 2056 __ b(L_shifted_loop, gt); // no need to loop if 0 (when count need not be precise modulo bytes_per_loop) 2057 2058 if (prefetch_before) { 2059 // the first loop may end earlier, allowing to skip pld at the end 2060 __ cmn_32(count, ((bytes_per_loop + pld_offset)/bytes_per_count)); 2061 #ifndef AARCH64 2062 __ stmdb(end_to, RegisterSet(R4, R10) | R12, writeback); // stmdb was skipped 2063 #endif // !AARCH64 2064 __ b(L_skip_pld, ge); 2065 __ adds_32(count, count, ((bytes_per_loop + pld_offset) / bytes_per_count) + count_per_loop); 2066 } 2067 2068 __ BIND(L_last_read); 2069 __ b(L_done, eq); 2070 2071 #ifdef AARCH64 2072 assert(bytes_per_count < 8, "adjust the code below"); 2073 2074 __ logical_shift_left(R12, R3, lsl_shift); 2075 2076 { 2077 Label L; 2078 __ tbz(count, exact_log2(32/bytes_per_count), L); 2079 bulk_load_backward(end_from, &data_regs[4], 4); 2080 2081 __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift)); 2082 __ logical_shift_left(R10, R10, lsl_shift); 2083 __ orr(R10, R10, AsmOperand(R9, lsr, lsr_shift)); 2084 __ logical_shift_left(R9, R9, lsl_shift); 2085 __ orr(R9, R9, AsmOperand(R8, lsr, lsr_shift)); 2086 __ logical_shift_left(R8, R8, lsl_shift); 2087 __ orr(R8, R8, AsmOperand(R7, lsr, lsr_shift)); 2088 2089 bulk_store_backward(end_to, &data_regs[5], 4); 2090 __ logical_shift_left(R12, R7, lsl_shift); 2091 __ bind(L); 2092 } 2093 2094 { 2095 Label L; 2096 __ tbz(count, exact_log2(16/bytes_per_count), L); 2097 bulk_load_backward(end_from, &data_regs[6], 2); 2098 2099 __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift)); 2100 __ logical_shift_left(R10, R10, lsl_shift); 2101 __ orr(R10, R10, AsmOperand(R9, lsr, lsr_shift)); 2102 2103 bulk_store_backward(end_to, &data_regs[7], 2); 2104 __ logical_shift_left(R12, R9, lsl_shift); 2105 __ bind(L); 2106 } 2107 2108 { 2109 Label L; 2110 __ tbz(count, exact_log2(8/bytes_per_count), L); 2111 __ ldr(R10, Address(end_from, -8, pre_indexed)); 2112 __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift)); 2113 __ str(R12, Address(end_to, -8, pre_indexed)); 2114 __ logical_shift_left(R12, R10, lsl_shift); 2115 __ bind(L); 2116 } 2117 2118 const int have_bytes = lsr_shift/BitsPerByte; // number of already read bytes in R12 2119 2120 // It remains less than wordSize to write. 2121 // Do not check count if R12 already has maximal number of loaded elements (one less than wordSize). 
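// (Same reasoning as in the forward loop: e.g. bytes_per_count == 2 with lsr_shift == 48
// gives have_bytes == 6 == wordSize - bytes_per_count, so the conditional reload below
// is statically omitted.)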
2122 if (have_bytes < wordSize - bytes_per_count) { 2123 Label L; 2124 __ andr(count, count, (uintx)(8/bytes_per_count-1)); // make count exact 2125 __ cmp_32(count, have_bytes/bytes_per_count); // do we have enough bytes to store? 2126 __ b(L, le); 2127 __ ldr(R10, Address(end_from, -8, pre_indexed)); 2128 __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift)); 2129 __ bind(L); 2130 } 2131 2132 assert (bytes_per_count <= 4, "must be"); 2133 2134 { 2135 Label L; 2136 __ tbz(count, exact_log2(4/bytes_per_count), L); 2137 __ logical_shift_right(R9, R12, (wordSize-4)*BitsPerByte); 2138 __ str_w(R9, Address(end_to, -4, pre_indexed)); // Write 4 MSB 2139 if (bytes_per_count < 4) { 2140 __ logical_shift_left(R12, R12, 4*BitsPerByte); // Promote remaining bytes to MSB 2141 } 2142 __ bind(L); 2143 } 2144 2145 if (bytes_per_count <= 2) { 2146 Label L; 2147 __ tbz(count, exact_log2(2/bytes_per_count), L); 2148 __ logical_shift_right(R9, R12, (wordSize-2)*BitsPerByte); 2149 __ strh(R9, Address(end_to, -2, pre_indexed)); // Write 2 MSB 2150 if (bytes_per_count < 2) { 2151 __ logical_shift_left(R12, R12, 2*BitsPerByte); // Promote remaining bytes to MSB 2152 } 2153 __ bind(L); 2154 } 2155 2156 if (bytes_per_count <= 1) { 2157 Label L; 2158 __ tbz(count, exact_log2(1/bytes_per_count), L); 2159 __ logical_shift_right(R9, R12, (wordSize-1)*BitsPerByte); 2160 __ strb(R9, Address(end_to, -1, pre_indexed)); // Write 1 MSB 2161 __ bind(L); 2162 } 2163 #else 2164 switch(bytes_per_count) { 2165 case 2: 2166 __ mov(R12, AsmOperand(R3, lsl, lsl_shift)); // part of R3 not yet written 2167 __ tst(count, 8); 2168 __ ldmdb(end_from, RegisterSet(R7,R10), writeback, ne); 2169 __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift), ne); 2170 __ mov(R10, AsmOperand(R10, lsl, lsl_shift),ne); // unused part of prev val 2171 __ orr(R10, R10, AsmOperand(R9, lsr, lsr_shift),ne); // ... 2172 __ mov(R9, AsmOperand(R9, lsl, lsl_shift),ne); 2173 __ orr(R9, R9, AsmOperand(R8, lsr, lsr_shift),ne); 2174 __ mov(R8, AsmOperand(R8, lsl, lsl_shift),ne); 2175 __ orr(R8, R8, AsmOperand(R7, lsr, lsr_shift),ne); 2176 __ stmdb(end_to, RegisterSet(R8,R10)|R12, writeback, ne); 2177 __ mov(R12, AsmOperand(R7, lsl, lsl_shift), ne); 2178 2179 __ tst(count, 4); 2180 __ ldmdb(end_from, RegisterSet(R9, R10), writeback, ne); 2181 __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift), ne); 2182 __ mov(R10, AsmOperand(R10, lsl, lsl_shift),ne); // unused part of prev val 2183 __ orr(R10, R10, AsmOperand(R9, lsr,lsr_shift),ne); // ... 2184 __ stmdb(end_to, RegisterSet(R10)|R12, writeback, ne); 2185 __ mov(R12, AsmOperand(R9, lsl, lsl_shift), ne); 2186 2187 __ tst(count, 2); 2188 __ ldr(R10, Address(end_from, -4, pre_indexed), ne); 2189 __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift), ne); 2190 __ str(R12, Address(end_to, -4, pre_indexed), ne); 2191 __ mov(R12, AsmOperand(R10, lsl, lsl_shift), ne); 2192 2193 __ tst(count, 1); 2194 __ mov(R12, AsmOperand(R12, lsr, lsr_shift),ne); 2195 __ strh(R12, Address(end_to, -2, pre_indexed), ne); // one last short 2196 break; 2197 2198 case 1: 2199 __ mov(R12, AsmOperand(R3, lsl, lsl_shift)); // part of R3 not yet written 2200 __ tst(count, 16); 2201 __ ldmdb(end_from, RegisterSet(R7,R10), writeback, ne); 2202 __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift), ne); 2203 __ mov(R10, AsmOperand(R10, lsl, lsl_shift),ne); // unused part of prev val 2204 __ orr(R10, R10, AsmOperand(R9, lsr, lsr_shift),ne); // ... 
2205 __ mov(R9, AsmOperand(R9, lsl, lsl_shift),ne); 2206 __ orr(R9, R9, AsmOperand(R8, lsr, lsr_shift),ne); 2207 __ mov(R8, AsmOperand(R8, lsl, lsl_shift),ne); 2208 __ orr(R8, R8, AsmOperand(R7, lsr, lsr_shift),ne); 2209 __ stmdb(end_to, RegisterSet(R8,R10)|R12, writeback, ne); 2210 __ mov(R12, AsmOperand(R7, lsl, lsl_shift), ne); 2211 2212 __ tst(count, 8); 2213 __ ldmdb(end_from, RegisterSet(R9,R10), writeback, ne); 2214 __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift), ne); 2215 __ mov(R10, AsmOperand(R10, lsl, lsl_shift),ne); // unused part of prev val 2216 __ orr(R10, R10, AsmOperand(R9, lsr, lsr_shift),ne); // ... 2217 __ stmdb(end_to, RegisterSet(R10)|R12, writeback, ne); 2218 __ mov(R12, AsmOperand(R9, lsl, lsl_shift), ne); 2219 2220 __ tst(count, 4); 2221 __ ldr(R10, Address(end_from, -4, pre_indexed), ne); 2222 __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift), ne); 2223 __ str(R12, Address(end_to, -4, pre_indexed), ne); 2224 __ mov(R12, AsmOperand(R10, lsl, lsl_shift), ne); 2225 2226 __ tst(count, 2); 2227 if (lsr_shift != 24) { 2228 // avoid useless reading R10 when we already have 3 bytes ready in R12 2229 __ ldr(R10, Address(end_from, -4, pre_indexed), ne); 2230 __ orr(R12, R12, AsmOperand(R10, lsr,lsr_shift), ne); 2231 } 2232 2233 // Note: R12 contains enough bytes ready to write (3 needed at most) 2234 // write the 2 MSBs 2235 __ mov(R9, AsmOperand(R12, lsr, 16), ne); 2236 __ strh(R9, Address(end_to, -2, pre_indexed), ne); 2237 // promote remaining to MSB 2238 __ mov(R12, AsmOperand(R12, lsl, 16), ne); 2239 2240 __ tst(count, 1); 2241 // write the MSB of R12 2242 __ mov(R12, AsmOperand(R12, lsr, 24), ne); 2243 __ strb(R12, Address(end_to, -1, pre_indexed), ne); 2244 2245 break; 2246 } 2247 #endif // AARCH64 2248 2249 __ BIND(L_done); 2250 return 0; // no minimum 2251 } 2252 2253 // This method is very useful for merging forward/backward implementations 2254 Address get_addr_with_indexing(Register base, int delta, bool forward) { 2255 if (forward) { 2256 return Address(base, delta, post_indexed); 2257 } else { 2258 return Address(base, -delta, pre_indexed); 2259 } 2260 } 2261 2262 #ifdef AARCH64 2263 // Loads one 'size_in_bytes'-sized value from 'from' in given direction, i.e. 
2264 // if forward: loads value at from and increases from by size 2265 // if !forward: loads value at from-size_in_bytes and decreases from by size 2266 void load_one(Register rd, Register from, int size_in_bytes, bool forward) { 2267 assert_different_registers(from, rd); 2268 Address addr = get_addr_with_indexing(from, size_in_bytes, forward); 2269 __ load_sized_value(rd, addr, size_in_bytes, false); 2270 } 2271 2272 // Stores one 'size_in_bytes'-sized value to 'to' in given direction (see load_one) 2273 void store_one(Register rd, Register to, int size_in_bytes, bool forward) { 2274 assert_different_registers(to, rd); 2275 Address addr = get_addr_with_indexing(to, size_in_bytes, forward); 2276 __ store_sized_value(rd, addr, size_in_bytes); 2277 } 2278 #else 2279 // load_one and store_one are the same as for AArch64 except for 2280 // *) Support for condition execution 2281 // *) Second value register argument for 8-byte values 2282 2283 void load_one(Register rd, Register from, int size_in_bytes, bool forward, AsmCondition cond = al, Register rd2 = noreg) { 2284 assert_different_registers(from, rd, rd2); 2285 if (size_in_bytes < 8) { 2286 Address addr = get_addr_with_indexing(from, size_in_bytes, forward); 2287 __ load_sized_value(rd, addr, size_in_bytes, false, cond); 2288 } else { 2289 assert (rd2 != noreg, "second value register must be specified"); 2290 assert (rd->encoding() < rd2->encoding(), "wrong value register set"); 2291 2292 if (forward) { 2293 __ ldmia(from, RegisterSet(rd) | rd2, writeback, cond); 2294 } else { 2295 __ ldmdb(from, RegisterSet(rd) | rd2, writeback, cond); 2296 } 2297 } 2298 } 2299 2300 void store_one(Register rd, Register to, int size_in_bytes, bool forward, AsmCondition cond = al, Register rd2 = noreg) { 2301 assert_different_registers(to, rd, rd2); 2302 if (size_in_bytes < 8) { 2303 Address addr = get_addr_with_indexing(to, size_in_bytes, forward); 2304 __ store_sized_value(rd, addr, size_in_bytes, cond); 2305 } else { 2306 assert (rd2 != noreg, "second value register must be specified"); 2307 assert (rd->encoding() < rd2->encoding(), "wrong value register set"); 2308 2309 if (forward) { 2310 __ stmia(to, RegisterSet(rd) | rd2, writeback, cond); 2311 } else { 2312 __ stmdb(to, RegisterSet(rd) | rd2, writeback, cond); 2313 } 2314 } 2315 } 2316 #endif // AARCH64 2317 2318 // Copies data from 'from' to 'to' in specified direction to align 'from' by 64 bits. 2319 // (on 32-bit ARM 64-bit alignment is better for LDM). 2320 // 2321 // Arguments: 2322 // from: beginning (if forward) or upper bound (if !forward) of the region to be read 2323 // to: beginning (if forward) or upper bound (if !forward) of the region to be written 2324 // count: 32-bit int, maximum number of elements which can be copied 2325 // bytes_per_count: size of an element 2326 // forward: specifies copy direction 2327 // 2328 // Notes: 2329 // 'from' and 'to' must be aligned by 'bytes_per_count' 2330 // 'count' must not be less than the returned value 2331 // shifts 'from' and 'to' by the number of copied bytes in corresponding direction 2332 // decreases 'count' by the number of elements copied 2333 // 2334 // Returns maximum number of bytes which may be copied. 2335 int align_src(Register from, Register to, Register count, Register tmp, int bytes_per_count, bool forward) { 2336 assert_different_registers(from, to, count, tmp); 2337 #ifdef AARCH64 2338 // TODO-AARCH64: replace by simple loop? 
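// (Sketch of the alternative the TODO refers to, not generated here: a loop that copies
// one element at a time and retests 'from & 7' until the source is 8-byte aligned,
// similar to the 32-bit ARM path below; the unrolled tbz-based sequence avoids the
// backward branch at the cost of code size.)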
2339 Label Laligned_by_2, Laligned_by_4, Laligned_by_8;
2340
2341 if (bytes_per_count == 1) {
2342 __ tbz(from, 0, Laligned_by_2);
2343 __ sub_32(count, count, 1);
2344 load_one(tmp, from, 1, forward);
2345 store_one(tmp, to, 1, forward);
2346 }
2347
2348 __ BIND(Laligned_by_2);
2349
2350 if (bytes_per_count <= 2) {
2351 __ tbz(from, 1, Laligned_by_4);
2352 __ sub_32(count, count, 2/bytes_per_count);
2353 load_one(tmp, from, 2, forward);
2354 store_one(tmp, to, 2, forward);
2355 }
2356
2357 __ BIND(Laligned_by_4);
2358
2359 if (bytes_per_count <= 4) {
2360 __ tbz(from, 2, Laligned_by_8);
2361 __ sub_32(count, count, 4/bytes_per_count);
2362 load_one(tmp, from, 4, forward);
2363 store_one(tmp, to, 4, forward);
2364 }
2365 __ BIND(Laligned_by_8);
2366 #else // AARCH64
2367 if (bytes_per_count < 8) {
2368 Label L_align_src;
2369 __ BIND(L_align_src);
2370 __ tst(from, 7);
2371 // ne => not aligned: copy one element and (if bytes_per_count < 4) loop
2372 __ sub(count, count, 1, ne);
2373 load_one(tmp, from, bytes_per_count, forward, ne);
2374 store_one(tmp, to, bytes_per_count, forward, ne);
2375 if (bytes_per_count < 4) {
2376 __ b(L_align_src, ne); // if bytes_per_count == 4, then 0 or 1 loop iterations are enough
2377 }
2378 }
2379 #endif // AARCH64
2380 return 7/bytes_per_count;
2381 }
2382
2383 // Copies 'count' of 'bytes_per_count'-sized elements in the specified direction.
2384 //
2385 // Arguments:
2386 // from: beginning (if forward) or upper bound (if !forward) of the region to be read
2387 // to: beginning (if forward) or upper bound (if !forward) of the region to be written
2388 // count: 32-bit int, number of elements to be copied
2389 // entry: copy loop entry point
2390 // bytes_per_count: size of an element
2391 // forward: specifies copy direction
2392 //
2393 // Notes:
2394 // shifts 'from' and 'to'
2395 void copy_small_array(Register from, Register to, Register count, Register tmp, Register tmp2, int bytes_per_count, bool forward, Label & entry) {
2396 assert_different_registers(from, to, count, tmp);
2397
2398 __ align(OptoLoopAlignment);
2399 #ifdef AARCH64
2400 Label L_small_array_done, L_small_array_loop;
2401 __ BIND(entry);
2402 __ cbz_32(count, L_small_array_done);
2403
2404 __ BIND(L_small_array_loop);
2405 __ subs_32(count, count, 1);
2406 load_one(tmp, from, bytes_per_count, forward);
2407 store_one(tmp, to, bytes_per_count, forward);
2408 __ b(L_small_array_loop, gt);
2409
2410 __ BIND(L_small_array_done);
2411 #else
2412 Label L_small_loop;
2413 __ BIND(L_small_loop);
2414 store_one(tmp, to, bytes_per_count, forward, al, tmp2);
2415 __ BIND(entry); // entry point
2416 __ subs(count, count, 1);
2417 load_one(tmp, from, bytes_per_count, forward, ge, tmp2);
2418 __ b(L_small_loop, ge);
2419 #endif // AARCH64
2420 }
2421
2422 // Aligns 'to' by reading one word from 'from' and writing its part to 'to'.
2423 //
2424 // Arguments:
2425 // to: beginning (if forward) or upper bound (if !forward) of the region to be written
2426 // count: 32-bit int, number of elements allowed to be copied
2427 // to_remainder: remainder of dividing 'to' by wordSize
2428 // bytes_per_count: size of an element
2429 // forward: specifies copy direction
2430 // Rval: contains an already read but not yet written word;
2431 // its LSBs (if forward) or MSBs (if !forward) are to be written to align 'to'.
2432 //
2433 // Notes:
2434 // 'count' must not be less than the returned value
2435 // 'to' must be aligned by bytes_per_count but must not be aligned by wordSize
2436 // shifts 'to' by the number of written bytes (so that it becomes the bound of memory to be written)
2437 // decreases 'count' by the number of elements written
2438 // Rval's MSBs or LSBs remain to be written further by generate_{forward,backward}_shifted_copy_loop
2439 int align_dst(Register to, Register count, Register Rval, Register tmp,
2440 int to_remainder, int bytes_per_count, bool forward) {
2441 assert_different_registers(to, count, tmp, Rval);
2442
2443 assert (0 < to_remainder && to_remainder < wordSize, "to_remainder is not valid");
2444 assert (to_remainder % bytes_per_count == 0, "to must be aligned by bytes_per_count");
2445
2446 int bytes_to_write = forward ? (wordSize - to_remainder) : to_remainder;
2447
2448 int offset = 0;
2449
2450 for (int l = 0; l < LogBytesPerWord; ++l) {
2451 int s = (1 << l);
2452 if (bytes_to_write & s) {
2453 int new_offset = offset + s*BitsPerByte;
2454 if (forward) {
2455 if (offset == 0) {
2456 store_one(Rval, to, s, forward);
2457 } else {
2458 __ logical_shift_right(tmp, Rval, offset);
2459 store_one(tmp, to, s, forward);
2460 }
2461 } else {
2462 __ logical_shift_right(tmp, Rval, BitsPerWord - new_offset);
2463 store_one(tmp, to, s, forward);
2464 }
2465
2466 offset = new_offset;
2467 }
2468 }
2469
2470 assert (offset == bytes_to_write * BitsPerByte, "all bytes must be copied");
2471
2472 __ sub_32(count, count, bytes_to_write/bytes_per_count);
2473
2474 return bytes_to_write / bytes_per_count;
2475 }
2476
2477 // Copies 'count' of elements using shifted copy loop
2478 //
2479 // Arguments:
2480 // from: beginning (if forward) or upper bound (if !forward) of the region to be read
2481 // to: beginning (if forward) or upper bound (if !forward) of the region to be written
2482 // count: 32-bit int, number of elements to be copied
2483 // to_remainder: remainder of dividing 'to' by wordSize
2484 // bytes_per_count: size of an element
2485 // forward: specifies copy direction
2486 // Rval: contains an already read but not yet written word
2487 //
2488 //
2489 // Notes:
2490 // 'count' must not be less than the returned value
2491 // 'from' must be aligned by wordSize
2492 // 'to' must be aligned by bytes_per_count but must not be aligned by wordSize
2493 // shifts 'to' by the number of copied bytes
2494 //
2495 // Scratches R3-R10, R12
2496 int align_dst_and_generate_shifted_copy_loop(Register from, Register to, Register count, Register Rval,
2497 int to_remainder, int bytes_per_count, bool forward) {
2498
2499 assert (0 < to_remainder && to_remainder < wordSize, "to_remainder is invalid");
2500
2501 const Register tmp = forward ?
R3 : R12; // TODO-AARCH64: on conjoint_short R4 was used for tmp
2502 assert_different_registers(from, to, count, Rval, tmp);
2503
2504 int required_to_align = align_dst(to, count, Rval, tmp, to_remainder, bytes_per_count, forward);
2505
2506 int lsr_shift = (wordSize - to_remainder) * BitsPerByte;
2507 int lsl_shift = to_remainder * BitsPerByte;
2508
2509 int min_copy;
2510 if (forward) {
2511 min_copy = generate_forward_shifted_copy_loop(from, to, count, bytes_per_count, lsr_shift, lsl_shift);
2512 } else {
2513 min_copy = generate_backward_shifted_copy_loop(from, to, count, bytes_per_count, lsr_shift, lsl_shift);
2514 }
2515
2516 return min_copy + required_to_align;
2517 }
2518
2519 // Copies 'count' of elements using shifted copy loop
2520 //
2521 // Arguments:
2522 // from: beginning (if forward) or upper bound (if !forward) of the region to be read
2523 // to: beginning (if forward) or upper bound (if !forward) of the region to be written
2524 // count: 32-bit int, number of elements to be copied
2525 // bytes_per_count: size of an element
2526 // forward: specifies copy direction
2527 //
2528 // Notes:
2529 // 'count' must not be less than the returned value
2530 // 'from' must be aligned by wordSize
2531 // 'to' must be aligned by bytes_per_count but must not be aligned by wordSize
2532 // shifts 'to' by the number of copied bytes
2533 //
2534 // Scratches 'from', 'count', R3 and R12.
2535 // On AArch64 also scratches R4-R10; on 32-bit ARM these are saved and restored around their use.
2536 int align_dst_and_generate_shifted_copy_loop(Register from, Register to, Register count, int bytes_per_count, bool forward) {
2537
2538 const Register Rval = forward ? R12 : R3; // as generate_{forward,backward}_shifted_copy_loop expect
2539
2540 int min_copy = 0;
2541
2542 // Note: if {seq} is a sequence of numbers, L{seq} means that if the execution reaches this point,
2543 // then the remainder of 'to' divided by wordSize is one of the elements of {seq}.
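// For example, in the AArch64 byte-copy case below, L246 is reached when that remainder
// is 2, 4 or 6 (bit 0 of 'to' is clear, but 'to' is not aligned by wordSize).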
2544 2545 #ifdef AARCH64 2546 // TODO-AARCH64: simplify, tune 2547 2548 load_one(Rval, from, wordSize, forward); 2549 2550 Label L_loop_finished; 2551 2552 switch (bytes_per_count) { 2553 case 4: 2554 min_copy = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 4, bytes_per_count, forward); 2555 break; 2556 case 2: 2557 { 2558 Label L2, L4, L6; 2559 2560 __ tbz(to, 1, L4); 2561 __ tbz(to, 2, L2); 2562 2563 __ BIND(L6); 2564 int min_copy6 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 6, bytes_per_count, forward); 2565 __ b(L_loop_finished); 2566 2567 __ BIND(L2); 2568 int min_copy2 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 2, bytes_per_count, forward); 2569 __ b(L_loop_finished); 2570 2571 __ BIND(L4); 2572 int min_copy4 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 4, bytes_per_count, forward); 2573 2574 min_copy = MAX2(MAX2(min_copy2, min_copy4), min_copy6); 2575 break; 2576 } 2577 case 1: 2578 { 2579 Label L1, L2, L3, L4, L5, L6, L7; 2580 Label L15, L26; 2581 Label L246; 2582 2583 __ tbz(to, 0, L246); 2584 __ tbz(to, 1, L15); 2585 __ tbz(to, 2, L3); 2586 2587 __ BIND(L7); 2588 int min_copy7 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 7, bytes_per_count, forward); 2589 __ b(L_loop_finished); 2590 2591 __ BIND(L246); 2592 __ tbnz(to, 1, L26); 2593 2594 __ BIND(L4); 2595 int min_copy4 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 4, bytes_per_count, forward); 2596 __ b(L_loop_finished); 2597 2598 __ BIND(L15); 2599 __ tbz(to, 2, L1); 2600 2601 __ BIND(L5); 2602 int min_copy5 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 5, bytes_per_count, forward); 2603 __ b(L_loop_finished); 2604 2605 __ BIND(L3); 2606 int min_copy3 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 3, bytes_per_count, forward); 2607 __ b(L_loop_finished); 2608 2609 __ BIND(L26); 2610 __ tbz(to, 2, L2); 2611 2612 __ BIND(L6); 2613 int min_copy6 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 6, bytes_per_count, forward); 2614 __ b(L_loop_finished); 2615 2616 __ BIND(L1); 2617 int min_copy1 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 1, bytes_per_count, forward); 2618 __ b(L_loop_finished); 2619 2620 __ BIND(L2); 2621 int min_copy2 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 2, bytes_per_count, forward); 2622 2623 2624 min_copy = MAX2(min_copy1, min_copy2); 2625 min_copy = MAX2(min_copy, min_copy3); 2626 min_copy = MAX2(min_copy, min_copy4); 2627 min_copy = MAX2(min_copy, min_copy5); 2628 min_copy = MAX2(min_copy, min_copy6); 2629 min_copy = MAX2(min_copy, min_copy7); 2630 break; 2631 } 2632 default: 2633 ShouldNotReachHere(); 2634 break; 2635 } 2636 __ BIND(L_loop_finished); 2637 2638 #else 2639 __ push(RegisterSet(R4,R10)); 2640 load_one(Rval, from, wordSize, forward); 2641 2642 switch (bytes_per_count) { 2643 case 2: 2644 min_copy = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 2, bytes_per_count, forward); 2645 break; 2646 case 1: 2647 { 2648 Label L1, L2, L3; 2649 int min_copy1, min_copy2, min_copy3; 2650 2651 Label L_loop_finished; 2652 2653 if (forward) { 2654 __ tbz(to, 0, L2); 2655 __ tbz(to, 1, L1); 2656 2657 __ BIND(L3); 2658 min_copy3 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 3, bytes_per_count, forward); 2659 __ b(L_loop_finished); 2660 2661 __ BIND(L1); 2662 min_copy1 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 1, bytes_per_count, 
forward); 2663 __ b(L_loop_finished); 2664 2665 __ BIND(L2); 2666 min_copy2 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 2, bytes_per_count, forward); 2667 } else { 2668 __ tbz(to, 0, L2); 2669 __ tbnz(to, 1, L3); 2670 2671 __ BIND(L1); 2672 min_copy1 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 1, bytes_per_count, forward); 2673 __ b(L_loop_finished); 2674 2675 __ BIND(L3); 2676 min_copy3 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 3, bytes_per_count, forward); 2677 __ b(L_loop_finished); 2678 2679 __ BIND(L2); 2680 min_copy2 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 2, bytes_per_count, forward); 2681 } 2682 2683 min_copy = MAX2(MAX2(min_copy1, min_copy2), min_copy3); 2684 2685 __ BIND(L_loop_finished); 2686 2687 break; 2688 } 2689 default: 2690 ShouldNotReachHere(); 2691 break; 2692 } 2693 2694 __ pop(RegisterSet(R4,R10)); 2695 #endif // AARCH64 2696 2697 return min_copy; 2698 } 2699 2700 #ifndef PRODUCT 2701 int * get_arraycopy_counter(int bytes_per_count) { 2702 switch (bytes_per_count) { 2703 case 1: 2704 return &SharedRuntime::_jbyte_array_copy_ctr; 2705 case 2: 2706 return &SharedRuntime::_jshort_array_copy_ctr; 2707 case 4: 2708 return &SharedRuntime::_jint_array_copy_ctr; 2709 case 8: 2710 return &SharedRuntime::_jlong_array_copy_ctr; 2711 default: 2712 ShouldNotReachHere(); 2713 return NULL; 2714 } 2715 } 2716 #endif // !PRODUCT 2717 2718 // 2719 // Generate stub for primitive array copy. If "aligned" is true, the 2720 // "from" and "to" addresses are assumed to be heapword aligned. 2721 // 2722 // If "disjoint" is true, arrays are assumed to be disjoint, otherwise they may overlap and 2723 // "nooverlap_target" must be specified as the address to jump if they don't. 2724 // 2725 // Arguments for generated stub: 2726 // from: R0 2727 // to: R1 2728 // count: R2 treated as signed 32-bit int 2729 // 2730 address generate_primitive_copy(bool aligned, const char * name, bool status, int bytes_per_count, bool disjoint, address nooverlap_target = NULL) { 2731 __ align(CodeEntryAlignment); 2732 StubCodeMark mark(this, "StubRoutines", name); 2733 address start = __ pc(); 2734 2735 const Register from = R0; // source array address 2736 const Register to = R1; // destination array address 2737 const Register count = R2; // elements count 2738 const Register tmp1 = R3; 2739 const Register tmp2 = R12; 2740 2741 if (!aligned) { 2742 BLOCK_COMMENT("Entry:"); 2743 } 2744 2745 __ zap_high_non_significant_bits(R2); 2746 2747 if (!disjoint) { 2748 assert (nooverlap_target != NULL, "must be specified for conjoint case"); 2749 array_overlap_test(nooverlap_target, exact_log2(bytes_per_count), tmp1, tmp2); 2750 } 2751 2752 inc_counter_np(*get_arraycopy_counter(bytes_per_count), tmp1, tmp2); 2753 2754 // Conjoint case: since execution reaches this point, the arrays overlap, so performing backward copy 2755 // Disjoint case: perform forward copy 2756 bool forward = disjoint; 2757 2758 2759 if (!forward) { 2760 // Set 'from' and 'to' to upper bounds 2761 int log_bytes_per_count = exact_log2(bytes_per_count); 2762 __ add_ptr_scaled_int32(to, to, count, log_bytes_per_count); 2763 __ add_ptr_scaled_int32(from, from, count, log_bytes_per_count); 2764 } 2765 2766 // There are two main copy loop implementations: 2767 // *) The huge and complex one applicable only for large enough arrays 2768 // *) The small and simple one applicable for any array (but not efficient for large arrays). 
2769 // Currently the "small" implementation is used if and only if the "large" one cannot be used.
2770 // XXX optim: tune the limit higher ?
2771 // The lower applicability bound of the large implementation is determined by the copy loops:
2772 // up to 7 bytes may be needed for src alignment, plus 8 words for one iteration of the aligned copy loop.
2773 const int small_copy_limit = (8*wordSize + 7) / bytes_per_count;
2774
2775 Label L_small_array;
2776 __ cmp_32(count, small_copy_limit);
2777 __ b(L_small_array, le); // TODO-AARCH64: le vs lt
2778
2779 // Otherwise proceed with large implementation.
2780
2781 bool from_is_aligned = (bytes_per_count >= 8);
2782 if (aligned && forward && (HeapWordSize % 8 == 0)) {
2783 // if 'from' is heapword aligned and HeapWordSize is divisible by 8,
2784 // then from is aligned by 8
2785 from_is_aligned = true;
2786 }
2787
2788 int count_required_to_align = from_is_aligned ? 0 : align_src(from, to, count, tmp1, bytes_per_count, forward);
2789 assert (small_copy_limit >= count_required_to_align, "alignment could exhaust count");
2790
2791 // now 'from' is aligned
2792
2793 bool to_is_aligned = false;
2794
2795 if (bytes_per_count >= wordSize) {
2796 // 'to' is aligned by bytes_per_count, so it is aligned by wordSize
2797 to_is_aligned = true;
2798 } else {
2799 if (aligned && (8 % HeapWordSize == 0) && (HeapWordSize % wordSize == 0)) {
2800 // Originally 'from' and 'to' were heapword aligned;
2801 // (from - to) has not been changed, so since now 'from' is 8-byte aligned, then it is also heapword aligned,
2802 // so 'to' is also heapword aligned and thus aligned by wordSize.
2803 to_is_aligned = true;
2804 }
2805 }
2806
2807 Label L_unaligned_dst;
2808
2809 if (!to_is_aligned) {
2810 BLOCK_COMMENT("Check dst alignment:");
2811 __ tst(to, wordSize - 1);
2812 __ b(L_unaligned_dst, ne); // 'to' is not aligned
2813 }
2814
2815 // 'from' and 'to' are properly aligned
2816
2817 int min_copy;
2818 if (forward) {
2819 min_copy = generate_forward_aligned_copy_loop (from, to, count, bytes_per_count);
2820 } else {
2821 min_copy = generate_backward_aligned_copy_loop(from, to, count, bytes_per_count);
2822 }
2823 assert(small_copy_limit >= count_required_to_align + min_copy, "first loop might exhaust count");
2824
2825 if (status) {
2826 __ mov(R0, 0); // OK
2827 }
2828
2829 __ ret();
2830
2831 {
2832 copy_small_array(from, to, count, tmp1, tmp2, bytes_per_count, forward, L_small_array /* entry */);
2833
2834 if (status) {
2835 __ mov(R0, 0); // OK
2836 }
2837
2838 __ ret();
2839 }
2840
2841 if (! to_is_aligned) {
2842 __ BIND(L_unaligned_dst);
2843 int min_copy_shifted = align_dst_and_generate_shifted_copy_loop(from, to, count, bytes_per_count, forward);
2844 assert (small_copy_limit >= count_required_to_align + min_copy_shifted, "first loop might exhaust count");
2845
2846 if (status) {
2847 __ mov(R0, 0); // OK
2848 }
2849
2850 __ ret();
2851 }
2852
2853 return start;
2854 }
2855
2856 #if INCLUDE_ALL_GCS
2857 //
2858 // Generate pre-write barrier for array.
2859 //
2860 // Input:
2861 // addr - register containing starting address
2862 // count - register containing element count, 32-bit int
2863 // callee_saved_regs -
2864 // the call must preserve this number of registers: R0, R1, ..., R[callee_saved_regs-1]
2865 //
2866 // callee_saved_regs must include addr and count
2867 // Blows all volatile registers (R0-R3 on 32-bit ARM, R0-R18 on AArch64, Rtemp, LR) except for callee_saved_regs.
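// The generated code saves R0 .. R[callee_saved_regs-1] (plus R9 on 32-bit ARM where it
// is scratched), marshals addr/count into R0/R1 (zero-extending count on AArch64),
// calls BarrierSet::static_write_ref_array_pre and then restores the saved registers.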
2868 void gen_write_ref_array_pre_barrier(Register addr, Register count, int callee_saved_regs) {
2869 BarrierSet* bs = Universe::heap()->barrier_set();
2870 if (bs->has_write_ref_pre_barrier()) {
2871 assert(bs->has_write_ref_array_pre_opt(),
2872 "Else unsupported barrier set.");
2873
2874 assert( addr->encoding() < callee_saved_regs, "addr must be saved");
2875 assert(count->encoding() < callee_saved_regs, "count must be saved");
2876
2877 BLOCK_COMMENT("PreBarrier");
2878
2879 #ifdef AARCH64
2880 callee_saved_regs = round_to(callee_saved_regs, 2);
2881 for (int i = 0; i < callee_saved_regs; i += 2) {
2882 __ raw_push(as_Register(i), as_Register(i+1));
2883 }
2884 #else
2885 RegisterSet saved_regs = RegisterSet(R0, as_Register(callee_saved_regs-1));
2886 __ push(saved_regs | R9ifScratched);
2887 #endif // AARCH64
2888
2889 if (addr != R0) {
2890 assert_different_registers(count, R0);
2891 __ mov(R0, addr);
2892 }
2893 #ifdef AARCH64
2894 __ zero_extend(R1, count, 32); // BarrierSet::static_write_ref_array_pre takes size_t
2895 #else
2896 if (count != R1) {
2897 __ mov(R1, count);
2898 }
2899 #endif // AARCH64
2900
2901 __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre));
2902
2903 #ifdef AARCH64
2904 for (int i = callee_saved_regs - 2; i >= 0; i -= 2) {
2905 __ raw_pop(as_Register(i), as_Register(i+1));
2906 }
2907 #else
2908 __ pop(saved_regs | R9ifScratched);
2909 #endif // AARCH64
2910 }
2911 }
2912 #endif // INCLUDE_ALL_GCS
2913
2914 //
2915 // Generate post-write barrier for array.
2916 //
2917 // Input:
2918 // addr - register containing starting address (can be scratched)
2919 // count - register containing element count, 32-bit int (can be scratched)
2920 // tmp - scratch register
2921 //
2922 // Note: LR can be scratched but might be equal to addr, count or tmp
2923 // Blows all volatile registers (R0-R3 on 32-bit ARM, R0-R18 on AArch64, Rtemp, LR).
2924 void gen_write_ref_array_post_barrier(Register addr, Register count, Register tmp) {
2925 assert_different_registers(addr, count, tmp);
2926 BarrierSet* bs = Universe::heap()->barrier_set();
2927
2928 switch (bs->kind()) {
2929 case BarrierSet::G1SATBCTLogging:
2930 {
2931 BLOCK_COMMENT("G1PostBarrier");
2932 if (addr != R0) {
2933 assert_different_registers(count, R0);
2934 __ mov(R0, addr);
2935 }
2936 #ifdef AARCH64
2937 __ zero_extend(R1, count, 32); // BarrierSet::static_write_ref_array_post takes size_t
2938 #else
2939 if (count != R1) {
2940 __ mov(R1, count);
2941 }
2942 #if R9_IS_SCRATCHED
2943 // Safer to save R9 here since callers may have been written
2944 // assuming R9 survives. This is suboptimal but is not in
2945 // general worth optimizing for the few platforms where R9
2946 // is scratched. Note that the optimization might not be too
2947 // difficult for this particular call site.
2948 __ push(R9); 2949 #endif 2950 #endif // !AARCH64 2951 __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post)); 2952 #ifndef AARCH64 2953 #if R9_IS_SCRATCHED 2954 __ pop(R9); 2955 #endif 2956 #endif // !AARCH64 2957 } 2958 break; 2959 case BarrierSet::CardTableForRS: 2960 case BarrierSet::CardTableExtension: 2961 { 2962 BLOCK_COMMENT("CardTablePostBarrier"); 2963 CardTableModRefBS* ct = barrier_set_cast<CardTableModRefBS>(bs); 2964 assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code"); 2965 2966 Label L_cardtable_loop; 2967 2968 __ add_ptr_scaled_int32(count, addr, count, LogBytesPerHeapOop); 2969 __ sub(count, count, BytesPerHeapOop); // last addr 2970 2971 __ logical_shift_right(addr, addr, CardTableModRefBS::card_shift); 2972 __ logical_shift_right(count, count, CardTableModRefBS::card_shift); 2973 __ sub(count, count, addr); // nb of cards 2974 2975 // warning: Rthread has not been preserved 2976 __ mov_address(tmp, (address) ct->byte_map_base, symbolic_Relocation::card_table_reference); 2977 __ add(addr,tmp, addr); 2978 2979 Register zero = __ zero_register(tmp); 2980 2981 __ BIND(L_cardtable_loop); 2982 __ strb(zero, Address(addr, 1, post_indexed)); 2983 __ subs(count, count, 1); 2984 __ b(L_cardtable_loop, ge); 2985 } 2986 break; 2987 case BarrierSet::ModRef: 2988 break; 2989 default: 2990 ShouldNotReachHere(); 2991 } 2992 } 2993 2994 // Generates pattern of code to be placed after raw data copying in generate_oop_copy 2995 // Includes return from arraycopy stub. 2996 // 2997 // Arguments: 2998 // to: destination pointer after copying. 2999 // if 'forward' then 'to' == upper bound, else 'to' == beginning of the modified region 3000 // count: total number of copied elements, 32-bit int 3001 // 3002 // Blows all volatile (R0-R3 on 32-bit ARM, R0-R18 on AArch64, Rtemp, LR) and 'to', 'count', 'tmp' registers. 3003 void oop_arraycopy_stub_epilogue_helper(Register to, Register count, Register tmp, bool status, bool forward) { 3004 assert_different_registers(to, count, tmp); 3005 3006 if (forward) { 3007 // 'to' is upper bound of the modified region 3008 // restore initial dst: 3009 __ sub_ptr_scaled_int32(to, to, count, LogBytesPerHeapOop); 3010 } 3011 3012 // 'to' is the beginning of the region 3013 3014 gen_write_ref_array_post_barrier(to, count, tmp); 3015 3016 if (status) { 3017 __ mov(R0, 0); // OK 3018 } 3019 3020 #ifdef AARCH64 3021 __ raw_pop(LR, ZR); 3022 __ ret(); 3023 #else 3024 __ pop(PC); 3025 #endif // AARCH64 3026 } 3027 3028 3029 // Generate stub for assign-compatible oop copy. If "aligned" is true, the 3030 // "from" and "to" addresses are assumed to be heapword aligned. 3031 // 3032 // If "disjoint" is true, arrays are assumed to be disjoint, otherwise they may overlap and 3033 // "nooverlap_target" must be specified as the address to jump if they don't. 
3034 // 3035 // Arguments for generated stub: 3036 // from: R0 3037 // to: R1 3038 // count: R2 treated as signed 32-bit int 3039 // 3040 address generate_oop_copy(bool aligned, const char * name, bool status, bool disjoint, address nooverlap_target = NULL) { 3041 __ align(CodeEntryAlignment); 3042 StubCodeMark mark(this, "StubRoutines", name); 3043 address start = __ pc(); 3044 3045 Register from = R0; 3046 Register to = R1; 3047 Register count = R2; 3048 Register tmp1 = R3; 3049 Register tmp2 = R12; 3050 3051 3052 if (!aligned) { 3053 BLOCK_COMMENT("Entry:"); 3054 } 3055 3056 __ zap_high_non_significant_bits(R2); 3057 3058 if (!disjoint) { 3059 assert (nooverlap_target != NULL, "must be specified for conjoint case"); 3060 array_overlap_test(nooverlap_target, LogBytesPerHeapOop, tmp1, tmp2); 3061 } 3062 3063 inc_counter_np(SharedRuntime::_oop_array_copy_ctr, tmp1, tmp2); 3064 3065 // Conjoint case: since execution reaches this point, the arrays overlap, so performing backward copy 3066 // Disjoint case: perform forward copy 3067 bool forward = disjoint; 3068 3069 const int bytes_per_count = BytesPerHeapOop; 3070 const int log_bytes_per_count = LogBytesPerHeapOop; 3071 3072 const Register saved_count = LR; 3073 const int callee_saved_regs = 3; // R0-R2 3074 3075 // LR is used later to save barrier args 3076 #ifdef AARCH64 3077 __ raw_push(LR, ZR); 3078 #else 3079 __ push(LR); 3080 #endif // AARCH64 3081 3082 #if INCLUDE_ALL_GCS 3083 gen_write_ref_array_pre_barrier(to, count, callee_saved_regs); 3084 #endif // INCLUDE_ALL_GCS 3085 3086 // save arguments for barrier generation (after the pre barrier) 3087 __ mov(saved_count, count); 3088 3089 if (!forward) { 3090 __ add_ptr_scaled_int32(to, to, count, log_bytes_per_count); 3091 __ add_ptr_scaled_int32(from, from, count, log_bytes_per_count); 3092 } 3093 3094 // for short arrays, just do single element copy 3095 Label L_small_array; 3096 const int small_copy_limit = (8*wordSize + 7)/bytes_per_count; // XXX optim: tune the limit higher ? 3097 __ cmp_32(count, small_copy_limit); 3098 __ b(L_small_array, le); 3099 3100 bool from_is_aligned = (bytes_per_count >= 8); 3101 if (aligned && forward && (HeapWordSize % 8 == 0)) { 3102 // if 'from' is heapword aligned and HeapWordSize is divisible by 8, 3103 // then from is aligned by 8 3104 from_is_aligned = true; 3105 } 3106 3107 int count_required_to_align = from_is_aligned ? 0 : align_src(from, to, count, tmp1, bytes_per_count, forward); 3108 assert (small_copy_limit >= count_required_to_align, "alignment could exhaust count"); 3109 3110 // now 'from' is aligned 3111 3112 bool to_is_aligned = false; 3113 3114 if (bytes_per_count >= wordSize) { 3115 // 'to' is aligned by bytes_per_count, so it is aligned by wordSize 3116 to_is_aligned = true; 3117 } else { 3118 if (aligned && (8 % HeapWordSize == 0) && (HeapWordSize % wordSize == 0)) { 3119 // Originally 'from' and 'to' were heapword aligned; 3120 // (from - to) has not been changed, so since now 'from' is 8-byte aligned, then it is also heapword aligned, 3121 // so 'to' is also heapword aligned and thus aligned by wordSize. 
3122 to_is_aligned = true; 3123 } 3124 } 3125 3126 Label L_unaligned_dst; 3127 3128 if (!to_is_aligned) { 3129 BLOCK_COMMENT("Check dst alignment:"); 3130 __ tst(to, wordSize - 1); 3131 __ b(L_unaligned_dst, ne); // 'to' is not aligned 3132 } 3133 3134 int min_copy; 3135 if (forward) { 3136 min_copy = generate_forward_aligned_copy_loop(from, to, count, bytes_per_count); 3137 } else { 3138 min_copy = generate_backward_aligned_copy_loop(from, to, count, bytes_per_count); 3139 } 3140 assert(small_copy_limit >= count_required_to_align + min_copy, "first loop might exhaust count"); 3141 3142 oop_arraycopy_stub_epilogue_helper(to, saved_count, /* tmp */ tmp1, status, forward); 3143 3144 { 3145 copy_small_array(from, to, count, tmp1, noreg, bytes_per_count, forward, L_small_array); 3146 3147 oop_arraycopy_stub_epilogue_helper(to, saved_count, /* tmp */ tmp1, status, forward); 3148 } 3149 3150 if (!to_is_aligned) { 3151 // !to_is_aligned <=> UseCompressedOops && AArch64 3152 __ BIND(L_unaligned_dst); 3153 #ifdef AARCH64 3154 assert (UseCompressedOops, "unaligned oop array copy may be requested only with UseCompressedOops"); 3155 #else 3156 ShouldNotReachHere(); 3157 #endif // AARCH64 3158 int min_copy_shifted = align_dst_and_generate_shifted_copy_loop(from, to, count, bytes_per_count, forward); 3159 assert (small_copy_limit >= count_required_to_align + min_copy_shifted, "first loop might exhaust count"); 3160 3161 oop_arraycopy_stub_epilogue_helper(to, saved_count, /* tmp */ tmp1, status, forward); 3162 } 3163 3164 return start; 3165 } 3166 3167 // Generate 'unsafe' array copy stub 3168 // Though just as safe as the other stubs, it takes an unscaled 3169 // size_t argument instead of an element count. 3170 // 3171 // Arguments for generated stub: 3172 // from: R0 3173 // to: R1 3174 // count: R2 byte count, treated as ssize_t, can be zero 3175 // 3176 // Examines the alignment of the operands and dispatches 3177 // to a long, int, short, or byte copy loop. 3178 // 3179 address generate_unsafe_copy(const char* name) { 3180 3181 const Register R0_from = R0; // source array address 3182 const Register R1_to = R1; // destination array address 3183 const Register R2_count = R2; // elements count 3184 3185 const Register R3_bits = R3; // test copy of low bits 3186 3187 __ align(CodeEntryAlignment); 3188 StubCodeMark mark(this, "StubRoutines", name); 3189 address start = __ pc(); 3190 #ifdef AARCH64 3191 __ NOT_IMPLEMENTED(); 3192 start = NULL; 3193 #else 3194 const Register tmp = Rtemp; 3195 3196 // bump this on entry, not on exit: 3197 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr, R3, tmp); 3198 3199 __ orr(R3_bits, R0_from, R1_to); 3200 __ orr(R3_bits, R2_count, R3_bits); 3201 3202 __ tst(R3_bits, BytesPerLong-1); 3203 __ mov(R2_count,AsmOperand(R2_count,asr,LogBytesPerLong), eq); 3204 __ jump(StubRoutines::_jlong_arraycopy, relocInfo::runtime_call_type, tmp, eq); 3205 3206 __ tst(R3_bits, BytesPerInt-1); 3207 __ mov(R2_count,AsmOperand(R2_count,asr,LogBytesPerInt), eq); 3208 __ jump(StubRoutines::_jint_arraycopy, relocInfo::runtime_call_type, tmp, eq); 3209 3210 __ tst(R3_bits, BytesPerShort-1); 3211 __ mov(R2_count,AsmOperand(R2_count,asr,LogBytesPerShort), eq); 3212 __ jump(StubRoutines::_jshort_arraycopy, relocInfo::runtime_call_type, tmp, eq); 3213 3214 __ jump(StubRoutines::_jbyte_arraycopy, relocInfo::runtime_call_type, tmp); 3215 #endif 3216 return start; 3217 } 3218 3219 // Helper for generating a dynamic type check. 3220 // Smashes only the given temp registers. 
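// The generated check tries, in order: klass pointer equality, the supertype display
// slot at super_check_offset, and finally a linear scan of the secondary-supers array,
// caching a hit in the secondary super cache before branching to L_success.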
3221 void generate_type_check(Register sub_klass, 3222 Register super_check_offset, 3223 Register super_klass, 3224 Register tmp1, 3225 Register tmp2, 3226 Register tmp3, 3227 Label& L_success) { 3228 assert_different_registers(sub_klass, super_check_offset, super_klass, tmp1, tmp2, tmp3); 3229 3230 BLOCK_COMMENT("type_check:"); 3231 3232 // If the pointers are equal, we are done (e.g., String[] elements). 3233 3234 __ cmp(super_klass, sub_klass); 3235 __ b(L_success, eq); // fast success 3236 3237 3238 Label L_loop, L_fail; 3239 3240 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 3241 3242 // Check the supertype display: 3243 __ ldr(tmp1, Address(sub_klass, super_check_offset)); 3244 __ cmp(tmp1, super_klass); 3245 __ b(L_success, eq); 3246 3247 __ cmp(super_check_offset, sc_offset); 3248 __ b(L_fail, ne); // failure 3249 3250 BLOCK_COMMENT("type_check_slow_path:"); 3251 3252 // a couple of useful fields in sub_klass: 3253 int ss_offset = in_bytes(Klass::secondary_supers_offset()); 3254 3255 // Do a linear scan of the secondary super-klass chain. 3256 3257 #ifndef PRODUCT 3258 int* pst_counter = &SharedRuntime::_partial_subtype_ctr; 3259 __ inc_counter((address) pst_counter, tmp1, tmp2); 3260 #endif 3261 3262 Register scan_temp = tmp1; 3263 Register count_temp = tmp2; 3264 3265 // We will consult the secondary-super array. 3266 __ ldr(scan_temp, Address(sub_klass, ss_offset)); 3267 3268 Register search_key = super_klass; 3269 3270 // Load the array length. 3271 __ ldr_s32(count_temp, Address(scan_temp, Array<Klass*>::length_offset_in_bytes())); 3272 __ add(scan_temp, scan_temp, Array<Klass*>::base_offset_in_bytes()); 3273 3274 __ add(count_temp, count_temp, 1); 3275 3276 // Top of search loop 3277 __ bind(L_loop); 3278 // Notes: 3279 // scan_temp starts at the array elements 3280 // count_temp is 1+size 3281 3282 __ subs(count_temp, count_temp, 1); 3283 __ b(L_fail, eq); // not found 3284 3285 // Load next super to check 3286 // In the array of super classes elements are pointer sized. 3287 int element_size = wordSize; 3288 __ ldr(tmp3, Address(scan_temp, element_size, post_indexed)); 3289 3290 // Look for Rsuper_klass on Rsub_klass's secondary super-class-overflow list 3291 __ cmp(tmp3, search_key); 3292 3293 // A miss means we are NOT a subtype and need to keep looping 3294 __ b(L_loop, ne); 3295 3296 // Falling out the bottom means we found a hit; we ARE a subtype 3297 3298 // Success. Cache the super we found and proceed in triumph. 3299 __ str(super_klass, Address(sub_klass, sc_offset)); 3300 3301 // Jump to success 3302 __ b(L_success); 3303 3304 // Fall through on failure! 3305 __ bind(L_fail); 3306 } 3307 3308 // Generate stub for checked oop copy. 
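// Each element is loaded and, unless NULL, type-checked against ckval (the super_klass)
// before being stored; on the first failing element the stub stops, applies the post
// barrier to the elements already copied and returns (-1 ^ copied_count) instead of 0.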
3309 // 3310 // Arguments for generated stub: 3311 // from: R0 3312 // to: R1 3313 // count: R2 treated as signed 32-bit int 3314 // ckoff: R3 (super_check_offset) 3315 // ckval: R4 (AArch64) / SP[0] (32-bit ARM) (super_klass) 3316 // ret: R0 zero for success; (-1^K) where K is partial transfer count (32-bit) 3317 // 3318 address generate_checkcast_copy(const char * name) { 3319 __ align(CodeEntryAlignment); 3320 StubCodeMark mark(this, "StubRoutines", name); 3321 address start = __ pc(); 3322 3323 const Register from = R0; // source array address 3324 const Register to = R1; // destination array address 3325 const Register count = R2; // elements count 3326 3327 const Register R3_ckoff = R3; // super_check_offset 3328 const Register R4_ckval = R4; // super_klass 3329 3330 const int callee_saved_regs = AARCH64_ONLY(5) NOT_AARCH64(4); // LR saved differently 3331 3332 Label load_element, store_element, do_card_marks, fail; 3333 3334 BLOCK_COMMENT("Entry:"); 3335 3336 __ zap_high_non_significant_bits(R2); 3337 3338 #ifdef AARCH64 3339 __ raw_push(LR, ZR); 3340 __ raw_push(R19, R20); 3341 #else 3342 int pushed = 0; 3343 __ push(LR); 3344 pushed+=1; 3345 #endif // AARCH64 3346 3347 #if INCLUDE_ALL_GCS 3348 gen_write_ref_array_pre_barrier(to, count, callee_saved_regs); 3349 #endif // INCLUDE_ALL_GCS 3350 3351 #ifndef AARCH64 3352 const RegisterSet caller_saved_regs = RegisterSet(R4,R6) | RegisterSet(R8,R9) | altFP_7_11; 3353 __ push(caller_saved_regs); 3354 assert(caller_saved_regs.size() == 6, "check the count"); 3355 pushed+=6; 3356 3357 __ ldr(R4_ckval,Address(SP, wordSize*pushed)); // read the argument that was on the stack 3358 #endif // !AARCH64 3359 3360 // Save arguments for barrier generation (after the pre barrier): 3361 // - must be a caller saved register and not LR 3362 // - ARM32: avoid R10 in case RThread is needed 3363 const Register saved_count = AARCH64_ONLY(R19) NOT_AARCH64(altFP_7_11); 3364 #ifdef AARCH64 3365 __ mov_w(saved_count, count); 3366 __ cbnz_w(count, load_element); // and test count 3367 #else 3368 __ movs(saved_count, count); // and test count 3369 __ b(load_element,ne); 3370 #endif // AARCH64 3371 3372 // nothing to copy 3373 __ mov(R0, 0); 3374 3375 #ifdef AARCH64 3376 __ raw_pop(R19, R20); 3377 __ raw_pop(LR, ZR); 3378 __ ret(); 3379 #else 3380 __ pop(caller_saved_regs); 3381 __ pop(PC); 3382 #endif // AARCH64 3383 3384 // ======== begin loop ======== 3385 // (Loop is rotated; its entry is load_element.) 3386 __ align(OptoLoopAlignment); 3387 __ BIND(store_element); 3388 if (UseCompressedOops) { 3389 __ store_heap_oop(R5, Address(to, BytesPerHeapOop, post_indexed)); // store the oop, changes flags 3390 __ subs_32(count,count,1); 3391 } else { 3392 __ subs_32(count,count,1); 3393 __ str(R5, Address(to, BytesPerHeapOop, post_indexed)); // store the oop 3394 } 3395 __ b(do_card_marks, eq); // count exhausted 3396 3397 // ======== loop entry is here ======== 3398 __ BIND(load_element); 3399 __ load_heap_oop(R5, Address(from, BytesPerHeapOop, post_indexed)); // load the oop 3400 __ cbz(R5, store_element); // NULL 3401 3402 __ load_klass(R6, R5); 3403 3404 generate_type_check(R6, R3_ckoff, R4_ckval, /*tmps*/ R12, R8, R9, 3405 // branch to this on success: 3406 store_element); 3407 // ======== end loop ======== 3408 3409 // It was a real error; we must depend on the caller to finish the job. 3410 // Register count has number of *remaining* oops, saved_count number of *total* oops. 
3411 // Emit GC store barriers for the oops we have copied 3412 // and report their number to the caller (0 or (-1^n)) 3413 __ BIND(fail); 3414 3415 // Note: fail marked by the fact that count differs from saved_count 3416 3417 __ BIND(do_card_marks); 3418 3419 Register copied = AARCH64_ONLY(R20) NOT_AARCH64(R4); // saved 3420 Label L_not_copied; 3421 3422 __ subs_32(copied, saved_count, count); // copied count (in saved reg) 3423 __ b(L_not_copied, eq); // nothing was copied, skip post barrier 3424 __ sub(to, to, AsmOperand(copied, lsl, LogBytesPerHeapOop)); // initial to value 3425 __ mov(R12, copied); // count arg scratched by post barrier 3426 3427 gen_write_ref_array_post_barrier(to, R12, R3); 3428 3429 assert_different_registers(R3,R12,LR,copied,saved_count); 3430 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, R3, R12); 3431 3432 __ BIND(L_not_copied); 3433 __ cmp_32(copied, saved_count); // values preserved in saved registers 3434 3435 #ifdef AARCH64 3436 __ csinv(R0, ZR, copied, eq); // 0 if all copied else NOT(copied) 3437 __ raw_pop(R19, R20); 3438 __ raw_pop(LR, ZR); 3439 __ ret(); 3440 #else 3441 __ mov(R0, 0, eq); // 0 if all copied 3442 __ mvn(R0, copied, ne); // else NOT(copied) 3443 __ pop(caller_saved_regs); 3444 __ pop(PC); 3445 #endif // AARCH64 3446 3447 return start; 3448 } 3449 3450 // Perform range checks on the proposed arraycopy. 3451 // Kills the two temps, but nothing else. 3452 void arraycopy_range_checks(Register src, // source array oop 3453 Register src_pos, // source position (32-bit int) 3454 Register dst, // destination array oop 3455 Register dst_pos, // destination position (32-bit int) 3456 Register length, // length of copy (32-bit int) 3457 Register temp1, Register temp2, 3458 Label& L_failed) { 3459 3460 BLOCK_COMMENT("arraycopy_range_checks:"); 3461 3462 // if (src_pos + length > arrayOop(src)->length() ) FAIL; 3463 3464 const Register array_length = temp1; // scratch 3465 const Register end_pos = temp2; // scratch 3466 3467 __ add_32(end_pos, length, src_pos); // src_pos + length 3468 __ ldr_s32(array_length, Address(src, arrayOopDesc::length_offset_in_bytes())); 3469 __ cmp_32(end_pos, array_length); 3470 __ b(L_failed, hi); 3471 3472 // if (dst_pos + length > arrayOop(dst)->length() ) FAIL; 3473 __ add_32(end_pos, length, dst_pos); // dst_pos + length 3474 __ ldr_s32(array_length, Address(dst, arrayOopDesc::length_offset_in_bytes())); 3475 __ cmp_32(end_pos, array_length); 3476 __ b(L_failed, hi); 3477 3478 BLOCK_COMMENT("arraycopy_range_checks done"); 3479 } 3480 3481 // 3482 // Generate generic array copy stubs 3483 // 3484 // Input: 3485 // R0 - src oop 3486 // R1 - src_pos (32-bit int) 3487 // R2 - dst oop 3488 // R3 - dst_pos (32-bit int) 3489 // R4 (AArch64) / SP[0] (32-bit ARM) - element count (32-bit int) 3490 // 3491 // Output: (32-bit int) 3492 // R0 == 0 - success 3493 // R0 < 0 - need to call System.arraycopy 3494 // 3495 address generate_generic_copy(const char *name) { 3496 Label L_failed, L_objArray; 3497 3498 // Input registers 3499 const Register src = R0; // source array oop 3500 const Register src_pos = R1; // source position 3501 const Register dst = R2; // destination array oop 3502 const Register dst_pos = R3; // destination position 3503 3504 // registers used as temp 3505 const Register R5_src_klass = R5; // source array klass 3506 const Register R6_dst_klass = R6; // destination array klass 3507 const Register R_lh = AARCH64_ONLY(R7) NOT_AARCH64(altFP_7_11); // layout handler 3508 const Register R8_temp = R8; 
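// Hedged outline of the stub generated below: validate the incoming
// arguments, compare the source and destination klasses via their layout
// helpers, turn the (array oop, position) pairs into raw from/to addresses
// plus an element count, and tail-jump to the matching specialized
// arraycopy stub; any failed check returns -1 so the caller falls back to
// System.arraycopy.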
3509 3510 __ align(CodeEntryAlignment); 3511 StubCodeMark mark(this, "StubRoutines", name); 3512 address start = __ pc(); 3513 3514 __ zap_high_non_significant_bits(R1); 3515 __ zap_high_non_significant_bits(R3); 3516 __ zap_high_non_significant_bits(R4); 3517 3518 #ifndef AARCH64 3519 int pushed = 0; 3520 const RegisterSet saved_regs = RegisterSet(R4,R6) | RegisterSet(R8,R9) | altFP_7_11; 3521 __ push(saved_regs); 3522 assert(saved_regs.size() == 6, "check the count"); 3523 pushed+=6; 3524 #endif // !AARCH64 3525 3526 // bump this on entry, not on exit: 3527 inc_counter_np(SharedRuntime::_generic_array_copy_ctr, R5, R12); 3528 3529 const Register length = R4; // elements count 3530 #ifndef AARCH64 3531 __ ldr(length, Address(SP,4*pushed)); 3532 #endif // !AARCH64 3533 3534 3535 //----------------------------------------------------------------------- 3536 // Assembler stubs will be used for this call to arraycopy 3537 // if the following conditions are met: 3538 // 3539 // (1) src and dst must not be null. 3540 // (2) src_pos must not be negative. 3541 // (3) dst_pos must not be negative. 3542 // (4) length must not be negative. 3543 // (5) src klass and dst klass should be the same and not NULL. 3544 // (6) src and dst should be arrays. 3545 // (7) src_pos + length must not exceed length of src. 3546 // (8) dst_pos + length must not exceed length of dst. 3547 BLOCK_COMMENT("arraycopy initial argument checks"); 3548 3549 // if (src == NULL) return -1; 3550 __ cbz(src, L_failed); 3551 3552 // if (src_pos < 0) return -1; 3553 __ cmp_32(src_pos, 0); 3554 __ b(L_failed, lt); 3555 3556 // if (dst == NULL) return -1; 3557 __ cbz(dst, L_failed); 3558 3559 // if (dst_pos < 0) return -1; 3560 __ cmp_32(dst_pos, 0); 3561 __ b(L_failed, lt); 3562 3563 // if (length < 0) return -1; 3564 __ cmp_32(length, 0); 3565 __ b(L_failed, lt); 3566 3567 BLOCK_COMMENT("arraycopy argument klass checks"); 3568 // get src->klass() 3569 __ load_klass(R5_src_klass, src); 3570 3571 // Load layout helper 3572 // 3573 // |array_tag| | header_size | element_type | |log2_element_size| 3574 // 32 30 24 16 8 2 0 3575 // 3576 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 3577 // 3578 3579 int lh_offset = in_bytes(Klass::layout_helper_offset()); 3580 __ ldr_u32(R_lh, Address(R5_src_klass, lh_offset)); 3581 3582 __ load_klass(R6_dst_klass, dst); 3583 3584 // Handle objArrays completely differently... 
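// (Hedged sketch of the dispatch below: the full layout helper word of the
//  source klass is compared against Klass::array_layout_helper(T_OBJECT), so
//  in effect: if (lh == objArray_lh) goto L_objArray; anything else must be a
//  type array with matching source and destination klasses.)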
3585 juint objArray_lh = Klass::array_layout_helper(T_OBJECT); 3586 __ mov_slow(R8_temp, objArray_lh); 3587 __ cmp_32(R_lh, R8_temp); 3588 __ b(L_objArray,eq); 3589 3590 // if (src->klass() != dst->klass()) return -1; 3591 __ cmp(R5_src_klass, R6_dst_klass); 3592 __ b(L_failed, ne); 3593 3594 // if (!src->is_Array()) return -1; 3595 __ cmp_32(R_lh, Klass::_lh_neutral_value); // < 0 3596 __ b(L_failed, ge); 3597 3598 arraycopy_range_checks(src, src_pos, dst, dst_pos, length, 3599 R8_temp, R6_dst_klass, L_failed); 3600 3601 { 3602 // TypeArrayKlass 3603 // 3604 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 3605 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 3606 // 3607 3608 const Register R6_offset = R6_dst_klass; // array offset 3609 const Register R12_elsize = R12; // log2 element size 3610 3611 __ logical_shift_right(R6_offset, R_lh, Klass::_lh_header_size_shift); 3612 __ andr(R6_offset, R6_offset, (unsigned int)Klass::_lh_header_size_mask); // array_offset 3613 __ add(src, src, R6_offset); // src array offset 3614 __ add(dst, dst, R6_offset); // dst array offset 3615 __ andr(R12_elsize, R_lh, (unsigned int)Klass::_lh_log2_element_size_mask); // log2 element size 3616 3617 // next registers should be set before the jump to corresponding stub 3618 const Register from = R0; // source array address 3619 const Register to = R1; // destination array address 3620 const Register count = R2; // elements count 3621 3622 // 'from', 'to', 'count' registers should be set in this order 3623 // since they are the same as 'src', 'src_pos', 'dst'. 3624 3625 #ifdef AARCH64 3626 3627 BLOCK_COMMENT("choose copy loop based on element size and scale indexes"); 3628 Label Lbyte, Lshort, Lint, Llong; 3629 3630 __ cbz(R12_elsize, Lbyte); 3631 3632 assert (LogBytesPerShort < LogBytesPerInt && LogBytesPerInt < LogBytesPerLong, "must be"); 3633 __ cmp(R12_elsize, LogBytesPerInt); 3634 __ b(Lint, eq); 3635 __ b(Llong, gt); 3636 3637 __ BIND(Lshort); 3638 __ add_ptr_scaled_int32(from, src, src_pos, LogBytesPerShort); 3639 __ add_ptr_scaled_int32(to, dst, dst_pos, LogBytesPerShort); 3640 __ mov(count, length); 3641 __ b(StubRoutines::_jshort_arraycopy); 3642 3643 __ BIND(Lint); 3644 __ add_ptr_scaled_int32(from, src, src_pos, LogBytesPerInt); 3645 __ add_ptr_scaled_int32(to, dst, dst_pos, LogBytesPerInt); 3646 __ mov(count, length); 3647 __ b(StubRoutines::_jint_arraycopy); 3648 3649 __ BIND(Lbyte); 3650 __ add_ptr_scaled_int32(from, src, src_pos, 0); 3651 __ add_ptr_scaled_int32(to, dst, dst_pos, 0); 3652 __ mov(count, length); 3653 __ b(StubRoutines::_jbyte_arraycopy); 3654 3655 __ BIND(Llong); 3656 __ add_ptr_scaled_int32(from, src, src_pos, LogBytesPerLong); 3657 __ add_ptr_scaled_int32(to, dst, dst_pos, LogBytesPerLong); 3658 __ mov(count, length); 3659 __ b(StubRoutines::_jlong_arraycopy); 3660 3661 #else // AARCH64 3662 3663 BLOCK_COMMENT("scale indexes to element size"); 3664 __ add(from, src, AsmOperand(src_pos, lsl, R12_elsize)); // src_addr 3665 __ add(to, dst, AsmOperand(dst_pos, lsl, R12_elsize)); // dst_addr 3666 3667 __ mov(count, length); // length 3668 3669 // XXX optim: avoid later push in arraycopy variants ? 
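// Hedged sketch of the dispatch performed after the pop below:
//   switch (log2_element_size) {
//     case 0:                goto StubRoutines::_jbyte_arraycopy;
//     case LogBytesPerShort: goto StubRoutines::_jshort_arraycopy;
//     case LogBytesPerInt:   goto StubRoutines::_jint_arraycopy;
//     default:               goto StubRoutines::_jlong_arraycopy;
//   }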
3670 3671 __ pop(saved_regs); 3672 3673 BLOCK_COMMENT("choose copy loop based on element size"); 3674 __ cmp(R12_elsize, 0); 3675 __ b(StubRoutines::_jbyte_arraycopy,eq); 3676 3677 __ cmp(R12_elsize, LogBytesPerShort); 3678 __ b(StubRoutines::_jshort_arraycopy,eq); 3679 3680 __ cmp(R12_elsize, LogBytesPerInt); 3681 __ b(StubRoutines::_jint_arraycopy,eq); 3682 3683 __ b(StubRoutines::_jlong_arraycopy); 3684 3685 #endif // AARCH64 3686 } 3687 3688 // ObjArrayKlass 3689 __ BIND(L_objArray); 3690 // live at this point: R5_src_klass, R6_dst_klass, src[_pos], dst[_pos], length 3691 3692 Label L_plain_copy, L_checkcast_copy; 3693 // test array classes for subtyping 3694 __ cmp(R5_src_klass, R6_dst_klass); // usual case is exact equality 3695 __ b(L_checkcast_copy, ne); 3696 3697 BLOCK_COMMENT("Identically typed arrays"); 3698 { 3699 // Identically typed arrays can be copied without element-wise checks. 3700 arraycopy_range_checks(src, src_pos, dst, dst_pos, length, 3701 R8_temp, R_lh, L_failed); 3702 3703 // next registers should be set before the jump to corresponding stub 3704 const Register from = R0; // source array address 3705 const Register to = R1; // destination array address 3706 const Register count = R2; // elements count 3707 3708 __ add(src, src, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //src offset 3709 __ add(dst, dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //dst offset 3710 __ add_ptr_scaled_int32(from, src, src_pos, LogBytesPerHeapOop); // src_addr 3711 __ add_ptr_scaled_int32(to, dst, dst_pos, LogBytesPerHeapOop); // dst_addr 3712 __ BIND(L_plain_copy); 3713 __ mov(count, length); 3714 3715 #ifndef AARCH64 3716 __ pop(saved_regs); // XXX optim: avoid later push in oop_arraycopy ? 3717 #endif // !AARCH64 3718 __ b(StubRoutines::_oop_arraycopy); 3719 } 3720 3721 { 3722 __ BIND(L_checkcast_copy); 3723 // live at this point: R5_src_klass, R6_dst_klass 3724 3725 // Before looking at dst.length, make sure dst is also an objArray. 3726 __ ldr_u32(R8_temp, Address(R6_dst_klass, lh_offset)); 3727 __ cmp_32(R_lh, R8_temp); 3728 __ b(L_failed, ne); 3729 3730 // It is safe to examine both src.length and dst.length. 3731 3732 arraycopy_range_checks(src, src_pos, dst, dst_pos, length, 3733 R8_temp, R_lh, L_failed); 3734 3735 // next registers should be set before the jump to corresponding stub 3736 const Register from = R0; // source array address 3737 const Register to = R1; // destination array address 3738 const Register count = R2; // elements count 3739 3740 // Marshal the base address arguments now, freeing registers. 3741 __ add(src, src, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //src offset 3742 __ add(dst, dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //dst offset 3743 __ add_ptr_scaled_int32(from, src, src_pos, LogBytesPerHeapOop); // src_addr 3744 __ add_ptr_scaled_int32(to, dst, dst_pos, LogBytesPerHeapOop); // dst_addr 3745 3746 __ mov(count, length); // length (reloaded) 3747 3748 Register sco_temp = R3; // this register is free now 3749 assert_different_registers(from, to, count, sco_temp, 3750 R6_dst_klass, R5_src_klass); 3751 3752 // Generate the type check. 3753 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 3754 __ ldr_u32(sco_temp, Address(R6_dst_klass, sco_offset)); 3755 generate_type_check(R5_src_klass, sco_temp, R6_dst_klass, 3756 R8_temp, R9, 3757 AARCH64_ONLY(R10) NOT_AARCH64(R12), 3758 L_plain_copy); 3759 3760 // Fetch destination element klass from the ObjArrayKlass header. 
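// (Hand-off to checkcast_arraycopy, hedged sketch matching that stub's own
//  header comment:
//    ckval (R4 on AArch64, SP[0] on 32-bit ARM) = dst_klass->element_klass();
//    ckoff (R3)                                 = ckval->super_check_offset(); )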
3761 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 3762 3763 // the checkcast_copy loop needs two extra arguments: 3764 const Register Rdst_elem_klass = AARCH64_ONLY(R4) NOT_AARCH64(R3); 3765 __ ldr(Rdst_elem_klass, Address(R6_dst_klass, ek_offset)); // dest elem klass 3766 #ifndef AARCH64 3767 __ pop(saved_regs); // XXX optim: avoid later push in oop_arraycopy ? 3768 __ str(Rdst_elem_klass, Address(SP,0)); // dest elem klass argument 3769 #endif // !AARCH64 3770 __ ldr_u32(R3, Address(Rdst_elem_klass, sco_offset)); // sco of elem klass 3771 __ b(StubRoutines::_checkcast_arraycopy); 3772 } 3773 3774 __ BIND(L_failed); 3775 3776 #ifndef AARCH64 3777 __ pop(saved_regs); 3778 #endif // !AARCH64 3779 __ mvn(R0, 0); // failure, with 0 copied 3780 __ ret(); 3781 3782 return start; 3783 } 3784 3785 // Safefetch stubs. 3786 void generate_safefetch(const char* name, int size, address* entry, address* fault_pc, address* continuation_pc) { 3787 // safefetch signatures: 3788 // int SafeFetch32(int* adr, int errValue); 3789 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue); 3790 // 3791 // arguments: 3792 // R0 = adr 3793 // R1 = errValue 3794 // 3795 // result: 3796 // R0 = *adr or errValue 3797 3798 StubCodeMark mark(this, "StubRoutines", name); 3799 3800 // Entry point, pc or function descriptor. 3801 *entry = __ pc(); 3802 3803 // Load *adr into R1, may fault. 3804 *fault_pc = __ pc(); 3805 3806 switch (size) { 3807 case 4: // int32_t 3808 __ ldr_s32(R1, Address(R0)); 3809 break; 3810 3811 case 8: // int64_t 3812 #ifdef AARCH64 3813 __ ldr(R1, Address(R0)); 3814 #else 3815 Unimplemented(); 3816 #endif // AARCH64 3817 break; 3818 3819 default: 3820 ShouldNotReachHere(); 3821 } 3822 3823 // return errValue or *adr 3824 *continuation_pc = __ pc(); 3825 __ mov(R0, R1); 3826 __ ret(); 3827 } 3828 3829 void generate_arraycopy_stubs() { 3830 3831 // Note: the disjoint stubs must be generated first, since some of 3832 // the conjoint stubs use them. 3833 3834 bool status = false; // non-failing C2 stubs need not return a status in R0 3835 3836 #ifdef TEST_C2_GENERIC_ARRAYCOPY /* Internal development flag */ 3837 // With this flag, the C2 stubs are tested by generating calls to 3838 // generic_arraycopy instead of Runtime1::arraycopy 3839 3840 // Runtime1::arraycopy returns a status in R0 (0 if OK, else ~copied) 3841 // and the result is tested to see whether the arraycopy stub should 3842 // be called. 3843 3844 // When we test arraycopy this way, we must generate extra code in the 3845 // arraycopy methods callable from C2 generic_arraycopy to set the 3846 // status to 0 for those that always succeed (calling the slow path stub might 3847 // lead to errors since the copy has already been performed).
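// Caller-side protocol implied above (hedged sketch):
//   int r = arraycopy_stub(from, to, count, ...);
//   if (r != 0) {
//     // r == ~copied; the remaining elements were not copied and the
//     // caller has to fall back to a slower path for them.
//   }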
3848 3849 status = true; // generate a status compatible with C1 calls 3850 #endif 3851 3852 // these need always status in case they are called from generic_arraycopy 3853 StubRoutines::_jbyte_disjoint_arraycopy = generate_primitive_copy(false, "jbyte_disjoint_arraycopy", true, 1, true); 3854 StubRoutines::_jshort_disjoint_arraycopy = generate_primitive_copy(false, "jshort_disjoint_arraycopy", true, 2, true); 3855 StubRoutines::_jint_disjoint_arraycopy = generate_primitive_copy(false, "jint_disjoint_arraycopy", true, 4, true); 3856 StubRoutines::_jlong_disjoint_arraycopy = generate_primitive_copy(false, "jlong_disjoint_arraycopy", true, 8, true); 3857 StubRoutines::_oop_disjoint_arraycopy = generate_oop_copy (false, "oop_disjoint_arraycopy", true, true); 3858 3859 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_primitive_copy(true, "arrayof_jbyte_disjoint_arraycopy", status, 1, true); 3860 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_primitive_copy(true, "arrayof_jshort_disjoint_arraycopy",status, 2, true); 3861 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_primitive_copy(true, "arrayof_jint_disjoint_arraycopy", status, 4, true); 3862 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_primitive_copy(true, "arrayof_jlong_disjoint_arraycopy", status, 8, true); 3863 StubRoutines::_arrayof_oop_disjoint_arraycopy = generate_oop_copy (true, "arrayof_oop_disjoint_arraycopy", status, true); 3864 3865 // these need always status in case they are called from generic_arraycopy 3866 StubRoutines::_jbyte_arraycopy = generate_primitive_copy(false, "jbyte_arraycopy", true, 1, false, StubRoutines::_jbyte_disjoint_arraycopy); 3867 StubRoutines::_jshort_arraycopy = generate_primitive_copy(false, "jshort_arraycopy", true, 2, false, StubRoutines::_jshort_disjoint_arraycopy); 3868 StubRoutines::_jint_arraycopy = generate_primitive_copy(false, "jint_arraycopy", true, 4, false, StubRoutines::_jint_disjoint_arraycopy); 3869 StubRoutines::_jlong_arraycopy = generate_primitive_copy(false, "jlong_arraycopy", true, 8, false, StubRoutines::_jlong_disjoint_arraycopy); 3870 StubRoutines::_oop_arraycopy = generate_oop_copy (false, "oop_arraycopy", true, false, StubRoutines::_oop_disjoint_arraycopy); 3871 3872 StubRoutines::_arrayof_jbyte_arraycopy = generate_primitive_copy(true, "arrayof_jbyte_arraycopy", status, 1, false, StubRoutines::_arrayof_jbyte_disjoint_arraycopy); 3873 StubRoutines::_arrayof_jshort_arraycopy = generate_primitive_copy(true, "arrayof_jshort_arraycopy", status, 2, false, StubRoutines::_arrayof_jshort_disjoint_arraycopy); 3874 #ifdef _LP64 3875 // since sizeof(jint) < sizeof(HeapWord), there's a different flavor: 3876 StubRoutines::_arrayof_jint_arraycopy = generate_primitive_copy(true, "arrayof_jint_arraycopy", status, 4, false, StubRoutines::_arrayof_jint_disjoint_arraycopy); 3877 #else 3878 StubRoutines::_arrayof_jint_arraycopy = StubRoutines::_jint_arraycopy; 3879 #endif 3880 if (BytesPerHeapOop < HeapWordSize) { 3881 StubRoutines::_arrayof_oop_arraycopy = generate_oop_copy (true, "arrayof_oop_arraycopy", status, false, StubRoutines::_arrayof_oop_disjoint_arraycopy); 3882 } else { 3883 StubRoutines::_arrayof_oop_arraycopy = StubRoutines::_oop_arraycopy; 3884 } 3885 StubRoutines::_arrayof_jlong_arraycopy = StubRoutines::_jlong_arraycopy; 3886 3887 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy"); 3888 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy"); 3889 StubRoutines::_generic_arraycopy = 
generate_generic_copy("generic_arraycopy"); 3890 3891 3892 } 3893 3894 #ifndef AARCH64 3895 #define COMPILE_CRYPTO 3896 #include "stubRoutinesCrypto_arm.cpp" 3897 #else 3898 3899 #ifdef COMPILER2 3900 // Arguments: 3901 // 3902 // Inputs: 3903 // c_rarg0 - source byte array address 3904 // c_rarg1 - destination byte array address 3905 // c_rarg2 - K (key) in little endian int array 3906 // 3907 address generate_aescrypt_encryptBlock() { 3908 __ align(CodeEntryAlignment); 3909 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 3910 3911 Label L_doLast; 3912 3913 const Register from = c_rarg0; // source array address 3914 const Register to = c_rarg1; // destination array address 3915 const Register key = c_rarg2; // key array address 3916 const Register keylen = R8; 3917 3918 address start = __ pc(); 3919 __ stp(FP, LR, Address(SP, -2 * wordSize, pre_indexed)); 3920 __ mov(FP, SP); 3921 3922 __ ldr_w(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 3923 3924 __ vld1(V0, Address(from), MacroAssembler::VELEM_SIZE_8, 128); // get 16 bytes of input 3925 3926 __ vld1(V1, V2, V3, V4, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); 3927 3928 int quad = 1; 3929 __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad); 3930 __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad); 3931 __ rev32(V3, V3, MacroAssembler::VELEM_SIZE_8, quad); 3932 __ rev32(V4, V4, MacroAssembler::VELEM_SIZE_8, quad); 3933 __ aese(V0, V1); 3934 __ aesmc(V0, V0); 3935 __ aese(V0, V2); 3936 __ aesmc(V0, V0); 3937 __ aese(V0, V3); 3938 __ aesmc(V0, V0); 3939 __ aese(V0, V4); 3940 __ aesmc(V0, V0); 3941 3942 __ vld1(V1, V2, V3, V4, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); 3943 __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad); 3944 __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad); 3945 __ rev32(V3, V3, MacroAssembler::VELEM_SIZE_8, quad); 3946 __ rev32(V4, V4, MacroAssembler::VELEM_SIZE_8, quad); 3947 __ aese(V0, V1); 3948 __ aesmc(V0, V0); 3949 __ aese(V0, V2); 3950 __ aesmc(V0, V0); 3951 __ aese(V0, V3); 3952 __ aesmc(V0, V0); 3953 __ aese(V0, V4); 3954 __ aesmc(V0, V0); 3955 3956 __ vld1(V1, V2, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); 3957 __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad); 3958 __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad); 3959 3960 __ cmp_w(keylen, 44); 3961 __ b(L_doLast, eq); 3962 3963 __ aese(V0, V1); 3964 __ aesmc(V0, V0); 3965 __ aese(V0, V2); 3966 __ aesmc(V0, V0); 3967 3968 __ vld1(V1, V2, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); 3969 __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad); 3970 __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad); 3971 3972 __ cmp_w(keylen, 52); 3973 __ b(L_doLast, eq); 3974 3975 __ aese(V0, V1); 3976 __ aesmc(V0, V0); 3977 __ aese(V0, V2); 3978 __ aesmc(V0, V0); 3979 3980 __ vld1(V1, V2, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); 3981 __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad); 3982 __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad); 3983 3984 __ BIND(L_doLast); 3985 3986 __ aese(V0, V1); 3987 __ aesmc(V0, V0); 3988 __ aese(V0, V2); 3989 3990 __ vld1(V1, Address(key), MacroAssembler::VELEM_SIZE_8, 128); 3991 __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad); 3992 __ eor(V0, V0, V1, MacroAssembler::VELEM_SIZE_8, quad); 3993 3994 __ vst1(V0, Address(to), MacroAssembler::VELEM_SIZE_8, 128); 3995 3996 __ mov(R0, 0); 3997 3998 __ mov(SP, FP); 3999 __ 
ldp(FP, LR, Address(SP, 2 * wordSize, post_indexed)); 4000 __ ret(LR); 4001 4002 return start; 4003 } 4004 4005 // Arguments: 4006 // 4007 // Inputs: 4008 // c_rarg0 - source byte array address 4009 // c_rarg1 - destination byte array address 4010 // c_rarg2 - K (key) in little endian int array 4011 // 4012 address generate_aescrypt_decryptBlock() { 4013 assert(UseAES, "need AES instructions and misaligned SSE support"); 4014 __ align(CodeEntryAlignment); 4015 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 4016 Label L_doLast; 4017 4018 const Register from = c_rarg0; // source array address 4019 const Register to = c_rarg1; // destination array address 4020 const Register key = c_rarg2; // key array address 4021 const Register keylen = R8; 4022 4023 address start = __ pc(); 4024 __ stp(FP, LR, Address(SP, -2 * wordSize, pre_indexed)); 4025 __ mov(FP, SP); 4026 4027 __ ldr_w(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 4028 4029 __ vld1(V0, Address(from), MacroAssembler::VELEM_SIZE_8, 128); // get 16 bytes of input 4030 4031 __ vld1(V5, Address(key, 16, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); 4032 4033 int quad = 1; 4034 __ rev32(V5, V5, MacroAssembler::VELEM_SIZE_8, quad); 4035 4036 __ vld1(V1, V2, V3, V4, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); 4037 __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad); 4038 __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad); 4039 __ rev32(V3, V3, MacroAssembler::VELEM_SIZE_8, quad); 4040 __ rev32(V4, V4, MacroAssembler::VELEM_SIZE_8, quad); 4041 __ aesd(V0, V1); 4042 __ aesimc(V0, V0); 4043 __ aesd(V0, V2); 4044 __ aesimc(V0, V0); 4045 __ aesd(V0, V3); 4046 __ aesimc(V0, V0); 4047 __ aesd(V0, V4); 4048 __ aesimc(V0, V0); 4049 4050 __ vld1(V1, V2, V3, V4, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); 4051 __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad); 4052 __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad); 4053 __ rev32(V3, V3, MacroAssembler::VELEM_SIZE_8, quad); 4054 __ rev32(V4, V4, MacroAssembler::VELEM_SIZE_8, quad); 4055 __ aesd(V0, V1); 4056 __ aesimc(V0, V0); 4057 __ aesd(V0, V2); 4058 __ aesimc(V0, V0); 4059 __ aesd(V0, V3); 4060 __ aesimc(V0, V0); 4061 __ aesd(V0, V4); 4062 __ aesimc(V0, V0); 4063 4064 __ vld1(V1, V2, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); 4065 __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad); 4066 __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad); 4067 4068 __ cmp_w(keylen, 44); 4069 __ b(L_doLast, eq); 4070 4071 __ aesd(V0, V1); 4072 __ aesimc(V0, V0); 4073 __ aesd(V0, V2); 4074 __ aesimc(V0, V0); 4075 4076 __ vld1(V1, V2, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); 4077 __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad); 4078 __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad); 4079 4080 __ cmp_w(keylen, 52); 4081 __ b(L_doLast, eq); 4082 4083 __ aesd(V0, V1); 4084 __ aesimc(V0, V0); 4085 __ aesd(V0, V2); 4086 __ aesimc(V0, V0); 4087 4088 __ vld1(V1, V2, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); 4089 __ rev32(V1, V1, MacroAssembler::VELEM_SIZE_8, quad); 4090 __ rev32(V2, V2, MacroAssembler::VELEM_SIZE_8, quad); 4091 4092 __ BIND(L_doLast); 4093 4094 __ aesd(V0, V1); 4095 __ aesimc(V0, V0); 4096 __ aesd(V0, V2); 4097 4098 __ eor(V0, V0, V5, MacroAssembler::VELEM_SIZE_8, quad); 4099 4100 __ vst1(V0, Address(to), MacroAssembler::VELEM_SIZE_8, 128); 4101 4102 __ mov(R0, 0); 4103 4104 __ 
mov(SP, FP); 4105 __ ldp(FP, LR, Address(SP, 2 * wordSize, post_indexed)); 4106 __ ret(LR); 4107 4108 4109 return start; 4110 } 4111 4112 // Arguments: 4113 // 4114 // Inputs: 4115 // c_rarg0 - source byte array address 4116 // c_rarg1 - destination byte array address 4117 // c_rarg2 - K (key) in little endian int array 4118 // c_rarg3 - r vector byte array address 4119 // c_rarg4 - input length 4120 // 4121 // Output: 4122 // x0 - input length 4123 // 4124 address generate_cipherBlockChaining_encryptAESCrypt() { 4125 assert(UseAES, "need AES instructions and misaligned SSE support"); 4126 __ align(CodeEntryAlignment); 4127 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 4128 4129 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 4130 4131 const Register from = c_rarg0; // source array address 4132 const Register to = c_rarg1; // destination array address 4133 const Register key = c_rarg2; // key array address 4134 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 4135 // and left with the results of the last encryption block 4136 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 4137 const Register keylen = R8; 4138 4139 address start = __ pc(); 4140 __ stp(FP, LR, Address(SP, -2 * wordSize, pre_indexed)); 4141 __ mov(FP, SP); 4142 4143 __ mov(R9, len_reg); 4144 __ ldr_w(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 4145 4146 __ vld1(V0, Address(rvec), MacroAssembler::VELEM_SIZE_8, 128); 4147 4148 __ cmp_w(keylen, 52); 4149 __ b(L_loadkeys_44, cc); 4150 __ b(L_loadkeys_52, eq); 4151 4152 __ vld1(V17, V18, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); 4153 4154 int quad = 1; 4155 __ rev32(V17, V17, MacroAssembler::VELEM_SIZE_8, quad); 4156 __ rev32(V18, V18, MacroAssembler::VELEM_SIZE_8, quad); 4157 __ BIND(L_loadkeys_52); 4158 __ vld1(V19, V20, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); 4159 __ rev32(V19, V19, MacroAssembler::VELEM_SIZE_8, quad); 4160 __ rev32(V20, V20, MacroAssembler::VELEM_SIZE_8, quad); 4161 __ BIND(L_loadkeys_44); 4162 __ vld1(V21, V22, V23, V24, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); 4163 __ rev32(V21, V21, MacroAssembler::VELEM_SIZE_8, quad); 4164 __ rev32(V22, V22, MacroAssembler::VELEM_SIZE_8, quad); 4165 __ rev32(V23, V23, MacroAssembler::VELEM_SIZE_8, quad); 4166 __ rev32(V24, V24, MacroAssembler::VELEM_SIZE_8, quad); 4167 __ vld1(V25, V26, V27, V28, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); 4168 __ rev32(V25, V25, MacroAssembler::VELEM_SIZE_8, quad); 4169 __ rev32(V26, V26, MacroAssembler::VELEM_SIZE_8, quad); 4170 __ rev32(V27, V27, MacroAssembler::VELEM_SIZE_8, quad); 4171 __ rev32(V28, V28, MacroAssembler::VELEM_SIZE_8, quad); 4172 __ vld1(V29, V30, V31, Address(key), MacroAssembler::VELEM_SIZE_8, 128); 4173 __ rev32(V29, V29, MacroAssembler::VELEM_SIZE_8, quad); 4174 __ rev32(V30, V30, MacroAssembler::VELEM_SIZE_8, quad); 4175 __ rev32(V31, V31, MacroAssembler::VELEM_SIZE_8, quad); 4176 4177 __ BIND(L_aes_loop); 4178 __ vld1(V1, Address(from, 16, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); 4179 __ eor(V0, V0, V1, MacroAssembler::VELEM_SIZE_8, quad); 4180 4181 __ b(L_rounds_44, cc); 4182 __ b(L_rounds_52, eq); 4183 4184 __ aese(V0, V17); 4185 __ aesmc(V0, V0); 4186 __ aese(V0, V18); 4187 __ aesmc(V0, V0); 4188 __ BIND(L_rounds_52); 4189 __ aese(V0, V19); 4190 __ 
aesmc(V0, V0); 4191 __ aese(V0, V20); 4192 __ aesmc(V0, V0); 4193 __ BIND(L_rounds_44); 4194 __ aese(V0, V21); 4195 __ aesmc(V0, V0); 4196 __ aese(V0, V22); 4197 __ aesmc(V0, V0); 4198 __ aese(V0, V23); 4199 __ aesmc(V0, V0); 4200 __ aese(V0, V24); 4201 __ aesmc(V0, V0); 4202 __ aese(V0, V25); 4203 __ aesmc(V0, V0); 4204 __ aese(V0, V26); 4205 __ aesmc(V0, V0); 4206 __ aese(V0, V27); 4207 __ aesmc(V0, V0); 4208 __ aese(V0, V28); 4209 __ aesmc(V0, V0); 4210 __ aese(V0, V29); 4211 __ aesmc(V0, V0); 4212 __ aese(V0, V30); 4213 __ eor(V0, V0, V31, MacroAssembler::VELEM_SIZE_8, quad); 4214 4215 __ vst1(V0, Address(to, 16, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); 4216 __ sub(len_reg, len_reg, 16); 4217 __ cbnz(len_reg, L_aes_loop); 4218 4219 __ vst1(V0, Address(rvec), MacroAssembler::VELEM_SIZE_8, 128); 4220 4221 __ mov(R0, R9); 4222 4223 __ mov(SP, FP); 4224 __ ldp(FP, LR, Address(SP, 2 * wordSize, post_indexed)); 4225 __ ret(LR); 4226 4227 return start; 4228 } 4229 4230 // Arguments: 4231 // 4232 // Inputs: 4233 // c_rarg0 - source byte array address 4234 // c_rarg1 - destination byte array address 4235 // c_rarg2 - K (key) in little endian int array 4236 // c_rarg3 - r vector byte array address 4237 // c_rarg4 - input length 4238 // 4239 // Output: 4240 // R0 - input length 4241 // 4242 address generate_cipherBlockChaining_decryptAESCrypt() { 4243 assert(UseAES, "need AES instructions and misaligned SSE support"); 4244 __ align(CodeEntryAlignment); 4245 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 4246 4247 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 4248 4249 const Register from = c_rarg0; // source array address 4250 const Register to = c_rarg1; // destination array address 4251 const Register key = c_rarg2; // key array address 4252 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 4253 // and left with the results of the last encryption block 4254 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 4255 const Register keylen = R8; 4256 4257 address start = __ pc(); 4258 __ stp(FP, LR, Address(SP, -2 * wordSize, pre_indexed)); 4259 __ mov(FP, SP); 4260 4261 __ mov(R9, len_reg); 4262 __ ldr_w(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 4263 4264 __ vld1(V2, Address(rvec), MacroAssembler::VELEM_SIZE_8, 128); 4265 4266 __ vld1(V31, Address(key, 16, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); 4267 4268 int quad = 1; 4269 __ rev32(V31, V31, MacroAssembler::VELEM_SIZE_8, quad); 4270 4271 __ cmp_w(keylen, 52); 4272 __ b(L_loadkeys_44, cc); 4273 __ b(L_loadkeys_52, eq); 4274 4275 __ vld1(V17, V18, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); 4276 __ rev32(V17, V17, MacroAssembler::VELEM_SIZE_8, quad); 4277 __ rev32(V18, V18, MacroAssembler::VELEM_SIZE_8, quad); 4278 __ BIND(L_loadkeys_52); 4279 __ vld1(V19, V20, Address(key, 32, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); 4280 __ rev32(V19, V19, MacroAssembler::VELEM_SIZE_8, quad); 4281 __ rev32(V20, V20, MacroAssembler::VELEM_SIZE_8, quad); 4282 __ BIND(L_loadkeys_44); 4283 __ vld1(V21, V22, V23, V24, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); 4284 __ rev32(V21, V21, MacroAssembler::VELEM_SIZE_8, quad); 4285 __ rev32(V22, V22, MacroAssembler::VELEM_SIZE_8, quad); 4286 __ rev32(V23, V23, MacroAssembler::VELEM_SIZE_8, quad); 4287 __ rev32(V24, V24, MacroAssembler::VELEM_SIZE_8, quad);
4288 __ vld1(V25, V26, V27, V28, Address(key, 64, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); 4289 __ rev32(V25, V25, MacroAssembler::VELEM_SIZE_8, quad); 4290 __ rev32(V26, V26, MacroAssembler::VELEM_SIZE_8, quad); 4291 __ rev32(V27, V27, MacroAssembler::VELEM_SIZE_8, quad); 4292 __ rev32(V28, V28, MacroAssembler::VELEM_SIZE_8, quad); 4293 __ vld1(V29, V30, Address(key), MacroAssembler::VELEM_SIZE_8, 128); 4294 __ rev32(V29, V29, MacroAssembler::VELEM_SIZE_8, quad); 4295 __ rev32(V30, V30, MacroAssembler::VELEM_SIZE_8, quad); 4296 4297 __ BIND(L_aes_loop); 4298 __ vld1(V0, Address(from, 16, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); 4299 __ orr(V1, V0, V0, MacroAssembler::VELEM_SIZE_8, quad); 4300 4301 __ b(L_rounds_44, cc); 4302 __ b(L_rounds_52, eq); 4303 4304 __ aesd(V0, V17); 4305 __ aesimc(V0, V0); 4306 __ aesd(V0, V18); 4307 __ aesimc(V0, V0); 4308 __ BIND(L_rounds_52); 4309 __ aesd(V0, V19); 4310 __ aesimc(V0, V0); 4311 __ aesd(V0, V20); 4312 __ aesimc(V0, V0); 4313 __ BIND(L_rounds_44); 4314 __ aesd(V0, V21); 4315 __ aesimc(V0, V0); 4316 __ aesd(V0, V22); 4317 __ aesimc(V0, V0); 4318 __ aesd(V0, V23); 4319 __ aesimc(V0, V0); 4320 __ aesd(V0, V24); 4321 __ aesimc(V0, V0); 4322 __ aesd(V0, V25); 4323 __ aesimc(V0, V0); 4324 __ aesd(V0, V26); 4325 __ aesimc(V0, V0); 4326 __ aesd(V0, V27); 4327 __ aesimc(V0, V0); 4328 __ aesd(V0, V28); 4329 __ aesimc(V0, V0); 4330 __ aesd(V0, V29); 4331 __ aesimc(V0, V0); 4332 __ aesd(V0, V30); 4333 __ eor(V0, V0, V31, MacroAssembler::VELEM_SIZE_8, quad); 4334 __ eor(V0, V0, V2, MacroAssembler::VELEM_SIZE_8, quad); 4335 4336 __ vst1(V0, Address(to, 16, post_indexed), MacroAssembler::VELEM_SIZE_8, 128); 4337 __ orr(V2, V1, V1, MacroAssembler::VELEM_SIZE_8, quad); 4338 4339 __ sub(len_reg, len_reg, 16); 4340 __ cbnz(len_reg, L_aes_loop); 4341 4342 __ vst1(V2, Address(rvec), MacroAssembler::VELEM_SIZE_8, 128); 4343 4344 __ mov(R0, R9); 4345 4346 __ mov(SP, FP); 4347 __ ldp(FP, LR, Address(SP, 2 * wordSize, post_indexed)); 4348 __ ret(LR); 4349 4350 return start; 4351 } 4352 4353 #endif // COMPILER2 4354 #endif // AARCH64 4355 4356 private: 4357 4358 #undef __ 4359 #define __ masm-> 4360 4361 //------------------------------------------------------------------------------------------------------------------------ 4362 // Continuation point for throwing of implicit exceptions that are not handled in 4363 // the current activation. Fabricates an exception oop and initiates normal 4364 // exception dispatching in this frame.
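// Hedged outline of the code below: save {FP, LR}, set the last Java frame,
// call runtime_entry with Rthread in R0 (extra arguments, if any, are already
// in R1/R2), record an oop map at the call site, restore the frame and jump
// to StubRoutines::forward_exception_entry(); the whole thing is wrapped in a
// RuntimeStub so that it is relocatable.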
4365 address generate_throw_exception(const char* name, address runtime_entry) { 4366 int insts_size = 128; 4367 int locs_size = 32; 4368 CodeBuffer code(name, insts_size, locs_size); 4369 OopMapSet* oop_maps; 4370 int frame_size; 4371 int frame_complete; 4372 4373 oop_maps = new OopMapSet(); 4374 MacroAssembler* masm = new MacroAssembler(&code); 4375 4376 address start = __ pc(); 4377 4378 frame_size = 2; 4379 __ mov(Rexception_pc, LR); 4380 __ raw_push(FP, LR); 4381 4382 frame_complete = __ pc() - start; 4383 4384 // Any extra arguments are already supposed to be R1 and R2 4385 __ mov(R0, Rthread); 4386 4387 int pc_offset = __ set_last_Java_frame(SP, FP, false, Rtemp); 4388 assert(((__ pc()) - start) == __ offset(), "warning: start differs from code_begin"); 4389 __ call(runtime_entry); 4390 if (pc_offset == -1) { 4391 pc_offset = __ offset(); 4392 } 4393 4394 // Generate oop map 4395 OopMap* map = new OopMap(frame_size*VMRegImpl::slots_per_word, 0); 4396 oop_maps->add_gc_map(pc_offset, map); 4397 __ reset_last_Java_frame(Rtemp); // Rtemp free since scratched by far call 4398 4399 __ raw_pop(FP, LR); 4400 __ jump(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type, Rtemp); 4401 4402 RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, 4403 frame_size, oop_maps, false); 4404 return stub->entry_point(); 4405 } 4406 4407 //--------------------------------------------------------------------------- 4408 // Initialization 4409 4410 void generate_initial() { 4411 // Generates all stubs and initializes the entry points 4412 4413 //------------------------------------------------------------------------------------------------------------------------ 4414 // entry points that exist in all platforms 4415 // Note: This is code that could be shared among different platforms - however the benefit seems to be smaller than 4416 // the disadvantage of having a much more complicated generator structure. See also comment in stubRoutines.hpp. 4417 StubRoutines::_forward_exception_entry = generate_forward_exception(); 4418 4419 StubRoutines::_call_stub_entry = 4420 generate_call_stub(StubRoutines::_call_stub_return_address); 4421 // is referenced by megamorphic call 4422 StubRoutines::_catch_exception_entry = generate_catch_exception(); 4423 4424 // stub for throwing stack overflow error used both by interpreter and compiler 4425 StubRoutines::_throw_StackOverflowError_entry = generate_throw_exception("StackOverflowError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError)); 4426 4427 #ifndef AARCH64 4428 // integer division used both by interpreter and compiler 4429 StubRoutines::Arm::_idiv_irem_entry = generate_idiv_irem(); 4430 4431 StubRoutines::_atomic_add_entry = generate_atomic_add(); 4432 StubRoutines::_atomic_xchg_entry = generate_atomic_xchg(); 4433 StubRoutines::_atomic_cmpxchg_entry = generate_atomic_cmpxchg(); 4434 StubRoutines::_atomic_cmpxchg_long_entry = generate_atomic_cmpxchg_long(); 4435 StubRoutines::_atomic_load_long_entry = generate_atomic_load_long(); 4436 StubRoutines::_atomic_store_long_entry = generate_atomic_store_long(); 4437 #endif // !AARCH64 4438 } 4439 4440 void generate_all() { 4441 // Generates all stubs and initializes the entry points 4442 4443 #ifdef COMPILER2 4444 // Generate partial_subtype_check first here since its code depends on 4445 // UseZeroBaseCompressedOops which is defined after heap initialization. 
4446 StubRoutines::Arm::_partial_subtype_check = generate_partial_subtype_check(); 4447 #endif 4448 // These entry points require SharedInfo::stack0 to be set up in non-core builds 4449 // and need to be relocatable, so they each fabricate a RuntimeStub internally. 4450 StubRoutines::_throw_AbstractMethodError_entry = generate_throw_exception("AbstractMethodError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError)); 4451 StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call)); 4452 4453 //------------------------------------------------------------------------------------------------------------------------ 4454 // entry points that are platform specific 4455 4456 // support for verify_oop (must happen after universe_init) 4457 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); 4458 4459 // arraycopy stubs used by compilers 4460 generate_arraycopy_stubs(); 4461 4462 // Safefetch stubs. 4463 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry, 4464 &StubRoutines::_safefetch32_fault_pc, 4465 &StubRoutines::_safefetch32_continuation_pc); 4466 #ifdef AARCH64 4467 generate_safefetch("SafeFetchN", wordSize, &StubRoutines::_safefetchN_entry, 4468 &StubRoutines::_safefetchN_fault_pc, 4469 &StubRoutines::_safefetchN_continuation_pc); 4470 #ifdef COMPILER2 4471 if (UseAESIntrinsics) { 4472 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 4473 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 4474 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 4475 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 4476 } 4477 #endif 4478 #else 4479 assert (sizeof(int) == wordSize, "32-bit architecture"); 4480 StubRoutines::_safefetchN_entry = StubRoutines::_safefetch32_entry; 4481 StubRoutines::_safefetchN_fault_pc = StubRoutines::_safefetch32_fault_pc; 4482 StubRoutines::_safefetchN_continuation_pc = StubRoutines::_safefetch32_continuation_pc; 4483 #endif // AARCH64 4484 4485 #ifdef COMPILE_CRYPTO 4486 // generate AES intrinsics code 4487 if (UseAESIntrinsics) { 4488 aes_init(); 4489 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 4490 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 4491 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 4492 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 4493 } 4494 #endif // COMPILE_CRYPTO 4495 } 4496 4497 4498 public: 4499 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) { 4500 if (all) { 4501 generate_all(); 4502 } else { 4503 generate_initial(); 4504 } 4505 } 4506 }; // end class declaration 4507 4508 void StubGenerator_generate(CodeBuffer* code, bool all) { 4509 StubGenerator g(code, all); 4510 }