1 /* 2 * Copyright (c) 2008, 2019, Oracle and/or its affiliates. All rights reserved. 3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 4 * 5 * This code is free software; you can redistribute it and/or modify it 6 * under the terms of the GNU General Public License version 2 only, as 7 * published by the Free Software Foundation. 8 * 9 * This code is distributed in the hope that it will be useful, but WITHOUT 10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 12 * version 2 for more details (a copy is included in the LICENSE file that 13 * accompanied this code). 14 * 15 * You should have received a copy of the GNU General Public License version 16 * 2 along with this work; if not, write to the Free Software Foundation, 17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 18 * 19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 20 * or visit www.oracle.com if you need additional information or have any 21 * questions. 22 * 23 */ 24 25 #include "precompiled.hpp" 26 #include "asm/assembler.hpp" 27 #include "assembler_arm.inline.hpp" 28 #include "gc/shared/barrierSet.hpp" 29 #include "gc/shared/barrierSetAssembler.hpp" 30 #include "interpreter/interpreter.hpp" 31 #include "memory/universe.hpp" 32 #include "nativeInst_arm.hpp" 33 #include "oops/instanceOop.hpp" 34 #include "oops/method.hpp" 35 #include "oops/objArrayKlass.hpp" 36 #include "oops/oop.inline.hpp" 37 #include "prims/methodHandles.hpp" 38 #include "runtime/frame.inline.hpp" 39 #include "runtime/handles.inline.hpp" 40 #include "runtime/sharedRuntime.hpp" 41 #include "runtime/stubCodeGenerator.hpp" 42 #include "runtime/stubRoutines.hpp" 43 #include "utilities/align.hpp" 44 #ifdef COMPILER2 45 #include "opto/runtime.hpp" 46 #endif 47 48 // Declaration and definition of StubGenerator (no .hpp file). 49 // For a more detailed description of the stub routine structure 50 // see the comment in stubRoutines.hpp 51 52 #define __ _masm-> 53 54 #ifdef PRODUCT 55 #define BLOCK_COMMENT(str) /* nothing */ 56 #else 57 #define BLOCK_COMMENT(str) __ block_comment(str) 58 #endif 59 60 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 61 62 // ------------------------------------------------------------------------------------------------------------------------- 63 // Stub Code definitions 64 65 // Platform dependent parameters for array copy stubs 66 67 // Note: we have noticed a huge change in behavior on a microbenchmark 68 // from platform to platform depending on the configuration. 69 70 // Instead of adding a series of command line options (which 71 // unfortunately have to be done in the shared file and cannot appear 72 // only in the ARM port), the tested result are hard-coded here in a set 73 // of options, selected by specifying 'ArmCopyPlatform' 74 75 // Currently, this 'platform' is hardcoded to a value that is a good 76 // enough trade-off. However, one can easily modify this file to test 77 // the hard-coded configurations or create new ones. If the gain is 78 // significant, we could decide to either add command line options or 79 // add code to automatically choose a configuration. 
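
// For example (illustrative only, not an existing configuration), testing a new
// set of loop parameters would amount to appending an entry such as
//
//   #define MYBOARD_ARRAYCOPY_CONFIG 3   // hypothetical index
//   ...
//   { {-192, true, true}, {-192, true, true}, {-192, false, false}, {-192, true, true} }
//
// to arraycopy_configurations[] below and rebuilding with ArmCopyPlatform
// defined as that index.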
// see comments below for the various configurations created
#define DEFAULT_ARRAYCOPY_CONFIG 0
#define TEGRA2_ARRAYCOPY_CONFIG 1
#define IMX515_ARRAYCOPY_CONFIG 2

// Hard-coded choices (XXX: could be changed to a command line option)
#define ArmCopyPlatform DEFAULT_ARRAYCOPY_CONFIG

#define ArmCopyCacheLineSize 32 // not worth optimizing to 64 according to measured gains

// configuration for each kind of loop
typedef struct {
  int pld_distance;       // prefetch distance (0 => no prefetch, <0: prefetch_before);
  bool split_ldm;         // if true, split each LDM into LDMs with fewer registers
  bool split_stm;         // if true, split each STM into STMs with fewer registers
} arraycopy_loop_config;

// configuration for all loops
typedef struct {
  // const char *description;
  arraycopy_loop_config forward_aligned;
  arraycopy_loop_config backward_aligned;
  arraycopy_loop_config forward_shifted;
  arraycopy_loop_config backward_shifted;
} arraycopy_platform_config;

// configured platforms
static arraycopy_platform_config arraycopy_configurations[] = {
  // configuration parameters for arraycopy loops

  // Configurations were chosen based on manual analysis of benchmark
  // results, minimizing overhead with respect to best results on the
  // different test cases.

  // Prefetch before is always favored since it avoids dirtying the
  // cache uselessly for small copies. Code for prefetch after has
  // been kept in case the difference is significant for some
  // platforms but we might consider dropping it.

  // distance, ldm, stm
  {
    // default: tradeoff tegra2/imx515/nv-tegra2,
    // Notes on benchmarking:
    // - not far from optimal configuration on nv-tegra2
    // - within 5% of optimal configuration except for backward aligned on IMX
    // - up to 40% from optimal configuration for backward shifted and backward aligned for tegra2
    //   but still on par with the operating system copy
    {-256, true,  true  }, // forward aligned
    {-256, true,  true  }, // backward aligned
    {-256, false, false }, // forward shifted
    {-256, true,  true  }  // backward shifted
  },
  {
    // configuration tuned on tegra2-4.
    // Warning: should not be used on nv-tegra2 !
    // Notes:
    // - prefetch after gives a 40% gain on backward copies on tegra2-4,
    //   resulting in better numbers than the operating system
    //   copy. However, this can lead to a 300% loss on nv-tegra and has
    //   more impact on the cache (fetches further than what is
    //   copied). Use this configuration with care, in case it improves
    //   reference benchmarks.
    {-256, true,  true  }, // forward aligned
    {96,   false, false }, // backward aligned
    {-256, false, false }, // forward shifted
    {96,   false, false }  // backward shifted
  },
  {
    // configuration tuned on imx515
    // Notes:
    // - a smaller prefetch distance is sufficient to get good results and might be more stable
    // - refined backward aligned options within 5% of optimal configuration except for
    //   tests where the arrays fit in the cache
    {-160, false, false }, // forward aligned
    {-160, false, false }, // backward aligned
    {-160, false, false }, // forward shifted
    {-160, true,  true  }  // backward shifted
  }
};

class StubGenerator: public StubCodeGenerator {

#ifdef PRODUCT
#define inc_counter_np(a,b,c) ((void)0)
#else
#define inc_counter_np(counter, t1, t2) \
  BLOCK_COMMENT("inc_counter " #counter); \
  __ inc_counter(&counter, t1, t2);
#endif

 private:

  address generate_call_stub(address& return_address) {
    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();


    assert(frame::entry_frame_call_wrapper_offset == 0, "adjust this code");

    __ mov(Rtemp, SP);
    __ push(RegisterSet(FP) | RegisterSet(LR));
#ifndef __SOFTFP__
    __ fstmdbd(SP, FloatRegisterSet(D8, 8), writeback);
#endif
    __ stmdb(SP, RegisterSet(R0, R2) | RegisterSet(R4, R6) | RegisterSet(R8, R10) | altFP_7_11, writeback);
    __ mov(Rmethod, R3);
    __ ldmia(Rtemp, RegisterSet(R1, R3) | Rthread); // stacked arguments

    // XXX: TODO
    // Would be better with respect to native tools if the following
    // setting of FP was changed to conform to the native ABI, with FP
    // pointing to the saved FP slot (and the corresponding modifications
    // for entry_frame_call_wrapper_offset and frame::real_fp).
194 __ mov(FP, SP); 195 196 { 197 Label no_parameters, pass_parameters; 198 __ cmp(R3, 0); 199 __ b(no_parameters, eq); 200 201 __ bind(pass_parameters); 202 __ ldr(Rtemp, Address(R2, wordSize, post_indexed)); // Rtemp OK, unused and scratchable 203 __ subs(R3, R3, 1); 204 __ push(Rtemp); 205 __ b(pass_parameters, ne); 206 __ bind(no_parameters); 207 } 208 209 __ mov(Rsender_sp, SP); 210 __ blx(R1); 211 return_address = __ pc(); 212 213 __ add(SP, FP, wordSize); // Skip link to JavaCallWrapper 214 __ pop(RegisterSet(R2, R3)); 215 #ifndef __ABI_HARD__ 216 __ cmp(R3, T_LONG); 217 __ cmp(R3, T_DOUBLE, ne); 218 __ str(R0, Address(R2)); 219 __ str(R1, Address(R2, wordSize), eq); 220 #else 221 Label cont, l_float, l_double; 222 223 __ cmp(R3, T_DOUBLE); 224 __ b(l_double, eq); 225 226 __ cmp(R3, T_FLOAT); 227 __ b(l_float, eq); 228 229 __ cmp(R3, T_LONG); 230 __ str(R0, Address(R2)); 231 __ str(R1, Address(R2, wordSize), eq); 232 __ b(cont); 233 234 235 __ bind(l_double); 236 __ fstd(D0, Address(R2)); 237 __ b(cont); 238 239 __ bind(l_float); 240 __ fsts(S0, Address(R2)); 241 242 __ bind(cont); 243 #endif 244 245 __ pop(RegisterSet(R4, R6) | RegisterSet(R8, R10) | altFP_7_11); 246 #ifndef __SOFTFP__ 247 __ fldmiad(SP, FloatRegisterSet(D8, 8), writeback); 248 #endif 249 __ pop(RegisterSet(FP) | RegisterSet(PC)); 250 251 return start; 252 } 253 254 255 // (in) Rexception_obj: exception oop 256 address generate_catch_exception() { 257 StubCodeMark mark(this, "StubRoutines", "catch_exception"); 258 address start = __ pc(); 259 260 __ str(Rexception_obj, Address(Rthread, Thread::pending_exception_offset())); 261 __ b(StubRoutines::_call_stub_return_address); 262 263 return start; 264 } 265 266 267 // (in) Rexception_pc: return address 268 address generate_forward_exception() { 269 StubCodeMark mark(this, "StubRoutines", "forward exception"); 270 address start = __ pc(); 271 272 __ mov(c_rarg0, Rthread); 273 __ mov(c_rarg1, Rexception_pc); 274 __ call_VM_leaf(CAST_FROM_FN_PTR(address, 275 SharedRuntime::exception_handler_for_return_address), 276 c_rarg0, c_rarg1); 277 __ ldr(Rexception_obj, Address(Rthread, Thread::pending_exception_offset())); 278 const Register Rzero = __ zero_register(Rtemp); // Rtemp OK (cleared by above call) 279 __ str(Rzero, Address(Rthread, Thread::pending_exception_offset())); 280 281 #ifdef ASSERT 282 // make sure exception is set 283 { Label L; 284 __ cbnz(Rexception_obj, L); 285 __ stop("StubRoutines::forward exception: no pending exception (2)"); 286 __ bind(L); 287 } 288 #endif 289 290 // Verify that there is really a valid exception in RAX. 
291 __ verify_oop(Rexception_obj); 292 293 __ jump(R0); // handler is returned in R0 by runtime function 294 return start; 295 } 296 297 298 299 // Integer division shared routine 300 // Input: 301 // R0 - dividend 302 // R2 - divisor 303 // Output: 304 // R0 - remainder 305 // R1 - quotient 306 // Destroys: 307 // R2 308 // LR 309 address generate_idiv_irem() { 310 Label positive_arguments, negative_or_zero, call_slow_path; 311 Register dividend = R0; 312 Register divisor = R2; 313 Register remainder = R0; 314 Register quotient = R1; 315 Register tmp = LR; 316 assert(dividend == remainder, "must be"); 317 318 address start = __ pc(); 319 320 // Check for special cases: divisor <= 0 or dividend < 0 321 __ cmp(divisor, 0); 322 __ orrs(quotient, dividend, divisor, ne); 323 __ b(negative_or_zero, le); 324 325 __ bind(positive_arguments); 326 // Save return address on stack to free one extra register 327 __ push(LR); 328 // Approximate the mamximum order of the quotient 329 __ clz(tmp, dividend); 330 __ clz(quotient, divisor); 331 __ subs(tmp, quotient, tmp); 332 __ mov(quotient, 0); 333 // Jump to the appropriate place in the unrolled loop below 334 __ ldr(PC, Address(PC, tmp, lsl, 2), pl); 335 // If divisor is greater than dividend, return immediately 336 __ pop(PC); 337 338 // Offset table 339 Label offset_table[32]; 340 int i; 341 for (i = 0; i <= 31; i++) { 342 __ emit_address(offset_table[i]); 343 } 344 345 // Unrolled loop of 32 division steps 346 for (i = 31; i >= 0; i--) { 347 __ bind(offset_table[i]); 348 __ cmp(remainder, AsmOperand(divisor, lsl, i)); 349 __ sub(remainder, remainder, AsmOperand(divisor, lsl, i), hs); 350 __ add(quotient, quotient, 1 << i, hs); 351 } 352 __ pop(PC); 353 354 __ bind(negative_or_zero); 355 // Find the combination of argument signs and jump to corresponding handler 356 __ andr(quotient, dividend, 0x80000000, ne); 357 __ orr(quotient, quotient, AsmOperand(divisor, lsr, 31), ne); 358 __ add(PC, PC, AsmOperand(quotient, ror, 26), ne); 359 __ str(LR, Address(Rthread, JavaThread::saved_exception_pc_offset())); 360 361 // The leaf runtime function can destroy R0-R3 and R12 registers which are still alive 362 RegisterSet saved_registers = RegisterSet(R3) | RegisterSet(R12); 363 #if R9_IS_SCRATCHED 364 // Safer to save R9 here since callers may have been written 365 // assuming R9 survives. This is suboptimal but may not be worth 366 // revisiting for this slow case. 
367 368 // save also R10 for alignment 369 saved_registers = saved_registers | RegisterSet(R9, R10); 370 #endif 371 { 372 // divisor == 0 373 FixedSizeCodeBlock zero_divisor(_masm, 8, true); 374 __ push(saved_registers); 375 __ mov(R0, Rthread); 376 __ mov(R1, LR); 377 __ mov(R2, SharedRuntime::IMPLICIT_DIVIDE_BY_ZERO); 378 __ b(call_slow_path); 379 } 380 381 { 382 // divisor > 0 && dividend < 0 383 FixedSizeCodeBlock positive_divisor_negative_dividend(_masm, 8, true); 384 __ push(LR); 385 __ rsb(dividend, dividend, 0); 386 __ bl(positive_arguments); 387 __ rsb(remainder, remainder, 0); 388 __ rsb(quotient, quotient, 0); 389 __ pop(PC); 390 } 391 392 { 393 // divisor < 0 && dividend > 0 394 FixedSizeCodeBlock negative_divisor_positive_dividend(_masm, 8, true); 395 __ push(LR); 396 __ rsb(divisor, divisor, 0); 397 __ bl(positive_arguments); 398 __ rsb(quotient, quotient, 0); 399 __ pop(PC); 400 } 401 402 { 403 // divisor < 0 && dividend < 0 404 FixedSizeCodeBlock negative_divisor_negative_dividend(_masm, 8, true); 405 __ push(LR); 406 __ rsb(dividend, dividend, 0); 407 __ rsb(divisor, divisor, 0); 408 __ bl(positive_arguments); 409 __ rsb(remainder, remainder, 0); 410 __ pop(PC); 411 } 412 413 __ bind(call_slow_path); 414 __ call(CAST_FROM_FN_PTR(address, SharedRuntime::continuation_for_implicit_exception)); 415 __ pop(saved_registers); 416 __ bx(R0); 417 418 return start; 419 } 420 421 422 // As per atomic.hpp the Atomic read-modify-write operations must be logically implemented as: 423 // <fence>; <op>; <membar StoreLoad|StoreStore> 424 // But for load-linked/store-conditional based systems a fence here simply means 425 // no load/store can be reordered with respect to the initial load-linked, so we have: 426 // <membar storeload|loadload> ; load-linked; <op>; store-conditional; <membar storeload|storestore> 427 // There are no memory actions in <op> so nothing further is needed. 428 // 429 // So we define the following for convenience: 430 #define MEMBAR_ATOMIC_OP_PRE \ 431 MacroAssembler::Membar_mask_bits(MacroAssembler::StoreLoad|MacroAssembler::LoadLoad) 432 #define MEMBAR_ATOMIC_OP_POST \ 433 MacroAssembler::Membar_mask_bits(MacroAssembler::StoreLoad|MacroAssembler::StoreStore) 434 435 // Note: JDK 9 only supports ARMv7+ so we always have ldrexd available even though the 436 // code below allows for it to be otherwise. The else clause indicates an ARMv5 system 437 // for which we do not support MP and so membars are not necessary. This ARMv5 code will 438 // be removed in the future. 
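
  // As an illustrative sketch (not generated code), each of the LL/SC-based
  // stubs below has the following shape, with the membars placed as described
  // above; 'load_linked'/'store_conditional' stand for the ldrex/strex pair
  // and are named here only for exposition:
  //
  //   membar(StoreLoad|LoadLoad);                    // MEMBAR_ATOMIC_OP_PRE
  //   do {
  //     old_val = load_linked(dest);                 // ldrex
  //     new_val = op(old_val, arg);                  // e.g. old_val + add_value
  //   } while (!store_conditional(new_val, dest));   // strex failed => retry
  //   membar(StoreLoad|StoreStore);                  // MEMBAR_ATOMIC_OP_POST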
439 440 // Support for jint Atomic::add(volatile jint *dest, jint add_value) 441 // 442 // Arguments : 443 // 444 // add_value: R0 445 // dest: R1 446 // 447 // Results: 448 // 449 // R0: the new stored in dest 450 // 451 // Overwrites: 452 // 453 // R1, R2, R3 454 // 455 address generate_atomic_add() { 456 address start; 457 458 StubCodeMark mark(this, "StubRoutines", "atomic_add"); 459 Label retry; 460 start = __ pc(); 461 Register addval = R0; 462 Register dest = R1; 463 Register prev = R2; 464 Register ok = R2; 465 Register newval = R3; 466 467 if (VM_Version::supports_ldrex()) { 468 __ membar(MEMBAR_ATOMIC_OP_PRE, prev); 469 __ bind(retry); 470 __ ldrex(newval, Address(dest)); 471 __ add(newval, addval, newval); 472 __ strex(ok, newval, Address(dest)); 473 __ cmp(ok, 0); 474 __ b(retry, ne); 475 __ mov (R0, newval); 476 __ membar(MEMBAR_ATOMIC_OP_POST, prev); 477 } else { 478 __ bind(retry); 479 __ ldr (prev, Address(dest)); 480 __ add(newval, addval, prev); 481 __ atomic_cas_bool(prev, newval, dest, 0, noreg/*ignored*/); 482 __ b(retry, ne); 483 __ mov (R0, newval); 484 } 485 __ bx(LR); 486 487 return start; 488 } 489 490 // Support for jint Atomic::xchg(jint exchange_value, volatile jint *dest) 491 // 492 // Arguments : 493 // 494 // exchange_value: R0 495 // dest: R1 496 // 497 // Results: 498 // 499 // R0: the value previously stored in dest 500 // 501 // Overwrites: 502 // 503 // R1, R2, R3 504 // 505 address generate_atomic_xchg() { 506 address start; 507 508 StubCodeMark mark(this, "StubRoutines", "atomic_xchg"); 509 start = __ pc(); 510 Register newval = R0; 511 Register dest = R1; 512 Register prev = R2; 513 514 Label retry; 515 516 if (VM_Version::supports_ldrex()) { 517 Register ok=R3; 518 __ membar(MEMBAR_ATOMIC_OP_PRE, prev); 519 __ bind(retry); 520 __ ldrex(prev, Address(dest)); 521 __ strex(ok, newval, Address(dest)); 522 __ cmp(ok, 0); 523 __ b(retry, ne); 524 __ mov (R0, prev); 525 __ membar(MEMBAR_ATOMIC_OP_POST, prev); 526 } else { 527 __ bind(retry); 528 __ ldr (prev, Address(dest)); 529 __ atomic_cas_bool(prev, newval, dest, 0, noreg/*ignored*/); 530 __ b(retry, ne); 531 __ mov (R0, prev); 532 } 533 __ bx(LR); 534 535 return start; 536 } 537 538 // Support for jint Atomic::cmpxchg(jint exchange_value, volatile jint *dest, jint compare_value) 539 // 540 // Arguments : 541 // 542 // compare_value: R0 543 // exchange_value: R1 544 // dest: R2 545 // 546 // Results: 547 // 548 // R0: the value previously stored in dest 549 // 550 // Overwrites: 551 // 552 // R0, R1, R2, R3, Rtemp 553 // 554 address generate_atomic_cmpxchg() { 555 address start; 556 557 StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg"); 558 start = __ pc(); 559 Register cmp = R0; 560 Register newval = R1; 561 Register dest = R2; 562 Register temp1 = R3; 563 Register temp2 = Rtemp; // Rtemp free (native ABI) 564 565 __ membar(MEMBAR_ATOMIC_OP_PRE, temp1); 566 567 // atomic_cas returns previous value in R0 568 __ atomic_cas(temp1, temp2, cmp, newval, dest, 0); 569 570 __ membar(MEMBAR_ATOMIC_OP_POST, temp1); 571 572 __ bx(LR); 573 574 return start; 575 } 576 577 // Support for jlong Atomic::cmpxchg(jlong exchange_value, volatile jlong *dest, jlong compare_value) 578 // reordered before by a wrapper to (jlong compare_value, jlong exchange_value, volatile jlong *dest) 579 // 580 // Arguments : 581 // 582 // compare_value: R1 (High), R0 (Low) 583 // exchange_value: R3 (High), R2 (Low) 584 // dest: SP+0 585 // 586 // Results: 587 // 588 // R0:R1: the value previously stored in dest 589 // 590 // 
Overwrites: 591 // 592 address generate_atomic_cmpxchg_long() { 593 address start; 594 595 StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_long"); 596 start = __ pc(); 597 Register cmp_lo = R0; 598 Register cmp_hi = R1; 599 Register newval_lo = R2; 600 Register newval_hi = R3; 601 Register addr = Rtemp; /* After load from stack */ 602 Register temp_lo = R4; 603 Register temp_hi = R5; 604 Register temp_result = R8; 605 assert_different_registers(cmp_lo, newval_lo, temp_lo, addr, temp_result, R7); 606 assert_different_registers(cmp_hi, newval_hi, temp_hi, addr, temp_result, R7); 607 608 __ membar(MEMBAR_ATOMIC_OP_PRE, Rtemp); // Rtemp free (native ABI) 609 610 // Stack is unaligned, maintain double word alignment by pushing 611 // odd number of regs. 612 __ push(RegisterSet(temp_result) | RegisterSet(temp_lo, temp_hi)); 613 __ ldr(addr, Address(SP, 12)); 614 615 // atomic_cas64 returns previous value in temp_lo, temp_hi 616 __ atomic_cas64(temp_lo, temp_hi, temp_result, cmp_lo, cmp_hi, 617 newval_lo, newval_hi, addr, 0); 618 __ mov(R0, temp_lo); 619 __ mov(R1, temp_hi); 620 621 __ pop(RegisterSet(temp_result) | RegisterSet(temp_lo, temp_hi)); 622 623 __ membar(MEMBAR_ATOMIC_OP_POST, Rtemp); // Rtemp free (native ABI) 624 __ bx(LR); 625 626 return start; 627 } 628 629 address generate_atomic_load_long() { 630 address start; 631 632 StubCodeMark mark(this, "StubRoutines", "atomic_load_long"); 633 start = __ pc(); 634 Register result_lo = R0; 635 Register result_hi = R1; 636 Register src = R0; 637 638 if (!os::is_MP()) { 639 __ ldmia(src, RegisterSet(result_lo, result_hi)); 640 __ bx(LR); 641 } else if (VM_Version::supports_ldrexd()) { 642 __ ldrexd(result_lo, Address(src)); 643 __ clrex(); // FIXME: safe to remove? 644 __ bx(LR); 645 } else { 646 __ stop("Atomic load(jlong) unsupported on this platform"); 647 __ bx(LR); 648 } 649 650 return start; 651 } 652 653 address generate_atomic_store_long() { 654 address start; 655 656 StubCodeMark mark(this, "StubRoutines", "atomic_store_long"); 657 start = __ pc(); 658 Register newval_lo = R0; 659 Register newval_hi = R1; 660 Register dest = R2; 661 Register scratch_lo = R2; 662 Register scratch_hi = R3; /* After load from stack */ 663 Register result = R3; 664 665 if (!os::is_MP()) { 666 __ stmia(dest, RegisterSet(newval_lo, newval_hi)); 667 __ bx(LR); 668 } else if (VM_Version::supports_ldrexd()) { 669 __ mov(Rtemp, dest); // get dest to Rtemp 670 Label retry; 671 __ bind(retry); 672 __ ldrexd(scratch_lo, Address(Rtemp)); 673 __ strexd(result, R0, Address(Rtemp)); 674 __ rsbs(result, result, 1); 675 __ b(retry, eq); 676 __ bx(LR); 677 } else { 678 __ stop("Atomic store(jlong) unsupported on this platform"); 679 __ bx(LR); 680 } 681 682 return start; 683 } 684 685 686 687 #ifdef COMPILER2 688 // Support for uint StubRoutine::Arm::partial_subtype_check( Klass sub, Klass super ); 689 // Arguments : 690 // 691 // ret : R0, returned 692 // icc/xcc: set as R0 (depending on wordSize) 693 // sub : R1, argument, not changed 694 // super: R2, argument, not changed 695 // raddr: LR, blown by call 696 address generate_partial_subtype_check() { 697 __ align(CodeEntryAlignment); 698 StubCodeMark mark(this, "StubRoutines", "partial_subtype_check"); 699 address start = __ pc(); 700 701 // based on SPARC check_klass_subtype_[fast|slow]_path (without CompressedOops) 702 703 // R0 used as tmp_reg (in addition to return reg) 704 Register sub_klass = R1; 705 Register super_klass = R2; 706 Register tmp_reg2 = R3; 707 Register tmp_reg3 = R4; 708 #define saved_set 
tmp_reg2, tmp_reg3 709 710 Label L_loop, L_fail; 711 712 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 713 714 // fast check should be redundant 715 716 // slow check 717 { 718 __ raw_push(saved_set); 719 720 // a couple of useful fields in sub_klass: 721 int ss_offset = in_bytes(Klass::secondary_supers_offset()); 722 723 // Do a linear scan of the secondary super-klass chain. 724 // This code is rarely used, so simplicity is a virtue here. 725 726 inc_counter_np(SharedRuntime::_partial_subtype_ctr, tmp_reg2, tmp_reg3); 727 728 Register scan_temp = tmp_reg2; 729 Register count_temp = tmp_reg3; 730 731 // We will consult the secondary-super array. 732 __ ldr(scan_temp, Address(sub_klass, ss_offset)); 733 734 Register search_key = super_klass; 735 736 // Load the array length. 737 __ ldr_s32(count_temp, Address(scan_temp, Array<Klass*>::length_offset_in_bytes())); 738 __ add(scan_temp, scan_temp, Array<Klass*>::base_offset_in_bytes()); 739 740 __ add(count_temp, count_temp, 1); 741 742 // Top of search loop 743 __ bind(L_loop); 744 // Notes: 745 // scan_temp starts at the array elements 746 // count_temp is 1+size 747 __ subs(count_temp, count_temp, 1); 748 __ b(L_fail, eq); // not found in the array 749 750 // Load next super to check 751 // In the array of super classes elements are pointer sized. 752 int element_size = wordSize; 753 __ ldr(R0, Address(scan_temp, element_size, post_indexed)); 754 755 // Look for Rsuper_klass on Rsub_klass's secondary super-class-overflow list 756 __ subs(R0, R0, search_key); // set R0 to 0 on success (and flags to eq) 757 758 // A miss means we are NOT a subtype and need to keep looping 759 __ b(L_loop, ne); 760 761 // Falling out the bottom means we found a hit; we ARE a subtype 762 763 // Success. Cache the super we found and proceed in triumph. 764 __ str(super_klass, Address(sub_klass, sc_offset)); 765 766 // Return success 767 // R0 is already 0 and flags are already set to eq 768 __ raw_pop(saved_set); 769 __ ret(); 770 771 // Return failure 772 __ bind(L_fail); 773 __ movs(R0, 1); // sets the flags 774 __ raw_pop(saved_set); 775 __ ret(); 776 } 777 return start; 778 } 779 #undef saved_set 780 #endif // COMPILER2 781 782 783 //---------------------------------------------------------------------------------------------------- 784 // Non-destructive plausibility checks for oops 785 786 address generate_verify_oop() { 787 StubCodeMark mark(this, "StubRoutines", "verify_oop"); 788 address start = __ pc(); 789 790 // Incoming arguments: 791 // 792 // R0: error message (char* ) 793 // R1: address of register save area 794 // R2: oop to verify 795 // 796 // All registers are saved before calling this stub. However, condition flags should be saved here. 
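
    // In outline (an illustrative summary of the checks emitted below, not
    // generated code):
    //   if (oop == NULL) return;                    // a NULL oop is acceptable
    //   if ((oop & Universe::verify_oop_mask()) != Universe::verify_oop_bits()) goto error;
    //   if (load_klass(oop) == NULL) goto error;    // broken klass pointer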
797 798 const Register oop = R2; 799 const Register klass = R3; 800 const Register tmp1 = R6; 801 const Register tmp2 = R8; 802 803 const Register flags = Rtmp_save0; // R4/R19 804 const Register ret_addr = Rtmp_save1; // R5/R20 805 assert_different_registers(oop, klass, tmp1, tmp2, flags, ret_addr, R7); 806 807 Label exit, error; 808 InlinedAddress verify_oop_count((address) StubRoutines::verify_oop_count_addr()); 809 810 __ mrs(Assembler::CPSR, flags); 811 812 __ ldr_literal(tmp1, verify_oop_count); 813 __ ldr_s32(tmp2, Address(tmp1)); 814 __ add(tmp2, tmp2, 1); 815 __ str_32(tmp2, Address(tmp1)); 816 817 // make sure object is 'reasonable' 818 __ cbz(oop, exit); // if obj is NULL it is ok 819 820 // Check if the oop is in the right area of memory 821 // Note: oop_mask and oop_bits must be updated if the code is saved/reused 822 const address oop_mask = (address) Universe::verify_oop_mask(); 823 const address oop_bits = (address) Universe::verify_oop_bits(); 824 __ mov_address(tmp1, oop_mask); 825 __ andr(tmp2, oop, tmp1); 826 __ mov_address(tmp1, oop_bits); 827 __ cmp(tmp2, tmp1); 828 __ b(error, ne); 829 830 // make sure klass is 'reasonable' 831 __ load_klass(klass, oop); // get klass 832 __ cbz(klass, error); // if klass is NULL it is broken 833 834 // return if everything seems ok 835 __ bind(exit); 836 837 __ msr(Assembler::CPSR_f, flags); 838 839 __ ret(); 840 841 // handle errors 842 __ bind(error); 843 844 __ mov(ret_addr, LR); // save return address 845 846 // R0: error message 847 // R1: register save area 848 __ call(CAST_FROM_FN_PTR(address, MacroAssembler::debug)); 849 850 __ mov(LR, ret_addr); 851 __ b(exit); 852 853 __ bind_literal(verify_oop_count); 854 855 return start; 856 } 857 858 //---------------------------------------------------------------------------------------------------- 859 // Array copy stubs 860 861 // 862 // Generate overlap test for array copy stubs 863 // 864 // Input: 865 // R0 - array1 866 // R1 - array2 867 // R2 - element count, 32-bit int 868 // 869 // input registers are preserved 870 // 871 void array_overlap_test(address no_overlap_target, int log2_elem_size, Register tmp1, Register tmp2) { 872 assert(no_overlap_target != NULL, "must be generated"); 873 array_overlap_test(no_overlap_target, NULL, log2_elem_size, tmp1, tmp2); 874 } 875 void array_overlap_test(Label& L_no_overlap, int log2_elem_size, Register tmp1, Register tmp2) { 876 array_overlap_test(NULL, &L_no_overlap, log2_elem_size, tmp1, tmp2); 877 } 878 void array_overlap_test(address no_overlap_target, Label* NOLp, int log2_elem_size, Register tmp1, Register tmp2) { 879 const Register from = R0; 880 const Register to = R1; 881 const Register count = R2; 882 const Register to_from = tmp1; // to - from 883 const Register byte_count = (log2_elem_size == 0) ? count : tmp2; // count << log2_elem_size 884 assert_different_registers(from, to, count, tmp1, tmp2); 885 886 // no_overlap version works if 'to' lower (unsigned) than 'from' 887 // and or 'to' more than (count*size) from 'from' 888 889 BLOCK_COMMENT("Array Overlap Test:"); 890 __ subs(to_from, to, from); 891 if (log2_elem_size != 0) { 892 __ mov(byte_count, AsmOperand(count, lsl, log2_elem_size)); 893 } 894 if (NOLp == NULL) 895 __ b(no_overlap_target,lo); 896 else 897 __ b((*NOLp), lo); 898 __ cmp(to_from, byte_count); 899 if (NOLp == NULL) 900 __ b(no_overlap_target, ge); 901 else 902 __ b((*NOLp), ge); 903 } 904 905 906 // probably we should choose between "prefetch-store before or after store", not "before or after load". 
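
  // (Referring to array_overlap_test above.) As an illustrative sketch, the
  // branches it emits jump to the no-overlap (forward copy) target when either
  //
  //   to < from                                  // unsigned compare, 'lo' branch
  //   (to - from) >= (count << log2_elem_size)   // 'ge' branch after the cmp
  //
  // i.e. the destination starts before the source, or at/after the end of the
  // source region; otherwise execution falls through to the overlapping
  // (backward) copy code.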
907 void prefetch(Register from, Register to, int offset, int to_delta = 0) { 908 __ prefetch_read(Address(from, offset)); 909 } 910 911 // Generate the inner loop for forward aligned array copy 912 // 913 // Arguments 914 // from: src address, 64 bits aligned 915 // to: dst address, wordSize aligned 916 // count: number of elements (32-bit int) 917 // bytes_per_count: number of bytes for each unit of 'count' 918 // 919 // Return the minimum initial value for count 920 // 921 // Notes: 922 // - 'from' aligned on 64-bit (recommended for 32-bit ARM in case this speeds up LDMIA) 923 // - 'to' aligned on wordSize 924 // - 'count' must be greater or equal than the returned value 925 // 926 // Increases 'from' and 'to' by count*bytes_per_count. 927 // 928 // Scratches 'count', R3. 929 // R4-R10 are preserved (saved/restored). 930 // 931 int generate_forward_aligned_copy_loop(Register from, Register to, Register count, int bytes_per_count, bool unsafe_copy = false) { 932 assert (from == R0 && to == R1 && count == R2, "adjust the implementation below"); 933 934 const int bytes_per_loop = 8*wordSize; // 8 registers are read and written on every loop iteration 935 arraycopy_loop_config *config=&arraycopy_configurations[ArmCopyPlatform].forward_aligned; 936 int pld_offset = config->pld_distance; 937 const int count_per_loop = bytes_per_loop / bytes_per_count; 938 939 bool split_read= config->split_ldm; 940 bool split_write= config->split_stm; 941 942 // XXX optim: use VLDM/VSTM when available (Neon) with PLD 943 // NEONCopyPLD 944 // PLD [r1, #0xC0] 945 // VLDM r1!,{d0-d7} 946 // VSTM r0!,{d0-d7} 947 // SUBS r2,r2,#0x40 948 // BGE NEONCopyPLD 949 950 __ push(RegisterSet(R4,R10)); 951 952 const bool prefetch_before = pld_offset < 0; 953 const bool prefetch_after = pld_offset > 0; 954 955 Label L_skip_pld; 956 957 { 958 // UnsafeCopyMemory page error: continue after ucm 959 UnsafeCopyMemoryMark ucmm(this, unsafe_copy, true); 960 // predecrease to exit when there is less than count_per_loop 961 __ sub_32(count, count, count_per_loop); 962 963 if (pld_offset != 0) { 964 pld_offset = (pld_offset < 0) ? -pld_offset : pld_offset; 965 966 prefetch(from, to, 0); 967 968 if (prefetch_before) { 969 // If prefetch is done ahead, final PLDs that overflow the 970 // copied area can be easily avoided. 'count' is predecreased 971 // by the prefetch distance to optimize the inner loop and the 972 // outer loop skips the PLD. 973 __ subs_32(count, count, (bytes_per_loop+pld_offset)/bytes_per_count); 974 975 // skip prefetch for small copies 976 __ b(L_skip_pld, lt); 977 } 978 979 int offset = ArmCopyCacheLineSize; 980 while (offset <= pld_offset) { 981 prefetch(from, to, offset); 982 offset += ArmCopyCacheLineSize; 983 }; 984 } 985 986 { 987 // 32-bit ARM note: we have tried implementing loop unrolling to skip one 988 // PLD with 64 bytes cache line but the gain was not significant. 989 990 Label L_copy_loop; 991 __ align(OptoLoopAlignment); 992 __ BIND(L_copy_loop); 993 994 if (prefetch_before) { 995 prefetch(from, to, bytes_per_loop + pld_offset); 996 __ BIND(L_skip_pld); 997 } 998 999 if (split_read) { 1000 // Split the register set in two sets so that there is less 1001 // latency between LDM and STM (R3-R6 available while R7-R10 1002 // still loading) and less register locking issue when iterating 1003 // on the first LDM. 
1004 __ ldmia(from, RegisterSet(R3, R6), writeback); 1005 __ ldmia(from, RegisterSet(R7, R10), writeback); 1006 } else { 1007 __ ldmia(from, RegisterSet(R3, R10), writeback); 1008 } 1009 1010 __ subs_32(count, count, count_per_loop); 1011 1012 if (prefetch_after) { 1013 prefetch(from, to, pld_offset, bytes_per_loop); 1014 } 1015 1016 if (split_write) { 1017 __ stmia(to, RegisterSet(R3, R6), writeback); 1018 __ stmia(to, RegisterSet(R7, R10), writeback); 1019 } else { 1020 __ stmia(to, RegisterSet(R3, R10), writeback); 1021 } 1022 1023 __ b(L_copy_loop, ge); 1024 1025 if (prefetch_before) { 1026 // the inner loop may end earlier, allowing to skip PLD for the last iterations 1027 __ cmn_32(count, (bytes_per_loop + pld_offset)/bytes_per_count); 1028 __ b(L_skip_pld, ge); 1029 } 1030 } 1031 BLOCK_COMMENT("Remaining bytes:"); 1032 // still 0..bytes_per_loop-1 aligned bytes to copy, count already decreased by (at least) bytes_per_loop bytes 1033 1034 // __ add(count, count, ...); // addition useless for the bit tests 1035 assert (pld_offset % bytes_per_loop == 0, "decreasing count by pld_offset before loop must not change tested bits"); 1036 1037 __ tst(count, 16 / bytes_per_count); 1038 __ ldmia(from, RegisterSet(R3, R6), writeback, ne); // copy 16 bytes 1039 __ stmia(to, RegisterSet(R3, R6), writeback, ne); 1040 1041 __ tst(count, 8 / bytes_per_count); 1042 __ ldmia(from, RegisterSet(R3, R4), writeback, ne); // copy 8 bytes 1043 __ stmia(to, RegisterSet(R3, R4), writeback, ne); 1044 1045 if (bytes_per_count <= 4) { 1046 __ tst(count, 4 / bytes_per_count); 1047 __ ldr(R3, Address(from, 4, post_indexed), ne); // copy 4 bytes 1048 __ str(R3, Address(to, 4, post_indexed), ne); 1049 } 1050 1051 if (bytes_per_count <= 2) { 1052 __ tst(count, 2 / bytes_per_count); 1053 __ ldrh(R3, Address(from, 2, post_indexed), ne); // copy 2 bytes 1054 __ strh(R3, Address(to, 2, post_indexed), ne); 1055 } 1056 1057 if (bytes_per_count == 1) { 1058 __ tst(count, 1); 1059 __ ldrb(R3, Address(from, 1, post_indexed), ne); 1060 __ strb(R3, Address(to, 1, post_indexed), ne); 1061 } 1062 } 1063 1064 __ pop(RegisterSet(R4,R10)); 1065 1066 return count_per_loop; 1067 } 1068 1069 1070 // Generate the inner loop for backward aligned array copy 1071 // 1072 // Arguments 1073 // end_from: src end address, 64 bits aligned 1074 // end_to: dst end address, wordSize aligned 1075 // count: number of elements (32-bit int) 1076 // bytes_per_count: number of bytes for each unit of 'count' 1077 // 1078 // Return the minimum initial value for count 1079 // 1080 // Notes: 1081 // - 'end_from' aligned on 64-bit (recommended for 32-bit ARM in case this speeds up LDMIA) 1082 // - 'end_to' aligned on wordSize 1083 // - 'count' must be greater or equal than the returned value 1084 // 1085 // Decreases 'end_from' and 'end_to' by count*bytes_per_count. 1086 // 1087 // Scratches 'count', R3. 1088 // ARM R4-R10 are preserved (saved/restored). 
1089 // 1090 int generate_backward_aligned_copy_loop(Register end_from, Register end_to, Register count, int bytes_per_count, bool unsafe_copy = false) { 1091 assert (end_from == R0 && end_to == R1 && count == R2, "adjust the implementation below"); 1092 1093 const int bytes_per_loop = 8*wordSize; // 8 registers are read and written on every loop iteration 1094 const int count_per_loop = bytes_per_loop / bytes_per_count; 1095 1096 arraycopy_loop_config *config=&arraycopy_configurations[ArmCopyPlatform].backward_aligned; 1097 int pld_offset = config->pld_distance; 1098 1099 bool split_read= config->split_ldm; 1100 bool split_write= config->split_stm; 1101 1102 // See the forward copy variant for additional comments. 1103 1104 __ push(RegisterSet(R4,R10)); 1105 1106 { 1107 // UnsafeCopyMemory page error: continue after ucm 1108 UnsafeCopyMemoryMark ucmm(this, unsafe_copy, true); 1109 __ sub_32(count, count, count_per_loop); 1110 1111 const bool prefetch_before = pld_offset < 0; 1112 const bool prefetch_after = pld_offset > 0; 1113 1114 Label L_skip_pld; 1115 1116 if (pld_offset != 0) { 1117 pld_offset = (pld_offset < 0) ? -pld_offset : pld_offset; 1118 1119 prefetch(end_from, end_to, -wordSize); 1120 1121 if (prefetch_before) { 1122 __ subs_32(count, count, (bytes_per_loop + pld_offset) / bytes_per_count); 1123 __ b(L_skip_pld, lt); 1124 } 1125 1126 int offset = ArmCopyCacheLineSize; 1127 while (offset <= pld_offset) { 1128 prefetch(end_from, end_to, -(wordSize + offset)); 1129 offset += ArmCopyCacheLineSize; 1130 }; 1131 } 1132 1133 { 1134 // 32-bit ARM note: we have tried implementing loop unrolling to skip one 1135 // PLD with 64 bytes cache line but the gain was not significant. 1136 1137 Label L_copy_loop; 1138 __ align(OptoLoopAlignment); 1139 __ BIND(L_copy_loop); 1140 1141 if (prefetch_before) { 1142 prefetch(end_from, end_to, -(wordSize + bytes_per_loop + pld_offset)); 1143 __ BIND(L_skip_pld); 1144 } 1145 1146 if (split_read) { 1147 __ ldmdb(end_from, RegisterSet(R7, R10), writeback); 1148 __ ldmdb(end_from, RegisterSet(R3, R6), writeback); 1149 } else { 1150 __ ldmdb(end_from, RegisterSet(R3, R10), writeback); 1151 } 1152 1153 __ subs_32(count, count, count_per_loop); 1154 1155 if (prefetch_after) { 1156 prefetch(end_from, end_to, -(wordSize + pld_offset), -bytes_per_loop); 1157 } 1158 1159 if (split_write) { 1160 __ stmdb(end_to, RegisterSet(R7, R10), writeback); 1161 __ stmdb(end_to, RegisterSet(R3, R6), writeback); 1162 } else { 1163 __ stmdb(end_to, RegisterSet(R3, R10), writeback); 1164 } 1165 1166 __ b(L_copy_loop, ge); 1167 1168 if (prefetch_before) { 1169 __ cmn_32(count, (bytes_per_loop + pld_offset)/bytes_per_count); 1170 __ b(L_skip_pld, ge); 1171 } 1172 } 1173 BLOCK_COMMENT("Remaining bytes:"); 1174 // still 0..bytes_per_loop-1 aligned bytes to copy, count already decreased by (at least) bytes_per_loop bytes 1175 1176 // __ add(count, count, ...); // addition useless for the bit tests 1177 assert (pld_offset % bytes_per_loop == 0, "decreasing count by pld_offset before loop must not change tested bits"); 1178 1179 __ tst(count, 16 / bytes_per_count); 1180 __ ldmdb(end_from, RegisterSet(R3, R6), writeback, ne); // copy 16 bytes 1181 __ stmdb(end_to, RegisterSet(R3, R6), writeback, ne); 1182 1183 __ tst(count, 8 / bytes_per_count); 1184 __ ldmdb(end_from, RegisterSet(R3, R4), writeback, ne); // copy 8 bytes 1185 __ stmdb(end_to, RegisterSet(R3, R4), writeback, ne); 1186 1187 if (bytes_per_count <= 4) { 1188 __ tst(count, 4 / bytes_per_count); 1189 __ ldr(R3, 
Address(end_from, -4, pre_indexed), ne); // copy 4 bytes
        __ str(R3, Address(end_to, -4, pre_indexed), ne);
      }

      if (bytes_per_count <= 2) {
        __ tst(count, 2 / bytes_per_count);
        __ ldrh(R3, Address(end_from, -2, pre_indexed), ne); // copy 2 bytes
        __ strh(R3, Address(end_to, -2, pre_indexed), ne);
      }

      if (bytes_per_count == 1) {
        __ tst(count, 1);
        __ ldrb(R3, Address(end_from, -1, pre_indexed), ne);
        __ strb(R3, Address(end_to, -1, pre_indexed), ne);
      }
    }
    __ pop(RegisterSet(R4,R10));

    return count_per_loop;
  }


  // Generate the inner loop for shifted forward array copy (unaligned copy).
  // It can be used when bytes_per_count < wordSize, i.e. byte/short copy
  //
  // Arguments
  //      from:            start src address, 64 bits aligned
  //      to:              start dst address, (now) wordSize aligned
  //      count:           number of elements (32-bit int)
  //      bytes_per_count: number of bytes for each unit of 'count'
  //      lsr_shift:       shift applied to 'old' value to skip already written bytes
  //      lsl_shift:       shift applied to 'new' value to set the high bytes of the next write
  //
  // Return the minimum initial value for count
  //
  // Notes:
  // - 'from' aligned on 64-bit (recommended for 32-bit ARM in case this speeds up LDMIA)
  // - 'to' aligned on wordSize
  // - 'count' must be greater than or equal to the returned value
  // - 'lsr_shift' + 'lsl_shift' = BitsPerWord
  // - 'bytes_per_count' is 1 or 2
  //
  // Increases 'to' by count*bytes_per_count.
  //
  // Scratches 'from' and 'count', R3-R10, R12
  //
  // On entry:
  // - R12 is preloaded with the first 'BitsPerWord' bits read just before 'from'
  // - (R12 >> lsr_shift) is the part not yet written (just before 'to')
  // --> (*to) = (R12 >> lsr_shift) | ((*from) << lsl_shift); ...
  //
  // This implementation may read more bytes than required.
  // Actually, it always reads exactly all data from the copied region with the upper bound aligned up by wordSize,
  // so the excessive reads do not cross a word boundary and are thus harmless.
  //
  int generate_forward_shifted_copy_loop(Register from, Register to, Register count, int bytes_per_count, int lsr_shift, int lsl_shift) {
    assert (from == R0 && to == R1 && count == R2, "adjust the implementation below");

    const int bytes_per_loop = 8*wordSize; // 8 registers are read and written on every loop iter
    const int count_per_loop = bytes_per_loop / bytes_per_count;

    arraycopy_loop_config *config = &arraycopy_configurations[ArmCopyPlatform].forward_shifted;
    int pld_offset = config->pld_distance;

    bool split_read  = config->split_ldm;
    bool split_write = config->split_stm;

    const bool prefetch_before = pld_offset < 0;
    const bool prefetch_after = pld_offset > 0;
    Label L_skip_pld, L_last_read, L_done;
    if (pld_offset != 0) {

      pld_offset = (pld_offset < 0) ?
-pld_offset : pld_offset; 1262 1263 prefetch(from, to, 0); 1264 1265 if (prefetch_before) { 1266 __ cmp_32(count, count_per_loop); 1267 __ b(L_last_read, lt); 1268 // skip prefetch for small copies 1269 // warning: count is predecreased by the prefetch distance to optimize the inner loop 1270 __ subs_32(count, count, ((bytes_per_loop + pld_offset) / bytes_per_count) + count_per_loop); 1271 __ b(L_skip_pld, lt); 1272 } 1273 1274 int offset = ArmCopyCacheLineSize; 1275 while (offset <= pld_offset) { 1276 prefetch(from, to, offset); 1277 offset += ArmCopyCacheLineSize; 1278 }; 1279 } 1280 1281 Label L_shifted_loop; 1282 1283 __ align(OptoLoopAlignment); 1284 __ BIND(L_shifted_loop); 1285 1286 if (prefetch_before) { 1287 // do it early if there might be register locking issues 1288 prefetch(from, to, bytes_per_loop + pld_offset); 1289 __ BIND(L_skip_pld); 1290 } else { 1291 __ cmp_32(count, count_per_loop); 1292 __ b(L_last_read, lt); 1293 } 1294 1295 // read 32 bytes 1296 if (split_read) { 1297 // if write is not split, use less registers in first set to reduce locking 1298 RegisterSet set1 = split_write ? RegisterSet(R4, R7) : RegisterSet(R4, R5); 1299 RegisterSet set2 = (split_write ? RegisterSet(R8, R10) : RegisterSet(R6, R10)) | R12; 1300 __ ldmia(from, set1, writeback); 1301 __ mov(R3, AsmOperand(R12, lsr, lsr_shift)); // part of R12 not yet written 1302 __ ldmia(from, set2, writeback); 1303 __ subs(count, count, count_per_loop); // XXX: should it be before the 2nd LDM ? (latency vs locking) 1304 } else { 1305 __ mov(R3, AsmOperand(R12, lsr, lsr_shift)); // part of R12 not yet written 1306 __ ldmia(from, RegisterSet(R4, R10) | R12, writeback); // Note: small latency on R4 1307 __ subs(count, count, count_per_loop); 1308 } 1309 1310 if (prefetch_after) { 1311 // do it after the 1st ldm/ldp anyway (no locking issues with early STM/STP) 1312 prefetch(from, to, pld_offset, bytes_per_loop); 1313 } 1314 1315 // prepare (shift) the values in R3..R10 1316 __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift)); // merged below low bytes of next val 1317 __ logical_shift_right(R4, R4, lsr_shift); // unused part of next val 1318 __ orr(R4, R4, AsmOperand(R5, lsl, lsl_shift)); // ... 1319 __ logical_shift_right(R5, R5, lsr_shift); 1320 __ orr(R5, R5, AsmOperand(R6, lsl, lsl_shift)); 1321 __ logical_shift_right(R6, R6, lsr_shift); 1322 __ orr(R6, R6, AsmOperand(R7, lsl, lsl_shift)); 1323 if (split_write) { 1324 // write the first half as soon as possible to reduce stm locking 1325 __ stmia(to, RegisterSet(R3, R6), writeback, prefetch_before ? gt : ge); 1326 } 1327 __ logical_shift_right(R7, R7, lsr_shift); 1328 __ orr(R7, R7, AsmOperand(R8, lsl, lsl_shift)); 1329 __ logical_shift_right(R8, R8, lsr_shift); 1330 __ orr(R8, R8, AsmOperand(R9, lsl, lsl_shift)); 1331 __ logical_shift_right(R9, R9, lsr_shift); 1332 __ orr(R9, R9, AsmOperand(R10, lsl, lsl_shift)); 1333 __ logical_shift_right(R10, R10, lsr_shift); 1334 __ orr(R10, R10, AsmOperand(R12, lsl, lsl_shift)); 1335 1336 if (split_write) { 1337 __ stmia(to, RegisterSet(R7, R10), writeback, prefetch_before ? gt : ge); 1338 } else { 1339 __ stmia(to, RegisterSet(R3, R10), writeback, prefetch_before ? 
gt : ge); 1340 } 1341 __ b(L_shifted_loop, gt); // no need to loop if 0 (when count need not be precise modulo bytes_per_loop) 1342 1343 if (prefetch_before) { 1344 // the first loop may end earlier, allowing to skip pld at the end 1345 __ cmn_32(count, (bytes_per_loop + pld_offset)/bytes_per_count); 1346 __ stmia(to, RegisterSet(R3, R10), writeback); // stmia was skipped 1347 __ b(L_skip_pld, ge); 1348 __ adds_32(count, count, ((bytes_per_loop + pld_offset) / bytes_per_count) + count_per_loop); 1349 } 1350 1351 __ BIND(L_last_read); 1352 __ b(L_done, eq); 1353 1354 switch (bytes_per_count) { 1355 case 2: 1356 __ mov(R3, AsmOperand(R12, lsr, lsr_shift)); 1357 __ tst(count, 8); 1358 __ ldmia(from, RegisterSet(R4, R7), writeback, ne); 1359 __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), ne); // merged below low bytes of next val 1360 __ mov(R4, AsmOperand(R4, lsr, lsr_shift), ne); // unused part of next val 1361 __ orr(R4, R4, AsmOperand(R5, lsl, lsl_shift), ne); // ... 1362 __ mov(R5, AsmOperand(R5, lsr, lsr_shift), ne); 1363 __ orr(R5, R5, AsmOperand(R6, lsl, lsl_shift), ne); 1364 __ mov(R6, AsmOperand(R6, lsr, lsr_shift), ne); 1365 __ orr(R6, R6, AsmOperand(R7, lsl, lsl_shift), ne); 1366 __ stmia(to, RegisterSet(R3, R6), writeback, ne); 1367 __ mov(R3, AsmOperand(R7, lsr, lsr_shift), ne); 1368 1369 __ tst(count, 4); 1370 __ ldmia(from, RegisterSet(R4, R5), writeback, ne); 1371 __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), ne); // merged below low bytes of next val 1372 __ mov(R4, AsmOperand(R4, lsr, lsr_shift), ne); // unused part of next val 1373 __ orr(R4, R4, AsmOperand(R5, lsl, lsl_shift), ne); // ... 1374 __ stmia(to, RegisterSet(R3, R4), writeback, ne); 1375 __ mov(R3, AsmOperand(R5, lsr, lsr_shift), ne); 1376 1377 __ tst(count, 2); 1378 __ ldr(R4, Address(from, 4, post_indexed), ne); 1379 __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), ne); 1380 __ str(R3, Address(to, 4, post_indexed), ne); 1381 __ mov(R3, AsmOperand(R4, lsr, lsr_shift), ne); 1382 1383 __ tst(count, 1); 1384 __ strh(R3, Address(to, 2, post_indexed), ne); // one last short 1385 break; 1386 1387 case 1: 1388 __ mov(R3, AsmOperand(R12, lsr, lsr_shift)); 1389 __ tst(count, 16); 1390 __ ldmia(from, RegisterSet(R4, R7), writeback, ne); 1391 __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), ne); // merged below low bytes of next val 1392 __ mov(R4, AsmOperand(R4, lsr, lsr_shift), ne); // unused part of next val 1393 __ orr(R4, R4, AsmOperand(R5, lsl, lsl_shift), ne); // ... 1394 __ mov(R5, AsmOperand(R5, lsr, lsr_shift), ne); 1395 __ orr(R5, R5, AsmOperand(R6, lsl, lsl_shift), ne); 1396 __ mov(R6, AsmOperand(R6, lsr, lsr_shift), ne); 1397 __ orr(R6, R6, AsmOperand(R7, lsl, lsl_shift), ne); 1398 __ stmia(to, RegisterSet(R3, R6), writeback, ne); 1399 __ mov(R3, AsmOperand(R7, lsr, lsr_shift), ne); 1400 1401 __ tst(count, 8); 1402 __ ldmia(from, RegisterSet(R4, R5), writeback, ne); 1403 __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), ne); // merged below low bytes of next val 1404 __ mov(R4, AsmOperand(R4, lsr, lsr_shift), ne); // unused part of next val 1405 __ orr(R4, R4, AsmOperand(R5, lsl, lsl_shift), ne); // ... 
        __ stmia(to, RegisterSet(R3, R4), writeback, ne);
        __ mov(R3, AsmOperand(R5, lsr, lsr_shift), ne);

        __ tst(count, 4);
        __ ldr(R4, Address(from, 4, post_indexed), ne);
        __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), ne);
        __ str(R3, Address(to, 4, post_indexed), ne);
        __ mov(R3, AsmOperand(R4, lsr, lsr_shift), ne);

        __ andr(count, count, 3);
        __ cmp(count, 2);

        // Note: R3 might contain enough bytes ready to write (3 needed at most),
        // thus load on lsl_shift==24 is not needed (in fact forces reading
        // beyond source buffer end boundary)
        if (lsl_shift == 8) {
          __ ldr(R4, Address(from, 4, post_indexed), ge);
          __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), ge);
        } else if (lsl_shift == 16) {
          __ ldr(R4, Address(from, 4, post_indexed), gt);
          __ orr(R3, R3, AsmOperand(R4, lsl, lsl_shift), gt);
        }

        __ strh(R3, Address(to, 2, post_indexed), ge); // two last bytes
        __ mov(R3, AsmOperand(R3, lsr, 16), gt);

        __ tst(count, 1);
        __ strb(R3, Address(to, 1, post_indexed), ne); // one last byte
        break;
    }

    __ BIND(L_done);
    return 0; // no minimum
  }

  // Generate the inner loop for shifted backward array copy (unaligned copy).
  // It can be used when bytes_per_count < wordSize, i.e. byte/short copy
  //
  // Arguments
  //      end_from:        end src address, 64 bits aligned
  //      end_to:          end dst address, (now) wordSize aligned
  //      count:           number of elements (32-bit int)
  //      bytes_per_count: number of bytes for each unit of 'count'
  //      lsl_shift:       shift applied to 'old' value to skip already written bytes
  //      lsr_shift:       shift applied to 'new' value to set the low bytes of the next write
  //
  // Return the minimum initial value for count
  //
  // Notes:
  // - 'end_from' aligned on 64-bit (recommended for 32-bit ARM in case this speeds up LDMIA)
  // - 'end_to' aligned on wordSize
  // - 'count' must be greater than or equal to the returned value
  // - 'lsr_shift' + 'lsl_shift' = 'BitsPerWord'
  // - 'bytes_per_count' is 1 or 2 on 32-bit ARM
  //
  // Decreases 'end_to' by count*bytes_per_count.
  //
  // Scratches 'end_from', 'count', R3-R10, R12
  //
  // On entry:
  // - R3 is preloaded with the first 'BitsPerWord' bits read just after 'from'
  // - (R3 << lsl_shift) is the part not yet written
  // --> (*--to) = (R3 << lsl_shift) | ((*--from) >> lsr_shift); ...
  //
  // This implementation may read more bytes than required.
  // Actually, it always reads exactly all data from the copied region with the beginning aligned down by wordSize,
  // so the excessive reads do not cross a word boundary and are thus harmless.
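
  // Illustrative sketch (not generated code) of one step of this backward
  // shifted copy, with 'prev' playing the role of R3 above:
  //
  //   uint32_t prev = /* preloaded by the caller, as described above */;
  //   while (words_left--) {
  //     uint32_t cur = *--from_word;
  //     *--to_word = (prev << lsl_shift) | (cur >> lsr_shift);
  //     prev = cur;
  //   }
  //
  // The real loop below handles eight words per iteration with LDMDB/STMDB and
  // folds the shifts into the ORR operands; 'words_left', 'from_word' and
  // 'to_word' are hypothetical names used only for this sketch.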
1473 // 1474 int generate_backward_shifted_copy_loop(Register end_from, Register end_to, Register count, int bytes_per_count, int lsr_shift, int lsl_shift) { 1475 assert (end_from == R0 && end_to == R1 && count == R2, "adjust the implementation below"); 1476 1477 const int bytes_per_loop = 8*wordSize; // 8 registers are read and written on every loop iter 1478 const int count_per_loop = bytes_per_loop / bytes_per_count; 1479 1480 arraycopy_loop_config *config=&arraycopy_configurations[ArmCopyPlatform].backward_shifted; 1481 int pld_offset = config->pld_distance; 1482 1483 bool split_read= config->split_ldm; 1484 bool split_write= config->split_stm; 1485 1486 1487 const bool prefetch_before = pld_offset < 0; 1488 const bool prefetch_after = pld_offset > 0; 1489 1490 Label L_skip_pld, L_done, L_last_read; 1491 if (pld_offset != 0) { 1492 1493 pld_offset = (pld_offset < 0) ? -pld_offset : pld_offset; 1494 1495 prefetch(end_from, end_to, -wordSize); 1496 1497 if (prefetch_before) { 1498 __ cmp_32(count, count_per_loop); 1499 __ b(L_last_read, lt); 1500 1501 // skip prefetch for small copies 1502 // warning: count is predecreased by the prefetch distance to optimize the inner loop 1503 __ subs_32(count, count, ((bytes_per_loop + pld_offset)/bytes_per_count) + count_per_loop); 1504 __ b(L_skip_pld, lt); 1505 } 1506 1507 int offset = ArmCopyCacheLineSize; 1508 while (offset <= pld_offset) { 1509 prefetch(end_from, end_to, -(wordSize + offset)); 1510 offset += ArmCopyCacheLineSize; 1511 }; 1512 } 1513 1514 Label L_shifted_loop; 1515 __ align(OptoLoopAlignment); 1516 __ BIND(L_shifted_loop); 1517 1518 if (prefetch_before) { 1519 // do the 1st ldm/ldp first anyway (no locking issues with early STM/STP) 1520 prefetch(end_from, end_to, -(wordSize + bytes_per_loop + pld_offset)); 1521 __ BIND(L_skip_pld); 1522 } else { 1523 __ cmp_32(count, count_per_loop); 1524 __ b(L_last_read, lt); 1525 } 1526 1527 if (split_read) { 1528 __ ldmdb(end_from, RegisterSet(R7, R10), writeback); 1529 __ mov(R12, AsmOperand(R3, lsl, lsl_shift)); // part of R3 not yet written 1530 __ ldmdb(end_from, RegisterSet(R3, R6), writeback); 1531 } else { 1532 __ mov(R12, AsmOperand(R3, lsl, lsl_shift)); // part of R3 not yet written 1533 __ ldmdb(end_from, RegisterSet(R3, R10), writeback); 1534 } 1535 1536 __ subs_32(count, count, count_per_loop); 1537 1538 if (prefetch_after) { // do prefetch during ldm/ldp latency 1539 prefetch(end_from, end_to, -(wordSize + pld_offset), -bytes_per_loop); 1540 } 1541 1542 // prepare the values in R4..R10,R12 1543 __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift)); // merged above high bytes of prev val 1544 __ logical_shift_left(R10, R10, lsl_shift); // unused part of prev val 1545 __ orr(R10, R10, AsmOperand(R9, lsr, lsr_shift)); // ... 1546 __ logical_shift_left(R9, R9, lsl_shift); 1547 __ orr(R9, R9, AsmOperand(R8, lsr, lsr_shift)); 1548 __ logical_shift_left(R8, R8, lsl_shift); 1549 __ orr(R8, R8, AsmOperand(R7, lsr, lsr_shift)); 1550 __ logical_shift_left(R7, R7, lsl_shift); 1551 __ orr(R7, R7, AsmOperand(R6, lsr, lsr_shift)); 1552 __ logical_shift_left(R6, R6, lsl_shift); 1553 __ orr(R6, R6, AsmOperand(R5, lsr, lsr_shift)); 1554 if (split_write) { 1555 // store early to reduce locking issues 1556 __ stmdb(end_to, RegisterSet(R6, R10) | R12, writeback, prefetch_before ? 
gt : ge); 1557 } 1558 __ logical_shift_left(R5, R5, lsl_shift); 1559 __ orr(R5, R5, AsmOperand(R4, lsr, lsr_shift)); 1560 __ logical_shift_left(R4, R4, lsl_shift); 1561 __ orr(R4, R4, AsmOperand(R3, lsr, lsr_shift)); 1562 1563 if (split_write) { 1564 __ stmdb(end_to, RegisterSet(R4, R5), writeback, prefetch_before ? gt : ge); 1565 } else { 1566 __ stmdb(end_to, RegisterSet(R4, R10) | R12, writeback, prefetch_before ? gt : ge); 1567 } 1568 1569 __ b(L_shifted_loop, gt); // no need to loop if 0 (when count need not be precise modulo bytes_per_loop) 1570 1571 if (prefetch_before) { 1572 // the first loop may end earlier, allowing to skip pld at the end 1573 __ cmn_32(count, ((bytes_per_loop + pld_offset)/bytes_per_count)); 1574 __ stmdb(end_to, RegisterSet(R4, R10) | R12, writeback); // stmdb was skipped 1575 __ b(L_skip_pld, ge); 1576 __ adds_32(count, count, ((bytes_per_loop + pld_offset) / bytes_per_count) + count_per_loop); 1577 } 1578 1579 __ BIND(L_last_read); 1580 __ b(L_done, eq); 1581 1582 switch(bytes_per_count) { 1583 case 2: 1584 __ mov(R12, AsmOperand(R3, lsl, lsl_shift)); // part of R3 not yet written 1585 __ tst(count, 8); 1586 __ ldmdb(end_from, RegisterSet(R7,R10), writeback, ne); 1587 __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift), ne); 1588 __ mov(R10, AsmOperand(R10, lsl, lsl_shift),ne); // unused part of prev val 1589 __ orr(R10, R10, AsmOperand(R9, lsr, lsr_shift),ne); // ... 1590 __ mov(R9, AsmOperand(R9, lsl, lsl_shift),ne); 1591 __ orr(R9, R9, AsmOperand(R8, lsr, lsr_shift),ne); 1592 __ mov(R8, AsmOperand(R8, lsl, lsl_shift),ne); 1593 __ orr(R8, R8, AsmOperand(R7, lsr, lsr_shift),ne); 1594 __ stmdb(end_to, RegisterSet(R8,R10)|R12, writeback, ne); 1595 __ mov(R12, AsmOperand(R7, lsl, lsl_shift), ne); 1596 1597 __ tst(count, 4); 1598 __ ldmdb(end_from, RegisterSet(R9, R10), writeback, ne); 1599 __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift), ne); 1600 __ mov(R10, AsmOperand(R10, lsl, lsl_shift),ne); // unused part of prev val 1601 __ orr(R10, R10, AsmOperand(R9, lsr,lsr_shift),ne); // ... 1602 __ stmdb(end_to, RegisterSet(R10)|R12, writeback, ne); 1603 __ mov(R12, AsmOperand(R9, lsl, lsl_shift), ne); 1604 1605 __ tst(count, 2); 1606 __ ldr(R10, Address(end_from, -4, pre_indexed), ne); 1607 __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift), ne); 1608 __ str(R12, Address(end_to, -4, pre_indexed), ne); 1609 __ mov(R12, AsmOperand(R10, lsl, lsl_shift), ne); 1610 1611 __ tst(count, 1); 1612 __ mov(R12, AsmOperand(R12, lsr, lsr_shift),ne); 1613 __ strh(R12, Address(end_to, -2, pre_indexed), ne); // one last short 1614 break; 1615 1616 case 1: 1617 __ mov(R12, AsmOperand(R3, lsl, lsl_shift)); // part of R3 not yet written 1618 __ tst(count, 16); 1619 __ ldmdb(end_from, RegisterSet(R7,R10), writeback, ne); 1620 __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift), ne); 1621 __ mov(R10, AsmOperand(R10, lsl, lsl_shift),ne); // unused part of prev val 1622 __ orr(R10, R10, AsmOperand(R9, lsr, lsr_shift),ne); // ... 
1623 __ mov(R9, AsmOperand(R9, lsl, lsl_shift),ne); 1624 __ orr(R9, R9, AsmOperand(R8, lsr, lsr_shift),ne); 1625 __ mov(R8, AsmOperand(R8, lsl, lsl_shift),ne); 1626 __ orr(R8, R8, AsmOperand(R7, lsr, lsr_shift),ne); 1627 __ stmdb(end_to, RegisterSet(R8,R10)|R12, writeback, ne); 1628 __ mov(R12, AsmOperand(R7, lsl, lsl_shift), ne); 1629 1630 __ tst(count, 8); 1631 __ ldmdb(end_from, RegisterSet(R9,R10), writeback, ne); 1632 __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift), ne); 1633 __ mov(R10, AsmOperand(R10, lsl, lsl_shift),ne); // unused part of prev val 1634 __ orr(R10, R10, AsmOperand(R9, lsr, lsr_shift),ne); // ... 1635 __ stmdb(end_to, RegisterSet(R10)|R12, writeback, ne); 1636 __ mov(R12, AsmOperand(R9, lsl, lsl_shift), ne); 1637 1638 __ tst(count, 4); 1639 __ ldr(R10, Address(end_from, -4, pre_indexed), ne); 1640 __ orr(R12, R12, AsmOperand(R10, lsr, lsr_shift), ne); 1641 __ str(R12, Address(end_to, -4, pre_indexed), ne); 1642 __ mov(R12, AsmOperand(R10, lsl, lsl_shift), ne); 1643 1644 __ tst(count, 2); 1645 if (lsr_shift != 24) { 1646 // avoid useless reading R10 when we already have 3 bytes ready in R12 1647 __ ldr(R10, Address(end_from, -4, pre_indexed), ne); 1648 __ orr(R12, R12, AsmOperand(R10, lsr,lsr_shift), ne); 1649 } 1650 1651 // Note: R12 contains enough bytes ready to write (3 needed at most) 1652 // write the 2 MSBs 1653 __ mov(R9, AsmOperand(R12, lsr, 16), ne); 1654 __ strh(R9, Address(end_to, -2, pre_indexed), ne); 1655 // promote remaining to MSB 1656 __ mov(R12, AsmOperand(R12, lsl, 16), ne); 1657 1658 __ tst(count, 1); 1659 // write the MSB of R12 1660 __ mov(R12, AsmOperand(R12, lsr, 24), ne); 1661 __ strb(R12, Address(end_to, -1, pre_indexed), ne); 1662 1663 break; 1664 } 1665 1666 __ BIND(L_done); 1667 return 0; // no minimum 1668 } 1669 1670 // This method is very useful for merging forward/backward implementations 1671 Address get_addr_with_indexing(Register base, int delta, bool forward) { 1672 if (forward) { 1673 return Address(base, delta, post_indexed); 1674 } else { 1675 return Address(base, -delta, pre_indexed); 1676 } 1677 } 1678 1679 void load_one(Register rd, Register from, int size_in_bytes, bool forward, AsmCondition cond = al, Register rd2 = noreg) { 1680 assert_different_registers(from, rd, rd2); 1681 if (size_in_bytes < 8) { 1682 Address addr = get_addr_with_indexing(from, size_in_bytes, forward); 1683 __ load_sized_value(rd, addr, size_in_bytes, false, cond); 1684 } else { 1685 assert (rd2 != noreg, "second value register must be specified"); 1686 assert (rd->encoding() < rd2->encoding(), "wrong value register set"); 1687 1688 if (forward) { 1689 __ ldmia(from, RegisterSet(rd) | rd2, writeback, cond); 1690 } else { 1691 __ ldmdb(from, RegisterSet(rd) | rd2, writeback, cond); 1692 } 1693 } 1694 } 1695 1696 void store_one(Register rd, Register to, int size_in_bytes, bool forward, AsmCondition cond = al, Register rd2 = noreg) { 1697 assert_different_registers(to, rd, rd2); 1698 if (size_in_bytes < 8) { 1699 Address addr = get_addr_with_indexing(to, size_in_bytes, forward); 1700 __ store_sized_value(rd, addr, size_in_bytes, cond); 1701 } else { 1702 assert (rd2 != noreg, "second value register must be specified"); 1703 assert (rd->encoding() < rd2->encoding(), "wrong value register set"); 1704 1705 if (forward) { 1706 __ stmia(to, RegisterSet(rd) | rd2, writeback, cond); 1707 } else { 1708 __ stmdb(to, RegisterSet(rd) | rd2, writeback, cond); 1709 } 1710 } 1711 } 1712 1713 // Copies data from 'from' to 'to' in specified direction to align 'from' by 
64 bits.
  // (on 32-bit ARM 64-bit alignment is better for LDM).
  //
  // Arguments:
  //     from:              beginning (if forward) or upper bound (if !forward) of the region to be read
  //     to:                beginning (if forward) or upper bound (if !forward) of the region to be written
  //     count:             32-bit int, maximum number of elements which can be copied
  //     bytes_per_count:   size of an element
  //     forward:           specifies copy direction
  //
  // Notes:
  //   'from' and 'to' must be aligned by 'bytes_per_count'
  //   'count' must not be less than the returned value
  //   shifts 'from' and 'to' by the number of copied bytes in corresponding direction
  //   decreases 'count' by the number of elements copied
  //
  // Returns maximum number of elements which may be copied.
  int align_src(Register from, Register to, Register count, Register tmp, int bytes_per_count, bool forward) {
    assert_different_registers(from, to, count, tmp);
    if (bytes_per_count < 8) {
      Label L_align_src;
      __ BIND(L_align_src);
      __ tst(from, 7);
      // ne => not aligned: copy one element and (if bytes_per_count < 4) loop
      __ sub(count, count, 1, ne);
      load_one(tmp, from, bytes_per_count, forward, ne);
      store_one(tmp, to, bytes_per_count, forward, ne);
      if (bytes_per_count < 4) {
        __ b(L_align_src, ne); // if bytes_per_count == 4, then 0 or 1 loop iterations are enough
      }
    }
    return 7/bytes_per_count;
  }

  // Copies 'count' of 'bytes_per_count'-sized elements in the specified direction.
  //
  // Arguments:
  //     from:              beginning (if forward) or upper bound (if !forward) of the region to be read
  //     to:                beginning (if forward) or upper bound (if !forward) of the region to be written
  //     count:             32-bit int, number of elements to be copied
  //     entry:             copy loop entry point
  //     bytes_per_count:   size of an element
  //     forward:           specifies copy direction
  //
  // Notes:
  //     shifts 'from' and 'to' by the number of copied bytes
  void copy_small_array(Register from, Register to, Register count, Register tmp, Register tmp2, int bytes_per_count, bool forward, Label & entry, bool unsafe_copy = false) {
    assert_different_registers(from, to, count, tmp);

    {
      // UnsafeCopyMemory page error: continue after ucm
      UnsafeCopyMemoryMark ucmm(this, unsafe_copy, true);
      __ align(OptoLoopAlignment);
      Label L_small_loop;
      __ BIND(L_small_loop);
      store_one(tmp, to, bytes_per_count, forward, al, tmp2);
      __ BIND(entry); // entry point
      __ subs(count, count, 1);
      load_one(tmp, from, bytes_per_count, forward, ge, tmp2);
      __ b(L_small_loop, ge);
    }
  }

  // Aligns 'to' by reading one word from 'from' and writing its part to 'to'.
  //
  // Arguments:
  //     to:                beginning (if forward) or upper bound (if !forward) of the region to be written
  //     count:             32-bit int, number of elements allowed to be copied
  //     to_remainder:      remainder of dividing 'to' by wordSize
  //     bytes_per_count:   size of an element
  //     forward:           specifies copy direction
  //     Rval:              contains an already read but not yet written word;
  //                        its LSBs (if forward) or MSBs (if !forward) are to be written to align 'to'.
  //
  // Notes:
  //     'count' must not be less than the returned value
  //     'to' must be aligned by bytes_per_count but must not be aligned by wordSize
  //     shifts 'to' by the number of written bytes (so that it becomes the bound of memory to be written)
  //     decreases 'count' by the number of elements written
  //     Rval's MSBs or LSBs remain to be written further by generate_{forward,backward}_shifted_copy_loop
  int align_dst(Register to, Register count, Register Rval, Register tmp,
                int to_remainder, int bytes_per_count, bool forward) {
    assert_different_registers(to, count, tmp, Rval);

    assert (0 < to_remainder && to_remainder < wordSize, "to_remainder is not valid");
    assert (to_remainder % bytes_per_count == 0, "to must be aligned by bytes_per_count");

    int bytes_to_write = forward ? (wordSize - to_remainder) : to_remainder;

    int offset = 0;

    for (int l = 0; l < LogBytesPerWord; ++l) {
      int s = (1 << l);
      if (bytes_to_write & s) {
        int new_offset = offset + s*BitsPerByte;
        if (forward) {
          if (offset == 0) {
            store_one(Rval, to, s, forward);
          } else {
            __ logical_shift_right(tmp, Rval, offset);
            store_one(tmp, to, s, forward);
          }
        } else {
          __ logical_shift_right(tmp, Rval, BitsPerWord - new_offset);
          store_one(tmp, to, s, forward);
        }

        offset = new_offset;
      }
    }

    assert (offset == bytes_to_write * BitsPerByte, "all bytes must be copied");

    __ sub_32(count, count, bytes_to_write/bytes_per_count);

    return bytes_to_write / bytes_per_count;
  }

  // Copies 'count' elements using a shifted copy loop.
  //
  // Arguments:
  //     from:              beginning (if forward) or upper bound (if !forward) of the region to be read
  //     to:                beginning (if forward) or upper bound (if !forward) of the region to be written
  //     count:             32-bit int, number of elements to be copied
  //     to_remainder:      remainder of dividing 'to' by wordSize
  //     bytes_per_count:   size of an element
  //     forward:           specifies copy direction
  //     Rval:              contains an already read but not yet written word
  //
  //
  // Notes:
  //     'count' must not be less than the returned value
  //     'from' must be aligned by wordSize
  //     'to' must be aligned by bytes_per_count but must not be aligned by wordSize
  //     shifts 'to' by the number of copied bytes
  //
  // Scratches R3-R10, R12
  int align_dst_and_generate_shifted_copy_loop(Register from, Register to, Register count, Register Rval,
                                               int to_remainder, int bytes_per_count, bool forward) {

    assert (0 < to_remainder && to_remainder < wordSize, "to_remainder is invalid");

    const Register tmp = forward ? R3 : R12;
    assert_different_registers(from, to, count, Rval, tmp);

    int required_to_align = align_dst(to, count, Rval, tmp, to_remainder, bytes_per_count, forward);

    int lsr_shift = (wordSize - to_remainder) * BitsPerByte;
    int lsl_shift = to_remainder * BitsPerByte;

    int min_copy;
    if (forward) {
      min_copy = generate_forward_shifted_copy_loop(from, to, count, bytes_per_count, lsr_shift, lsl_shift);
    } else {
      min_copy = generate_backward_shifted_copy_loop(from, to, count, bytes_per_count, lsr_shift, lsl_shift);
    }

    return min_copy + required_to_align;
  }

  // Copies 'count' elements using a shifted copy loop.
  //
  // Arguments:
  //     from:              beginning (if forward) or upper bound (if !forward) of the region to be read
  //     to:                beginning (if forward) or upper bound (if !forward) of the region to be written
  //     count:             32-bit int, number of elements to be copied
  //     bytes_per_count:   size of an element
  //     forward:           specifies copy direction
  //
  // Notes:
  //     'count' must not be less than the returned value
  //     'from' must be aligned by wordSize
  //     'to' must be aligned by bytes_per_count but must not be aligned by wordSize
  //     shifts 'to' by the number of copied bytes
  //
  // Scratches 'from', 'count', R3 and R12.
  // R4-R10 are saved and restored around their use as temporaries.
  int align_dst_and_generate_shifted_copy_loop(Register from, Register to, Register count, int bytes_per_count, bool forward, bool unsafe_copy = false) {

    const Register Rval = forward ? R12 : R3; // as generate_{forward,backward}_shifted_copy_loop expect

    int min_copy = 0;

    // Note: if {seq} is a sequence of numbers, L{seq} means that if the execution reaches this point,
    // then the remainder of 'to' divided by wordSize is one of the elements of {seq}.
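    // Editor's illustrative note (sketch only, not generated or compiled code):
    // with wordSize == 4, a destination remainder to_remainder == 2 and
    // bytes_per_count == 2 give lsr_shift == lsl_shift == 16, and for
    // bytes_per_count == 1 the dispatch below picks one of the remainders
    // {1, 2, 3} from the two low bits of 'to'. The shifted loops then assemble
    // every destination word from two adjacent aligned source words, roughly:
    //
    //   dst_word = (w_a >> lsr_shift) | (w_b << lsl_shift);
    //
    // where which word is shifted right and which is shifted left depends on
    // the copy direction ('w_a'/'w_b' are illustrative names, not registers
    // used by the generated code).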
1898 1899 __ push(RegisterSet(R4,R10)); 1900 1901 { 1902 // UnsafeCopyMemory page error: continue after ucm 1903 UnsafeCopyMemoryMark ucmm(this, unsafe_copy, true); 1904 load_one(Rval, from, wordSize, forward); 1905 1906 switch (bytes_per_count) { 1907 case 2: 1908 min_copy = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 2, bytes_per_count, forward); 1909 break; 1910 case 1: 1911 { 1912 Label L1, L2, L3; 1913 int min_copy1, min_copy2, min_copy3; 1914 1915 Label L_loop_finished; 1916 1917 if (forward) { 1918 __ tbz(to, 0, L2); 1919 __ tbz(to, 1, L1); 1920 1921 __ BIND(L3); 1922 min_copy3 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 3, bytes_per_count, forward); 1923 __ b(L_loop_finished); 1924 1925 __ BIND(L1); 1926 min_copy1 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 1, bytes_per_count, forward); 1927 __ b(L_loop_finished); 1928 1929 __ BIND(L2); 1930 min_copy2 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 2, bytes_per_count, forward); 1931 } else { 1932 __ tbz(to, 0, L2); 1933 __ tbnz(to, 1, L3); 1934 1935 __ BIND(L1); 1936 min_copy1 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 1, bytes_per_count, forward); 1937 __ b(L_loop_finished); 1938 1939 __ BIND(L3); 1940 min_copy3 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 3, bytes_per_count, forward); 1941 __ b(L_loop_finished); 1942 1943 __ BIND(L2); 1944 min_copy2 = align_dst_and_generate_shifted_copy_loop(from, to, count, Rval, 2, bytes_per_count, forward); 1945 } 1946 1947 min_copy = MAX2(MAX2(min_copy1, min_copy2), min_copy3); 1948 1949 __ BIND(L_loop_finished); 1950 1951 break; 1952 } 1953 default: 1954 ShouldNotReachHere(); 1955 break; 1956 } 1957 } 1958 __ pop(RegisterSet(R4,R10)); 1959 1960 return min_copy; 1961 } 1962 1963 #ifndef PRODUCT 1964 int * get_arraycopy_counter(int bytes_per_count) { 1965 switch (bytes_per_count) { 1966 case 1: 1967 return &SharedRuntime::_jbyte_array_copy_ctr; 1968 case 2: 1969 return &SharedRuntime::_jshort_array_copy_ctr; 1970 case 4: 1971 return &SharedRuntime::_jint_array_copy_ctr; 1972 case 8: 1973 return &SharedRuntime::_jlong_array_copy_ctr; 1974 default: 1975 ShouldNotReachHere(); 1976 return NULL; 1977 } 1978 } 1979 #endif // !PRODUCT 1980 1981 address generate_unsafecopy_common_error_exit() { 1982 address start_pc = __ pc(); 1983 __ mov(R0, 0); 1984 __ ret(); 1985 return start_pc; 1986 } 1987 1988 // 1989 // Generate stub for primitive array copy. If "aligned" is true, the 1990 // "from" and "to" addresses are assumed to be heapword aligned. 1991 // 1992 // If "disjoint" is true, arrays are assumed to be disjoint, otherwise they may overlap and 1993 // "nooverlap_target" must be specified as the address to jump if they don't. 
1994 // 1995 // Arguments for generated stub: 1996 // from: R0 1997 // to: R1 1998 // count: R2 treated as signed 32-bit int 1999 // 2000 address generate_primitive_copy(bool aligned, const char * name, bool status, int bytes_per_count, bool disjoint, address nooverlap_target = NULL) { 2001 __ align(CodeEntryAlignment); 2002 StubCodeMark mark(this, "StubRoutines", name); 2003 address start = __ pc(); 2004 2005 const Register from = R0; // source array address 2006 const Register to = R1; // destination array address 2007 const Register count = R2; // elements count 2008 const Register tmp1 = R3; 2009 const Register tmp2 = R12; 2010 2011 if (!aligned) { 2012 BLOCK_COMMENT("Entry:"); 2013 } 2014 2015 __ zap_high_non_significant_bits(R2); 2016 2017 if (!disjoint) { 2018 assert (nooverlap_target != NULL, "must be specified for conjoint case"); 2019 array_overlap_test(nooverlap_target, exact_log2(bytes_per_count), tmp1, tmp2); 2020 } 2021 2022 inc_counter_np(*get_arraycopy_counter(bytes_per_count), tmp1, tmp2); 2023 2024 // Conjoint case: since execution reaches this point, the arrays overlap, so performing backward copy 2025 // Disjoint case: perform forward copy 2026 bool forward = disjoint; 2027 2028 2029 if (!forward) { 2030 // Set 'from' and 'to' to upper bounds 2031 int log_bytes_per_count = exact_log2(bytes_per_count); 2032 __ add_ptr_scaled_int32(to, to, count, log_bytes_per_count); 2033 __ add_ptr_scaled_int32(from, from, count, log_bytes_per_count); 2034 } 2035 2036 // There are two main copy loop implementations: 2037 // *) The huge and complex one applicable only for large enough arrays 2038 // *) The small and simple one applicable for any array (but not efficient for large arrays). 2039 // Currently "small" implementation is used if and only if the "large" one could not be used. 2040 // XXX optim: tune the limit higher ? 2041 // Large implementation lower applicability bound is actually determined by 2042 // aligned copy loop which require <=7 bytes for src alignment, and 8 words for aligned copy loop. 2043 const int small_copy_limit = (8*wordSize + 7) / bytes_per_count; 2044 2045 Label L_small_array; 2046 __ cmp_32(count, small_copy_limit); 2047 __ b(L_small_array, le); 2048 2049 // Otherwise proceed with large implementation. 2050 2051 bool from_is_aligned = (bytes_per_count >= 8); 2052 if (aligned && forward && (HeapWordSize % 8 == 0)) { 2053 // if 'from' is heapword aligned and HeapWordSize is divisible by 8, 2054 // then from is aligned by 8 2055 from_is_aligned = true; 2056 } 2057 2058 int count_required_to_align = 0; 2059 { 2060 // UnsafeCopyMemoryMark page error: continue at UnsafeCopyMemory common_error_exit 2061 UnsafeCopyMemoryMark ucmm(this, !aligned, false); 2062 count_required_to_align = from_is_aligned ? 0 : align_src(from, to, count, tmp1, bytes_per_count, forward); 2063 assert (small_copy_limit >= count_required_to_align, "alignment could exhaust count"); 2064 } 2065 2066 // now 'from' is aligned 2067 2068 bool to_is_aligned = false; 2069 2070 if (bytes_per_count >= wordSize) { 2071 // 'to' is aligned by bytes_per_count, so it is aligned by wordSize 2072 to_is_aligned = true; 2073 } else { 2074 if (aligned && (8 % HeapWordSize == 0) && (HeapWordSize % wordSize == 0)) { 2075 // Originally 'from' and 'to' were heapword aligned; 2076 // (from - to) has not been changed, so since now 'from' is 8-byte aligned, then it is also heapword aligned, 2077 // so 'to' is also heapword aligned and thus aligned by wordSize. 
        to_is_aligned = true;
      }
    }

    Label L_unaligned_dst;

    if (!to_is_aligned) {
      BLOCK_COMMENT("Check dst alignment:");
      __ tst(to, wordSize - 1);
      __ b(L_unaligned_dst, ne); // 'to' is not aligned
    }

    // 'from' and 'to' are properly aligned

    int min_copy;
    if (forward) {
      min_copy = generate_forward_aligned_copy_loop(from, to, count, bytes_per_count, !aligned /*add UnsafeCopyMemory entry*/);
    } else {
      min_copy = generate_backward_aligned_copy_loop(from, to, count, bytes_per_count, !aligned /*add UnsafeCopyMemory entry*/);
    }
    assert(small_copy_limit >= count_required_to_align + min_copy, "first loop might exhaust count");

    if (status) {
      __ mov(R0, 0); // OK
    }

    __ ret();

    {
      copy_small_array(from, to, count, tmp1, tmp2, bytes_per_count, forward, L_small_array /* entry */, !aligned /*add UnsafeCopyMemory entry*/);

      if (status) {
        __ mov(R0, 0); // OK
      }

      __ ret();
    }

    if (! to_is_aligned) {
      __ BIND(L_unaligned_dst);
      int min_copy_shifted = align_dst_and_generate_shifted_copy_loop(from, to, count, bytes_per_count, forward, !aligned /*add UnsafeCopyMemory entry*/);
      assert (small_copy_limit >= count_required_to_align + min_copy_shifted, "first loop might exhaust count");

      if (status) {
        __ mov(R0, 0); // OK
      }

      __ ret();
    }

    return start;
  }


  // Generates pattern of code to be placed after raw data copying in generate_oop_copy
  // Includes return from arraycopy stub.
  //
  // Arguments:
  //     to:       destination pointer after copying.
  //               if 'forward' then 'to' == upper bound, else 'to' == beginning of the modified region
  //     count:    total number of copied elements, 32-bit int
  //
  // Blows all volatile (R0-R3, Rtemp, LR) and 'to', 'count', 'tmp' registers.
  void oop_arraycopy_stub_epilogue_helper(Register to, Register count, Register tmp, bool status, bool forward, DecoratorSet decorators) {
    assert_different_registers(to, count, tmp);

    if (forward) {
      // 'to' is upper bound of the modified region
      // restore initial dst:
      __ sub_ptr_scaled_int32(to, to, count, LogBytesPerHeapOop);
    }

    // 'to' is the beginning of the region

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_epilogue(_masm, decorators, true, to, count, tmp);

    if (status) {
      __ mov(R0, 0); // OK
    }

    __ pop(PC);
  }


  // Generate stub for assign-compatible oop copy.  If "aligned" is true, the
  // "from" and "to" addresses are assumed to be heapword aligned.
  //
  // If "disjoint" is true, arrays are assumed to be disjoint, otherwise they may overlap and
  // "nooverlap_target" must be specified as the address to jump if they don't.
2168 // 2169 // Arguments for generated stub: 2170 // from: R0 2171 // to: R1 2172 // count: R2 treated as signed 32-bit int 2173 // 2174 address generate_oop_copy(bool aligned, const char * name, bool status, bool disjoint, address nooverlap_target = NULL) { 2175 __ align(CodeEntryAlignment); 2176 StubCodeMark mark(this, "StubRoutines", name); 2177 address start = __ pc(); 2178 2179 Register from = R0; 2180 Register to = R1; 2181 Register count = R2; 2182 Register tmp1 = R3; 2183 Register tmp2 = R12; 2184 2185 2186 if (!aligned) { 2187 BLOCK_COMMENT("Entry:"); 2188 } 2189 2190 __ zap_high_non_significant_bits(R2); 2191 2192 if (!disjoint) { 2193 assert (nooverlap_target != NULL, "must be specified for conjoint case"); 2194 array_overlap_test(nooverlap_target, LogBytesPerHeapOop, tmp1, tmp2); 2195 } 2196 2197 inc_counter_np(SharedRuntime::_oop_array_copy_ctr, tmp1, tmp2); 2198 2199 // Conjoint case: since execution reaches this point, the arrays overlap, so performing backward copy 2200 // Disjoint case: perform forward copy 2201 bool forward = disjoint; 2202 2203 const int bytes_per_count = BytesPerHeapOop; 2204 const int log_bytes_per_count = LogBytesPerHeapOop; 2205 2206 const Register saved_count = LR; 2207 const int callee_saved_regs = 3; // R0-R2 2208 2209 // LR is used later to save barrier args 2210 __ push(LR); 2211 2212 DecoratorSet decorators = IN_HEAP | IS_ARRAY; 2213 if (disjoint) { 2214 decorators |= ARRAYCOPY_DISJOINT; 2215 } 2216 if (aligned) { 2217 decorators |= ARRAYCOPY_ALIGNED; 2218 } 2219 2220 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 2221 bs->arraycopy_prologue(_masm, decorators, true, to, count, callee_saved_regs); 2222 2223 // save arguments for barrier generation (after the pre barrier) 2224 __ mov(saved_count, count); 2225 2226 if (!forward) { 2227 __ add_ptr_scaled_int32(to, to, count, log_bytes_per_count); 2228 __ add_ptr_scaled_int32(from, from, count, log_bytes_per_count); 2229 } 2230 2231 // for short arrays, just do single element copy 2232 Label L_small_array; 2233 const int small_copy_limit = (8*wordSize + 7)/bytes_per_count; // XXX optim: tune the limit higher ? 2234 __ cmp_32(count, small_copy_limit); 2235 __ b(L_small_array, le); 2236 2237 bool from_is_aligned = (bytes_per_count >= 8); 2238 if (aligned && forward && (HeapWordSize % 8 == 0)) { 2239 // if 'from' is heapword aligned and HeapWordSize is divisible by 8, 2240 // then from is aligned by 8 2241 from_is_aligned = true; 2242 } 2243 2244 int count_required_to_align = from_is_aligned ? 0 : align_src(from, to, count, tmp1, bytes_per_count, forward); 2245 assert (small_copy_limit >= count_required_to_align, "alignment could exhaust count"); 2246 2247 // now 'from' is aligned 2248 2249 bool to_is_aligned = false; 2250 2251 if (bytes_per_count >= wordSize) { 2252 // 'to' is aligned by bytes_per_count, so it is aligned by wordSize 2253 to_is_aligned = true; 2254 } else { 2255 if (aligned && (8 % HeapWordSize == 0) && (HeapWordSize % wordSize == 0)) { 2256 // Originally 'from' and 'to' were heapword aligned; 2257 // (from - to) has not been changed, so since now 'from' is 8-byte aligned, then it is also heapword aligned, 2258 // so 'to' is also heapword aligned and thus aligned by wordSize. 
2259 to_is_aligned = true; 2260 } 2261 } 2262 2263 Label L_unaligned_dst; 2264 2265 if (!to_is_aligned) { 2266 BLOCK_COMMENT("Check dst alignment:"); 2267 __ tst(to, wordSize - 1); 2268 __ b(L_unaligned_dst, ne); // 'to' is not aligned 2269 } 2270 2271 int min_copy; 2272 if (forward) { 2273 min_copy = generate_forward_aligned_copy_loop(from, to, count, bytes_per_count); 2274 } else { 2275 min_copy = generate_backward_aligned_copy_loop(from, to, count, bytes_per_count); 2276 } 2277 assert(small_copy_limit >= count_required_to_align + min_copy, "first loop might exhaust count"); 2278 2279 oop_arraycopy_stub_epilogue_helper(to, saved_count, /* tmp */ tmp1, status, forward, decorators); 2280 2281 { 2282 copy_small_array(from, to, count, tmp1, noreg, bytes_per_count, forward, L_small_array); 2283 2284 oop_arraycopy_stub_epilogue_helper(to, saved_count, /* tmp */ tmp1, status, forward, decorators); 2285 } 2286 2287 if (!to_is_aligned) { 2288 __ BIND(L_unaligned_dst); 2289 ShouldNotReachHere(); 2290 int min_copy_shifted = align_dst_and_generate_shifted_copy_loop(from, to, count, bytes_per_count, forward); 2291 assert (small_copy_limit >= count_required_to_align + min_copy_shifted, "first loop might exhaust count"); 2292 2293 oop_arraycopy_stub_epilogue_helper(to, saved_count, /* tmp */ tmp1, status, forward, decorators); 2294 } 2295 2296 return start; 2297 } 2298 2299 // Generate 'unsafe' array copy stub 2300 // Though just as safe as the other stubs, it takes an unscaled 2301 // size_t argument instead of an element count. 2302 // 2303 // Arguments for generated stub: 2304 // from: R0 2305 // to: R1 2306 // count: R2 byte count, treated as ssize_t, can be zero 2307 // 2308 // Examines the alignment of the operands and dispatches 2309 // to a long, int, short, or byte copy loop. 2310 // 2311 address generate_unsafe_copy(const char* name) { 2312 2313 const Register R0_from = R0; // source array address 2314 const Register R1_to = R1; // destination array address 2315 const Register R2_count = R2; // elements count 2316 2317 const Register R3_bits = R3; // test copy of low bits 2318 2319 __ align(CodeEntryAlignment); 2320 StubCodeMark mark(this, "StubRoutines", name); 2321 address start = __ pc(); 2322 const Register tmp = Rtemp; 2323 2324 // bump this on entry, not on exit: 2325 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr, R3, tmp); 2326 2327 __ orr(R3_bits, R0_from, R1_to); 2328 __ orr(R3_bits, R2_count, R3_bits); 2329 2330 __ tst(R3_bits, BytesPerLong-1); 2331 __ mov(R2_count,AsmOperand(R2_count,asr,LogBytesPerLong), eq); 2332 __ jump(StubRoutines::_jlong_arraycopy, relocInfo::runtime_call_type, tmp, eq); 2333 2334 __ tst(R3_bits, BytesPerInt-1); 2335 __ mov(R2_count,AsmOperand(R2_count,asr,LogBytesPerInt), eq); 2336 __ jump(StubRoutines::_jint_arraycopy, relocInfo::runtime_call_type, tmp, eq); 2337 2338 __ tst(R3_bits, BytesPerShort-1); 2339 __ mov(R2_count,AsmOperand(R2_count,asr,LogBytesPerShort), eq); 2340 __ jump(StubRoutines::_jshort_arraycopy, relocInfo::runtime_call_type, tmp, eq); 2341 2342 __ jump(StubRoutines::_jbyte_arraycopy, relocInfo::runtime_call_type, tmp); 2343 return start; 2344 } 2345 2346 // Helper for generating a dynamic type check. 2347 // Smashes only the given temp registers. 
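  // Editor's sketch of the check emitted below, in rough C-like pseudocode
  // (illustrative only, not compiled; the accessor names are approximations of
  // the shared HotSpot klass layout rather than code used here):
  //
  //   if (sub_klass == super_klass) goto L_success;                      // trivial match
  //   if (*(Klass**)((address)sub_klass + super_check_offset) == super_klass)
  //     goto L_success;                                                  // supertype display hit
  //   if (super_check_offset != Klass::secondary_super_cache_offset())
  //     goto L_fail;                                                     // display miss is decisive
  //   // slow path: linear scan of the secondary supers array
  //   for (int i = 0; i < sub_klass->secondary_supers()->length(); i++) {
  //     if (sub_klass->secondary_supers()->at(i) == super_klass) {
  //       // cache the hit in the secondary super cache, then succeed
  //       goto L_success;
  //     }
  //   }
  //   goto L_fail;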
2348 void generate_type_check(Register sub_klass, 2349 Register super_check_offset, 2350 Register super_klass, 2351 Register tmp1, 2352 Register tmp2, 2353 Register tmp3, 2354 Label& L_success) { 2355 assert_different_registers(sub_klass, super_check_offset, super_klass, tmp1, tmp2, tmp3); 2356 2357 BLOCK_COMMENT("type_check:"); 2358 2359 // If the pointers are equal, we are done (e.g., String[] elements). 2360 2361 __ cmp(super_klass, sub_klass); 2362 __ b(L_success, eq); // fast success 2363 2364 2365 Label L_loop, L_fail; 2366 2367 int sc_offset = in_bytes(Klass::secondary_super_cache_offset()); 2368 2369 // Check the supertype display: 2370 __ ldr(tmp1, Address(sub_klass, super_check_offset)); 2371 __ cmp(tmp1, super_klass); 2372 __ b(L_success, eq); 2373 2374 __ cmp(super_check_offset, sc_offset); 2375 __ b(L_fail, ne); // failure 2376 2377 BLOCK_COMMENT("type_check_slow_path:"); 2378 2379 // a couple of useful fields in sub_klass: 2380 int ss_offset = in_bytes(Klass::secondary_supers_offset()); 2381 2382 // Do a linear scan of the secondary super-klass chain. 2383 2384 #ifndef PRODUCT 2385 int* pst_counter = &SharedRuntime::_partial_subtype_ctr; 2386 __ inc_counter((address) pst_counter, tmp1, tmp2); 2387 #endif 2388 2389 Register scan_temp = tmp1; 2390 Register count_temp = tmp2; 2391 2392 // We will consult the secondary-super array. 2393 __ ldr(scan_temp, Address(sub_klass, ss_offset)); 2394 2395 Register search_key = super_klass; 2396 2397 // Load the array length. 2398 __ ldr_s32(count_temp, Address(scan_temp, Array<Klass*>::length_offset_in_bytes())); 2399 __ add(scan_temp, scan_temp, Array<Klass*>::base_offset_in_bytes()); 2400 2401 __ add(count_temp, count_temp, 1); 2402 2403 // Top of search loop 2404 __ bind(L_loop); 2405 // Notes: 2406 // scan_temp starts at the array elements 2407 // count_temp is 1+size 2408 2409 __ subs(count_temp, count_temp, 1); 2410 __ b(L_fail, eq); // not found 2411 2412 // Load next super to check 2413 // In the array of super classes elements are pointer sized. 2414 int element_size = wordSize; 2415 __ ldr(tmp3, Address(scan_temp, element_size, post_indexed)); 2416 2417 // Look for Rsuper_klass on Rsub_klass's secondary super-class-overflow list 2418 __ cmp(tmp3, search_key); 2419 2420 // A miss means we are NOT a subtype and need to keep looping 2421 __ b(L_loop, ne); 2422 2423 // Falling out the bottom means we found a hit; we ARE a subtype 2424 2425 // Success. Cache the super we found and proceed in triumph. 2426 __ str(super_klass, Address(sub_klass, sc_offset)); 2427 2428 // Jump to success 2429 __ b(L_success); 2430 2431 // Fall through on failure! 2432 __ bind(L_fail); 2433 } 2434 2435 // Generate stub for checked oop copy. 
2436 // 2437 // Arguments for generated stub: 2438 // from: R0 2439 // to: R1 2440 // count: R2 treated as signed 32-bit int 2441 // ckoff: R3 (super_check_offset) 2442 // ckval: R4 (super_klass) 2443 // ret: R0 zero for success; (-1^K) where K is partial transfer count (32-bit) 2444 // 2445 address generate_checkcast_copy(const char * name) { 2446 __ align(CodeEntryAlignment); 2447 StubCodeMark mark(this, "StubRoutines", name); 2448 address start = __ pc(); 2449 2450 const Register from = R0; // source array address 2451 const Register to = R1; // destination array address 2452 const Register count = R2; // elements count 2453 2454 const Register R3_ckoff = R3; // super_check_offset 2455 const Register R4_ckval = R4; // super_klass 2456 2457 const int callee_saved_regs = 4; // LR saved differently 2458 2459 Label load_element, store_element, do_epilogue, fail; 2460 2461 BLOCK_COMMENT("Entry:"); 2462 2463 __ zap_high_non_significant_bits(R2); 2464 2465 int pushed = 0; 2466 __ push(LR); 2467 pushed+=1; 2468 2469 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST; 2470 2471 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 2472 bs->arraycopy_prologue(_masm, decorators, true, to, count, callee_saved_regs); 2473 2474 const RegisterSet caller_saved_regs = RegisterSet(R4,R6) | RegisterSet(R8,R9) | altFP_7_11; 2475 __ push(caller_saved_regs); 2476 assert(caller_saved_regs.size() == 6, "check the count"); 2477 pushed+=6; 2478 2479 __ ldr(R4_ckval,Address(SP, wordSize*pushed)); // read the argument that was on the stack 2480 2481 // Save arguments for barrier generation (after the pre barrier): 2482 // - must be a caller saved register and not LR 2483 // - ARM32: avoid R10 in case RThread is needed 2484 const Register saved_count = altFP_7_11; 2485 __ movs(saved_count, count); // and test count 2486 __ b(load_element,ne); 2487 2488 // nothing to copy 2489 __ mov(R0, 0); 2490 2491 __ pop(caller_saved_regs); 2492 __ pop(PC); 2493 2494 // ======== begin loop ======== 2495 // (Loop is rotated; its entry is load_element.) 2496 __ align(OptoLoopAlignment); 2497 __ BIND(store_element); 2498 if (UseCompressedOops) { 2499 __ store_heap_oop(Address(to, BytesPerHeapOop, post_indexed), R5); // store the oop, changes flags 2500 __ subs_32(count,count,1); 2501 } else { 2502 __ subs_32(count,count,1); 2503 __ str(R5, Address(to, BytesPerHeapOop, post_indexed)); // store the oop 2504 } 2505 __ b(do_epilogue, eq); // count exhausted 2506 2507 // ======== loop entry is here ======== 2508 __ BIND(load_element); 2509 __ load_heap_oop(R5, Address(from, BytesPerHeapOop, post_indexed)); // load the oop 2510 __ cbz(R5, store_element); // NULL 2511 2512 __ load_klass(R6, R5); 2513 2514 generate_type_check(R6, R3_ckoff, R4_ckval, /*tmps*/ R12, R8, R9, 2515 // branch to this on success: 2516 store_element); 2517 // ======== end loop ======== 2518 2519 // It was a real error; we must depend on the caller to finish the job. 2520 // Register count has number of *remaining* oops, saved_count number of *total* oops. 
2521 // Emit GC store barriers for the oops we have copied 2522 // and report their number to the caller (0 or (-1^n)) 2523 __ BIND(fail); 2524 2525 // Note: fail marked by the fact that count differs from saved_count 2526 2527 __ BIND(do_epilogue); 2528 2529 Register copied = R4; // saved 2530 Label L_not_copied; 2531 2532 __ subs_32(copied, saved_count, count); // copied count (in saved reg) 2533 __ b(L_not_copied, eq); // nothing was copied, skip post barrier 2534 __ sub(to, to, AsmOperand(copied, lsl, LogBytesPerHeapOop)); // initial to value 2535 __ mov(R12, copied); // count arg scratched by post barrier 2536 2537 bs->arraycopy_epilogue(_masm, decorators, true, to, R12, R3); 2538 2539 assert_different_registers(R3,R12,LR,copied,saved_count); 2540 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, R3, R12); 2541 2542 __ BIND(L_not_copied); 2543 __ cmp_32(copied, saved_count); // values preserved in saved registers 2544 2545 __ mov(R0, 0, eq); // 0 if all copied 2546 __ mvn(R0, copied, ne); // else NOT(copied) 2547 __ pop(caller_saved_regs); 2548 __ pop(PC); 2549 2550 return start; 2551 } 2552 2553 // Perform range checks on the proposed arraycopy. 2554 // Kills the two temps, but nothing else. 2555 void arraycopy_range_checks(Register src, // source array oop 2556 Register src_pos, // source position (32-bit int) 2557 Register dst, // destination array oop 2558 Register dst_pos, // destination position (32-bit int) 2559 Register length, // length of copy (32-bit int) 2560 Register temp1, Register temp2, 2561 Label& L_failed) { 2562 2563 BLOCK_COMMENT("arraycopy_range_checks:"); 2564 2565 // if (src_pos + length > arrayOop(src)->length() ) FAIL; 2566 2567 const Register array_length = temp1; // scratch 2568 const Register end_pos = temp2; // scratch 2569 2570 __ add_32(end_pos, length, src_pos); // src_pos + length 2571 __ ldr_s32(array_length, Address(src, arrayOopDesc::length_offset_in_bytes())); 2572 __ cmp_32(end_pos, array_length); 2573 __ b(L_failed, hi); 2574 2575 // if (dst_pos + length > arrayOop(dst)->length() ) FAIL; 2576 __ add_32(end_pos, length, dst_pos); // dst_pos + length 2577 __ ldr_s32(array_length, Address(dst, arrayOopDesc::length_offset_in_bytes())); 2578 __ cmp_32(end_pos, array_length); 2579 __ b(L_failed, hi); 2580 2581 BLOCK_COMMENT("arraycopy_range_checks done"); 2582 } 2583 2584 // 2585 // Generate generic array copy stubs 2586 // 2587 // Input: 2588 // R0 - src oop 2589 // R1 - src_pos (32-bit int) 2590 // R2 - dst oop 2591 // R3 - dst_pos (32-bit int) 2592 // SP[0] - element count (32-bit int) 2593 // 2594 // Output: (32-bit int) 2595 // R0 == 0 - success 2596 // R0 < 0 - need to call System.arraycopy 2597 // 2598 address generate_generic_copy(const char *name) { 2599 Label L_failed, L_objArray; 2600 2601 // Input registers 2602 const Register src = R0; // source array oop 2603 const Register src_pos = R1; // source position 2604 const Register dst = R2; // destination array oop 2605 const Register dst_pos = R3; // destination position 2606 2607 // registers used as temp 2608 const Register R5_src_klass = R5; // source array klass 2609 const Register R6_dst_klass = R6; // destination array klass 2610 const Register R_lh = altFP_7_11; // layout handler 2611 const Register R8_temp = R8; 2612 2613 __ align(CodeEntryAlignment); 2614 StubCodeMark mark(this, "StubRoutines", name); 2615 address start = __ pc(); 2616 2617 __ zap_high_non_significant_bits(R1); 2618 __ zap_high_non_significant_bits(R3); 2619 __ zap_high_non_significant_bits(R4); 2620 2621 int 
pushed = 0; 2622 const RegisterSet saved_regs = RegisterSet(R4,R6) | RegisterSet(R8,R9) | altFP_7_11; 2623 __ push(saved_regs); 2624 assert(saved_regs.size() == 6, "check the count"); 2625 pushed+=6; 2626 2627 // bump this on entry, not on exit: 2628 inc_counter_np(SharedRuntime::_generic_array_copy_ctr, R5, R12); 2629 2630 const Register length = R4; // elements count 2631 __ ldr(length, Address(SP,4*pushed)); 2632 2633 2634 //----------------------------------------------------------------------- 2635 // Assembler stubs will be used for this call to arraycopy 2636 // if the following conditions are met: 2637 // 2638 // (1) src and dst must not be null. 2639 // (2) src_pos must not be negative. 2640 // (3) dst_pos must not be negative. 2641 // (4) length must not be negative. 2642 // (5) src klass and dst klass should be the same and not NULL. 2643 // (6) src and dst should be arrays. 2644 // (7) src_pos + length must not exceed length of src. 2645 // (8) dst_pos + length must not exceed length of dst. 2646 BLOCK_COMMENT("arraycopy initial argument checks"); 2647 2648 // if (src == NULL) return -1; 2649 __ cbz(src, L_failed); 2650 2651 // if (src_pos < 0) return -1; 2652 __ cmp_32(src_pos, 0); 2653 __ b(L_failed, lt); 2654 2655 // if (dst == NULL) return -1; 2656 __ cbz(dst, L_failed); 2657 2658 // if (dst_pos < 0) return -1; 2659 __ cmp_32(dst_pos, 0); 2660 __ b(L_failed, lt); 2661 2662 // if (length < 0) return -1; 2663 __ cmp_32(length, 0); 2664 __ b(L_failed, lt); 2665 2666 BLOCK_COMMENT("arraycopy argument klass checks"); 2667 // get src->klass() 2668 __ load_klass(R5_src_klass, src); 2669 2670 // Load layout helper 2671 // 2672 // |array_tag| | header_size | element_type | |log2_element_size| 2673 // 32 30 24 16 8 2 0 2674 // 2675 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2676 // 2677 2678 int lh_offset = in_bytes(Klass::layout_helper_offset()); 2679 __ ldr_u32(R_lh, Address(R5_src_klass, lh_offset)); 2680 2681 __ load_klass(R6_dst_klass, dst); 2682 2683 // Handle objArrays completely differently... 
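    // Editor's illustrative note (not compiled code) on the layout helper just
    // loaded into R_lh; the shift/mask constants are the shared Klass::_lh_*
    // values also used by the TypeArrayKlass path below:
    //
    //   header_size       = (lh >> Klass::_lh_header_size_shift) & Klass::_lh_header_size_mask;  // bytes
    //   log2_element_size =  lh & Klass::_lh_log2_element_size_mask;
    //
    // The array tag occupies the two topmost bits, so any array klass has a
    // negative layout helper (hence the 'ge' test against _lh_neutral_value
    // below), and all objArray klasses share the single value returned by
    // Klass::array_layout_helper(T_OBJECT), which is compared for equality next.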
2684 juint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2685 __ mov_slow(R8_temp, objArray_lh); 2686 __ cmp_32(R_lh, R8_temp); 2687 __ b(L_objArray,eq); 2688 2689 // if (src->klass() != dst->klass()) return -1; 2690 __ cmp(R5_src_klass, R6_dst_klass); 2691 __ b(L_failed, ne); 2692 2693 // if (!src->is_Array()) return -1; 2694 __ cmp_32(R_lh, Klass::_lh_neutral_value); // < 0 2695 __ b(L_failed, ge); 2696 2697 arraycopy_range_checks(src, src_pos, dst, dst_pos, length, 2698 R8_temp, R6_dst_klass, L_failed); 2699 2700 { 2701 // TypeArrayKlass 2702 // 2703 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2704 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2705 // 2706 2707 const Register R6_offset = R6_dst_klass; // array offset 2708 const Register R12_elsize = R12; // log2 element size 2709 2710 __ logical_shift_right(R6_offset, R_lh, Klass::_lh_header_size_shift); 2711 __ andr(R6_offset, R6_offset, (unsigned int)Klass::_lh_header_size_mask); // array_offset 2712 __ add(src, src, R6_offset); // src array offset 2713 __ add(dst, dst, R6_offset); // dst array offset 2714 __ andr(R12_elsize, R_lh, (unsigned int)Klass::_lh_log2_element_size_mask); // log2 element size 2715 2716 // next registers should be set before the jump to corresponding stub 2717 const Register from = R0; // source array address 2718 const Register to = R1; // destination array address 2719 const Register count = R2; // elements count 2720 2721 // 'from', 'to', 'count' registers should be set in this order 2722 // since they are the same as 'src', 'src_pos', 'dst'. 2723 2724 2725 BLOCK_COMMENT("scale indexes to element size"); 2726 __ add(from, src, AsmOperand(src_pos, lsl, R12_elsize)); // src_addr 2727 __ add(to, dst, AsmOperand(dst_pos, lsl, R12_elsize)); // dst_addr 2728 2729 __ mov(count, length); // length 2730 2731 // XXX optim: avoid later push in arraycopy variants ? 2732 2733 __ pop(saved_regs); 2734 2735 BLOCK_COMMENT("choose copy loop based on element size"); 2736 __ cmp(R12_elsize, 0); 2737 __ b(StubRoutines::_jbyte_arraycopy,eq); 2738 2739 __ cmp(R12_elsize, LogBytesPerShort); 2740 __ b(StubRoutines::_jshort_arraycopy,eq); 2741 2742 __ cmp(R12_elsize, LogBytesPerInt); 2743 __ b(StubRoutines::_jint_arraycopy,eq); 2744 2745 __ b(StubRoutines::_jlong_arraycopy); 2746 2747 } 2748 2749 // ObjArrayKlass 2750 __ BIND(L_objArray); 2751 // live at this point: R5_src_klass, R6_dst_klass, src[_pos], dst[_pos], length 2752 2753 Label L_plain_copy, L_checkcast_copy; 2754 // test array classes for subtyping 2755 __ cmp(R5_src_klass, R6_dst_klass); // usual case is exact equality 2756 __ b(L_checkcast_copy, ne); 2757 2758 BLOCK_COMMENT("Identically typed arrays"); 2759 { 2760 // Identically typed arrays can be copied without element-wise checks. 
2761 arraycopy_range_checks(src, src_pos, dst, dst_pos, length, 2762 R8_temp, R_lh, L_failed); 2763 2764 // next registers should be set before the jump to corresponding stub 2765 const Register from = R0; // source array address 2766 const Register to = R1; // destination array address 2767 const Register count = R2; // elements count 2768 2769 __ add(src, src, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //src offset 2770 __ add(dst, dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //dst offset 2771 __ add_ptr_scaled_int32(from, src, src_pos, LogBytesPerHeapOop); // src_addr 2772 __ add_ptr_scaled_int32(to, dst, dst_pos, LogBytesPerHeapOop); // dst_addr 2773 __ BIND(L_plain_copy); 2774 __ mov(count, length); 2775 2776 __ pop(saved_regs); // XXX optim: avoid later push in oop_arraycopy ? 2777 __ b(StubRoutines::_oop_arraycopy); 2778 } 2779 2780 { 2781 __ BIND(L_checkcast_copy); 2782 // live at this point: R5_src_klass, R6_dst_klass 2783 2784 // Before looking at dst.length, make sure dst is also an objArray. 2785 __ ldr_u32(R8_temp, Address(R6_dst_klass, lh_offset)); 2786 __ cmp_32(R_lh, R8_temp); 2787 __ b(L_failed, ne); 2788 2789 // It is safe to examine both src.length and dst.length. 2790 2791 arraycopy_range_checks(src, src_pos, dst, dst_pos, length, 2792 R8_temp, R_lh, L_failed); 2793 2794 // next registers should be set before the jump to corresponding stub 2795 const Register from = R0; // source array address 2796 const Register to = R1; // destination array address 2797 const Register count = R2; // elements count 2798 2799 // Marshal the base address arguments now, freeing registers. 2800 __ add(src, src, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //src offset 2801 __ add(dst, dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //dst offset 2802 __ add_ptr_scaled_int32(from, src, src_pos, LogBytesPerHeapOop); // src_addr 2803 __ add_ptr_scaled_int32(to, dst, dst_pos, LogBytesPerHeapOop); // dst_addr 2804 2805 __ mov(count, length); // length (reloaded) 2806 2807 Register sco_temp = R3; // this register is free now 2808 assert_different_registers(from, to, count, sco_temp, 2809 R6_dst_klass, R5_src_klass); 2810 2811 // Generate the type check. 2812 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2813 __ ldr_u32(sco_temp, Address(R6_dst_klass, sco_offset)); 2814 generate_type_check(R5_src_klass, sco_temp, R6_dst_klass, 2815 R8_temp, R9, 2816 R12, 2817 L_plain_copy); 2818 2819 // Fetch destination element klass from the ObjArrayKlass header. 2820 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2821 2822 // the checkcast_copy loop needs two extra arguments: 2823 const Register Rdst_elem_klass = R3; 2824 __ ldr(Rdst_elem_klass, Address(R6_dst_klass, ek_offset)); // dest elem klass 2825 __ pop(saved_regs); // XXX optim: avoid later push in oop_arraycopy ? 2826 __ str(Rdst_elem_klass, Address(SP,0)); // dest elem klass argument 2827 __ ldr_u32(R3, Address(Rdst_elem_klass, sco_offset)); // sco of elem klass 2828 __ b(StubRoutines::_checkcast_arraycopy); 2829 } 2830 2831 __ BIND(L_failed); 2832 2833 __ pop(saved_regs); 2834 __ mvn(R0, 0); // failure, with 0 copied 2835 __ ret(); 2836 2837 return start; 2838 } 2839 2840 // Safefetch stubs. 
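  // Editor's note: illustrative use of the SafeFetch stubs elsewhere in the VM
  // (sketch only, not code generated here). A caller probes memory that may be
  // unmapped by passing an error value it can distinguish from a real result:
  //
  //   int v = SafeFetch32((int*) addr, -1);
  //   if (v == -1) {
  //     // the location was unreadable, or it really contained -1
  //   }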
2841 void generate_safefetch(const char* name, int size, address* entry, address* fault_pc, address* continuation_pc) { 2842 // safefetch signatures: 2843 // int SafeFetch32(int* adr, int errValue); 2844 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue); 2845 // 2846 // arguments: 2847 // R0 = adr 2848 // R1 = errValue 2849 // 2850 // result: 2851 // R0 = *adr or errValue 2852 2853 StubCodeMark mark(this, "StubRoutines", name); 2854 2855 // Entry point, pc or function descriptor. 2856 *entry = __ pc(); 2857 2858 // Load *adr into c_rarg2, may fault. 2859 *fault_pc = __ pc(); 2860 2861 switch (size) { 2862 case 4: // int32_t 2863 __ ldr_s32(R1, Address(R0)); 2864 break; 2865 2866 case 8: // int64_t 2867 Unimplemented(); 2868 break; 2869 2870 default: 2871 ShouldNotReachHere(); 2872 } 2873 2874 // return errValue or *adr 2875 *continuation_pc = __ pc(); 2876 __ mov(R0, R1); 2877 __ ret(); 2878 } 2879 2880 void generate_arraycopy_stubs() { 2881 2882 // Note: the disjoint stubs must be generated first, some of 2883 // the conjoint stubs use them. 2884 2885 bool status = false; // non failing C2 stubs need not return a status in R0 2886 2887 #ifdef TEST_C2_GENERIC_ARRAYCOPY /* Internal development flag */ 2888 // With this flag, the C2 stubs are tested by generating calls to 2889 // generic_arraycopy instead of Runtime1::arraycopy 2890 2891 // Runtime1::arraycopy return a status in R0 (0 if OK, else ~copied) 2892 // and the result is tested to see whether the arraycopy stub should 2893 // be called. 2894 2895 // When we test arraycopy this way, we must generate extra code in the 2896 // arraycopy methods callable from C2 generic_arraycopy to set the 2897 // status to 0 for those who always succeed (calling the slow path stub might 2898 // lead to errors since the copy has already been performed). 
2899 2900 status = true; // generate a status compatible with C1 calls 2901 #endif 2902 2903 address ucm_common_error_exit = generate_unsafecopy_common_error_exit(); 2904 UnsafeCopyMemory::set_common_exit_stub_pc(ucm_common_error_exit); 2905 2906 // these need always status in case they are called from generic_arraycopy 2907 StubRoutines::_jbyte_disjoint_arraycopy = generate_primitive_copy(false, "jbyte_disjoint_arraycopy", true, 1, true); 2908 StubRoutines::_jshort_disjoint_arraycopy = generate_primitive_copy(false, "jshort_disjoint_arraycopy", true, 2, true); 2909 StubRoutines::_jint_disjoint_arraycopy = generate_primitive_copy(false, "jint_disjoint_arraycopy", true, 4, true); 2910 StubRoutines::_jlong_disjoint_arraycopy = generate_primitive_copy(false, "jlong_disjoint_arraycopy", true, 8, true); 2911 StubRoutines::_oop_disjoint_arraycopy = generate_oop_copy (false, "oop_disjoint_arraycopy", true, true); 2912 2913 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_primitive_copy(true, "arrayof_jbyte_disjoint_arraycopy", status, 1, true); 2914 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_primitive_copy(true, "arrayof_jshort_disjoint_arraycopy",status, 2, true); 2915 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_primitive_copy(true, "arrayof_jint_disjoint_arraycopy", status, 4, true); 2916 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_primitive_copy(true, "arrayof_jlong_disjoint_arraycopy", status, 8, true); 2917 StubRoutines::_arrayof_oop_disjoint_arraycopy = generate_oop_copy (true, "arrayof_oop_disjoint_arraycopy", status, true); 2918 2919 // these need always status in case they are called from generic_arraycopy 2920 StubRoutines::_jbyte_arraycopy = generate_primitive_copy(false, "jbyte_arraycopy", true, 1, false, StubRoutines::_jbyte_disjoint_arraycopy); 2921 StubRoutines::_jshort_arraycopy = generate_primitive_copy(false, "jshort_arraycopy", true, 2, false, StubRoutines::_jshort_disjoint_arraycopy); 2922 StubRoutines::_jint_arraycopy = generate_primitive_copy(false, "jint_arraycopy", true, 4, false, StubRoutines::_jint_disjoint_arraycopy); 2923 StubRoutines::_jlong_arraycopy = generate_primitive_copy(false, "jlong_arraycopy", true, 8, false, StubRoutines::_jlong_disjoint_arraycopy); 2924 StubRoutines::_oop_arraycopy = generate_oop_copy (false, "oop_arraycopy", true, false, StubRoutines::_oop_disjoint_arraycopy); 2925 2926 StubRoutines::_arrayof_jbyte_arraycopy = generate_primitive_copy(true, "arrayof_jbyte_arraycopy", status, 1, false, StubRoutines::_arrayof_jbyte_disjoint_arraycopy); 2927 StubRoutines::_arrayof_jshort_arraycopy = generate_primitive_copy(true, "arrayof_jshort_arraycopy", status, 2, false, StubRoutines::_arrayof_jshort_disjoint_arraycopy); 2928 #ifdef _LP64 2929 // since sizeof(jint) < sizeof(HeapWord), there's a different flavor: 2930 StubRoutines::_arrayof_jint_arraycopy = generate_primitive_copy(true, "arrayof_jint_arraycopy", status, 4, false, StubRoutines::_arrayof_jint_disjoint_arraycopy); 2931 #else 2932 StubRoutines::_arrayof_jint_arraycopy = StubRoutines::_jint_arraycopy; 2933 #endif 2934 if (BytesPerHeapOop < HeapWordSize) { 2935 StubRoutines::_arrayof_oop_arraycopy = generate_oop_copy (true, "arrayof_oop_arraycopy", status, false, StubRoutines::_arrayof_oop_disjoint_arraycopy); 2936 } else { 2937 StubRoutines::_arrayof_oop_arraycopy = StubRoutines::_oop_arraycopy; 2938 } 2939 StubRoutines::_arrayof_jlong_arraycopy = StubRoutines::_jlong_arraycopy; 2940 2941 StubRoutines::_checkcast_arraycopy = 
generate_checkcast_copy("checkcast_arraycopy"); 2942 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy"); 2943 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy"); 2944 2945 2946 } 2947 2948 #define COMPILE_CRYPTO 2949 #include "stubRoutinesCrypto_arm.cpp" 2950 2951 private: 2952 2953 #undef __ 2954 #define __ masm-> 2955 2956 //------------------------------------------------------------------------------------------------------------------------ 2957 // Continuation point for throwing of implicit exceptions that are not handled in 2958 // the current activation. Fabricates an exception oop and initiates normal 2959 // exception dispatching in this frame. 2960 address generate_throw_exception(const char* name, address runtime_entry) { 2961 int insts_size = 128; 2962 int locs_size = 32; 2963 CodeBuffer code(name, insts_size, locs_size); 2964 OopMapSet* oop_maps; 2965 int frame_size; 2966 int frame_complete; 2967 2968 oop_maps = new OopMapSet(); 2969 MacroAssembler* masm = new MacroAssembler(&code); 2970 2971 address start = __ pc(); 2972 2973 frame_size = 2; 2974 __ mov(Rexception_pc, LR); 2975 __ raw_push(FP, LR); 2976 2977 frame_complete = __ pc() - start; 2978 2979 // Any extra arguments are already supposed to be R1 and R2 2980 __ mov(R0, Rthread); 2981 2982 int pc_offset = __ set_last_Java_frame(SP, FP, false, Rtemp); 2983 assert(((__ pc()) - start) == __ offset(), "warning: start differs from code_begin"); 2984 __ call(runtime_entry); 2985 if (pc_offset == -1) { 2986 pc_offset = __ offset(); 2987 } 2988 2989 // Generate oop map 2990 OopMap* map = new OopMap(frame_size*VMRegImpl::slots_per_word, 0); 2991 oop_maps->add_gc_map(pc_offset, map); 2992 __ reset_last_Java_frame(Rtemp); // Rtemp free since scratched by far call 2993 2994 __ raw_pop(FP, LR); 2995 __ jump(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type, Rtemp); 2996 2997 RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, 2998 frame_size, oop_maps, false); 2999 return stub->entry_point(); 3000 } 3001 3002 //--------------------------------------------------------------------------- 3003 // Initialization 3004 3005 void generate_initial() { 3006 // Generates all stubs and initializes the entry points 3007 3008 //------------------------------------------------------------------------------------------------------------------------ 3009 // entry points that exist in all platforms 3010 // Note: This is code that could be shared among different platforms - however the benefit seems to be smaller than 3011 // the disadvantage of having a much more complicated generator structure. See also comment in stubRoutines.hpp. 
3012 StubRoutines::_forward_exception_entry = generate_forward_exception(); 3013 3014 StubRoutines::_call_stub_entry = 3015 generate_call_stub(StubRoutines::_call_stub_return_address); 3016 // is referenced by megamorphic call 3017 StubRoutines::_catch_exception_entry = generate_catch_exception(); 3018 3019 // stub for throwing stack overflow error used both by interpreter and compiler 3020 StubRoutines::_throw_StackOverflowError_entry = generate_throw_exception("StackOverflowError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError)); 3021 3022 // integer division used both by interpreter and compiler 3023 StubRoutines::Arm::_idiv_irem_entry = generate_idiv_irem(); 3024 3025 StubRoutines::_atomic_add_entry = generate_atomic_add(); 3026 StubRoutines::_atomic_xchg_entry = generate_atomic_xchg(); 3027 StubRoutines::_atomic_cmpxchg_entry = generate_atomic_cmpxchg(); 3028 StubRoutines::_atomic_cmpxchg_long_entry = generate_atomic_cmpxchg_long(); 3029 StubRoutines::_atomic_load_long_entry = generate_atomic_load_long(); 3030 StubRoutines::_atomic_store_long_entry = generate_atomic_store_long(); 3031 } 3032 3033 void generate_all() { 3034 // Generates all stubs and initializes the entry points 3035 3036 #ifdef COMPILER2 3037 // Generate partial_subtype_check first here since its code depends on 3038 // UseZeroBaseCompressedOops which is defined after heap initialization. 3039 StubRoutines::Arm::_partial_subtype_check = generate_partial_subtype_check(); 3040 #endif 3041 // These entry points require SharedInfo::stack0 to be set up in non-core builds 3042 // and need to be relocatable, so they each fabricate a RuntimeStub internally. 3043 StubRoutines::_throw_AbstractMethodError_entry = generate_throw_exception("AbstractMethodError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError)); 3044 StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError)); 3045 StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call)); 3046 3047 //------------------------------------------------------------------------------------------------------------------------ 3048 // entry points that are platform specific 3049 3050 // support for verify_oop (must happen after universe_init) 3051 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); 3052 3053 // arraycopy stubs used by compilers 3054 generate_arraycopy_stubs(); 3055 3056 // Safefetch stubs. 
3057 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry, 3058 &StubRoutines::_safefetch32_fault_pc, 3059 &StubRoutines::_safefetch32_continuation_pc); 3060 assert (sizeof(int) == wordSize, "32-bit architecture"); 3061 StubRoutines::_safefetchN_entry = StubRoutines::_safefetch32_entry; 3062 StubRoutines::_safefetchN_fault_pc = StubRoutines::_safefetch32_fault_pc; 3063 StubRoutines::_safefetchN_continuation_pc = StubRoutines::_safefetch32_continuation_pc; 3064 3065 #ifdef COMPILE_CRYPTO 3066 // generate AES intrinsics code 3067 if (UseAESIntrinsics) { 3068 aes_init(); 3069 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 3070 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 3071 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 3072 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 3073 } 3074 #endif // COMPILE_CRYPTO 3075 } 3076 3077 3078 public: 3079 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) { 3080 if (all) { 3081 generate_all(); 3082 } else { 3083 generate_initial(); 3084 } 3085 } 3086 }; // end class declaration 3087 3088 #define UCM_TABLE_MAX_ENTRIES 32 3089 void StubGenerator_generate(CodeBuffer* code, bool all) { 3090 if (UnsafeCopyMemory::_table == NULL) { 3091 UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES); 3092 } 3093 StubGenerator g(code, all); 3094 }