/*
 * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2012, 2016 SAP SE. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_ppc.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/top.hpp"
#include "runtime/thread.inline.hpp"

#define __ _masm->

#ifdef PRODUCT
#define BLOCK_COMMENT(str) // nothing
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#if defined(ABI_ELFv2)
#define STUB_ENTRY(name) StubRoutines::name()
#else
#define STUB_ENTRY(name) ((FunctionDescriptor*)StubRoutines::name())->entry()
#endif

class StubGenerator: public StubCodeGenerator {
 private:

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //
  //   R3  - call wrapper address     : address
  //   R4  - result                   : intptr_t*
  //   R5  - result type              : BasicType
  //   R6  - method                   : Method
  //   R7  - frame mgr entry point    : address
  //   R8  - parameter block          : intptr_t*
  //   R9  - parameter count in words : int
  //   R10 - thread                   : Thread*
  //
  address generate_call_stub(address& return_address) {
    // Setup a new c frame, copy java arguments, call frame manager or
    // native_entry, and process result.
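    // For orientation, the C++ side sees this stub roughly through the
    // CallStub function pointer declared in stubRoutines.hpp (hedged sketch,
    // not a verbatim copy of that typedef):
    //
    //   typedef void (*CallStub)(address   link,                // R3, call wrapper
    //                            intptr_t* result,              // R4
    //                            BasicType result_type,         // R5
    //                            Method*   method,              // R6
    //                            address   entry_point,         // R7, frame mgr entry
    //                            intptr_t* parameters,          // R8
    //                            int       size_of_parameters,  // R9
    //                            TRAPS);                        // R10, thread
    //
    // i.e. the first eight integer argument registers of the PPC ELF ABI map
    // one-to-one onto the arguments documented above.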
76 77 StubCodeMark mark(this, "StubRoutines", "call_stub"); 78 79 address start = __ function_entry(); 80 81 // some sanity checks 82 assert((sizeof(frame::abi_minframe) % 16) == 0, "unaligned"); 83 assert((sizeof(frame::abi_reg_args) % 16) == 0, "unaligned"); 84 assert((sizeof(frame::spill_nonvolatiles) % 16) == 0, "unaligned"); 85 assert((sizeof(frame::parent_ijava_frame_abi) % 16) == 0, "unaligned"); 86 assert((sizeof(frame::entry_frame_locals) % 16) == 0, "unaligned"); 87 88 Register r_arg_call_wrapper_addr = R3; 89 Register r_arg_result_addr = R4; 90 Register r_arg_result_type = R5; 91 Register r_arg_method = R6; 92 Register r_arg_entry = R7; 93 Register r_arg_thread = R10; 94 95 Register r_temp = R24; 96 Register r_top_of_arguments_addr = R25; 97 Register r_entryframe_fp = R26; 98 99 { 100 // Stack on entry to call_stub: 101 // 102 // F1 [C_FRAME] 103 // ... 104 105 Register r_arg_argument_addr = R8; 106 Register r_arg_argument_count = R9; 107 Register r_frame_alignment_in_bytes = R27; 108 Register r_argument_addr = R28; 109 Register r_argumentcopy_addr = R29; 110 Register r_argument_size_in_bytes = R30; 111 Register r_frame_size = R23; 112 113 Label arguments_copied; 114 115 // Save LR/CR to caller's C_FRAME. 116 __ save_LR_CR(R0); 117 118 // Zero extend arg_argument_count. 119 __ clrldi(r_arg_argument_count, r_arg_argument_count, 32); 120 121 // Save non-volatiles GPRs to ENTRY_FRAME (not yet pushed, but it's safe). 122 __ save_nonvolatile_gprs(R1_SP, _spill_nonvolatiles_neg(r14)); 123 124 // Keep copy of our frame pointer (caller's SP). 125 __ mr(r_entryframe_fp, R1_SP); 126 127 BLOCK_COMMENT("Push ENTRY_FRAME including arguments"); 128 // Push ENTRY_FRAME including arguments: 129 // 130 // F0 [TOP_IJAVA_FRAME_ABI] 131 // alignment (optional) 132 // [outgoing Java arguments] 133 // [ENTRY_FRAME_LOCALS] 134 // F1 [C_FRAME] 135 // ... 136 137 // calculate frame size 138 139 // unaligned size of arguments 140 __ sldi(r_argument_size_in_bytes, 141 r_arg_argument_count, Interpreter::logStackElementSize); 142 // arguments alignment (max 1 slot) 143 // FIXME: use round_to() here 144 __ andi_(r_frame_alignment_in_bytes, r_arg_argument_count, 1); 145 __ sldi(r_frame_alignment_in_bytes, 146 r_frame_alignment_in_bytes, Interpreter::logStackElementSize); 147 148 // size = unaligned size of arguments + top abi's size 149 __ addi(r_frame_size, r_argument_size_in_bytes, 150 frame::top_ijava_frame_abi_size); 151 // size += arguments alignment 152 __ add(r_frame_size, 153 r_frame_size, r_frame_alignment_in_bytes); 154 // size += size of call_stub locals 155 __ addi(r_frame_size, 156 r_frame_size, frame::entry_frame_locals_size); 157 158 // push ENTRY_FRAME 159 __ push_frame(r_frame_size, r_temp); 160 161 // initialize call_stub locals (step 1) 162 __ std(r_arg_call_wrapper_addr, 163 _entry_frame_locals_neg(call_wrapper_address), r_entryframe_fp); 164 __ std(r_arg_result_addr, 165 _entry_frame_locals_neg(result_address), r_entryframe_fp); 166 __ std(r_arg_result_type, 167 _entry_frame_locals_neg(result_type), r_entryframe_fp); 168 // we will save arguments_tos_address later 169 170 171 BLOCK_COMMENT("Copy Java arguments"); 172 // copy Java arguments 173 174 // Calculate top_of_arguments_addr which will be R17_tos (not prepushed) later. 175 // FIXME: why not simply use SP+frame::top_ijava_frame_size? 
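    // Illustrative arithmetic for the frame pushed above (assuming 8-byte
    // stack slots, i.e. Interpreter::logStackElementSize == 3): with an odd
    // argument count such as 5, r_argument_size_in_bytes is 40 and one 8-byte
    // alignment slot is added, so
    //   frame_size = 40 + 8 + top_ijava_frame_abi_size + entry_frame_locals_size,
    // which preserves the 16-byte alignment asserted at function entry. The
    // outgoing Java arguments start directly above the new top ABI area: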
176 __ addi(r_top_of_arguments_addr, 177 R1_SP, frame::top_ijava_frame_abi_size); 178 __ add(r_top_of_arguments_addr, 179 r_top_of_arguments_addr, r_frame_alignment_in_bytes); 180 181 // any arguments to copy? 182 __ cmpdi(CCR0, r_arg_argument_count, 0); 183 __ beq(CCR0, arguments_copied); 184 185 // prepare loop and copy arguments in reverse order 186 { 187 // init CTR with arg_argument_count 188 __ mtctr(r_arg_argument_count); 189 190 // let r_argumentcopy_addr point to last outgoing Java arguments P 191 __ mr(r_argumentcopy_addr, r_top_of_arguments_addr); 192 193 // let r_argument_addr point to last incoming java argument 194 __ add(r_argument_addr, 195 r_arg_argument_addr, r_argument_size_in_bytes); 196 __ addi(r_argument_addr, r_argument_addr, -BytesPerWord); 197 198 // now loop while CTR > 0 and copy arguments 199 { 200 Label next_argument; 201 __ bind(next_argument); 202 203 __ ld(r_temp, 0, r_argument_addr); 204 // argument_addr--; 205 __ addi(r_argument_addr, r_argument_addr, -BytesPerWord); 206 __ std(r_temp, 0, r_argumentcopy_addr); 207 // argumentcopy_addr++; 208 __ addi(r_argumentcopy_addr, r_argumentcopy_addr, BytesPerWord); 209 210 __ bdnz(next_argument); 211 } 212 } 213 214 // Arguments copied, continue. 215 __ bind(arguments_copied); 216 } 217 218 { 219 BLOCK_COMMENT("Call frame manager or native entry."); 220 // Call frame manager or native entry. 221 Register r_new_arg_entry = R14; 222 assert_different_registers(r_new_arg_entry, r_top_of_arguments_addr, 223 r_arg_method, r_arg_thread); 224 225 __ mr(r_new_arg_entry, r_arg_entry); 226 227 // Register state on entry to frame manager / native entry: 228 // 229 // tos - intptr_t* sender tos (prepushed) Lesp = (SP) + copied_arguments_offset - 8 230 // R19_method - Method 231 // R16_thread - JavaThread* 232 233 // Tos must point to last argument - element_size. 234 const Register tos = R15_esp; 235 236 __ addi(tos, r_top_of_arguments_addr, -Interpreter::stackElementSize); 237 238 // initialize call_stub locals (step 2) 239 // now save tos as arguments_tos_address 240 __ std(tos, _entry_frame_locals_neg(arguments_tos_address), r_entryframe_fp); 241 242 // load argument registers for call 243 __ mr(R19_method, r_arg_method); 244 __ mr(R16_thread, r_arg_thread); 245 assert(tos != r_arg_method, "trashed r_arg_method"); 246 assert(tos != r_arg_thread && R19_method != r_arg_thread, "trashed r_arg_thread"); 247 248 // Set R15_prev_state to 0 for simplifying checks in callee. 249 __ load_const_optimized(R25_templateTableBase, (address)Interpreter::dispatch_table((TosState)0), R11_scratch1); 250 // Stack on entry to frame manager / native entry: 251 // 252 // F0 [TOP_IJAVA_FRAME_ABI] 253 // alignment (optional) 254 // [outgoing Java arguments] 255 // [ENTRY_FRAME_LOCALS] 256 // F1 [C_FRAME] 257 // ... 258 // 259 260 // global toc register 261 __ load_const_optimized(R29_TOC, MacroAssembler::global_toc(), R11_scratch1); 262 // Remember the senderSP so we interpreter can pop c2i arguments off of the stack 263 // when called via a c2i. 264 265 // Pass initial_caller_sp to framemanager. 266 __ mr(R21_tmp1, R1_SP); 267 268 // Do a light-weight C-call here, r_new_arg_entry holds the address 269 // of the interpreter entry point (frame manager or native entry) 270 // and save runtime-value of LR in return_address. 
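      // Hedged note: MacroAssembler::call_stub branches to the entry held in
      // r_new_arg_entry and returns the address of the instruction following
      // the call; the generator's caller records that address (via the
      // return_address out-parameter) as StubRoutines::_call_stub_return_address,
      // which generate_catch_exception below uses to resume in this stub when
      // Java code throws.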
271 assert(r_new_arg_entry != tos && r_new_arg_entry != R19_method && r_new_arg_entry != R16_thread, 272 "trashed r_new_arg_entry"); 273 return_address = __ call_stub(r_new_arg_entry); 274 } 275 276 { 277 BLOCK_COMMENT("Returned from frame manager or native entry."); 278 // Returned from frame manager or native entry. 279 // Now pop frame, process result, and return to caller. 280 281 // Stack on exit from frame manager / native entry: 282 // 283 // F0 [ABI] 284 // ... 285 // [ENTRY_FRAME_LOCALS] 286 // F1 [C_FRAME] 287 // ... 288 // 289 // Just pop the topmost frame ... 290 // 291 292 Label ret_is_object; 293 Label ret_is_long; 294 Label ret_is_float; 295 Label ret_is_double; 296 297 Register r_entryframe_fp = R30; 298 Register r_lr = R7_ARG5; 299 Register r_cr = R8_ARG6; 300 301 // Reload some volatile registers which we've spilled before the call 302 // to frame manager / native entry. 303 // Access all locals via frame pointer, because we know nothing about 304 // the topmost frame's size. 305 __ ld(r_entryframe_fp, _abi(callers_sp), R1_SP); 306 assert_different_registers(r_entryframe_fp, R3_RET, r_arg_result_addr, r_arg_result_type, r_cr, r_lr); 307 __ ld(r_arg_result_addr, 308 _entry_frame_locals_neg(result_address), r_entryframe_fp); 309 __ ld(r_arg_result_type, 310 _entry_frame_locals_neg(result_type), r_entryframe_fp); 311 __ ld(r_cr, _abi(cr), r_entryframe_fp); 312 __ ld(r_lr, _abi(lr), r_entryframe_fp); 313 314 // pop frame and restore non-volatiles, LR and CR 315 __ mr(R1_SP, r_entryframe_fp); 316 __ mtcr(r_cr); 317 __ mtlr(r_lr); 318 319 // Store result depending on type. Everything that is not 320 // T_OBJECT, T_LONG, T_FLOAT, or T_DOUBLE is treated as T_INT. 321 __ cmpwi(CCR0, r_arg_result_type, T_OBJECT); 322 __ cmpwi(CCR1, r_arg_result_type, T_LONG); 323 __ cmpwi(CCR5, r_arg_result_type, T_FLOAT); 324 __ cmpwi(CCR6, r_arg_result_type, T_DOUBLE); 325 326 // restore non-volatile registers 327 __ restore_nonvolatile_gprs(R1_SP, _spill_nonvolatiles_neg(r14)); 328 329 330 // Stack on exit from call_stub: 331 // 332 // 0 [C_FRAME] 333 // ... 334 // 335 // no call_stub frames left. 336 337 // All non-volatiles have been restored at this point!! 338 assert(R3_RET == R3, "R3_RET should be R3"); 339 340 __ beq(CCR0, ret_is_object); 341 __ beq(CCR1, ret_is_long); 342 __ beq(CCR5, ret_is_float); 343 __ beq(CCR6, ret_is_double); 344 345 // default: 346 __ stw(R3_RET, 0, r_arg_result_addr); 347 __ blr(); // return to caller 348 349 // case T_OBJECT: 350 __ bind(ret_is_object); 351 __ std(R3_RET, 0, r_arg_result_addr); 352 __ blr(); // return to caller 353 354 // case T_LONG: 355 __ bind(ret_is_long); 356 __ std(R3_RET, 0, r_arg_result_addr); 357 __ blr(); // return to caller 358 359 // case T_FLOAT: 360 __ bind(ret_is_float); 361 __ stfs(F1_RET, 0, r_arg_result_addr); 362 __ blr(); // return to caller 363 364 // case T_DOUBLE: 365 __ bind(ret_is_double); 366 __ stfd(F1_RET, 0, r_arg_result_addr); 367 __ blr(); // return to caller 368 } 369 370 return start; 371 } 372 373 // Return point for a Java call if there's an exception thrown in 374 // Java code. The exception is caught and transformed into a 375 // pending exception stored in JavaThread that can be tested from 376 // within the VM. 
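  // Hedged illustration (not part of the generated code): the C++ wrapper
  // that entered via the call stub, e.g. the JavaCalls machinery, typically
  // reacts to the pending exception along these lines:
  //
  //   call_stub(...);                           // returns here via catch_exception
  //   if (thread->has_pending_exception()) {
  //     oop ex = thread->pending_exception();   // the oop stored below
  //     // ... report or propagate ...
  //   }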
377 // 378 address generate_catch_exception() { 379 StubCodeMark mark(this, "StubRoutines", "catch_exception"); 380 381 address start = __ pc(); 382 383 // Registers alive 384 // 385 // R16_thread 386 // R3_ARG1 - address of pending exception 387 // R4_ARG2 - return address in call stub 388 389 const Register exception_file = R21_tmp1; 390 const Register exception_line = R22_tmp2; 391 392 __ load_const(exception_file, (void*)__FILE__); 393 __ load_const(exception_line, (void*)__LINE__); 394 395 __ std(R3_ARG1, in_bytes(JavaThread::pending_exception_offset()), R16_thread); 396 // store into `char *' 397 __ std(exception_file, in_bytes(JavaThread::exception_file_offset()), R16_thread); 398 // store into `int' 399 __ stw(exception_line, in_bytes(JavaThread::exception_line_offset()), R16_thread); 400 401 // complete return to VM 402 assert(StubRoutines::_call_stub_return_address != NULL, "must have been generated before"); 403 404 __ mtlr(R4_ARG2); 405 // continue in call stub 406 __ blr(); 407 408 return start; 409 } 410 411 // Continuation point for runtime calls returning with a pending 412 // exception. The pending exception check happened in the runtime 413 // or native call stub. The pending exception in Thread is 414 // converted into a Java-level exception. 415 // 416 // Read: 417 // 418 // LR: The pc the runtime library callee wants to return to. 419 // Since the exception occurred in the callee, the return pc 420 // from the point of view of Java is the exception pc. 421 // thread: Needed for method handles. 422 // 423 // Invalidate: 424 // 425 // volatile registers (except below). 426 // 427 // Update: 428 // 429 // R4_ARG2: exception 430 // 431 // (LR is unchanged and is live out). 432 // 433 address generate_forward_exception() { 434 StubCodeMark mark(this, "StubRoutines", "forward_exception"); 435 address start = __ pc(); 436 437 #if !defined(PRODUCT) 438 if (VerifyOops) { 439 // Get pending exception oop. 440 __ ld(R3_ARG1, 441 in_bytes(Thread::pending_exception_offset()), 442 R16_thread); 443 // Make sure that this code is only executed if there is a pending exception. 444 { 445 Label L; 446 __ cmpdi(CCR0, R3_ARG1, 0); 447 __ bne(CCR0, L); 448 __ stop("StubRoutines::forward exception: no pending exception (1)"); 449 __ bind(L); 450 } 451 __ verify_oop(R3_ARG1, "StubRoutines::forward exception: not an oop"); 452 } 453 #endif 454 455 // Save LR/CR and copy exception pc (LR) into R4_ARG2. 456 __ save_LR_CR(R4_ARG2); 457 __ push_frame_reg_args(0, R0); 458 // Find exception handler. 459 __ call_VM_leaf(CAST_FROM_FN_PTR(address, 460 SharedRuntime::exception_handler_for_return_address), 461 R16_thread, 462 R4_ARG2); 463 // Copy handler's address. 464 __ mtctr(R3_RET); 465 __ pop_frame(); 466 __ restore_LR_CR(R0); 467 468 // Set up the arguments for the exception handler: 469 // - R3_ARG1: exception oop 470 // - R4_ARG2: exception pc. 471 472 // Load pending exception oop. 473 __ ld(R3_ARG1, 474 in_bytes(Thread::pending_exception_offset()), 475 R16_thread); 476 477 // The exception pc is the return address in the caller. 478 // Must load it into R4_ARG2. 479 __ mflr(R4_ARG2); 480 481 #ifdef ASSERT 482 // Make sure exception is set. 483 { 484 Label L; 485 __ cmpdi(CCR0, R3_ARG1, 0); 486 __ bne(CCR0, L); 487 __ stop("StubRoutines::forward exception: no pending exception (2)"); 488 __ bind(L); 489 } 490 #endif 491 492 // Clear the pending exception. 493 __ li(R0, 0); 494 __ std(R0, 495 in_bytes(Thread::pending_exception_offset()), 496 R16_thread); 497 // Jump to exception handler. 
498 __ bctr(); 499 500 return start; 501 } 502 503 #undef __ 504 #define __ masm-> 505 // Continuation point for throwing of implicit exceptions that are 506 // not handled in the current activation. Fabricates an exception 507 // oop and initiates normal exception dispatching in this 508 // frame. Only callee-saved registers are preserved (through the 509 // normal register window / RegisterMap handling). If the compiler 510 // needs all registers to be preserved between the fault point and 511 // the exception handler then it must assume responsibility for that 512 // in AbstractCompiler::continuation_for_implicit_null_exception or 513 // continuation_for_implicit_division_by_zero_exception. All other 514 // implicit exceptions (e.g., NullPointerException or 515 // AbstractMethodError on entry) are either at call sites or 516 // otherwise assume that stack unwinding will be initiated, so 517 // caller saved registers were assumed volatile in the compiler. 518 // 519 // Note that we generate only this stub into a RuntimeStub, because 520 // it needs to be properly traversed and ignored during GC, so we 521 // change the meaning of the "__" macro within this method. 522 // 523 // Note: the routine set_pc_not_at_call_for_caller in 524 // SharedRuntime.cpp requires that this code be generated into a 525 // RuntimeStub. 526 address generate_throw_exception(const char* name, address runtime_entry, bool restore_saved_exception_pc, 527 Register arg1 = noreg, Register arg2 = noreg) { 528 CodeBuffer code(name, 1024 DEBUG_ONLY(+ 512), 0); 529 MacroAssembler* masm = new MacroAssembler(&code); 530 531 OopMapSet* oop_maps = new OopMapSet(); 532 int frame_size_in_bytes = frame::abi_reg_args_size; 533 OopMap* map = new OopMap(frame_size_in_bytes / sizeof(jint), 0); 534 535 address start = __ pc(); 536 537 __ save_LR_CR(R11_scratch1); 538 539 // Push a frame. 540 __ push_frame_reg_args(0, R11_scratch1); 541 542 address frame_complete_pc = __ pc(); 543 544 if (restore_saved_exception_pc) { 545 __ unimplemented("StubGenerator::throw_exception with restore_saved_exception_pc", 74); 546 } 547 548 // Note that we always have a runtime stub frame on the top of 549 // stack by this point. Remember the offset of the instruction 550 // whose address will be moved to R11_scratch1. 551 address gc_map_pc = __ get_PC_trash_LR(R11_scratch1); 552 553 __ set_last_Java_frame(/*sp*/R1_SP, /*pc*/R11_scratch1); 554 555 __ mr(R3_ARG1, R16_thread); 556 if (arg1 != noreg) { 557 __ mr(R4_ARG2, arg1); 558 } 559 if (arg2 != noreg) { 560 __ mr(R5_ARG3, arg2); 561 } 562 #if defined(ABI_ELFv2) 563 __ call_c(runtime_entry, relocInfo::none); 564 #else 565 __ call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, runtime_entry), relocInfo::none); 566 #endif 567 568 // Set an oopmap for the call site. 569 oop_maps->add_gc_map((int)(gc_map_pc - start), map); 570 571 __ reset_last_Java_frame(); 572 573 #ifdef ASSERT 574 // Make sure that this code is only executed if there is a pending 575 // exception. 576 { 577 Label L; 578 __ ld(R0, 579 in_bytes(Thread::pending_exception_offset()), 580 R16_thread); 581 __ cmpdi(CCR0, R0, 0); 582 __ bne(CCR0, L); 583 __ stop("StubRoutines::throw_exception: no pending exception"); 584 __ bind(L); 585 } 586 #endif 587 588 // Pop frame. 589 __ pop_frame(); 590 591 __ restore_LR_CR(R11_scratch1); 592 593 __ load_const(R11_scratch1, StubRoutines::forward_exception_entry()); 594 __ mtctr(R11_scratch1); 595 __ bctr(); 596 597 // Create runtime stub with OopMap. 
598 RuntimeStub* stub = 599 RuntimeStub::new_runtime_stub(name, &code, 600 /*frame_complete=*/ (int)(frame_complete_pc - start), 601 frame_size_in_bytes/wordSize, 602 oop_maps, 603 false); 604 return stub->entry_point(); 605 } 606 #undef __ 607 #define __ _masm-> 608 609 // Generate G1 pre-write barrier for array. 610 // 611 // Input: 612 // from - register containing src address (only needed for spilling) 613 // to - register containing starting address 614 // count - register containing element count 615 // tmp - scratch register 616 // 617 // Kills: 618 // nothing 619 // 620 void gen_write_ref_array_pre_barrier(Register from, Register to, Register count, bool dest_uninitialized, Register Rtmp1, 621 Register preserve1 = noreg, Register preserve2 = noreg) { 622 BarrierSet* const bs = Universe::heap()->barrier_set(); 623 switch (bs->kind()) { 624 case BarrierSet::G1SATBCTLogging: 625 // With G1, don't generate the call if we statically know that the target in uninitialized 626 if (!dest_uninitialized) { 627 int spill_slots = 3; 628 if (preserve1 != noreg) { spill_slots++; } 629 if (preserve2 != noreg) { spill_slots++; } 630 const int frame_size = align_size_up(frame::abi_reg_args_size + spill_slots * BytesPerWord, frame::alignment_in_bytes); 631 Label filtered; 632 633 // Is marking active? 634 if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) { 635 __ lwz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), R16_thread); 636 } else { 637 guarantee(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1, "Assumption"); 638 __ lbz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), R16_thread); 639 } 640 __ cmpdi(CCR0, Rtmp1, 0); 641 __ beq(CCR0, filtered); 642 643 __ save_LR_CR(R0); 644 __ push_frame(frame_size, R0); 645 int slot_nr = 0; 646 __ std(from, frame_size - (++slot_nr) * wordSize, R1_SP); 647 __ std(to, frame_size - (++slot_nr) * wordSize, R1_SP); 648 __ std(count, frame_size - (++slot_nr) * wordSize, R1_SP); 649 if (preserve1 != noreg) { __ std(preserve1, frame_size - (++slot_nr) * wordSize, R1_SP); } 650 if (preserve2 != noreg) { __ std(preserve2, frame_size - (++slot_nr) * wordSize, R1_SP); } 651 652 __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), to, count); 653 654 slot_nr = 0; 655 __ ld(from, frame_size - (++slot_nr) * wordSize, R1_SP); 656 __ ld(to, frame_size - (++slot_nr) * wordSize, R1_SP); 657 __ ld(count, frame_size - (++slot_nr) * wordSize, R1_SP); 658 if (preserve1 != noreg) { __ ld(preserve1, frame_size - (++slot_nr) * wordSize, R1_SP); } 659 if (preserve2 != noreg) { __ ld(preserve2, frame_size - (++slot_nr) * wordSize, R1_SP); } 660 __ addi(R1_SP, R1_SP, frame_size); // pop_frame() 661 __ restore_LR_CR(R0); 662 663 __ bind(filtered); 664 } 665 break; 666 case BarrierSet::CardTableForRS: 667 case BarrierSet::CardTableExtension: 668 case BarrierSet::ModRef: 669 break; 670 default: 671 ShouldNotReachHere(); 672 } 673 } 674 675 // Generate CMS/G1 post-write barrier for array. 676 // 677 // Input: 678 // addr - register containing starting address 679 // count - register containing element count 680 // tmp - scratch register 681 // 682 // The input registers and R0 are overwritten. 
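  // Hedged C-level sketch of the CardTableForRS/CardTableExtension case below
  // (variable names illustrative, not the actual interface):
  //
  //   jbyte* base  = ct->byte_map_base;
  //   size_t first = (uintptr_t)addr >> CardTableModRefBS::card_shift;
  //   size_t last  = ((uintptr_t)addr + (count - 1) * BytesPerHeapOop)
  //                  >> CardTableModRefBS::card_shift;
  //   for (size_t i = first; i <= last; i++) base[i] = 0;   // dirty each card
  //
  // The G1 case instead calls BarrierSet::static_write_ref_array_post in the
  // runtime, preserving the given registers around the call.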
683 // 684 void gen_write_ref_array_post_barrier(Register addr, Register count, Register tmp, Register preserve = noreg) { 685 BarrierSet* const bs = Universe::heap()->barrier_set(); 686 687 switch (bs->kind()) { 688 case BarrierSet::G1SATBCTLogging: 689 { 690 int spill_slots = (preserve != noreg) ? 1 : 0; 691 const int frame_size = align_size_up(frame::abi_reg_args_size + spill_slots * BytesPerWord, frame::alignment_in_bytes); 692 693 __ save_LR_CR(R0); 694 __ push_frame(frame_size, R0); 695 if (preserve != noreg) { __ std(preserve, frame_size - 1 * wordSize, R1_SP); } 696 __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), addr, count); 697 if (preserve != noreg) { __ ld(preserve, frame_size - 1 * wordSize, R1_SP); } 698 __ addi(R1_SP, R1_SP, frame_size); // pop_frame(); 699 __ restore_LR_CR(R0); 700 } 701 break; 702 case BarrierSet::CardTableForRS: 703 case BarrierSet::CardTableExtension: 704 { 705 Label Lskip_loop, Lstore_loop; 706 if (UseConcMarkSweepGC) { 707 // TODO PPC port: contribute optimization / requires shared changes 708 __ release(); 709 } 710 711 CardTableModRefBS* const ct = barrier_set_cast<CardTableModRefBS>(bs); 712 assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code"); 713 assert_different_registers(addr, count, tmp); 714 715 __ sldi(count, count, LogBytesPerHeapOop); 716 __ addi(count, count, -BytesPerHeapOop); 717 __ add(count, addr, count); 718 // Use two shifts to clear out those low order two bits! (Cannot opt. into 1.) 719 __ srdi(addr, addr, CardTableModRefBS::card_shift); 720 __ srdi(count, count, CardTableModRefBS::card_shift); 721 __ subf(count, addr, count); 722 assert_different_registers(R0, addr, count, tmp); 723 __ load_const(tmp, (address)ct->byte_map_base); 724 __ addic_(count, count, 1); 725 __ beq(CCR0, Lskip_loop); 726 __ li(R0, 0); 727 __ mtctr(count); 728 // Byte store loop 729 __ bind(Lstore_loop); 730 __ stbx(R0, tmp, addr); 731 __ addi(addr, addr, 1); 732 __ bdnz(Lstore_loop); 733 __ bind(Lskip_loop); 734 } 735 break; 736 case BarrierSet::ModRef: 737 break; 738 default: 739 ShouldNotReachHere(); 740 } 741 } 742 743 // Support for void zero_words_aligned8(HeapWord* to, size_t count) 744 // 745 // Arguments: 746 // to: 747 // count: 748 // 749 // Destroys: 750 // 751 address generate_zero_words_aligned8() { 752 StubCodeMark mark(this, "StubRoutines", "zero_words_aligned8"); 753 754 // Implemented as in ClearArray. 755 address start = __ function_entry(); 756 757 Register base_ptr_reg = R3_ARG1; // tohw (needs to be 8b aligned) 758 Register cnt_dwords_reg = R4_ARG2; // count (in dwords) 759 Register tmp1_reg = R5_ARG3; 760 Register tmp2_reg = R6_ARG4; 761 Register zero_reg = R7_ARG5; 762 763 // Procedure for large arrays (uses data cache block zero instruction). 764 Label dwloop, fast, fastloop, restloop, lastdword, done; 765 int cl_size = VM_Version::L1_data_cache_line_size(); 766 int cl_dwords = cl_size >> 3; 767 int cl_dwordaddr_bits = exact_log2(cl_dwords); 768 int min_dcbz = 2; // Needs to be positive, apply dcbz only to at least min_dcbz cache lines. 769 770 // Clear up to 128byte boundary if long enough, dword_cnt=(16-(base>>3))%16. 771 __ dcbtst(base_ptr_reg); // Indicate write access to first cache line ... 772 __ andi(tmp2_reg, cnt_dwords_reg, 1); // to check if number of dwords is even. 773 __ srdi_(tmp1_reg, cnt_dwords_reg, 1); // number of double dwords 774 __ load_const_optimized(zero_reg, 0L); // Use as zero register. 775 776 __ cmpdi(CCR1, tmp2_reg, 0); // cnt_dwords even? 
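    // Note: the beq below consumes CCR0 as set by the srdi_ above
    // (cnt_dwords >> 1 == 0, i.e. cnt_dwords <= 1), while CCR1 keeps the
    // parity of cnt_dwords for the trailing-dword decision at 'lastdword'.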
777 __ beq(CCR0, lastdword); // size <= 1 778 __ mtctr(tmp1_reg); // Speculatively preload counter for rest loop (>0). 779 __ cmpdi(CCR0, cnt_dwords_reg, (min_dcbz+1)*cl_dwords-1); // Big enough to ensure >=min_dcbz cache lines are included? 780 __ neg(tmp1_reg, base_ptr_reg); // bit 0..58: bogus, bit 57..60: (16-(base>>3))%16, bit 61..63: 000 781 782 __ blt(CCR0, restloop); // Too small. (<31=(2*cl_dwords)-1 is sufficient, but bigger performs better.) 783 __ rldicl_(tmp1_reg, tmp1_reg, 64-3, 64-cl_dwordaddr_bits); // Extract number of dwords to 128byte boundary=(16-(base>>3))%16. 784 785 __ beq(CCR0, fast); // already 128byte aligned 786 __ mtctr(tmp1_reg); // Set ctr to hit 128byte boundary (0<ctr<cnt). 787 __ subf(cnt_dwords_reg, tmp1_reg, cnt_dwords_reg); // rest (>0 since size>=256-8) 788 789 // Clear in first cache line dword-by-dword if not already 128byte aligned. 790 __ bind(dwloop); 791 __ std(zero_reg, 0, base_ptr_reg); // Clear 8byte aligned block. 792 __ addi(base_ptr_reg, base_ptr_reg, 8); 793 __ bdnz(dwloop); 794 795 // clear 128byte blocks 796 __ bind(fast); 797 __ srdi(tmp1_reg, cnt_dwords_reg, cl_dwordaddr_bits); // loop count for 128byte loop (>0 since size>=256-8) 798 __ andi(tmp2_reg, cnt_dwords_reg, 1); // to check if rest even 799 800 __ mtctr(tmp1_reg); // load counter 801 __ cmpdi(CCR1, tmp2_reg, 0); // rest even? 802 __ rldicl_(tmp1_reg, cnt_dwords_reg, 63, 65-cl_dwordaddr_bits); // rest in double dwords 803 804 __ bind(fastloop); 805 __ dcbz(base_ptr_reg); // Clear 128byte aligned block. 806 __ addi(base_ptr_reg, base_ptr_reg, cl_size); 807 __ bdnz(fastloop); 808 809 //__ dcbtst(base_ptr_reg); // Indicate write access to last cache line. 810 __ beq(CCR0, lastdword); // rest<=1 811 __ mtctr(tmp1_reg); // load counter 812 813 // Clear rest. 814 __ bind(restloop); 815 __ std(zero_reg, 0, base_ptr_reg); // Clear 8byte aligned block. 816 __ std(zero_reg, 8, base_ptr_reg); // Clear 8byte aligned block. 817 __ addi(base_ptr_reg, base_ptr_reg, 16); 818 __ bdnz(restloop); 819 820 __ bind(lastdword); 821 __ beq(CCR1, done); 822 __ std(zero_reg, 0, base_ptr_reg); 823 __ bind(done); 824 __ blr(); // return 825 826 return start; 827 } 828 829 // The following routine generates a subroutine to throw an asynchronous 830 // UnknownError when an unsafe access gets a fault that could not be 831 // reasonably prevented by the programmer. (Example: SIGBUS/OBJERR.) 832 // 833 address generate_handler_for_unsafe_access() { 834 StubCodeMark mark(this, "StubRoutines", "handler_for_unsafe_access"); 835 address start = __ function_entry(); 836 __ unimplemented("StubRoutines::handler_for_unsafe_access", 93); 837 return start; 838 } 839 840 #if !defined(PRODUCT) 841 // Wrapper which calls oopDesc::is_oop_or_null() 842 // Only called by MacroAssembler::verify_oop 843 static void verify_oop_helper(const char* message, oop o) { 844 if (!o->is_oop_or_null()) { 845 fatal("%s", message); 846 } 847 ++ StubRoutines::_verify_oop_count; 848 } 849 #endif 850 851 // Return address of code to be called from code generated by 852 // MacroAssembler::verify_oop. 853 // 854 // Don't generate, rather use C++ code. 855 address generate_verify_oop() { 856 // this is actually a `FunctionDescriptor*'. 857 address start = 0; 858 859 #if !defined(PRODUCT) 860 start = CAST_FROM_FN_PTR(address, verify_oop_helper); 861 #endif 862 863 return start; 864 } 865 866 // Fairer handling of safepoints for native methods. 867 // 868 // Generate code which reads from the polling page. 
This special handling is needed as the 869 // linux-ppc64 kernel before 2.6.6 doesn't set si_addr on some segfaults in 64bit mode 870 // (cf. http://www.kernel.org/pub/linux/kernel/v2.6/ChangeLog-2.6.6), especially when we try 871 // to read from the safepoint polling page. 872 address generate_load_from_poll() { 873 StubCodeMark mark(this, "StubRoutines", "generate_load_from_poll"); 874 address start = __ function_entry(); 875 __ unimplemented("StubRoutines::verify_oop", 95); // TODO PPC port 876 return start; 877 } 878 879 // -XX:+OptimizeFill : convert fill/copy loops into intrinsic 880 // 881 // The code is implemented(ported from sparc) as we believe it benefits JVM98, however 882 // tracing(-XX:+TraceOptimizeFill) shows the intrinsic replacement doesn't happen at all! 883 // 884 // Source code in function is_range_check_if() shows that OptimizeFill relaxed the condition 885 // for turning on loop predication optimization, and hence the behavior of "array range check" 886 // and "loop invariant check" could be influenced, which potentially boosted JVM98. 887 // 888 // Generate stub for disjoint short fill. If "aligned" is true, the 889 // "to" address is assumed to be heapword aligned. 890 // 891 // Arguments for generated stub: 892 // to: R3_ARG1 893 // value: R4_ARG2 894 // count: R5_ARG3 treated as signed 895 // 896 address generate_fill(BasicType t, bool aligned, const char* name) { 897 StubCodeMark mark(this, "StubRoutines", name); 898 address start = __ function_entry(); 899 900 const Register to = R3_ARG1; // source array address 901 const Register value = R4_ARG2; // fill value 902 const Register count = R5_ARG3; // elements count 903 const Register temp = R6_ARG4; // temp register 904 905 //assert_clean_int(count, O3); // Make sure 'count' is clean int. 906 907 Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte; 908 Label L_fill_2_bytes, L_fill_4_bytes, L_fill_elements, L_fill_32_bytes; 909 910 int shift = -1; 911 switch (t) { 912 case T_BYTE: 913 shift = 2; 914 // Clone bytes (zero extend not needed because store instructions below ignore high order bytes). 915 __ rldimi(value, value, 8, 48); // 8 bit -> 16 bit 916 __ cmpdi(CCR0, count, 2<<shift); // Short arrays (< 8 bytes) fill by element. 917 __ blt(CCR0, L_fill_elements); 918 __ rldimi(value, value, 16, 32); // 16 bit -> 32 bit 919 break; 920 case T_SHORT: 921 shift = 1; 922 // Clone bytes (zero extend not needed because store instructions below ignore high order bytes). 923 __ rldimi(value, value, 16, 32); // 16 bit -> 32 bit 924 __ cmpdi(CCR0, count, 2<<shift); // Short arrays (< 8 bytes) fill by element. 925 __ blt(CCR0, L_fill_elements); 926 break; 927 case T_INT: 928 shift = 0; 929 __ cmpdi(CCR0, count, 2<<shift); // Short arrays (< 8 bytes) fill by element. 930 __ blt(CCR0, L_fill_4_bytes); 931 break; 932 default: ShouldNotReachHere(); 933 } 934 935 if (!aligned && (t == T_BYTE || t == T_SHORT)) { 936 // Align source address at 4 bytes address boundary. 937 if (t == T_BYTE) { 938 // One byte misalignment happens only for byte arrays. 939 __ andi_(temp, to, 1); 940 __ beq(CCR0, L_skip_align1); 941 __ stb(value, 0, to); 942 __ addi(to, to, 1); 943 __ addi(count, count, -1); 944 __ bind(L_skip_align1); 945 } 946 // Two bytes misalignment happens only for byte and short (char) arrays. 
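      // Worked example of the value replication above (illustrative): for
      // T_BYTE with value 0x5A, the rldimi steps produced 0x5A5A (8->16 bit)
      // and then 0x5A5A5A5A (16->32 bit); the final 32->64 bit step happens
      // just before the 32-byte loop below, so the byte/halfword/word stores
      // used by these alignment steps already see the correctly replicated
      // pattern in the low-order bits.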
947 __ andi_(temp, to, 2); 948 __ beq(CCR0, L_skip_align2); 949 __ sth(value, 0, to); 950 __ addi(to, to, 2); 951 __ addi(count, count, -(1 << (shift - 1))); 952 __ bind(L_skip_align2); 953 } 954 955 if (!aligned) { 956 // Align to 8 bytes, we know we are 4 byte aligned to start. 957 __ andi_(temp, to, 7); 958 __ beq(CCR0, L_fill_32_bytes); 959 __ stw(value, 0, to); 960 __ addi(to, to, 4); 961 __ addi(count, count, -(1 << shift)); 962 __ bind(L_fill_32_bytes); 963 } 964 965 __ li(temp, 8<<shift); // Prepare for 32 byte loop. 966 // Clone bytes int->long as above. 967 __ rldimi(value, value, 32, 0); // 32 bit -> 64 bit 968 969 Label L_check_fill_8_bytes; 970 // Fill 32-byte chunks. 971 __ subf_(count, temp, count); 972 __ blt(CCR0, L_check_fill_8_bytes); 973 974 Label L_fill_32_bytes_loop; 975 __ align(32); 976 __ bind(L_fill_32_bytes_loop); 977 978 __ std(value, 0, to); 979 __ std(value, 8, to); 980 __ subf_(count, temp, count); // Update count. 981 __ std(value, 16, to); 982 __ std(value, 24, to); 983 984 __ addi(to, to, 32); 985 __ bge(CCR0, L_fill_32_bytes_loop); 986 987 __ bind(L_check_fill_8_bytes); 988 __ add_(count, temp, count); 989 __ beq(CCR0, L_exit); 990 __ addic_(count, count, -(2 << shift)); 991 __ blt(CCR0, L_fill_4_bytes); 992 993 // 994 // Length is too short, just fill 8 bytes at a time. 995 // 996 Label L_fill_8_bytes_loop; 997 __ bind(L_fill_8_bytes_loop); 998 __ std(value, 0, to); 999 __ addic_(count, count, -(2 << shift)); 1000 __ addi(to, to, 8); 1001 __ bge(CCR0, L_fill_8_bytes_loop); 1002 1003 // Fill trailing 4 bytes. 1004 __ bind(L_fill_4_bytes); 1005 __ andi_(temp, count, 1<<shift); 1006 __ beq(CCR0, L_fill_2_bytes); 1007 1008 __ stw(value, 0, to); 1009 if (t == T_BYTE || t == T_SHORT) { 1010 __ addi(to, to, 4); 1011 // Fill trailing 2 bytes. 1012 __ bind(L_fill_2_bytes); 1013 __ andi_(temp, count, 1<<(shift-1)); 1014 __ beq(CCR0, L_fill_byte); 1015 __ sth(value, 0, to); 1016 if (t == T_BYTE) { 1017 __ addi(to, to, 2); 1018 // Fill trailing byte. 1019 __ bind(L_fill_byte); 1020 __ andi_(count, count, 1); 1021 __ beq(CCR0, L_exit); 1022 __ stb(value, 0, to); 1023 } else { 1024 __ bind(L_fill_byte); 1025 } 1026 } else { 1027 __ bind(L_fill_2_bytes); 1028 } 1029 __ bind(L_exit); 1030 __ blr(); 1031 1032 // Handle copies less than 8 bytes. Int is handled elsewhere. 1033 if (t == T_BYTE) { 1034 __ bind(L_fill_elements); 1035 Label L_fill_2, L_fill_4; 1036 __ andi_(temp, count, 1); 1037 __ beq(CCR0, L_fill_2); 1038 __ stb(value, 0, to); 1039 __ addi(to, to, 1); 1040 __ bind(L_fill_2); 1041 __ andi_(temp, count, 2); 1042 __ beq(CCR0, L_fill_4); 1043 __ stb(value, 0, to); 1044 __ stb(value, 0, to); 1045 __ addi(to, to, 2); 1046 __ bind(L_fill_4); 1047 __ andi_(temp, count, 4); 1048 __ beq(CCR0, L_exit); 1049 __ stb(value, 0, to); 1050 __ stb(value, 1, to); 1051 __ stb(value, 2, to); 1052 __ stb(value, 3, to); 1053 __ blr(); 1054 } 1055 1056 if (t == T_SHORT) { 1057 Label L_fill_2; 1058 __ bind(L_fill_elements); 1059 __ andi_(temp, count, 1); 1060 __ beq(CCR0, L_fill_2); 1061 __ sth(value, 0, to); 1062 __ addi(to, to, 2); 1063 __ bind(L_fill_2); 1064 __ andi_(temp, count, 2); 1065 __ beq(CCR0, L_exit); 1066 __ sth(value, 0, to); 1067 __ sth(value, 2, to); 1068 __ blr(); 1069 } 1070 return start; 1071 } 1072 1073 inline void assert_positive_int(Register count) { 1074 #ifdef ASSERT 1075 __ srdi_(R0, count, 31); 1076 __ asm_assert_eq("missing zero extend", 0xAFFE); 1077 #endif 1078 } 1079 1080 // Generate overlap test for array copy stubs. 
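  // Hedged C-level sketch of the test emitted below (names illustrative):
  //
  //   bool need_backward_copy =
  //       from < to &&                                              // dst above src
  //       (size_t)(to - from) < ((size_t)count << log2_elem_size);  // distance < byte size
  //   if (!need_backward_copy) goto no_overlap_target;              // forward copy is safe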
1081 // 1082 // Input: 1083 // R3_ARG1 - from 1084 // R4_ARG2 - to 1085 // R5_ARG3 - element count 1086 // 1087 void array_overlap_test(address no_overlap_target, int log2_elem_size) { 1088 Register tmp1 = R6_ARG4; 1089 Register tmp2 = R7_ARG5; 1090 1091 assert_positive_int(R5_ARG3); 1092 1093 __ subf(tmp1, R3_ARG1, R4_ARG2); // distance in bytes 1094 __ sldi(tmp2, R5_ARG3, log2_elem_size); // size in bytes 1095 __ cmpld(CCR0, R3_ARG1, R4_ARG2); // Use unsigned comparison! 1096 __ cmpld(CCR1, tmp1, tmp2); 1097 __ crnand(CCR0, Assembler::less, CCR1, Assembler::less); 1098 // Overlaps if Src before dst and distance smaller than size. 1099 // Branch to forward copy routine otherwise (within range of 32kB). 1100 __ bc(Assembler::bcondCRbiIs1, Assembler::bi0(CCR0, Assembler::less), no_overlap_target); 1101 1102 // need to copy backwards 1103 } 1104 1105 // The guideline in the implementations of generate_disjoint_xxx_copy 1106 // (xxx=byte,short,int,long,oop) is to copy as many elements as possible with 1107 // single instructions, but to avoid alignment interrupts (see subsequent 1108 // comment). Furthermore, we try to minimize misaligned access, even 1109 // though they cause no alignment interrupt. 1110 // 1111 // In Big-Endian mode, the PowerPC architecture requires implementations to 1112 // handle automatically misaligned integer halfword and word accesses, 1113 // word-aligned integer doubleword accesses, and word-aligned floating-point 1114 // accesses. Other accesses may or may not generate an Alignment interrupt 1115 // depending on the implementation. 1116 // Alignment interrupt handling may require on the order of hundreds of cycles, 1117 // so every effort should be made to avoid misaligned memory values. 1118 // 1119 // 1120 // Generate stub for disjoint byte copy. If "aligned" is true, the 1121 // "from" and "to" addresses are assumed to be heapword aligned. 1122 // 1123 // Arguments for generated stub: 1124 // from: R3_ARG1 1125 // to: R4_ARG2 1126 // count: R5_ARG3 treated as signed 1127 // 1128 address generate_disjoint_byte_copy(bool aligned, const char * name) { 1129 StubCodeMark mark(this, "StubRoutines", name); 1130 address start = __ function_entry(); 1131 assert_positive_int(R5_ARG3); 1132 1133 Register tmp1 = R6_ARG4; 1134 Register tmp2 = R7_ARG5; 1135 Register tmp3 = R8_ARG6; 1136 Register tmp4 = R9_ARG7; 1137 1138 Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9; 1139 1140 // Don't try anything fancy if arrays don't have many elements. 1141 __ li(tmp3, 0); 1142 __ cmpwi(CCR0, R5_ARG3, 17); 1143 __ ble(CCR0, l_6); // copy 4 at a time 1144 1145 if (!aligned) { 1146 __ xorr(tmp1, R3_ARG1, R4_ARG2); 1147 __ andi_(tmp1, tmp1, 3); 1148 __ bne(CCR0, l_6); // If arrays don't have the same alignment mod 4, do 4 element copy. 1149 1150 // Copy elements if necessary to align to 4 bytes. 1151 __ neg(tmp1, R3_ARG1); // Compute distance to alignment boundary. 
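      // Illustrative arithmetic: (-from) & 3 is the number of leading bytes
      // to copy so that 'from' becomes 4-byte aligned, e.g. an address ending
      // in ...01 yields 3, ...10 yields 2, ...11 yields 1, and an already
      // aligned address yields 0 (the beq below).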
1152 __ andi_(tmp1, tmp1, 3); 1153 __ beq(CCR0, l_2); 1154 1155 __ subf(R5_ARG3, tmp1, R5_ARG3); 1156 __ bind(l_9); 1157 __ lbz(tmp2, 0, R3_ARG1); 1158 __ addic_(tmp1, tmp1, -1); 1159 __ stb(tmp2, 0, R4_ARG2); 1160 __ addi(R3_ARG1, R3_ARG1, 1); 1161 __ addi(R4_ARG2, R4_ARG2, 1); 1162 __ bne(CCR0, l_9); 1163 1164 __ bind(l_2); 1165 } 1166 1167 // copy 8 elements at a time 1168 __ xorr(tmp2, R3_ARG1, R4_ARG2); // skip if src & dest have differing alignment mod 8 1169 __ andi_(tmp1, tmp2, 7); 1170 __ bne(CCR0, l_7); // not same alignment -> to or from is aligned -> copy 8 1171 1172 // copy a 2-element word if necessary to align to 8 bytes 1173 __ andi_(R0, R3_ARG1, 7); 1174 __ beq(CCR0, l_7); 1175 1176 __ lwzx(tmp2, R3_ARG1, tmp3); 1177 __ addi(R5_ARG3, R5_ARG3, -4); 1178 __ stwx(tmp2, R4_ARG2, tmp3); 1179 { // FasterArrayCopy 1180 __ addi(R3_ARG1, R3_ARG1, 4); 1181 __ addi(R4_ARG2, R4_ARG2, 4); 1182 } 1183 __ bind(l_7); 1184 1185 { // FasterArrayCopy 1186 __ cmpwi(CCR0, R5_ARG3, 31); 1187 __ ble(CCR0, l_6); // copy 2 at a time if less than 32 elements remain 1188 1189 __ srdi(tmp1, R5_ARG3, 5); 1190 __ andi_(R5_ARG3, R5_ARG3, 31); 1191 __ mtctr(tmp1); 1192 1193 __ bind(l_8); 1194 // Use unrolled version for mass copying (copy 32 elements a time) 1195 // Load feeding store gets zero latency on Power6, however not on Power5. 1196 // Therefore, the following sequence is made for the good of both. 1197 __ ld(tmp1, 0, R3_ARG1); 1198 __ ld(tmp2, 8, R3_ARG1); 1199 __ ld(tmp3, 16, R3_ARG1); 1200 __ ld(tmp4, 24, R3_ARG1); 1201 __ std(tmp1, 0, R4_ARG2); 1202 __ std(tmp2, 8, R4_ARG2); 1203 __ std(tmp3, 16, R4_ARG2); 1204 __ std(tmp4, 24, R4_ARG2); 1205 __ addi(R3_ARG1, R3_ARG1, 32); 1206 __ addi(R4_ARG2, R4_ARG2, 32); 1207 __ bdnz(l_8); 1208 } 1209 1210 __ bind(l_6); 1211 1212 // copy 4 elements at a time 1213 __ cmpwi(CCR0, R5_ARG3, 4); 1214 __ blt(CCR0, l_1); 1215 __ srdi(tmp1, R5_ARG3, 2); 1216 __ mtctr(tmp1); // is > 0 1217 __ andi_(R5_ARG3, R5_ARG3, 3); 1218 1219 { // FasterArrayCopy 1220 __ addi(R3_ARG1, R3_ARG1, -4); 1221 __ addi(R4_ARG2, R4_ARG2, -4); 1222 __ bind(l_3); 1223 __ lwzu(tmp2, 4, R3_ARG1); 1224 __ stwu(tmp2, 4, R4_ARG2); 1225 __ bdnz(l_3); 1226 __ addi(R3_ARG1, R3_ARG1, 4); 1227 __ addi(R4_ARG2, R4_ARG2, 4); 1228 } 1229 1230 // do single element copy 1231 __ bind(l_1); 1232 __ cmpwi(CCR0, R5_ARG3, 0); 1233 __ beq(CCR0, l_4); 1234 1235 { // FasterArrayCopy 1236 __ mtctr(R5_ARG3); 1237 __ addi(R3_ARG1, R3_ARG1, -1); 1238 __ addi(R4_ARG2, R4_ARG2, -1); 1239 1240 __ bind(l_5); 1241 __ lbzu(tmp2, 1, R3_ARG1); 1242 __ stbu(tmp2, 1, R4_ARG2); 1243 __ bdnz(l_5); 1244 } 1245 1246 __ bind(l_4); 1247 __ li(R3_RET, 0); // return 0 1248 __ blr(); 1249 1250 return start; 1251 } 1252 1253 // Generate stub for conjoint byte copy. If "aligned" is true, the 1254 // "from" and "to" addresses are assumed to be heapword aligned. 1255 // 1256 // Arguments for generated stub: 1257 // from: R3_ARG1 1258 // to: R4_ARG2 1259 // count: R5_ARG3 treated as signed 1260 // 1261 address generate_conjoint_byte_copy(bool aligned, const char * name) { 1262 StubCodeMark mark(this, "StubRoutines", name); 1263 address start = __ function_entry(); 1264 assert_positive_int(R5_ARG3); 1265 1266 Register tmp1 = R6_ARG4; 1267 Register tmp2 = R7_ARG5; 1268 Register tmp3 = R8_ARG6; 1269 1270 address nooverlap_target = aligned ? 1271 STUB_ENTRY(arrayof_jbyte_disjoint_arraycopy) : 1272 STUB_ENTRY(jbyte_disjoint_arraycopy); 1273 1274 array_overlap_test(nooverlap_target, 0); 1275 // Do reverse copy. 
We assume the case of actual overlap is rare enough 1276 // that we don't have to optimize it. 1277 Label l_1, l_2; 1278 1279 __ b(l_2); 1280 __ bind(l_1); 1281 __ stbx(tmp1, R4_ARG2, R5_ARG3); 1282 __ bind(l_2); 1283 __ addic_(R5_ARG3, R5_ARG3, -1); 1284 __ lbzx(tmp1, R3_ARG1, R5_ARG3); 1285 __ bge(CCR0, l_1); 1286 1287 __ li(R3_RET, 0); // return 0 1288 __ blr(); 1289 1290 return start; 1291 } 1292 1293 // Generate stub for disjoint short copy. If "aligned" is true, the 1294 // "from" and "to" addresses are assumed to be heapword aligned. 1295 // 1296 // Arguments for generated stub: 1297 // from: R3_ARG1 1298 // to: R4_ARG2 1299 // elm.count: R5_ARG3 treated as signed 1300 // 1301 // Strategy for aligned==true: 1302 // 1303 // If length <= 9: 1304 // 1. copy 2 elements at a time (l_6) 1305 // 2. copy last element if original element count was odd (l_1) 1306 // 1307 // If length > 9: 1308 // 1. copy 4 elements at a time until less than 4 elements are left (l_7) 1309 // 2. copy 2 elements at a time until less than 2 elements are left (l_6) 1310 // 3. copy last element if one was left in step 2. (l_1) 1311 // 1312 // 1313 // Strategy for aligned==false: 1314 // 1315 // If length <= 9: same as aligned==true case, but NOTE: load/stores 1316 // can be unaligned (see comment below) 1317 // 1318 // If length > 9: 1319 // 1. continue with step 6. if the alignment of from and to mod 4 1320 // is different. 1321 // 2. align from and to to 4 bytes by copying 1 element if necessary 1322 // 3. at l_2 from and to are 4 byte aligned; continue with 1323 // 5. if they cannot be aligned to 8 bytes because they have 1324 // got different alignment mod 8. 1325 // 4. at this point we know that both, from and to, have the same 1326 // alignment mod 8, now copy one element if necessary to get 1327 // 8 byte alignment of from and to. 1328 // 5. copy 4 elements at a time until less than 4 elements are 1329 // left; depending on step 3. all load/stores are aligned or 1330 // either all loads or all stores are unaligned. 1331 // 6. copy 2 elements at a time until less than 2 elements are 1332 // left (l_6); arriving here from step 1., there is a chance 1333 // that all accesses are unaligned. 1334 // 7. copy last element if one was left in step 6. (l_1) 1335 // 1336 // There are unaligned data accesses using integer load/store 1337 // instructions in this stub. POWER allows such accesses. 1338 // 1339 // According to the manuals (PowerISA_V2.06_PUBLIC, Book II, 1340 // Chapter 2: Effect of Operand Placement on Performance) unaligned 1341 // integer load/stores have good performance. Only unaligned 1342 // floating point load/stores can have poor performance. 1343 // 1344 // TODO: 1345 // 1346 // 1. 
check if aligning the backbranch target of loops is beneficial 1347 // 1348 address generate_disjoint_short_copy(bool aligned, const char * name) { 1349 StubCodeMark mark(this, "StubRoutines", name); 1350 1351 Register tmp1 = R6_ARG4; 1352 Register tmp2 = R7_ARG5; 1353 Register tmp3 = R8_ARG6; 1354 Register tmp4 = R9_ARG7; 1355 1356 address start = __ function_entry(); 1357 assert_positive_int(R5_ARG3); 1358 1359 Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8; 1360 1361 // don't try anything fancy if arrays don't have many elements 1362 __ li(tmp3, 0); 1363 __ cmpwi(CCR0, R5_ARG3, 9); 1364 __ ble(CCR0, l_6); // copy 2 at a time 1365 1366 if (!aligned) { 1367 __ xorr(tmp1, R3_ARG1, R4_ARG2); 1368 __ andi_(tmp1, tmp1, 3); 1369 __ bne(CCR0, l_6); // if arrays don't have the same alignment mod 4, do 2 element copy 1370 1371 // At this point it is guaranteed that both, from and to have the same alignment mod 4. 1372 1373 // Copy 1 element if necessary to align to 4 bytes. 1374 __ andi_(tmp1, R3_ARG1, 3); 1375 __ beq(CCR0, l_2); 1376 1377 __ lhz(tmp2, 0, R3_ARG1); 1378 __ addi(R3_ARG1, R3_ARG1, 2); 1379 __ sth(tmp2, 0, R4_ARG2); 1380 __ addi(R4_ARG2, R4_ARG2, 2); 1381 __ addi(R5_ARG3, R5_ARG3, -1); 1382 __ bind(l_2); 1383 1384 // At this point the positions of both, from and to, are at least 4 byte aligned. 1385 1386 // Copy 4 elements at a time. 1387 // Align to 8 bytes, but only if both, from and to, have same alignment mod 8. 1388 __ xorr(tmp2, R3_ARG1, R4_ARG2); 1389 __ andi_(tmp1, tmp2, 7); 1390 __ bne(CCR0, l_7); // not same alignment mod 8 -> copy 4, either from or to will be unaligned 1391 1392 // Copy a 2-element word if necessary to align to 8 bytes. 1393 __ andi_(R0, R3_ARG1, 7); 1394 __ beq(CCR0, l_7); 1395 1396 __ lwzx(tmp2, R3_ARG1, tmp3); 1397 __ addi(R5_ARG3, R5_ARG3, -2); 1398 __ stwx(tmp2, R4_ARG2, tmp3); 1399 { // FasterArrayCopy 1400 __ addi(R3_ARG1, R3_ARG1, 4); 1401 __ addi(R4_ARG2, R4_ARG2, 4); 1402 } 1403 } 1404 1405 __ bind(l_7); 1406 1407 // Copy 4 elements at a time; either the loads or the stores can 1408 // be unaligned if aligned == false. 1409 1410 { // FasterArrayCopy 1411 __ cmpwi(CCR0, R5_ARG3, 15); 1412 __ ble(CCR0, l_6); // copy 2 at a time if less than 16 elements remain 1413 1414 __ srdi(tmp1, R5_ARG3, 4); 1415 __ andi_(R5_ARG3, R5_ARG3, 15); 1416 __ mtctr(tmp1); 1417 1418 __ bind(l_8); 1419 // Use unrolled version for mass copying (copy 16 elements a time). 1420 // Load feeding store gets zero latency on Power6, however not on Power5. 1421 // Therefore, the following sequence is made for the good of both. 
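      // Hedged C-level view of one iteration of the loop body below (CTR was
      // preloaded with elements/16 above; loads/stores may be unaligned when
      // aligned == false, which POWER tolerates, see the note above this stub):
      //
      //   ((uint64_t*)to)[0] = ((uint64_t*)from)[0];   // 4 x 8 bytes
      //   ((uint64_t*)to)[1] = ((uint64_t*)from)[1];   //   = 16 jshort
      //   ((uint64_t*)to)[2] = ((uint64_t*)from)[2];   //     elements
      //   ((uint64_t*)to)[3] = ((uint64_t*)from)[3];
      //   from += 32; to += 32;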
1422 __ ld(tmp1, 0, R3_ARG1); 1423 __ ld(tmp2, 8, R3_ARG1); 1424 __ ld(tmp3, 16, R3_ARG1); 1425 __ ld(tmp4, 24, R3_ARG1); 1426 __ std(tmp1, 0, R4_ARG2); 1427 __ std(tmp2, 8, R4_ARG2); 1428 __ std(tmp3, 16, R4_ARG2); 1429 __ std(tmp4, 24, R4_ARG2); 1430 __ addi(R3_ARG1, R3_ARG1, 32); 1431 __ addi(R4_ARG2, R4_ARG2, 32); 1432 __ bdnz(l_8); 1433 } 1434 __ bind(l_6); 1435 1436 // copy 2 elements at a time 1437 { // FasterArrayCopy 1438 __ cmpwi(CCR0, R5_ARG3, 2); 1439 __ blt(CCR0, l_1); 1440 __ srdi(tmp1, R5_ARG3, 1); 1441 __ andi_(R5_ARG3, R5_ARG3, 1); 1442 1443 __ addi(R3_ARG1, R3_ARG1, -4); 1444 __ addi(R4_ARG2, R4_ARG2, -4); 1445 __ mtctr(tmp1); 1446 1447 __ bind(l_3); 1448 __ lwzu(tmp2, 4, R3_ARG1); 1449 __ stwu(tmp2, 4, R4_ARG2); 1450 __ bdnz(l_3); 1451 1452 __ addi(R3_ARG1, R3_ARG1, 4); 1453 __ addi(R4_ARG2, R4_ARG2, 4); 1454 } 1455 1456 // do single element copy 1457 __ bind(l_1); 1458 __ cmpwi(CCR0, R5_ARG3, 0); 1459 __ beq(CCR0, l_4); 1460 1461 { // FasterArrayCopy 1462 __ mtctr(R5_ARG3); 1463 __ addi(R3_ARG1, R3_ARG1, -2); 1464 __ addi(R4_ARG2, R4_ARG2, -2); 1465 1466 __ bind(l_5); 1467 __ lhzu(tmp2, 2, R3_ARG1); 1468 __ sthu(tmp2, 2, R4_ARG2); 1469 __ bdnz(l_5); 1470 } 1471 __ bind(l_4); 1472 __ li(R3_RET, 0); // return 0 1473 __ blr(); 1474 1475 return start; 1476 } 1477 1478 // Generate stub for conjoint short copy. If "aligned" is true, the 1479 // "from" and "to" addresses are assumed to be heapword aligned. 1480 // 1481 // Arguments for generated stub: 1482 // from: R3_ARG1 1483 // to: R4_ARG2 1484 // count: R5_ARG3 treated as signed 1485 // 1486 address generate_conjoint_short_copy(bool aligned, const char * name) { 1487 StubCodeMark mark(this, "StubRoutines", name); 1488 address start = __ function_entry(); 1489 assert_positive_int(R5_ARG3); 1490 1491 Register tmp1 = R6_ARG4; 1492 Register tmp2 = R7_ARG5; 1493 Register tmp3 = R8_ARG6; 1494 1495 address nooverlap_target = aligned ? 1496 STUB_ENTRY(arrayof_jshort_disjoint_arraycopy) : 1497 STUB_ENTRY(jshort_disjoint_arraycopy); 1498 1499 array_overlap_test(nooverlap_target, 1); 1500 1501 Label l_1, l_2; 1502 __ sldi(tmp1, R5_ARG3, 1); 1503 __ b(l_2); 1504 __ bind(l_1); 1505 __ sthx(tmp2, R4_ARG2, tmp1); 1506 __ bind(l_2); 1507 __ addic_(tmp1, tmp1, -2); 1508 __ lhzx(tmp2, R3_ARG1, tmp1); 1509 __ bge(CCR0, l_1); 1510 1511 __ li(R3_RET, 0); // return 0 1512 __ blr(); 1513 1514 return start; 1515 } 1516 1517 // Generate core code for disjoint int copy (and oop copy on 32-bit). If "aligned" 1518 // is true, the "from" and "to" addresses are assumed to be heapword aligned. 1519 // 1520 // Arguments: 1521 // from: R3_ARG1 1522 // to: R4_ARG2 1523 // count: R5_ARG3 treated as signed 1524 // 1525 void generate_disjoint_int_copy_core(bool aligned) { 1526 Register tmp1 = R6_ARG4; 1527 Register tmp2 = R7_ARG5; 1528 Register tmp3 = R8_ARG6; 1529 Register tmp4 = R0; 1530 1531 Label l_1, l_2, l_3, l_4, l_5, l_6; 1532 1533 // for short arrays, just do single element copy 1534 __ li(tmp3, 0); 1535 __ cmpwi(CCR0, R5_ARG3, 5); 1536 __ ble(CCR0, l_2); 1537 1538 if (!aligned) { 1539 // check if arrays have same alignment mod 8. 1540 __ xorr(tmp1, R3_ARG1, R4_ARG2); 1541 __ andi_(R0, tmp1, 7); 1542 // Not the same alignment, but ld and std just need to be 4 byte aligned. 
1543 __ bne(CCR0, l_4); // to OR from is 8 byte aligned -> copy 2 at a time 1544 1545 // copy 1 element to align to and from on an 8 byte boundary 1546 __ andi_(R0, R3_ARG1, 7); 1547 __ beq(CCR0, l_4); 1548 1549 __ lwzx(tmp2, R3_ARG1, tmp3); 1550 __ addi(R5_ARG3, R5_ARG3, -1); 1551 __ stwx(tmp2, R4_ARG2, tmp3); 1552 { // FasterArrayCopy 1553 __ addi(R3_ARG1, R3_ARG1, 4); 1554 __ addi(R4_ARG2, R4_ARG2, 4); 1555 } 1556 __ bind(l_4); 1557 } 1558 1559 { // FasterArrayCopy 1560 __ cmpwi(CCR0, R5_ARG3, 7); 1561 __ ble(CCR0, l_2); // copy 1 at a time if less than 8 elements remain 1562 1563 __ srdi(tmp1, R5_ARG3, 3); 1564 __ andi_(R5_ARG3, R5_ARG3, 7); 1565 __ mtctr(tmp1); 1566 1567 __ bind(l_6); 1568 // Use unrolled version for mass copying (copy 8 elements a time). 1569 // Load feeding store gets zero latency on power6, however not on power 5. 1570 // Therefore, the following sequence is made for the good of both. 1571 __ ld(tmp1, 0, R3_ARG1); 1572 __ ld(tmp2, 8, R3_ARG1); 1573 __ ld(tmp3, 16, R3_ARG1); 1574 __ ld(tmp4, 24, R3_ARG1); 1575 __ std(tmp1, 0, R4_ARG2); 1576 __ std(tmp2, 8, R4_ARG2); 1577 __ std(tmp3, 16, R4_ARG2); 1578 __ std(tmp4, 24, R4_ARG2); 1579 __ addi(R3_ARG1, R3_ARG1, 32); 1580 __ addi(R4_ARG2, R4_ARG2, 32); 1581 __ bdnz(l_6); 1582 } 1583 1584 // copy 1 element at a time 1585 __ bind(l_2); 1586 __ cmpwi(CCR0, R5_ARG3, 0); 1587 __ beq(CCR0, l_1); 1588 1589 { // FasterArrayCopy 1590 __ mtctr(R5_ARG3); 1591 __ addi(R3_ARG1, R3_ARG1, -4); 1592 __ addi(R4_ARG2, R4_ARG2, -4); 1593 1594 __ bind(l_3); 1595 __ lwzu(tmp2, 4, R3_ARG1); 1596 __ stwu(tmp2, 4, R4_ARG2); 1597 __ bdnz(l_3); 1598 } 1599 1600 __ bind(l_1); 1601 return; 1602 } 1603 1604 // Generate stub for disjoint int copy. If "aligned" is true, the 1605 // "from" and "to" addresses are assumed to be heapword aligned. 1606 // 1607 // Arguments for generated stub: 1608 // from: R3_ARG1 1609 // to: R4_ARG2 1610 // count: R5_ARG3 treated as signed 1611 // 1612 address generate_disjoint_int_copy(bool aligned, const char * name) { 1613 StubCodeMark mark(this, "StubRoutines", name); 1614 address start = __ function_entry(); 1615 assert_positive_int(R5_ARG3); 1616 generate_disjoint_int_copy_core(aligned); 1617 __ li(R3_RET, 0); // return 0 1618 __ blr(); 1619 return start; 1620 } 1621 1622 // Generate core code for conjoint int copy (and oop copy on 1623 // 32-bit). If "aligned" is true, the "from" and "to" addresses 1624 // are assumed to be heapword aligned. 1625 // 1626 // Arguments: 1627 // from: R3_ARG1 1628 // to: R4_ARG2 1629 // count: R5_ARG3 treated as signed 1630 // 1631 void generate_conjoint_int_copy_core(bool aligned) { 1632 // Do reverse copy. We assume the case of actual overlap is rare enough 1633 // that we don't have to optimize it. 1634 1635 Label l_1, l_2, l_3, l_4, l_5, l_6; 1636 1637 Register tmp1 = R6_ARG4; 1638 Register tmp2 = R7_ARG5; 1639 Register tmp3 = R8_ARG6; 1640 Register tmp4 = R0; 1641 1642 { // FasterArrayCopy 1643 __ cmpwi(CCR0, R5_ARG3, 0); 1644 __ beq(CCR0, l_6); 1645 1646 __ sldi(R5_ARG3, R5_ARG3, 2); 1647 __ add(R3_ARG1, R3_ARG1, R5_ARG3); 1648 __ add(R4_ARG2, R4_ARG2, R5_ARG3); 1649 __ srdi(R5_ARG3, R5_ARG3, 2); 1650 1651 __ cmpwi(CCR0, R5_ARG3, 7); 1652 __ ble(CCR0, l_5); // copy 1 at a time if less than 8 elements remain 1653 1654 __ srdi(tmp1, R5_ARG3, 3); 1655 __ andi(R5_ARG3, R5_ARG3, 7); 1656 __ mtctr(tmp1); 1657 1658 __ bind(l_4); 1659 // Use unrolled version for mass copying (copy 4 elements a time). 1660 // Load feeding store gets zero latency on Power6, however not on Power5. 
1661 // Therefore, the following sequence is made for the good of both. 1662 __ addi(R3_ARG1, R3_ARG1, -32); 1663 __ addi(R4_ARG2, R4_ARG2, -32); 1664 __ ld(tmp4, 24, R3_ARG1); 1665 __ ld(tmp3, 16, R3_ARG1); 1666 __ ld(tmp2, 8, R3_ARG1); 1667 __ ld(tmp1, 0, R3_ARG1); 1668 __ std(tmp4, 24, R4_ARG2); 1669 __ std(tmp3, 16, R4_ARG2); 1670 __ std(tmp2, 8, R4_ARG2); 1671 __ std(tmp1, 0, R4_ARG2); 1672 __ bdnz(l_4); 1673 1674 __ cmpwi(CCR0, R5_ARG3, 0); 1675 __ beq(CCR0, l_6); 1676 1677 __ bind(l_5); 1678 __ mtctr(R5_ARG3); 1679 __ bind(l_3); 1680 __ lwz(R0, -4, R3_ARG1); 1681 __ stw(R0, -4, R4_ARG2); 1682 __ addi(R3_ARG1, R3_ARG1, -4); 1683 __ addi(R4_ARG2, R4_ARG2, -4); 1684 __ bdnz(l_3); 1685 1686 __ bind(l_6); 1687 } 1688 } 1689 1690 // Generate stub for conjoint int copy. If "aligned" is true, the 1691 // "from" and "to" addresses are assumed to be heapword aligned. 1692 // 1693 // Arguments for generated stub: 1694 // from: R3_ARG1 1695 // to: R4_ARG2 1696 // count: R5_ARG3 treated as signed 1697 // 1698 address generate_conjoint_int_copy(bool aligned, const char * name) { 1699 StubCodeMark mark(this, "StubRoutines", name); 1700 address start = __ function_entry(); 1701 assert_positive_int(R5_ARG3); 1702 address nooverlap_target = aligned ? 1703 STUB_ENTRY(arrayof_jint_disjoint_arraycopy) : 1704 STUB_ENTRY(jint_disjoint_arraycopy); 1705 1706 array_overlap_test(nooverlap_target, 2); 1707 1708 generate_conjoint_int_copy_core(aligned); 1709 1710 __ li(R3_RET, 0); // return 0 1711 __ blr(); 1712 1713 return start; 1714 } 1715 1716 // Generate core code for disjoint long copy (and oop copy on 1717 // 64-bit). If "aligned" is true, the "from" and "to" addresses 1718 // are assumed to be heapword aligned. 1719 // 1720 // Arguments: 1721 // from: R3_ARG1 1722 // to: R4_ARG2 1723 // count: R5_ARG3 treated as signed 1724 // 1725 void generate_disjoint_long_copy_core(bool aligned) { 1726 Register tmp1 = R6_ARG4; 1727 Register tmp2 = R7_ARG5; 1728 Register tmp3 = R8_ARG6; 1729 Register tmp4 = R0; 1730 1731 Label l_1, l_2, l_3, l_4; 1732 1733 { // FasterArrayCopy 1734 __ cmpwi(CCR0, R5_ARG3, 3); 1735 __ ble(CCR0, l_3); // copy 1 at a time if less than 4 elements remain 1736 1737 __ srdi(tmp1, R5_ARG3, 2); 1738 __ andi_(R5_ARG3, R5_ARG3, 3); 1739 __ mtctr(tmp1); 1740 1741 __ bind(l_4); 1742 // Use unrolled version for mass copying (copy 4 elements a time). 1743 // Load feeding store gets zero latency on Power6, however not on Power5. 1744 // Therefore, the following sequence is made for the good of both. 1745 __ ld(tmp1, 0, R3_ARG1); 1746 __ ld(tmp2, 8, R3_ARG1); 1747 __ ld(tmp3, 16, R3_ARG1); 1748 __ ld(tmp4, 24, R3_ARG1); 1749 __ std(tmp1, 0, R4_ARG2); 1750 __ std(tmp2, 8, R4_ARG2); 1751 __ std(tmp3, 16, R4_ARG2); 1752 __ std(tmp4, 24, R4_ARG2); 1753 __ addi(R3_ARG1, R3_ARG1, 32); 1754 __ addi(R4_ARG2, R4_ARG2, 32); 1755 __ bdnz(l_4); 1756 } 1757 1758 // copy 1 element at a time 1759 __ bind(l_3); 1760 __ cmpwi(CCR0, R5_ARG3, 0); 1761 __ beq(CCR0, l_1); 1762 1763 { // FasterArrayCopy 1764 __ mtctr(R5_ARG3); 1765 __ addi(R3_ARG1, R3_ARG1, -8); 1766 __ addi(R4_ARG2, R4_ARG2, -8); 1767 1768 __ bind(l_2); 1769 __ ldu(R0, 8, R3_ARG1); 1770 __ stdu(R0, 8, R4_ARG2); 1771 __ bdnz(l_2); 1772 1773 } 1774 __ bind(l_1); 1775 } 1776 1777 // Generate stub for disjoint long copy. If "aligned" is true, the 1778 // "from" and "to" addresses are assumed to be heapword aligned. 
  //
  // Arguments for generated stub:
  //   from:  R3_ARG1
  //   to:    R4_ARG2
  //   count: R5_ARG3 treated as signed
  //
  address generate_disjoint_long_copy(bool aligned, const char * name) {
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ function_entry();
    assert_positive_int(R5_ARG3);
    generate_disjoint_long_copy_core(aligned);
    __ li(R3_RET, 0); // return 0
    __ blr();

    return start;
  }

  // Generate core code for conjoint long copy (and oop copy on
  // 64-bit). If "aligned" is true, the "from" and "to" addresses
  // are assumed to be heapword aligned.
  //
  // Arguments:
  //   from:  R3_ARG1
  //   to:    R4_ARG2
  //   count: R5_ARG3 treated as signed
  //
  void generate_conjoint_long_copy_core(bool aligned) {
    Register tmp1 = R6_ARG4;
    Register tmp2 = R7_ARG5;
    Register tmp3 = R8_ARG6;
    Register tmp4 = R0;

    Label l_1, l_2, l_3, l_4, l_5;

    __ cmpwi(CCR0, R5_ARG3, 0);
    __ beq(CCR0, l_1);

    { // FasterArrayCopy
      __ sldi(R5_ARG3, R5_ARG3, 3);
      __ add(R3_ARG1, R3_ARG1, R5_ARG3);
      __ add(R4_ARG2, R4_ARG2, R5_ARG3);
      __ srdi(R5_ARG3, R5_ARG3, 3);

      __ cmpwi(CCR0, R5_ARG3, 3);
      __ ble(CCR0, l_5); // copy 1 at a time if less than 4 elements remain

      __ srdi(tmp1, R5_ARG3, 2);
      __ andi(R5_ARG3, R5_ARG3, 3);
      __ mtctr(tmp1);

      __ bind(l_4);
      // Use unrolled version for mass copying (copy 4 elements at a time).
      // Load feeding store gets zero latency on Power6, however not on Power5.
      // Therefore, the following sequence is made for the good of both.
      __ addi(R3_ARG1, R3_ARG1, -32);
      __ addi(R4_ARG2, R4_ARG2, -32);
      __ ld(tmp4, 24, R3_ARG1);
      __ ld(tmp3, 16, R3_ARG1);
      __ ld(tmp2, 8, R3_ARG1);
      __ ld(tmp1, 0, R3_ARG1);
      __ std(tmp4, 24, R4_ARG2);
      __ std(tmp3, 16, R4_ARG2);
      __ std(tmp2, 8, R4_ARG2);
      __ std(tmp1, 0, R4_ARG2);
      __ bdnz(l_4);

      __ cmpwi(CCR0, R5_ARG3, 0);
      __ beq(CCR0, l_1);

      __ bind(l_5);
      __ mtctr(R5_ARG3);
      __ bind(l_3);
      __ ld(R0, -8, R3_ARG1);
      __ std(R0, -8, R4_ARG2);
      __ addi(R3_ARG1, R3_ARG1, -8);
      __ addi(R4_ARG2, R4_ARG2, -8);
      __ bdnz(l_3);
    }
    __ bind(l_1);
  }

  // Generate stub for conjoint long copy. If "aligned" is true, the
  // "from" and "to" addresses are assumed to be heapword aligned.
  //
  // Arguments for generated stub:
  //   from:  R3_ARG1
  //   to:    R4_ARG2
  //   count: R5_ARG3 treated as signed
  //
  address generate_conjoint_long_copy(bool aligned, const char * name) {
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ function_entry();
    assert_positive_int(R5_ARG3);
    address nooverlap_target = aligned ?
      STUB_ENTRY(arrayof_jlong_disjoint_arraycopy) :
      STUB_ENTRY(jlong_disjoint_arraycopy);

    array_overlap_test(nooverlap_target, 3);
    generate_conjoint_long_copy_core(aligned);

    __ li(R3_RET, 0); // return 0
    __ blr();

    return start;
  }

  // Generate stub for conjoint oop copy. If "aligned" is true, the
  // "from" and "to" addresses are assumed to be heapword aligned.
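  // Depending on UseCompressedOops the elements are copied with the conjoint
  // int or the conjoint long copy core, and GC write-barrier code brackets
  // the copy.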
  //
  // Arguments for generated stub:
  //   from:  R3_ARG1
  //   to:    R4_ARG2
  //   count: R5_ARG3 treated as signed
  //   dest_uninitialized: G1 support
  //
  address generate_conjoint_oop_copy(bool aligned, const char * name, bool dest_uninitialized) {
    StubCodeMark mark(this, "StubRoutines", name);

    address start = __ function_entry();
    assert_positive_int(R5_ARG3);
    address nooverlap_target = aligned ?
      STUB_ENTRY(arrayof_oop_disjoint_arraycopy) :
      STUB_ENTRY(oop_disjoint_arraycopy);

    gen_write_ref_array_pre_barrier(R3_ARG1, R4_ARG2, R5_ARG3, dest_uninitialized, R9_ARG7);

    // Save arguments.
    __ mr(R9_ARG7, R4_ARG2);
    __ mr(R10_ARG8, R5_ARG3);

    if (UseCompressedOops) {
      array_overlap_test(nooverlap_target, 2);
      generate_conjoint_int_copy_core(aligned);
    } else {
      array_overlap_test(nooverlap_target, 3);
      generate_conjoint_long_copy_core(aligned);
    }

    gen_write_ref_array_post_barrier(R9_ARG7, R10_ARG8, R11_scratch1);
    __ li(R3_RET, 0); // return 0
    __ blr();
    return start;
  }

  // Generate stub for disjoint oop copy. If "aligned" is true, the
  // "from" and "to" addresses are assumed to be heapword aligned.
  //
  // Arguments for generated stub:
  //   from:  R3_ARG1
  //   to:    R4_ARG2
  //   count: R5_ARG3 treated as signed
  //   dest_uninitialized: G1 support
  //
  address generate_disjoint_oop_copy(bool aligned, const char * name, bool dest_uninitialized) {
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ function_entry();
    assert_positive_int(R5_ARG3);
    gen_write_ref_array_pre_barrier(R3_ARG1, R4_ARG2, R5_ARG3, dest_uninitialized, R9_ARG7);

    // Save some arguments; the copy cores destroy them.
    // Needed for the post barrier.
    __ mr(R9_ARG7, R4_ARG2);
    __ mr(R10_ARG8, R5_ARG3);

    if (UseCompressedOops) {
      generate_disjoint_int_copy_core(aligned);
    } else {
      generate_disjoint_long_copy_core(aligned);
    }

    gen_write_ref_array_post_barrier(R9_ARG7, R10_ARG8, R11_scratch1);
    __ li(R3_RET, 0); // return 0
    __ blr();

    return start;
  }


  // Helper for generating a dynamic type check.
  // Smashes only the given temp registers.
  void generate_type_check(Register sub_klass,
                           Register super_check_offset,
                           Register super_klass,
                           Register temp,
                           Label& L_success) {
    assert_different_registers(sub_klass, super_check_offset, super_klass);

    BLOCK_COMMENT("type_check:");

    Label L_miss;

    __ check_klass_subtype_fast_path(sub_klass, super_klass, temp, R0, &L_success, &L_miss, NULL,
                                     super_check_offset);
    __ check_klass_subtype_slow_path(sub_klass, super_klass, temp, R0, &L_success, NULL);

    // Fall through on failure!
    __ bind(L_miss);
  }


  // Generate stub for checked oop copy.
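  // Each element is loaded, type-checked against the destination element
  // klass (ckval) and only then stored; on the first element that fails the
  // check the stub stops and reports how many elements were already copied.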
  //
  // Arguments for generated stub:
  //   from:  R3
  //   to:    R4
  //   count: R5 treated as signed
  //   ckoff: R6 (super_check_offset)
  //   ckval: R7 (super_klass)
  //   ret:   R3 zero for success; (-1^K) where K is partial transfer count
  //
  address generate_checkcast_copy(const char *name, bool dest_uninitialized) {

    const Register R3_from   = R3_ARG1;      // source array address
    const Register R4_to     = R4_ARG2;      // destination array address
    const Register R5_count  = R5_ARG3;      // elements count
    const Register R6_ckoff  = R6_ARG4;      // super_check_offset
    const Register R7_ckval  = R7_ARG5;      // super_klass

    const Register R8_offset = R8_ARG6;      // loop var, with stride wordSize
    const Register R9_remain = R9_ARG7;      // loop var, with stride -1
    const Register R10_oop   = R10_ARG8;     // actual oop copied
    const Register R11_klass = R11_scratch1; // oop._klass
    const Register R12_tmp   = R12_scratch2;

    const Register R2_minus1 = R2;

    //__ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ function_entry();

    // Assert that the count is a sign-extended 64-bit int and that the arrays are not conjoint.
#ifdef ASSERT
    {
    assert_positive_int(R5_ARG3);
    const Register tmp1 = R11_scratch1, tmp2 = R12_scratch2;
    Label no_overlap;
    __ subf(tmp1, R3_ARG1, R4_ARG2); // distance in bytes
    __ sldi(tmp2, R5_ARG3, LogBytesPerHeapOop); // size in bytes
    __ cmpld(CCR0, R3_ARG1, R4_ARG2); // Use unsigned comparison!
    __ cmpld(CCR1, tmp1, tmp2);
    __ crnand(CCR0, Assembler::less, CCR1, Assembler::less);
    // Overlaps if src is before dst and the distance is smaller than the size.
    // Branch to forward copy routine otherwise.
    __ blt(CCR0, no_overlap);
    __ stop("overlap in checkcast_copy", 0x9543);
    __ bind(no_overlap);
    }
#endif

    gen_write_ref_array_pre_barrier(R3_from, R4_to, R5_count, dest_uninitialized, R12_tmp, /* preserve: */ R6_ckoff, R7_ckval);

    //inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, R12_tmp, R3_RET);

    Label load_element, store_element, store_null, success, do_card_marks;
    __ or_(R9_remain, R5_count, R5_count); // Initialize loop index, and test it.
    __ li(R8_offset, 0);                   // Offset from start of arrays.
    __ li(R2_minus1, -1);
    __ bne(CCR0, load_element);

    // Empty array: Nothing to do.
    __ li(R3_RET, 0); // Return 0 on (trivial) success.
    __ blr();

    // ======== begin loop ========
    // (Entry is load_element.)
    __ align(OptoLoopAlignment);
    __ bind(store_element);
    if (UseCompressedOops) {
      __ encode_heap_oop_not_null(R10_oop);
      __ bind(store_null);
      __ stw(R10_oop, R8_offset, R4_to);
    } else {
      __ bind(store_null);
      __ std(R10_oop, R8_offset, R4_to);
    }

    __ addi(R8_offset, R8_offset, heapOopSize); // Step to next offset.
    __ add_(R9_remain, R2_minus1, R9_remain);   // Decrement the count.
    __ beq(CCR0, success);

    // ======== loop entry is here ========
    __ bind(load_element);
    __ load_heap_oop(R10_oop, R8_offset, R3_from, &store_null); // Load the oop.

    __ load_klass(R11_klass, R10_oop); // Query the object klass.

    generate_type_check(R11_klass, R6_ckoff, R7_ckval, R12_tmp,
                        // Branch to this on success:
                        store_element);
    // ======== end loop ========

    // It was a real error; we must depend on the caller to finish the job.
    // Register R9_remain has number of *remaining* oops, R5_count number of *total* oops.
    // Emit GC store barriers for the oops we have copied (R5_count minus R9_remain),
    // and report their number to the caller.
    __ subf_(R5_count, R9_remain, R5_count);
    __ nand(R3_RET, R5_count, R5_count); // report (-1^K) to caller
    __ bne(CCR0, do_card_marks);
    __ blr();

    __ bind(success);
    __ li(R3_RET, 0);

    __ bind(do_card_marks);
    // Store check on R4_to[0..R5_count-1].
    gen_write_ref_array_post_barrier(R4_to, R5_count, R12_tmp, /* preserve: */ R3_RET);
    __ blr();
    return start;
  }


  // Generate 'unsafe' array copy stub.
  // Though just as safe as the other stubs, it takes an unscaled
  // size_t argument instead of an element count.
  //
  // Arguments for generated stub:
  //   from:  R3
  //   to:    R4
  //   count: R5 byte count, treated as ssize_t, can be zero
  //
  // Examines the alignment of the operands and dispatches
  // to a long, int, short, or byte copy loop.
  //
  address generate_unsafe_copy(const char* name,
                               address byte_copy_entry,
                               address short_copy_entry,
                               address int_copy_entry,
                               address long_copy_entry) {

    const Register R3_from  = R3_ARG1; // source array address
    const Register R4_to    = R4_ARG2; // destination array address
    const Register R5_count = R5_ARG3; // byte count (as long on PPC64)

    const Register R6_bits  = R6_ARG4; // test copy of low bits
    const Register R7_tmp   = R7_ARG5;

    //__ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ function_entry();

    // Bump this on entry, not on exit:
    //inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr, R6_bits, R7_tmp);

    Label short_copy, int_copy, long_copy;

    __ orr(R6_bits, R3_from, R4_to);
    __ orr(R6_bits, R6_bits, R5_count);
    __ andi_(R0, R6_bits, (BytesPerLong-1));
    __ beq(CCR0, long_copy);

    __ andi_(R0, R6_bits, (BytesPerInt-1));
    __ beq(CCR0, int_copy);

    __ andi_(R0, R6_bits, (BytesPerShort-1));
    __ beq(CCR0, short_copy);

    // byte_copy:
    __ b(byte_copy_entry);

    __ bind(short_copy);
    __ srwi(R5_count, R5_count, LogBytesPerShort);
    __ b(short_copy_entry);

    __ bind(int_copy);
    __ srwi(R5_count, R5_count, LogBytesPerInt);
    __ b(int_copy_entry);

    __ bind(long_copy);
    __ srwi(R5_count, R5_count, LogBytesPerLong);
    __ b(long_copy_entry);

    return start;
  }


  // Perform range checks on the proposed arraycopy.
  // Kills the two temps, but nothing else.
  // src_pos and dst_pos must already have been sign-extended to 64 bits
  // (and found non-negative) by the caller.
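  // The 32-bit array lengths are loaded sign-extended (lwa), and the positions
  // and length are non-negative 32-bit values, so the 64-bit cmpd comparisons
  // below cannot overflow.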
  void arraycopy_range_checks(Register src,     // source array oop
                              Register src_pos, // source position
                              Register dst,     // destination array oop
                              Register dst_pos, // destination position
                              Register length,  // length of copy
                              Register temp1, Register temp2,
                              Label& L_failed) {
    BLOCK_COMMENT("arraycopy_range_checks:");

    const Register array_length = temp1;  // scratch
    const Register end_pos      = temp2;  // scratch

    //  if (src_pos + length > arrayOop(src)->length() ) FAIL;
    __ lwa(array_length, arrayOopDesc::length_offset_in_bytes(), src);
    __ add(end_pos, src_pos, length);  // src_pos + length
    __ cmpd(CCR0, end_pos, array_length);
    __ bgt(CCR0, L_failed);

    //  if (dst_pos + length > arrayOop(dst)->length() ) FAIL;
    __ lwa(array_length, arrayOopDesc::length_offset_in_bytes(), dst);
    __ add(end_pos, dst_pos, length);  // dst_pos + length
    __ cmpd(CCR0, end_pos, array_length);
    __ bgt(CCR0, L_failed);

    BLOCK_COMMENT("arraycopy_range_checks done");
  }


  //
  //  Generate generic array copy stubs
  //
  //  Input:
  //    R3 - src oop
  //    R4 - src_pos
  //    R5 - dst oop
  //    R6 - dst_pos
  //    R7 - element count
  //
  //  Output:
  //    R3 ==  0 - success
  //    R3 == -1 - need to call System.arraycopy
  //
  address generate_generic_copy(const char *name,
                                address entry_jbyte_arraycopy,
                                address entry_jshort_arraycopy,
                                address entry_jint_arraycopy,
                                address entry_oop_arraycopy,
                                address entry_disjoint_oop_arraycopy,
                                address entry_jlong_arraycopy,
                                address entry_checkcast_arraycopy) {
    Label L_failed, L_objArray;

    // Input registers
    const Register src       = R3_ARG1; // source array oop
    const Register src_pos   = R4_ARG2; // source position
    const Register dst       = R5_ARG3; // destination array oop
    const Register dst_pos   = R6_ARG4; // destination position
    const Register length    = R7_ARG5; // elements count

    // registers used as temp
    const Register src_klass = R8_ARG6; // source array klass
    const Register dst_klass = R9_ARG7; // destination array klass
    const Register lh        = R10_ARG8; // layout helper
    const Register temp      = R2;

    //__ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ function_entry();

    // Bump this on entry, not on exit:
    //inc_counter_np(SharedRuntime::_generic_array_copy_ctr, lh, temp);

    // In principle, the int arguments could be dirty.

    //-----------------------------------------------------------------------
    // Assembler stubs will be used for this call to arraycopy
    // if the following conditions are met:
    //
    // (1) src and dst must not be null.
    // (2) src_pos must not be negative.
    // (3) dst_pos must not be negative.
    // (4) length  must not be negative.
    // (5) src klass and dst klass should be the same and not NULL.
    // (6) src and dst should be arrays.
    // (7) src_pos + length must not exceed length of src.
    // (8) dst_pos + length must not exceed length of dst.
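
    // The checks below fold conditions (1)-(4) into condition-register bits:
    // the record forms of extsw_ set CCR0 according to the sign of the
    // extended value, and cror accumulates all failure cases into CCR1 so a
    // single beq(CCR1, L_failed) covers them.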
    BLOCK_COMMENT("arraycopy initial argument checks");

    __ cmpdi(CCR1, src, 0);      // if (src == NULL) return -1;
    __ extsw_(src_pos, src_pos); // if (src_pos < 0) return -1;
    __ cmpdi(CCR5, dst, 0);      // if (dst == NULL) return -1;
    __ cror(CCR1, Assembler::equal, CCR0, Assembler::less);
    __ extsw_(dst_pos, dst_pos); // if (dst_pos < 0) return -1;
    __ cror(CCR5, Assembler::equal, CCR0, Assembler::less);
    __ extsw_(length, length);   // if (length < 0) return -1;
    __ cror(CCR1, Assembler::equal, CCR5, Assembler::equal);
    __ cror(CCR1, Assembler::equal, CCR0, Assembler::less);
    __ beq(CCR1, L_failed);

    BLOCK_COMMENT("arraycopy argument klass checks");
    __ load_klass(src_klass, src);
    __ load_klass(dst_klass, dst);

    // Load layout helper
    //
    //  |array_tag|     | header_size | element_type |     |log2_element_size|
    // 32        30    24            16              8     2                 0
    //
    //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
    //

    int lh_offset = in_bytes(Klass::layout_helper_offset());

    // Load the 32-bit signed value of the layout helper.
    __ lwz(lh, lh_offset, src_klass);

    // Handle objArrays completely differently...
    jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
    __ load_const_optimized(temp, objArray_lh, R0);
    __ cmpw(CCR0, lh, temp);
    __ beq(CCR0, L_objArray);

    __ cmpd(CCR5, src_klass, dst_klass);          // if (src->klass() != dst->klass()) return -1;
    __ cmpwi(CCR6, lh, Klass::_lh_neutral_value); // if (!src->is_Array()) return -1;

    __ crnand(CCR5, Assembler::equal, CCR6, Assembler::less);
    __ beq(CCR5, L_failed);

    // At this point, it is known to be a typeArray (array_tag 0x3).
#ifdef ASSERT
    { Label L;
      jint lh_prim_tag_in_place = (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
      __ load_const_optimized(temp, lh_prim_tag_in_place, R0);
      __ cmpw(CCR0, lh, temp);
      __ bge(CCR0, L);
      __ stop("must be a primitive array");
      __ bind(L);
    }
#endif

    arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
                           temp, dst_klass, L_failed);

    // TypeArrayKlass
    //
    // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
    // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
    //

    const Register offset = dst_klass; // array offset
    const Register elsize = src_klass; // log2 element size

    __ rldicl(offset, lh, 64 - Klass::_lh_header_size_shift, 64 - exact_log2(Klass::_lh_header_size_mask + 1));
    __ andi(elsize, lh, Klass::_lh_log2_element_size_mask);
    __ add(src, offset, src); // src array offset
    __ add(dst, offset, dst); // dst array offset

    // Next registers should be set before the jump to corresponding stub.
    const Register from  = R3_ARG1; // source array address
    const Register to    = R4_ARG2; // destination array address
    const Register count = R5_ARG3; // elements count

    // 'from', 'to', 'count' registers should be set in this order
    // since they are the same as 'src', 'src_pos', 'dst'.
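    // Example (illustrative): for a jint[] source the log2_element_size field
    // of the layout helper is LogBytesPerInt, so 'elsize' holds 2 and the
    // branch chain below dispatches to entry_jint_arraycopy with 'count'
    // still counted in elements.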

    BLOCK_COMMENT("scale indexes to element size");
    __ sld(src_pos, src_pos, elsize);
    __ sld(dst_pos, dst_pos, elsize);
    __ add(from, src_pos, src); // src_addr
    __ add(to, dst_pos, dst);   // dst_addr
    __ mr(count, length);       // length

    BLOCK_COMMENT("choose copy loop based on element size");
    // Using conditional branches with range 32kB.
    const int bo = Assembler::bcondCRbiIs1, bi = Assembler::bi0(CCR0, Assembler::equal);
    __ cmpwi(CCR0, elsize, 0);
    __ bc(bo, bi, entry_jbyte_arraycopy);
    __ cmpwi(CCR0, elsize, LogBytesPerShort);
    __ bc(bo, bi, entry_jshort_arraycopy);
    __ cmpwi(CCR0, elsize, LogBytesPerInt);
    __ bc(bo, bi, entry_jint_arraycopy);
#ifdef ASSERT
    { Label L;
      __ cmpwi(CCR0, elsize, LogBytesPerLong);
      __ beq(CCR0, L);
      __ stop("must be long copy, but elsize is wrong");
      __ bind(L);
    }
#endif
    __ b(entry_jlong_arraycopy);

    // ObjArrayKlass
    __ bind(L_objArray);
    // live at this point: src_klass, dst_klass, src[_pos], dst[_pos], length

    Label L_disjoint_plain_copy, L_checkcast_copy;
    // test array classes for subtyping
    __ cmpd(CCR0, src_klass, dst_klass); // usual case is exact equality
    __ bne(CCR0, L_checkcast_copy);

    // Identically typed arrays can be copied without element-wise checks.
    arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
                           temp, lh, L_failed);

    __ addi(src, src, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); // src offset
    __ addi(dst, dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); // dst offset
    __ sldi(src_pos, src_pos, LogBytesPerHeapOop);
    __ sldi(dst_pos, dst_pos, LogBytesPerHeapOop);
    __ add(from, src_pos, src); // src_addr
    __ add(to, dst_pos, dst);   // dst_addr
    __ mr(count, length);       // length
    __ b(entry_oop_arraycopy);

    __ bind(L_checkcast_copy);
    // live at this point: src_klass, dst_klass
    {
      // Before looking at dst.length, make sure dst is also an objArray.
      __ lwz(temp, lh_offset, dst_klass);
      __ cmpw(CCR0, lh, temp);
      __ bne(CCR0, L_failed);

      // It is safe to examine both src.length and dst.length.
      arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
                             temp, lh, L_failed);

      // Marshal the base address arguments now, freeing registers.
      __ addi(src, src, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); // src offset
      __ addi(dst, dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); // dst offset
      __ sldi(src_pos, src_pos, LogBytesPerHeapOop);
      __ sldi(dst_pos, dst_pos, LogBytesPerHeapOop);
      __ add(from, src_pos, src); // src_addr
      __ add(to, dst_pos, dst);   // dst_addr
      __ mr(count, length);       // length

      Register sco_temp = R6_ARG4; // This register is free now.
      assert_different_registers(from, to, count, sco_temp,
                                 dst_klass, src_klass);

      // Generate the type check.
      int sco_offset = in_bytes(Klass::super_check_offset_offset());
      __ lwz(sco_temp, sco_offset, dst_klass);
      generate_type_check(src_klass, sco_temp, dst_klass,
                          temp, L_disjoint_plain_copy);

      // Fetch destination element klass from the ObjArrayKlass header.
      int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());

      // The checkcast_copy loop needs two extra arguments:
      __ ld(R7_ARG5, ek_offset, dst_klass);   // dest elem klass
      __ lwz(R6_ARG4, sco_offset, R7_ARG5);   // sco of elem klass
      __ b(entry_checkcast_arraycopy);
    }

    __ bind(L_disjoint_plain_copy);
    __ b(entry_disjoint_oop_arraycopy);

    __ bind(L_failed);
    __ li(R3_RET, -1); // return -1
    __ blr();
    return start;
  }


  void generate_arraycopy_stubs() {
    // Note: the disjoint stubs must be generated first, some of
    // the conjoint stubs use them.

    // non-aligned disjoint versions
    StubRoutines::_jbyte_disjoint_arraycopy       = generate_disjoint_byte_copy(false, "jbyte_disjoint_arraycopy");
    StubRoutines::_jshort_disjoint_arraycopy      = generate_disjoint_short_copy(false, "jshort_disjoint_arraycopy");
    StubRoutines::_jint_disjoint_arraycopy        = generate_disjoint_int_copy(false, "jint_disjoint_arraycopy");
    StubRoutines::_jlong_disjoint_arraycopy       = generate_disjoint_long_copy(false, "jlong_disjoint_arraycopy");
    StubRoutines::_oop_disjoint_arraycopy         = generate_disjoint_oop_copy(false, "oop_disjoint_arraycopy", false);
    StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_oop_copy(false, "oop_disjoint_arraycopy_uninit", true);

    // aligned disjoint versions
    StubRoutines::_arrayof_jbyte_disjoint_arraycopy      = generate_disjoint_byte_copy(true, "arrayof_jbyte_disjoint_arraycopy");
    StubRoutines::_arrayof_jshort_disjoint_arraycopy     = generate_disjoint_short_copy(true, "arrayof_jshort_disjoint_arraycopy");
    StubRoutines::_arrayof_jint_disjoint_arraycopy       = generate_disjoint_int_copy(true, "arrayof_jint_disjoint_arraycopy");
    StubRoutines::_arrayof_jlong_disjoint_arraycopy      = generate_disjoint_long_copy(true, "arrayof_jlong_disjoint_arraycopy");
    StubRoutines::_arrayof_oop_disjoint_arraycopy        = generate_disjoint_oop_copy(true, "arrayof_oop_disjoint_arraycopy", false);
    StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(true, "arrayof_oop_disjoint_arraycopy_uninit", true);

    // non-aligned conjoint versions
    StubRoutines::_jbyte_arraycopy      = generate_conjoint_byte_copy(false, "jbyte_arraycopy");
    StubRoutines::_jshort_arraycopy     = generate_conjoint_short_copy(false, "jshort_arraycopy");
    StubRoutines::_jint_arraycopy       = generate_conjoint_int_copy(false, "jint_arraycopy");
    StubRoutines::_jlong_arraycopy      = generate_conjoint_long_copy(false, "jlong_arraycopy");
    StubRoutines::_oop_arraycopy        = generate_conjoint_oop_copy(false, "oop_arraycopy", false);
    StubRoutines::_oop_arraycopy_uninit = generate_conjoint_oop_copy(false, "oop_arraycopy_uninit", true);

    // aligned conjoint versions
    StubRoutines::_arrayof_jbyte_arraycopy      = generate_conjoint_byte_copy(true, "arrayof_jbyte_arraycopy");
    StubRoutines::_arrayof_jshort_arraycopy     = generate_conjoint_short_copy(true, "arrayof_jshort_arraycopy");
    StubRoutines::_arrayof_jint_arraycopy       = generate_conjoint_int_copy(true, "arrayof_jint_arraycopy");
    StubRoutines::_arrayof_jlong_arraycopy      = generate_conjoint_long_copy(true, "arrayof_jlong_arraycopy");
    StubRoutines::_arrayof_oop_arraycopy        = generate_conjoint_oop_copy(true, "arrayof_oop_arraycopy", false);
    StubRoutines::_arrayof_oop_arraycopy_uninit = generate_conjoint_oop_copy(true, "arrayof_oop_arraycopy_uninit", true);

    // special/generic versions
    StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", false);
    StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", true);

    StubRoutines::_unsafe_arraycopy  = generate_unsafe_copy("unsafe_arraycopy",
                                                            STUB_ENTRY(jbyte_arraycopy),
                                                            STUB_ENTRY(jshort_arraycopy),
                                                            STUB_ENTRY(jint_arraycopy),
                                                            STUB_ENTRY(jlong_arraycopy));
    StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy",
                                                             STUB_ENTRY(jbyte_arraycopy),
                                                             STUB_ENTRY(jshort_arraycopy),
                                                             STUB_ENTRY(jint_arraycopy),
                                                             STUB_ENTRY(oop_arraycopy),
                                                             STUB_ENTRY(oop_disjoint_arraycopy),
                                                             STUB_ENTRY(jlong_arraycopy),
                                                             STUB_ENTRY(checkcast_arraycopy));

    // fill routines
    if (OptimizeFill) {
      StubRoutines::_jbyte_fill          = generate_fill(T_BYTE,  false, "jbyte_fill");
      StubRoutines::_jshort_fill         = generate_fill(T_SHORT, false, "jshort_fill");
      StubRoutines::_jint_fill           = generate_fill(T_INT,   false, "jint_fill");
      StubRoutines::_arrayof_jbyte_fill  = generate_fill(T_BYTE,  true, "arrayof_jbyte_fill");
      StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
      StubRoutines::_arrayof_jint_fill   = generate_fill(T_INT,   true, "arrayof_jint_fill");
    }
  }

  // Safefetch stubs.
  void generate_safefetch(const char* name, int size, address* entry, address* fault_pc, address* continuation_pc) {
    // safefetch signatures:
    //   int      SafeFetch32(int*      adr, int      errValue);
    //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
    //
    // arguments:
    //   R3_ARG1 = adr
    //   R4_ARG2 = errValue
    //
    // result:
    //   R3_RET  = *adr or errValue

    StubCodeMark mark(this, "StubRoutines", name);

    // Entry point, pc or function descriptor.
    *entry = __ function_entry();

    // Load *adr into R4_ARG2, may fault.
    *fault_pc = __ pc();
    switch (size) {
      case 4:
        // int32_t, sign-extended
        __ lwa(R4_ARG2, 0, R3_ARG1);
        break;
      case 8:
        // int64_t
        __ ld(R4_ARG2, 0, R3_ARG1);
        break;
      default:
        ShouldNotReachHere();
    }

    // return errValue or *adr
    *continuation_pc = __ pc();
    __ mr(R3_RET, R4_ARG2);
    __ blr();
  }

  // Stub for BigInteger::multiplyToLen()
  //
  //  Arguments:
  //
  //  Input:
  //    R3 - x address
  //    R4 - x length
  //    R5 - y address
  //    R6 - y length
  //    R7 - z address
  //    R8 - z length
  //
  address generate_multiplyToLen() {

    StubCodeMark mark(this, "StubRoutines", "multiplyToLen");

    address start = __ function_entry();

    const Register x     = R3;
    const Register xlen  = R4;
    const Register y     = R5;
    const Register ylen  = R6;
    const Register z     = R7;
    const Register zlen  = R8;

    const Register tmp1  = R2; // TOC not used.
    const Register tmp2  = R9;
    const Register tmp3  = R10;
    const Register tmp4  = R11;
    const Register tmp5  = R12;

    // non-volatile regs
    const Register tmp6  = R31;
    const Register tmp7  = R30;
    const Register tmp8  = R29;
    const Register tmp9  = R28;
    const Register tmp10 = R27;
    const Register tmp11 = R26;
    const Register tmp12 = R25;
    const Register tmp13 = R24;

    BLOCK_COMMENT("Entry:");

    // C2 does not respect int to long conversion for stub calls.
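    // The upper 32 bits of the incoming length registers may therefore hold
    // stale data; clear them so xlen, ylen and zlen are proper zero-extended
    // 64-bit values.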
    __ clrldi(xlen, xlen, 32);
    __ clrldi(ylen, ylen, 32);
    __ clrldi(zlen, zlen, 32);

    // Save non-volatile regs (frameless).
    int current_offs = 8;
    __ std(R24, -current_offs, R1_SP); current_offs += 8;
    __ std(R25, -current_offs, R1_SP); current_offs += 8;
    __ std(R26, -current_offs, R1_SP); current_offs += 8;
    __ std(R27, -current_offs, R1_SP); current_offs += 8;
    __ std(R28, -current_offs, R1_SP); current_offs += 8;
    __ std(R29, -current_offs, R1_SP); current_offs += 8;
    __ std(R30, -current_offs, R1_SP); current_offs += 8;
    __ std(R31, -current_offs, R1_SP);

    __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5,
                       tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13);

    // Restore non-volatile regs.
    current_offs = 8;
    __ ld(R24, -current_offs, R1_SP); current_offs += 8;
    __ ld(R25, -current_offs, R1_SP); current_offs += 8;
    __ ld(R26, -current_offs, R1_SP); current_offs += 8;
    __ ld(R27, -current_offs, R1_SP); current_offs += 8;
    __ ld(R28, -current_offs, R1_SP); current_offs += 8;
    __ ld(R29, -current_offs, R1_SP); current_offs += 8;
    __ ld(R30, -current_offs, R1_SP); current_offs += 8;
    __ ld(R31, -current_offs, R1_SP);

    __ blr();  // Return to caller.

    return start;
  }

  /**
   * Arguments:
   *
   * Inputs:
   *   R3_ARG1    - int   crc
   *   R4_ARG2    - byte* buf
   *   R5_ARG3    - int   length (of buffer)
   *
   * scratch:
   *   R2, R6-R12
   *
   * Output:
   *   R3_RET     - int   crc result
   */
  // Compute CRC32 function.
  address generate_CRC32_updateBytes(const char* name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ function_entry();  // Remember stub start address (is rtn value).

    // arguments to kernel_crc32:
    const Register crc     = R3_ARG1;  // Current checksum, preset by caller or result from previous call.
    const Register data    = R4_ARG2;  // source byte array
    const Register dataLen = R5_ARG3;  // #bytes to process
    const Register table   = R6_ARG4;  // crc table address

    const Register t0      = R2;
    const Register t1      = R7;
    const Register t2      = R8;
    const Register t3      = R9;
    const Register tc0     = R10;
    const Register tc1     = R11;
    const Register tc2     = R12;

    BLOCK_COMMENT("Stub body {");
    assert_different_registers(crc, data, dataLen, table);

    StubRoutines::ppc64::generate_load_crc_table_addr(_masm, table);

    __ kernel_crc32_1word(crc, data, dataLen, table, t0, t1, t2, t3, tc0, tc1, tc2, table);

    BLOCK_COMMENT("return");
    __ mr_if_needed(R3_RET, crc);  // Updated crc is function result. No copying required (R3_ARG1 == R3_RET).
    __ blr();

    BLOCK_COMMENT("} Stub body");
    return start;
  }

  // Initialization
  void generate_initial() {
    // Generates all stubs and initializes the entry points

    // Entry points that exist in all platforms.
    // Note: This is code that could be shared among different platforms - however the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also comment in
    // stubRoutines.hpp.

    StubRoutines::_forward_exception_entry          = generate_forward_exception();
    StubRoutines::_call_stub_entry                  = generate_call_stub(StubRoutines::_call_stub_return_address);
    StubRoutines::_catch_exception_entry            = generate_catch_exception();

    // Build this early so it's available for the interpreter.
    StubRoutines::_throw_StackOverflowError_entry   =
      generate_throw_exception("StackOverflowError throw_exception",
                               CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError), false);

    // CRC32 Intrinsics.
    if (UseCRC32Intrinsics) {
      StubRoutines::_crc_table_adr    = (address)StubRoutines::ppc64::_crc_table;
      StubRoutines::_updateBytesCRC32 = generate_CRC32_updateBytes("CRC32_updateBytes");
    }
  }

  void generate_all() {
    // Generates all stubs and initializes the entry points

    // These entry points require SharedInfo::stack0 to be set up in
    // non-core builds
    StubRoutines::_throw_AbstractMethodError_entry         = generate_throw_exception("AbstractMethodError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError), false);
    // Handle IncompatibleClassChangeError in itable stubs.
    StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError), false);
    StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call), false);

    StubRoutines::_handler_for_unsafe_access_entry         = generate_handler_for_unsafe_access();

    // support for verify_oop (must happen after universe_init)
    StubRoutines::_verify_oop_subroutine_entry             = generate_verify_oop();

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    if (UseAESIntrinsics) {
      guarantee(!UseAESIntrinsics, "not yet implemented.");
    }

    // Safefetch stubs.
    generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
                                                   &StubRoutines::_safefetch32_fault_pc,
                                                   &StubRoutines::_safefetch32_continuation_pc);
    generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
                                                       &StubRoutines::_safefetchN_fault_pc,
                                                       &StubRoutines::_safefetchN_continuation_pc);

#ifdef COMPILER2
    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }
#endif

    if (UseMontgomeryMultiplyIntrinsic) {
      StubRoutines::_montgomeryMultiply
        = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
    }
    if (UseMontgomerySquareIntrinsic) {
      StubRoutines::_montgomerySquare
        = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
    }
  }

 public:
  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
    // replace the standard masm with a special one:
    _masm = new MacroAssembler(code);
    if (all) {
      generate_all();
    } else {
      generate_initial();
    }
  }
};

void StubGenerator_generate(CodeBuffer* code, bool all) {
  StubGenerator g(code, all);
}