1 /*
   2  * Copyright (c) 1997, 2015, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright 2012, 2015 SAP AG. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "interpreter/interpreter.hpp"
  29 #include "nativeInst_ppc.hpp"
  30 #include "oops/instanceOop.hpp"
  31 #include "oops/method.hpp"
  32 #include "oops/objArrayKlass.hpp"
  33 #include "oops/oop.inline.hpp"
  34 #include "prims/methodHandles.hpp"
  35 #include "runtime/frame.inline.hpp"
  36 #include "runtime/handles.inline.hpp"
  37 #include "runtime/sharedRuntime.hpp"
  38 #include "runtime/stubCodeGenerator.hpp"
  39 #include "runtime/stubRoutines.hpp"
  40 #include "utilities/top.hpp"
  41 #include "runtime/thread.inline.hpp"
  42 
  43 #define __ _masm->
  44 
  45 #ifdef PRODUCT
  46 #define BLOCK_COMMENT(str) // nothing
  47 #else
  48 #define BLOCK_COMMENT(str) __ block_comment(str)
  49 #endif
  50 
  51 class StubGenerator: public StubCodeGenerator {
  52  private:
  53 
  54   // Call stubs are used to call Java from C
  55   //
  56   // Arguments:
  57   //
  58   //   R3  - call wrapper address     : address
  59   //   R4  - result                   : intptr_t*
  60   //   R5  - result type              : BasicType
  61   //   R6  - method                   : Method
  62   //   R7  - frame mgr entry point    : address
  63   //   R8  - parameter block          : intptr_t*
  64   //   R9  - parameter count in words : int
  65   //   R10 - thread                   : Thread*
  66   //
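       //  Note: this register assignment mirrors the shared CallStub entry point
       //  (cf. the CallStub typedef in stubRoutines.hpp); the eight C arguments are
       //  passed in R3..R10 according to the PPC64 ABI.
       //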
  67   address generate_call_stub(address& return_address) {
  68     // Set up a new C frame, copy the Java arguments, call the frame manager or
  69     // native_entry, and process the result.
  70 
  71     StubCodeMark mark(this, "StubRoutines", "call_stub");
  72 
  73     address start = __ function_entry();
  74 
  75     // some sanity checks
  76     assert((sizeof(frame::abi_minframe) % 16) == 0,           "unaligned");
  77     assert((sizeof(frame::abi_reg_args) % 16) == 0,           "unaligned");
  78     assert((sizeof(frame::spill_nonvolatiles) % 16) == 0,     "unaligned");
  79     assert((sizeof(frame::parent_ijava_frame_abi) % 16) == 0, "unaligned");
  80     assert((sizeof(frame::entry_frame_locals) % 16) == 0,     "unaligned");
  81 
  82     Register r_arg_call_wrapper_addr        = R3;
  83     Register r_arg_result_addr              = R4;
  84     Register r_arg_result_type              = R5;
  85     Register r_arg_method                   = R6;
  86     Register r_arg_entry                    = R7;
  87     Register r_arg_thread                   = R10;
  88 
  89     Register r_temp                         = R24;
  90     Register r_top_of_arguments_addr        = R25;
  91     Register r_entryframe_fp                = R26;
  92 
  93     {
  94       // Stack on entry to call_stub:
  95       //
  96       //      F1      [C_FRAME]
  97       //              ...
  98 
  99       Register r_arg_argument_addr          = R8;
 100       Register r_arg_argument_count         = R9;
 101       Register r_frame_alignment_in_bytes   = R27;
 102       Register r_argument_addr              = R28;
 103       Register r_argumentcopy_addr          = R29;
 104       Register r_argument_size_in_bytes     = R30;
 105       Register r_frame_size                 = R23;
 106 
 107       Label arguments_copied;
 108 
 109       // Save LR/CR to caller's C_FRAME.
 110       __ save_LR_CR(R0);
 111 
 112       // Zero extend arg_argument_count.
 113       __ clrldi(r_arg_argument_count, r_arg_argument_count, 32);
 114 
 115       // Save non-volatile GPRs to ENTRY_FRAME (not yet pushed, but it's safe).
 116       __ save_nonvolatile_gprs(R1_SP, _spill_nonvolatiles_neg(r14));
 117 
 118       // Keep copy of our frame pointer (caller's SP).
 119       __ mr(r_entryframe_fp, R1_SP);
 120 
 121       BLOCK_COMMENT("Push ENTRY_FRAME including arguments");
 122       // Push ENTRY_FRAME including arguments:
 123       //
 124       //      F0      [TOP_IJAVA_FRAME_ABI]
 125       //              alignment (optional)
 126       //              [outgoing Java arguments]
 127       //              [ENTRY_FRAME_LOCALS]
 128       //      F1      [C_FRAME]
 129       //              ...
 130 
 131       // calculate frame size
 132 
 133       // unaligned size of arguments
 134       __ sldi(r_argument_size_in_bytes,
 135                   r_arg_argument_count, Interpreter::logStackElementSize);
 136       // arguments alignment (max 1 slot)
 137       // FIXME: use round_to() here
 138       __ andi_(r_frame_alignment_in_bytes, r_arg_argument_count, 1);
 139       __ sldi(r_frame_alignment_in_bytes,
 140               r_frame_alignment_in_bytes, Interpreter::logStackElementSize);
 141 
 142       // size = unaligned size of arguments + top abi's size
 143       __ addi(r_frame_size, r_argument_size_in_bytes,
 144               frame::top_ijava_frame_abi_size);
 145       // size += arguments alignment
 146       __ add(r_frame_size,
 147              r_frame_size, r_frame_alignment_in_bytes);
 148       // size += size of call_stub locals
 149       __ addi(r_frame_size,
 150               r_frame_size, frame::entry_frame_locals_size);
 151 
 152       // push ENTRY_FRAME
 153       __ push_frame(r_frame_size, r_temp);
 154 
 155       // initialize call_stub locals (step 1)
 156       __ std(r_arg_call_wrapper_addr,
 157              _entry_frame_locals_neg(call_wrapper_address), r_entryframe_fp);
 158       __ std(r_arg_result_addr,
 159              _entry_frame_locals_neg(result_address), r_entryframe_fp);
 160       __ std(r_arg_result_type,
 161              _entry_frame_locals_neg(result_type), r_entryframe_fp);
 162       // we will save arguments_tos_address later
 163 
 164 
 165       BLOCK_COMMENT("Copy Java arguments");
 166       // copy Java arguments
 167 
 168       // Calculate top_of_arguments_addr which will be R17_tos (not prepushed) later.
 169       // FIXME: why not simply use SP+frame::top_ijava_frame_size?
 170       __ addi(r_top_of_arguments_addr,
 171               R1_SP, frame::top_ijava_frame_abi_size);
 172       __ add(r_top_of_arguments_addr,
 173              r_top_of_arguments_addr, r_frame_alignment_in_bytes);
 174 
 175       // any arguments to copy?
 176       __ cmpdi(CCR0, r_arg_argument_count, 0);
 177       __ beq(CCR0, arguments_copied);
 178 
 179       // prepare loop and copy arguments in reverse order
 180       {
 181         // init CTR with arg_argument_count
 182         __ mtctr(r_arg_argument_count);
 183 
 184         // let r_argumentcopy_addr point to the last outgoing Java argument
 185         __ mr(r_argumentcopy_addr, r_top_of_arguments_addr);
 186 
 187         // let r_argument_addr point to the last incoming Java argument
 188         __ add(r_argument_addr,
 189                    r_arg_argument_addr, r_argument_size_in_bytes);
 190         __ addi(r_argument_addr, r_argument_addr, -BytesPerWord);
 191 
 192         // now loop while CTR > 0 and copy arguments
 193         {
 194           Label next_argument;
 195           __ bind(next_argument);
 196 
 197           __ ld(r_temp, 0, r_argument_addr);
 198           // argument_addr--;
 199           __ addi(r_argument_addr, r_argument_addr, -BytesPerWord);
 200           __ std(r_temp, 0, r_argumentcopy_addr);
 201           // argumentcopy_addr++;
 202           __ addi(r_argumentcopy_addr, r_argumentcopy_addr, BytesPerWord);
 203 
 204           __ bdnz(next_argument);
 205         }
 206       }
 207 
 208       // Arguments copied, continue.
 209       __ bind(arguments_copied);
 210     }
 211 
 212     {
 213       BLOCK_COMMENT("Call frame manager or native entry.");
 214       // Call frame manager or native entry.
 215       Register r_new_arg_entry = R14;
 216       assert_different_registers(r_new_arg_entry, r_top_of_arguments_addr,
 217                                  r_arg_method, r_arg_thread);
 218 
 219       __ mr(r_new_arg_entry, r_arg_entry);
 220 
 221       // Register state on entry to frame manager / native entry:
 222       //
 223       //   tos         -  intptr_t*    sender tos (prepushed) Lesp = (SP) + copied_arguments_offset - 8
 224       //   R19_method  -  Method
 225       //   R16_thread  -  JavaThread*
 226 
 227       // Tos must point to last argument - element_size.
 228 #ifdef CC_INTERP
 229       const Register tos = R17_tos;
 230 #else
 231       const Register tos = R15_esp;
 232 #endif
 233       __ addi(tos, r_top_of_arguments_addr, -Interpreter::stackElementSize);
 234 
 235       // initialize call_stub locals (step 2)
 236       // now save tos as arguments_tos_address
 237       __ std(tos, _entry_frame_locals_neg(arguments_tos_address), r_entryframe_fp);
 238 
 239       // load argument registers for call
 240       __ mr(R19_method, r_arg_method);
 241       __ mr(R16_thread, r_arg_thread);
 242       assert(tos != r_arg_method, "trashed r_arg_method");
 243       assert(tos != r_arg_thread && R19_method != r_arg_thread, "trashed r_arg_thread");
 244 
 245       // Set R15_prev_state to 0 for simplifying checks in callee.
 246 #ifdef CC_INTERP
 247       __ li(R15_prev_state, 0);
 248 #else
 249       __ load_const_optimized(R25_templateTableBase, (address)Interpreter::dispatch_table((TosState)0), R11_scratch1);
 250 #endif
 251       // Stack on entry to frame manager / native entry:
 252       //
 253       //      F0      [TOP_IJAVA_FRAME_ABI]
 254       //              alignment (optional)
 255       //              [outgoing Java arguments]
 256       //              [ENTRY_FRAME_LOCALS]
 257       //      F1      [C_FRAME]
 258       //              ...
 259       //
 260 
 261       // global toc register
 262       __ load_const(R29, MacroAssembler::global_toc(), R11_scratch1);
 263 
 264       // Remember the senderSP so the interpreter can pop c2i arguments off of the stack
 265       // when called via a c2i.
 266 
 267       // Pass initial_caller_sp to framemanager.
 268       __ mr(R21_tmp1, R1_SP);
 269 
 270       // Do a light-weight C-call here; r_new_arg_entry holds the address
 271       // of the interpreter entry point (frame manager or native entry),
 272       // and the runtime value of LR is saved in return_address.
 273       assert(r_new_arg_entry != tos && r_new_arg_entry != R19_method && r_new_arg_entry != R16_thread,
 274              "trashed r_new_arg_entry");
 275       return_address = __ call_stub(r_new_arg_entry);
 276     }
 277 
 278     {
 279       BLOCK_COMMENT("Returned from frame manager or native entry.");
 280       // Returned from frame manager or native entry.
 281       // Now pop frame, process result, and return to caller.
 282 
 283       // Stack on exit from frame manager / native entry:
 284       //
 285       //      F0      [ABI]
 286       //              ...
 287       //              [ENTRY_FRAME_LOCALS]
 288       //      F1      [C_FRAME]
 289       //              ...
 290       //
 291       // Just pop the topmost frame ...
 292       //
 293 
 294       Label ret_is_object;
 295       Label ret_is_long;
 296       Label ret_is_float;
 297       Label ret_is_double;
 298 
 299       Register r_entryframe_fp = R30;
 300       Register r_lr            = R7_ARG5;
 301       Register r_cr            = R8_ARG6;
 302 
 303       // Reload some volatile registers which we've spilled before the call
 304       // to frame manager / native entry.
 305       // Access all locals via frame pointer, because we know nothing about
 306       // the topmost frame's size.
 307       __ ld(r_entryframe_fp, _abi(callers_sp), R1_SP);
 308       assert_different_registers(r_entryframe_fp, R3_RET, r_arg_result_addr, r_arg_result_type, r_cr, r_lr);
 309       __ ld(r_arg_result_addr,
 310             _entry_frame_locals_neg(result_address), r_entryframe_fp);
 311       __ ld(r_arg_result_type,
 312             _entry_frame_locals_neg(result_type), r_entryframe_fp);
 313       __ ld(r_cr, _abi(cr), r_entryframe_fp);
 314       __ ld(r_lr, _abi(lr), r_entryframe_fp);
 315 
 316       // pop frame and restore non-volatiles, LR and CR
 317       __ mr(R1_SP, r_entryframe_fp);
 318       __ mtcr(r_cr);
 319       __ mtlr(r_lr);
 320 
 321       // Store result depending on type. Everything that is not
 322       // T_OBJECT, T_LONG, T_FLOAT, or T_DOUBLE is treated as T_INT.
 323       __ cmpwi(CCR0, r_arg_result_type, T_OBJECT);
 324       __ cmpwi(CCR1, r_arg_result_type, T_LONG);
 325       __ cmpwi(CCR5, r_arg_result_type, T_FLOAT);
 326       __ cmpwi(CCR6, r_arg_result_type, T_DOUBLE);
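           // Note: the CR fields set above (CCR0/1/5/6, volatile in the ABI) survive the
           // GPR restore below and are consumed by the branches that follow it.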
 327 
 328       // restore non-volatile registers
 329       __ restore_nonvolatile_gprs(R1_SP, _spill_nonvolatiles_neg(r14));
 330 
 331 
 332       // Stack on exit from call_stub:
 333       //
 334       //      0       [C_FRAME]
 335       //              ...
 336       //
 337       //  no call_stub frames left.
 338 
 339       // All non-volatiles have been restored at this point!!
 340       assert(R3_RET == R3, "R3_RET should be R3");
 341 
 342       __ beq(CCR0, ret_is_object);
 343       __ beq(CCR1, ret_is_long);
 344       __ beq(CCR5, ret_is_float);
 345       __ beq(CCR6, ret_is_double);
 346 
 347       // default:
 348       __ stw(R3_RET, 0, r_arg_result_addr);
 349       __ blr(); // return to caller
 350 
 351       // case T_OBJECT:
 352       __ bind(ret_is_object);
 353       __ std(R3_RET, 0, r_arg_result_addr);
 354       __ blr(); // return to caller
 355 
 356       // case T_LONG:
 357       __ bind(ret_is_long);
 358       __ std(R3_RET, 0, r_arg_result_addr);
 359       __ blr(); // return to caller
 360 
 361       // case T_FLOAT:
 362       __ bind(ret_is_float);
 363       __ stfs(F1_RET, 0, r_arg_result_addr);
 364       __ blr(); // return to caller
 365 
 366       // case T_DOUBLE:
 367       __ bind(ret_is_double);
 368       __ stfd(F1_RET, 0, r_arg_result_addr);
 369       __ blr(); // return to caller
 370     }
 371 
 372     return start;
 373   }
 374 
 375   // Return point for a Java call if there's an exception thrown in
 376   // Java code.  The exception is caught and transformed into a
 377   // pending exception stored in JavaThread that can be tested from
 378   // within the VM.
 379   //
 380   address generate_catch_exception() {
 381     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 382 
 383     address start = __ pc();
 384 
 385     // Registers alive
 386     //
 387     //  R16_thread
 388     //  R3_ARG1 - address of pending exception
 389     //  R4_ARG2 - return address in call stub
 390 
 391     const Register exception_file = R21_tmp1;
 392     const Register exception_line = R22_tmp2;
 393 
 394     __ load_const(exception_file, (void*)__FILE__);
 395     __ load_const(exception_line, (void*)__LINE__);
 396 
 397     __ std(R3_ARG1, in_bytes(JavaThread::pending_exception_offset()), R16_thread);
 398     // store into `char *'
 399     __ std(exception_file, in_bytes(JavaThread::exception_file_offset()), R16_thread);
 400     // store into `int'
 401     __ stw(exception_line, in_bytes(JavaThread::exception_line_offset()), R16_thread);
 402 
 403     // complete return to VM
 404     assert(StubRoutines::_call_stub_return_address != NULL, "must have been generated before");
 405 
 406     __ mtlr(R4_ARG2);
 407     // continue in call stub
 408     __ blr();
 409 
 410     return start;
 411   }
 412 
 413   // Continuation point for runtime calls returning with a pending
 414   // exception.  The pending exception check happened in the runtime
 415   // or native call stub.  The pending exception in Thread is
 416   // converted into a Java-level exception.
 417   //
 418   // Read:
 419   //
 420   //   LR:     The pc the runtime library callee wants to return to.
 421   //           Since the exception occurred in the callee, the return pc
 422   //           from the point of view of Java is the exception pc.
 423   //   thread: Needed for method handles.
 424   //
 425   // Invalidate:
 426   //
 427   //   volatile registers (except below).
 428   //
 429   // Update:
 430   //
 431   //   R4_ARG2: exception
 432   //
 433   // (LR is unchanged and is live out).
 434   //
 435   address generate_forward_exception() {
 436     StubCodeMark mark(this, "StubRoutines", "forward_exception");
 437     address start = __ pc();
 438 
 439 #if !defined(PRODUCT)
 440     if (VerifyOops) {
 441       // Get pending exception oop.
 442       __ ld(R3_ARG1,
 443                 in_bytes(Thread::pending_exception_offset()),
 444                 R16_thread);
 445       // Make sure that this code is only executed if there is a pending exception.
 446       {
 447         Label L;
 448         __ cmpdi(CCR0, R3_ARG1, 0);
 449         __ bne(CCR0, L);
 450         __ stop("StubRoutines::forward exception: no pending exception (1)");
 451         __ bind(L);
 452       }
 453       __ verify_oop(R3_ARG1, "StubRoutines::forward exception: not an oop");
 454     }
 455 #endif
 456 
 457     // Save LR/CR and copy exception pc (LR) into R4_ARG2.
 458     __ save_LR_CR(R4_ARG2);
 459     __ push_frame_reg_args(0, R0);
 460     // Find exception handler.
 461     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 462                      SharedRuntime::exception_handler_for_return_address),
 463                     R16_thread,
 464                     R4_ARG2);
 465     // Copy handler's address.
 466     __ mtctr(R3_RET);
 467     __ pop_frame();
 468     __ restore_LR_CR(R0);
 469 
 470     // Set up the arguments for the exception handler:
 471     //  - R3_ARG1: exception oop
 472     //  - R4_ARG2: exception pc.
 473 
 474     // Load pending exception oop.
 475     __ ld(R3_ARG1,
 476               in_bytes(Thread::pending_exception_offset()),
 477               R16_thread);
 478 
 479     // The exception pc is the return address in the caller.
 480     // Must load it into R4_ARG2.
 481     __ mflr(R4_ARG2);
 482 
 483 #ifdef ASSERT
 484     // Make sure exception is set.
 485     {
 486       Label L;
 487       __ cmpdi(CCR0, R3_ARG1, 0);
 488       __ bne(CCR0, L);
 489       __ stop("StubRoutines::forward exception: no pending exception (2)");
 490       __ bind(L);
 491     }
 492 #endif
 493 
 494     // Clear the pending exception.
 495     __ li(R0, 0);
 496     __ std(R0,
 497                in_bytes(Thread::pending_exception_offset()),
 498                R16_thread);
 499     // Jump to exception handler.
 500     __ bctr();
 501 
 502     return start;
 503   }
 504 
 505 #undef __
 506 #define __ masm->
 507   // Continuation point for throwing of implicit exceptions that are
 508   // not handled in the current activation. Fabricates an exception
 509   // oop and initiates normal exception dispatching in this
 510   // frame. Only callee-saved registers are preserved (through the
 511   // normal register window / RegisterMap handling).  If the compiler
 512   // needs all registers to be preserved between the fault point and
 513   // the exception handler then it must assume responsibility for that
 514   // in AbstractCompiler::continuation_for_implicit_null_exception or
 515   // continuation_for_implicit_division_by_zero_exception. All other
 516   // implicit exceptions (e.g., NullPointerException or
 517   // AbstractMethodError on entry) are either at call sites or
 518   // otherwise assume that stack unwinding will be initiated, so
 519   // caller saved registers were assumed volatile in the compiler.
 520   //
 521   // Note that we generate only this stub into a RuntimeStub, because
 522   // it needs to be properly traversed and ignored during GC, so we
 523   // change the meaning of the "__" macro within this method.
 524   //
 525   // Note: the routine set_pc_not_at_call_for_caller in
 526   // SharedRuntime.cpp requires that this code be generated into a
 527   // RuntimeStub.
 528   address generate_throw_exception(const char* name, address runtime_entry, bool restore_saved_exception_pc,
 529                                    Register arg1 = noreg, Register arg2 = noreg) {
 530     CodeBuffer code(name, 1024 DEBUG_ONLY(+ 512), 0);
 531     MacroAssembler* masm = new MacroAssembler(&code);
 532 
 533     OopMapSet* oop_maps  = new OopMapSet();
 534     int frame_size_in_bytes = frame::abi_reg_args_size;
 535     OopMap* map = new OopMap(frame_size_in_bytes / sizeof(jint), 0);
 536 
 537     address start = __ pc();
 538 
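         // Save LR and CR into the caller's frame before pushing a new one.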
 539     __ save_LR_CR(R11_scratch1);
 540 
 541     // Push a frame.
 542     __ push_frame_reg_args(0, R11_scratch1);
 543 
 544     address frame_complete_pc = __ pc();
 545 
 546     if (restore_saved_exception_pc) {
 547       __ unimplemented("StubGenerator::throw_exception with restore_saved_exception_pc", 74);
 548     }
 549 
 550     // Note that we always have a runtime stub frame on top of the
 551     // stack by this point. Remember the offset of the instruction
 552     // whose address will be moved to R11_scratch1.
 553     address gc_map_pc = __ get_PC_trash_LR(R11_scratch1);
 554 
 555     __ set_last_Java_frame(/*sp*/R1_SP, /*pc*/R11_scratch1);
 556 
 557     __ mr(R3_ARG1, R16_thread);
 558     if (arg1 != noreg) {
 559       __ mr(R4_ARG2, arg1);
 560     }
 561     if (arg2 != noreg) {
 562       __ mr(R5_ARG3, arg2);
 563     }
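         // Call the runtime entry. Under the ELFv1 ABI the entry address is a function
         // descriptor; under ELFv2 it is the code address itself.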
 564 #if defined(ABI_ELFv2)
 565     __ call_c(runtime_entry, relocInfo::none);
 566 #else
 567     __ call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, runtime_entry), relocInfo::none);
 568 #endif
 569 
 570     // Set an oopmap for the call site.
 571     oop_maps->add_gc_map((int)(gc_map_pc - start), map);
 572 
 573     __ reset_last_Java_frame();
 574 
 575 #ifdef ASSERT
 576     // Make sure that this code is only executed if there is a pending
 577     // exception.
 578     {
 579       Label L;
 580       __ ld(R0,
 581                 in_bytes(Thread::pending_exception_offset()),
 582                 R16_thread);
 583       __ cmpdi(CCR0, R0, 0);
 584       __ bne(CCR0, L);
 585       __ stop("StubRoutines::throw_exception: no pending exception");
 586       __ bind(L);
 587     }
 588 #endif
 589 
 590     // Pop frame.
 591     __ pop_frame();
 592 
 593     __ restore_LR_CR(R11_scratch1);
 594 
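         // Tail-jump to the forward_exception stub, which loads the pending exception
         // from the thread and dispatches it to the exception handler.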
 595     __ load_const(R11_scratch1, StubRoutines::forward_exception_entry());
 596     __ mtctr(R11_scratch1);
 597     __ bctr();
 598 
 599     // Create runtime stub with OopMap.
 600     RuntimeStub* stub =
 601       RuntimeStub::new_runtime_stub(name, &code,
 602                                     /*frame_complete=*/ (int)(frame_complete_pc - start),
 603                                     frame_size_in_bytes/wordSize,
 604                                     oop_maps,
 605                                     false);
 606     return stub->entry_point();
 607   }
 608 #undef __
 609 #define __ _masm->
 610 
 611   //  Generate G1 pre-write barrier for array.
 612   //
 613   //  Input:
 614   //     from     - register containing src address (only needed for spilling)
 615   //     to       - register containing starting address
 616   //     count    - register containing element count
 617   //     tmp      - scratch register
 618   //
 619   //  Kills:
 620   //     nothing
 621   //
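       //  Note: for G1 the runtime call below is guarded by a check of the thread's
       //  SATB marking-active flag, so the barrier is skipped while marking is inactive.
       //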
 622   void gen_write_ref_array_pre_barrier(Register from, Register to, Register count, bool dest_uninitialized, Register Rtmp1) {
 623     BarrierSet* const bs = Universe::heap()->barrier_set();
 624     switch (bs->kind()) {
 625       case BarrierSet::G1SATBCTLogging:
 626         // With G1, don't generate the call if we statically know that the target is uninitialized.
 627         if (!dest_uninitialized) {
 628           const int spill_slots = 4 * wordSize;
 629           const int frame_size  = frame::abi_reg_args_size + spill_slots;
 630           Label filtered;
 631 
 632           // Is marking active?
 633           if (in_bytes(PtrQueue::byte_width_of_active()) == 4) {
 634             __ lwz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_active()), R16_thread);
 635           } else {
 636             guarantee(in_bytes(PtrQueue::byte_width_of_active()) == 1, "Assumption");
 637             __ lbz(Rtmp1, in_bytes(JavaThread::satb_mark_queue_offset() + PtrQueue::byte_offset_of_active()), R16_thread);
 638           }
 639           __ cmpdi(CCR0, Rtmp1, 0);
 640           __ beq(CCR0, filtered);
 641 
 642           __ save_LR_CR(R0);
 643           __ push_frame_reg_args(spill_slots, R0);
 644           __ std(from,  frame_size - 1 * wordSize, R1_SP);
 645           __ std(to,    frame_size - 2 * wordSize, R1_SP);
 646           __ std(count, frame_size - 3 * wordSize, R1_SP);
 647 
 648           __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), to, count);
 649 
 650           __ ld(from,  frame_size - 1 * wordSize, R1_SP);
 651           __ ld(to,    frame_size - 2 * wordSize, R1_SP);
 652           __ ld(count, frame_size - 3 * wordSize, R1_SP);
 653           __ pop_frame();
 654           __ restore_LR_CR(R0);
 655 
 656           __ bind(filtered);
 657         }
 658         break;
 659       case BarrierSet::CardTableModRef:
 660       case BarrierSet::CardTableExtension:
 661       case BarrierSet::ModRef:
 662         break;
 663       default:
 664         ShouldNotReachHere();
 665     }
 666   }
 667 
 668   //  Generate CMS/G1 post-write barrier for array.
 669   //
 670   //  Input:
 671   //     addr     - register containing starting address
 672   //     count    - register containing element count
 673   //     tmp      - scratch register
 674   //
 675   //  The input registers and R0 are overwritten.
 676   //
 677   void gen_write_ref_array_post_barrier(Register addr, Register count, Register tmp, bool branchToEnd) {
 678     BarrierSet* const bs = Universe::heap()->barrier_set();
 679 
 680     switch (bs->kind()) {
 681       case BarrierSet::G1SATBCTLogging:
 682         {
 683           if (branchToEnd) {
 684             __ save_LR_CR(R0);
 685             // We need this frame only to spill LR.
 686             __ push_frame_reg_args(0, R0);
 687             __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), addr, count);
 688             __ pop_frame();
 689             __ restore_LR_CR(R0);
 690           } else {
 691             // Tail call: fake call from stub caller by branching without linking.
 692             address entry_point = (address)CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post);
 693             __ mr_if_needed(R3_ARG1, addr);
 694             __ mr_if_needed(R4_ARG2, count);
 695             __ load_const(R11, entry_point, R0);
 696             __ call_c_and_return_to_caller(R11);
 697           }
 698         }
 699         break;
 700       case BarrierSet::CardTableModRef:
 701       case BarrierSet::CardTableExtension:
 702         {
 703           Label Lskip_loop, Lstore_loop;
 704           if (UseConcMarkSweepGC) {
 705             // TODO PPC port: contribute optimization / requires shared changes
 706             __ release();
 707           }
 708 
 709           CardTableModRefBS* const ct = barrier_set_cast<CardTableModRefBS>(bs);
 710           assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
 711           assert_different_registers(addr, count, tmp);
 712 
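               // Compute the address of the last element, then convert the [addr, last]
               // address range into a range of card indices to dirty.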
 713           __ sldi(count, count, LogBytesPerHeapOop);
 714           __ addi(count, count, -BytesPerHeapOop);
 715           __ add(count, addr, count);
 716           // Use two shifts to clear out those low order two bits! (Cannot opt. into 1.)
 717           __ srdi(addr, addr, CardTableModRefBS::card_shift);
 718           __ srdi(count, count, CardTableModRefBS::card_shift);
 719           __ subf(count, addr, count);
 720           assert_different_registers(R0, addr, count, tmp);
 721           __ load_const(tmp, (address)ct->byte_map_base);
 722           __ addic_(count, count, 1);
 723           __ beq(CCR0, Lskip_loop);
 724           __ li(R0, 0);
 725           __ mtctr(count);
 726           // Byte store loop
 727           __ bind(Lstore_loop);
 728           __ stbx(R0, tmp, addr);
 729           __ addi(addr, addr, 1);
 730           __ bdnz(Lstore_loop);
 731           __ bind(Lskip_loop);
 732 
 733           if (!branchToEnd) __ blr();
 734         }
 735       break;
 736       case BarrierSet::ModRef:
 737         if (!branchToEnd) __ blr();
 738         break;
 739       default:
 740         ShouldNotReachHere();
 741     }
 742   }
 743 
 744   // Support for void zero_words_aligned8(HeapWord* to, size_t count)
 745   //
 746   // Arguments:
 747   //   to:    R3_ARG1, start address (must be 8-byte aligned)
 748   //   count: R4_ARG2, number of dwords to clear
 749   //
 750   // Destroys:  R3_ARG1..R7_ARG5 (used as scratch registers)
 751   //
 752   address generate_zero_words_aligned8() {
 753     StubCodeMark mark(this, "StubRoutines", "zero_words_aligned8");
 754 
 755     // Implemented as in ClearArray.
 756     address start = __ function_entry();
 757 
 758     Register base_ptr_reg   = R3_ARG1; // tohw (needs to be 8b aligned)
 759     Register cnt_dwords_reg = R4_ARG2; // count (in dwords)
 760     Register tmp1_reg       = R5_ARG3;
 761     Register tmp2_reg       = R6_ARG4;
 762     Register zero_reg       = R7_ARG5;
 763 
 764     // Procedure for large arrays (uses data cache block zero instruction).
 765     Label dwloop, fast, fastloop, restloop, lastdword, done;
 766     int cl_size=VM_Version::get_cache_line_size(), cl_dwords=cl_size>>3, cl_dwordaddr_bits=exact_log2(cl_dwords);
 767     int min_dcbz=2; // Needs to be positive, apply dcbz only to at least min_dcbz cache lines.
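         // Overall flow: clear dword-by-dword up to the next cache-line boundary (dwloop),
         // clear whole cache lines with dcbz (fastloop), clear the remaining dwords
         // (restloop), and finally a possible trailing dword (lastdword). Requests too
         // small for dcbz branch straight to restloop.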
 768 
 769     // Clear up to 128byte boundary if long enough, dword_cnt=(16-(base>>3))%16.
 770     __ dcbtst(base_ptr_reg);                    // Indicate write access to first cache line ...
 771     __ andi(tmp2_reg, cnt_dwords_reg, 1);       // to check if number of dwords is even.
 772     __ srdi_(tmp1_reg, cnt_dwords_reg, 1);      // number of double dwords
 773     __ load_const_optimized(zero_reg, 0L);      // Use as zero register.
 774 
 775     __ cmpdi(CCR1, tmp2_reg, 0);                // cnt_dwords even?
 776     __ beq(CCR0, lastdword);                    // size <= 1
 777     __ mtctr(tmp1_reg);                         // Speculatively preload counter for rest loop (>0).
 778     __ cmpdi(CCR0, cnt_dwords_reg, (min_dcbz+1)*cl_dwords-1); // Big enough to ensure >=min_dcbz cache lines are included?
 779     __ neg(tmp1_reg, base_ptr_reg);             // bit 0..58: bogus, bit 57..60: (16-(base>>3))%16, bit 61..63: 000
 780 
 781     __ blt(CCR0, restloop);                     // Too small. (<31=(2*cl_dwords)-1 is sufficient, but bigger performs better.)
 782     __ rldicl_(tmp1_reg, tmp1_reg, 64-3, 64-cl_dwordaddr_bits); // Extract number of dwords to 128byte boundary=(16-(base>>3))%16.
 783 
 784     __ beq(CCR0, fast);                         // already 128byte aligned
 785     __ mtctr(tmp1_reg);                         // Set ctr to hit 128byte boundary (0<ctr<cnt).
 786     __ subf(cnt_dwords_reg, tmp1_reg, cnt_dwords_reg); // rest (>0 since size>=256-8)
 787 
 788     // Clear in first cache line dword-by-dword if not already 128byte aligned.
 789     __ bind(dwloop);
 790       __ std(zero_reg, 0, base_ptr_reg);        // Clear 8byte aligned block.
 791       __ addi(base_ptr_reg, base_ptr_reg, 8);
 792     __ bdnz(dwloop);
 793 
 794     // clear 128byte blocks
 795     __ bind(fast);
 796     __ srdi(tmp1_reg, cnt_dwords_reg, cl_dwordaddr_bits); // loop count for 128byte loop (>0 since size>=256-8)
 797     __ andi(tmp2_reg, cnt_dwords_reg, 1);       // to check if rest even
 798 
 799     __ mtctr(tmp1_reg);                         // load counter
 800     __ cmpdi(CCR1, tmp2_reg, 0);                // rest even?
 801     __ rldicl_(tmp1_reg, cnt_dwords_reg, 63, 65-cl_dwordaddr_bits); // rest in double dwords
 802 
 803     __ bind(fastloop);
 804       __ dcbz(base_ptr_reg);                    // Clear 128byte aligned block.
 805       __ addi(base_ptr_reg, base_ptr_reg, cl_size);
 806     __ bdnz(fastloop);
 807 
 808     //__ dcbtst(base_ptr_reg);                  // Indicate write access to last cache line.
 809     __ beq(CCR0, lastdword);                    // rest<=1
 810     __ mtctr(tmp1_reg);                         // load counter
 811 
 812     // Clear rest.
 813     __ bind(restloop);
 814       __ std(zero_reg, 0, base_ptr_reg);        // Clear 8byte aligned block.
 815       __ std(zero_reg, 8, base_ptr_reg);        // Clear 8byte aligned block.
 816       __ addi(base_ptr_reg, base_ptr_reg, 16);
 817     __ bdnz(restloop);
 818 
 819     __ bind(lastdword);
 820     __ beq(CCR1, done);
 821     __ std(zero_reg, 0, base_ptr_reg);
 822     __ bind(done);
 823     __ blr();                                   // return
 824 
 825     return start;
 826   }
 827 
 828   // The following routine generates a subroutine to throw an asynchronous
 829   // UnknownError when an unsafe access gets a fault that could not be
 830   // reasonably prevented by the programmer.  (Example: SIGBUS/OBJERR.)
 831   //
 832   address generate_handler_for_unsafe_access() {
 833     StubCodeMark mark(this, "StubRoutines", "handler_for_unsafe_access");
 834     address start = __ function_entry();
 835     __ unimplemented("StubRoutines::handler_for_unsafe_access", 93);
 836     return start;
 837   }
 838 
 839 #if !defined(PRODUCT)
 840   // Wrapper which calls oopDesc::is_oop_or_null()
 841   // Only called by MacroAssembler::verify_oop
 842   static void verify_oop_helper(const char* message, oop o) {
 843     if (!o->is_oop_or_null()) {
 844       fatal(message);
 845     }
 846     ++ StubRoutines::_verify_oop_count;
 847   }
 848 #endif
 849 
 850   // Return address of code to be called from code generated by
 851   // MacroAssembler::verify_oop.
 852   //
 853   // Don't generate, rather use C++ code.
 854   address generate_verify_oop() {
 855     // this is actually a `FunctionDescriptor*'.
 856     address start = 0;
 857 
 858 #if !defined(PRODUCT)
 859     start = CAST_FROM_FN_PTR(address, verify_oop_helper);
 860 #endif
 861 
 862     return start;
 863   }
 864 
 865   // Fairer handling of safepoints for native methods.
 866   //
 867   // Generate code which reads from the polling page. This special handling is needed as the
 868   // linux-ppc64 kernel before 2.6.6 doesn't set si_addr on some segfaults in 64bit mode
 869   // (cf. http://www.kernel.org/pub/linux/kernel/v2.6/ChangeLog-2.6.6), especially when we try
 870   // to read from the safepoint polling page.
 871   address generate_load_from_poll() {
 872     StubCodeMark mark(this, "StubRoutines", "generate_load_from_poll");
 873     address start = __ function_entry();
 874     __ unimplemented("StubRoutines::verify_oop", 95);  // TODO PPC port
 875     return start;
 876   }
 877 
 878   // -XX:+OptimizeFill : convert fill/copy loops into intrinsic
 879   //
 880   // The code is implemented (ported from SPARC) because we believe it benefits JVM98; however,
 881   // tracing (-XX:+TraceOptimizeFill) shows the intrinsic replacement doesn't happen at all!
 882   //
 883   // The source code of is_range_check_if() shows that OptimizeFill relaxed the condition
 884   // for turning on the loop predication optimization, and hence the behavior of "array range check"
 885   // and "loop invariant check" could be influenced, which potentially boosted JVM98.
 886   //
 887   // Generate stub for disjoint short fill. If "aligned" is true, the
 888   // "to" address is assumed to be heapword aligned.
 889   //
 890   // Arguments for generated stub:
 891   //   to:    R3_ARG1
 892   //   value: R4_ARG2
 893   //   count: R5_ARG3 treated as signed
 894   //
 895   address generate_fill(BasicType t, bool aligned, const char* name) {
 896     StubCodeMark mark(this, "StubRoutines", name);
 897     address start = __ function_entry();
 898 
 899     const Register to    = R3_ARG1;   // destination array address
 900     const Register value = R4_ARG2;   // fill value
 901     const Register count = R5_ARG3;   // elements count
 902     const Register temp  = R6_ARG4;   // temp register
 903 
 904     //assert_clean_int(count, O3);    // Make sure 'count' is clean int.
 905 
 906     Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
 907     Label L_fill_2_bytes, L_fill_4_bytes, L_fill_elements, L_fill_32_bytes;
 908 
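         // 'shift' is chosen per element type so that (1 << shift) elements occupy
         // 4 bytes and (2 << shift) elements occupy 8 bytes.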
 909     int shift = -1;
 910     switch (t) {
 911        case T_BYTE:
 912         shift = 2;
 913         // Clone bytes (zero extend not needed because store instructions below ignore high order bytes).
 914         __ rldimi(value, value, 8, 48);     // 8 bit -> 16 bit
 915         __ cmpdi(CCR0, count, 2<<shift);    // Short arrays (< 8 bytes) fill by element.
 916         __ blt(CCR0, L_fill_elements);
 917         __ rldimi(value, value, 16, 32);    // 16 bit -> 32 bit
 918         break;
 919        case T_SHORT:
 920         shift = 1;
 921         // Clone bytes (zero extend not needed because store instructions below ignore high order bytes).
 922         __ rldimi(value, value, 16, 32);    // 16 bit -> 32 bit
 923         __ cmpdi(CCR0, count, 2<<shift);    // Short arrays (< 8 bytes) fill by element.
 924         __ blt(CCR0, L_fill_elements);
 925         break;
 926       case T_INT:
 927         shift = 0;
 928         __ cmpdi(CCR0, count, 2<<shift);    // Short arrays (< 8 bytes) fill by element.
 929         __ blt(CCR0, L_fill_4_bytes);
 930         break;
 931       default: ShouldNotReachHere();
 932     }
 933 
 934     if (!aligned && (t == T_BYTE || t == T_SHORT)) {
 935       // Align source address at 4 bytes address boundary.
 936       if (t == T_BYTE) {
 937         // One byte misalignment happens only for byte arrays.
 938         __ andi_(temp, to, 1);
 939         __ beq(CCR0, L_skip_align1);
 940         __ stb(value, 0, to);
 941         __ addi(to, to, 1);
 942         __ addi(count, count, -1);
 943         __ bind(L_skip_align1);
 944       }
 945       // Two bytes misalignment happens only for byte and short (char) arrays.
 946       __ andi_(temp, to, 2);
 947       __ beq(CCR0, L_skip_align2);
 948       __ sth(value, 0, to);
 949       __ addi(to, to, 2);
 950       __ addi(count, count, -(1 << (shift - 1)));
 951       __ bind(L_skip_align2);
 952     }
 953 
 954     if (!aligned) {
 955       // Align to 8 bytes, we know we are 4 byte aligned to start.
 956       __ andi_(temp, to, 7);
 957       __ beq(CCR0, L_fill_32_bytes);
 958       __ stw(value, 0, to);
 959       __ addi(to, to, 4);
 960       __ addi(count, count, -(1 << shift));
 961       __ bind(L_fill_32_bytes);
 962     }
 963 
 964     __ li(temp, 8<<shift);                  // Prepare for 32 byte loop.
 965     // Clone bytes int->long as above.
 966     __ rldimi(value, value, 32, 0);         // 32 bit -> 64 bit
 967 
 968     Label L_check_fill_8_bytes;
 969     // Fill 32-byte chunks.
 970     __ subf_(count, temp, count);
 971     __ blt(CCR0, L_check_fill_8_bytes);
 972 
 973     Label L_fill_32_bytes_loop;
 974     __ align(32);
 975     __ bind(L_fill_32_bytes_loop);
 976 
 977     __ std(value, 0, to);
 978     __ std(value, 8, to);
 979     __ subf_(count, temp, count);           // Update count.
 980     __ std(value, 16, to);
 981     __ std(value, 24, to);
 982 
 983     __ addi(to, to, 32);
 984     __ bge(CCR0, L_fill_32_bytes_loop);
 985 
 986     __ bind(L_check_fill_8_bytes);
 987     __ add_(count, temp, count);
 988     __ beq(CCR0, L_exit);
 989     __ addic_(count, count, -(2 << shift));
 990     __ blt(CCR0, L_fill_4_bytes);
 991 
 992     //
 993     // Length is too short, just fill 8 bytes at a time.
 994     //
 995     Label L_fill_8_bytes_loop;
 996     __ bind(L_fill_8_bytes_loop);
 997     __ std(value, 0, to);
 998     __ addic_(count, count, -(2 << shift));
 999     __ addi(to, to, 8);
1000     __ bge(CCR0, L_fill_8_bytes_loop);
1001 
1002     // Fill trailing 4 bytes.
1003     __ bind(L_fill_4_bytes);
1004     __ andi_(temp, count, 1<<shift);
1005     __ beq(CCR0, L_fill_2_bytes);
1006 
1007     __ stw(value, 0, to);
1008     if (t == T_BYTE || t == T_SHORT) {
1009       __ addi(to, to, 4);
1010       // Fill trailing 2 bytes.
1011       __ bind(L_fill_2_bytes);
1012       __ andi_(temp, count, 1<<(shift-1));
1013       __ beq(CCR0, L_fill_byte);
1014       __ sth(value, 0, to);
1015       if (t == T_BYTE) {
1016         __ addi(to, to, 2);
1017         // Fill trailing byte.
1018         __ bind(L_fill_byte);
1019         __ andi_(count, count, 1);
1020         __ beq(CCR0, L_exit);
1021         __ stb(value, 0, to);
1022       } else {
1023         __ bind(L_fill_byte);
1024       }
1025     } else {
1026       __ bind(L_fill_2_bytes);
1027     }
1028     __ bind(L_exit);
1029     __ blr();
1030 
1031     // Handle copies less than 8 bytes. Int is handled elsewhere.
1032     if (t == T_BYTE) {
1033       __ bind(L_fill_elements);
1034       Label L_fill_2, L_fill_4;
1035       __ andi_(temp, count, 1);
1036       __ beq(CCR0, L_fill_2);
1037       __ stb(value, 0, to);
1038       __ addi(to, to, 1);
1039       __ bind(L_fill_2);
1040       __ andi_(temp, count, 2);
1041       __ beq(CCR0, L_fill_4);
1042       __ stb(value, 0, to);
1043       __ stb(value, 0, to);
1044       __ addi(to, to, 2);
1045       __ bind(L_fill_4);
1046       __ andi_(temp, count, 4);
1047       __ beq(CCR0, L_exit);
1048       __ stb(value, 0, to);
1049       __ stb(value, 1, to);
1050       __ stb(value, 2, to);
1051       __ stb(value, 3, to);
1052       __ blr();
1053     }
1054 
1055     if (t == T_SHORT) {
1056       Label L_fill_2;
1057       __ bind(L_fill_elements);
1058       __ andi_(temp, count, 1);
1059       __ beq(CCR0, L_fill_2);
1060       __ sth(value, 0, to);
1061       __ addi(to, to, 2);
1062       __ bind(L_fill_2);
1063       __ andi_(temp, count, 2);
1064       __ beq(CCR0, L_exit);
1065       __ sth(value, 0, to);
1066       __ sth(value, 2, to);
1067       __ blr();
1068     }
1069     return start;
1070   }
1071 
1072 
1073   // Generate overlap test for array copy stubs.
1074   //
1075   // Input:
1076   //   R3_ARG1    -  from
1077   //   R4_ARG2    -  to
1078   //   R5_ARG3    -  element count
1079   //
1080   void array_overlap_test(address no_overlap_target, int log2_elem_size) {
1081     Register tmp1 = R6_ARG4;
1082     Register tmp2 = R7_ARG5;
1083 
1084     Label l_overlap;
1085 #ifdef ASSERT
1086     __ srdi_(tmp2, R5_ARG3, 31);
1087     __ asm_assert_eq("missing zero extend", 0xAFFE);
1088 #endif
1089 
1090     __ subf(tmp1, R3_ARG1, R4_ARG2); // distance in bytes
1091     __ sldi(tmp2, R5_ARG3, log2_elem_size); // size in bytes
1092     __ cmpld(CCR0, R3_ARG1, R4_ARG2); // Use unsigned comparison!
1093     __ cmpld(CCR1, tmp1, tmp2);
1094     __ crand(CCR0, Assembler::less, CCR1, Assembler::less);
1095     __ blt(CCR0, l_overlap); // Src before dst and distance smaller than size.
1096 
1097     // need to copy forwards
1098     if (__ is_within_range_of_b(no_overlap_target, __ pc())) {
1099       __ b(no_overlap_target);
1100     } else {
1101       __ load_const(tmp1, no_overlap_target, tmp2);
1102       __ mtctr(tmp1);
1103       __ bctr();
1104     }
1105 
1106     __ bind(l_overlap);
1107     // need to copy backwards
1108   }
1109 
1110   // The guideline in the implementations of generate_disjoint_xxx_copy
1111   // (xxx=byte,short,int,long,oop) is to copy as many elements as possible with
1112   // single instructions, but to avoid alignment interrupts (see subsequent
1113   // comment). Furthermore, we try to minimize misaligned accesses, even
1114   // when they do not cause an alignment interrupt.
1115   //
1116   // In Big-Endian mode, the PowerPC architecture requires implementations to
1117   // handle automatically misaligned integer halfword and word accesses,
1118   // word-aligned integer doubleword accesses, and word-aligned floating-point
1119   // accesses. Other accesses may or may not generate an Alignment interrupt
1120   // depending on the implementation.
1121   // Alignment interrupt handling may require on the order of hundreds of cycles,
1122   // so every effort should be made to avoid misaligned memory values.
1123   //
1124   //
1125   // Generate stub for disjoint byte copy.  If "aligned" is true, the
1126   // "from" and "to" addresses are assumed to be heapword aligned.
1127   //
1128   // Arguments for generated stub:
1129   //      from:  R3_ARG1
1130   //      to:    R4_ARG2
1131   //      count: R5_ARG3 treated as signed
1132   //
1133   address generate_disjoint_byte_copy(bool aligned, const char * name) {
1134     StubCodeMark mark(this, "StubRoutines", name);
1135     address start = __ function_entry();
1136 
1137     Register tmp1 = R6_ARG4;
1138     Register tmp2 = R7_ARG5;
1139     Register tmp3 = R8_ARG6;
1140     Register tmp4 = R9_ARG7;
1141 
1142 
1143     Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9;
1144     // Don't try anything fancy if arrays don't have many elements.
1145     __ li(tmp3, 0);
1146     __ cmpwi(CCR0, R5_ARG3, 17);
1147     __ ble(CCR0, l_6); // copy 4 at a time
1148 
1149     if (!aligned) {
1150       __ xorr(tmp1, R3_ARG1, R4_ARG2);
1151       __ andi_(tmp1, tmp1, 3);
1152       __ bne(CCR0, l_6); // If arrays don't have the same alignment mod 4, do 4 element copy.
1153 
1154       // Copy elements if necessary to align to 4 bytes.
1155       __ neg(tmp1, R3_ARG1); // Compute distance to alignment boundary.
1156       __ andi_(tmp1, tmp1, 3);
1157       __ beq(CCR0, l_2);
1158 
1159       __ subf(R5_ARG3, tmp1, R5_ARG3);
1160       __ bind(l_9);
1161       __ lbz(tmp2, 0, R3_ARG1);
1162       __ addic_(tmp1, tmp1, -1);
1163       __ stb(tmp2, 0, R4_ARG2);
1164       __ addi(R3_ARG1, R3_ARG1, 1);
1165       __ addi(R4_ARG2, R4_ARG2, 1);
1166       __ bne(CCR0, l_9);
1167 
1168       __ bind(l_2);
1169     }
1170 
1171     // copy 8 elements at a time
1172     __ xorr(tmp2, R3_ARG1, R4_ARG2); // skip if src & dest have differing alignment mod 8
1173     __ andi_(tmp1, tmp2, 7);
1174     __ bne(CCR0, l_7); // not same alignment -> to or from is aligned -> copy 8
1175 
1176     // copy a 2-element word if necessary to align to 8 bytes
1177     __ andi_(R0, R3_ARG1, 7);
1178     __ beq(CCR0, l_7);
1179 
1180     __ lwzx(tmp2, R3_ARG1, tmp3);
1181     __ addi(R5_ARG3, R5_ARG3, -4);
1182     __ stwx(tmp2, R4_ARG2, tmp3);
1183     { // FasterArrayCopy
1184       __ addi(R3_ARG1, R3_ARG1, 4);
1185       __ addi(R4_ARG2, R4_ARG2, 4);
1186     }
1187     __ bind(l_7);
1188 
1189     { // FasterArrayCopy
1190       __ cmpwi(CCR0, R5_ARG3, 31);
1191       __ ble(CCR0, l_6); // copy 2 at a time if less than 32 elements remain
1192 
1193       __ srdi(tmp1, R5_ARG3, 5);
1194       __ andi_(R5_ARG3, R5_ARG3, 31);
1195       __ mtctr(tmp1);
1196 
1197       __ bind(l_8);
1198       // Use unrolled version for mass copying (copy 32 elements a time)
1199       // Load feeding store gets zero latency on Power6, however not on Power5.
1200       // Therefore, the following sequence is made for the good of both.
1201       __ ld(tmp1, 0, R3_ARG1);
1202       __ ld(tmp2, 8, R3_ARG1);
1203       __ ld(tmp3, 16, R3_ARG1);
1204       __ ld(tmp4, 24, R3_ARG1);
1205       __ std(tmp1, 0, R4_ARG2);
1206       __ std(tmp2, 8, R4_ARG2);
1207       __ std(tmp3, 16, R4_ARG2);
1208       __ std(tmp4, 24, R4_ARG2);
1209       __ addi(R3_ARG1, R3_ARG1, 32);
1210       __ addi(R4_ARG2, R4_ARG2, 32);
1211       __ bdnz(l_8);
1212     }
1213 
1214     __ bind(l_6);
1215 
1216     // copy 4 elements at a time
1217     __ cmpwi(CCR0, R5_ARG3, 4);
1218     __ blt(CCR0, l_1);
1219     __ srdi(tmp1, R5_ARG3, 2);
1220     __ mtctr(tmp1); // is > 0
1221     __ andi_(R5_ARG3, R5_ARG3, 3);
1222 
1223     { // FasterArrayCopy
1224       __ addi(R3_ARG1, R3_ARG1, -4);
1225       __ addi(R4_ARG2, R4_ARG2, -4);
1226       __ bind(l_3);
1227       __ lwzu(tmp2, 4, R3_ARG1);
1228       __ stwu(tmp2, 4, R4_ARG2);
1229       __ bdnz(l_3);
1230       __ addi(R3_ARG1, R3_ARG1, 4);
1231       __ addi(R4_ARG2, R4_ARG2, 4);
1232     }
1233 
1234     // do single element copy
1235     __ bind(l_1);
1236     __ cmpwi(CCR0, R5_ARG3, 0);
1237     __ beq(CCR0, l_4);
1238 
1239     { // FasterArrayCopy
1240       __ mtctr(R5_ARG3);
1241       __ addi(R3_ARG1, R3_ARG1, -1);
1242       __ addi(R4_ARG2, R4_ARG2, -1);
1243 
1244       __ bind(l_5);
1245       __ lbzu(tmp2, 1, R3_ARG1);
1246       __ stbu(tmp2, 1, R4_ARG2);
1247       __ bdnz(l_5);
1248     }
1249 
1250     __ bind(l_4);
1251     __ blr();
1252 
1253     return start;
1254   }
1255 
1256   // Generate stub for conjoint byte copy.  If "aligned" is true, the
1257   // "from" and "to" addresses are assumed to be heapword aligned.
1258   //
1259   // Arguments for generated stub:
1260   //      from:  R3_ARG1
1261   //      to:    R4_ARG2
1262   //      count: R5_ARG3 treated as signed
1263   //
1264   address generate_conjoint_byte_copy(bool aligned, const char * name) {
1265     StubCodeMark mark(this, "StubRoutines", name);
1266     address start = __ function_entry();
1267 
1268     Register tmp1 = R6_ARG4;
1269     Register tmp2 = R7_ARG5;
1270     Register tmp3 = R8_ARG6;
1271 
1272 #if defined(ABI_ELFv2)
1273     address nooverlap_target = aligned ?
1274       StubRoutines::arrayof_jbyte_disjoint_arraycopy() :
1275       StubRoutines::jbyte_disjoint_arraycopy();
1276 #else
1277     address nooverlap_target = aligned ?
1278       ((FunctionDescriptor*)StubRoutines::arrayof_jbyte_disjoint_arraycopy())->entry() :
1279       ((FunctionDescriptor*)StubRoutines::jbyte_disjoint_arraycopy())->entry();
1280 #endif
1281 
1282     array_overlap_test(nooverlap_target, 0);
1283     // Do reverse copy. We assume the case of actual overlap is rare enough
1284     // that we don't have to optimize it.
1285     Label l_1, l_2;
1286 
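         // Branch into the middle of the loop: decrement the index and load first;
         // the store at l_1 is executed at the top of the next iteration.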
1287     __ b(l_2);
1288     __ bind(l_1);
1289     __ stbx(tmp1, R4_ARG2, R5_ARG3);
1290     __ bind(l_2);
1291     __ addic_(R5_ARG3, R5_ARG3, -1);
1292     __ lbzx(tmp1, R3_ARG1, R5_ARG3);
1293     __ bge(CCR0, l_1);
1294 
1295     __ blr();
1296 
1297     return start;
1298   }
1299 
1300   // Generate stub for disjoint short copy.  If "aligned" is true, the
1301   // "from" and "to" addresses are assumed to be heapword aligned.
1302   //
1303   // Arguments for generated stub:
1304   //      from:  R3_ARG1
1305   //      to:    R4_ARG2
1306   //  elm.count: R5_ARG3 treated as signed
1307   //
1308   // Strategy for aligned==true:
1309   //
1310   //  If length <= 9:
1311   //     1. copy 2 elements at a time (l_6)
1312   //     2. copy last element if original element count was odd (l_1)
1313   //
1314   //  If length > 9:
1315   //     1. copy 4 elements at a time until less than 4 elements are left (l_7)
1316   //     2. copy 2 elements at a time until less than 2 elements are left (l_6)
1317   //     3. copy last element if one was left in step 2. (l_1)
1318   //
1319   //
1320   // Strategy for aligned==false:
1321   //
1322   //  If length <= 9: same as aligned==true case, but NOTE: load/stores
1323   //                  can be unaligned (see comment below)
1324   //
1325   //  If length > 9:
1326   //     1. continue with step 6. if the alignment of from and to mod 4
1327   //        is different.
1328   //     2. align from and to to 4 bytes by copying 1 element if necessary
1329   //     3. at l_2 from and to are 4 byte aligned; continue with
1330   //        5. if they cannot be aligned to 8 bytes because they have
1331   //        got different alignment mod 8.
1332   //     4. at this point we know that both from and to have the same
1333   //        alignment mod 8; now copy one element if necessary to get
1334   //        8-byte alignment of from and to.
1335   //     5. copy 4 elements at a time until less than 4 elements are
1336   //        left; depending on step 3. all load/stores are aligned or
1337   //        either all loads or all stores are unaligned.
1338   //     6. copy 2 elements at a time until less than 2 elements are
1339   //        left (l_6); arriving here from step 1., there is a chance
1340   //        that all accesses are unaligned.
1341   //     7. copy last element if one was left in step 6. (l_1)
1342   //
1343   //  There are unaligned data accesses using integer load/store
1344   //  instructions in this stub. POWER allows such accesses.
1345   //
1346   //  According to the manuals (PowerISA_V2.06_PUBLIC, Book II,
1347   //  Chapter 2: Effect of Operand Placement on Performance) unaligned
1348   //  integer load/stores have good performance. Only unaligned
1349   //  floating point load/stores can have poor performance.
1350   //
1351   //  TODO:
1352   //
1353   //  1. check if aligning the backbranch target of loops is beneficial
1354   //
1355   address generate_disjoint_short_copy(bool aligned, const char * name) {
1356     StubCodeMark mark(this, "StubRoutines", name);
1357 
1358     Register tmp1 = R6_ARG4;
1359     Register tmp2 = R7_ARG5;
1360     Register tmp3 = R8_ARG6;
1361     Register tmp4 = R9_ARG7;
1362 
1363     address start = __ function_entry();
1364 
1365     Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8;
1366     // Don't try anything fancy if arrays don't have many elements.
1367     __ li(tmp3, 0);
1368     __ cmpwi(CCR0, R5_ARG3, 9);
1369     __ ble(CCR0, l_6); // copy 2 at a time
1370 
1371     if (!aligned) {
1372       __ xorr(tmp1, R3_ARG1, R4_ARG2);
1373       __ andi_(tmp1, tmp1, 3);
1374       __ bne(CCR0, l_6); // if arrays don't have the same alignment mod 4, do 2 element copy
1375 
1376       // At this point it is guaranteed that both from and to have the same alignment mod 4.
1377 
1378       // Copy 1 element if necessary to align to 4 bytes.
1379       __ andi_(tmp1, R3_ARG1, 3);
1380       __ beq(CCR0, l_2);
1381 
1382       __ lhz(tmp2, 0, R3_ARG1);
1383       __ addi(R3_ARG1, R3_ARG1, 2);
1384       __ sth(tmp2, 0, R4_ARG2);
1385       __ addi(R4_ARG2, R4_ARG2, 2);
1386       __ addi(R5_ARG3, R5_ARG3, -1);
1387       __ bind(l_2);
1388 
1389       // At this point the positions of both, from and to, are at least 4 byte aligned.
1390 
1391       // Copy 4 elements at a time.
1392       // Align to 8 bytes, but only if both, from and to, have same alignment mod 8.
1393       __ xorr(tmp2, R3_ARG1, R4_ARG2);
1394       __ andi_(tmp1, tmp2, 7);
1395       __ bne(CCR0, l_7); // not same alignment mod 8 -> copy 4, either from or to will be unaligned
1396 
1397       // Copy a 2-element word if necessary to align to 8 bytes.
1398       __ andi_(R0, R3_ARG1, 7);
1399       __ beq(CCR0, l_7);
1400 
1401       __ lwzx(tmp2, R3_ARG1, tmp3);
1402       __ addi(R5_ARG3, R5_ARG3, -2);
1403       __ stwx(tmp2, R4_ARG2, tmp3);
1404       { // FasterArrayCopy
1405         __ addi(R3_ARG1, R3_ARG1, 4);
1406         __ addi(R4_ARG2, R4_ARG2, 4);
1407       }
1408     }
1409 
1410     __ bind(l_7);
1411 
1412     // Copy 4 elements at a time; either the loads or the stores can
1413     // be unaligned if aligned == false.
1414 
1415     { // FasterArrayCopy
1416       __ cmpwi(CCR0, R5_ARG3, 15);
1417       __ ble(CCR0, l_6); // copy 2 at a time if less than 16 elements remain
1418 
1419       __ srdi(tmp1, R5_ARG3, 4);
1420       __ andi_(R5_ARG3, R5_ARG3, 15);
1421       __ mtctr(tmp1);
1422 
1423       __ bind(l_8);
1424       // Use unrolled version for mass copying (copy 16 elements a time).
1425       // Load feeding store gets zero latency on Power6, however not on Power5.
1426       // Therefore, the following sequence is made for the good of both.
1427       __ ld(tmp1, 0, R3_ARG1);
1428       __ ld(tmp2, 8, R3_ARG1);
1429       __ ld(tmp3, 16, R3_ARG1);
1430       __ ld(tmp4, 24, R3_ARG1);
1431       __ std(tmp1, 0, R4_ARG2);
1432       __ std(tmp2, 8, R4_ARG2);
1433       __ std(tmp3, 16, R4_ARG2);
1434       __ std(tmp4, 24, R4_ARG2);
1435       __ addi(R3_ARG1, R3_ARG1, 32);
1436       __ addi(R4_ARG2, R4_ARG2, 32);
1437       __ bdnz(l_8);
1438     }
1439     __ bind(l_6);
1440 
1441     // copy 2 elements at a time
1442     { // FasterArrayCopy
1443       __ cmpwi(CCR0, R5_ARG3, 2);
1444       __ blt(CCR0, l_1);
1445       __ srdi(tmp1, R5_ARG3, 1);
1446       __ andi_(R5_ARG3, R5_ARG3, 1);
1447 
1448       __ addi(R3_ARG1, R3_ARG1, -4);
1449       __ addi(R4_ARG2, R4_ARG2, -4);
1450       __ mtctr(tmp1);
1451 
1452       __ bind(l_3);
1453       __ lwzu(tmp2, 4, R3_ARG1);
1454       __ stwu(tmp2, 4, R4_ARG2);
1455       __ bdnz(l_3);
1456 
1457       __ addi(R3_ARG1, R3_ARG1, 4);
1458       __ addi(R4_ARG2, R4_ARG2, 4);
1459     }
1460 
1461     // do single element copy
1462     __ bind(l_1);
1463     __ cmpwi(CCR0, R5_ARG3, 0);
1464     __ beq(CCR0, l_4);
1465 
1466     { // FasterArrayCopy
1467       __ mtctr(R5_ARG3);
1468       __ addi(R3_ARG1, R3_ARG1, -2);
1469       __ addi(R4_ARG2, R4_ARG2, -2);
1470 
1471       __ bind(l_5);
1472       __ lhzu(tmp2, 2, R3_ARG1);
1473       __ sthu(tmp2, 2, R4_ARG2);
1474       __ bdnz(l_5);
1475     }
1476     __ bind(l_4);
1477     __ blr();
1478 
1479     return start;
1480   }
1481 
1482   // Generate stub for conjoint short copy.  If "aligned" is true, the
1483   // "from" and "to" addresses are assumed to be heapword aligned.
1484   //
1485   // Arguments for generated stub:
1486   //      from:  R3_ARG1
1487   //      to:    R4_ARG2
1488   //      count: R5_ARG3 treated as signed
1489   //
1490   address generate_conjoint_short_copy(bool aligned, const char * name) {
1491     StubCodeMark mark(this, "StubRoutines", name);
1492     address start = __ function_entry();
1493 
1494     Register tmp1 = R6_ARG4;
1495     Register tmp2 = R7_ARG5;
1496     Register tmp3 = R8_ARG6;
1497 
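         // Obtain the entry of the matching disjoint stub: with the ELFv2 ABI the
         // StubRoutines value is the code address itself, otherwise (ELFv1/AIX) it
         // points to a function descriptor and the entry is read from there.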
1498 #if defined(ABI_ELFv2)
1499     address nooverlap_target = aligned ?
1500         StubRoutines::arrayof_jshort_disjoint_arraycopy() :
1501         StubRoutines::jshort_disjoint_arraycopy();
1502 #else
1503     address nooverlap_target = aligned ?
1504         ((FunctionDescriptor*)StubRoutines::arrayof_jshort_disjoint_arraycopy())->entry() :
1505         ((FunctionDescriptor*)StubRoutines::jshort_disjoint_arraycopy())->entry();
1506 #endif
1507 
1508     array_overlap_test(nooverlap_target, 1);
1509 
1510     Label l_1, l_2;
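         // Copy backwards, one halfword per iteration, so an overlapping copy with
         // to > from is correct: tmp1 starts as the byte size (count * 2) and is
         // decremented to the byte offset of the element being moved; the load at
         // l_2 and the store at l_1 use the same offset. The final pass issues one
         // extra load (at offset -2) whose value is never stored.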
1511     __ sldi(tmp1, R5_ARG3, 1);
1512     __ b(l_2);
1513     __ bind(l_1);
1514     __ sthx(tmp2, R4_ARG2, tmp1);
1515     __ bind(l_2);
1516     __ addic_(tmp1, tmp1, -2);
1517     __ lhzx(tmp2, R3_ARG1, tmp1);
1518     __ bge(CCR0, l_1);
1519 
1520     __ blr();
1521 
1522     return start;
1523   }
1524 
1525   // Generate core code for disjoint int copy (and oop copy on 32-bit).  If "aligned"
1526   // is true, the "from" and "to" addresses are assumed to be heapword aligned.
1527   //
1528   // Arguments:
1529   //      from:  R3_ARG1
1530   //      to:    R4_ARG2
1531   //      count: R5_ARG3 treated as signed
1532   //
1533   void generate_disjoint_int_copy_core(bool aligned) {
1534     Register tmp1 = R6_ARG4;
1535     Register tmp2 = R7_ARG5;
1536     Register tmp3 = R8_ARG6;
1537     Register tmp4 = R0;
1538 
1539     Label l_1, l_2, l_3, l_4, l_5, l_6;
1540     // for short arrays, just do single element copy
1541     __ li(tmp3, 0);
1542     __ cmpwi(CCR0, R5_ARG3, 5);
1543     __ ble(CCR0, l_2);
1544 
1545     if (!aligned) {
1546       // Check if the arrays have the same alignment mod 8.
1547       __ xorr(tmp1, R3_ARG1, R4_ARG2);
1548       __ andi_(R0, tmp1, 7);
1549       // Not the same alignment, but ld and std just need to be 4 byte aligned.
1550       __ bne(CCR0, l_4); // Different alignment mod 8 -> skip the alignment fixup below.
1551 
1552       // Copy 1 element to align 'to' and 'from' on an 8 byte boundary.
1553       __ andi_(R0, R3_ARG1, 7);
1554       __ beq(CCR0, l_4);
1555 
1556       __ lwzx(tmp2, R3_ARG1, tmp3);
1557       __ addi(R5_ARG3, R5_ARG3, -1);
1558       __ stwx(tmp2, R4_ARG2, tmp3);
1559       { // FasterArrayCopy
1560         __ addi(R3_ARG1, R3_ARG1, 4);
1561         __ addi(R4_ARG2, R4_ARG2, 4);
1562       }
1563       __ bind(l_4);
1564     }
1565 
1566     { // FasterArrayCopy
1567       __ cmpwi(CCR0, R5_ARG3, 7);
1568       __ ble(CCR0, l_2); // copy 1 at a time if less than 8 elements remain
1569 
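           // ctr = n / 8 iterations for the unrolled loop; R5_ARG3 keeps n % 8 for
           // the single-element tail copy below.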
1570       __ srdi(tmp1, R5_ARG3, 3);
1571       __ andi_(R5_ARG3, R5_ARG3, 7);
1572       __ mtctr(tmp1);
1573 
1574       __ bind(l_6);
1575       // Use unrolled version for mass copying (copy 8 elements at a time).
1576       // Load feeding store gets zero latency on Power6, however not on Power5.
1577       // Therefore, the following sequence is made for the good of both.
1578       __ ld(tmp1, 0, R3_ARG1);
1579       __ ld(tmp2, 8, R3_ARG1);
1580       __ ld(tmp3, 16, R3_ARG1);
1581       __ ld(tmp4, 24, R3_ARG1);
1582       __ std(tmp1, 0, R4_ARG2);
1583       __ std(tmp2, 8, R4_ARG2);
1584       __ std(tmp3, 16, R4_ARG2);
1585       __ std(tmp4, 24, R4_ARG2);
1586       __ addi(R3_ARG1, R3_ARG1, 32);
1587       __ addi(R4_ARG2, R4_ARG2, 32);
1588       __ bdnz(l_6);
1589     }
1590 
1591     // copy 1 element at a time
1592     __ bind(l_2);
1593     __ cmpwi(CCR0, R5_ARG3, 0);
1594     __ beq(CCR0, l_1);
1595 
1596     { // FasterArrayCopy
1597       __ mtctr(R5_ARG3);
1598       __ addi(R3_ARG1, R3_ARG1, -4);
1599       __ addi(R4_ARG2, R4_ARG2, -4);
1600 
1601       __ bind(l_3);
1602       __ lwzu(tmp2, 4, R3_ARG1);
1603       __ stwu(tmp2, 4, R4_ARG2);
1604       __ bdnz(l_3);
1605     }
1606 
1607     __ bind(l_1);
1608     return;
1609   }
1610 
1611   // Generate stub for disjoint int copy.  If "aligned" is true, the
1612   // "from" and "to" addresses are assumed to be heapword aligned.
1613   //
1614   // Arguments for generated stub:
1615   //      from:  R3_ARG1
1616   //      to:    R4_ARG2
1617   //      count: R5_ARG3 treated as signed
1618   //
1619   address generate_disjoint_int_copy(bool aligned, const char * name) {
1620     StubCodeMark mark(this, "StubRoutines", name);
1621     address start = __ function_entry();
1622     generate_disjoint_int_copy_core(aligned);
1623     __ blr();
1624     return start;
1625   }
1626 
1627   // Generate core code for conjoint int copy (and oop copy on
1628   // 32-bit).  If "aligned" is true, the "from" and "to" addresses
1629   // are assumed to be heapword aligned.
1630   //
1631   // Arguments:
1632   //      from:  R3_ARG1
1633   //      to:    R4_ARG2
1634   //      count: R5_ARG3 treated as signed
1635   //
1636   void generate_conjoint_int_copy_core(bool aligned) {
1637     // Do reverse copy.  We assume the case of actual overlap is rare enough
1638     // that we don't have to optimize it.
1639 
1640     Label l_1, l_2, l_3, l_4, l_5, l_6;
1641 
1642     Register tmp1 = R6_ARG4;
1643     Register tmp2 = R7_ARG5;
1644     Register tmp3 = R8_ARG6;
1645     Register tmp4 = R0;
1646 
1647     { // FasterArrayCopy
1648       __ cmpwi(CCR0, R5_ARG3, 0);
1649       __ beq(CCR0, l_6);
1650 
1651       __ sldi(R5_ARG3, R5_ARG3, 2);
1652       __ add(R3_ARG1, R3_ARG1, R5_ARG3);
1653       __ add(R4_ARG2, R4_ARG2, R5_ARG3);
1654       __ srdi(R5_ARG3, R5_ARG3, 2);
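           // from and to now point just past the last element; count was scaled to
           // bytes and back only to form these end addresses. The loops below copy
           // downwards.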
1655 
1656       __ cmpwi(CCR0, R5_ARG3, 7);
1657       __ ble(CCR0, l_5); // copy 1 at a time if less than 8 elements remain
1658 
1659       __ srdi(tmp1, R5_ARG3, 3);
1660       __ andi(R5_ARG3, R5_ARG3, 7);
1661       __ mtctr(tmp1);
1662 
1663       __ bind(l_4);
1664       // Use unrolled version for mass copying (copy 8 elements at a time).
1665       // Load feeding store gets zero latency on Power6, however not on Power5.
1666       // Therefore, the following sequence is made for the good of both.
1667       __ addi(R3_ARG1, R3_ARG1, -32);
1668       __ addi(R4_ARG2, R4_ARG2, -32);
1669       __ ld(tmp4, 24, R3_ARG1);
1670       __ ld(tmp3, 16, R3_ARG1);
1671       __ ld(tmp2, 8, R3_ARG1);
1672       __ ld(tmp1, 0, R3_ARG1);
1673       __ std(tmp4, 24, R4_ARG2);
1674       __ std(tmp3, 16, R4_ARG2);
1675       __ std(tmp2, 8, R4_ARG2);
1676       __ std(tmp1, 0, R4_ARG2);
1677       __ bdnz(l_4);
1678 
1679       __ cmpwi(CCR0, R5_ARG3, 0);
1680       __ beq(CCR0, l_6);
1681 
1682       __ bind(l_5);
1683       __ mtctr(R5_ARG3);
1684       __ bind(l_3);
1685       __ lwz(R0, -4, R3_ARG1);
1686       __ stw(R0, -4, R4_ARG2);
1687       __ addi(R3_ARG1, R3_ARG1, -4);
1688       __ addi(R4_ARG2, R4_ARG2, -4);
1689       __ bdnz(l_3);
1690 
1691       __ bind(l_6);
1692     }
1693   }
1694 
1695   // Generate stub for conjoint int copy.  If "aligned" is true, the
1696   // "from" and "to" addresses are assumed to be heapword aligned.
1697   //
1698   // Arguments for generated stub:
1699   //      from:  R3_ARG1
1700   //      to:    R4_ARG2
1701   //      count: R5_ARG3 treated as signed
1702   //
1703   address generate_conjoint_int_copy(bool aligned, const char * name) {
1704     StubCodeMark mark(this, "StubRoutines", name);
1705     address start = __ function_entry();
1706 
1707 #if defined(ABI_ELFv2)
1708     address nooverlap_target = aligned ?
1709       StubRoutines::arrayof_jint_disjoint_arraycopy() :
1710       StubRoutines::jint_disjoint_arraycopy();
1711 #else
1712     address nooverlap_target = aligned ?
1713       ((FunctionDescriptor*)StubRoutines::arrayof_jint_disjoint_arraycopy())->entry() :
1714       ((FunctionDescriptor*)StubRoutines::jint_disjoint_arraycopy())->entry();
1715 #endif
1716 
1717     array_overlap_test(nooverlap_target, 2);
1718 
1719     generate_conjoint_int_copy_core(aligned);
1720 
1721     __ blr();
1722 
1723     return start;
1724   }
1725 
1726   // Generate core code for disjoint long copy (and oop copy on
1727   // 64-bit).  If "aligned" is true, the "from" and "to" addresses
1728   // are assumed to be heapword aligned.
1729   //
1730   // Arguments:
1731   //      from:  R3_ARG1
1732   //      to:    R4_ARG2
1733   //      count: R5_ARG3 treated as signed
1734   //
1735   void generate_disjoint_long_copy_core(bool aligned) {
1736     Register tmp1 = R6_ARG4;
1737     Register tmp2 = R7_ARG5;
1738     Register tmp3 = R8_ARG6;
1739     Register tmp4 = R0;
1740 
1741     Label l_1, l_2, l_3, l_4;
1742 
1743     { // FasterArrayCopy
1744       __ cmpwi(CCR0, R5_ARG3, 3);
1745       __ ble(CCR0, l_3); // copy 1 at a time if less than 4 elements remain
1746 
1747       __ srdi(tmp1, R5_ARG3, 2);
1748       __ andi_(R5_ARG3, R5_ARG3, 3);
1749       __ mtctr(tmp1);
1750 
1751       __ bind(l_4);
1752       // Use unrolled version for mass copying (copy 4 elements at a time).
1753       // Load feeding store gets zero latency on Power6, however not on Power5.
1754       // Therefore, the following sequence is made for the good of both.
1755       __ ld(tmp1, 0, R3_ARG1);
1756       __ ld(tmp2, 8, R3_ARG1);
1757       __ ld(tmp3, 16, R3_ARG1);
1758       __ ld(tmp4, 24, R3_ARG1);
1759       __ std(tmp1, 0, R4_ARG2);
1760       __ std(tmp2, 8, R4_ARG2);
1761       __ std(tmp3, 16, R4_ARG2);
1762       __ std(tmp4, 24, R4_ARG2);
1763       __ addi(R3_ARG1, R3_ARG1, 32);
1764       __ addi(R4_ARG2, R4_ARG2, 32);
1765       __ bdnz(l_4);
1766     }
1767 
1768     // copy 1 element at a time
1769     __ bind(l_3);
1770     __ cmpwi(CCR0, R5_ARG3, 0);
1771     __ beq(CCR0, l_1);
1772 
1773     { // FasterArrayCopy
1774       __ mtctr(R5_ARG3);
1775       __ addi(R3_ARG1, R3_ARG1, -8);
1776       __ addi(R4_ARG2, R4_ARG2, -8);
1777 
1778       __ bind(l_2);
1779       __ ldu(R0, 8, R3_ARG1);
1780       __ stdu(R0, 8, R4_ARG2);
1781       __ bdnz(l_2);
1782 
1783     }
1784     __ bind(l_1);
1785   }
1786 
1787   // Generate stub for disjoint long copy.  If "aligned" is true, the
1788   // "from" and "to" addresses are assumed to be heapword aligned.
1789   //
1790   // Arguments for generated stub:
1791   //      from:  R3_ARG1
1792   //      to:    R4_ARG2
1793   //      count: R5_ARG3 treated as signed
1794   //
1795   address generate_disjoint_long_copy(bool aligned, const char * name) {
1796     StubCodeMark mark(this, "StubRoutines", name);
1797     address start = __ function_entry();
1798     generate_disjoint_long_copy_core(aligned);
1799     __ blr();
1800 
1801     return start;
1802   }
1803 
1804   // Generate core code for conjoint long copy (and oop copy on
1805   // 64-bit).  If "aligned" is true, the "from" and "to" addresses
1806   // are assumed to be heapword aligned.
1807   //
1808   // Arguments:
1809   //      from:  R3_ARG1
1810   //      to:    R4_ARG2
1811   //      count: R5_ARG3 treated as signed
1812   //
1813   void generate_conjoint_long_copy_core(bool aligned) {
1814     Register tmp1 = R6_ARG4;
1815     Register tmp2 = R7_ARG5;
1816     Register tmp3 = R8_ARG6;
1817     Register tmp4 = R0;
1818 
1819     Label l_1, l_2, l_3, l_4, l_5;
1820 
1821     __ cmpwi(CCR0, R5_ARG3, 0);
1822     __ beq(CCR0, l_1);
1823 
1824     { // FasterArrayCopy
1825       __ sldi(R5_ARG3, R5_ARG3, 3);
1826       __ add(R3_ARG1, R3_ARG1, R5_ARG3);
1827       __ add(R4_ARG2, R4_ARG2, R5_ARG3);
1828       __ srdi(R5_ARG3, R5_ARG3, 3);
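           // from and to now point just past the last element; the loops below copy
           // downwards.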
1829 
1830       __ cmpwi(CCR0, R5_ARG3, 3);
1831       __ ble(CCR0, l_5); // copy 1 at a time if less than 4 elements remain
1832 
1833       __ srdi(tmp1, R5_ARG3, 2);
1834       __ andi(R5_ARG3, R5_ARG3, 3);
1835       __ mtctr(tmp1);
1836 
1837       __ bind(l_4);
1838       // Use unrolled version for mass copying (copy 4 elements at a time).
1839       // Load feeding store gets zero latency on Power6, however not on Power5.
1840       // Therefore, the following sequence is made for the good of both.
1841       __ addi(R3_ARG1, R3_ARG1, -32);
1842       __ addi(R4_ARG2, R4_ARG2, -32);
1843       __ ld(tmp4, 24, R3_ARG1);
1844       __ ld(tmp3, 16, R3_ARG1);
1845       __ ld(tmp2, 8, R3_ARG1);
1846       __ ld(tmp1, 0, R3_ARG1);
1847       __ std(tmp4, 24, R4_ARG2);
1848       __ std(tmp3, 16, R4_ARG2);
1849       __ std(tmp2, 8, R4_ARG2);
1850       __ std(tmp1, 0, R4_ARG2);
1851       __ bdnz(l_4);
1852 
1853       __ cmpwi(CCR0, R5_ARG3, 0);
1854       __ beq(CCR0, l_1);
1855 
1856       __ bind(l_5);
1857       __ mtctr(R5_ARG3);
1858       __ bind(l_3);
1859       __ ld(R0, -8, R3_ARG1);
1860       __ std(R0, -8, R4_ARG2);
1861       __ addi(R3_ARG1, R3_ARG1, -8);
1862       __ addi(R4_ARG2, R4_ARG2, -8);
1863       __ bdnz(l_3);
1864 
1865     }
1866     __ bind(l_1);
1867   }
1868 
1869   // Generate stub for conjoint long copy.  If "aligned" is true, the
1870   // "from" and "to" addresses are assumed to be heapword aligned.
1871   //
1872   // Arguments for generated stub:
1873   //      from:  R3_ARG1
1874   //      to:    R4_ARG2
1875   //      count: R5_ARG3 treated as signed
1876   //
1877   address generate_conjoint_long_copy(bool aligned, const char * name) {
1878     StubCodeMark mark(this, "StubRoutines", name);
1879     address start = __ function_entry();
1880 
1881 #if defined(ABI_ELFv2)
1882     address nooverlap_target = aligned ?
1883       StubRoutines::arrayof_jlong_disjoint_arraycopy() :
1884       StubRoutines::jlong_disjoint_arraycopy();
1885 #else
1886     address nooverlap_target = aligned ?
1887       ((FunctionDescriptor*)StubRoutines::arrayof_jlong_disjoint_arraycopy())->entry() :
1888       ((FunctionDescriptor*)StubRoutines::jlong_disjoint_arraycopy())->entry();
1889 #endif
1890 
1891     array_overlap_test(nooverlap_target, 3);
1892     generate_conjoint_long_copy_core(aligned);
1893 
1894     __ blr();
1895 
1896     return start;
1897   }
1898 
1899   // Generate stub for conjoint oop copy.  If "aligned" is true, the
1900   // "from" and "to" addresses are assumed to be heapword aligned.
1901   //
1902   // Arguments for generated stub:
1903   //      from:  R3_ARG1
1904   //      to:    R4_ARG2
1905   //      count: R5_ARG3 treated as signed
1906   //      dest_uninitialized: G1 support
1907   //
1908   address generate_conjoint_oop_copy(bool aligned, const char * name, bool dest_uninitialized) {
1909     StubCodeMark mark(this, "StubRoutines", name);
1910 
1911     address start = __ function_entry();
1912 
1913 #if defined(ABI_ELFv2)
1914     address nooverlap_target = aligned ?
1915       StubRoutines::arrayof_oop_disjoint_arraycopy() :
1916       StubRoutines::oop_disjoint_arraycopy();
1917 #else
1918     address nooverlap_target = aligned ?
1919       ((FunctionDescriptor*)StubRoutines::arrayof_oop_disjoint_arraycopy())->entry() :
1920       ((FunctionDescriptor*)StubRoutines::oop_disjoint_arraycopy())->entry();
1921 #endif
1922 
1923     gen_write_ref_array_pre_barrier(R3_ARG1, R4_ARG2, R5_ARG3, dest_uninitialized, R9_ARG7);
1924 
1925     // Save arguments.
1926     __ mr(R9_ARG7, R4_ARG2);
1927     __ mr(R10_ARG8, R5_ARG3);
1928 
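         // With compressed oops each element is a 4 byte narrowOop, so the jint
         // copy core and overlap test are reused; otherwise the elements are
         // 8 byte oops and the jlong versions are used.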
1929     if (UseCompressedOops) {
1930       array_overlap_test(nooverlap_target, 2);
1931       generate_conjoint_int_copy_core(aligned);
1932     } else {
1933       array_overlap_test(nooverlap_target, 3);
1934       generate_conjoint_long_copy_core(aligned);
1935     }
1936 
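         // branchToEnd == false: the post barrier code ends with the return, so no
         // explicit blr is emitted in this stub.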
1937     gen_write_ref_array_post_barrier(R9_ARG7, R10_ARG8, R11_scratch1, /*branchToEnd*/ false);
1938     return start;
1939   }
1940 
1941   // Generate stub for disjoint oop copy.  If "aligned" is true, the
1942   // "from" and "to" addresses are assumed to be heapword aligned.
1943   //
1944   // Arguments for generated stub:
1945   //      from:  R3_ARG1
1946   //      to:    R4_ARG2
1947   //      count: R5_ARG3 treated as signed
1948   //      dest_uninitialized: G1 support
1949   //
1950   address generate_disjoint_oop_copy(bool aligned, const char * name, bool dest_uninitialized) {
1951     StubCodeMark mark(this, "StubRoutines", name);
1952     address start = __ function_entry();
1953 
1954     gen_write_ref_array_pre_barrier(R3_ARG1, R4_ARG2, R5_ARG3, dest_uninitialized, R9_ARG7);
1955 
1956     // Save some arguments; the copy core destroys them, but they are
1957     // needed for the post barrier.
1958     __ mr(R9_ARG7, R4_ARG2);
1959     __ mr(R10_ARG8, R5_ARG3);
1960 
1961     if (UseCompressedOops) {
1962       generate_disjoint_int_copy_core(aligned);
1963     } else {
1964       generate_disjoint_long_copy_core(aligned);
1965     }
1966 
1967     gen_write_ref_array_post_barrier(R9_ARG7, R10_ARG8, R11_scratch1, /*branchToEnd*/ false);
1968 
1969     return start;
1970   }
1971 
1972   void generate_arraycopy_stubs() {
1973     // Note: the disjoint stubs must be generated first, some of
1974     // the conjoint stubs use them.
1975 
1976     // non-aligned disjoint versions
1977     StubRoutines::_jbyte_disjoint_arraycopy       = generate_disjoint_byte_copy(false, "jbyte_disjoint_arraycopy");
1978     StubRoutines::_jshort_disjoint_arraycopy      = generate_disjoint_short_copy(false, "jshort_disjoint_arraycopy");
1979     StubRoutines::_jint_disjoint_arraycopy        = generate_disjoint_int_copy(false, "jint_disjoint_arraycopy");
1980     StubRoutines::_jlong_disjoint_arraycopy       = generate_disjoint_long_copy(false, "jlong_disjoint_arraycopy");
1981     StubRoutines::_oop_disjoint_arraycopy         = generate_disjoint_oop_copy(false, "oop_disjoint_arraycopy", false);
1982     StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_oop_copy(false, "oop_disjoint_arraycopy_uninit", true);
1983 
1984     // aligned disjoint versions
1985     StubRoutines::_arrayof_jbyte_disjoint_arraycopy      = generate_disjoint_byte_copy(true, "arrayof_jbyte_disjoint_arraycopy");
1986     StubRoutines::_arrayof_jshort_disjoint_arraycopy     = generate_disjoint_short_copy(true, "arrayof_jshort_disjoint_arraycopy");
1987     StubRoutines::_arrayof_jint_disjoint_arraycopy       = generate_disjoint_int_copy(true, "arrayof_jint_disjoint_arraycopy");
1988     StubRoutines::_arrayof_jlong_disjoint_arraycopy      = generate_disjoint_long_copy(true, "arrayof_jlong_disjoint_arraycopy");
1989     StubRoutines::_arrayof_oop_disjoint_arraycopy        = generate_disjoint_oop_copy(true, "arrayof_oop_disjoint_arraycopy", false);
1990     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(true, "arrayof_oop_disjoint_arraycopy_uninit", true);
1991 
1992     // non-aligned conjoint versions
1993     StubRoutines::_jbyte_arraycopy      = generate_conjoint_byte_copy(false, "jbyte_arraycopy");
1994     StubRoutines::_jshort_arraycopy     = generate_conjoint_short_copy(false, "jshort_arraycopy");
1995     StubRoutines::_jint_arraycopy       = generate_conjoint_int_copy(false, "jint_arraycopy");
1996     StubRoutines::_jlong_arraycopy      = generate_conjoint_long_copy(false, "jlong_arraycopy");
1997     StubRoutines::_oop_arraycopy        = generate_conjoint_oop_copy(false, "oop_arraycopy", false);
1998     StubRoutines::_oop_arraycopy_uninit = generate_conjoint_oop_copy(false, "oop_arraycopy_uninit", true);
1999 
2000     // aligned conjoint versions
2001     StubRoutines::_arrayof_jbyte_arraycopy      = generate_conjoint_byte_copy(true, "arrayof_jbyte_arraycopy");
2002     StubRoutines::_arrayof_jshort_arraycopy     = generate_conjoint_short_copy(true, "arrayof_jshort_arraycopy");
2003     StubRoutines::_arrayof_jint_arraycopy       = generate_conjoint_int_copy(true, "arrayof_jint_arraycopy");
2004     StubRoutines::_arrayof_jlong_arraycopy      = generate_conjoint_long_copy(true, "arrayof_jlong_arraycopy");
2005     StubRoutines::_arrayof_oop_arraycopy        = generate_conjoint_oop_copy(true, "arrayof_oop_arraycopy", false);
2006     StubRoutines::_arrayof_oop_arraycopy_uninit = generate_conjoint_oop_copy(true, "arrayof_oop_arraycopy_uninit", true);
2007 
2008     // fill routines
2009     StubRoutines::_jbyte_fill          = generate_fill(T_BYTE,  false, "jbyte_fill");
2010     StubRoutines::_jshort_fill         = generate_fill(T_SHORT, false, "jshort_fill");
2011     StubRoutines::_jint_fill           = generate_fill(T_INT,   false, "jint_fill");
2012     StubRoutines::_arrayof_jbyte_fill  = generate_fill(T_BYTE,  true, "arrayof_jbyte_fill");
2013     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
2014     StubRoutines::_arrayof_jint_fill   = generate_fill(T_INT,   true, "arrayof_jint_fill");
2015   }
2016 
2017   // Safefetch stubs.
2018   void generate_safefetch(const char* name, int size, address* entry, address* fault_pc, address* continuation_pc) {
2019     // safefetch signatures:
2020     //   int      SafeFetch32(int*      adr, int      errValue);
2021     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
2022     //
2023     // arguments:
2024     //   R3_ARG1 = adr
2025     //   R4_ARG2 = errValue
2026     //
2027     // result:
2028     //   R3_RET  = *adr or errValue
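         //
         // If the load at *fault_pc faults, the VM's signal handler resumes
         // execution at *continuation_pc with errValue still in R4_ARG2, so the
         // error value is what gets returned.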
2029 
2030     StubCodeMark mark(this, "StubRoutines", name);
2031 
2032     // Entry point, pc or function descriptor.
2033     *entry = __ function_entry();
2034 
2035     // Load *adr into R4_ARG2, may fault.
2036     *fault_pc = __ pc();
2037     switch (size) {
2038       case 4:
2039         // int32_t, sign-extended
2040         __ lwa(R4_ARG2, 0, R3_ARG1);
2041         break;
2042       case 8:
2043         // int64_t
2044         __ ld(R4_ARG2, 0, R3_ARG1);
2045         break;
2046       default:
2047         ShouldNotReachHere();
2048     }
2049 
2050     // return errValue or *adr
2051     *continuation_pc = __ pc();
2052     __ mr(R3_RET, R4_ARG2);
2053     __ blr();
2054   }
2055 
2056   // Stub for BigInteger::multiplyToLen()
2057   //
2058   //  Arguments:
2059   //
2060   //  Input:
2061   //    R3 - x address
2062   //    R4 - x length
2063   //    R5 - y address
2064   //    R6 - y length
2065   //    R7 - z address
2066   //    R8 - z length
2067   //
2068   address generate_multiplyToLen() {
2069 
2070     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
2071 
2072     address start = __ function_entry();
2073 
2074     const Register x     = R3;
2075     const Register xlen  = R4;
2076     const Register y     = R5;
2077     const Register ylen  = R6;
2078     const Register z     = R7;
2079     const Register zlen  = R8;
2080 
2081     const Register tmp1  = R2; // TOC not used.
2082     const Register tmp2  = R9;
2083     const Register tmp3  = R10;
2084     const Register tmp4  = R11;
2085     const Register tmp5  = R12;
2086 
2087     // non-volatile regs
2088     const Register tmp6  = R31;
2089     const Register tmp7  = R30;
2090     const Register tmp8  = R29;
2091     const Register tmp9  = R28;
2092     const Register tmp10 = R27;
2093     const Register tmp11 = R26;
2094     const Register tmp12 = R25;
2095     const Register tmp13 = R24;
2096 
2097     BLOCK_COMMENT("Entry:");
2098 
2099     // Save non-volatile regs (frameless).
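         // No frame is pushed: the stores land in the area below the SP that the
         // PPC64 ABIs leave usable by leaf code (the "red zone"), and
         // multiply_to_len itself is not expected to call out.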
2100     int current_offs = 8;
2101     __ std(R24, -current_offs, R1_SP); current_offs += 8;
2102     __ std(R25, -current_offs, R1_SP); current_offs += 8;
2103     __ std(R26, -current_offs, R1_SP); current_offs += 8;
2104     __ std(R27, -current_offs, R1_SP); current_offs += 8;
2105     __ std(R28, -current_offs, R1_SP); current_offs += 8;
2106     __ std(R29, -current_offs, R1_SP); current_offs += 8;
2107     __ std(R30, -current_offs, R1_SP); current_offs += 8;
2108     __ std(R31, -current_offs, R1_SP);
2109 
2110     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5,
2111                        tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13);
2112 
2113     // Restore non-volatile regs.
2114     current_offs = 8;
2115     __ ld(R24, -current_offs, R1_SP); current_offs += 8;
2116     __ ld(R25, -current_offs, R1_SP); current_offs += 8;
2117     __ ld(R26, -current_offs, R1_SP); current_offs += 8;
2118     __ ld(R27, -current_offs, R1_SP); current_offs += 8;
2119     __ ld(R28, -current_offs, R1_SP); current_offs += 8;
2120     __ ld(R29, -current_offs, R1_SP); current_offs += 8;
2121     __ ld(R30, -current_offs, R1_SP); current_offs += 8;
2122     __ ld(R31, -current_offs, R1_SP);
2123 
2124     __ blr();  // Return to caller.
2125 
2126     return start;
2127   }
2128 
2129   // Initialization
2130   void generate_initial() {
2131     // Generates the initial stubs and initializes their entry points
2132 
2133     // Entry points that exist in all platforms.
2134     // Note: This is code that could be shared among different platforms; however,
2135     // the benefit seems to be smaller than the disadvantage of having a much more
2136     // complicated generator structure. See also the comment in
2137     // stubRoutines.hpp.
2138 
2139     StubRoutines::_forward_exception_entry          = generate_forward_exception();
2140     StubRoutines::_call_stub_entry                  = generate_call_stub(StubRoutines::_call_stub_return_address);
2141     StubRoutines::_catch_exception_entry            = generate_catch_exception();
2142 
2143     // Build this early so it's available for the interpreter.
2144     StubRoutines::_throw_StackOverflowError_entry   =
2145       generate_throw_exception("StackOverflowError throw_exception",
2146                                CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError), false);
2147   }
2148 
2149   void generate_all() {
2150     // Generates all stubs and initializes the entry points
2151 
2152     // These entry points require SharedInfo::stack0 to be set up in
2153     // non-core builds
2154     StubRoutines::_throw_AbstractMethodError_entry         = generate_throw_exception("AbstractMethodError throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError),  false);
2155     // Handle IncompatibleClassChangeError in itable stubs.
2156     StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError),  false);
2157     StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call), false);
2158 
2159     StubRoutines::_handler_for_unsafe_access_entry         = generate_handler_for_unsafe_access();
2160 
2161     // support for verify_oop (must happen after universe_init)
2162     StubRoutines::_verify_oop_subroutine_entry             = generate_verify_oop();
2163 
2164     // arraycopy stubs used by compilers
2165     generate_arraycopy_stubs();
2166 
2167     if (UseAESIntrinsics) {
2168       guarantee(!UseAESIntrinsics, "not yet implemented.");
2169     }
2170 
2171     // Safefetch stubs.
2172     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
2173                                                        &StubRoutines::_safefetch32_fault_pc,
2174                                                        &StubRoutines::_safefetch32_continuation_pc);
2175     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
2176                                                        &StubRoutines::_safefetchN_fault_pc,
2177                                                        &StubRoutines::_safefetchN_continuation_pc);
2178 
2179 #ifdef COMPILER2
2180     if (UseMultiplyToLenIntrinsic) {
2181       StubRoutines::_multiplyToLen = generate_multiplyToLen();
2182     }
2183 #endif
2184   }
2185 
2186  public:
2187   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
2188     // replace the standard masm with a special one:
2189     _masm = new MacroAssembler(code);
2190     if (all) {
2191       generate_all();
2192     } else {
2193       generate_initial();
2194     }
2195   }
2196 };
2197 
2198 void StubGenerator_generate(CodeBuffer* code, bool all) {
2199   StubGenerator g(code, all);
2200 }