1 /*
   2  * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
   3  * Copyright (c) 2012, 2019, SAP SE. All rights reserved.
   4  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   5  *
   6  * This code is free software; you can redistribute it and/or modify it
   7  * under the terms of the GNU General Public License version 2 only, as
   8  * published by the Free Software Foundation.
   9  *
  10  * This code is distributed in the hope that it will be useful, but WITHOUT
  11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  13  * version 2 for more details (a copy is included in the LICENSE file that
  14  * accompanied this code).
  15  *
  16  * You should have received a copy of the GNU General Public License version
  17  * 2 along with this work; if not, write to the Free Software Foundation,
  18  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  19  *
  20  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  21  * or visit www.oracle.com if you need additional information or have any
  22  * questions.
  23  *
  24  */
  25 
  26 #include "precompiled.hpp"
  27 #include "asm/macroAssembler.inline.hpp"
  28 #include "gc/shared/barrierSet.hpp"
  29 #include "gc/shared/barrierSetAssembler.hpp"
  30 #include "interpreter/interpreter.hpp"
  31 #include "nativeInst_ppc.hpp"
  32 #include "oops/instanceOop.hpp"
  33 #include "oops/method.hpp"
  34 #include "oops/objArrayKlass.hpp"
  35 #include "oops/oop.inline.hpp"
  36 #include "prims/methodHandles.hpp"
  37 #include "runtime/frame.inline.hpp"
  38 #include "runtime/handles.inline.hpp"
  39 #include "runtime/sharedRuntime.hpp"
  40 #include "runtime/stubCodeGenerator.hpp"
  41 #include "runtime/stubRoutines.hpp"
  42 #include "runtime/thread.inline.hpp"
  43 #include "utilities/align.hpp"
  44 #include "utilities/powerOfTwo.hpp"
  45 
  46 // Declaration and definition of StubGenerator (no .hpp file).
  47 // For a more detailed description of the stub routine structure
  48 // see the comment in stubRoutines.hpp.
  49 
  50 #define __ _masm->
  51 
  52 #ifdef PRODUCT
  53 #define BLOCK_COMMENT(str) // nothing
  54 #else
  55 #define BLOCK_COMMENT(str) __ block_comment(str)
  56 #endif
  57 
  58 #if defined(ABI_ELFv2)
  59 #define STUB_ENTRY(name) StubRoutines::name()
  60 #else
  61 #define STUB_ENTRY(name) ((FunctionDescriptor*)StubRoutines::name())->entry()
  62 #endif
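// Note: on ABIs that use function descriptors (e.g. 64-bit big-endian ELFv1), a
// generated stub address is really a FunctionDescriptor*, so STUB_ENTRY dereferences
// it to obtain the code entry point; on ELFv2 the stub address is the entry point itself.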
  63 
  64 class StubGenerator: public StubCodeGenerator {
  65  private:
  66 
  67   // Call stubs are used to call Java from C
  68   //
  69   // Arguments:
  70   //
  71   //   R3  - call wrapper address     : address
  72   //   R4  - result                   : intptr_t*
  73   //   R5  - result type              : BasicType
  74   //   R6  - method                   : Method
  75   //   R7  - frame mgr entry point    : address
  76   //   R8  - parameter block          : intptr_t*
  77   //   R9  - parameter count in words : int
  78   //   R10 - thread                   : Thread*
  79   //
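  // For orientation only: the C++ side (JavaCalls::call_helper) invokes this stub
  // roughly as
  //
  //   StubRoutines::call_stub()(call_wrapper, result, result_type, method,
  //                             entry_point, parameters, parameter_words, thread);
  //
  // matching the CallStub typedef in stubRoutines.hpp. This is a sketch of the caller,
  // not part of the stub itself.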
  80   address generate_call_stub(address& return_address) {
    // Set up a new C frame, copy the Java arguments, call the frame manager
    // or native_entry, and process the result.
  83 
  84     StubCodeMark mark(this, "StubRoutines", "call_stub");
  85 
  86     address start = __ function_entry();
  87 
  88     // some sanity checks
  89     assert((sizeof(frame::abi_minframe) % 16) == 0,           "unaligned");
  90     assert((sizeof(frame::abi_reg_args) % 16) == 0,           "unaligned");
  91     assert((sizeof(frame::spill_nonvolatiles) % 16) == 0,     "unaligned");
  92     assert((sizeof(frame::parent_ijava_frame_abi) % 16) == 0, "unaligned");
  93     assert((sizeof(frame::entry_frame_locals) % 16) == 0,     "unaligned");
  94 
  95     Register r_arg_call_wrapper_addr        = R3;
  96     Register r_arg_result_addr              = R4;
  97     Register r_arg_result_type              = R5;
  98     Register r_arg_method                   = R6;
  99     Register r_arg_entry                    = R7;
 100     Register r_arg_thread                   = R10;
 101 
 102     Register r_temp                         = R24;
 103     Register r_top_of_arguments_addr        = R25;
 104     Register r_entryframe_fp                = R26;
 105 
 106     {
 107       // Stack on entry to call_stub:
 108       //
 109       //      F1      [C_FRAME]
 110       //              ...
 111 
 112       Register r_arg_argument_addr          = R8;
 113       Register r_arg_argument_count         = R9;
 114       Register r_frame_alignment_in_bytes   = R27;
 115       Register r_argument_addr              = R28;
 116       Register r_argumentcopy_addr          = R29;
 117       Register r_argument_size_in_bytes     = R30;
 118       Register r_frame_size                 = R23;
 119 
 120       Label arguments_copied;
 121 
 122       // Save LR/CR to caller's C_FRAME.
 123       __ save_LR_CR(R0);
 124 
 125       // Zero extend arg_argument_count.
 126       __ clrldi(r_arg_argument_count, r_arg_argument_count, 32);
 127 
      // Save non-volatile GPRs to ENTRY_FRAME (not yet pushed, but it's safe).
 129       __ save_nonvolatile_gprs(R1_SP, _spill_nonvolatiles_neg(r14));
 130 
 131       // Keep copy of our frame pointer (caller's SP).
 132       __ mr(r_entryframe_fp, R1_SP);
 133 
 134       BLOCK_COMMENT("Push ENTRY_FRAME including arguments");
 135       // Push ENTRY_FRAME including arguments:
 136       //
 137       //      F0      [TOP_IJAVA_FRAME_ABI]
 138       //              alignment (optional)
 139       //              [outgoing Java arguments]
 140       //              [ENTRY_FRAME_LOCALS]
 141       //      F1      [C_FRAME]
 142       //              ...
 143 
 144       // calculate frame size
 145 
 146       // unaligned size of arguments
 147       __ sldi(r_argument_size_in_bytes,
 148                   r_arg_argument_count, Interpreter::logStackElementSize);
 149       // arguments alignment (max 1 slot)
 150       // FIXME: use round_to() here
 151       __ andi_(r_frame_alignment_in_bytes, r_arg_argument_count, 1);
 152       __ sldi(r_frame_alignment_in_bytes,
 153               r_frame_alignment_in_bytes, Interpreter::logStackElementSize);
 154 
 155       // size = unaligned size of arguments + top abi's size
 156       __ addi(r_frame_size, r_argument_size_in_bytes,
 157               frame::top_ijava_frame_abi_size);
 158       // size += arguments alignment
 159       __ add(r_frame_size,
 160              r_frame_size, r_frame_alignment_in_bytes);
 161       // size += size of call_stub locals
 162       __ addi(r_frame_size,
 163               r_frame_size, frame::entry_frame_locals_size);
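      // Worked example (numbers are illustrative only): with 5 argument words and
      // 8-byte stack elements, argument_size = 40 and alignment = 8 (odd count), so
      // frame_size = 40 + 8 + top_ijava_frame_abi_size + entry_frame_locals_size;
      // the odd-count padding keeps the Java argument area a multiple of 16 bytes.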
 164 
 165       // push ENTRY_FRAME
 166       __ push_frame(r_frame_size, r_temp);
 167 
 168       // initialize call_stub locals (step 1)
 169       __ std(r_arg_call_wrapper_addr,
 170              _entry_frame_locals_neg(call_wrapper_address), r_entryframe_fp);
 171       __ std(r_arg_result_addr,
 172              _entry_frame_locals_neg(result_address), r_entryframe_fp);
 173       __ std(r_arg_result_type,
 174              _entry_frame_locals_neg(result_type), r_entryframe_fp);
 175       // we will save arguments_tos_address later
 176 
 177 
 178       BLOCK_COMMENT("Copy Java arguments");
 179       // copy Java arguments
 180 
 181       // Calculate top_of_arguments_addr which will be R17_tos (not prepushed) later.
 182       // FIXME: why not simply use SP+frame::top_ijava_frame_size?
 183       __ addi(r_top_of_arguments_addr,
 184               R1_SP, frame::top_ijava_frame_abi_size);
 185       __ add(r_top_of_arguments_addr,
 186              r_top_of_arguments_addr, r_frame_alignment_in_bytes);
 187 
 188       // any arguments to copy?
 189       __ cmpdi(CCR0, r_arg_argument_count, 0);
 190       __ beq(CCR0, arguments_copied);
 191 
 192       // prepare loop and copy arguments in reverse order
 193       {
 194         // init CTR with arg_argument_count
 195         __ mtctr(r_arg_argument_count);
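        // CTR now holds the element count; the bdnz below decrements CTR and
        // branches while it is non-zero (the usual PPC counted-loop idiom).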
 196 
        // let r_argumentcopy_addr point to the last outgoing Java argument
 198         __ mr(r_argumentcopy_addr, r_top_of_arguments_addr);
 199 
 200         // let r_argument_addr point to last incoming java argument
 201         __ add(r_argument_addr,
 202                    r_arg_argument_addr, r_argument_size_in_bytes);
 203         __ addi(r_argument_addr, r_argument_addr, -BytesPerWord);
 204 
 205         // now loop while CTR > 0 and copy arguments
 206         {
 207           Label next_argument;
 208           __ bind(next_argument);
 209 
 210           __ ld(r_temp, 0, r_argument_addr);
 211           // argument_addr--;
 212           __ addi(r_argument_addr, r_argument_addr, -BytesPerWord);
 213           __ std(r_temp, 0, r_argumentcopy_addr);
 214           // argumentcopy_addr++;
 215           __ addi(r_argumentcopy_addr, r_argumentcopy_addr, BytesPerWord);
 216 
 217           __ bdnz(next_argument);
 218         }
 219       }
 220 
 221       // Arguments copied, continue.
 222       __ bind(arguments_copied);
 223     }
 224 
 225     {
 226       BLOCK_COMMENT("Call frame manager or native entry.");
 227       // Call frame manager or native entry.
 228       Register r_new_arg_entry = R14;
 229       assert_different_registers(r_new_arg_entry, r_top_of_arguments_addr,
 230                                  r_arg_method, r_arg_thread);
 231 
 232       __ mr(r_new_arg_entry, r_arg_entry);
 233 
 234       // Register state on entry to frame manager / native entry:
 235       //
 236       //   tos         -  intptr_t*    sender tos (prepushed) Lesp = (SP) + copied_arguments_offset - 8
 237       //   R19_method  -  Method
 238       //   R16_thread  -  JavaThread*
 239 
 240       // Tos must point to last argument - element_size.
 241       const Register tos = R15_esp;
 242 
 243       __ addi(tos, r_top_of_arguments_addr, -Interpreter::stackElementSize);
 244 
 245       // initialize call_stub locals (step 2)
 246       // now save tos as arguments_tos_address
 247       __ std(tos, _entry_frame_locals_neg(arguments_tos_address), r_entryframe_fp);
 248 
 249       // load argument registers for call
 250       __ mr(R19_method, r_arg_method);
 251       __ mr(R16_thread, r_arg_thread);
 252       assert(tos != r_arg_method, "trashed r_arg_method");
 253       assert(tos != r_arg_thread && R19_method != r_arg_thread, "trashed r_arg_thread");
 254 
      // Preload the template interpreter's dispatch table base into R25_templateTableBase for the callee.
 256       __ load_const_optimized(R25_templateTableBase, (address)Interpreter::dispatch_table((TosState)0), R11_scratch1);
 257       // Stack on entry to frame manager / native entry:
 258       //
 259       //      F0      [TOP_IJAVA_FRAME_ABI]
 260       //              alignment (optional)
 261       //              [outgoing Java arguments]
 262       //              [ENTRY_FRAME_LOCALS]
 263       //      F1      [C_FRAME]
 264       //              ...
 265       //
 266 
 267       // global toc register
 268       __ load_const_optimized(R29_TOC, MacroAssembler::global_toc(), R11_scratch1);
      // Remember the senderSP so the interpreter can pop c2i arguments off of
      // the stack when called via a c2i.
 271 
 272       // Pass initial_caller_sp to framemanager.
 273       __ mr(R21_sender_SP, R1_SP);
 274 
      // Do a light-weight C-call here. r_new_arg_entry holds the address of the
      // interpreter entry point (frame manager or native entry); save the runtime
      // value of LR in return_address.
 278       assert(r_new_arg_entry != tos && r_new_arg_entry != R19_method && r_new_arg_entry != R16_thread,
 279              "trashed r_new_arg_entry");
 280       return_address = __ call_stub(r_new_arg_entry);
 281     }
 282 
 283     {
 284       BLOCK_COMMENT("Returned from frame manager or native entry.");
 285       // Returned from frame manager or native entry.
 286       // Now pop frame, process result, and return to caller.
 287 
 288       // Stack on exit from frame manager / native entry:
 289       //
 290       //      F0      [ABI]
 291       //              ...
 292       //              [ENTRY_FRAME_LOCALS]
 293       //      F1      [C_FRAME]
 294       //              ...
 295       //
 296       // Just pop the topmost frame ...
 297       //
 298 
 299       Label ret_is_object;
 300       Label ret_is_long;
 301       Label ret_is_float;
 302       Label ret_is_double;
 303 
 304       Register r_entryframe_fp = R30;
 305       Register r_lr            = R7_ARG5;
 306       Register r_cr            = R8_ARG6;
 307 
 308       // Reload some volatile registers which we've spilled before the call
 309       // to frame manager / native entry.
 310       // Access all locals via frame pointer, because we know nothing about
 311       // the topmost frame's size.
 312       __ ld(r_entryframe_fp, _abi(callers_sp), R1_SP);
 313       assert_different_registers(r_entryframe_fp, R3_RET, r_arg_result_addr, r_arg_result_type, r_cr, r_lr);
 314       __ ld(r_arg_result_addr,
 315             _entry_frame_locals_neg(result_address), r_entryframe_fp);
 316       __ ld(r_arg_result_type,
 317             _entry_frame_locals_neg(result_type), r_entryframe_fp);
 318       __ ld(r_cr, _abi(cr), r_entryframe_fp);
 319       __ ld(r_lr, _abi(lr), r_entryframe_fp);
 320 
 321       // pop frame and restore non-volatiles, LR and CR
 322       __ mr(R1_SP, r_entryframe_fp);
 323       __ mtcr(r_cr);
 324       __ mtlr(r_lr);
 325 
 326       // Store result depending on type. Everything that is not
 327       // T_OBJECT, T_LONG, T_FLOAT, or T_DOUBLE is treated as T_INT.
 328       __ cmpwi(CCR0, r_arg_result_type, T_OBJECT);
 329       __ cmpwi(CCR1, r_arg_result_type, T_LONG);
 330       __ cmpwi(CCR5, r_arg_result_type, T_FLOAT);
 331       __ cmpwi(CCR6, r_arg_result_type, T_DOUBLE);
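      // The result type is tested once into four distinct condition-register fields,
      // so after the non-volatile registers are restored below the code only needs
      // conditional branches to dispatch on the type.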
 332 
 333       // restore non-volatile registers
 334       __ restore_nonvolatile_gprs(R1_SP, _spill_nonvolatiles_neg(r14));
 335 
 336 
 337       // Stack on exit from call_stub:
 338       //
 339       //      0       [C_FRAME]
 340       //              ...
 341       //
 342       //  no call_stub frames left.
 343 
 344       // All non-volatiles have been restored at this point!!
 345       assert(R3_RET == R3, "R3_RET should be R3");
 346 
 347       __ beq(CCR0, ret_is_object);
 348       __ beq(CCR1, ret_is_long);
 349       __ beq(CCR5, ret_is_float);
 350       __ beq(CCR6, ret_is_double);
 351 
 352       // default:
 353       __ stw(R3_RET, 0, r_arg_result_addr);
 354       __ blr(); // return to caller
 355 
 356       // case T_OBJECT:
 357       __ bind(ret_is_object);
 358       __ std(R3_RET, 0, r_arg_result_addr);
 359       __ blr(); // return to caller
 360 
 361       // case T_LONG:
 362       __ bind(ret_is_long);
 363       __ std(R3_RET, 0, r_arg_result_addr);
 364       __ blr(); // return to caller
 365 
 366       // case T_FLOAT:
 367       __ bind(ret_is_float);
 368       __ stfs(F1_RET, 0, r_arg_result_addr);
 369       __ blr(); // return to caller
 370 
 371       // case T_DOUBLE:
 372       __ bind(ret_is_double);
 373       __ stfd(F1_RET, 0, r_arg_result_addr);
 374       __ blr(); // return to caller
 375     }
 376 
 377     return start;
 378   }
 379 
 380   // Return point for a Java call if there's an exception thrown in
 381   // Java code.  The exception is caught and transformed into a
 382   // pending exception stored in JavaThread that can be tested from
 383   // within the VM.
 384   //
 385   address generate_catch_exception() {
 386     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 387 
 388     address start = __ pc();
 389 
 390     // Registers alive
 391     //
 392     //  R16_thread
 393     //  R3_ARG1 - address of pending exception
 394     //  R4_ARG2 - return address in call stub
 395 
 396     const Register exception_file = R21_tmp1;
 397     const Register exception_line = R22_tmp2;
 398 
 399     __ load_const(exception_file, (void*)__FILE__);
 400     __ load_const(exception_line, (void*)__LINE__);
 401 
 402     __ std(R3_ARG1, in_bytes(JavaThread::pending_exception_offset()), R16_thread);
 403     // store into `char *'
 404     __ std(exception_file, in_bytes(JavaThread::exception_file_offset()), R16_thread);
 405     // store into `int'
 406     __ stw(exception_line, in_bytes(JavaThread::exception_line_offset()), R16_thread);
 407 
 408     // complete return to VM
 409     assert(StubRoutines::_call_stub_return_address != NULL, "must have been generated before");
 410 
 411     __ mtlr(R4_ARG2);
 412     // continue in call stub
 413     __ blr();
 414 
 415     return start;
 416   }
 417 
 418   // Continuation point for runtime calls returning with a pending
 419   // exception.  The pending exception check happened in the runtime
 420   // or native call stub.  The pending exception in Thread is
 421   // converted into a Java-level exception.
 422   //
 423   // Read:
 424   //
 425   //   LR:     The pc the runtime library callee wants to return to.
 426   //           Since the exception occurred in the callee, the return pc
 427   //           from the point of view of Java is the exception pc.
 428   //   thread: Needed for method handles.
 429   //
 430   // Invalidate:
 431   //
 432   //   volatile registers (except below).
 433   //
 434   // Update:
 435   //
 436   //   R4_ARG2: exception
 437   //
 438   // (LR is unchanged and is live out).
 439   //
 440   address generate_forward_exception() {
 441     StubCodeMark mark(this, "StubRoutines", "forward_exception");
 442     address start = __ pc();
 443 
 444     if (VerifyOops) {
 445       // Get pending exception oop.
 446       __ ld(R3_ARG1,
 447                 in_bytes(Thread::pending_exception_offset()),
 448                 R16_thread);
 449       // Make sure that this code is only executed if there is a pending exception.
 450       {
 451         Label L;
 452         __ cmpdi(CCR0, R3_ARG1, 0);
 453         __ bne(CCR0, L);
 454         __ stop("StubRoutines::forward exception: no pending exception (1)");
 455         __ bind(L);
 456       }
 457       __ verify_oop(R3_ARG1, "StubRoutines::forward exception: not an oop");
 458     }
 459 
 460     // Save LR/CR and copy exception pc (LR) into R4_ARG2.
 461     __ save_LR_CR(R4_ARG2);
 462     __ push_frame_reg_args(0, R0);
 463     // Find exception handler.
 464     __ call_VM_leaf(CAST_FROM_FN_PTR(address,
 465                      SharedRuntime::exception_handler_for_return_address),
 466                     R16_thread,
 467                     R4_ARG2);
 468     // Copy handler's address.
 469     __ mtctr(R3_RET);
 470     __ pop_frame();
 471     __ restore_LR_CR(R0);
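    // The handler address stays in CTR: popping the frame and restoring LR/CR above
    // does not clobber the count register, so the bctr below can use it directly.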
 472 
 473     // Set up the arguments for the exception handler:
 474     //  - R3_ARG1: exception oop
 475     //  - R4_ARG2: exception pc.
 476 
 477     // Load pending exception oop.
 478     __ ld(R3_ARG1,
 479               in_bytes(Thread::pending_exception_offset()),
 480               R16_thread);
 481 
 482     // The exception pc is the return address in the caller.
 483     // Must load it into R4_ARG2.
 484     __ mflr(R4_ARG2);
 485 
 486 #ifdef ASSERT
 487     // Make sure exception is set.
 488     {
 489       Label L;
 490       __ cmpdi(CCR0, R3_ARG1, 0);
 491       __ bne(CCR0, L);
 492       __ stop("StubRoutines::forward exception: no pending exception (2)");
 493       __ bind(L);
 494     }
 495 #endif
 496 
 497     // Clear the pending exception.
 498     __ li(R0, 0);
 499     __ std(R0,
 500                in_bytes(Thread::pending_exception_offset()),
 501                R16_thread);
 502     // Jump to exception handler.
 503     __ bctr();
 504 
 505     return start;
 506   }
 507 
 508 #undef __
 509 #define __ masm->
 510   // Continuation point for throwing of implicit exceptions that are
 511   // not handled in the current activation. Fabricates an exception
 512   // oop and initiates normal exception dispatching in this
 513   // frame. Only callee-saved registers are preserved (through the
 514   // normal register window / RegisterMap handling).  If the compiler
 515   // needs all registers to be preserved between the fault point and
 516   // the exception handler then it must assume responsibility for that
 517   // in AbstractCompiler::continuation_for_implicit_null_exception or
 518   // continuation_for_implicit_division_by_zero_exception. All other
 519   // implicit exceptions (e.g., NullPointerException or
 520   // AbstractMethodError on entry) are either at call sites or
 521   // otherwise assume that stack unwinding will be initiated, so
 522   // caller saved registers were assumed volatile in the compiler.
 523   //
 524   // Note that we generate only this stub into a RuntimeStub, because
 525   // it needs to be properly traversed and ignored during GC, so we
 526   // change the meaning of the "__" macro within this method.
 527   //
 528   // Note: the routine set_pc_not_at_call_for_caller in
 529   // SharedRuntime.cpp requires that this code be generated into a
 530   // RuntimeStub.
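  // Typical use (in the stub initialization later in this file) is along the lines of:
  //
  //   StubRoutines::_throw_StackOverflowError_entry =
  //     generate_throw_exception("StackOverflowError throw_exception",
  //                              CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError),
  //                              false);
  //
  // (illustrative sketch only; see the actual call sites for the exact arguments).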
 531   address generate_throw_exception(const char* name, address runtime_entry, bool restore_saved_exception_pc,
 532                                    Register arg1 = noreg, Register arg2 = noreg) {
 533     CodeBuffer code(name, 1024 DEBUG_ONLY(+ 512), 0);
 534     MacroAssembler* masm = new MacroAssembler(&code);
 535 
 536     OopMapSet* oop_maps  = new OopMapSet();
 537     int frame_size_in_bytes = frame::abi_reg_args_size;
 538     OopMap* map = new OopMap(frame_size_in_bytes / sizeof(jint), 0);
 539 
 540     address start = __ pc();
 541 
 542     __ save_LR_CR(R11_scratch1);
 543 
 544     // Push a frame.
 545     __ push_frame_reg_args(0, R11_scratch1);
 546 
 547     address frame_complete_pc = __ pc();
 548 
 549     if (restore_saved_exception_pc) {
 550       __ unimplemented("StubGenerator::throw_exception with restore_saved_exception_pc");
 551     }
 552 
 553     // Note that we always have a runtime stub frame on the top of
 554     // stack by this point. Remember the offset of the instruction
 555     // whose address will be moved to R11_scratch1.
 556     address gc_map_pc = __ get_PC_trash_LR(R11_scratch1);
 557 
 558     __ set_last_Java_frame(/*sp*/R1_SP, /*pc*/R11_scratch1);
 559 
 560     __ mr(R3_ARG1, R16_thread);
 561     if (arg1 != noreg) {
 562       __ mr(R4_ARG2, arg1);
 563     }
 564     if (arg2 != noreg) {
 565       __ mr(R5_ARG3, arg2);
 566     }
 567 #if defined(ABI_ELFv2)
 568     __ call_c(runtime_entry, relocInfo::none);
 569 #else
 570     __ call_c(CAST_FROM_FN_PTR(FunctionDescriptor*, runtime_entry), relocInfo::none);
 571 #endif
 572 
 573     // Set an oopmap for the call site.
 574     oop_maps->add_gc_map((int)(gc_map_pc - start), map);
 575 
 576     __ reset_last_Java_frame();
 577 
 578 #ifdef ASSERT
 579     // Make sure that this code is only executed if there is a pending
 580     // exception.
 581     {
 582       Label L;
 583       __ ld(R0,
 584                 in_bytes(Thread::pending_exception_offset()),
 585                 R16_thread);
 586       __ cmpdi(CCR0, R0, 0);
 587       __ bne(CCR0, L);
 588       __ stop("StubRoutines::throw_exception: no pending exception");
 589       __ bind(L);
 590     }
 591 #endif
 592 
 593     // Pop frame.
 594     __ pop_frame();
 595 
 596     __ restore_LR_CR(R11_scratch1);
 597 
 598     __ load_const(R11_scratch1, StubRoutines::forward_exception_entry());
 599     __ mtctr(R11_scratch1);
 600     __ bctr();
 601 
 602     // Create runtime stub with OopMap.
 603     RuntimeStub* stub =
 604       RuntimeStub::new_runtime_stub(name, &code,
 605                                     /*frame_complete=*/ (int)(frame_complete_pc - start),
 606                                     frame_size_in_bytes/wordSize,
 607                                     oop_maps,
 608                                     false);
 609     return stub->entry_point();
 610   }
 611 #undef __
 612 #define __ _masm->
 613 
 614 
 615   // Support for void zero_words_aligned8(HeapWord* to, size_t count)
 616   //
 617   // Arguments:
  //   to:    R3_ARG1, start address, must be 8-byte aligned
  //   count: R4_ARG2, number of doublewords (8-byte words) to clear
  //
  // Destroys:
  //   R3_ARG1..R7_ARG5, CTR, CCR0, CCR1
  //
 623   address generate_zero_words_aligned8() {
 624     StubCodeMark mark(this, "StubRoutines", "zero_words_aligned8");
 625 
 626     // Implemented as in ClearArray.
 627     address start = __ function_entry();
 628 
 629     Register base_ptr_reg   = R3_ARG1; // tohw (needs to be 8b aligned)
 630     Register cnt_dwords_reg = R4_ARG2; // count (in dwords)
 631     Register tmp1_reg       = R5_ARG3;
 632     Register tmp2_reg       = R6_ARG4;
 633     Register zero_reg       = R7_ARG5;
 634 
 635     // Procedure for large arrays (uses data cache block zero instruction).
 636     Label dwloop, fast, fastloop, restloop, lastdword, done;
 637     int cl_size = VM_Version::L1_data_cache_line_size();
 638     int cl_dwords = cl_size >> 3;
 639     int cl_dwordaddr_bits = exact_log2(cl_dwords);
 640     int min_dcbz = 2; // Needs to be positive, apply dcbz only to at least min_dcbz cache lines.
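    // Example with a 128-byte L1 data cache line: cl_dwords = 16 and cl_dwordaddr_bits = 4,
    // so each dcbz below clears one 128-byte (16-dword) naturally aligned block.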
 641 
 642     // Clear up to 128byte boundary if long enough, dword_cnt=(16-(base>>3))%16.
 643     __ dcbtst(base_ptr_reg);                    // Indicate write access to first cache line ...
 644     __ andi(tmp2_reg, cnt_dwords_reg, 1);       // to check if number of dwords is even.
 645     __ srdi_(tmp1_reg, cnt_dwords_reg, 1);      // number of double dwords
 646     __ load_const_optimized(zero_reg, 0L);      // Use as zero register.
 647 
 648     __ cmpdi(CCR1, tmp2_reg, 0);                // cnt_dwords even?
 649     __ beq(CCR0, lastdword);                    // size <= 1
 650     __ mtctr(tmp1_reg);                         // Speculatively preload counter for rest loop (>0).
 651     __ cmpdi(CCR0, cnt_dwords_reg, (min_dcbz+1)*cl_dwords-1); // Big enough to ensure >=min_dcbz cache lines are included?
 652     __ neg(tmp1_reg, base_ptr_reg);             // bit 0..58: bogus, bit 57..60: (16-(base>>3))%16, bit 61..63: 000
 653 
 654     __ blt(CCR0, restloop);                     // Too small. (<31=(2*cl_dwords)-1 is sufficient, but bigger performs better.)
 655     __ rldicl_(tmp1_reg, tmp1_reg, 64-3, 64-cl_dwordaddr_bits); // Extract number of dwords to 128byte boundary=(16-(base>>3))%16.
 656 
 657     __ beq(CCR0, fast);                         // already 128byte aligned
 658     __ mtctr(tmp1_reg);                         // Set ctr to hit 128byte boundary (0<ctr<cnt).
 659     __ subf(cnt_dwords_reg, tmp1_reg, cnt_dwords_reg); // rest (>0 since size>=256-8)
 660 
 661     // Clear in first cache line dword-by-dword if not already 128byte aligned.
 662     __ bind(dwloop);
 663       __ std(zero_reg, 0, base_ptr_reg);        // Clear 8byte aligned block.
 664       __ addi(base_ptr_reg, base_ptr_reg, 8);
 665     __ bdnz(dwloop);
 666 
 667     // clear 128byte blocks
 668     __ bind(fast);
 669     __ srdi(tmp1_reg, cnt_dwords_reg, cl_dwordaddr_bits); // loop count for 128byte loop (>0 since size>=256-8)
 670     __ andi(tmp2_reg, cnt_dwords_reg, 1);       // to check if rest even
 671 
 672     __ mtctr(tmp1_reg);                         // load counter
 673     __ cmpdi(CCR1, tmp2_reg, 0);                // rest even?
 674     __ rldicl_(tmp1_reg, cnt_dwords_reg, 63, 65-cl_dwordaddr_bits); // rest in double dwords
 675 
 676     __ bind(fastloop);
 677       __ dcbz(base_ptr_reg);                    // Clear 128byte aligned block.
 678       __ addi(base_ptr_reg, base_ptr_reg, cl_size);
 679     __ bdnz(fastloop);
 680 
 681     //__ dcbtst(base_ptr_reg);                  // Indicate write access to last cache line.
 682     __ beq(CCR0, lastdword);                    // rest<=1
 683     __ mtctr(tmp1_reg);                         // load counter
 684 
 685     // Clear rest.
 686     __ bind(restloop);
 687       __ std(zero_reg, 0, base_ptr_reg);        // Clear 8byte aligned block.
 688       __ std(zero_reg, 8, base_ptr_reg);        // Clear 8byte aligned block.
 689       __ addi(base_ptr_reg, base_ptr_reg, 16);
 690     __ bdnz(restloop);
 691 
 692     __ bind(lastdword);
 693     __ beq(CCR1, done);
 694     __ std(zero_reg, 0, base_ptr_reg);
 695     __ bind(done);
 696     __ blr();                                   // return
 697 
 698     return start;
 699   }
 700 
 701 #if !defined(PRODUCT)
 702   // Wrapper which calls oopDesc::is_oop_or_null()
 703   // Only called by MacroAssembler::verify_oop
 704   static void verify_oop_helper(const char* message, oopDesc* o) {
 705     if (!oopDesc::is_oop_or_null(o)) {
 706       fatal("%s. oop: " PTR_FORMAT, message, p2i(o));
 707     }
 708     ++ StubRoutines::_verify_oop_count;
 709   }
 710 #endif
 711 
 712   // Return address of code to be called from code generated by
 713   // MacroAssembler::verify_oop.
 714   //
 715   // Don't generate, rather use C++ code.
 716   address generate_verify_oop() {
 717     // this is actually a `FunctionDescriptor*'.
 718     address start = 0;
 719 
 720 #if !defined(PRODUCT)
 721     start = CAST_FROM_FN_PTR(address, verify_oop_helper);
 722 #endif
 723 
 724     return start;
 725   }
 726 
 727   // -XX:+OptimizeFill : convert fill/copy loops into intrinsic
 728   //
  // The code was ported from SPARC because we believed it would benefit JVM98; however,
  // tracing (-XX:+TraceOptimizeFill) shows that the intrinsic replacement doesn't happen at all!
  //
  // The source of is_range_check_if() shows that OptimizeFill relaxes the condition for
  // turning on the loop predication optimization, so the behavior of the "array range check"
  // and "loop invariant check" optimizations can be influenced, which potentially boosted JVM98.
 735   //
 736   // Generate stub for disjoint short fill. If "aligned" is true, the
 737   // "to" address is assumed to be heapword aligned.
 738   //
 739   // Arguments for generated stub:
 740   //   to:    R3_ARG1
 741   //   value: R4_ARG2
 742   //   count: R5_ARG3 treated as signed
 743   //
 744   address generate_fill(BasicType t, bool aligned, const char* name) {
 745     StubCodeMark mark(this, "StubRoutines", name);
 746     address start = __ function_entry();
 747 
    const Register to    = R3_ARG1;   // destination array address
 749     const Register value = R4_ARG2;   // fill value
 750     const Register count = R5_ARG3;   // elements count
 751     const Register temp  = R6_ARG4;   // temp register
 752 
 753     //assert_clean_int(count, O3);    // Make sure 'count' is clean int.
 754 
 755     Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
 756     Label L_fill_2_bytes, L_fill_4_bytes, L_fill_elements, L_fill_32_bytes;
 757 
 758     int shift = -1;
 759     switch (t) {
 760        case T_BYTE:
 761         shift = 2;
 762         // Clone bytes (zero extend not needed because store instructions below ignore high order bytes).
 763         __ rldimi(value, value, 8, 48);     // 8 bit -> 16 bit
 764         __ cmpdi(CCR0, count, 2<<shift);    // Short arrays (< 8 bytes) fill by element.
 765         __ blt(CCR0, L_fill_elements);
 766         __ rldimi(value, value, 16, 32);    // 16 bit -> 32 bit
 767         break;
 768        case T_SHORT:
 769         shift = 1;
 770         // Clone bytes (zero extend not needed because store instructions below ignore high order bytes).
 771         __ rldimi(value, value, 16, 32);    // 16 bit -> 32 bit
 772         __ cmpdi(CCR0, count, 2<<shift);    // Short arrays (< 8 bytes) fill by element.
 773         __ blt(CCR0, L_fill_elements);
 774         break;
 775       case T_INT:
 776         shift = 0;
 777         __ cmpdi(CCR0, count, 2<<shift);    // Short arrays (< 8 bytes) fill by element.
 778         __ blt(CCR0, L_fill_4_bytes);
 779         break;
 780       default: ShouldNotReachHere();
 781     }
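    // The rldimi sequence replicates the fill value: e.g. for T_BYTE a value of 0xAB
    // becomes 0xABAB after the first insert, 0xABABABAB after the second, and the
    // rldimi(value, value, 32, 0) further below widens it to all 64 bits.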
 782 
 783     if (!aligned && (t == T_BYTE || t == T_SHORT)) {
 784       // Align source address at 4 bytes address boundary.
 785       if (t == T_BYTE) {
 786         // One byte misalignment happens only for byte arrays.
 787         __ andi_(temp, to, 1);
 788         __ beq(CCR0, L_skip_align1);
 789         __ stb(value, 0, to);
 790         __ addi(to, to, 1);
 791         __ addi(count, count, -1);
 792         __ bind(L_skip_align1);
 793       }
 794       // Two bytes misalignment happens only for byte and short (char) arrays.
 795       __ andi_(temp, to, 2);
 796       __ beq(CCR0, L_skip_align2);
 797       __ sth(value, 0, to);
 798       __ addi(to, to, 2);
 799       __ addi(count, count, -(1 << (shift - 1)));
 800       __ bind(L_skip_align2);
 801     }
 802 
 803     if (!aligned) {
 804       // Align to 8 bytes, we know we are 4 byte aligned to start.
 805       __ andi_(temp, to, 7);
 806       __ beq(CCR0, L_fill_32_bytes);
 807       __ stw(value, 0, to);
 808       __ addi(to, to, 4);
 809       __ addi(count, count, -(1 << shift));
 810       __ bind(L_fill_32_bytes);
 811     }
 812 
 813     __ li(temp, 8<<shift);                  // Prepare for 32 byte loop.
 814     // Clone bytes int->long as above.
 815     __ rldimi(value, value, 32, 0);         // 32 bit -> 64 bit
 816 
 817     Label L_check_fill_8_bytes;
 818     // Fill 32-byte chunks.
 819     __ subf_(count, temp, count);
 820     __ blt(CCR0, L_check_fill_8_bytes);
 821 
 822     Label L_fill_32_bytes_loop;
 823     __ align(32);
 824     __ bind(L_fill_32_bytes_loop);
 825 
 826     __ std(value, 0, to);
 827     __ std(value, 8, to);
 828     __ subf_(count, temp, count);           // Update count.
 829     __ std(value, 16, to);
 830     __ std(value, 24, to);
 831 
 832     __ addi(to, to, 32);
 833     __ bge(CCR0, L_fill_32_bytes_loop);
 834 
 835     __ bind(L_check_fill_8_bytes);
 836     __ add_(count, temp, count);
 837     __ beq(CCR0, L_exit);
 838     __ addic_(count, count, -(2 << shift));
 839     __ blt(CCR0, L_fill_4_bytes);
 840 
 841     //
 842     // Length is too short, just fill 8 bytes at a time.
 843     //
 844     Label L_fill_8_bytes_loop;
 845     __ bind(L_fill_8_bytes_loop);
 846     __ std(value, 0, to);
 847     __ addic_(count, count, -(2 << shift));
 848     __ addi(to, to, 8);
 849     __ bge(CCR0, L_fill_8_bytes_loop);
 850 
 851     // Fill trailing 4 bytes.
 852     __ bind(L_fill_4_bytes);
 853     __ andi_(temp, count, 1<<shift);
 854     __ beq(CCR0, L_fill_2_bytes);
 855 
 856     __ stw(value, 0, to);
 857     if (t == T_BYTE || t == T_SHORT) {
 858       __ addi(to, to, 4);
 859       // Fill trailing 2 bytes.
 860       __ bind(L_fill_2_bytes);
 861       __ andi_(temp, count, 1<<(shift-1));
 862       __ beq(CCR0, L_fill_byte);
 863       __ sth(value, 0, to);
 864       if (t == T_BYTE) {
 865         __ addi(to, to, 2);
 866         // Fill trailing byte.
 867         __ bind(L_fill_byte);
 868         __ andi_(count, count, 1);
 869         __ beq(CCR0, L_exit);
 870         __ stb(value, 0, to);
 871       } else {
 872         __ bind(L_fill_byte);
 873       }
 874     } else {
 875       __ bind(L_fill_2_bytes);
 876     }
 877     __ bind(L_exit);
 878     __ blr();
 879 
    // Handle fills of less than 8 bytes. Int is handled elsewhere.
 881     if (t == T_BYTE) {
 882       __ bind(L_fill_elements);
 883       Label L_fill_2, L_fill_4;
 884       __ andi_(temp, count, 1);
 885       __ beq(CCR0, L_fill_2);
 886       __ stb(value, 0, to);
 887       __ addi(to, to, 1);
 888       __ bind(L_fill_2);
 889       __ andi_(temp, count, 2);
 890       __ beq(CCR0, L_fill_4);
 891       __ stb(value, 0, to);
      __ stb(value, 1, to);
 893       __ addi(to, to, 2);
 894       __ bind(L_fill_4);
 895       __ andi_(temp, count, 4);
 896       __ beq(CCR0, L_exit);
 897       __ stb(value, 0, to);
 898       __ stb(value, 1, to);
 899       __ stb(value, 2, to);
 900       __ stb(value, 3, to);
 901       __ blr();
 902     }
 903 
 904     if (t == T_SHORT) {
 905       Label L_fill_2;
 906       __ bind(L_fill_elements);
 907       __ andi_(temp, count, 1);
 908       __ beq(CCR0, L_fill_2);
 909       __ sth(value, 0, to);
 910       __ addi(to, to, 2);
 911       __ bind(L_fill_2);
 912       __ andi_(temp, count, 2);
 913       __ beq(CCR0, L_exit);
 914       __ sth(value, 0, to);
 915       __ sth(value, 2, to);
 916       __ blr();
 917     }
 918     return start;
 919   }
 920 
 921   inline void assert_positive_int(Register count) {
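    // Shifting right by 31 must yield 0, i.e. count is a zero-extended,
    // non-negative 32-bit value.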
 922 #ifdef ASSERT
 923     __ srdi_(R0, count, 31);
 924     __ asm_assert_eq("missing zero extend");
 925 #endif
 926   }
 927 
 928   // Generate overlap test for array copy stubs.
 929   //
 930   // Input:
 931   //   R3_ARG1    -  from
 932   //   R4_ARG2    -  to
 933   //   R5_ARG3    -  element count
 934   //
 935   void array_overlap_test(address no_overlap_target, int log2_elem_size) {
 936     Register tmp1 = R6_ARG4;
 937     Register tmp2 = R7_ARG5;
 938 
 939     assert_positive_int(R5_ARG3);
 940 
 941     __ subf(tmp1, R3_ARG1, R4_ARG2); // distance in bytes
 942     __ sldi(tmp2, R5_ARG3, log2_elem_size); // size in bytes
 943     __ cmpld(CCR0, R3_ARG1, R4_ARG2); // Use unsigned comparison!
 944     __ cmpld(CCR1, tmp1, tmp2);
 945     __ crnand(CCR0, Assembler::less, CCR1, Assembler::less);
    // The regions overlap if src lies before dst and the distance is smaller than the size.
    // Otherwise branch to the forward copy routine (which must be within the 32kB branch range).
 948     __ bc(Assembler::bcondCRbiIs1, Assembler::bi0(CCR0, Assembler::less), no_overlap_target);
 949 
 950     // need to copy backwards
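    // Example: from = 0x1000, to = 0x1010, count = 16, log2_elem_size = 0 gives
    // distance = 0x10 and size = 0x10; the distance is not smaller than the size,
    // so the test above branches to the forward (disjoint) copy stub.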
 951   }
 952 
  // This is the common error exit stub for UnsafeCopyMemory.
 954   address generate_unsafecopy_common_error_exit() {
 955     address start_pc = __ pc();
 956     Register tmp1 = R6_ARG4;
    // The copy stub may have changed the DSCR value; reset it to the default.
 958     if (VM_Version::has_mfdscr()) {
 959       __ load_const_optimized(tmp1, VM_Version::_dscr_val);
 960       __ mtdscr(tmp1);
 961     }
 962     __ li(R3_RET, 0); // return 0
 963     __ blr();
 964     return start_pc;
 965   }
 966 
  // The guideline in the implementations of generate_disjoint_xxx_copy
  // (xxx=byte,short,int,long,oop) is to copy as many elements as possible with
  // single instructions while avoiding alignment interrupts (see the subsequent
  // comment). Furthermore, we try to minimize misaligned accesses, even when
  // they cause no alignment interrupt.
 972   //
 973   // In Big-Endian mode, the PowerPC architecture requires implementations to
 974   // handle automatically misaligned integer halfword and word accesses,
 975   // word-aligned integer doubleword accesses, and word-aligned floating-point
 976   // accesses. Other accesses may or may not generate an Alignment interrupt
 977   // depending on the implementation.
  // Alignment interrupt handling may require on the order of hundreds of cycles,
  // so every effort should be made to avoid misaligned memory accesses.
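  // For example, a word load (lwz) from a non-word-aligned address must be handled by
  // the hardware in Big-Endian mode, whereas a doubleword (ld) or floating-point (lfd)
  // access that is not at least word aligned may take the Alignment interrupt,
  // depending on the implementation.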
 980   //
 981   //
 982   // Generate stub for disjoint byte copy.  If "aligned" is true, the
 983   // "from" and "to" addresses are assumed to be heapword aligned.
 984   //
 985   // Arguments for generated stub:
 986   //      from:  R3_ARG1
 987   //      to:    R4_ARG2
 988   //      count: R5_ARG3 treated as signed
 989   //
 990   address generate_disjoint_byte_copy(bool aligned, const char * name) {
 991     StubCodeMark mark(this, "StubRoutines", name);
 992     address start = __ function_entry();
 993     assert_positive_int(R5_ARG3);
 994 
 995     Register tmp1 = R6_ARG4;
 996     Register tmp2 = R7_ARG5;
 997     Register tmp3 = R8_ARG6;
 998     Register tmp4 = R9_ARG7;
 999 
1000     VectorSRegister tmp_vsr1  = VSR1;
1001     VectorSRegister tmp_vsr2  = VSR2;
1002 
1003     Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9, l_10;
1004     {
1005       // UnsafeCopyMemory page error: continue at UnsafeCopyMemory common_error_exit
1006       UnsafeCopyMemoryMark ucmm(this, !aligned, false);
1007 
1008       // Don't try anything fancy if arrays don't have many elements.
1009       __ li(tmp3, 0);
1010       __ cmpwi(CCR0, R5_ARG3, 17);
1011       __ ble(CCR0, l_6); // copy 4 at a time
1012 
1013       if (!aligned) {
1014         __ xorr(tmp1, R3_ARG1, R4_ARG2);
1015         __ andi_(tmp1, tmp1, 3);
1016         __ bne(CCR0, l_6); // If arrays don't have the same alignment mod 4, do 4 element copy.
1017 
1018         // Copy elements if necessary to align to 4 bytes.
1019         __ neg(tmp1, R3_ARG1); // Compute distance to alignment boundary.
1020         __ andi_(tmp1, tmp1, 3);
1021         __ beq(CCR0, l_2);
1022 
1023         __ subf(R5_ARG3, tmp1, R5_ARG3);
1024         __ bind(l_9);
1025         __ lbz(tmp2, 0, R3_ARG1);
1026         __ addic_(tmp1, tmp1, -1);
1027         __ stb(tmp2, 0, R4_ARG2);
1028         __ addi(R3_ARG1, R3_ARG1, 1);
1029         __ addi(R4_ARG2, R4_ARG2, 1);
1030         __ bne(CCR0, l_9);
1031 
1032         __ bind(l_2);
1033       }
1034 
1035       // copy 8 elements at a time
1036       __ xorr(tmp2, R3_ARG1, R4_ARG2); // skip if src & dest have differing alignment mod 8
1037       __ andi_(tmp1, tmp2, 7);
1038       __ bne(CCR0, l_7); // not same alignment -> to or from is aligned -> copy 8
1039 
1040       // copy a 2-element word if necessary to align to 8 bytes
1041       __ andi_(R0, R3_ARG1, 7);
1042       __ beq(CCR0, l_7);
1043 
1044       __ lwzx(tmp2, R3_ARG1, tmp3);
1045       __ addi(R5_ARG3, R5_ARG3, -4);
1046       __ stwx(tmp2, R4_ARG2, tmp3);
1047       { // FasterArrayCopy
1048         __ addi(R3_ARG1, R3_ARG1, 4);
1049         __ addi(R4_ARG2, R4_ARG2, 4);
1050       }
1051       __ bind(l_7);
1052 
1053       { // FasterArrayCopy
1054         __ cmpwi(CCR0, R5_ARG3, 31);
1055         __ ble(CCR0, l_6); // copy 2 at a time if less than 32 elements remain
1056 
1057         __ srdi(tmp1, R5_ARG3, 5);
1058         __ andi_(R5_ARG3, R5_ARG3, 31);
1059         __ mtctr(tmp1);
1060 
1061        if (!VM_Version::has_vsx()) {
1062 
1063         __ bind(l_8);
1064         // Use unrolled version for mass copying (copy 32 elements a time)
1065         // Load feeding store gets zero latency on Power6, however not on Power5.
1066         // Therefore, the following sequence is made for the good of both.
1067         __ ld(tmp1, 0, R3_ARG1);
1068         __ ld(tmp2, 8, R3_ARG1);
1069         __ ld(tmp3, 16, R3_ARG1);
1070         __ ld(tmp4, 24, R3_ARG1);
1071         __ std(tmp1, 0, R4_ARG2);
1072         __ std(tmp2, 8, R4_ARG2);
1073         __ std(tmp3, 16, R4_ARG2);
1074         __ std(tmp4, 24, R4_ARG2);
1075         __ addi(R3_ARG1, R3_ARG1, 32);
1076         __ addi(R4_ARG2, R4_ARG2, 32);
1077         __ bdnz(l_8);
1078 
1079       } else { // Processor supports VSX, so use it to mass copy.
1080 
1081         // Prefetch the data into the L2 cache.
1082         __ dcbt(R3_ARG1, 0);
1083 
1084         // If supported set DSCR pre-fetch to deepest.
1085         if (VM_Version::has_mfdscr()) {
1086           __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1087           __ mtdscr(tmp2);
1088         }
1089 
1090         __ li(tmp1, 16);
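        // tmp1 = 16 serves as the index register for the second 16-byte vector
        // load/store of each iteration (src/dst + 16).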
1091 
        // Backbranch target aligned to 32 bytes. 16-byte alignment is not needed, as
        // the loop contains < 8 instructions that fit inside a single
        // i-cache sector.
1095         __ align(32);
1096 
1097         __ bind(l_10);
1098         // Use loop with VSX load/store instructions to
1099         // copy 32 elements a time.
1100         __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
1101         __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
1102         __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src + 16
1103         __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
1104         __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32
        __ addi(R4_ARG2, R4_ARG2, 32);       // Update dst+=32
1106         __ bdnz(l_10);                       // Dec CTR and loop if not zero.
1107 
1108         // Restore DSCR pre-fetch value.
1109         if (VM_Version::has_mfdscr()) {
1110           __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1111           __ mtdscr(tmp2);
1112         }
1113 
1114       } // VSX
1115      } // FasterArrayCopy
1116 
1117       __ bind(l_6);
1118 
1119       // copy 4 elements at a time
1120       __ cmpwi(CCR0, R5_ARG3, 4);
1121       __ blt(CCR0, l_1);
1122       __ srdi(tmp1, R5_ARG3, 2);
1123       __ mtctr(tmp1); // is > 0
1124       __ andi_(R5_ARG3, R5_ARG3, 3);
1125 
1126       { // FasterArrayCopy
1127         __ addi(R3_ARG1, R3_ARG1, -4);
1128         __ addi(R4_ARG2, R4_ARG2, -4);
1129         __ bind(l_3);
1130         __ lwzu(tmp2, 4, R3_ARG1);
1131         __ stwu(tmp2, 4, R4_ARG2);
1132         __ bdnz(l_3);
1133         __ addi(R3_ARG1, R3_ARG1, 4);
1134         __ addi(R4_ARG2, R4_ARG2, 4);
1135       }
1136 
1137       // do single element copy
1138       __ bind(l_1);
1139       __ cmpwi(CCR0, R5_ARG3, 0);
1140       __ beq(CCR0, l_4);
1141 
1142       { // FasterArrayCopy
1143         __ mtctr(R5_ARG3);
1144         __ addi(R3_ARG1, R3_ARG1, -1);
1145         __ addi(R4_ARG2, R4_ARG2, -1);
1146 
1147         __ bind(l_5);
1148         __ lbzu(tmp2, 1, R3_ARG1);
1149         __ stbu(tmp2, 1, R4_ARG2);
1150         __ bdnz(l_5);
1151       }
1152     }
1153 
1154     __ bind(l_4);
1155     __ li(R3_RET, 0); // return 0
1156     __ blr();
1157 
1158     return start;
1159   }
1160 
1161   // Generate stub for conjoint byte copy.  If "aligned" is true, the
1162   // "from" and "to" addresses are assumed to be heapword aligned.
1163   //
1164   // Arguments for generated stub:
1165   //      from:  R3_ARG1
1166   //      to:    R4_ARG2
1167   //      count: R5_ARG3 treated as signed
1168   //
1169   address generate_conjoint_byte_copy(bool aligned, const char * name) {
1170     StubCodeMark mark(this, "StubRoutines", name);
1171     address start = __ function_entry();
1172     assert_positive_int(R5_ARG3);
1173 
1174     Register tmp1 = R6_ARG4;
1175     Register tmp2 = R7_ARG5;
1176     Register tmp3 = R8_ARG6;
1177 
1178     address nooverlap_target = aligned ?
1179       STUB_ENTRY(arrayof_jbyte_disjoint_arraycopy) :
1180       STUB_ENTRY(jbyte_disjoint_arraycopy);
1181 
1182     array_overlap_test(nooverlap_target, 0);
1183     // Do reverse copy. We assume the case of actual overlap is rare enough
1184     // that we don't have to optimize it.
1185     Label l_1, l_2;
1186     {
1187       // UnsafeCopyMemory page error: continue at UnsafeCopyMemory common_error_exit
1188       UnsafeCopyMemoryMark ucmm(this, !aligned, false);
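      // R5_ARG3 is reused as a descending byte index; elements are copied from the
      // highest address down so that overlapping destinations are not clobbered.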
1189       __ b(l_2);
1190       __ bind(l_1);
1191       __ stbx(tmp1, R4_ARG2, R5_ARG3);
1192       __ bind(l_2);
1193       __ addic_(R5_ARG3, R5_ARG3, -1);
1194       __ lbzx(tmp1, R3_ARG1, R5_ARG3);
1195       __ bge(CCR0, l_1);
1196     }
1197     __ li(R3_RET, 0); // return 0
1198     __ blr();
1199 
1200     return start;
1201   }
1202 
1203   // Generate stub for disjoint short copy.  If "aligned" is true, the
1204   // "from" and "to" addresses are assumed to be heapword aligned.
1205   //
1206   // Arguments for generated stub:
1207   //      from:  R3_ARG1
1208   //      to:    R4_ARG2
1209   //  elm.count: R5_ARG3 treated as signed
1210   //
1211   // Strategy for aligned==true:
1212   //
1213   //  If length <= 9:
1214   //     1. copy 2 elements at a time (l_6)
1215   //     2. copy last element if original element count was odd (l_1)
1216   //
1217   //  If length > 9:
1218   //     1. copy 4 elements at a time until less than 4 elements are left (l_7)
1219   //     2. copy 2 elements at a time until less than 2 elements are left (l_6)
1220   //     3. copy last element if one was left in step 2. (l_1)
1221   //
1222   //
1223   // Strategy for aligned==false:
1224   //
1225   //  If length <= 9: same as aligned==true case, but NOTE: load/stores
1226   //                  can be unaligned (see comment below)
1227   //
1228   //  If length > 9:
1229   //     1. continue with step 6. if the alignment of from and to mod 4
1230   //        is different.
1231   //     2. align from and to to 4 bytes by copying 1 element if necessary
1232   //     3. at l_2 from and to are 4 byte aligned; continue with
1233   //        5. if they cannot be aligned to 8 bytes because they have
1234   //        got different alignment mod 8.
1235   //     4. at this point we know that both, from and to, have the same
1236   //        alignment mod 8, now copy one element if necessary to get
1237   //        8 byte alignment of from and to.
1238   //     5. copy 4 elements at a time until less than 4 elements are
1239   //        left; depending on step 3. all load/stores are aligned or
1240   //        either all loads or all stores are unaligned.
1241   //     6. copy 2 elements at a time until less than 2 elements are
1242   //        left (l_6); arriving here from step 1., there is a chance
1243   //        that all accesses are unaligned.
1244   //     7. copy last element if one was left in step 6. (l_1)
1245   //
1246   //  There are unaligned data accesses using integer load/store
1247   //  instructions in this stub. POWER allows such accesses.
1248   //
1249   //  According to the manuals (PowerISA_V2.06_PUBLIC, Book II,
1250   //  Chapter 2: Effect of Operand Placement on Performance) unaligned
1251   //  integer load/stores have good performance. Only unaligned
1252   //  floating point load/stores can have poor performance.
1253   //
1254   //  TODO:
1255   //
1256   //  1. check if aligning the backbranch target of loops is beneficial
1257   //
1258   address generate_disjoint_short_copy(bool aligned, const char * name) {
1259     StubCodeMark mark(this, "StubRoutines", name);
1260 
1261     Register tmp1 = R6_ARG4;
1262     Register tmp2 = R7_ARG5;
1263     Register tmp3 = R8_ARG6;
1264     Register tmp4 = R9_ARG7;
1265 
1266     VectorSRegister tmp_vsr1  = VSR1;
1267     VectorSRegister tmp_vsr2  = VSR2;
1268 
1269     address start = __ function_entry();
1270     assert_positive_int(R5_ARG3);
1271 
1272     Label l_1, l_2, l_3, l_4, l_5, l_6, l_7, l_8, l_9;
1273     {
1274       // UnsafeCopyMemory page error: continue at UnsafeCopyMemory common_error_exit
1275       UnsafeCopyMemoryMark ucmm(this, !aligned, false);
1276       // don't try anything fancy if arrays don't have many elements
1277       __ li(tmp3, 0);
1278       __ cmpwi(CCR0, R5_ARG3, 9);
1279       __ ble(CCR0, l_6); // copy 2 at a time
1280 
1281       if (!aligned) {
1282         __ xorr(tmp1, R3_ARG1, R4_ARG2);
1283         __ andi_(tmp1, tmp1, 3);
1284         __ bne(CCR0, l_6); // if arrays don't have the same alignment mod 4, do 2 element copy
1285 
1286         // At this point it is guaranteed that both, from and to have the same alignment mod 4.
1287 
1288         // Copy 1 element if necessary to align to 4 bytes.
1289         __ andi_(tmp1, R3_ARG1, 3);
1290         __ beq(CCR0, l_2);
1291 
1292         __ lhz(tmp2, 0, R3_ARG1);
1293         __ addi(R3_ARG1, R3_ARG1, 2);
1294         __ sth(tmp2, 0, R4_ARG2);
1295         __ addi(R4_ARG2, R4_ARG2, 2);
1296         __ addi(R5_ARG3, R5_ARG3, -1);
1297         __ bind(l_2);
1298 
1299         // At this point the positions of both, from and to, are at least 4 byte aligned.
1300 
1301         // Copy 4 elements at a time.
1302         // Align to 8 bytes, but only if both, from and to, have same alignment mod 8.
1303         __ xorr(tmp2, R3_ARG1, R4_ARG2);
1304         __ andi_(tmp1, tmp2, 7);
1305         __ bne(CCR0, l_7); // not same alignment mod 8 -> copy 4, either from or to will be unaligned
1306 
1307         // Copy a 2-element word if necessary to align to 8 bytes.
1308         __ andi_(R0, R3_ARG1, 7);
1309         __ beq(CCR0, l_7);
1310 
1311         __ lwzx(tmp2, R3_ARG1, tmp3);
1312         __ addi(R5_ARG3, R5_ARG3, -2);
1313         __ stwx(tmp2, R4_ARG2, tmp3);
1314         { // FasterArrayCopy
1315           __ addi(R3_ARG1, R3_ARG1, 4);
1316           __ addi(R4_ARG2, R4_ARG2, 4);
1317         }
1318       }
1319 
1320       __ bind(l_7);
1321 
1322       // Copy 4 elements at a time; either the loads or the stores can
1323       // be unaligned if aligned == false.
1324 
1325       { // FasterArrayCopy
1326         __ cmpwi(CCR0, R5_ARG3, 15);
1327         __ ble(CCR0, l_6); // copy 2 at a time if less than 16 elements remain
1328 
1329         __ srdi(tmp1, R5_ARG3, 4);
1330         __ andi_(R5_ARG3, R5_ARG3, 15);
1331         __ mtctr(tmp1);
1332 
1333         if (!VM_Version::has_vsx()) {
1334 
1335           __ bind(l_8);
1336           // Use unrolled version for mass copying (copy 16 elements a time).
1337           // Load feeding store gets zero latency on Power6, however not on Power5.
1338           // Therefore, the following sequence is made for the good of both.
1339           __ ld(tmp1, 0, R3_ARG1);
1340           __ ld(tmp2, 8, R3_ARG1);
1341           __ ld(tmp3, 16, R3_ARG1);
1342           __ ld(tmp4, 24, R3_ARG1);
1343           __ std(tmp1, 0, R4_ARG2);
1344           __ std(tmp2, 8, R4_ARG2);
1345           __ std(tmp3, 16, R4_ARG2);
1346           __ std(tmp4, 24, R4_ARG2);
1347           __ addi(R3_ARG1, R3_ARG1, 32);
1348           __ addi(R4_ARG2, R4_ARG2, 32);
1349           __ bdnz(l_8);
1350 
1351         } else { // Processor supports VSX, so use it to mass copy.
1352 
1353           // Prefetch src data into L2 cache.
1354           __ dcbt(R3_ARG1, 0);
1355 
1356           // If supported set DSCR pre-fetch to deepest.
1357           if (VM_Version::has_mfdscr()) {
1358             __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1359             __ mtdscr(tmp2);
1360           }
1361           __ li(tmp1, 16);
1362 
          // Backbranch target aligned to 32 bytes. It is not aligned to 16 bytes, as
          // the loop contains < 8 instructions that fit inside a single
          // i-cache sector.
1366           __ align(32);
1367 
1368           __ bind(l_9);
1369           // Use loop with VSX load/store instructions to
1370           // copy 16 elements a time.
1371           __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load from src.
1372           __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst.
1373           __ lxvd2x(tmp_vsr2, R3_ARG1, tmp1);  // Load from src + 16.
1374           __ stxvd2x(tmp_vsr2, R4_ARG2, tmp1); // Store to dst + 16.
1375           __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32.
          __ addi(R4_ARG2, R4_ARG2, 32);       // Update dst+=32.
1377           __ bdnz(l_9);                        // Dec CTR and loop if not zero.
1378 
1379           // Restore DSCR pre-fetch value.
1380           if (VM_Version::has_mfdscr()) {
1381             __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1382             __ mtdscr(tmp2);
1383           }
1384 
1385         }
1386       } // FasterArrayCopy
1387       __ bind(l_6);
1388 
1389       // copy 2 elements at a time
1390       { // FasterArrayCopy
1391         __ cmpwi(CCR0, R5_ARG3, 2);
1392         __ blt(CCR0, l_1);
1393         __ srdi(tmp1, R5_ARG3, 1);
1394         __ andi_(R5_ARG3, R5_ARG3, 1);
1395 
1396         __ addi(R3_ARG1, R3_ARG1, -4);
1397         __ addi(R4_ARG2, R4_ARG2, -4);
1398         __ mtctr(tmp1);
1399 
1400         __ bind(l_3);
1401         __ lwzu(tmp2, 4, R3_ARG1);
1402         __ stwu(tmp2, 4, R4_ARG2);
1403         __ bdnz(l_3);
1404 
1405         __ addi(R3_ARG1, R3_ARG1, 4);
1406         __ addi(R4_ARG2, R4_ARG2, 4);
1407       }
1408 
1409       // do single element copy
1410       __ bind(l_1);
1411       __ cmpwi(CCR0, R5_ARG3, 0);
1412       __ beq(CCR0, l_4);
1413 
1414       { // FasterArrayCopy
1415         __ mtctr(R5_ARG3);
1416         __ addi(R3_ARG1, R3_ARG1, -2);
1417         __ addi(R4_ARG2, R4_ARG2, -2);
1418 
1419         __ bind(l_5);
1420         __ lhzu(tmp2, 2, R3_ARG1);
1421         __ sthu(tmp2, 2, R4_ARG2);
1422         __ bdnz(l_5);
1423       }
1424     }
1425 
1426     __ bind(l_4);
1427     __ li(R3_RET, 0); // return 0
1428     __ blr();
1429 
1430     return start;
1431   }
1432 
1433   // Generate stub for conjoint short copy.  If "aligned" is true, the
1434   // "from" and "to" addresses are assumed to be heapword aligned.
1435   //
1436   // Arguments for generated stub:
1437   //      from:  R3_ARG1
1438   //      to:    R4_ARG2
1439   //      count: R5_ARG3 treated as signed
1440   //
1441   address generate_conjoint_short_copy(bool aligned, const char * name) {
1442     StubCodeMark mark(this, "StubRoutines", name);
1443     address start = __ function_entry();
1444     assert_positive_int(R5_ARG3);
1445 
1446     Register tmp1 = R6_ARG4;
1447     Register tmp2 = R7_ARG5;
1448     Register tmp3 = R8_ARG6;
1449 
1450     address nooverlap_target = aligned ?
1451       STUB_ENTRY(arrayof_jshort_disjoint_arraycopy) :
1452       STUB_ENTRY(jshort_disjoint_arraycopy);
1453 
1454     array_overlap_test(nooverlap_target, 1);
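         // The second argument is log2(element size): 1 for jshort. The test
         // branches to the disjoint stub (nooverlap_target) when a plain forward
         // copy is safe.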
1455 
1456     Label l_1, l_2;
1457     {
1458       // UnsafeCopyMemory page error: continue at UnsafeCopyMemory common_error_exit
1459       UnsafeCopyMemoryMark ucmm(this, !aligned, false);
1460       __ sldi(tmp1, R5_ARG3, 1);
1461       __ b(l_2);
1462       __ bind(l_1);
1463       __ sthx(tmp2, R4_ARG2, tmp1);
1464       __ bind(l_2);
1465       __ addic_(tmp1, tmp1, -2);
1466       __ lhzx(tmp2, R3_ARG1, tmp1);
1467       __ bge(CCR0, l_1);
1468     }
1469     __ li(R3_RET, 0); // return 0
1470     __ blr();
1471 
1472     return start;
1473   }
1474 
1475   // Generate core code for disjoint int copy (and oop copy on 32-bit).  If "aligned"
1476   // is true, the "from" and "to" addresses are assumed to be heapword aligned.
1477   //
1478   // Arguments:
1479   //      from:  R3_ARG1
1480   //      to:    R4_ARG2
1481   //      count: R5_ARG3 treated as signed
1482   //
1483   void generate_disjoint_int_copy_core(bool aligned) {
1484     Register tmp1 = R6_ARG4;
1485     Register tmp2 = R7_ARG5;
1486     Register tmp3 = R8_ARG6;
1487     Register tmp4 = R0;
1488 
1489     VectorSRegister tmp_vsr1  = VSR1;
1490     VectorSRegister tmp_vsr2  = VSR2;
1491 
1492     Label l_1, l_2, l_3, l_4, l_5, l_6, l_7;
1493 
1494     // for short arrays, just do single element copy
1495     __ li(tmp3, 0);
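         // tmp3 stays zero and is used as the index register for the single
         // lwzx/stwx in the alignment step below.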
1496     __ cmpwi(CCR0, R5_ARG3, 5);
1497     __ ble(CCR0, l_2);
1498 
1499     if (!aligned) {
1500         // check if arrays have same alignment mod 8.
1501         __ xorr(tmp1, R3_ARG1, R4_ARG2);
1502         __ andi_(R0, tmp1, 7);
1503         // Not the same alignment, but ld and std only need to be 4-byte aligned.
1504         __ bne(CCR0, l_4); // 'to' or 'from' is 8-byte aligned -> copy 2 ints at a time.
1505 
1506         // copy 1 element to align to and from on an 8 byte boundary
1507         __ andi_(R0, R3_ARG1, 7);
1508         __ beq(CCR0, l_4);
1509 
1510         __ lwzx(tmp2, R3_ARG1, tmp3);
1511         __ addi(R5_ARG3, R5_ARG3, -1);
1512         __ stwx(tmp2, R4_ARG2, tmp3);
1513         { // FasterArrayCopy
1514           __ addi(R3_ARG1, R3_ARG1, 4);
1515           __ addi(R4_ARG2, R4_ARG2, 4);
1516         }
1517         __ bind(l_4);
1518       }
1519 
1520     { // FasterArrayCopy
1521       __ cmpwi(CCR0, R5_ARG3, 7);
1522       __ ble(CCR0, l_2); // copy 1 at a time if less than 8 elements remain
1523 
1524       __ srdi(tmp1, R5_ARG3, 3);
1525       __ andi_(R5_ARG3, R5_ARG3, 7);
1526       __ mtctr(tmp1);
1527 
1528      if (!VM_Version::has_vsx()) {
1529 
1530       __ bind(l_6);
1531       // Use unrolled version for mass copying (copy 8 elements at a time).
1532       // A load feeding a store has zero latency on Power6, but not on Power5.
1533       // Therefore, the following sequence is chosen to suit both.
1534       __ ld(tmp1, 0, R3_ARG1);
1535       __ ld(tmp2, 8, R3_ARG1);
1536       __ ld(tmp3, 16, R3_ARG1);
1537       __ ld(tmp4, 24, R3_ARG1);
1538       __ std(tmp1, 0, R4_ARG2);
1539       __ std(tmp2, 8, R4_ARG2);
1540       __ std(tmp3, 16, R4_ARG2);
1541       __ std(tmp4, 24, R4_ARG2);
1542       __ addi(R3_ARG1, R3_ARG1, 32);
1543       __ addi(R4_ARG2, R4_ARG2, 32);
1544       __ bdnz(l_6);
1545 
1546     } else { // Processor supports VSX, so use it to mass copy.
1547 
1548       // Prefetch the data into the L2 cache.
1549       __ dcbt(R3_ARG1, 0);
1550 
1551       // If supported set DSCR pre-fetch to deepest.
1552       if (VM_Version::has_mfdscr()) {
1553         __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1554         __ mtdscr(tmp2);
1555       }
1556 
1557       __ li(tmp1, 16);
1558 
1559       // Backbranch target aligned to 32 bytes rather than 16 bytes: the
1560       // loop contains fewer than 8 instructions, so with 32-byte alignment
1561       // it fits inside a single i-cache sector.
1562       __ align(32);
1563 
1564       __ bind(l_7);
1565       // Use loop with VSX load/store instructions to
1566       // copy 8 elements at a time.
1567       __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
1568       __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
1569       __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src + 16
1570       __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
1571       __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32
1572       __ addi(R4_ARG2, R4_ARG2, 32);       // Update dst+=32
1573       __ bdnz(l_7);                        // Dec CTR and loop if not zero.
1574 
1575       // Restore DSCR pre-fetch value.
1576       if (VM_Version::has_mfdscr()) {
1577         __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1578         __ mtdscr(tmp2);
1579       }
1580 
1581     } // VSX
1582    } // FasterArrayCopy
1583 
1584     // copy 1 element at a time
1585     __ bind(l_2);
1586     __ cmpwi(CCR0, R5_ARG3, 0);
1587     __ beq(CCR0, l_1);
1588 
1589     { // FasterArrayCopy
1590       __ mtctr(R5_ARG3);
1591       __ addi(R3_ARG1, R3_ARG1, -4);
1592       __ addi(R4_ARG2, R4_ARG2, -4);
1593 
1594       __ bind(l_3);
1595       __ lwzu(tmp2, 4, R3_ARG1);
1596       __ stwu(tmp2, 4, R4_ARG2);
1597       __ bdnz(l_3);
1598     }
1599 
1600     __ bind(l_1);
1601     return;
1602   }
1603 
1604   // Generate stub for disjoint int copy.  If "aligned" is true, the
1605   // "from" and "to" addresses are assumed to be heapword aligned.
1606   //
1607   // Arguments for generated stub:
1608   //      from:  R3_ARG1
1609   //      to:    R4_ARG2
1610   //      count: R5_ARG3 treated as signed
1611   //
1612   address generate_disjoint_int_copy(bool aligned, const char * name) {
1613     StubCodeMark mark(this, "StubRoutines", name);
1614     address start = __ function_entry();
1615     assert_positive_int(R5_ARG3);
1616     {
1617       // UnsafeCopyMemory page error: continue at UnsafeCopyMemory common_error_exit
1618       UnsafeCopyMemoryMark ucmm(this, !aligned, false);
1619       generate_disjoint_int_copy_core(aligned);
1620     }
1621     __ li(R3_RET, 0); // return 0
1622     __ blr();
1623     return start;
1624   }
1625 
1626   // Generate core code for conjoint int copy (and oop copy on
1627   // 32-bit).  If "aligned" is true, the "from" and "to" addresses
1628   // are assumed to be heapword aligned.
1629   //
1630   // Arguments:
1631   //      from:  R3_ARG1
1632   //      to:    R4_ARG2
1633   //      count: R5_ARG3 treated as signed
1634   //
1635   void generate_conjoint_int_copy_core(bool aligned) {
1636     // Do reverse copy.  We assume the case of actual overlap is rare enough
1637     // that we don't have to optimize it.
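         // A backward (high-to-low) copy is safe whenever dst overlaps the tail of
         // src; truly disjoint cases were already routed to the disjoint stub by
         // array_overlap_test in the caller.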
1638 
1639     Label l_1, l_2, l_3, l_4, l_5, l_6, l_7;
1640 
1641     Register tmp1 = R6_ARG4;
1642     Register tmp2 = R7_ARG5;
1643     Register tmp3 = R8_ARG6;
1644     Register tmp4 = R0;
1645 
1646     VectorSRegister tmp_vsr1  = VSR1;
1647     VectorSRegister tmp_vsr2  = VSR2;
1648 
1649     { // FasterArrayCopy
1650       __ cmpwi(CCR0, R5_ARG3, 0);
1651       __ beq(CCR0, l_6);
1652 
1653       __ sldi(R5_ARG3, R5_ARG3, 2);
1654       __ add(R3_ARG1, R3_ARG1, R5_ARG3);
1655       __ add(R4_ARG2, R4_ARG2, R5_ARG3);
1656       __ srdi(R5_ARG3, R5_ARG3, 2);
1657 
1658       if (!aligned) {
1659         // check if arrays have same alignment mod 8.
1660         __ xorr(tmp1, R3_ARG1, R4_ARG2);
1661         __ andi_(R0, tmp1, 7);
1662         // Not the same alignment, but ld and std only need to be 4-byte aligned.
1663         __ bne(CCR0, l_7); // 'to' or 'from' is 8-byte aligned -> copy 2 ints at a time.
1664 
1665         // copy 1 element to align to and from on an 8 byte boundary
1666         __ andi_(R0, R3_ARG1, 7);
1667         __ beq(CCR0, l_7);
1668 
1669         __ addi(R3_ARG1, R3_ARG1, -4);
1670         __ addi(R4_ARG2, R4_ARG2, -4);
1671         __ addi(R5_ARG3, R5_ARG3, -1);
1672         __ lwzx(tmp2, R3_ARG1);
1673         __ stwx(tmp2, R4_ARG2);
1674         __ bind(l_7);
1675       }
1676 
1677       __ cmpwi(CCR0, R5_ARG3, 7);
1678       __ ble(CCR0, l_5); // copy 1 at a time if less than 8 elements remain
1679 
1680       __ srdi(tmp1, R5_ARG3, 3);
1681       __ andi(R5_ARG3, R5_ARG3, 7);
1682       __ mtctr(tmp1);
1683 
1684      if (!VM_Version::has_vsx()) {
1685       __ bind(l_4);
1686       // Use unrolled version for mass copying (copy 8 elements at a time).
1687       // A load feeding a store has zero latency on Power6, but not on Power5.
1688       // Therefore, the following sequence is chosen to suit both.
1689       __ addi(R3_ARG1, R3_ARG1, -32);
1690       __ addi(R4_ARG2, R4_ARG2, -32);
1691       __ ld(tmp4, 24, R3_ARG1);
1692       __ ld(tmp3, 16, R3_ARG1);
1693       __ ld(tmp2, 8, R3_ARG1);
1694       __ ld(tmp1, 0, R3_ARG1);
1695       __ std(tmp4, 24, R4_ARG2);
1696       __ std(tmp3, 16, R4_ARG2);
1697       __ std(tmp2, 8, R4_ARG2);
1698       __ std(tmp1, 0, R4_ARG2);
1699       __ bdnz(l_4);
1700      } else {  // Processor supports VSX, so use it to mass copy.
1701       // Prefetch the data into the L2 cache.
1702       __ dcbt(R3_ARG1, 0);
1703 
1704       // If supported set DSCR pre-fetch to deepest.
1705       if (VM_Version::has_mfdscr()) {
1706         __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1707         __ mtdscr(tmp2);
1708       }
1709 
1710       __ li(tmp1, 16);
1711 
1712       // Backbranch target aligned to 32 bytes rather than 16 bytes: the
1713       // loop contains fewer than 8 instructions, so with 32-byte alignment
1714       // it fits inside a single i-cache sector.
1715       __ align(32);
1716 
1717       __ bind(l_4);
1718       // Use loop with VSX load/store instructions to
1719       // copy 8 elements at a time.
1720       __ addi(R3_ARG1, R3_ARG1, -32);      // Update src-=32
1721       __ addi(R4_ARG2, R4_ARG2, -32);      // Update dst-=32
1722       __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src+16
1723       __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
1724       __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst+16
1725       __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
1726       __ bdnz(l_4);
1727 
1728       // Restore DSCR pre-fetch value.
1729       if (VM_Version::has_mfdscr()) {
1730         __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1731         __ mtdscr(tmp2);
1732       }
1733      }
1734 
1735       __ cmpwi(CCR0, R5_ARG3, 0);
1736       __ beq(CCR0, l_6);
1737 
1738       __ bind(l_5);
1739       __ mtctr(R5_ARG3);
1740       __ bind(l_3);
1741       __ lwz(R0, -4, R3_ARG1);
1742       __ stw(R0, -4, R4_ARG2);
1743       __ addi(R3_ARG1, R3_ARG1, -4);
1744       __ addi(R4_ARG2, R4_ARG2, -4);
1745       __ bdnz(l_3);
1746 
1747       __ bind(l_6);
1748     }
1749   }
1750 
1751   // Generate stub for conjoint int copy.  If "aligned" is true, the
1752   // "from" and "to" addresses are assumed to be heapword aligned.
1753   //
1754   // Arguments for generated stub:
1755   //      from:  R3_ARG1
1756   //      to:    R4_ARG2
1757   //      count: R5_ARG3 treated as signed
1758   //
1759   address generate_conjoint_int_copy(bool aligned, const char * name) {
1760     StubCodeMark mark(this, "StubRoutines", name);
1761     address start = __ function_entry();
1762     assert_positive_int(R5_ARG3);
1763     address nooverlap_target = aligned ?
1764       STUB_ENTRY(arrayof_jint_disjoint_arraycopy) :
1765       STUB_ENTRY(jint_disjoint_arraycopy);
1766 
1767     array_overlap_test(nooverlap_target, 2);
1768     {
1769       // UnsafeCopyMemory page error: continue at UnsafeCopyMemory common_error_exit
1770       UnsafeCopyMemoryMark ucmm(this, !aligned, false);
1771       generate_conjoint_int_copy_core(aligned);
1772     }
1773 
1774     __ li(R3_RET, 0); // return 0
1775     __ blr();
1776 
1777     return start;
1778   }
1779 
1780   // Generate core code for disjoint long copy (and oop copy on
1781   // 64-bit).  If "aligned" is true, the "from" and "to" addresses
1782   // are assumed to be heapword aligned.
1783   //
1784   // Arguments:
1785   //      from:  R3_ARG1
1786   //      to:    R4_ARG2
1787   //      count: R5_ARG3 treated as signed
1788   //
1789   void generate_disjoint_long_copy_core(bool aligned) {
1790     Register tmp1 = R6_ARG4;
1791     Register tmp2 = R7_ARG5;
1792     Register tmp3 = R8_ARG6;
1793     Register tmp4 = R0;
1794 
1795     Label l_1, l_2, l_3, l_4, l_5;
1796 
1797     VectorSRegister tmp_vsr1  = VSR1;
1798     VectorSRegister tmp_vsr2  = VSR2;
1799 
1800     { // FasterArrayCopy
1801       __ cmpwi(CCR0, R5_ARG3, 3);
1802       __ ble(CCR0, l_3); // copy 1 at a time if less than 4 elements remain
1803 
1804       __ srdi(tmp1, R5_ARG3, 2);
1805       __ andi_(R5_ARG3, R5_ARG3, 3);
1806       __ mtctr(tmp1);
1807 
1808     if (!VM_Version::has_vsx()) {
1809       __ bind(l_4);
1810       // Use unrolled version for mass copying (copy 4 elements at a time).
1811       // A load feeding a store has zero latency on Power6, but not on Power5.
1812       // Therefore, the following sequence is chosen to suit both.
1813       __ ld(tmp1, 0, R3_ARG1);
1814       __ ld(tmp2, 8, R3_ARG1);
1815       __ ld(tmp3, 16, R3_ARG1);
1816       __ ld(tmp4, 24, R3_ARG1);
1817       __ std(tmp1, 0, R4_ARG2);
1818       __ std(tmp2, 8, R4_ARG2);
1819       __ std(tmp3, 16, R4_ARG2);
1820       __ std(tmp4, 24, R4_ARG2);
1821       __ addi(R3_ARG1, R3_ARG1, 32);
1822       __ addi(R4_ARG2, R4_ARG2, 32);
1823       __ bdnz(l_4);
1824 
1825     } else { // Processor supports VSX, so use it to mass copy.
1826 
1827       // Prefetch the data into the L2 cache.
1828       __ dcbt(R3_ARG1, 0);
1829 
1830       // If supported set DSCR pre-fetch to deepest.
1831       if (VM_Version::has_mfdscr()) {
1832         __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1833         __ mtdscr(tmp2);
1834       }
1835 
1836       __ li(tmp1, 16);
1837 
1838       // Backbranch target aligned to 32 bytes rather than 16 bytes: the
1839       // loop contains fewer than 8 instructions, so with 32-byte alignment
1840       // it fits inside a single i-cache sector.
1841       __ align(32);
1842 
1843       __ bind(l_5);
1844       // Use loop with VSX load/store instructions to
1845       // copy 4 elements at a time.
1846       __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
1847       __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
1848       __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src + 16
1849       __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst + 16
1850       __ addi(R3_ARG1, R3_ARG1, 32);       // Update src+=32
1851       __ addi(R4_ARG2, R4_ARG2, 32);       // Update dst+=32
1852       __ bdnz(l_5);                        // Dec CTR and loop if not zero.
1853 
1854       // Restore DSCR pre-fetch value.
1855       if (VM_Version::has_mfdscr()) {
1856         __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1857         __ mtdscr(tmp2);
1858       }
1859 
1860     } // VSX
1861    } // FasterArrayCopy
1862 
1863     // copy 1 element at a time
1864     __ bind(l_3);
1865     __ cmpwi(CCR0, R5_ARG3, 0);
1866     __ beq(CCR0, l_1);
1867 
1868     { // FasterArrayCopy
1869       __ mtctr(R5_ARG3);
1870       __ addi(R3_ARG1, R3_ARG1, -8);
1871       __ addi(R4_ARG2, R4_ARG2, -8);
1872 
1873       __ bind(l_2);
1874       __ ldu(R0, 8, R3_ARG1);
1875       __ stdu(R0, 8, R4_ARG2);
1876       __ bdnz(l_2);
1877 
1878     }
1879     __ bind(l_1);
1880   }
1881 
1882   // Generate stub for disjoint long copy.  If "aligned" is true, the
1883   // "from" and "to" addresses are assumed to be heapword aligned.
1884   //
1885   // Arguments for generated stub:
1886   //      from:  R3_ARG1
1887   //      to:    R4_ARG2
1888   //      count: R5_ARG3 treated as signed
1889   //
1890   address generate_disjoint_long_copy(bool aligned, const char * name) {
1891     StubCodeMark mark(this, "StubRoutines", name);
1892     address start = __ function_entry();
1893     assert_positive_int(R5_ARG3);
1894     {
1895       // UnsafeCopyMemory page error: continue at UnsafeCopyMemory common_error_exit
1896       UnsafeCopyMemoryMark ucmm(this, !aligned, false);
1897       generate_disjoint_long_copy_core(aligned);
1898     }
1899     __ li(R3_RET, 0); // return 0
1900     __ blr();
1901 
1902     return start;
1903   }
1904 
1905   // Generate core code for conjoint long copy (and oop copy on
1906   // 64-bit).  If "aligned" is true, the "from" and "to" addresses
1907   // are assumed to be heapword aligned.
1908   //
1909   // Arguments:
1910   //      from:  R3_ARG1
1911   //      to:    R4_ARG2
1912   //      count: R5_ARG3 treated as signed
1913   //
1914   void generate_conjoint_long_copy_core(bool aligned) {
1915     Register tmp1 = R6_ARG4;
1916     Register tmp2 = R7_ARG5;
1917     Register tmp3 = R8_ARG6;
1918     Register tmp4 = R0;
1919 
1920     VectorSRegister tmp_vsr1  = VSR1;
1921     VectorSRegister tmp_vsr2  = VSR2;
1922 
1923     Label l_1, l_2, l_3, l_4, l_5;
1924 
1925     __ cmpwi(CCR0, R5_ARG3, 0);
1926     __ beq(CCR0, l_1);
1927 
1928     { // FasterArrayCopy
1929       __ sldi(R5_ARG3, R5_ARG3, 3);
1930       __ add(R3_ARG1, R3_ARG1, R5_ARG3);
1931       __ add(R4_ARG2, R4_ARG2, R5_ARG3);
1932       __ srdi(R5_ARG3, R5_ARG3, 3);
1933 
1934       __ cmpwi(CCR0, R5_ARG3, 3);
1935       __ ble(CCR0, l_5); // copy 1 at a time if less than 4 elements remain
1936 
1937       __ srdi(tmp1, R5_ARG3, 2);
1938       __ andi(R5_ARG3, R5_ARG3, 3);
1939       __ mtctr(tmp1);
1940 
1941      if (!VM_Version::has_vsx()) {
1942       __ bind(l_4);
1943       // Use unrolled version for mass copying (copy 4 elements at a time).
1944       // A load feeding a store has zero latency on Power6, but not on Power5.
1945       // Therefore, the following sequence is chosen to suit both.
1946       __ addi(R3_ARG1, R3_ARG1, -32);
1947       __ addi(R4_ARG2, R4_ARG2, -32);
1948       __ ld(tmp4, 24, R3_ARG1);
1949       __ ld(tmp3, 16, R3_ARG1);
1950       __ ld(tmp2, 8, R3_ARG1);
1951       __ ld(tmp1, 0, R3_ARG1);
1952       __ std(tmp4, 24, R4_ARG2);
1953       __ std(tmp3, 16, R4_ARG2);
1954       __ std(tmp2, 8, R4_ARG2);
1955       __ std(tmp1, 0, R4_ARG2);
1956       __ bdnz(l_4);
1957      } else { // Processor supports VSX, so use it to mass copy.
1958       // Prefetch the data into the L2 cache.
1959       __ dcbt(R3_ARG1, 0);
1960 
1961       // If supported set DSCR pre-fetch to deepest.
1962       if (VM_Version::has_mfdscr()) {
1963         __ load_const_optimized(tmp2, VM_Version::_dscr_val | 7);
1964         __ mtdscr(tmp2);
1965       }
1966 
1967       __ li(tmp1, 16);
1968 
1969       // Backbranch target aligned to 32 bytes rather than 16 bytes: the
1970       // loop contains fewer than 8 instructions, so with 32-byte alignment
1971       // it fits inside a single i-cache sector.
1972       __ align(32);
1973 
1974       __ bind(l_4);
1975       // Use loop with VSX load/store instructions to
1976       // copy 4 elements at a time.
1977       __ addi(R3_ARG1, R3_ARG1, -32);      // Update src-=32
1978       __ addi(R4_ARG2, R4_ARG2, -32);      // Update dst-=32
1979       __ lxvd2x(tmp_vsr2, tmp1, R3_ARG1);  // Load src+16
1980       __ lxvd2x(tmp_vsr1, R3_ARG1);        // Load src
1981       __ stxvd2x(tmp_vsr2, tmp1, R4_ARG2); // Store to dst+16
1982       __ stxvd2x(tmp_vsr1, R4_ARG2);       // Store to dst
1983       __ bdnz(l_4);
1984 
1985       // Restore DSCR pre-fetch value.
1986       if (VM_Version::has_mfdscr()) {
1987         __ load_const_optimized(tmp2, VM_Version::_dscr_val);
1988         __ mtdscr(tmp2);
1989       }
1990      }
1991 
1992       __ cmpwi(CCR0, R5_ARG3, 0);
1993       __ beq(CCR0, l_1);
1994 
1995       __ bind(l_5);
1996       __ mtctr(R5_ARG3);
1997       __ bind(l_3);
1998       __ ld(R0, -8, R3_ARG1);
1999       __ std(R0, -8, R4_ARG2);
2000       __ addi(R3_ARG1, R3_ARG1, -8);
2001       __ addi(R4_ARG2, R4_ARG2, -8);
2002       __ bdnz(l_3);
2003 
2004     }
2005     __ bind(l_1);
2006   }
2007 
2008   // Generate stub for conjoint long copy.  If "aligned" is true, the
2009   // "from" and "to" addresses are assumed to be heapword aligned.
2010   //
2011   // Arguments for generated stub:
2012   //      from:  R3_ARG1
2013   //      to:    R4_ARG2
2014   //      count: R5_ARG3 treated as signed
2015   //
2016   address generate_conjoint_long_copy(bool aligned, const char * name) {
2017     StubCodeMark mark(this, "StubRoutines", name);
2018     address start = __ function_entry();
2019     assert_positive_int(R5_ARG3);
2020     address nooverlap_target = aligned ?
2021       STUB_ENTRY(arrayof_jlong_disjoint_arraycopy) :
2022       STUB_ENTRY(jlong_disjoint_arraycopy);
2023 
2024     array_overlap_test(nooverlap_target, 3);
2025     {
2026       // UnsafeCopyMemory page error: continue at UnsafeCopyMemory common_error_exit
2027       UnsafeCopyMemoryMark ucmm(this, !aligned, false);
2028       generate_conjoint_long_copy_core(aligned);
2029     }
2030     __ li(R3_RET, 0); // return 0
2031     __ blr();
2032 
2033     return start;
2034   }
2035 
2036   // Generate stub for conjoint oop copy.  If "aligned" is true, the
2037   // "from" and "to" addresses are assumed to be heapword aligned.
2038   //
2039   // Arguments for generated stub:
2040   //      from:  R3_ARG1
2041   //      to:    R4_ARG2
2042   //      count: R5_ARG3 treated as signed
2043   //      dest_uninitialized: G1 support
2044   //
2045   address generate_conjoint_oop_copy(bool aligned, const char * name, bool dest_uninitialized) {
2046     StubCodeMark mark(this, "StubRoutines", name);
2047 
2048     address start = __ function_entry();
2049     assert_positive_int(R5_ARG3);
2050     address nooverlap_target = aligned ?
2051       STUB_ENTRY(arrayof_oop_disjoint_arraycopy) :
2052       STUB_ENTRY(oop_disjoint_arraycopy);
2053 
2054     DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2055     if (dest_uninitialized) {
2056       decorators |= IS_DEST_UNINITIALIZED;
2057     }
2058     if (aligned) {
2059       decorators |= ARRAYCOPY_ALIGNED;
2060     }
2061 
2062     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2063     bs->arraycopy_prologue(_masm, decorators, T_OBJECT, R3_ARG1, R4_ARG2, R5_ARG3, noreg, noreg);
2064 
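         // Compressed oops are 4 bytes wide, so the jint copy core (and log2 element
         // size 2 for the overlap test) is reused; otherwise oops are 8 bytes and the
         // jlong core applies.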
2065     if (UseCompressedOops) {
2066       array_overlap_test(nooverlap_target, 2);
2067       generate_conjoint_int_copy_core(aligned);
2068     } else {
2069       array_overlap_test(nooverlap_target, 3);
2070       generate_conjoint_long_copy_core(aligned);
2071     }
2072 
2073     bs->arraycopy_epilogue(_masm, decorators, T_OBJECT, R4_ARG2, R5_ARG3, noreg);
2074     __ li(R3_RET, 0); // return 0
2075     __ blr();
2076     return start;
2077   }
2078 
2079   // Generate stub for disjoint oop copy.  If "aligned" is true, the
2080   // "from" and "to" addresses are assumed to be heapword aligned.
2081   //
2082   // Arguments for generated stub:
2083   //      from:  R3_ARG1
2084   //      to:    R4_ARG2
2085   //      count: R5_ARG3 treated as signed
2086   //      dest_uninitialized: G1 support
2087   //
2088   address generate_disjoint_oop_copy(bool aligned, const char * name, bool dest_uninitialized) {
2089     StubCodeMark mark(this, "StubRoutines", name);
2090     address start = __ function_entry();
2091     assert_positive_int(R5_ARG3);
2092 
2093     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
2094     if (dest_uninitialized) {
2095       decorators |= IS_DEST_UNINITIALIZED;
2096     }
2097     if (aligned) {
2098       decorators |= ARRAYCOPY_ALIGNED;
2099     }
2100 
2101     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2102     bs->arraycopy_prologue(_masm, decorators, T_OBJECT, R3_ARG1, R4_ARG2, R5_ARG3, noreg, noreg);
2103 
2104     if (UseCompressedOops) {
2105       generate_disjoint_int_copy_core(aligned);
2106     } else {
2107       generate_disjoint_long_copy_core(aligned);
2108     }
2109 
2110     bs->arraycopy_epilogue(_masm, decorators, T_OBJECT, R4_ARG2, R5_ARG3, noreg);
2111     __ li(R3_RET, 0); // return 0
2112     __ blr();
2113 
2114     return start;
2115   }
2116 
2117 
2118   // Helper for generating a dynamic type check.
2119   // Smashes only the given temp registers.
2120   void generate_type_check(Register sub_klass,
2121                            Register super_check_offset,
2122                            Register super_klass,
2123                            Register temp,
2124                            Label& L_success) {
2125     assert_different_registers(sub_klass, super_check_offset, super_klass);
2126 
2127     BLOCK_COMMENT("type_check:");
2128 
2129     Label L_miss;
2130 
2131     __ check_klass_subtype_fast_path(sub_klass, super_klass, temp, R0, &L_success, &L_miss, NULL,
2132                                      super_check_offset);
2133     __ check_klass_subtype_slow_path(sub_klass, super_klass, temp, R0, &L_success, NULL);
2134 
2135     // Fall through on failure!
2136     __ bind(L_miss);
2137   }
2138 
2139 
2140   //  Generate stub for checked oop copy.
2141   //
2142   // Arguments for generated stub:
2143   //      from:  R3
2144   //      to:    R4
2145   //      count: R5 treated as signed
2146   //      ckoff: R6 (super_check_offset)
2147   //      ckval: R7 (super_klass)
2148   //      ret:   R3 zero for success; (-1^K) where K is partial transfer count
2149   //
2150   address generate_checkcast_copy(const char *name, bool dest_uninitialized) {
2151 
2152     const Register R3_from   = R3_ARG1;      // source array address
2153     const Register R4_to     = R4_ARG2;      // destination array address
2154     const Register R5_count  = R5_ARG3;      // elements count
2155     const Register R6_ckoff  = R6_ARG4;      // super_check_offset
2156     const Register R7_ckval  = R7_ARG5;      // super_klass
2157 
2158     const Register R8_offset = R8_ARG6;      // loop var, with stride wordSize
2159     const Register R9_remain = R9_ARG7;      // loop var, with stride -1
2160     const Register R10_oop   = R10_ARG8;     // actual oop copied
2161     const Register R11_klass = R11_scratch1; // oop._klass
2162     const Register R12_tmp   = R12_scratch2;
2163 
2164     const Register R2_minus1 = R2;
2165 
2166     //__ align(CodeEntryAlignment);
2167     StubCodeMark mark(this, "StubRoutines", name);
2168     address start = __ function_entry();
2169 
2170     // Assert that int is 64 bit sign extended and arrays are not conjoint.
2171 #ifdef ASSERT
2172     {
2173     assert_positive_int(R5_ARG3);
2174     const Register tmp1 = R11_scratch1, tmp2 = R12_scratch2;
2175     Label no_overlap;
2176     __ subf(tmp1, R3_ARG1, R4_ARG2); // distance in bytes
2177     __ sldi(tmp2, R5_ARG3, LogBytesPerHeapOop); // size in bytes
2178     __ cmpld(CCR0, R3_ARG1, R4_ARG2); // Use unsigned comparison!
2179     __ cmpld(CCR1, tmp1, tmp2);
2180     __ crnand(CCR0, Assembler::less, CCR1, Assembler::less);
2181     // Overlaps if src is before dst and the distance is smaller than the size.
2182     // Branch to forward copy routine otherwise.
2183     __ blt(CCR0, no_overlap);
2184     __ stop("overlap in checkcast_copy");
2185     __ bind(no_overlap);
2186     }
2187 #endif
2188 
2189     DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST;
2190     if (dest_uninitialized) {
2191       decorators |= IS_DEST_UNINITIALIZED;
2192     }
2193 
2194     BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2195     bs->arraycopy_prologue(_masm, decorators, T_OBJECT, R3_from, R4_to, R5_count, /* preserve: */ R6_ckoff, R7_ckval);
2196 
2197     //inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, R12_tmp, R3_RET);
2198 
2199     Label load_element, store_element, store_null, success, do_epilogue;
2200     __ or_(R9_remain, R5_count, R5_count); // Initialize loop index, and test it.
2201     __ li(R8_offset, 0);                   // Offset from start of arrays.
2202     __ li(R2_minus1, -1);
2203     __ bne(CCR0, load_element);
2204 
2205     // Empty array: Nothing to do.
2206     __ li(R3_RET, 0);           // Return 0 on (trivial) success.
2207     __ blr();
2208 
2209     // ======== begin loop ========
2210     // (Entry is load_element.)
2211     __ align(OptoLoopAlignment);
2212     __ bind(store_element);
2213     if (UseCompressedOops) {
2214       __ encode_heap_oop_not_null(R10_oop);
2215       __ bind(store_null);
2216       __ stw(R10_oop, R8_offset, R4_to);
2217     } else {
2218       __ bind(store_null);
2219       __ std(R10_oop, R8_offset, R4_to);
2220     }
2221 
2222     __ addi(R8_offset, R8_offset, heapOopSize);   // Step to next offset.
2223     __ add_(R9_remain, R2_minus1, R9_remain);     // Decrement the count.
2224     __ beq(CCR0, success);
2225 
2226     // ======== loop entry is here ========
2227     __ bind(load_element);
2228     __ load_heap_oop(R10_oop, R8_offset, R3_from, R12_tmp, noreg, false, AS_RAW, &store_null);
2229 
2230     __ load_klass(R11_klass, R10_oop); // Query the object klass.
2231 
2232     generate_type_check(R11_klass, R6_ckoff, R7_ckval, R12_tmp,
2233                         // Branch to this on success:
2234                         store_element);
2235     // ======== end loop ========
2236 
2237     // It was a real error; we must depend on the caller to finish the job.
2238     // Register R9_remain has number of *remaining* oops, R5_count number of *total* oops.
2239     // Emit GC store barriers for the oops we have copied (R5_count minus R9_remain),
2240     // and report their number to the caller.
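         // nand(K, K) == ~K == -1 ^ K, so the number of elements already copied is
         // reported to the caller as its bitwise complement (see the stub header comment).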
2241     __ subf_(R5_count, R9_remain, R5_count);
2242     __ nand(R3_RET, R5_count, R5_count);   // report (-1^K) to caller
2243     __ bne(CCR0, do_epilogue);
2244     __ blr();
2245 
2246     __ bind(success);
2247     __ li(R3_RET, 0);
2248 
2249     __ bind(do_epilogue);
2250     bs->arraycopy_epilogue(_masm, decorators, T_OBJECT, R4_to, R5_count, /* preserve */ R3_RET);
2251 
2252     __ blr();
2253     return start;
2254   }
2255 
2256 
2257   //  Generate 'unsafe' array copy stub.
2258   //  Though just as safe as the other stubs, it takes an unscaled
2259   //  size_t argument instead of an element count.
2260   //
2261   // Arguments for generated stub:
2262   //      from:  R3
2263   //      to:    R4
2264   //      count: R5 byte count, treated as ssize_t, can be zero
2265   //
2266   // Examines the alignment of the operands and dispatches
2267   // to a long, int, short, or byte copy loop.
2268   //
2269   address generate_unsafe_copy(const char* name,
2270                                address byte_copy_entry,
2271                                address short_copy_entry,
2272                                address int_copy_entry,
2273                                address long_copy_entry) {
2274 
2275     const Register R3_from   = R3_ARG1;      // source array address
2276     const Register R4_to     = R4_ARG2;      // destination array address
2277     const Register R5_count  = R5_ARG3;      // elements count (as long on PPC64)
2278 
2279     const Register R6_bits   = R6_ARG4;      // test copy of low bits
2280     const Register R7_tmp    = R7_ARG5;
2281 
2282     //__ align(CodeEntryAlignment);
2283     StubCodeMark mark(this, "StubRoutines", name);
2284     address start = __ function_entry();
2285 
2286     // Bump this on entry, not on exit:
2287     //inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr, R6_bits, R7_tmp);
2288 
2289     Label short_copy, int_copy, long_copy;
2290 
2291     __ orr(R6_bits, R3_from, R4_to);
2292     __ orr(R6_bits, R6_bits, R5_count);
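         // The OR of 'from', 'to' and the byte count has a low-order bit set iff at
         // least one of them is misaligned for the element size tested below.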
2293     __ andi_(R0, R6_bits, (BytesPerLong-1));
2294     __ beq(CCR0, long_copy);
2295 
2296     __ andi_(R0, R6_bits, (BytesPerInt-1));
2297     __ beq(CCR0, int_copy);
2298 
2299     __ andi_(R0, R6_bits, (BytesPerShort-1));
2300     __ beq(CCR0, short_copy);
2301 
2302     // byte_copy:
2303     __ b(byte_copy_entry);
2304 
2305     __ bind(short_copy);
2306     __ srwi(R5_count, R5_count, LogBytesPerShort);
2307     __ b(short_copy_entry);
2308 
2309     __ bind(int_copy);
2310     __ srwi(R5_count, R5_count, LogBytesPerInt);
2311     __ b(int_copy_entry);
2312 
2313     __ bind(long_copy);
2314     __ srwi(R5_count, R5_count, LogBytesPerLong);
2315     __ b(long_copy_entry);
2316 
2317     return start;
2318   }
2319 
2320 
2321   // Perform range checks on the proposed arraycopy.
2322   // Kills the two temps, but nothing else.
2323   // Also, clean the sign bits of src_pos and dst_pos.
2324   void arraycopy_range_checks(Register src,     // source array oop
2325                               Register src_pos, // source position
2326                               Register dst,     // destination array oop
2327                               Register dst_pos, // destination position
2328                               Register length,  // length of copy
2329                               Register temp1, Register temp2,
2330                               Label& L_failed) {
2331     BLOCK_COMMENT("arraycopy_range_checks:");
2332 
2333     const Register array_length = temp1;  // scratch
2334     const Register end_pos      = temp2;  // scratch
2335 
2336     //  if (src_pos + length > arrayOop(src)->length() ) FAIL;
2337     __ lwa(array_length, arrayOopDesc::length_offset_in_bytes(), src);
2338     __ add(end_pos, src_pos, length);  // src_pos + length
2339     __ cmpd(CCR0, end_pos, array_length);
2340     __ bgt(CCR0, L_failed);
2341 
2342     //  if (dst_pos + length > arrayOop(dst)->length() ) FAIL;
2343     __ lwa(array_length, arrayOopDesc::length_offset_in_bytes(), dst);
2344     __ add(end_pos, dst_pos, length);  // dst_pos + length
2345     __ cmpd(CCR0, end_pos, array_length);
2346     __ bgt(CCR0, L_failed);
2347 
2348     BLOCK_COMMENT("arraycopy_range_checks done");
2349   }
2350 
2351 
2352   //
2353   //  Generate generic array copy stubs
2354   //
2355   //  Input:
2356   //    R3    -  src oop
2357   //    R4    -  src_pos
2358   //    R5    -  dst oop
2359   //    R6    -  dst_pos
2360   //    R7    -  element count
2361   //
2362   //  Output:
2363   //    R3 ==  0  -  success
2364   //    R3 == -1  -  need to call System.arraycopy
2365   //
2366   address generate_generic_copy(const char *name,
2367                                 address entry_jbyte_arraycopy,
2368                                 address entry_jshort_arraycopy,
2369                                 address entry_jint_arraycopy,
2370                                 address entry_oop_arraycopy,
2371                                 address entry_disjoint_oop_arraycopy,
2372                                 address entry_jlong_arraycopy,
2373                                 address entry_checkcast_arraycopy) {
2374     Label L_failed, L_objArray;
2375 
2376     // Input registers
2377     const Register src       = R3_ARG1;  // source array oop
2378     const Register src_pos   = R4_ARG2;  // source position
2379     const Register dst       = R5_ARG3;  // destination array oop
2380     const Register dst_pos   = R6_ARG4;  // destination position
2381     const Register length    = R7_ARG5;  // elements count
2382 
2383     // registers used as temp
2384     const Register src_klass = R8_ARG6;  // source array klass
2385     const Register dst_klass = R9_ARG7;  // destination array klass
2386     const Register lh        = R10_ARG8; // layout helper
2387     const Register temp      = R2;
2388 
2389     //__ align(CodeEntryAlignment);
2390     StubCodeMark mark(this, "StubRoutines", name);
2391     address start = __ function_entry();
2392 
2393     // Bump this on entry, not on exit:
2394     //inc_counter_np(SharedRuntime::_generic_array_copy_ctr, lh, temp);
2395 
2396     // In principle, the int arguments could be dirty.
2397 
2398     //-----------------------------------------------------------------------
2399     // Assembler stubs will be used for this call to arraycopy
2400     // if the following conditions are met:
2401     //
2402     // (1) src and dst must not be null.
2403     // (2) src_pos must not be negative.
2404     // (3) dst_pos must not be negative.
2405     // (4) length  must not be negative.
2406     // (5) src klass and dst klass should be the same and not NULL.
2407     // (6) src and dst should be arrays.
2408     // (7) src_pos + length must not exceed length of src.
2409     // (8) dst_pos + length must not exceed length of dst.
2410     BLOCK_COMMENT("arraycopy initial argument checks");
2411 
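         // The cror sequence below folds every failure condition (null src or dst,
         // negative src_pos, dst_pos or length) into CCR1.eq, so a single beq can
         // bail out to L_failed.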
2412     __ cmpdi(CCR1, src, 0);      // if (src == NULL) return -1;
2413     __ extsw_(src_pos, src_pos); // if (src_pos < 0) return -1;
2414     __ cmpdi(CCR5, dst, 0);      // if (dst == NULL) return -1;
2415     __ cror(CCR1, Assembler::equal, CCR0, Assembler::less);
2416     __ extsw_(dst_pos, dst_pos); // if (dst_pos < 0) return -1;
2417     __ cror(CCR5, Assembler::equal, CCR0, Assembler::less);
2418     __ extsw_(length, length);   // if (length < 0) return -1;
2419     __ cror(CCR1, Assembler::equal, CCR5, Assembler::equal);
2420     __ cror(CCR1, Assembler::equal, CCR0, Assembler::less);
2421     __ beq(CCR1, L_failed);
2422 
2423     BLOCK_COMMENT("arraycopy argument klass checks");
2424     __ load_klass(src_klass, src);
2425     __ load_klass(dst_klass, dst);
2426 
2427     // Load layout helper
2428     //
2429     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2430     // 32        30    24            16              8     2                 0
2431     //
2432     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2433     //
2434 
2435     int lh_offset = in_bytes(Klass::layout_helper_offset());
2436 
2437     // Load the 32-bit signed layout helper value.
2438     __ lwz(lh, lh_offset, src_klass);
2439 
2440     // Handle objArrays completely differently...
2441     jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2442     __ load_const_optimized(temp, objArray_lh, R0);
2443     __ cmpw(CCR0, lh, temp);
2444     __ beq(CCR0, L_objArray);
2445 
2446     __ cmpd(CCR5, src_klass, dst_klass);          // if (src->klass() != dst->klass()) return -1;
2447     __ cmpwi(CCR6, lh, Klass::_lh_neutral_value); // if (!src->is_Array()) return -1;
2448 
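         // CCR5.eq := !(klasses equal && lh < _lh_neutral_value); the beq below thus
         // fails unless src and dst share a klass and that klass is an array.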
2449     __ crnand(CCR5, Assembler::equal, CCR6, Assembler::less);
2450     __ beq(CCR5, L_failed);
2451 
2452     // At this point, it is known to be a typeArray (array_tag 0x3).
2453 #ifdef ASSERT
2454     { Label L;
2455       jint lh_prim_tag_in_place = (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2456       __ load_const_optimized(temp, lh_prim_tag_in_place, R0);
2457       __ cmpw(CCR0, lh, temp);
2458       __ bge(CCR0, L);
2459       __ stop("must be a primitive array");
2460       __ bind(L);
2461     }
2462 #endif
2463 
2464     arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
2465                            temp, dst_klass, L_failed);
2466 
2467     // TypeArrayKlass
2468     //
2469     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2470     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2471     //
2472 
2473     const Register offset = dst_klass;    // array offset
2474     const Register elsize = src_klass;    // log2 element size
2475 
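         // Decode the layout helper: rldicl rotates the header_size field into the
         // low-order bits and masks it (header size in bytes); andi extracts the
         // log2 element size from the low-order bits.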
2476     __ rldicl(offset, lh, 64 - Klass::_lh_header_size_shift, 64 - exact_log2(Klass::_lh_header_size_mask + 1));
2477     __ andi(elsize, lh, Klass::_lh_log2_element_size_mask);
2478     __ add(src, offset, src);       // src array offset
2479     __ add(dst, offset, dst);       // dst array offset
2480 
2481     // Next registers should be set before the jump to corresponding stub.
2482     const Register from     = R3_ARG1;  // source array address
2483     const Register to       = R4_ARG2;  // destination array address
2484     const Register count    = R5_ARG3;  // elements count
2485 
2486     // 'from', 'to', 'count' registers should be set in this order
2487     // since they are the same as 'src', 'src_pos', 'dst'.
2488 
2489     BLOCK_COMMENT("scale indexes to element size");
2490     __ sld(src_pos, src_pos, elsize);
2491     __ sld(dst_pos, dst_pos, elsize);
2492     __ add(from, src_pos, src);  // src_addr
2493     __ add(to, dst_pos, dst);    // dst_addr
2494     __ mr(count, length);        // length
2495 
2496     BLOCK_COMMENT("choose copy loop based on element size");
2497     // Using conditional branches with range 32kB.
2498     const int bo = Assembler::bcondCRbiIs1, bi = Assembler::bi0(CCR0, Assembler::equal);
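         // bc(bo, bi, target) with bcondCRbiIs1 and the CCR0 'equal' bit acts as a
         // beq whose 16-bit displacement (+/-32 kB) can reach the external stub entries.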
2499     __ cmpwi(CCR0, elsize, 0);
2500     __ bc(bo, bi, entry_jbyte_arraycopy);
2501     __ cmpwi(CCR0, elsize, LogBytesPerShort);
2502     __ bc(bo, bi, entry_jshort_arraycopy);
2503     __ cmpwi(CCR0, elsize, LogBytesPerInt);
2504     __ bc(bo, bi, entry_jint_arraycopy);
2505 #ifdef ASSERT
2506     { Label L;
2507       __ cmpwi(CCR0, elsize, LogBytesPerLong);
2508       __ beq(CCR0, L);
2509       __ stop("must be long copy, but elsize is wrong");
2510       __ bind(L);
2511     }
2512 #endif
2513     __ b(entry_jlong_arraycopy);
2514 
2515     // ObjArrayKlass
2516   __ bind(L_objArray);
2517     // live at this point:  src_klass, dst_klass, src[_pos], dst[_pos], length
2518 
2519     Label L_disjoint_plain_copy, L_checkcast_copy;
2520     //  test array classes for subtyping
2521     __ cmpd(CCR0, src_klass, dst_klass);         // usual case is exact equality
2522     __ bne(CCR0, L_checkcast_copy);
2523 
2524     // Identically typed arrays can be copied without element-wise checks.
2525     arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
2526                            temp, lh, L_failed);
2527 
2528     __ addi(src, src, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //src offset
2529     __ addi(dst, dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //dst offset
2530     __ sldi(src_pos, src_pos, LogBytesPerHeapOop);
2531     __ sldi(dst_pos, dst_pos, LogBytesPerHeapOop);
2532     __ add(from, src_pos, src);  // src_addr
2533     __ add(to, dst_pos, dst);    // dst_addr
2534     __ mr(count, length);        // length
2535     __ b(entry_oop_arraycopy);
2536 
2537   __ bind(L_checkcast_copy);
2538     // live at this point:  src_klass, dst_klass
2539     {
2540       // Before looking at dst.length, make sure dst is also an objArray.
2541       __ lwz(temp, lh_offset, dst_klass);
2542       __ cmpw(CCR0, lh, temp);
2543       __ bne(CCR0, L_failed);
2544 
2545       // It is safe to examine both src.length and dst.length.
2546       arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
2547                              temp, lh, L_failed);
2548 
2549       // Marshal the base address arguments now, freeing registers.
2550       __ addi(src, src, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //src offset
2551       __ addi(dst, dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); //dst offset
2552       __ sldi(src_pos, src_pos, LogBytesPerHeapOop);
2553       __ sldi(dst_pos, dst_pos, LogBytesPerHeapOop);
2554       __ add(from, src_pos, src);  // src_addr
2555       __ add(to, dst_pos, dst);    // dst_addr
2556       __ mr(count, length);        // length
2557 
2558       Register sco_temp = R6_ARG4;             // This register is free now.
2559       assert_different_registers(from, to, count, sco_temp,
2560                                  dst_klass, src_klass);
2561 
2562       // Generate the type check.
2563       int sco_offset = in_bytes(Klass::super_check_offset_offset());
2564       __ lwz(sco_temp, sco_offset, dst_klass);
2565       generate_type_check(src_klass, sco_temp, dst_klass,
2566                           temp, L_disjoint_plain_copy);
2567 
2568       // Fetch destination element klass from the ObjArrayKlass header.
2569       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2570 
2571       // The checkcast_copy loop needs two extra arguments:
2572       __ ld(R7_ARG5, ek_offset, dst_klass);   // dest elem klass
2573       __ lwz(R6_ARG4, sco_offset, R7_ARG5);   // sco of elem klass
2574       __ b(entry_checkcast_arraycopy);
2575     }
2576 
2577     __ bind(L_disjoint_plain_copy);
2578     __ b(entry_disjoint_oop_arraycopy);
2579 
2580   __ bind(L_failed);
2581     __ li(R3_RET, -1); // return -1
2582     __ blr();
2583     return start;
2584   }
2585 
2586   // Arguments for generated stub:
2587   //   R3_ARG1   - source byte array address
2588   //   R4_ARG2   - destination byte array address
2589   //   R5_ARG3   - round key array
2590   address generate_aescrypt_encryptBlock() {
2591     assert(UseAES, "need AES instructions");
2592     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2593 
2594     address start = __ function_entry();
2595 
2596     Label L_doLast;
2597 
2598     Register from           = R3_ARG1;  // source array address
2599     Register to             = R4_ARG2;  // destination array address
2600     Register key            = R5_ARG3;  // round key array
2601 
2602     Register keylen         = R8;
2603     Register temp           = R9;
2604     Register keypos         = R10;
2605     Register fifteen        = R12;
2606 
2607     VectorRegister vRet     = VR0;
2608 
2609     VectorRegister vKey1    = VR1;
2610     VectorRegister vKey2    = VR2;
2611     VectorRegister vKey3    = VR3;
2612     VectorRegister vKey4    = VR4;
2613 
2614     VectorRegister fromPerm = VR5;
2615     VectorRegister keyPerm  = VR6;
2616     VectorRegister toPerm   = VR7;
2617     VectorRegister fSplt    = VR8;
2618 
2619     VectorRegister vTmp1    = VR9;
2620     VectorRegister vTmp2    = VR10;
2621     VectorRegister vTmp3    = VR11;
2622     VectorRegister vTmp4    = VR12;
2623 
2624     __ li              (fifteen, 15);
2625 
2626     // load unaligned from[0-15] to vRet
2627     __ lvx             (vRet, from);
2628     __ lvx             (vTmp1, fifteen, from);
2629     __ lvsl            (fromPerm, from);
2630 #ifdef VM_LITTLE_ENDIAN
2631     __ vspltisb        (fSplt, 0x0f);
2632     __ vxor            (fromPerm, fromPerm, fSplt);
2633 #endif
2634     __ vperm           (vRet, vRet, vTmp1, fromPerm);
2635 
2636     // load keylen (44 or 52 or 60)
2637     __ lwz             (keylen, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT), key);
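         // keylen of 44, 52 or 60 ints corresponds to AES-128, AES-192 or AES-256:
         // (rounds + 1) round keys of 4 words each (11*4, 13*4, 15*4).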
2638 
2639     // Set up the permute vector used to load the round keys.
2640     __ load_perm       (keyPerm, key);
2641 #ifdef VM_LITTLE_ENDIAN
2642     __ vspltisb        (vTmp2, -16);
2643     __ vrld            (keyPerm, keyPerm, vTmp2);
2644     __ vrld            (keyPerm, keyPerm, vTmp2);
2645     __ vsldoi          (keyPerm, keyPerm, keyPerm, 8);
2646 #endif
2647 
2648     // load the 1st round key to vTmp1
2649     __ lvx             (vTmp1, key);
2650     __ li              (keypos, 16);
2651     __ lvx             (vKey1, keypos, key);
2652     __ vec_perm        (vTmp1, vKey1, keyPerm);
2653 
2654     // 1st round
2655     __ vxor            (vRet, vRet, vTmp1);
2656 
2657     // load the 2nd round key to vKey1
2658     __ li              (keypos, 32);
2659     __ lvx             (vKey2, keypos, key);
2660     __ vec_perm        (vKey1, vKey2, keyPerm);
2661 
2662     // load the 3rd round key to vKey2
2663     __ li              (keypos, 48);
2664     __ lvx             (vKey3, keypos, key);
2665     __ vec_perm        (vKey2, vKey3, keyPerm);
2666 
2667     // load the 4th round key to vKey3
2668     __ li              (keypos, 64);
2669     __ lvx             (vKey4, keypos, key);
2670     __ vec_perm        (vKey3, vKey4, keyPerm);
2671 
2672     // load the 5th round key to vKey4
2673     __ li              (keypos, 80);
2674     __ lvx             (vTmp1, keypos, key);
2675     __ vec_perm        (vKey4, vTmp1, keyPerm);
2676 
2677     // 2nd - 5th rounds
2678     __ vcipher         (vRet, vRet, vKey1);
2679     __ vcipher         (vRet, vRet, vKey2);
2680     __ vcipher         (vRet, vRet, vKey3);
2681     __ vcipher         (vRet, vRet, vKey4);
2682 
2683     // load the 6th round key to vKey1
2684     __ li              (keypos, 96);
2685     __ lvx             (vKey2, keypos, key);
2686     __ vec_perm        (vKey1, vTmp1, vKey2, keyPerm);
2687 
2688     // load the 7th round key to vKey2
2689     __ li              (keypos, 112);
2690     __ lvx             (vKey3, keypos, key);
2691     __ vec_perm        (vKey2, vKey3, keyPerm);
2692 
2693     // load the 8th round key to vKey3
2694     __ li              (keypos, 128);
2695     __ lvx             (vKey4, keypos, key);
2696     __ vec_perm        (vKey3, vKey4, keyPerm);
2697 
2698     // load the 9th round key to vKey4
2699     __ li              (keypos, 144);
2700     __ lvx             (vTmp1, keypos, key);
2701     __ vec_perm        (vKey4, vTmp1, keyPerm);
2702 
2703     // 6th - 9th rounds
2704     __ vcipher         (vRet, vRet, vKey1);
2705     __ vcipher         (vRet, vRet, vKey2);
2706     __ vcipher         (vRet, vRet, vKey3);
2707     __ vcipher         (vRet, vRet, vKey4);
2708 
2709     // load the 10th round key to vKey1
2710     __ li              (keypos, 160);
2711     __ lvx             (vKey2, keypos, key);
2712     __ vec_perm        (vKey1, vTmp1, vKey2, keyPerm);
2713 
2714     // load the 11th round key to vKey2
2715     __ li              (keypos, 176);
2716     __ lvx             (vTmp1, keypos, key);
2717     __ vec_perm        (vKey2, vTmp1, keyPerm);
2718 
2719     // if all round keys are loaded, skip next 4 rounds
2720     __ cmpwi           (CCR0, keylen, 44);
2721     __ beq             (CCR0, L_doLast);
2722 
2723     // 10th - 11th rounds
2724     __ vcipher         (vRet, vRet, vKey1);
2725     __ vcipher         (vRet, vRet, vKey2);
2726 
2727     // load the 12th round key to vKey1
2728     __ li              (keypos, 192);
2729     __ lvx             (vKey2, keypos, key);
2730     __ vec_perm        (vKey1, vTmp1, vKey2, keyPerm);
2731 
2732     // load the 13th round key to vKey2
2733     __ li              (keypos, 208);
2734     __ lvx             (vTmp1, keypos, key);
2735     __ vec_perm        (vKey2, vTmp1, keyPerm);
2736 
2737     // if all round keys are loaded, skip next 2 rounds
2738     __ cmpwi           (CCR0, keylen, 52);
2739     __ beq             (CCR0, L_doLast);
2740 
2741     // 12th - 13th rounds
2742     __ vcipher         (vRet, vRet, vKey1);
2743     __ vcipher         (vRet, vRet, vKey2);
2744 
2745     // load the 14th round key to vKey1
2746     __ li              (keypos, 224);
2747     __ lvx             (vKey2, keypos, key);
2748     __ vec_perm        (vKey1, vTmp1, vKey2, keyPerm);
2749 
2750     // load the 15th round key to vKey2
2751     __ li              (keypos, 240);
2752     __ lvx             (vTmp1, keypos, key);
2753     __ vec_perm        (vKey2, vTmp1, keyPerm);
2754 
2755     __ bind(L_doLast);
2756 
2757     // last two rounds
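         // vcipher performs one full AES round (SubBytes, ShiftRows, MixColumns,
         // AddRoundKey); vcipherlast omits MixColumns, as required for the final round.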
2758     __ vcipher         (vRet, vRet, vKey1);
2759     __ vcipherlast     (vRet, vRet, vKey2);
2760 
2761     // store result (unaligned)
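         // Load the two aligned quadwords covering to[0..15], merge the rotated
         // result into them under a select mask derived from the alignment, and
         // store both back (the possibly aliasing high quadword first).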
2762 #ifdef VM_LITTLE_ENDIAN
2763     __ lvsl            (toPerm, to);
2764 #else
2765     __ lvsr            (toPerm, to);
2766 #endif
2767     __ vspltisb        (vTmp3, -1);
2768     __ vspltisb        (vTmp4, 0);
2769     __ lvx             (vTmp1, to);
2770     __ lvx             (vTmp2, fifteen, to);
2771 #ifdef VM_LITTLE_ENDIAN
2772     __ vperm           (vTmp3, vTmp3, vTmp4, toPerm); // generate select mask
2773     __ vxor            (toPerm, toPerm, fSplt);       // swap bytes
2774 #else
2775     __ vperm           (vTmp3, vTmp4, vTmp3, toPerm); // generate select mask
2776 #endif
2777     __ vperm           (vTmp4, vRet, vRet, toPerm);   // rotate data
2778     __ vsel            (vTmp2, vTmp4, vTmp2, vTmp3);
2779     __ vsel            (vTmp1, vTmp1, vTmp4, vTmp3);
2780     __ stvx            (vTmp2, fifteen, to);          // store this one first (may alias)
2781     __ stvx            (vTmp1, to);
2782 
2783     __ blr();
2784      return start;
2785   }
2786 
2787   // Arguments for generated stub:
2788   //   R3_ARG1   - source byte array address
2789   //   R4_ARG2   - destination byte array address
2790   //   R5_ARG3   - K (key) in little endian int array
2791   address generate_aescrypt_decryptBlock() {
2792     assert(UseAES, "need AES instructions");
2793     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2794 
2795     address start = __ function_entry();
2796 
2797     Label L_doLast;
2798     Label L_do44;
2799     Label L_do52;
2800 
2801     Register from           = R3_ARG1;  // source array address
2802     Register to             = R4_ARG2;  // destination array address
2803     Register key            = R5_ARG3;  // round key array
2804 
2805     Register keylen         = R8;
2806     Register temp           = R9;
2807     Register keypos         = R10;
2808     Register fifteen        = R12;
2809 
2810     VectorRegister vRet     = VR0;
2811 
2812     VectorRegister vKey1    = VR1;
2813     VectorRegister vKey2    = VR2;
2814     VectorRegister vKey3    = VR3;
2815     VectorRegister vKey4    = VR4;
2816     VectorRegister vKey5    = VR5;
2817 
2818     VectorRegister fromPerm = VR6;
2819     VectorRegister keyPerm  = VR7;
2820     VectorRegister toPerm   = VR8;
2821     VectorRegister fSplt    = VR9;
2822 
2823     VectorRegister vTmp1    = VR10;
2824     VectorRegister vTmp2    = VR11;
2825     VectorRegister vTmp3    = VR12;
2826     VectorRegister vTmp4    = VR13;
2827 
2828     __ li              (fifteen, 15);
2829 
2830     // load unaligned from[0-15] to vRet
2831     __ lvx             (vRet, from);
2832     __ lvx             (vTmp1, fifteen, from);
2833     __ lvsl            (fromPerm, from);
2834 #ifdef VM_LITTLE_ENDIAN
2835     __ vspltisb        (fSplt, 0x0f);
2836     __ vxor            (fromPerm, fromPerm, fSplt);
2837 #endif
2838     __ vperm           (vRet, vRet, vTmp1, fromPerm); // align [and byte swap in LE]
2839 
2840     // load keylen (44 or 52 or 60)
2841     __ lwz             (keylen, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT), key);
2842 
2843     // Set up the permute vector used to load the round keys.
2844     __ load_perm       (keyPerm, key);
2845 #ifdef VM_LITTLE_ENDIAN
2846     __ vxor            (vTmp2, vTmp2, vTmp2);
2847     __ vspltisb        (vTmp2, -16);
2848     __ vrld            (keyPerm, keyPerm, vTmp2);
2849     __ vrld            (keyPerm, keyPerm, vTmp2);
2850     __ vsldoi          (keyPerm, keyPerm, keyPerm, 8);
2851 #endif
2852 
2853     __ cmpwi           (CCR0, keylen, 44);
2854     __ beq             (CCR0, L_do44);
2855 
2856     __ cmpwi           (CCR0, keylen, 52);
2857     __ beq             (CCR0, L_do52);
2858 
2859     // load the 15th round key to vKey1
2860     __ li              (keypos, 240);
2861     __ lvx             (vKey1, keypos, key);
2862     __ li              (keypos, 224);
2863     __ lvx             (vKey2, keypos, key);
2864     __ vec_perm        (vKey1, vKey2, vKey1, keyPerm);
2865 
2866     // load the 14th round key to vKey2
2867     __ li              (keypos, 208);
2868     __ lvx             (vKey3, keypos, key);
2869     __ vec_perm        (vKey2, vKey3, vKey2, keyPerm);
2870 
2871     // load the 13th round key to vKey3
2872     __ li              (keypos, 192);
2873     __ lvx             (vKey4, keypos, key);
2874     __ vec_perm        (vKey3, vKey4, vKey3, keyPerm);
2875 
2876     // load the 12th round key to vKey4
2877     __ li              (keypos, 176);
2878     __ lvx             (vKey5, keypos, key);
2879     __ vec_perm        (vKey4, vKey5, vKey4, keyPerm);
2880 
2881     // load the 11th round key to vKey5
2882     __ li              (keypos, 160);
2883     __ lvx             (vTmp1, keypos, key);
2884     __ vec_perm        (vKey5, vTmp1, vKey5, keyPerm);
2885 
2886     // 1st - 5th rounds
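         // vncipher applies one inverse AES round; the expanded key is consumed in
         // reverse order, so the last round key (the 15th for AES-256) is applied first.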
2887     __ vxor            (vRet, vRet, vKey1);
2888     __ vncipher        (vRet, vRet, vKey2);
2889     __ vncipher        (vRet, vRet, vKey3);
2890     __ vncipher        (vRet, vRet, vKey4);
2891     __ vncipher        (vRet, vRet, vKey5);
2892 
2893     __ b               (L_doLast);
2894 
2895     __ bind            (L_do52);
2896 
2897     // load the 13th round key to vKey1
2898     __ li              (keypos, 208);
2899     __ lvx             (vKey1, keypos, key);
2900     __ li              (keypos, 192);
2901     __ lvx             (vKey2, keypos, key);
2902     __ vec_perm        (vKey1, vKey2, vKey1, keyPerm);
2903 
2904     // load the 12th round key to vKey2
2905     __ li              (keypos, 176);
2906     __ lvx             (vKey3, keypos, key);
2907     __ vec_perm        (vKey2, vKey3, vKey2, keyPerm);
2908 
2909     // load the 11th round key to vKey3
2910     __ li              (keypos, 160);
2911     __ lvx             (vTmp1, keypos, key);
2912     __ vec_perm        (vKey3, vTmp1, vKey3, keyPerm);
2913 
2914     // 1st - 3rd rounds
2915     __ vxor            (vRet, vRet, vKey1);
2916     __ vncipher        (vRet, vRet, vKey2);
2917     __ vncipher        (vRet, vRet, vKey3);
2918 
2919     __ b               (L_doLast);
2920 
2921     __ bind            (L_do44);
2922 
2923     // load the 11th round key to vKey1
2924     __ li              (keypos, 176);
2925     __ lvx             (vKey1, keypos, key);
2926     __ li              (keypos, 160);
2927     __ lvx             (vTmp1, keypos, key);
2928     __ vec_perm        (vKey1, vTmp1, vKey1, keyPerm);
2929 
2930     // 1st round
2931     __ vxor            (vRet, vRet, vKey1);
2932 
2933     __ bind            (L_doLast);
2934 
2935     // load the 10th round key to vKey1
2936     __ li              (keypos, 144);
2937     __ lvx             (vKey2, keypos, key);
2938     __ vec_perm        (vKey1, vKey2, vTmp1, keyPerm);
2939 
2940     // load the 9th round key to vKey2
2941     __ li              (keypos, 128);
2942     __ lvx             (vKey3, keypos, key);
2943     __ vec_perm        (vKey2, vKey3, vKey2, keyPerm);
2944 
2945     // load the 8th round key to vKey3
2946     __ li              (keypos, 112);
2947     __ lvx             (vKey4, keypos, key);
2948     __ vec_perm        (vKey3, vKey4, vKey3, keyPerm);
2949 
2950     // load the 7th round key to vKey4
2951     __ li              (keypos, 96);
2952     __ lvx             (vKey5, keypos, key);
2953     __ vec_perm        (vKey4, vKey5, vKey4, keyPerm);
2954 
2955     // load the 6th round key to vKey5
2956     __ li              (keypos, 80);
2957     __ lvx             (vTmp1, keypos, key);
2958     __ vec_perm        (vKey5, vTmp1, vKey5, keyPerm);
2959 
2960     // last 10th - 6th rounds
2961     __ vncipher        (vRet, vRet, vKey1);
2962     __ vncipher        (vRet, vRet, vKey2);
2963     __ vncipher        (vRet, vRet, vKey3);
2964     __ vncipher        (vRet, vRet, vKey4);
2965     __ vncipher        (vRet, vRet, vKey5);
2966 
2967     // load the 5th round key to vKey1
2968     __ li              (keypos, 64);
2969     __ lvx             (vKey2, keypos, key);
2970     __ vec_perm        (vKey1, vKey2, vTmp1, keyPerm);
2971 
2972     // load the 4th round key to vKey2
2973     __ li              (keypos, 48);
2974     __ lvx             (vKey3, keypos, key);
2975     __ vec_perm        (vKey2, vKey3, vKey2, keyPerm);
2976 
2977     // load the 3rd round key to vKey3
2978     __ li              (keypos, 32);
2979     __ lvx             (vKey4, keypos, key);
2980     __ vec_perm        (vKey3, vKey4, vKey3, keyPerm);
2981 
2982     // load the 2nd round key to vKey4
2983     __ li              (keypos, 16);
2984     __ lvx             (vKey5, keypos, key);
2985     __ vec_perm        (vKey4, vKey5, vKey4, keyPerm);
2986 
2987     // load the 1st round key to vKey5
2988     __ lvx             (vTmp1, key);
2989     __ vec_perm        (vKey5, vTmp1, vKey5, keyPerm);
2990 
2991     // last 5th - 1st rounds
2992     __ vncipher        (vRet, vRet, vKey1);
2993     __ vncipher        (vRet, vRet, vKey2);
2994     __ vncipher        (vRet, vRet, vKey3);
2995     __ vncipher        (vRet, vRet, vKey4);
2996     __ vncipherlast    (vRet, vRet, vKey5);
2997 
2998     // store result (unaligned)
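         // lvx/stvx can only access 16-byte aligned quadwords, so the result
         // is merged into the two aligned quadwords covering 'to': load both,
         // build a select mask from the destination alignment, vsel the
         // rotated data in, and store both quadwords back.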
2999 #ifdef VM_LITTLE_ENDIAN
3000     __ lvsl            (toPerm, to);
3001 #else
3002     __ lvsr            (toPerm, to);
3003 #endif
3004     __ vspltisb        (vTmp3, -1);
3005     __ vspltisb        (vTmp4, 0);
3006     __ lvx             (vTmp1, to);
3007     __ lvx             (vTmp2, fifteen, to);
3008 #ifdef VM_LITTLE_ENDIAN
3009     __ vperm           (vTmp3, vTmp3, vTmp4, toPerm); // generate select mask
3010     __ vxor            (toPerm, toPerm, fSplt);       // swap bytes
3011 #else
3012     __ vperm           (vTmp3, vTmp4, vTmp3, toPerm); // generate select mask
3013 #endif
3014     __ vperm           (vTmp4, vRet, vRet, toPerm);   // rotate data
3015     __ vsel            (vTmp2, vTmp4, vTmp2, vTmp3);
3016     __ vsel            (vTmp1, vTmp1, vTmp4, vTmp3);
3017     __ stvx            (vTmp2, fifteen, to);          // store this one first (may alias)
3018     __ stvx            (vTmp1, to);
3019 
3020     __ blr();
3021     return start;
3022   }
3023 
3024   address generate_sha256_implCompress(bool multi_block, const char *name) {
3025     assert(UseSHA, "need SHA instructions");
3026     StubCodeMark mark(this, "StubRoutines", name);
3027     address start = __ function_entry();
3028 
3029     __ sha256 (multi_block);
3030     __ blr();
3031 
3032     return start;
3033   }
3034 
3035   address generate_sha512_implCompress(bool multi_block, const char *name) {
3036     assert(UseSHA, "need SHA instructions");
3037     StubCodeMark mark(this, "StubRoutines", name);
3038     address start = __ function_entry();
3039 
3040     __ sha512 (multi_block);
3041     __ blr();
3042 
3043     return start;
3044   }
3045 
3046   address generate_data_cache_writeback() {
3047     const Register cacheline = R3_ARG1;
3048     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback");
3049     address start = __ pc();
3050 
3051     __ cache_wb(Address(cacheline));
3052     __ blr();
3053 
3054     return start;
3055   }
3056 
3057   address generate_data_cache_writeback_sync() {
3058     const Register is_presync = R3_ARG1;
3059     Register temp = R4;
3060     Label SKIP;
3061 
3062     StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync");
3063     address start = __ pc();
3064 
3065     __ andi_(temp, is_presync, 1);
3066     __ bne(CCR0, SKIP);
3067     __ cache_wbsync(false); // post sync => emit 'sync'
3068     __ bind(SKIP);          // pre sync => emit nothing
3069     __ blr();
3070 
3071     return start;
3072   }
3073 
3074   void generate_arraycopy_stubs() {
3075     // Note: the disjoint stubs must be generated first, as some of
3076     // the conjoint stubs use them.
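         // The conjoint stubs test for overlap and branch to the matching
         // disjoint stub when the source and destination do not overlap.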
3077 
3078     address ucm_common_error_exit       =  generate_unsafecopy_common_error_exit();
3079     UnsafeCopyMemory::set_common_exit_stub_pc(ucm_common_error_exit);
3080 
3081     // non-aligned disjoint versions
3082     StubRoutines::_jbyte_disjoint_arraycopy       = generate_disjoint_byte_copy(false, "jbyte_disjoint_arraycopy");
3083     StubRoutines::_jshort_disjoint_arraycopy      = generate_disjoint_short_copy(false, "jshort_disjoint_arraycopy");
3084     StubRoutines::_jint_disjoint_arraycopy        = generate_disjoint_int_copy(false, "jint_disjoint_arraycopy");
3085     StubRoutines::_jlong_disjoint_arraycopy       = generate_disjoint_long_copy(false, "jlong_disjoint_arraycopy");
3086     StubRoutines::_oop_disjoint_arraycopy         = generate_disjoint_oop_copy(false, "oop_disjoint_arraycopy", false);
3087     StubRoutines::_oop_disjoint_arraycopy_uninit  = generate_disjoint_oop_copy(false, "oop_disjoint_arraycopy_uninit", true);
3088 
3089     // aligned disjoint versions
3090     StubRoutines::_arrayof_jbyte_disjoint_arraycopy      = generate_disjoint_byte_copy(true, "arrayof_jbyte_disjoint_arraycopy");
3091     StubRoutines::_arrayof_jshort_disjoint_arraycopy     = generate_disjoint_short_copy(true, "arrayof_jshort_disjoint_arraycopy");
3092     StubRoutines::_arrayof_jint_disjoint_arraycopy       = generate_disjoint_int_copy(true, "arrayof_jint_disjoint_arraycopy");
3093     StubRoutines::_arrayof_jlong_disjoint_arraycopy      = generate_disjoint_long_copy(true, "arrayof_jlong_disjoint_arraycopy");
3094     StubRoutines::_arrayof_oop_disjoint_arraycopy        = generate_disjoint_oop_copy(true, "arrayof_oop_disjoint_arraycopy", false);
3095     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(true, "arrayof_oop_disjoint_arraycopy_uninit", true);
3096 
3097     // non-aligned conjoint versions
3098     StubRoutines::_jbyte_arraycopy      = generate_conjoint_byte_copy(false, "jbyte_arraycopy");
3099     StubRoutines::_jshort_arraycopy     = generate_conjoint_short_copy(false, "jshort_arraycopy");
3100     StubRoutines::_jint_arraycopy       = generate_conjoint_int_copy(false, "jint_arraycopy");
3101     StubRoutines::_jlong_arraycopy      = generate_conjoint_long_copy(false, "jlong_arraycopy");
3102     StubRoutines::_oop_arraycopy        = generate_conjoint_oop_copy(false, "oop_arraycopy", false);
3103     StubRoutines::_oop_arraycopy_uninit = generate_conjoint_oop_copy(false, "oop_arraycopy_uninit", true);
3104 
3105     // aligned conjoint versions
3106     StubRoutines::_arrayof_jbyte_arraycopy      = generate_conjoint_byte_copy(true, "arrayof_jbyte_arraycopy");
3107     StubRoutines::_arrayof_jshort_arraycopy     = generate_conjoint_short_copy(true, "arrayof_jshort_arraycopy");
3108     StubRoutines::_arrayof_jint_arraycopy       = generate_conjoint_int_copy(true, "arrayof_jint_arraycopy");
3109     StubRoutines::_arrayof_jlong_arraycopy      = generate_conjoint_long_copy(true, "arrayof_jlong_arraycopy");
3110     StubRoutines::_arrayof_oop_arraycopy        = generate_conjoint_oop_copy(true, "arrayof_oop_arraycopy", false);
3111     StubRoutines::_arrayof_oop_arraycopy_uninit = generate_conjoint_oop_copy(true, "arrayof_oop_arraycopy_uninit", true);
3112 
3113     // special/generic versions
3114     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", false);
3115     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", true);
3116 
3117     StubRoutines::_unsafe_arraycopy  = generate_unsafe_copy("unsafe_arraycopy",
3118                                                             STUB_ENTRY(jbyte_arraycopy),
3119                                                             STUB_ENTRY(jshort_arraycopy),
3120                                                             STUB_ENTRY(jint_arraycopy),
3121                                                             STUB_ENTRY(jlong_arraycopy));
3122     StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy",
3123                                                              STUB_ENTRY(jbyte_arraycopy),
3124                                                              STUB_ENTRY(jshort_arraycopy),
3125                                                              STUB_ENTRY(jint_arraycopy),
3126                                                              STUB_ENTRY(oop_arraycopy),
3127                                                              STUB_ENTRY(oop_disjoint_arraycopy),
3128                                                              STUB_ENTRY(jlong_arraycopy),
3129                                                              STUB_ENTRY(checkcast_arraycopy));
3130 
3131     // fill routines
3132 #ifdef COMPILER2
3133     if (OptimizeFill) {
3134       StubRoutines::_jbyte_fill          = generate_fill(T_BYTE,  false, "jbyte_fill");
3135       StubRoutines::_jshort_fill         = generate_fill(T_SHORT, false, "jshort_fill");
3136       StubRoutines::_jint_fill           = generate_fill(T_INT,   false, "jint_fill");
3137       StubRoutines::_arrayof_jbyte_fill  = generate_fill(T_BYTE,  true, "arrayof_jbyte_fill");
3138       StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
3139       StubRoutines::_arrayof_jint_fill   = generate_fill(T_INT,   true, "arrayof_jint_fill");
3140     }
3141 #endif
3142   }
3143 
3144   // Safefetch stubs.
3145   void generate_safefetch(const char* name, int size, address* entry, address* fault_pc, address* continuation_pc) {
3146     // safefetch signatures:
3147     //   int      SafeFetch32(int*      adr, int      errValue);
3148     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3149     //
3150     // arguments:
3151     //   R3_ARG1 = adr
3152     //   R4_ARG2 = errValue
3153     //
3154     // result:
3155     //   R3_RET  = *adr or errValue
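         //
         // If the load at *fault_pc faults, the signal handler resumes
         // execution at *continuation_pc, where errValue (still in R4_ARG2)
         // is returned in R3_RET.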
3156 
3157     StubCodeMark mark(this, "StubRoutines", name);
3158 
3159     // Entry point, pc or function descriptor.
3160     *entry = __ function_entry();
3161 
3162     // Load *adr into R4_ARG2, may fault.
3163     *fault_pc = __ pc();
3164     switch (size) {
3165       case 4:
3166         // int32_t, sign-extended
3167         __ lwa(R4_ARG2, 0, R3_ARG1);
3168         break;
3169       case 8:
3170         // int64_t
3171         __ ld(R4_ARG2, 0, R3_ARG1);
3172         break;
3173       default:
3174         ShouldNotReachHere();
3175     }
3176 
3177     // return errValue or *adr
3178     *continuation_pc = __ pc();
3179     __ mr(R3_RET, R4_ARG2);
3180     __ blr();
3181   }
3182 
3183   // Stub for BigInteger::multiplyToLen()
3184   //
3185   //  Arguments:
3186   //
3187   //  Input:
3188   //    R3 - x address
3189   //    R4 - x length
3190   //    R5 - y address
3191   //    R6 - y length
3192   //    R7 - z address
3193   //    R8 - z length
3194   //
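       //  Computes z = x * y, where x, y, and z are BigInteger magnitudes
       //  stored as int arrays (zlen is expected to be xlen + ylen).
       //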
3195   address generate_multiplyToLen() {
3196 
3197     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3198 
3199     address start = __ function_entry();
3200 
3201     const Register x     = R3;
3202     const Register xlen  = R4;
3203     const Register y     = R5;
3204     const Register ylen  = R6;
3205     const Register z     = R7;
3206     const Register zlen  = R8;
3207 
3208     const Register tmp1  = R2; // TOC not used.
3209     const Register tmp2  = R9;
3210     const Register tmp3  = R10;
3211     const Register tmp4  = R11;
3212     const Register tmp5  = R12;
3213 
3214     // non-volatile regs
3215     const Register tmp6  = R31;
3216     const Register tmp7  = R30;
3217     const Register tmp8  = R29;
3218     const Register tmp9  = R28;
3219     const Register tmp10 = R27;
3220     const Register tmp11 = R26;
3221     const Register tmp12 = R25;
3222     const Register tmp13 = R24;
3223 
3224     BLOCK_COMMENT("Entry:");
3225 
3226     // C2 does not respect the int-to-long conversion for stub calls; clear the upper 32 bits.
3227     __ clrldi(xlen, xlen, 32);
3228     __ clrldi(ylen, ylen, 32);
3229     __ clrldi(zlen, zlen, 32);
3230 
3231     // Save non-volatile regs (frameless).
3232     int current_offs = 8;
3233     __ std(R24, -current_offs, R1_SP); current_offs += 8;
3234     __ std(R25, -current_offs, R1_SP); current_offs += 8;
3235     __ std(R26, -current_offs, R1_SP); current_offs += 8;
3236     __ std(R27, -current_offs, R1_SP); current_offs += 8;
3237     __ std(R28, -current_offs, R1_SP); current_offs += 8;
3238     __ std(R29, -current_offs, R1_SP); current_offs += 8;
3239     __ std(R30, -current_offs, R1_SP); current_offs += 8;
3240     __ std(R31, -current_offs, R1_SP);
3241 
3242     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5,
3243                        tmp6, tmp7, tmp8, tmp9, tmp10, tmp11, tmp12, tmp13);
3244 
3245     // Restore non-volatile regs.
3246     current_offs = 8;
3247     __ ld(R24, -current_offs, R1_SP); current_offs += 8;
3248     __ ld(R25, -current_offs, R1_SP); current_offs += 8;
3249     __ ld(R26, -current_offs, R1_SP); current_offs += 8;
3250     __ ld(R27, -current_offs, R1_SP); current_offs += 8;
3251     __ ld(R28, -current_offs, R1_SP); current_offs += 8;
3252     __ ld(R29, -current_offs, R1_SP); current_offs += 8;
3253     __ ld(R30, -current_offs, R1_SP); current_offs += 8;
3254     __ ld(R31, -current_offs, R1_SP);
3255 
3256     __ blr();  // Return to caller.
3257 
3258     return start;
3259   }
3260 
3261   /**
3262   *  Arguments:
3263   *
3264   *  Input:
3265   *   R3_ARG1    - out address
3266   *   R4_ARG2    - in address
3267   *   R5_ARG3    - offset
3268   *   R6_ARG4    - len
3269   *   R7_ARG5    - k
3270   *  Output:
3271   *   R3_RET     - carry
3272   */
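       // Semantically this matches BigInteger.implMulAdd. A rough scalar
       // sketch of what the stub computes (index bookkeeping simplified;
       // the real work is done in MacroAssembler::muladd):
       //
       //   uint64_t carry = 0;
       //   for (int i = len - 1; i >= 0; i--) {
       //     uint64_t p = (uint64_t)(uint32_t)in[i] * (uint32_t)k
       //                  + (uint32_t)out[offset] + carry;
       //     out[offset--] = (int)p;     // low 32 bits
       //     carry = p >> 32;            // high 32 bits propagate
       //   }
       //   return (int)carry;            // returned in R3_RET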
3273   address generate_mulAdd() {
3274     __ align(CodeEntryAlignment);
3275     StubCodeMark mark(this, "StubRoutines", "mulAdd");
3276 
3277     address start = __ function_entry();
3278 
3279     // C2 does not sign-extend signed parameters to full 64-bit registers:
3280     __ rldic (R5_ARG3, R5_ARG3, 2, 32);  // shift left by 2 and zero-extend (always positive)
3281     __ clrldi(R6_ARG4, R6_ARG4, 32);     // clear the upper 32 bits
3282     __ clrldi(R7_ARG5, R7_ARG5, 32);     // clear the upper 32 bits
3283 
3284     __ muladd(R3_ARG1, R4_ARG2, R5_ARG3, R6_ARG4, R7_ARG5, R8, R9, R10);
3285 
3286     // Moves output carry to return register
3287     __ mr    (R3_RET,  R10);
3288 
3289     __ blr();
3290 
3291     return start;
3292   }
3293 
3294   /**
3295   *  Arguments:
3296   *
3297   *  Input:
3298   *   R3_ARG1    - in address
3299   *   R4_ARG2    - in length
3300   *   R5_ARG3    - out address
3301   *   R6_ARG4    - out length
3302   */
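       // The algorithm mirrors BigInteger::squareToLen: store the square of
       // each input word shifted right by one bit, add in the off-diagonal
       // products (doubled implicitly by the final shift), then shift the
       // whole result left by one bit and copy the low bit of the last input
       // word into the low bit of the result.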
3303   address generate_squareToLen() {
3304     __ align(CodeEntryAlignment);
3305     StubCodeMark mark(this, "StubRoutines", "squareToLen");
3306 
3307     address start = __ function_entry();
3308 
3309     // args: the upper 32 bits are cleared (zero-extended) because of the int-to-long conversion
3310     const Register in        = R3_ARG1;
3311     const Register in_len    = R4_ARG2;
3312     __ clrldi(in_len, in_len, 32);
3313     const Register out       = R5_ARG3;
3314     const Register out_len   = R6_ARG4;
3315     __ clrldi(out_len, out_len, 32);
3316 
3317     // output
3318     const Register ret       = R3_RET;
3319 
3320     // temporaries
3321     const Register lplw_s    = R7;
3322     const Register in_aux    = R8;
3323     const Register out_aux   = R9;
3324     const Register piece     = R10;
3325     const Register product   = R14;
3326     const Register lplw      = R15;
3327     const Register i_minus1  = R16;
3328     const Register carry     = R17;
3329     const Register offset    = R18;
3330     const Register off_aux   = R19;
3331     const Register t         = R20;
3332     const Register mlen      = R21;
3333     const Register len       = R22;
3334     const Register a         = R23;
3335     const Register b         = R24;
3336     const Register i         = R25;
3337     const Register c         = R26;
3338     const Register cs        = R27;
3339 
3340     // Labels
3341     Label SKIP_LSHIFT, SKIP_DIAGONAL_SUM, SKIP_ADDONE, SKIP_LOOP_SQUARE;
3342     Label LOOP_LSHIFT, LOOP_DIAGONAL_SUM, LOOP_ADDONE, LOOP_SQUARE;
3343 
3344     // Save non-volatile regs (frameless).
3345     int current_offs = -8;
3346     __ std(R28, current_offs, R1_SP); current_offs -= 8;
3347     __ std(R27, current_offs, R1_SP); current_offs -= 8;
3348     __ std(R26, current_offs, R1_SP); current_offs -= 8;
3349     __ std(R25, current_offs, R1_SP); current_offs -= 8;
3350     __ std(R24, current_offs, R1_SP); current_offs -= 8;
3351     __ std(R23, current_offs, R1_SP); current_offs -= 8;
3352     __ std(R22, current_offs, R1_SP); current_offs -= 8;
3353     __ std(R21, current_offs, R1_SP); current_offs -= 8;
3354     __ std(R20, current_offs, R1_SP); current_offs -= 8;
3355     __ std(R19, current_offs, R1_SP); current_offs -= 8;
3356     __ std(R18, current_offs, R1_SP); current_offs -= 8;
3357     __ std(R17, current_offs, R1_SP); current_offs -= 8;
3358     __ std(R16, current_offs, R1_SP); current_offs -= 8;
3359     __ std(R15, current_offs, R1_SP); current_offs -= 8;
3360     __ std(R14, current_offs, R1_SP);
3361 
3362     // Store the squares, right shifted one bit (i.e., divided by 2)
3363     __ subi   (out_aux,   out,       8);
3364     __ subi   (in_aux,    in,        4);
3365     __ cmpwi  (CCR0,      in_len,    0);
3366     // Initialize lplw outside of the loop
3367     __ xorr   (lplw,      lplw,      lplw);
3368     __ ble    (CCR0,      SKIP_LOOP_SQUARE);    // in_len <= 0
3369     __ mtctr  (in_len);
3370 
3371     __ bind(LOOP_SQUARE);
3372     __ lwzu   (piece,     4,         in_aux);
3373     __ mulld  (product,   piece,     piece);
3374     // shift left 63 bits and only keep the MSB
3375     __ rldic  (lplw_s,    lplw,      63, 0);
3376     __ mr     (lplw,      product);
3377     // shift right 1 bit without sign extension
3378     __ srdi   (product,   product,   1);
3379     // join them to the same register and store it
3380     __ orr    (product,   lplw_s,    product);
3381 #ifdef VM_LITTLE_ENDIAN
3382     // Swap low and high words for little endian
3383     __ rldicl (product,   product,   32, 0);
3384 #endif
3385     __ stdu   (product,   8,         out_aux);
3386     __ bdnz   (LOOP_SQUARE);
3387 
3388     __ bind(SKIP_LOOP_SQUARE);
3389 
3390     // Add in off-diagonal sums
3391     __ cmpwi  (CCR0,      in_len,    0);
3392     __ ble    (CCR0,      SKIP_DIAGONAL_SUM);
3393     // Avoid using CTR here so it remains available for mulAdd
3394     __ subi   (i_minus1,  in_len,    1);
3395     __ li     (offset,    4);
3396 
3397     __ bind(LOOP_DIAGONAL_SUM);
3398 
3399     __ sldi   (off_aux,   out_len,   2);
3400     __ sub    (off_aux,   off_aux,   offset);
3401 
3402     __ mr     (len,       i_minus1);
3403     __ sldi   (mlen,      i_minus1,  2);
3404     __ lwzx   (t,         in,        mlen);
3405 
3406     __ muladd (out, in, off_aux, len, t, a, b, carry);
3407 
3408     // begin<addOne>
3409     // off_aux = out_len*4 - 4 - mlen - offset*4 - 4;
3410     __ addi   (mlen,      mlen,      4);
3411     __ sldi   (a,         out_len,   2);
3412     __ subi   (a,         a,         4);
3413     __ sub    (a,         a,         mlen);
3414     __ subi   (off_aux,   offset,    4);
3415     __ sub    (off_aux,   a,         off_aux);
3416 
3417     __ lwzx   (b,         off_aux,   out);
3418     __ add    (b,         b,         carry);
3419     __ stwx   (b,         off_aux,   out);
3420 
3421     // if (((uint64_t)s >> 32) != 0) {
3422     __ srdi_  (a,         b,         32);
3423     __ beq    (CCR0,      SKIP_ADDONE);
3424 
3425     // while (--mlen >= 0) {
3426     __ bind(LOOP_ADDONE);
3427     __ subi   (mlen,      mlen,      4);
3428     __ cmpwi  (CCR0,      mlen,      0);
3429     __ beq    (CCR0,      SKIP_ADDONE);
3430 
3431     // if (--offset_aux < 0) { // Carry out of number
3432     __ subi   (off_aux,   off_aux,   4);
3433     __ cmpwi  (CCR0,      off_aux,   0);
3434     __ blt    (CCR0,      SKIP_ADDONE);
3435 
3436     // } else {
3437     __ lwzx   (b,         off_aux,   out);
3438     __ addi   (b,         b,         1);
3439     __ stwx   (b,         off_aux,   out);
3440     __ cmpwi  (CCR0,      b,         0);
3441     __ bne    (CCR0,      SKIP_ADDONE);
3442     __ b      (LOOP_ADDONE);
3443 
3444     __ bind(SKIP_ADDONE);
3445     // } } } end<addOne>
3446 
3447     __ addi   (offset,    offset,    8);
3448     __ subi   (i_minus1,  i_minus1,  1);
3449     __ cmpwi  (CCR0,      i_minus1,  0);
3450     __ bge    (CCR0,      LOOP_DIAGONAL_SUM);
3451 
3452     __ bind(SKIP_DIAGONAL_SUM);
3453 
3454     // Shift back up and set low bit
3455     // Shifts the value left by one bit over up to len words. Assumes no leading zeros.
3456     // begin<primitiveLeftShift>
3457     __ cmpwi  (CCR0,      out_len,   0);
3458     __ ble    (CCR0,      SKIP_LSHIFT);
3459     __ li     (i,         0);
3460     __ lwz    (c,         0,         out);
3461     __ subi   (b,         out_len,   1);
3462     __ mtctr  (b);
3463 
3464     __ bind(LOOP_LSHIFT);
3465     __ mr     (b,         c);
3466     __ addi   (cs,        i,         4);
3467     __ lwzx   (c,         out,       cs);
3468 
3469     __ sldi   (b,         b,         1);
3470     __ srwi   (cs,        c,         31);
3471     __ orr    (b,         b,         cs);
3472     __ stwx   (b,         i,         out);
3473 
3474     __ addi   (i,         i,         4);
3475     __ bdnz   (LOOP_LSHIFT);
3476 
3477     __ sldi   (c,         out_len,   2);
3478     __ subi   (c,         c,         4);
3479     __ lwzx   (b,         out,       c);
3480     __ sldi   (b,         b,         1);
3481     __ stwx   (b,         out,       c);
3482 
3483     __ bind(SKIP_LSHIFT);
3484     // end<primitiveLeftShift>
3485 
3486     // Set low bit
3487     __ sldi   (i,         in_len,    2);
3488     __ subi   (i,         i,         4);
3489     __ lwzx   (i,         in,        i);
3490     __ sldi   (c,         out_len,   2);
3491     __ subi   (c,         c,         4);
3492     __ lwzx   (b,         out,       c);
3493 
3494     __ andi   (i,         i,         1);
3495     __ orr    (i,         b,         i);
3496 
3497     __ stwx   (i,         out,       c);
3498 
3499     // Restore non-volatile regs.
3500     current_offs = -8;
3501     __ ld(R28, current_offs, R1_SP); current_offs -= 8;
3502     __ ld(R27, current_offs, R1_SP); current_offs -= 8;
3503     __ ld(R26, current_offs, R1_SP); current_offs -= 8;
3504     __ ld(R25, current_offs, R1_SP); current_offs -= 8;
3505     __ ld(R24, current_offs, R1_SP); current_offs -= 8;
3506     __ ld(R23, current_offs, R1_SP); current_offs -= 8;
3507     __ ld(R22, current_offs, R1_SP); current_offs -= 8;
3508     __ ld(R21, current_offs, R1_SP); current_offs -= 8;
3509     __ ld(R20, current_offs, R1_SP); current_offs -= 8;
3510     __ ld(R19, current_offs, R1_SP); current_offs -= 8;
3511     __ ld(R18, current_offs, R1_SP); current_offs -= 8;
3512     __ ld(R17, current_offs, R1_SP); current_offs -= 8;
3513     __ ld(R16, current_offs, R1_SP); current_offs -= 8;
3514     __ ld(R15, current_offs, R1_SP); current_offs -= 8;
3515     __ ld(R14, current_offs, R1_SP);
3516 
3517     __ mr(ret, out);
3518     __ blr();
3519 
3520     return start;
3521   }
3522 
3523   /**
3524    * Arguments:
3525    *
3526    * Inputs:
3527    *   R3_ARG1    - int   crc
3528    *   R4_ARG2    - byte* buf
3529    *   R5_ARG3    - int   length (of buffer)
3530    *
3531    * scratch:
3532    *   R2, R6-R12
3533    *
3534    * Output:
3535    *   R3_RET     - int   crc result
3536    */
3537   // Compute the CRC32 (or CRC32C, when is_crc32c is set) of the buffer.
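       // The same generator handles CRC-32 (reflected polynomial 0x04C11DB7)
       // and CRC-32C (Castagnoli, reflected polynomial 0x1EDC6F41); is_crc32c
       // selects between the constant tables set up via
       // StubRoutines::generate_crc_constants() in generate_initial().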
3538   address generate_CRC32_updateBytes(bool is_crc32c) {
3539     __ align(CodeEntryAlignment);
3540     StubCodeMark mark(this, "StubRoutines", is_crc32c ? "CRC32C_updateBytes" : "CRC32_updateBytes");
3541     address start = __ function_entry();  // Remember stub start address (is rtn value).
3542     __ crc32(R3_ARG1, R4_ARG2, R5_ARG3, R2, R6, R7, R8, R9, R10, R11, R12, is_crc32c);
3543     __ blr();
3544     return start;
3545   }
3546 
3547   // Initialization
3548   void generate_initial() {
3549     // Generates the initial set of stubs and initializes their entry points.
3550 
3551     // Entry points that exist in all platforms.
3552     // Note: This is code that could be shared among different platforms; however,
3553     // the benefit seems to be smaller than the disadvantage of having a
3554     // much more complicated generator structure. See also the comment in
3555     // stubRoutines.hpp.
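         // generate_initial() runs early in VM startup; generate_all() below
         // is called later, once the universe is initialized, and can
         // therefore generate stubs such as verify_oop that depend on it.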
3556 
3557     StubRoutines::_forward_exception_entry          = generate_forward_exception();
3558     StubRoutines::_call_stub_entry                  = generate_call_stub(StubRoutines::_call_stub_return_address);
3559     StubRoutines::_catch_exception_entry            = generate_catch_exception();
3560 
3561     // Build this early so it's available for the interpreter.
3562     StubRoutines::_throw_StackOverflowError_entry   =
3563       generate_throw_exception("StackOverflowError throw_exception",
3564                                CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError), false);
3565     StubRoutines::_throw_delayed_StackOverflowError_entry =
3566       generate_throw_exception("delayed StackOverflowError throw_exception",
3567                                CAST_FROM_FN_PTR(address, SharedRuntime::throw_delayed_StackOverflowError), false);
3568 
3569     // CRC32 Intrinsics.
3570     if (UseCRC32Intrinsics) {
3571       StubRoutines::_crc_table_adr = StubRoutines::generate_crc_constants(REVERSE_CRC32_POLY);
3572       StubRoutines::_updateBytesCRC32 = generate_CRC32_updateBytes(false);
3573     }
3574 
3575     // CRC32C Intrinsics.
3576     if (UseCRC32CIntrinsics) {
3577       StubRoutines::_crc32c_table_addr = StubRoutines::generate_crc_constants(REVERSE_CRC32C_POLY);
3578       StubRoutines::_updateBytesCRC32C = generate_CRC32_updateBytes(true);
3579     }
3580 
3581     // Safefetch stubs.
3582     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
3583                                                        &StubRoutines::_safefetch32_fault_pc,
3584                                                        &StubRoutines::_safefetch32_continuation_pc);
3585     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
3586                                                        &StubRoutines::_safefetchN_fault_pc,
3587                                                        &StubRoutines::_safefetchN_continuation_pc);
3588   }
3589 
3590   void generate_all() {
3591     // Generates all stubs and initializes the entry points
3592 
3593     // These entry points require SharedInfo::stack0 to be set up in
3594     // non-core builds
3595     StubRoutines::_throw_AbstractMethodError_entry         = generate_throw_exception("AbstractMethodError throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError),  false);
3596     // Handle IncompatibleClassChangeError in itable stubs.
3597     StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError),  false);
3598     StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call), false);
3599 
3600     // support for verify_oop (must happen after universe_init)
3601     StubRoutines::_verify_oop_subroutine_entry             = generate_verify_oop();
3602 
3603     // arraycopy stubs used by compilers
3604     generate_arraycopy_stubs();
3605 
3606 #ifdef COMPILER2
3607     if (UseMultiplyToLenIntrinsic) {
3608       StubRoutines::_multiplyToLen = generate_multiplyToLen();
3609     }
3610     if (UseSquareToLenIntrinsic) {
3611       StubRoutines::_squareToLen = generate_squareToLen();
3612     }
3613     if (UseMulAddIntrinsic) {
3614       StubRoutines::_mulAdd = generate_mulAdd();
3615     }
3616     if (UseMontgomeryMultiplyIntrinsic) {
3617       StubRoutines::_montgomeryMultiply
3618         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_multiply);
3619     }
3620     if (UseMontgomerySquareIntrinsic) {
3621       StubRoutines::_montgomerySquare
3622         = CAST_FROM_FN_PTR(address, SharedRuntime::montgomery_square);
3623     }
3624 #endif
3625 
3626     // data cache line writeback
3627     if (VM_Version::supports_data_cache_line_flush()) {
3628       StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
3629       StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();
3630     }
3631 
3632     if (UseAESIntrinsics) {
3633       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
3634       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
3635     }
3636 
3637     if (UseSHA256Intrinsics) {
3638       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
3639       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
3640     }
3641     if (UseSHA512Intrinsics) {
3642       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
3643       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
3644     }
3645   }
3646 
3647  public:
3648   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
3649     // replace the standard masm with a special one:
3650     _masm = new MacroAssembler(code);
3651     if (all) {
3652       generate_all();
3653     } else {
3654       generate_initial();
3655     }
3656   }
3657 };
3658 
3659 #define UCM_TABLE_MAX_ENTRIES 8
3660 void StubGenerator_generate(CodeBuffer* code, bool all) {
3661   if (UnsafeCopyMemory::_table == NULL) {
3662     UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
3663   }
3664   StubGenerator g(code, all);
3665 }