1 /*
   2  * Copyright (c) 1997, 2011, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/assembler.hpp"
  27 #include "assembler_sparc.inline.hpp"
  28 #include "interpreter/interpreter.hpp"
  29 #include "nativeInst_sparc.hpp"
  30 #include "oops/instanceOop.hpp"
  31 #include "oops/methodOop.hpp"
  32 #include "oops/objArrayKlass.hpp"
  33 #include "oops/oop.inline.hpp"
  34 #include "prims/methodHandles.hpp"
  35 #include "runtime/frame.inline.hpp"
  36 #include "runtime/handles.inline.hpp"
  37 #include "runtime/sharedRuntime.hpp"
  38 #include "runtime/stubCodeGenerator.hpp"
  39 #include "runtime/stubRoutines.hpp"
  40 #include "utilities/top.hpp"
  41 #ifdef TARGET_OS_FAMILY_linux
  42 # include "thread_linux.inline.hpp"
  43 #endif
  44 #ifdef TARGET_OS_FAMILY_solaris
  45 # include "thread_solaris.inline.hpp"
  46 #endif
  47 #ifdef COMPILER2
  48 #include "opto/runtime.hpp"
  49 #endif
  50 
  51 // Declaration and definition of StubGenerator (no .hpp file).
  52 // For a more detailed description of the stub routine structure
  53 // see the comment in stubRoutines.hpp.
  54 
  55 #define __ _masm->
  56 
  57 #ifdef PRODUCT
  58 #define BLOCK_COMMENT(str) /* nothing */
  59 #else
  60 #define BLOCK_COMMENT(str) __ block_comment(str)
  61 #endif
  62 
  63 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  64 
  65 // Note:  The register L7 is used as L7_thread_cache, and may not be used
  66 //        any other way within this module.
  67 
  68 
  69 static const Register& Lstub_temp = L2;
  70 
  71 // -------------------------------------------------------------------------------------------------------------------------
  72 // Stub Code definitions
  73 
  74 static address handle_unsafe_access() {
  75   JavaThread* thread = JavaThread::current();
  76   address pc  = thread->saved_exception_pc();
  77   address npc = thread->saved_exception_npc();
  78   // pc is the instruction which we must emulate
  79   // doing a no-op is fine:  return garbage from the load
  80 
  81   // request an async exception
  82   thread->set_pending_unsafe_access_error();
  83 
  84   // return address of next instruction to execute
  85   return npc;
  86 }
  87 
  88 class StubGenerator: public StubCodeGenerator {
  89  private:
  90 
  91 #ifdef PRODUCT
  92 #define inc_counter_np(a,b,c) (0)
  93 #else
  94 #define inc_counter_np(counter, t1, t2) \
  95   BLOCK_COMMENT("inc_counter " #counter); \
  96   __ inc_counter(&counter, t1, t2);
  97 #endif
  98 
  99   //----------------------------------------------------------------------------------------------------
 100   // Call stubs are used to call Java from C
 101 
 102   address generate_call_stub(address& return_pc) {
 103     StubCodeMark mark(this, "StubRoutines", "call_stub");
 104     address start = __ pc();
 105 
 106     // Incoming arguments:
 107     //
 108     // o0         : call wrapper address
 109     // o1         : result (address)
 110     // o2         : result type
 111     // o3         : method
 112     // o4         : (interpreter) entry point
 113     // o5         : parameters (address)
 114     // [sp + 0x5c]: parameter size (in words)
 115     // [sp + 0x60]: thread
 116     //
 117     // +---------------+ <--- sp + 0
 118     // |               |
 119     // . reg save area .
 120     // |               |
 121     // +---------------+ <--- sp + 0x40
 122     // |               |
 123     // . extra 7 slots .
 124     // |               |
 125     // +---------------+ <--- sp + 0x5c
 126     // |  param. size  |
 127     // +---------------+ <--- sp + 0x60
 128     // |    thread     |
 129     // +---------------+
 130     // |               |
 131 
 132     // note: if the link argument position changes, adjust
 133     //       the code in frame::entry_frame_call_wrapper()
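    // Illustrative sketch (not part of the generated code): C++ reaches this stub
    // through a function-pointer typedef whose parameters mirror the o0..o5 registers
    // and stack slots listed above.  The exact typedef lives in the shared JavaCalls
    // code; the shape and local names below are assumptions, shown for illustration:
    //
    //   typedef void (*CallStub)(address   link,            // o0
    //                            intptr_t* result,          // o1
    //                            int       result_type,     // o2 (a BasicType)
    //                            void*     method,          // o3
    //                            address   entry_point,     // o4
    //                            intptr_t* parameters,      // o5
    //                            int       parameter_size,  // [sp + 0x5c]
    //                            void*     thread);         // [sp + 0x60]
    //
    //   CallStub call_stub = CAST_TO_FN_PTR(CallStub, StubRoutines::call_stub());
    //   call_stub(wrapper, result_buf, T_INT, m, entry, params, nparams, thread);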
 134 
 135     const Argument link           = Argument(0, false); // used only for GC
 136     const Argument result         = Argument(1, false);
 137     const Argument result_type    = Argument(2, false);
 138     const Argument method         = Argument(3, false);
 139     const Argument entry_point    = Argument(4, false);
 140     const Argument parameters     = Argument(5, false);
 141     const Argument parameter_size = Argument(6, false);
 142     const Argument thread         = Argument(7, false);
 143 
 144     // setup thread register
 145     __ ld_ptr(thread.as_address(), G2_thread);
 146     __ reinit_heapbase();
 147 
 148 #ifdef ASSERT
 149     // make sure we have no pending exceptions
 150     { const Register t = G3_scratch;
 151       Label L;
 152       __ ld_ptr(G2_thread, in_bytes(Thread::pending_exception_offset()), t);
 153       __ br_null(t, false, Assembler::pt, L);
 154       __ stop("StubRoutines::call_stub: entered with pending exception");
 155       __ bind(L);
 156     }
 157 #endif
 158 
 159     // create activation frame & allocate space for parameters
 160     { const Register t = G3_scratch;
 161       __ ld_ptr(parameter_size.as_address(), t);                // get parameter size (in words)
 162       __ add(t, frame::memory_parameter_word_sp_offset, t);     // add space for save area (in words)
 163       __ round_to(t, WordsPerLong);                             // make sure it is a multiple of 2 (in words)
 164       __ sll(t, Interpreter::logStackElementSize, t);           // compute number of bytes
 165       __ neg(t);                                                // negate so it can be used with save
 166       __ save(SP, t, SP);                                       // setup new frame
 167     }
 168 
 169     // +---------------+ <--- sp + 0
 170     // |               |
 171     // . reg save area .
 172     // |               |
 173     // +---------------+ <--- sp + 0x40
 174     // |               |
 175     // . extra 7 slots .
 176     // |               |
 177     // +---------------+ <--- sp + 0x5c
 178     // |  empty slot   |      (only if parameter size is even)
 179     // +---------------+
 180     // |               |
 181     // .  parameters   .
 182     // |               |
 183     // +---------------+ <--- fp + 0
 184     // |               |
 185     // . reg save area .
 186     // |               |
 187     // +---------------+ <--- fp + 0x40
 188     // |               |
 189     // . extra 7 slots .
 190     // |               |
 191     // +---------------+ <--- fp + 0x5c
 192     // |  param. size  |
 193     // +---------------+ <--- fp + 0x60
 194     // |    thread     |
 195     // +---------------+
 196     // |               |
 197 
 198     // pass parameters if any
 199     BLOCK_COMMENT("pass parameters if any");
 200     { const Register src = parameters.as_in().as_register();
 201       const Register dst = Lentry_args;
 202       const Register tmp = G3_scratch;
 203       const Register cnt = G4_scratch;
 204 
 205       // test if there are any parameters & set up Lentry_args
 206       Label exit;
 207       __ ld_ptr(parameter_size.as_in().as_address(), cnt);      // parameter counter
 208       __ add( FP, STACK_BIAS, dst );
 209       __ tst(cnt);
 210       __ br(Assembler::zero, false, Assembler::pn, exit);
 211       __ delayed()->sub(dst, BytesPerWord, dst);                 // setup Lentry_args
 212 
 213       // copy parameters if any
 214       Label loop;
 215       __ BIND(loop);
 216       // Store parameter value
 217       __ ld_ptr(src, 0, tmp);
 218       __ add(src, BytesPerWord, src);
 219       __ st_ptr(tmp, dst, 0);
 220       __ deccc(cnt);
 221       __ br(Assembler::greater, false, Assembler::pt, loop);
 222       __ delayed()->sub(dst, Interpreter::stackElementSize, dst);
 223 
 224       // done
 225       __ BIND(exit);
 226     }
 227 
 228     // setup parameters, method & call Java function
 229 #ifdef ASSERT
 230     // layout_activation_impl checks its notion of saved SP against
 231     // this register, so if this changes, update it as well.
 232     const Register saved_SP = Lscratch;
 233     __ mov(SP, saved_SP);                               // keep track of SP before call
 234 #endif
 235 
 236     // setup parameters
 237     const Register t = G3_scratch;
 238     __ ld_ptr(parameter_size.as_in().as_address(), t); // get parameter size (in words)
 239     __ sll(t, Interpreter::logStackElementSize, t);    // compute number of bytes
 240     __ sub(FP, t, Gargs);                              // setup parameter pointer
 241 #ifdef _LP64
 242     __ add( Gargs, STACK_BIAS, Gargs );                // Account for LP64 stack bias
 243 #endif
 244     __ mov(SP, O5_savedSP);
 245 
 246 
 247     // do the call
 248     //
 249     // the following registers must be set up:
 250     //
 251     // G2_thread
 252     // G5_method
 253     // Gargs
 254     BLOCK_COMMENT("call Java function");
 255     __ jmpl(entry_point.as_in().as_register(), G0, O7);
 256     __ delayed()->mov(method.as_in().as_register(), G5_method);   // setup method
 257 
 258     BLOCK_COMMENT("call_stub_return_address:");
 259     return_pc = __ pc();
 260 
 261     // The callee, if it wasn't interpreted, can return with SP changed, so
 262     // we can no longer assert on the change of SP.
 263 
 264     // store result depending on type
 265     // (everything that is not T_OBJECT, T_LONG, T_FLOAT, or T_DOUBLE
 266     //  is treated as T_INT)
 267     { const Register addr = result     .as_in().as_register();
 268       const Register type = result_type.as_in().as_register();
 269       Label is_long, is_float, is_double, is_object, exit;
 270       __            cmp(type, T_OBJECT);  __ br(Assembler::equal, false, Assembler::pn, is_object);
 271       __ delayed()->cmp(type, T_FLOAT);   __ br(Assembler::equal, false, Assembler::pn, is_float);
 272       __ delayed()->cmp(type, T_DOUBLE);  __ br(Assembler::equal, false, Assembler::pn, is_double);
 273       __ delayed()->cmp(type, T_LONG);    __ br(Assembler::equal, false, Assembler::pn, is_long);
 274       __ delayed()->nop();
 275 
 276       // store int result
 277       __ st(O0, addr, G0);
 278 
 279       __ BIND(exit);
 280       __ ret();
 281       __ delayed()->restore();
 282 
 283       __ BIND(is_object);
 284       __ ba(exit, false);
 285       __ delayed()->st_ptr(O0, addr, G0);
 286 
 287       __ BIND(is_float);
 288       __ ba(exit, false);
 289       __ delayed()->stf(FloatRegisterImpl::S, F0, addr, G0);
 290 
 291       __ BIND(is_double);
 292       __ ba(exit, false);
 293       __ delayed()->stf(FloatRegisterImpl::D, F0, addr, G0);
 294 
 295       __ BIND(is_long);
 296 #ifdef _LP64
 297       __ ba(exit, false);
 298       __ delayed()->st_long(O0, addr, G0);      // store entire long
 299 #else
 300 #if defined(COMPILER2)
 301   // All return values are where we want them, except for longs.  C2 returns
 302   // longs in G1 in the 32-bit build, whereas the interpreter wants them in O0/O1.
 303   // Since the interpreter returns longs in both G1 and O0/O1 in the 32-bit
 304   // build, we simply always use G1.
 305   // Note: I tried to make C2 return longs in O0/O1 and G1 so we wouldn't have to
 306   // do this here. Unfortunately, if we did a rethrow we'd see a MachEpilog node
 307   // first, which would move G1 -> O0/O1 and destroy the exception we were throwing.
 308 
 309       __ ba(exit, false);
 310       __ delayed()->stx(G1, addr, G0);  // store entire long
 311 #else
 312       __ st(O1, addr, BytesPerInt);
 313       __ ba(exit, false);
 314       __ delayed()->st(O0, addr, G0);
 315 #endif /* COMPILER2 */
 316 #endif /* _LP64 */
 317      }
 318      return start;
 319   }
 320 
 321 
 322   //----------------------------------------------------------------------------------------------------
 323   // Return point for a Java call if there's an exception thrown in Java code.
 324   // The exception is caught and transformed into a pending exception stored in
 325   // JavaThread that can be tested from within the VM.
 326   //
 327   // Oexception: exception oop
 328 
 329   address generate_catch_exception() {
 330     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 331 
 332     address start = __ pc();
 333     // verify that thread corresponds
 334     __ verify_thread();
 335 
 336     const Register& temp_reg = Gtemp;
 337     Address pending_exception_addr    (G2_thread, Thread::pending_exception_offset());
 338     Address exception_file_offset_addr(G2_thread, Thread::exception_file_offset   ());
 339     Address exception_line_offset_addr(G2_thread, Thread::exception_line_offset   ());
 340 
 341     // set pending exception
 342     __ verify_oop(Oexception);
 343     __ st_ptr(Oexception, pending_exception_addr);
 344     __ set((intptr_t)__FILE__, temp_reg);
 345     __ st_ptr(temp_reg, exception_file_offset_addr);
 346     __ set((intptr_t)__LINE__, temp_reg);
 347     __ st(temp_reg, exception_line_offset_addr);
 348 
 349     // complete return to VM
 350     assert(StubRoutines::_call_stub_return_address != NULL, "must have been generated before");
 351 
 352     AddressLiteral stub_ret(StubRoutines::_call_stub_return_address);
 353     __ jump_to(stub_ret, temp_reg);
 354     __ delayed()->nop();
 355 
 356     return start;
 357   }
 358 
 359 
 360   //----------------------------------------------------------------------------------------------------
 361   // Continuation point for runtime calls returning with a pending exception
 362   // The pending exception check happened in the runtime or native call stub
 363   // The pending exception in Thread is converted into a Java-level exception
 364   //
 365   // Contract with Java-level exception handler: O0 = exception
 366   //                                             O1 = throwing pc
 367 
 368   address generate_forward_exception() {
 369     StubCodeMark mark(this, "StubRoutines", "forward_exception");
 370     address start = __ pc();
 371 
 372     // Upon entry, O7 has the return address returning into Java
 373     // (interpreted or compiled) code; i.e. the return address
 374     // becomes the throwing pc.
 375 
 376     const Register& handler_reg = Gtemp;
 377 
 378     Address exception_addr(G2_thread, Thread::pending_exception_offset());
 379 
 380 #ifdef ASSERT
 381     // make sure that this code is only executed if there is a pending exception
 382     { Label L;
 383       __ ld_ptr(exception_addr, Gtemp);
 384       __ br_notnull(Gtemp, false, Assembler::pt, L);
 385       __ stop("StubRoutines::forward exception: no pending exception (1)");
 386       __ bind(L);
 387     }
 388 #endif
 389 
 390     // compute exception handler into handler_reg
 391     __ get_thread();
 392     __ ld_ptr(exception_addr, Oexception);
 393     __ verify_oop(Oexception);
 394     __ save_frame(0);             // compensates for compiler weakness
 395     __ add(O7->after_save(), frame::pc_return_offset, Lscratch); // save the issuing PC
 396     BLOCK_COMMENT("call exception_handler_for_return_address");
 397     __ call_VM_leaf(L7_thread_cache, CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), G2_thread, Lscratch);
 398     __ mov(O0, handler_reg);
 399     __ restore();                 // compensates for compiler weakness
 400 
 401     __ ld_ptr(exception_addr, Oexception);
 402     __ add(O7, frame::pc_return_offset, Oissuing_pc); // save the issuing PC
 403 
 404 #ifdef ASSERT
 405     // make sure exception is set
 406     { Label L;
 407       __ br_notnull(Oexception, false, Assembler::pt, L);
 408       __ stop("StubRoutines::forward exception: no pending exception (2)");
 409       __ bind(L);
 410     }
 411 #endif
 412     // jump to exception handler
 413     __ jmp(handler_reg, 0);
 414     // clear pending exception
 415     __ delayed()->st_ptr(G0, exception_addr);
 416 
 417     return start;
 418   }
 419 
 420 
 421   //------------------------------------------------------------------------------------------------------------------------
 422   // Continuation point for throwing of implicit exceptions that are not handled in
 423   // the current activation. Fabricates an exception oop and initiates normal
 424   // exception dispatching in this frame. Only callee-saved registers are preserved
 425   // (through the normal register window / RegisterMap handling).
 426   // If the compiler needs all registers to be preserved between the fault
 427   // point and the exception handler then it must assume responsibility for that in
 428   // AbstractCompiler::continuation_for_implicit_null_exception or
 429   // continuation_for_implicit_division_by_zero_exception. All other implicit
 430   // exceptions (e.g., NullPointerException or AbstractMethodError on entry) are
 431   // either at call sites or otherwise assume that stack unwinding will be initiated,
 432   // so caller saved registers were assumed volatile in the compiler.
 433 
 434   // Note that we generate only this stub into a RuntimeStub, because it needs to be
 435   // properly traversed and ignored during GC, so we change the meaning of the "__"
 436   // macro within this method.
 437 #undef __
 438 #define __ masm->
 439 
 440   address generate_throw_exception(const char* name, address runtime_entry, bool restore_saved_exception_pc,
 441                                    Register arg1 = noreg, Register arg2 = noreg) {
 442 #ifdef ASSERT
 443     int insts_size = VerifyThread ? 1 * K : 600;
 444 #else
 445     int insts_size = VerifyThread ? 1 * K : 256;
 446 #endif /* ASSERT */
 447     int locs_size  = 32;
 448 
 449     CodeBuffer      code(name, insts_size, locs_size);
 450     MacroAssembler* masm = new MacroAssembler(&code);
 451 
 452     __ verify_thread();
 453 
 454     // This is an inlined and slightly modified version of call_VM
 455     // which has the ability to fetch the return PC out of thread-local storage
 456     __ assert_not_delayed();
 457 
 458     // Note that we always push a frame because on the SPARC
 459     // architecture, for all of our implicit exception kinds at call
 460     // sites, the implicit exception is taken before the callee frame
 461     // is pushed.
 462     __ save_frame(0);
 463 
 464     int frame_complete = __ offset();
 465 
 466     if (restore_saved_exception_pc) {
 467       __ ld_ptr(G2_thread, JavaThread::saved_exception_pc_offset(), I7);
 468       __ sub(I7, frame::pc_return_offset, I7);
 469     }
 470 
 471     // Note that we always have a runtime stub frame on the top of stack by this point
 472     Register last_java_sp = SP;
 473     // 64-bit last_java_sp is biased!
 474     __ set_last_Java_frame(last_java_sp, G0);
 475     if (VerifyThread)  __ mov(G2_thread, O0); // about to be smashed; pass early
 476     __ save_thread(noreg);
 477     if (arg1 != noreg) {
 478       assert(arg2 != O1, "clobbered");
 479       __ mov(arg1, O1);
 480     }
 481     if (arg2 != noreg) {
 482       __ mov(arg2, O2);
 483     }
 484     // do the call
 485     BLOCK_COMMENT("call runtime_entry");
 486     __ call(runtime_entry, relocInfo::runtime_call_type);
 487     if (!VerifyThread)
 488       __ delayed()->mov(G2_thread, O0);  // pass thread as first argument
 489     else
 490       __ delayed()->nop();             // (thread already passed)
 491     __ restore_thread(noreg);
 492     __ reset_last_Java_frame();
 493 
 494     // check for pending exceptions. use Gtemp as scratch register.
 495 #ifdef ASSERT
 496     Label L;
 497 
 498     Address exception_addr(G2_thread, Thread::pending_exception_offset());
 499     Register scratch_reg = Gtemp;
 500     __ ld_ptr(exception_addr, scratch_reg);
 501     __ br_notnull(scratch_reg, false, Assembler::pt, L);
 502     __ should_not_reach_here();
 503     __ bind(L);
 504 #endif // ASSERT
 505     BLOCK_COMMENT("call forward_exception_entry");
 506     __ call(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type);
 507     // we use O7 linkage so that forward_exception_entry has the issuing PC
 508     __ delayed()->restore();
 509 
 510     RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, masm->total_frame_size_in_bytes(0), NULL, false);
 511     return stub->entry_point();
 512   }
 513 
 514 #undef __
 515 #define __ _masm->
 516 
 517 
 518   // Generate a routine that sets all the registers so we
 519   // can tell if the stop routine prints them correctly.
 520   address generate_test_stop() {
 521     StubCodeMark mark(this, "StubRoutines", "test_stop");
 522     address start = __ pc();
 523 
 524     int i;
 525 
 526     __ save_frame(0);
 527 
 528     static jfloat zero = 0.0, one = 1.0;
 529 
 530     // put addr in L0, then load through L0 to F0
 531     __ set((intptr_t)&zero, L0);  __ ldf( FloatRegisterImpl::S, L0, 0, F0);
 532     __ set((intptr_t)&one,  L0);  __ ldf( FloatRegisterImpl::S, L0, 0, F1); // 1.0 to F1
 533 
 534     // use add to put 2..18 in F2..F18
 535     for ( i = 2;  i <= 18;  ++i ) {
 536       __ fadd( FloatRegisterImpl::S, F1, as_FloatRegister(i-1),  as_FloatRegister(i));
 537     }
 538 
 539     // Now put double 2 in F16, double 18 in F18
 540     __ ftof( FloatRegisterImpl::S, FloatRegisterImpl::D, F2, F16 );
 541     __ ftof( FloatRegisterImpl::S, FloatRegisterImpl::D, F18, F18 );
 542 
 543     // use add to put 20..32 in F20..F32
 544     for (i = 20; i < 32; i += 2) {
 545       __ fadd( FloatRegisterImpl::D, F16, as_FloatRegister(i-2),  as_FloatRegister(i));
 546     }
 547 
 548     // put 0..7 in i's, 8..15 in l's, 16..23 in o's, 24..31 in g's
 549     for ( i = 0; i < 8; ++i ) {
 550       if (i < 6) {
 551         __ set(     i, as_iRegister(i));
 552         __ set(16 + i, as_oRegister(i));
 553         __ set(24 + i, as_gRegister(i));
 554       }
 555       __ set( 8 + i, as_lRegister(i));
 556     }
 557 
 558     __ stop("testing stop");
 559 
 560 
 561     __ ret();
 562     __ delayed()->restore();
 563 
 564     return start;
 565   }
 566 
 567 
 568   address generate_stop_subroutine() {
 569     StubCodeMark mark(this, "StubRoutines", "stop_subroutine");
 570     address start = __ pc();
 571 
 572     __ stop_subroutine();
 573 
 574     return start;
 575   }
 576 
 577   address generate_flush_callers_register_windows() {
 578     StubCodeMark mark(this, "StubRoutines", "flush_callers_register_windows");
 579     address start = __ pc();
 580 
 581     __ flush_windows();
 582     __ retl(false);
 583     __ delayed()->add( FP, STACK_BIAS, O0 );
 584     // The returned value must be a stack pointer whose register save area
 585     // is flushed, and will stay flushed while the caller executes.
 586 
 587     return start;
 588   }
 589 
 590   // Helper functions for v8 atomic operations.
 591   //
 592   void get_v8_oop_lock_ptr(Register lock_ptr_reg, Register mark_oop_reg, Register scratch_reg) {
 593     if (mark_oop_reg == noreg) {
 594       address lock_ptr = (address)StubRoutines::Sparc::atomic_memory_operation_lock_addr();
 595       __ set((intptr_t)lock_ptr, lock_ptr_reg);
 596     } else {
 597       assert(scratch_reg != noreg, "just checking");
 598       address lock_ptr = (address)StubRoutines::Sparc::_v8_oop_lock_cache;
 599       __ set((intptr_t)lock_ptr, lock_ptr_reg);
 600       __ and3(mark_oop_reg, StubRoutines::Sparc::v8_oop_lock_mask_in_place, scratch_reg);
 601       __ add(lock_ptr_reg, scratch_reg, lock_ptr_reg);
 602     }
 603   }
 604 
 605   void generate_v8_lock_prologue(Register lock_reg, Register lock_ptr_reg, Register yield_reg, Label& retry, Label& dontyield, Register mark_oop_reg = noreg, Register scratch_reg = noreg) {
 606 
 607     get_v8_oop_lock_ptr(lock_ptr_reg, mark_oop_reg, scratch_reg);
 608     __ set(StubRoutines::Sparc::locked, lock_reg);
 609     // Initialize yield counter
 610     __ mov(G0,yield_reg);
 611 
 612     __ BIND(retry);
 613     __ cmp_and_br(yield_reg, V8AtomicOperationUnderLockSpinCount, Assembler::less, false, Assembler::pt, dontyield);
 614 
 615     // This code can only be called from inside the VM; this
 616     // stub is only invoked from Atomic::add().  We do not
 617     // want to use call_VM, because _last_java_sp and such
 618     // must already be set.
 619     //
 620     // Save the regs and make space for a C call
 621     __ save(SP, -96, SP);
 622     __ save_all_globals_into_locals();
 623     BLOCK_COMMENT("call os::naked_sleep");
 624     __ call(CAST_FROM_FN_PTR(address, os::naked_sleep));
 625     __ delayed()->nop();
 626     __ restore_globals_from_locals();
 627     __ restore();
 628     // reset the counter
 629     __ mov(G0,yield_reg);
 630 
 631     __ BIND(dontyield);
 632 
 633     // try to get lock
 634     __ swap(lock_ptr_reg, 0, lock_reg);
 635 
 636     // did we get the lock?
 637     __ cmp(lock_reg, StubRoutines::Sparc::unlocked);
 638     __ br(Assembler::notEqual, true, Assembler::pn, retry);
 639     __ delayed()->add(yield_reg,1,yield_reg);
 640 
 641     // yes, got lock. do the operation here.
 642   }
 643 
 644   void generate_v8_lock_epilogue(Register lock_reg, Register lock_ptr_reg, Register yield_reg, Label& retry, Label& dontyield, Register mark_oop_reg = noreg, Register scratch_reg = noreg) {
 645     __ st(lock_reg, lock_ptr_reg, 0); // unlock
 646   }
 647 
 648   // Support for jint Atomic::xchg(jint exchange_value, volatile jint* dest).
 649   //
 650   // Arguments :
 651   //
 652   //      exchange_value: O0
 653   //      dest:           O1
 654   //
 655   // Results:
 656   //
 657   //     O0: the value previously stored in dest
 658   //
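  // Illustrative sketch (not part of this file): the UseCASForSwap path below is the
  // usual compare-and-swap retry loop, roughly equivalent to the following C++ (the
  // GCC/Clang __sync builtin is used purely for illustration; HotSpot itself goes
  // through this stub):
  //
  //   static int xchg_via_cas(int exchange_value, volatile int* dest) {
  //     int observed;
  //     do {
  //       observed = *dest;                       // observe the previous value
  //     } while (__sync_val_compare_and_swap(dest, observed, exchange_value) != observed);
  //     return observed;                          // previous value, as returned in O0
  //   }
  //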
 659   address generate_atomic_xchg() {
 660     StubCodeMark mark(this, "StubRoutines", "atomic_xchg");
 661     address start = __ pc();
 662 
 663     if (UseCASForSwap) {
 664       // Use CAS instead of swap, just in case the MP hardware
 665       // prefers to work with just one kind of synch. instruction.
 666       Label retry;
 667       __ BIND(retry);
 668       __ mov(O0, O3);       // scratch copy of exchange value
 669       __ ld(O1, 0, O2);     // observe the previous value
 670       // try to replace O2 with O3
 671       __ cas_under_lock(O1, O2, O3,
 672       (address)StubRoutines::Sparc::atomic_memory_operation_lock_addr(),false);
 673       __ cmp_and_br(O2, O3, Assembler::notEqual, false, Assembler::pn, retry);
 674 
 675       __ retl(false);
 676       __ delayed()->mov(O2, O0);  // report previous value to caller
 677 
 678     } else {
 679       if (VM_Version::v9_instructions_work()) {
 680         __ retl(false);
 681         __ delayed()->swap(O1, 0, O0);
 682       } else {
 683         const Register& lock_reg = O2;
 684         const Register& lock_ptr_reg = O3;
 685         const Register& yield_reg = O4;
 686 
 687         Label retry;
 688         Label dontyield;
 689 
 690         generate_v8_lock_prologue(lock_reg, lock_ptr_reg, yield_reg, retry, dontyield);
 691         // got the lock, do the swap
 692         __ swap(O1, 0, O0);
 693 
 694         generate_v8_lock_epilogue(lock_reg, lock_ptr_reg, yield_reg, retry, dontyield);
 695         __ retl(false);
 696         __ delayed()->nop();
 697       }
 698     }
 699 
 700     return start;
 701   }
 702 
 703 
 704   // Support for jint Atomic::cmpxchg(jint exchange_value, volatile jint* dest, jint compare_value)
 705   //
 706   // Arguments :
 707   //
 708   //      exchange_value: O0
 709   //      dest:           O1
 710   //      compare_value:  O2
 711   //
 712   // Results:
 713   //
 714   //     O0: the value previously stored in dest
 715   //
 716   // Overwrites (v8): O3,O4,O5
 717   //
 718   address generate_atomic_cmpxchg() {
 719     StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg");
 720     address start = __ pc();
 721 
 722     // cmpxchg(dest, compare_value, exchange_value)
 723     __ cas_under_lock(O1, O2, O0,
 724       (address)StubRoutines::Sparc::atomic_memory_operation_lock_addr(),false);
 725     __ retl(false);
 726     __ delayed()->nop();
 727 
 728     return start;
 729   }
 730 
 731   // Support for jlong Atomic::cmpxchg(jlong exchange_value, volatile jlong *dest, jlong compare_value)
 732   //
 733   // Arguments :
 734   //
 735   //      exchange_value: O1:O0
 736   //      dest:           O2
 737   //      compare_value:  O4:O3
 738   //
 739   // Results:
 740   //
 741   //     O1:O0: the value previously stored in dest
 742   //
 743   // This only works on V9; on V8 we don't generate any
 744   // code and just return NULL.
 745   //
 746   // Overwrites: G1,G2,G3
 747   //
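  // Illustrative sketch (not part of this file): the sllx/srl/or3 sequences below just
  // pack two 32-bit register halves into one 64-bit value before the casx, and the
  // final srl/srlx pair unpacks the result back into O1:O0.  Roughly:
  //
  //   static inline jlong pack64(jint high, jint low) {
  //     return ((jlong)high << 32) | ((jlong)(juint)low);   // zero-extend the low half
  //   }
  //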
 748   address generate_atomic_cmpxchg_long() {
 749     StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_long");
 750     address start = __ pc();
 751 
 752     if (!VM_Version::supports_cx8())
 753       return NULL;
 754     __ sllx(O0, 32, O0);
 755     __ srl(O1, 0, O1);
 756     __ or3(O0,O1,O0);      // O0 holds 64-bit value from exchange_value
 757     __ sllx(O3, 32, O3);
 758     __ srl(O4, 0, O4);
 759     __ or3(O3,O4,O3);     // O3 holds 64-bit value from compare_value
 760     __ casx(O2, O3, O0);
 761     __ srl(O0, 0, O1);    // unpacked return value in O1:O0
 762     __ retl(false);
 763     __ delayed()->srlx(O0, 32, O0);
 764 
 765     return start;
 766   }
 767 
 768 
 769   // Support for jint Atomic::add(jint add_value, volatile jint* dest).
 770   //
 771   // Arguments :
 772   //
 773   //      add_value: O0   (e.g., +1 or -1)
 774   //      dest:      O1
 775   //
 776   // Results:
 777   //
 778   //     O0: the new value stored in dest
 779   //
 780   // Overwrites (v9): O3
 781   // Overwrites (v8): O3,O4,O5
 782   //
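  // Illustrative sketch (not part of this file): the v9 path below is the standard CAS
  // retry loop for an atomic add, roughly equivalent to the following C++ (the
  // GCC/Clang __sync builtin is used purely for illustration):
  //
  //   static int add_via_cas(int add_value, volatile int* dest) {
  //     int old_value, new_value;
  //     do {
  //       old_value = *dest;
  //       new_value = old_value + add_value;
  //     } while (__sync_val_compare_and_swap(dest, old_value, new_value) != old_value);
  //     return new_value;                         // the new value, as returned in O0
  //   }
  //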
 783   address generate_atomic_add() {
 784     StubCodeMark mark(this, "StubRoutines", "atomic_add");
 785     address start = __ pc();
 786     __ BIND(_atomic_add_stub);
 787 
 788     if (VM_Version::v9_instructions_work()) {
 789       Label retry;
 790       __ BIND(retry);
 791 
 792       __ lduw(O1, 0, O2);
 793       __ add(O0, O2, O3);
 794       __ cas(O1, O2, O3);
 795       __ cmp_and_br(O2, O3, Assembler::notEqual, false, Assembler::pn, retry);
 796       __ retl(false);
 797       __ delayed()->add(O0, O2, O0); // note that cas made O2==O3
 798     } else {
 799       const Register& lock_reg = O2;
 800       const Register& lock_ptr_reg = O3;
 801       const Register& value_reg = O4;
 802       const Register& yield_reg = O5;
 803 
 804       Label retry;
 805       Label dontyield;
 806 
 807       generate_v8_lock_prologue(lock_reg, lock_ptr_reg, yield_reg, retry, dontyield);
 808       // got lock, do the increment
 809       __ ld(O1, 0, value_reg);
 810       __ add(O0, value_reg, value_reg);
 811       __ st(value_reg, O1, 0);
 812 
 813       // %%% only for RMO and PSO
 814       __ membar(Assembler::StoreStore);
 815 
 816       generate_v8_lock_epilogue(lock_reg, lock_ptr_reg, yield_reg, retry, dontyield);
 817 
 818       __ retl(false);
 819       __ delayed()->mov(value_reg, O0);
 820     }
 821 
 822     return start;
 823   }
 824   Label _atomic_add_stub;  // called from other stubs
 825 
 826 
 827   //------------------------------------------------------------------------------------------------------------------------
 828   // The following routine generates a subroutine to throw an asynchronous
 829   // UnknownError when an unsafe access gets a fault that could not be
 830   // reasonably prevented by the programmer.  (Example: SIGBUS/OBJERR.)
 831   //
 832   // Arguments :
 833   //
 834   //      trapping PC:    O7
 835   //
 836   // Results:
 837   //     posts an asynchronous exception, skips the trapping instruction
 838   //
 839 
 840   address generate_handler_for_unsafe_access() {
 841     StubCodeMark mark(this, "StubRoutines", "handler_for_unsafe_access");
 842     address start = __ pc();
 843 
 844     const int preserve_register_words = (64 * 2);
 845     Address preserve_addr(FP, (-preserve_register_words * wordSize) + STACK_BIAS);
 846 
 847     Register Lthread = L7_thread_cache;
 848     int i;
 849 
 850     __ save_frame(0);
 851     __ mov(G1, L1);
 852     __ mov(G2, L2);
 853     __ mov(G3, L3);
 854     __ mov(G4, L4);
 855     __ mov(G5, L5);
 856     for (i = 0; i < (VM_Version::v9_instructions_work() ? 64 : 32); i += 2) {
 857       __ stf(FloatRegisterImpl::D, as_FloatRegister(i), preserve_addr, i * wordSize);
 858     }
 859 
 860     address entry_point = CAST_FROM_FN_PTR(address, handle_unsafe_access);
 861     BLOCK_COMMENT("call handle_unsafe_access");
 862     __ call(entry_point, relocInfo::runtime_call_type);
 863     __ delayed()->nop();
 864 
 865     __ mov(L1, G1);
 866     __ mov(L2, G2);
 867     __ mov(L3, G3);
 868     __ mov(L4, G4);
 869     __ mov(L5, G5);
 870     for (i = 0; i < (VM_Version::v9_instructions_work() ? 64 : 32); i += 2) {
 871       __ ldf(FloatRegisterImpl::D, preserve_addr, as_FloatRegister(i), i * wordSize);
 872     }
 873 
 874     __ verify_thread();
 875 
 876     __ jmp(O0, 0);
 877     __ delayed()->restore();
 878 
 879     return start;
 880   }
 881 
 882 
 883   // Support for uint StubRoutine::Sparc::partial_subtype_check( Klass sub, Klass super );
 884   // Arguments :
 885   //
 886   //      ret  : O0, returned
 887   //      icc/xcc: set as O0 (depending on wordSize)
 888   //      sub  : O1, argument, not changed
 889   //      super: O2, argument, not changed
 890   //      raddr: O7, blown by call
 891   address generate_partial_subtype_check() {
 892     __ align(CodeEntryAlignment);
 893     StubCodeMark mark(this, "StubRoutines", "partial_subtype_check");
 894     address start = __ pc();
 895     Label miss;
 896 
 897 #if defined(COMPILER2) && !defined(_LP64)
 898     // Do not use a 'save' because it blows the 64-bit O registers.
 899     __ add(SP,-4*wordSize,SP);  // Make space for 4 temps (stack must be 2 words aligned)
 900     __ st_ptr(L0,SP,(frame::register_save_words+0)*wordSize);
 901     __ st_ptr(L1,SP,(frame::register_save_words+1)*wordSize);
 902     __ st_ptr(L2,SP,(frame::register_save_words+2)*wordSize);
 903     __ st_ptr(L3,SP,(frame::register_save_words+3)*wordSize);
 904     Register Rret   = O0;
 905     Register Rsub   = O1;
 906     Register Rsuper = O2;
 907 #else
 908     __ save_frame(0);
 909     Register Rret   = I0;
 910     Register Rsub   = I1;
 911     Register Rsuper = I2;
 912 #endif
 913 
 914     Register L0_ary_len = L0;
 915     Register L1_ary_ptr = L1;
 916     Register L2_super   = L2;
 917     Register L3_index   = L3;
 918 
 919     __ check_klass_subtype_slow_path(Rsub, Rsuper,
 920                                      L0, L1, L2, L3,
 921                                      NULL, &miss);
 922 
 923     // Match falls through here.
 924     __ addcc(G0,0,Rret);        // set Z flags, Z result
 925 
 926 #if defined(COMPILER2) && !defined(_LP64)
 927     __ ld_ptr(SP,(frame::register_save_words+0)*wordSize,L0);
 928     __ ld_ptr(SP,(frame::register_save_words+1)*wordSize,L1);
 929     __ ld_ptr(SP,(frame::register_save_words+2)*wordSize,L2);
 930     __ ld_ptr(SP,(frame::register_save_words+3)*wordSize,L3);
 931     __ retl();                  // Result in Rret is zero; flags set to Z
 932     __ delayed()->add(SP,4*wordSize,SP);
 933 #else
 934     __ ret();                   // Result in Rret is zero; flags set to Z
 935     __ delayed()->restore();
 936 #endif
 937 
 938     __ BIND(miss);
 939     __ addcc(G0,1,Rret);        // set NZ flags, NZ result
 940 
 941 #if defined(COMPILER2) && !defined(_LP64)
 942     __ ld_ptr(SP,(frame::register_save_words+0)*wordSize,L0);
 943     __ ld_ptr(SP,(frame::register_save_words+1)*wordSize,L1);
 944     __ ld_ptr(SP,(frame::register_save_words+2)*wordSize,L2);
 945     __ ld_ptr(SP,(frame::register_save_words+3)*wordSize,L3);
 946     __ retl();                  // Result in Rret is != 0; flags set to NZ
 947     __ delayed()->add(SP,4*wordSize,SP);
 948 #else
 949     __ ret();                   // Result in Rret is != 0; flags set to NZ
 950     __ delayed()->restore();
 951 #endif
 952 
 953     return start;
 954   }
 955 
 956 
 957   // Called from MacroAssembler::verify_oop
 958   //
 959   address generate_verify_oop_subroutine() {
 960     StubCodeMark mark(this, "StubRoutines", "verify_oop_stub");
 961 
 962     address start = __ pc();
 963 
 964     __ verify_oop_subroutine();
 965 
 966     return start;
 967   }
 968 
 969 
 970   //
 971   // Verify that a register contains a clean 32-bit positive value
 972   // (high 32 bits are 0) so it can be used in 64-bit shifts (sllx, srax).
 973   //
 974   //  Input:
 975   //    Rint  -  32-bits value
 976   //    Rtmp  -  scratch
 977   //
 978   void assert_clean_int(Register Rint, Register Rtmp) {
 979 #if defined(ASSERT) && defined(_LP64)
 980     __ signx(Rint, Rtmp);
 981     __ cmp(Rint, Rtmp);
 982     __ breakpoint_trap(Assembler::notEqual, Assembler::xcc);
 983 #endif
 984   }
 985 
 986   //
 987   //  Generate overlap test for array copy stubs
 988   //
 989   //  Input:
 990   //    O0    -  array1
 991   //    O1    -  array2
 992   //    O2    -  element count
 993   //
 994   //  Kills temps:  O3, O4
 995   //
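  //  Illustrative sketch (not part of this file): the generated test branches to the
  //  disjoint ("no overlap") copy whenever a forward copy cannot clobber unread source
  //  elements, roughly:
  //
  //    if ((uintptr_t)to <= (uintptr_t)from ||
  //        (uintptr_t)(to - from) >= ((uintptr_t)count << log2_elem_size)) {
  //      goto no_overlap;   // a forward (disjoint) copy is safe
  //    }
  //    // otherwise fall through to the backward (conjoint) copy code
  //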
 996   void array_overlap_test(address no_overlap_target, int log2_elem_size) {
 997     assert(no_overlap_target != NULL, "must be generated");
 998     array_overlap_test(no_overlap_target, NULL, log2_elem_size);
 999   }
1000   void array_overlap_test(Label& L_no_overlap, int log2_elem_size) {
1001     array_overlap_test(NULL, &L_no_overlap, log2_elem_size);
1002   }
1003   void array_overlap_test(address no_overlap_target, Label* NOLp, int log2_elem_size) {
1004     const Register from       = O0;
1005     const Register to         = O1;
1006     const Register count      = O2;
1007     const Register to_from    = O3; // to - from
1008     const Register byte_count = O4; // count << log2_elem_size
1009 
1010       __ subcc(to, from, to_from);
1011       __ sll_ptr(count, log2_elem_size, byte_count);
1012       if (NOLp == NULL)
1013         __ brx(Assembler::lessEqualUnsigned, false, Assembler::pt, no_overlap_target);
1014       else
1015         __ brx(Assembler::lessEqualUnsigned, false, Assembler::pt, (*NOLp));
1016       __ delayed()->cmp(to_from, byte_count);
1017       if (NOLp == NULL)
1018         __ brx(Assembler::greaterEqualUnsigned, false, Assembler::pt, no_overlap_target);
1019       else
1020         __ brx(Assembler::greaterEqualUnsigned, false, Assembler::pt, (*NOLp));
1021       __ delayed()->nop();
1022   }
1023 
1024   //
1025   //  Generate pre-write barrier for array.
1026   //
1027   //  Input:
1028   //     addr     - register containing starting address
1029   //     count    - register containing element count
1030   //     tmp      - scratch register
1031   //
1032   //  The input registers are overwritten.
1033   //
1034   void gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
1035     BarrierSet* bs = Universe::heap()->barrier_set();
1036     switch (bs->kind()) {
1037       case BarrierSet::G1SATBCT:
1038       case BarrierSet::G1SATBCTLogging:
1039         // With G1, don't generate the call if we statically know that the target is uninitialized
1040         if (!dest_uninitialized) {
1041           __ save_frame(0);
1042           // Save the necessary global regs... will be used after.
1043           if (addr->is_global()) {
1044             __ mov(addr, L0);
1045           }
1046           if (count->is_global()) {
1047             __ mov(count, L1);
1048           }
1049           __ mov(addr->after_save(), O0);
1050           // Get the count into O1
1051           __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre));
1052           __ delayed()->mov(count->after_save(), O1);
1053           if (addr->is_global()) {
1054             __ mov(L0, addr);
1055           }
1056           if (count->is_global()) {
1057             __ mov(L1, count);
1058           }
1059           __ restore();
1060         }
1061         break;
1062       case BarrierSet::CardTableModRef:
1063       case BarrierSet::CardTableExtension:
1064       case BarrierSet::ModRef:
1065         break;
1066       default:
1067         ShouldNotReachHere();
1068     }
1069   }
1070   //
1071   //  Generate post-write barrier for array.
1072   //
1073   //  Input:
1074   //     addr     - register containing starting address
1075   //     count    - register containing element count
1076   //     tmp      - scratch register
1077   //
1078   //  The input registers are overwritten.
1079   //
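  //  Illustrative sketch (not part of this file): for the card-table cases below, the
  //  generated loop just dirties every card spanned by the copied oop range (the stub
  //  stores G0, i.e. zero, to mark the card).  Roughly:
  //
  //    jbyte*    base  = ct->byte_map_base;
  //    uintptr_t first = (uintptr_t)addr >> CardTableModRefBS::card_shift;
  //    uintptr_t last  = ((uintptr_t)addr + count * BytesPerHeapOop - 1)
  //                      >> CardTableModRefBS::card_shift;
  //    for (uintptr_t c = first; c <= last; c++)  base[c] = 0;
  //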
1080   void gen_write_ref_array_post_barrier(Register addr, Register count,
1081                                         Register tmp) {
1082     BarrierSet* bs = Universe::heap()->barrier_set();
1083 
1084     switch (bs->kind()) {
1085       case BarrierSet::G1SATBCT:
1086       case BarrierSet::G1SATBCTLogging:
1087         {
1088           // Get some new fresh output registers.
1089           __ save_frame(0);
1090           __ mov(addr->after_save(), O0);
1091           __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post));
1092           __ delayed()->mov(count->after_save(), O1);
1093           __ restore();
1094         }
1095         break;
1096       case BarrierSet::CardTableModRef:
1097       case BarrierSet::CardTableExtension:
1098         {
1099           CardTableModRefBS* ct = (CardTableModRefBS*)bs;
1100           assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
1101           assert_different_registers(addr, count, tmp);
1102 
1103           Label L_loop;
1104 
1105           __ sll_ptr(count, LogBytesPerHeapOop, count);
1106           __ sub(count, BytesPerHeapOop, count);
1107           __ add(count, addr, count);
1108           // Use two shifts to clear out those low order two bits! (Cannot opt. into 1.)
1109           __ srl_ptr(addr, CardTableModRefBS::card_shift, addr);
1110           __ srl_ptr(count, CardTableModRefBS::card_shift, count);
1111           __ sub(count, addr, count);
1112           AddressLiteral rs(ct->byte_map_base);
1113           __ set(rs, tmp);
1114         __ BIND(L_loop);
1115           __ stb(G0, tmp, addr);
1116           __ subcc(count, 1, count);
1117           __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
1118           __ delayed()->add(addr, 1, addr);
1119         }
1120         break;
1121       case BarrierSet::ModRef:
1122         break;
1123       default:
1124         ShouldNotReachHere();
1125     }
1126   }
1127 
1128 
1129   // Copy big chunks forward with shift
1130   //
1131   // Inputs:
1132   //   from      - source array address
1133   //   to        - destination array address, aligned to 8 bytes
1134   //   count     - element count to copy, >= the count equivalent to 16 bytes
1135   //   count_dec - element count decrement equivalent to 16 bytes
1136   //   L_copy_bytes - copy exit label
1137   //
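  //  Illustrative sketch (not part of this file): when 'from' and 'to' disagree mod 8,
  //  the loop below reads aligned 8-byte words and realigns them in registers; each
  //  stored word is formed roughly as
  //
  //    uint64_t stored = (prev << left_shift) | (next >> right_shift);
  //
  //  with left_shift = (from & 7) * 8, right_shift = 64 - left_shift, and prev/next
  //  being two consecutive aligned 8-byte loads (SPARC is big-endian, hence the shift
  //  directions).  This path is only taken when the misalignment is non-zero, so both
  //  shift counts stay in the range 8..56.
  //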
1138   void copy_16_bytes_forward_with_shift(Register from, Register to,
1139                      Register count, int count_dec, Label& L_copy_bytes) {
1140     Label L_loop, L_aligned_copy, L_copy_last_bytes;
1141 
1142     // if both arrays have the same alignment mod 8, do 8 bytes aligned copy
1143       __ andcc(from, 7, G1); // misaligned bytes
1144       __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
1145       __ delayed()->nop();
1146 
1147     const Register left_shift  = G1; // left  shift bit counter
1148     const Register right_shift = G5; // right shift bit counter
1149 
1150       __ sll(G1, LogBitsPerByte, left_shift);
1151       __ mov(64, right_shift);
1152       __ sub(right_shift, left_shift, right_shift);
1153 
1154     //
1155     // Load 2 aligned 8-byte chunks and use one from the previous iteration
1156     // to form 2 aligned 8-byte chunks to store.
1157     //
1158       __ deccc(count, count_dec); // Pre-decrement 'count'
1159       __ andn(from, 7, from);     // Align address
1160       __ ldx(from, 0, O3);
1161       __ inc(from, 8);
1162       __ align(OptoLoopAlignment);
1163     __ BIND(L_loop);
1164       __ ldx(from, 0, O4);
1165       __ deccc(count, count_dec); // Can we do next iteration after this one?
1166       __ ldx(from, 8, G4);
1167       __ inc(to, 16);
1168       __ inc(from, 16);
1169       __ sllx(O3, left_shift,  O3);
1170       __ srlx(O4, right_shift, G3);
1171       __ bset(G3, O3);
1172       __ stx(O3, to, -16);
1173       __ sllx(O4, left_shift,  O4);
1174       __ srlx(G4, right_shift, G3);
1175       __ bset(G3, O4);
1176       __ stx(O4, to, -8);
1177       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
1178       __ delayed()->mov(G4, O3);
1179 
1180       __ inccc(count, count_dec>>1 ); // + 8 bytes
1181       __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
1182       __ delayed()->inc(count, count_dec>>1); // restore 'count'
1183 
1184       // copy 8 bytes, part of them already loaded in O3
1185       __ ldx(from, 0, O4);
1186       __ inc(to, 8);
1187       __ inc(from, 8);
1188       __ sllx(O3, left_shift,  O3);
1189       __ srlx(O4, right_shift, G3);
1190       __ bset(O3, G3);
1191       __ stx(G3, to, -8);
1192 
1193     __ BIND(L_copy_last_bytes);
1194       __ srl(right_shift, LogBitsPerByte, right_shift); // misaligned bytes
1195       __ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
1196       __ delayed()->sub(from, right_shift, from);       // restore address
1197 
1198     __ BIND(L_aligned_copy);
1199   }
1200 
1201   // Copy big chunks backward with shift
1202   //
1203   // Inputs:
1204   //   end_from  - source array end address
1205   //   end_to    - destination array end address, aligned to 8 bytes
1206   //   count     - element count to copy, >= the count equivalent to 16 bytes
1207   //   count_dec - element count decrement equivalent to 16 bytes
1208   //   L_aligned_copy - aligned copy exit label
1209   //   L_copy_bytes   - copy exit label
1210   //
1211   void copy_16_bytes_backward_with_shift(Register end_from, Register end_to,
1212                      Register count, int count_dec,
1213                      Label& L_aligned_copy, Label& L_copy_bytes) {
1214     Label L_loop, L_copy_last_bytes;
1215 
1216     // if both arrays have the same alignment mod 8, do 8 bytes aligned copy
1217       __ andcc(end_from, 7, G1); // misaligned bytes
1218       __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
1219       __ delayed()->deccc(count, count_dec); // Pre-decrement 'count'
1220 
1221     const Register left_shift  = G1; // left  shift bit counter
1222     const Register right_shift = G5; // right shift bit counter
1223 
1224       __ sll(G1, LogBitsPerByte, left_shift);
1225       __ mov(64, right_shift);
1226       __ sub(right_shift, left_shift, right_shift);
1227 
1228     //
1229     // Load 2 aligned 8-byte chunks and use one from the previous iteration
1230     // to form 2 aligned 8-byte chunks to store.
1231     //
1232       __ andn(end_from, 7, end_from);     // Align address
1233       __ ldx(end_from, 0, O3);
1234       __ align(OptoLoopAlignment);
1235     __ BIND(L_loop);
1236       __ ldx(end_from, -8, O4);
1237       __ deccc(count, count_dec); // Can we do next iteration after this one?
1238       __ ldx(end_from, -16, G4);
1239       __ dec(end_to, 16);
1240       __ dec(end_from, 16);
1241       __ srlx(O3, right_shift, O3);
1242       __ sllx(O4, left_shift,  G3);
1243       __ bset(G3, O3);
1244       __ stx(O3, end_to, 8);
1245       __ srlx(O4, right_shift, O4);
1246       __ sllx(G4, left_shift,  G3);
1247       __ bset(G3, O4);
1248       __ stx(O4, end_to, 0);
1249       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
1250       __ delayed()->mov(G4, O3);
1251 
1252       __ inccc(count, count_dec>>1 ); // + 8 bytes
1253       __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
1254       __ delayed()->inc(count, count_dec>>1); // restore 'count'
1255 
1256       // copy 8 bytes, part of them already loaded in O3
1257       __ ldx(end_from, -8, O4);
1258       __ dec(end_to, 8);
1259       __ dec(end_from, 8);
1260       __ srlx(O3, right_shift, O3);
1261       __ sllx(O4, left_shift,  G3);
1262       __ bset(O3, G3);
1263       __ stx(G3, end_to, 0);
1264 
1265     __ BIND(L_copy_last_bytes);
1266       __ srl(left_shift, LogBitsPerByte, left_shift);    // misaligned bytes
1267       __ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
1268       __ delayed()->add(end_from, left_shift, end_from); // restore address
1269   }
1270 
1271   //
1272   //  Generate stub for disjoint byte copy.  If "aligned" is true, the
1273   //  "from" and "to" addresses are assumed to be heapword aligned.
1274   //
1275   // Arguments for generated stub:
1276   //      from:  O0
1277   //      to:    O1
1278   //      count: O2 treated as signed
1279   //
1280   address generate_disjoint_byte_copy(bool aligned, address *entry, const char *name) {
1281     __ align(CodeEntryAlignment);
1282     StubCodeMark mark(this, "StubRoutines", name);
1283     address start = __ pc();
1284 
1285     Label L_skip_alignment, L_align;
1286     Label L_copy_byte, L_copy_byte_loop, L_exit;
1287 
1288     const Register from      = O0;   // source array address
1289     const Register to        = O1;   // destination array address
1290     const Register count     = O2;   // elements count
1291     const Register offset    = O5;   // offset from start of arrays
1292     // O3, O4, G3, G4 are used as temp registers
1293 
1294     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
1295 
1296     if (entry != NULL) {
1297       *entry = __ pc();
1298       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1299       BLOCK_COMMENT("Entry:");
1300     }
1301 
1302     // for short arrays, just do single element copy
1303     __ cmp(count, 23); // 16 + 7
1304     __ brx(Assembler::less, false, Assembler::pn, L_copy_byte);
1305     __ delayed()->mov(G0, offset);
1306 
1307     if (aligned) {
1308       // 'aligned' == true when it is known statically during compilation
1309       // of this arraycopy call site that both 'from' and 'to' addresses
1310       // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
1311       //
1312       // Aligned arrays have 4-byte alignment in the 32-bit VM
1313       // and 8-byte alignment in the 64-bit VM, so we do this only in the 32-bit VM.
1314       //
1315 #ifndef _LP64
1316       // copy a 4-bytes word if necessary to align 'to' to 8 bytes
1317       __ andcc(to, 7, G0);
1318       __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment);
1319       __ delayed()->ld(from, 0, O3);
1320       __ inc(from, 4);
1321       __ inc(to, 4);
1322       __ dec(count, 4);
1323       __ st(O3, to, -4);
1324     __ BIND(L_skip_alignment);
1325 #endif
1326     } else {
1327       // copy bytes to align 'to' on 8 byte boundary
1328       __ andcc(to, 7, G1); // misaligned bytes
1329       __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1330       __ delayed()->neg(G1);
1331       __ inc(G1, 8);       // bytes needed to copy to reach 8-byte alignment
1332       __ sub(count, G1, count);
1333     __ BIND(L_align);
1334       __ ldub(from, 0, O3);
1335       __ deccc(G1);
1336       __ inc(from);
1337       __ stb(O3, to, 0);
1338       __ br(Assembler::notZero, false, Assembler::pt, L_align);
1339       __ delayed()->inc(to);
1340     __ BIND(L_skip_alignment);
1341     }
1342 #ifdef _LP64
1343     if (!aligned)
1344 #endif
1345     {
1346       // Copy with shift 16 bytes per iteration if arrays do not have
1347       // the same alignment mod 8, otherwise fall through to the next
1348       // code for aligned copy.
1349       // The compare above (count >= 23) guarantees 'count' >= 16 bytes.
1350       // Also jump over aligned copy after the copy with shift completed.
1351 
1352       copy_16_bytes_forward_with_shift(from, to, count, 16, L_copy_byte);
1353     }
1354 
1355     // Both arrays are 8-byte aligned; copy 16 bytes at a time
1356       __ and3(count, 7, G4); // Save count
1357       __ srl(count, 3, count);
1358      generate_disjoint_long_copy_core(aligned);
1359       __ mov(G4, count);     // Restore count
1360 
1361     // copy trailing bytes
1362     __ BIND(L_copy_byte);
1363       __ br_zero(count, L_exit);
1364       __ align(OptoLoopAlignment);
1365     __ BIND(L_copy_byte_loop);
1366       __ ldub(from, offset, O3);
1367       __ deccc(count);
1368       __ stb(O3, to, offset);
1369       __ brx(Assembler::notZero, false, Assembler::pt, L_copy_byte_loop);
1370       __ delayed()->inc(offset);
1371 
1372     __ BIND(L_exit);
1373       // O3, O4 are used as temp registers
1374       inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr, O3, O4);
1375       __ retl();
1376       __ delayed()->mov(G0, O0); // return 0
1377     return start;
1378   }
1379 
1380   //
1381   //  Generate stub for conjoint byte copy.  If "aligned" is true, the
1382   //  "from" and "to" addresses are assumed to be heapword aligned.
1383   //
1384   // Arguments for generated stub:
1385   //      from:  O0
1386   //      to:    O1
1387   //      count: O2 treated as signed
1388   //
1389   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1390                                       address *entry, const char *name) {
1391     // Do reverse copy.
1392 
1393     __ align(CodeEntryAlignment);
1394     StubCodeMark mark(this, "StubRoutines", name);
1395     address start = __ pc();
1396 
1397     Label L_skip_alignment, L_align, L_aligned_copy;
1398     Label L_copy_byte, L_copy_byte_loop, L_exit;
1399 
1400     const Register from      = O0;   // source array address
1401     const Register to        = O1;   // destination array address
1402     const Register count     = O2;   // elements count
1403     const Register end_from  = from; // source array end address
1404     const Register end_to    = to;   // destination array end address
1405 
1406     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
1407 
1408     if (entry != NULL) {
1409       *entry = __ pc();
1410       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1411       BLOCK_COMMENT("Entry:");
1412     }
1413 
1414     array_overlap_test(nooverlap_target, 0);
1415 
1416     __ add(to, count, end_to);       // offset after last copied element
1417 
1418     // for short arrays, just do single element copy
1419     __ cmp(count, 23); // 16 + 7
1420     __ brx(Assembler::less, false, Assembler::pn, L_copy_byte);
1421     __ delayed()->add(from, count, end_from);
1422 
1423     {
1424       // Align the ends of the arrays since they may not be aligned even
1425       // when the arrays themselves are aligned.
1426 
1427       // copy bytes to align 'end_to' on 8 byte boundary
1428       __ andcc(end_to, 7, G1); // misaligned bytes
1429       __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1430       __ delayed()->nop();
1431       __ sub(count, G1, count);
1432     __ BIND(L_align);
1433       __ dec(end_from);
1434       __ dec(end_to);
1435       __ ldub(end_from, 0, O3);
1436       __ deccc(G1);
1437       __ brx(Assembler::notZero, false, Assembler::pt, L_align);
1438       __ delayed()->stb(O3, end_to, 0);
1439     __ BIND(L_skip_alignment);
1440     }
1441 #ifdef _LP64
1442     if (aligned) {
1443       // Both arrays are aligned to 8-bytes in 64-bits VM.
1444       // The 'count' is decremented in copy_16_bytes_backward_with_shift()
1445       // in unaligned case.
1446       __ dec(count, 16);
1447     } else
1448 #endif
1449     {
      // Copy with shift 16 bytes per iteration if the arrays do not have
      // the same alignment mod 8, otherwise jump to the next
      // code for aligned copy (subtracting 16 from 'count' before the jump).
      // The compare above (count >= 23) guarantees 'count' >= 16 bytes.
      // Also jump over the aligned copy after the copy with shift is completed.
1455 
1456       copy_16_bytes_backward_with_shift(end_from, end_to, count, 16,
1457                                         L_aligned_copy, L_copy_byte);
1458     }
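    // Rough dispatch sketch of the helper call above (illustrative only,
    // not generated code): 'end_to' has just been aligned to 8 bytes, so
    //   if ((end_from & 7) == 0) { count -= 16; goto L_aligned_copy; }
    //   else { backward copy 16 bytes per iteration with shifts; goto L_copy_byte; }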
    // copy 16 bytes (16 elements) at a time
1460       __ align(OptoLoopAlignment);
1461     __ BIND(L_aligned_copy);
1462       __ dec(end_from, 16);
1463       __ ldx(end_from, 8, O3);
1464       __ ldx(end_from, 0, O4);
1465       __ dec(end_to, 16);
1466       __ deccc(count, 16);
1467       __ stx(O3, end_to, 8);
1468       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
1469       __ delayed()->stx(O4, end_to, 0);
1470       __ inc(count, 16);
1471 
    // copy 1 element (1 byte) at a time
1473     __ BIND(L_copy_byte);
1474       __ br_zero(count, L_exit);
1475       __ align(OptoLoopAlignment);
1476     __ BIND(L_copy_byte_loop);
1477       __ dec(end_from);
1478       __ dec(end_to);
1479       __ ldub(end_from, 0, O4);
1480       __ deccc(count);
1481       __ brx(Assembler::greater, false, Assembler::pt, L_copy_byte_loop);
1482       __ delayed()->stb(O4, end_to, 0);
1483 
1484     __ BIND(L_exit);
1485     // O3, O4 are used as temp registers
1486     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr, O3, O4);
1487     __ retl();
1488     __ delayed()->mov(G0, O0); // return 0
1489     return start;
1490   }
1491 
1492   //
1493   //  Generate stub for disjoint short copy.  If "aligned" is true, the
1494   //  "from" and "to" addresses are assumed to be heapword aligned.
1495   //
1496   // Arguments for generated stub:
1497   //      from:  O0
1498   //      to:    O1
1499   //      count: O2 treated as signed
1500   //
1501   address generate_disjoint_short_copy(bool aligned, address *entry, const char * name) {
1502     __ align(CodeEntryAlignment);
1503     StubCodeMark mark(this, "StubRoutines", name);
1504     address start = __ pc();
1505 
1506     Label L_skip_alignment, L_skip_alignment2;
1507     Label L_copy_2_bytes, L_copy_2_bytes_loop, L_exit;
1508 
1509     const Register from      = O0;   // source array address
1510     const Register to        = O1;   // destination array address
1511     const Register count     = O2;   // elements count
1512     const Register offset    = O5;   // offset from start of arrays
1513     // O3, O4, G3, G4 are used as temp registers
1514 
1515     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
1516 
1517     if (entry != NULL) {
1518       *entry = __ pc();
1519       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1520       BLOCK_COMMENT("Entry:");
1521     }
1522 
1523     // for short arrays, just do single element copy
1524     __ cmp(count, 11); // 8 + 3  (22 bytes)
1525     __ brx(Assembler::less, false, Assembler::pn, L_copy_2_bytes);
1526     __ delayed()->mov(G0, offset);
1527 
1528     if (aligned) {
1529       // 'aligned' == true when it is known statically during compilation
1530       // of this arraycopy call site that both 'from' and 'to' addresses
1531       // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
1532       //
      // Aligned arrays have 4-byte alignment in the 32-bit VM
      // and 8-byte alignment in the 64-bit VM.
1535       //
1536 #ifndef _LP64
      // copy a 2-element word if necessary to align 'to' to 8 bytes
1538       __ andcc(to, 7, G0);
1539       __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1540       __ delayed()->ld(from, 0, O3);
1541       __ inc(from, 4);
1542       __ inc(to, 4);
1543       __ dec(count, 2);
1544       __ st(O3, to, -4);
1545     __ BIND(L_skip_alignment);
1546 #endif
1547     } else {
      // copy 1 element if necessary to align 'to' on a 4-byte boundary
1549       __ andcc(to, 3, G0);
1550       __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1551       __ delayed()->lduh(from, 0, O3);
1552       __ inc(from, 2);
1553       __ inc(to, 2);
1554       __ dec(count);
1555       __ sth(O3, to, -2);
1556     __ BIND(L_skip_alignment);
1557 
1558       // copy 2 elements to align 'to' on an 8 byte boundary
1559       __ andcc(to, 7, G0);
1560       __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment2);
1561       __ delayed()->lduh(from, 0, O3);
1562       __ dec(count, 2);
1563       __ lduh(from, 2, O4);
1564       __ inc(from, 4);
1565       __ inc(to, 4);
1566       __ sth(O3, to, -4);
1567       __ sth(O4, to, -2);
1568     __ BIND(L_skip_alignment2);
1569     }
1570 #ifdef _LP64
1571     if (!aligned)
1572 #endif
1573     {
1574       // Copy with shift 16 bytes per iteration if arrays do not have
1575       // the same alignment mod 8, otherwise fall through to the next
1576       // code for aligned copy.
      // The compare above (count >= 11) guarantees 'count' >= 16 bytes.
      // Also jump over the aligned copy after the copy with shift is completed.
1579 
1580       copy_16_bytes_forward_with_shift(from, to, count, 8, L_copy_2_bytes);
1581     }
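    // Rough dispatch sketch of the helper call above (illustrative only,
    // not generated code): 'to' has been aligned to 8 bytes at this point, so
    //   if ((from & 7) == 0) { fall through to the 8-byte aligned copy below; }
    //   else { forward copy 16 bytes per iteration with shifts; goto L_copy_2_bytes; }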
1582 
    // Both arrays are 8-byte aligned, copy 16 bytes at a time
1584       __ and3(count, 3, G4); // Save
1585       __ srl(count, 2, count);
1586      generate_disjoint_long_copy_core(aligned);
1587       __ mov(G4, count); // restore
1588 
1589     // copy 1 element at a time
1590     __ BIND(L_copy_2_bytes);
1591       __ br_zero(count, L_exit);
1592       __ align(OptoLoopAlignment);
1593     __ BIND(L_copy_2_bytes_loop);
1594       __ lduh(from, offset, O3);
1595       __ deccc(count);
1596       __ sth(O3, to, offset);
1597       __ brx(Assembler::notZero, false, Assembler::pt, L_copy_2_bytes_loop);
1598       __ delayed()->inc(offset, 2);
1599 
1600     __ BIND(L_exit);
1601       // O3, O4 are used as temp registers
1602       inc_counter_np(SharedRuntime::_jshort_array_copy_ctr, O3, O4);
1603       __ retl();
1604       __ delayed()->mov(G0, O0); // return 0
1605     return start;
1606   }
1607 
1608   //
  //  Generate stub to fill an array with a given value; the element type
  //  (byte, short or int) is selected by 't'.  If "aligned" is true, the
  //  "to" address is assumed to be heapword aligned.
1611   //
1612   // Arguments for generated stub:
1613   //      to:    O0
1614   //      value: O1
1615   //      count: O2 treated as signed
1616   //
1617   address generate_fill(BasicType t, bool aligned, const char* name) {
1618     __ align(CodeEntryAlignment);
1619     StubCodeMark mark(this, "StubRoutines", name);
1620     address start = __ pc();
1621 
    const Register to        = O0;   // destination array address
1623     const Register value     = O1;   // fill value
1624     const Register count     = O2;   // elements count
1625     // O3 is used as a temp register
1626 
1627     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
1628 
1629     Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
1630     Label L_fill_2_bytes, L_fill_elements, L_fill_32_bytes;
1631 
1632     int shift = -1;
1633     switch (t) {
1634        case T_BYTE:
1635         shift = 2;
1636         break;
1637        case T_SHORT:
1638         shift = 1;
1639         break;
1640       case T_INT:
1641          shift = 0;
1642         break;
1643       default: ShouldNotReachHere();
1644     }
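    // Note: 'shift' is chosen so that (1 << shift) elements span 4 bytes and
    // (8 << shift) elements span 32 bytes; the count adjustments below are
    // expressed in those units.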
1645 
1646     BLOCK_COMMENT("Entry:");
1647 
1648     if (t == T_BYTE) {
1649       // Zero extend value
1650       __ and3(value, 0xff, value);
1651       __ sllx(value, 8, O3);
1652       __ or3(value, O3, value);
1653     }
1654     if (t == T_SHORT) {
1655       // Zero extend value
1656       __ sllx(value, 48, value);
1657       __ srlx(value, 48, value);
1658     }
1659     if (t == T_BYTE || t == T_SHORT) {
1660       __ sllx(value, 16, O3);
1661       __ or3(value, O3, value);
1662     }
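    // Replication sketch of the code above (C-like, illustrative only):
    //   T_BYTE:  v &= 0xff;   v |= v << 8;   // two identical bytes
    //   T_SHORT: v &= 0xffff;                // zero-extend the short
    //   both:    v |= v << 16;               // 32 bits of the fill pattern
    // The pattern is widened to 64 bits (v |= v << 32) further below, once
    // the destination has been aligned for 8-byte stores.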
1663 
1664     __ cmp(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
1665     __ brx(Assembler::lessUnsigned, false, Assembler::pn, L_fill_elements); // use unsigned cmp
1666     __ delayed()->andcc(count, 1, G0);
1667 
1668     if (!aligned && (t == T_BYTE || t == T_SHORT)) {
1669       // align source address at 4 bytes address boundary
1670       if (t == T_BYTE) {
1671         // One byte misalignment happens only for byte arrays
1672         __ andcc(to, 1, G0);
1673         __ br(Assembler::zero, false, Assembler::pt, L_skip_align1);
1674         __ delayed()->nop();
1675         __ stb(value, to, 0);
1676         __ inc(to, 1);
1677         __ dec(count, 1);
1678         __ BIND(L_skip_align1);
1679       }
1680       // Two bytes misalignment happens only for byte and short (char) arrays
1681       __ andcc(to, 2, G0);
1682       __ br(Assembler::zero, false, Assembler::pt, L_skip_align2);
1683       __ delayed()->nop();
1684       __ sth(value, to, 0);
1685       __ inc(to, 2);
1686       __ dec(count, 1 << (shift - 1));
1687       __ BIND(L_skip_align2);
1688     }
1689 #ifdef _LP64
1690     if (!aligned) {
1691 #endif
1692     // align to 8 bytes, we know we are 4 byte aligned to start
1693     __ andcc(to, 7, G0);
1694     __ br(Assembler::zero, false, Assembler::pt, L_fill_32_bytes);
1695     __ delayed()->nop();
1696     __ stw(value, to, 0);
1697     __ inc(to, 4);
1698     __ dec(count, 1 << shift);
1699     __ BIND(L_fill_32_bytes);
1700 #ifdef _LP64
1701     }
1702 #endif
1703 
1704     if (t == T_INT) {
1705       // Zero extend value
1706       __ srl(value, 0, value);
1707     }
1708     if (t == T_BYTE || t == T_SHORT || t == T_INT) {
1709       __ sllx(value, 32, O3);
1710       __ or3(value, O3, value);
1711     }
1712 
1713     Label L_check_fill_8_bytes;
1714     // Fill 32-byte chunks
1715     __ subcc(count, 8 << shift, count);
1716     __ brx(Assembler::less, false, Assembler::pt, L_check_fill_8_bytes);
1717     __ delayed()->nop();
1718 
1719     Label L_fill_32_bytes_loop, L_fill_4_bytes;
1720     __ align(16);
1721     __ BIND(L_fill_32_bytes_loop);
1722 
1723     __ stx(value, to, 0);
1724     __ stx(value, to, 8);
1725     __ stx(value, to, 16);
1726     __ stx(value, to, 24);
1727 
1728     __ subcc(count, 8 << shift, count);
1729     __ brx(Assembler::greaterEqual, false, Assembler::pt, L_fill_32_bytes_loop);
1730     __ delayed()->add(to, 32, to);
1731 
1732     __ BIND(L_check_fill_8_bytes);
1733     __ addcc(count, 8 << shift, count);
1734     __ brx(Assembler::zero, false, Assembler::pn, L_exit);
1735     __ delayed()->subcc(count, 1 << (shift + 1), count);
1736     __ brx(Assembler::less, false, Assembler::pn, L_fill_4_bytes);
1737     __ delayed()->andcc(count, 1<<shift, G0);
1738 
1739     //
1740     // length is too short, just fill 8 bytes at a time
1741     //
1742     Label L_fill_8_bytes_loop;
1743     __ BIND(L_fill_8_bytes_loop);
1744     __ stx(value, to, 0);
1745     __ subcc(count, 1 << (shift + 1), count);
1746     __ brx(Assembler::greaterEqual, false, Assembler::pn, L_fill_8_bytes_loop);
1747     __ delayed()->add(to, 8, to);
1748 
1749     // fill trailing 4 bytes
1750     __ andcc(count, 1<<shift, G0);  // in delay slot of branches
1751     if (t == T_INT) {
1752       __ BIND(L_fill_elements);
1753     }
1754     __ BIND(L_fill_4_bytes);
1755     __ brx(Assembler::zero, false, Assembler::pt, L_fill_2_bytes);
1756     if (t == T_BYTE || t == T_SHORT) {
1757       __ delayed()->andcc(count, 1<<(shift-1), G0);
1758     } else {
1759       __ delayed()->nop();
1760     }
1761     __ stw(value, to, 0);
1762     if (t == T_BYTE || t == T_SHORT) {
1763       __ inc(to, 4);
1764       // fill trailing 2 bytes
1765       __ andcc(count, 1<<(shift-1), G0); // in delay slot of branches
1766       __ BIND(L_fill_2_bytes);
1767       __ brx(Assembler::zero, false, Assembler::pt, L_fill_byte);
1768       __ delayed()->andcc(count, 1, count);
1769       __ sth(value, to, 0);
1770       if (t == T_BYTE) {
1771         __ inc(to, 2);
1772         // fill trailing byte
1773         __ andcc(count, 1, count);  // in delay slot of branches
1774         __ BIND(L_fill_byte);
1775         __ brx(Assembler::zero, false, Assembler::pt, L_exit);
1776         __ delayed()->nop();
1777         __ stb(value, to, 0);
1778       } else {
1779         __ BIND(L_fill_byte);
1780       }
1781     } else {
1782       __ BIND(L_fill_2_bytes);
1783     }
1784     __ BIND(L_exit);
1785     __ retl();
1786     __ delayed()->nop();
1787 
    // Handle fills of less than 8 bytes.  Int is handled elsewhere.
1789     if (t == T_BYTE) {
1790       __ BIND(L_fill_elements);
1791       Label L_fill_2, L_fill_4;
1792       // in delay slot __ andcc(count, 1, G0);
1793       __ brx(Assembler::zero, false, Assembler::pt, L_fill_2);
1794       __ delayed()->andcc(count, 2, G0);
1795       __ stb(value, to, 0);
1796       __ inc(to, 1);
1797       __ BIND(L_fill_2);
1798       __ brx(Assembler::zero, false, Assembler::pt, L_fill_4);
1799       __ delayed()->andcc(count, 4, G0);
1800       __ stb(value, to, 0);
1801       __ stb(value, to, 1);
1802       __ inc(to, 2);
1803       __ BIND(L_fill_4);
1804       __ brx(Assembler::zero, false, Assembler::pt, L_exit);
1805       __ delayed()->nop();
1806       __ stb(value, to, 0);
1807       __ stb(value, to, 1);
1808       __ stb(value, to, 2);
1809       __ retl();
1810       __ delayed()->stb(value, to, 3);
1811     }
1812 
1813     if (t == T_SHORT) {
1814       Label L_fill_2;
1815       __ BIND(L_fill_elements);
1816       // in delay slot __ andcc(count, 1, G0);
1817       __ brx(Assembler::zero, false, Assembler::pt, L_fill_2);
1818       __ delayed()->andcc(count, 2, G0);
1819       __ sth(value, to, 0);
1820       __ inc(to, 2);
1821       __ BIND(L_fill_2);
1822       __ brx(Assembler::zero, false, Assembler::pt, L_exit);
1823       __ delayed()->nop();
1824       __ sth(value, to, 0);
1825       __ retl();
1826       __ delayed()->sth(value, to, 2);
1827     }
1828     return start;
1829   }
1830 
1831   //
1832   //  Generate stub for conjoint short copy.  If "aligned" is true, the
1833   //  "from" and "to" addresses are assumed to be heapword aligned.
1834   //
1835   // Arguments for generated stub:
1836   //      from:  O0
1837   //      to:    O1
1838   //      count: O2 treated as signed
1839   //
1840   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1841                                        address *entry, const char *name) {
1842     // Do reverse copy.
1843 
1844     __ align(CodeEntryAlignment);
1845     StubCodeMark mark(this, "StubRoutines", name);
1846     address start = __ pc();
1847 
1848     Label L_skip_alignment, L_skip_alignment2, L_aligned_copy;
1849     Label L_copy_2_bytes, L_copy_2_bytes_loop, L_exit;
1850 
1851     const Register from      = O0;   // source array address
1852     const Register to        = O1;   // destination array address
1853     const Register count     = O2;   // elements count
1854     const Register end_from  = from; // source array end address
1855     const Register end_to    = to;   // destination array end address
1856 
1857     const Register byte_count = O3;  // bytes count to copy
1858 
1859     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
1860 
1861     if (entry != NULL) {
1862       *entry = __ pc();
1863       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1864       BLOCK_COMMENT("Entry:");
1865     }
1866 
1867     array_overlap_test(nooverlap_target, 1);
1868 
1869     __ sllx(count, LogBytesPerShort, byte_count);
1870     __ add(to, byte_count, end_to);  // offset after last copied element
1871 
1872     // for short arrays, just do single element copy
1873     __ cmp(count, 11); // 8 + 3  (22 bytes)
1874     __ brx(Assembler::less, false, Assembler::pn, L_copy_2_bytes);
1875     __ delayed()->add(from, byte_count, end_from);
1876 
1877     {
      // Align the ends of the arrays since they may not be aligned even
      // when the arrays themselves are heapword aligned.
1880 
      // copy 1 element if necessary to align 'end_to' on a 4-byte boundary
1882       __ andcc(end_to, 3, G0);
1883       __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1884       __ delayed()->lduh(end_from, -2, O3);
1885       __ dec(end_from, 2);
1886       __ dec(end_to, 2);
1887       __ dec(count);
1888       __ sth(O3, end_to, 0);
1889     __ BIND(L_skip_alignment);
1890 
1891       // copy 2 elements to align 'end_to' on an 8 byte boundary
1892       __ andcc(end_to, 7, G0);
1893       __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment2);
1894       __ delayed()->lduh(end_from, -2, O3);
1895       __ dec(count, 2);
1896       __ lduh(end_from, -4, O4);
1897       __ dec(end_from, 4);
1898       __ dec(end_to, 4);
1899       __ sth(O3, end_to, 2);
1900       __ sth(O4, end_to, 0);
1901     __ BIND(L_skip_alignment2);
1902     }
1903 #ifdef _LP64
1904     if (aligned) {
      // Both arrays are aligned to 8 bytes in the 64-bit VM.
      // The 'count' is decremented in copy_16_bytes_backward_with_shift()
      // in the unaligned case.
1908       __ dec(count, 8);
1909     } else
1910 #endif
1911     {
1912       // Copy with shift 16 bytes per iteration if arrays do not have
1913       // the same alignment mod 8, otherwise jump to the next
      // code for aligned copy (subtracting 8 from 'count' before the jump).
      // The compare above (count >= 11) guarantees 'count' >= 16 bytes.
      // Also jump over the aligned copy after the copy with shift is completed.
1917 
1918       copy_16_bytes_backward_with_shift(end_from, end_to, count, 8,
1919                                         L_aligned_copy, L_copy_2_bytes);
1920     }
    // copy 8 elements (16 bytes) at a time
1922       __ align(OptoLoopAlignment);
1923     __ BIND(L_aligned_copy);
1924       __ dec(end_from, 16);
1925       __ ldx(end_from, 8, O3);
1926       __ ldx(end_from, 0, O4);
1927       __ dec(end_to, 16);
1928       __ deccc(count, 8);
1929       __ stx(O3, end_to, 8);
1930       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
1931       __ delayed()->stx(O4, end_to, 0);
1932       __ inc(count, 8);
1933 
1934     // copy 1 element (2 bytes) at a time
1935     __ BIND(L_copy_2_bytes);
1936       __ br_zero(count, L_exit);
1937     __ BIND(L_copy_2_bytes_loop);
1938       __ dec(end_from, 2);
1939       __ dec(end_to, 2);
1940       __ lduh(end_from, 0, O4);
1941       __ deccc(count);
1942       __ brx(Assembler::greater, false, Assembler::pt, L_copy_2_bytes_loop);
1943       __ delayed()->sth(O4, end_to, 0);
1944 
1945     __ BIND(L_exit);
1946     // O3, O4 are used as temp registers
1947     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr, O3, O4);
1948     __ retl();
1949     __ delayed()->mov(G0, O0); // return 0
1950     return start;
1951   }
1952 
1953   //
1954   //  Generate core code for disjoint int copy (and oop copy on 32-bit).
1955   //  If "aligned" is true, the "from" and "to" addresses are assumed
1956   //  to be heapword aligned.
1957   //
1958   // Arguments:
1959   //      from:  O0
1960   //      to:    O1
1961   //      count: O2 treated as signed
1962   //
1963   void generate_disjoint_int_copy_core(bool aligned) {
1964 
1965     Label L_skip_alignment, L_aligned_copy;
1966     Label L_copy_16_bytes,  L_copy_4_bytes, L_copy_4_bytes_loop, L_exit;
1967 
1968     const Register from      = O0;   // source array address
1969     const Register to        = O1;   // destination array address
1970     const Register count     = O2;   // elements count
1971     const Register offset    = O5;   // offset from start of arrays
1972     // O3, O4, G3, G4 are used as temp registers
1973 
1974     // 'aligned' == true when it is known statically during compilation
1975     // of this arraycopy call site that both 'from' and 'to' addresses
1976     // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
1977     //
    // Aligned arrays have 4-byte alignment in the 32-bit VM
    // and 8-byte alignment in the 64-bit VM.
1980     //
1981 #ifdef _LP64
1982     if (!aligned)
1983 #endif
1984     {
      // The next check could be put under 'ifndef' since the code in
      // generate_disjoint_long_copy_core() has its own checks and sets 'offset'.
1987 
1988       // for short arrays, just do single element copy
1989       __ cmp(count, 5); // 4 + 1 (20 bytes)
1990       __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_4_bytes);
1991       __ delayed()->mov(G0, offset);
1992 
1993       // copy 1 element to align 'to' on an 8 byte boundary
1994       __ andcc(to, 7, G0);
1995       __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1996       __ delayed()->ld(from, 0, O3);
1997       __ inc(from, 4);
1998       __ inc(to, 4);
1999       __ dec(count);
2000       __ st(O3, to, -4);
2001     __ BIND(L_skip_alignment);
2002 
    // if arrays have the same alignment mod 8, do a 4-element copy
2004       __ andcc(from, 7, G0);
2005       __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
2006       __ delayed()->ld(from, 0, O3);
2007 
2008     //
    // Load 2 aligned 8-byte chunks and use one from the previous iteration
    // to form 2 aligned 8-byte chunks to store.
    //
    // copy_16_bytes_forward_with_shift() is not used here since this
    // code is better tuned for this case.
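    // Illustrative sketch in 32-bit words (big-endian, not generated code):
    //   w0 = word carried from the previous pass (initially the word at 'from');
    //   d0 = 64-bit load at from +  4;             // = w1:w2
    //   d1 = 64-bit load at from + 12;             // = w3:w4
    //   store (w0 << 32) | (d0 >> 32) at to + 0;   // = w0:w1
    //   store (d0 << 32) | (d1 >> 32) at to + 8;   // = w2:w3
    //   carry d1 (its low word w4 becomes the next w0); from += 16; to += 16;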
2014 
2015     // copy with shift 4 elements (16 bytes) at a time
      __ dec(count, 4);   // The cmp at the beginning guarantees count >= 4
2017 
2018       __ align(OptoLoopAlignment);
2019     __ BIND(L_copy_16_bytes);
2020       __ ldx(from, 4, O4);
2021       __ deccc(count, 4); // Can we do next iteration after this one?
2022       __ ldx(from, 12, G4);
2023       __ inc(to, 16);
2024       __ inc(from, 16);
2025       __ sllx(O3, 32, O3);
2026       __ srlx(O4, 32, G3);
2027       __ bset(G3, O3);
2028       __ stx(O3, to, -16);
2029       __ sllx(O4, 32, O4);
2030       __ srlx(G4, 32, G3);
2031       __ bset(G3, O4);
2032       __ stx(O4, to, -8);
2033       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes);
2034       __ delayed()->mov(G4, O3);
2035 
2036       __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes);
2037       __ delayed()->inc(count, 4); // restore 'count'
2038 
2039     __ BIND(L_aligned_copy);
2040     }
2041     // copy 4 elements (16 bytes) at a time
2042       __ and3(count, 1, G4); // Save
2043       __ srl(count, 1, count);
2044      generate_disjoint_long_copy_core(aligned);
2045       __ mov(G4, count);     // Restore
2046 
2047     // copy 1 element at a time
2048     __ BIND(L_copy_4_bytes);
2049       __ br_zero(count, L_exit);
2050     __ BIND(L_copy_4_bytes_loop);
2051       __ ld(from, offset, O3);
2052       __ deccc(count);
2053       __ st(O3, to, offset);
2054       __ brx(Assembler::notZero, false, Assembler::pt, L_copy_4_bytes_loop);
2055       __ delayed()->inc(offset, 4);
2056     __ BIND(L_exit);
2057   }
2058 
2059   //
2060   //  Generate stub for disjoint int copy.  If "aligned" is true, the
2061   //  "from" and "to" addresses are assumed to be heapword aligned.
2062   //
2063   // Arguments for generated stub:
2064   //      from:  O0
2065   //      to:    O1
2066   //      count: O2 treated as signed
2067   //
2068   address generate_disjoint_int_copy(bool aligned, address *entry, const char *name) {
2069     __ align(CodeEntryAlignment);
2070     StubCodeMark mark(this, "StubRoutines", name);
2071     address start = __ pc();
2072 
2073     const Register count = O2;
2074     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
2075 
2076     if (entry != NULL) {
2077       *entry = __ pc();
2078       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2079       BLOCK_COMMENT("Entry:");
2080     }
2081 
2082     generate_disjoint_int_copy_core(aligned);
2083 
2084     // O3, O4 are used as temp registers
2085     inc_counter_np(SharedRuntime::_jint_array_copy_ctr, O3, O4);
2086     __ retl();
2087     __ delayed()->mov(G0, O0); // return 0
2088     return start;
2089   }
2090 
2091   //
2092   //  Generate core code for conjoint int copy (and oop copy on 32-bit).
2093   //  If "aligned" is true, the "from" and "to" addresses are assumed
2094   //  to be heapword aligned.
2095   //
2096   // Arguments:
2097   //      from:  O0
2098   //      to:    O1
2099   //      count: O2 treated as signed
2100   //
2101   void generate_conjoint_int_copy_core(bool aligned) {
2102     // Do reverse copy.
2103 
2104     Label L_skip_alignment, L_aligned_copy;
2105     Label L_copy_16_bytes,  L_copy_4_bytes, L_copy_4_bytes_loop, L_exit;
2106 
2107     const Register from      = O0;   // source array address
2108     const Register to        = O1;   // destination array address
2109     const Register count     = O2;   // elements count
2110     const Register end_from  = from; // source array end address
2111     const Register end_to    = to;   // destination array end address
2112     // O3, O4, O5, G3 are used as temp registers
2113 
2114     const Register byte_count = O3;  // bytes count to copy
2115 
2116       __ sllx(count, LogBytesPerInt, byte_count);
2117       __ add(to, byte_count, end_to); // offset after last copied element
2118 
2119       __ cmp(count, 5); // for short arrays, just do single element copy
2120       __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_4_bytes);
2121       __ delayed()->add(from, byte_count, end_from);
2122 
2123     // copy 1 element to align 'to' on an 8 byte boundary
2124       __ andcc(end_to, 7, G0);
2125       __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
2126       __ delayed()->nop();
2127       __ dec(count);
2128       __ dec(end_from, 4);
2129       __ dec(end_to,   4);
2130       __ ld(end_from, 0, O4);
2131       __ st(O4, end_to, 0);
2132     __ BIND(L_skip_alignment);
2133 
    // Check if 'end_from' and 'end_to' have the same alignment.
2135       __ andcc(end_from, 7, G0);
2136       __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
      __ delayed()->dec(count, 4); // The cmp at the start guarantees cnt >= 4
2138 
2139     // copy with shift 4 elements (16 bytes) at a time
2140     //
    // Load 2 aligned 8-byte chunks and use one from the previous iteration
    // to form 2 aligned 8-byte chunks to store.
2143     //
2144       __ ldx(end_from, -4, O3);
2145       __ align(OptoLoopAlignment);
2146     __ BIND(L_copy_16_bytes);
2147       __ ldx(end_from, -12, O4);
2148       __ deccc(count, 4);
2149       __ ldx(end_from, -20, O5);
2150       __ dec(end_to, 16);
2151       __ dec(end_from, 16);
2152       __ srlx(O3, 32, O3);
2153       __ sllx(O4, 32, G3);
2154       __ bset(G3, O3);
2155       __ stx(O3, end_to, 8);
2156       __ srlx(O4, 32, O4);
2157       __ sllx(O5, 32, G3);
2158       __ bset(O4, G3);
2159       __ stx(G3, end_to, 0);
2160       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes);
2161       __ delayed()->mov(O5, O3);
2162 
2163       __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes);
2164       __ delayed()->inc(count, 4);
2165 
2166     // copy 4 elements (16 bytes) at a time
2167       __ align(OptoLoopAlignment);
2168     __ BIND(L_aligned_copy);
2169       __ dec(end_from, 16);
2170       __ ldx(end_from, 8, O3);
2171       __ ldx(end_from, 0, O4);
2172       __ dec(end_to, 16);
2173       __ deccc(count, 4);
2174       __ stx(O3, end_to, 8);
2175       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
2176       __ delayed()->stx(O4, end_to, 0);
2177       __ inc(count, 4);
2178 
2179     // copy 1 element (4 bytes) at a time
2180     __ BIND(L_copy_4_bytes);
2181       __ br_zero(count, L_exit);
2182     __ BIND(L_copy_4_bytes_loop);
2183       __ dec(end_from, 4);
2184       __ dec(end_to, 4);
2185       __ ld(end_from, 0, O4);
2186       __ deccc(count);
2187       __ brx(Assembler::greater, false, Assembler::pt, L_copy_4_bytes_loop);
2188       __ delayed()->st(O4, end_to, 0);
2189     __ BIND(L_exit);
2190   }
2191 
2192   //
2193   //  Generate stub for conjoint int copy.  If "aligned" is true, the
2194   //  "from" and "to" addresses are assumed to be heapword aligned.
2195   //
2196   // Arguments for generated stub:
2197   //      from:  O0
2198   //      to:    O1
2199   //      count: O2 treated as signed
2200   //
2201   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
2202                                      address *entry, const char *name) {
2203     __ align(CodeEntryAlignment);
2204     StubCodeMark mark(this, "StubRoutines", name);
2205     address start = __ pc();
2206 
2207     assert_clean_int(O2, O3);     // Make sure 'count' is clean int.
2208 
2209     if (entry != NULL) {
2210       *entry = __ pc();
2211       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2212       BLOCK_COMMENT("Entry:");
2213     }
2214 
2215     array_overlap_test(nooverlap_target, 2);
2216 
2217     generate_conjoint_int_copy_core(aligned);
2218 
2219     // O3, O4 are used as temp registers
2220     inc_counter_np(SharedRuntime::_jint_array_copy_ctr, O3, O4);
2221     __ retl();
2222     __ delayed()->mov(G0, O0); // return 0
2223     return start;
2224   }
2225 
2226   //
2227   //  Generate core code for disjoint long copy (and oop copy on 64-bit).
2228   //  "aligned" is ignored, because we must make the stronger
2229   //  assumption that both addresses are always 64-bit aligned.
2230   //
2231   // Arguments:
2232   //      from:  O0
2233   //      to:    O1
2234   //      count: O2 treated as signed
2235   //
2236   // count -= 2;
2237   // if ( count >= 0 ) { // >= 2 elements
2238   //   if ( count > 6) { // >= 8 elements
2239   //     count -= 6; // original count - 8
2240   //     do {
2241   //       copy_8_elements;
2242   //       count -= 8;
2243   //     } while ( count >= 0 );
2244   //     count += 6;
2245   //   }
2246   //   if ( count >= 0 ) { // >= 2 elements
2247   //     do {
2248   //       copy_2_elements;
2249   //     } while ( (count=count-2) >= 0 );
2250   //   }
2251   // }
2252   // count += 2;
2253   // if ( count != 0 ) { // 1 element left
2254   //   copy_1_element;
2255   // }
2256   //
2257   void generate_disjoint_long_copy_core(bool aligned) {
2258     Label L_copy_8_bytes, L_copy_16_bytes, L_exit;
2259     const Register from    = O0;  // source array address
2260     const Register to      = O1;  // destination array address
2261     const Register count   = O2;  // elements count
2262     const Register offset0 = O4;  // element offset
2263     const Register offset8 = O5;  // next element offset
2264 
2265       __ deccc(count, 2);
2266       __ mov(G0, offset0);   // offset from start of arrays (0)
2267       __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
2268       __ delayed()->add(offset0, 8, offset8);
2269 
    // Copy by 64-byte chunks
2271     Label L_copy_64_bytes;
2272     const Register from64 = O3;  // source address
2273     const Register to64   = G3;  // destination address
2274       __ subcc(count, 6, O3);
2275       __ brx(Assembler::negative, false, Assembler::pt, L_copy_16_bytes );
2276       __ delayed()->mov(to,   to64);
2277       // Now we can use O4(offset0), O5(offset8) as temps
2278       __ mov(O3, count);
2279       __ mov(from, from64);
2280 
2281       __ align(OptoLoopAlignment);
2282     __ BIND(L_copy_64_bytes);
2283       for( int off = 0; off < 64; off += 16 ) {
2284         __ ldx(from64,  off+0, O4);
2285         __ ldx(from64,  off+8, O5);
2286         __ stx(O4, to64,  off+0);
2287         __ stx(O5, to64,  off+8);
2288       }
2289       __ deccc(count, 8);
2290       __ inc(from64, 64);
2291       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_64_bytes);
2292       __ delayed()->inc(to64, 64);
2293 
2294       // Restore O4(offset0), O5(offset8)
2295       __ sub(from64, from, offset0);
2296       __ inccc(count, 6);
2297       __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
2298       __ delayed()->add(offset0, 8, offset8);
2299 
      // Copy by 16-byte chunks
2301       __ align(OptoLoopAlignment);
2302     __ BIND(L_copy_16_bytes);
2303       __ ldx(from, offset0, O3);
2304       __ ldx(from, offset8, G3);
2305       __ deccc(count, 2);
2306       __ stx(O3, to, offset0);
2307       __ inc(offset0, 16);
2308       __ stx(G3, to, offset8);
2309       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes);
2310       __ delayed()->inc(offset8, 16);
2311 
2312       // Copy last 8 bytes
2313     __ BIND(L_copy_8_bytes);
2314       __ inccc(count, 2);
2315       __ brx(Assembler::zero, true, Assembler::pn, L_exit );
2316       __ delayed()->mov(offset0, offset8); // Set O5 used by other stubs
2317       __ ldx(from, offset0, O3);
2318       __ stx(O3, to, offset0);
2319     __ BIND(L_exit);
2320   }
2321 
2322   //
2323   //  Generate stub for disjoint long copy.
2324   //  "aligned" is ignored, because we must make the stronger
2325   //  assumption that both addresses are always 64-bit aligned.
2326   //
2327   // Arguments for generated stub:
2328   //      from:  O0
2329   //      to:    O1
2330   //      count: O2 treated as signed
2331   //
2332   address generate_disjoint_long_copy(bool aligned, address *entry, const char *name) {
2333     __ align(CodeEntryAlignment);
2334     StubCodeMark mark(this, "StubRoutines", name);
2335     address start = __ pc();
2336 
2337     assert_clean_int(O2, O3);     // Make sure 'count' is clean int.
2338 
2339     if (entry != NULL) {
2340       *entry = __ pc();
2341       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2342       BLOCK_COMMENT("Entry:");
2343     }
2344 
2345     generate_disjoint_long_copy_core(aligned);
2346 
2347     // O3, O4 are used as temp registers
2348     inc_counter_np(SharedRuntime::_jlong_array_copy_ctr, O3, O4);
2349     __ retl();
2350     __ delayed()->mov(G0, O0); // return 0
2351     return start;
2352   }
2353 
2354   //
2355   //  Generate core code for conjoint long copy (and oop copy on 64-bit).
2356   //  "aligned" is ignored, because we must make the stronger
2357   //  assumption that both addresses are always 64-bit aligned.
2358   //
2359   // Arguments:
2360   //      from:  O0
2361   //      to:    O1
2362   //      count: O2 treated as signed
2363   //
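  // Loop-structure sketch (in the style of the disjoint pseudocode above,
  // illustrative only):
  //
  // count -= 1;
  // if ( count > 0 ) { // >= 2 elements
  //   offset8 = count * 8;      // offset of the last element
  //   offset0 = offset8 - 8;
  //   do {                      // copy 2 elements per iteration, backward
  //     to[offset8] = from[offset8];
  //     to[offset0] = from[offset0];
  //     offset8 -= 16;  offset0 -= 16;
  //   } while ( offset8 > 0 );
  // }
  // if ( the last decrement did not go negative ) { // element 0 still uncopied
  //   to[0] = from[0];
  // }
  //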
2364   void generate_conjoint_long_copy_core(bool aligned) {
2365     // Do reverse copy.
2366     Label L_copy_8_bytes, L_copy_16_bytes, L_exit;
2367     const Register from    = O0;  // source array address
2368     const Register to      = O1;  // destination array address
2369     const Register count   = O2;  // elements count
2370     const Register offset8 = O4;  // element offset
2371     const Register offset0 = O5;  // previous element offset
2372 
2373       __ subcc(count, 1, count);
2374       __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_8_bytes );
2375       __ delayed()->sllx(count, LogBytesPerLong, offset8);
2376       __ sub(offset8, 8, offset0);
2377       __ align(OptoLoopAlignment);
2378     __ BIND(L_copy_16_bytes);
2379       __ ldx(from, offset8, O2);
2380       __ ldx(from, offset0, O3);
2381       __ stx(O2, to, offset8);
2382       __ deccc(offset8, 16);      // use offset8 as counter
2383       __ stx(O3, to, offset0);
2384       __ brx(Assembler::greater, false, Assembler::pt, L_copy_16_bytes);
2385       __ delayed()->dec(offset0, 16);
2386 
2387     __ BIND(L_copy_8_bytes);
2388       __ brx(Assembler::negative, false, Assembler::pn, L_exit );
2389       __ delayed()->nop();
2390       __ ldx(from, 0, O3);
2391       __ stx(O3, to, 0);
2392     __ BIND(L_exit);
2393   }
2394 
2395   //  Generate stub for conjoint long copy.
2396   //  "aligned" is ignored, because we must make the stronger
2397   //  assumption that both addresses are always 64-bit aligned.
2398   //
2399   // Arguments for generated stub:
2400   //      from:  O0
2401   //      to:    O1
2402   //      count: O2 treated as signed
2403   //
2404   address generate_conjoint_long_copy(bool aligned, address nooverlap_target,
2405                                       address *entry, const char *name) {
2406     __ align(CodeEntryAlignment);
2407     StubCodeMark mark(this, "StubRoutines", name);
2408     address start = __ pc();
2409 
2410     assert(aligned, "Should always be aligned");
2411 
2412     assert_clean_int(O2, O3);     // Make sure 'count' is clean int.
2413 
2414     if (entry != NULL) {
2415       *entry = __ pc();
2416       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2417       BLOCK_COMMENT("Entry:");
2418     }
2419 
2420     array_overlap_test(nooverlap_target, 3);
2421 
2422     generate_conjoint_long_copy_core(aligned);
2423 
2424     // O3, O4 are used as temp registers
2425     inc_counter_np(SharedRuntime::_jlong_array_copy_ctr, O3, O4);
2426     __ retl();
2427     __ delayed()->mov(G0, O0); // return 0
2428     return start;
2429   }
2430 
2431   //  Generate stub for disjoint oop copy.  If "aligned" is true, the
2432   //  "from" and "to" addresses are assumed to be heapword aligned.
2433   //
2434   // Arguments for generated stub:
2435   //      from:  O0
2436   //      to:    O1
2437   //      count: O2 treated as signed
2438   //
2439   address generate_disjoint_oop_copy(bool aligned, address *entry, const char *name,
2440                                      bool dest_uninitialized = false) {
2441 
2442     const Register from  = O0;  // source array address
2443     const Register to    = O1;  // destination array address
2444     const Register count = O2;  // elements count
2445 
2446     __ align(CodeEntryAlignment);
2447     StubCodeMark mark(this, "StubRoutines", name);
2448     address start = __ pc();
2449 
2450     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
2451 
2452     if (entry != NULL) {
2453       *entry = __ pc();
2454       // caller can pass a 64-bit byte count here
2455       BLOCK_COMMENT("Entry:");
2456     }
2457 
2458     // save arguments for barrier generation
2459     __ mov(to, G1);
2460     __ mov(count, G5);
2461     gen_write_ref_array_pre_barrier(G1, G5, dest_uninitialized);
2462   #ifdef _LP64
2463     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
2464     if (UseCompressedOops) {
2465       generate_disjoint_int_copy_core(aligned);
2466     } else {
2467       generate_disjoint_long_copy_core(aligned);
2468     }
2469   #else
2470     generate_disjoint_int_copy_core(aligned);
2471   #endif
2472     // O0 is used as temp register
2473     gen_write_ref_array_post_barrier(G1, G5, O0);
2474 
2475     // O3, O4 are used as temp registers
2476     inc_counter_np(SharedRuntime::_oop_array_copy_ctr, O3, O4);
2477     __ retl();
2478     __ delayed()->mov(G0, O0); // return 0
2479     return start;
2480   }
2481 
2482   //  Generate stub for conjoint oop copy.  If "aligned" is true, the
2483   //  "from" and "to" addresses are assumed to be heapword aligned.
2484   //
2485   // Arguments for generated stub:
2486   //      from:  O0
2487   //      to:    O1
2488   //      count: O2 treated as signed
2489   //
2490   address generate_conjoint_oop_copy(bool aligned, address nooverlap_target,
2491                                      address *entry, const char *name,
2492                                      bool dest_uninitialized = false) {
2493 
2494     const Register from  = O0;  // source array address
2495     const Register to    = O1;  // destination array address
2496     const Register count = O2;  // elements count
2497 
2498     __ align(CodeEntryAlignment);
2499     StubCodeMark mark(this, "StubRoutines", name);
2500     address start = __ pc();
2501 
2502     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
2503 
2504     if (entry != NULL) {
2505       *entry = __ pc();
2506       // caller can pass a 64-bit byte count here
2507       BLOCK_COMMENT("Entry:");
2508     }
2509 
2510     array_overlap_test(nooverlap_target, LogBytesPerHeapOop);
2511 
2512     // save arguments for barrier generation
2513     __ mov(to, G1);
2514     __ mov(count, G5);
2515     gen_write_ref_array_pre_barrier(G1, G5, dest_uninitialized);
2516 
2517   #ifdef _LP64
2518     if (UseCompressedOops) {
2519       generate_conjoint_int_copy_core(aligned);
2520     } else {
2521       generate_conjoint_long_copy_core(aligned);
2522     }
2523   #else
2524     generate_conjoint_int_copy_core(aligned);
2525   #endif
2526 
2527     // O0 is used as temp register
2528     gen_write_ref_array_post_barrier(G1, G5, O0);
2529 
2530     // O3, O4 are used as temp registers
2531     inc_counter_np(SharedRuntime::_oop_array_copy_ctr, O3, O4);
2532     __ retl();
2533     __ delayed()->mov(G0, O0); // return 0
2534     return start;
2535   }
2536 
2537 
2538   // Helper for generating a dynamic type check.
2539   // Smashes only the given temp registers.
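  // Control-flow sketch (illustrative only): the fast path compares the word at
  // sub_klass + super_check_offset with super_klass and branches to L_success on
  // a hit; a definite miss falls through to L_miss, while the ambiguous
  // (secondary supers) case drops into the slow path below, which saves a frame
  // and scans the secondary supers list.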
2540   void generate_type_check(Register sub_klass,
2541                            Register super_check_offset,
2542                            Register super_klass,
2543                            Register temp,
2544                            Label& L_success) {
2545     assert_different_registers(sub_klass, super_check_offset, super_klass, temp);
2546 
2547     BLOCK_COMMENT("type_check:");
2548 
2549     Label L_miss, L_pop_to_miss;
2550 
2551     assert_clean_int(super_check_offset, temp);
2552 
2553     __ check_klass_subtype_fast_path(sub_klass, super_klass, temp, noreg,
2554                                      &L_success, &L_miss, NULL,
2555                                      super_check_offset);
2556 
2557     BLOCK_COMMENT("type_check_slow_path:");
2558     __ save_frame(0);
2559     __ check_klass_subtype_slow_path(sub_klass->after_save(),
2560                                      super_klass->after_save(),
2561                                      L0, L1, L2, L4,
2562                                      NULL, &L_pop_to_miss);
2563     __ ba(L_success, false);
2564     __ delayed()->restore();
2565 
2566     __ bind(L_pop_to_miss);
2567     __ restore();
2568 
2569     // Fall through on failure!
2570     __ BIND(L_miss);
2571   }
2572 
2573 
2574   //  Generate stub for checked oop copy.
2575   //
2576   // Arguments for generated stub:
2577   //      from:  O0
2578   //      to:    O1
2579   //      count: O2 treated as signed
2580   //      ckoff: O3 (super_check_offset)
2581   //      ckval: O4 (super_klass)
2582   //      ret:   O0 zero for success; (-1^K) where K is partial transfer count
2583   //
2584   address generate_checkcast_copy(const char *name, address *entry, bool dest_uninitialized = false) {
2585 
2586     const Register O0_from   = O0;      // source array address
2587     const Register O1_to     = O1;      // destination array address
2588     const Register O2_count  = O2;      // elements count
2589     const Register O3_ckoff  = O3;      // super_check_offset
2590     const Register O4_ckval  = O4;      // super_klass
2591 
2592     const Register O5_offset = O5;      // loop var, with stride wordSize
2593     const Register G1_remain = G1;      // loop var, with stride -1
2594     const Register G3_oop    = G3;      // actual oop copied
2595     const Register G4_klass  = G4;      // oop._klass
2596     const Register G5_super  = G5;      // oop._klass._primary_supers[ckval]
2597 
2598     __ align(CodeEntryAlignment);
2599     StubCodeMark mark(this, "StubRoutines", name);
2600     address start = __ pc();
2601 
2602 #ifdef ASSERT
2603     // We sometimes save a frame (see generate_type_check below).
2604     // If this will cause trouble, let's fail now instead of later.
2605     __ save_frame(0);
2606     __ restore();
2607 #endif
2608 
2609     assert_clean_int(O2_count, G1);     // Make sure 'count' is clean int.
2610 
2611 #ifdef ASSERT
2612     // caller guarantees that the arrays really are different
2613     // otherwise, we would have to make conjoint checks
2614     { Label L;
2615       __ mov(O3, G1);           // spill: overlap test smashes O3
2616       __ mov(O4, G4);           // spill: overlap test smashes O4
2617       array_overlap_test(L, LogBytesPerHeapOop);
2618       __ stop("checkcast_copy within a single array");
2619       __ bind(L);
2620       __ mov(G1, O3);
2621       __ mov(G4, O4);
2622     }
2623 #endif //ASSERT
2624 
2625     if (entry != NULL) {
2626       *entry = __ pc();
2627       // caller can pass a 64-bit byte count here (from generic stub)
2628       BLOCK_COMMENT("Entry:");
2629     }
2630     gen_write_ref_array_pre_barrier(O1_to, O2_count, dest_uninitialized);
2631 
2632     Label load_element, store_element, do_card_marks, fail, done;
2633     __ addcc(O2_count, 0, G1_remain);   // initialize loop index, and test it
2634     __ brx(Assembler::notZero, false, Assembler::pt, load_element);
2635     __ delayed()->mov(G0, O5_offset);   // offset from start of arrays
2636 
2637     // Empty array:  Nothing to do.
2638     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, O3, O4);
2639     __ retl();
2640     __ delayed()->set(0, O0);           // return 0 on (trivial) success
2641 
2642     // ======== begin loop ========
2643     // (Loop is rotated; its entry is load_element.)
2644     // Loop variables:
2645     //   (O5 = 0; ; O5 += wordSize) --- offset from src, dest arrays
2646     //   (O2 = len; O2 != 0; O2--) --- number of oops *remaining*
2647     //   G3, G4, G5 --- current oop, oop.klass, oop.klass.super
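    // Loop sketch (C-like, illustrative only):
    //   for (remain = count, off = 0; remain != 0; remain--, off += heapOopSize) {
    //     oop el = *(from + off);
    //     if (el != NULL && !is_subtype(el->klass(), ckval)) goto fail;
    //     *(to + off) = el;
    //   }
    //   return 0;                      // everything copied
    // fail:
    //   card-mark the copied prefix and return ~ncopied (== -1 ^ ncopied)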
2648     __ align(OptoLoopAlignment);
2649 
2650     __ BIND(store_element);
2651     __ deccc(G1_remain);                // decrement the count
2652     __ store_heap_oop(G3_oop, O1_to, O5_offset); // store the oop
2653     __ inc(O5_offset, heapOopSize);     // step to next offset
2654     __ brx(Assembler::zero, true, Assembler::pt, do_card_marks);
    __ delayed()->set(0, O0);           // return 0 on success
2656 
2657     // ======== loop entry is here ========
2658     __ BIND(load_element);
2659     __ load_heap_oop(O0_from, O5_offset, G3_oop);  // load the oop
2660     __ br_null(G3_oop, true, Assembler::pt, store_element);
2661 
2662     __ load_klass(G3_oop, G4_klass); // query the object klass
2663 
2664     generate_type_check(G4_klass, O3_ckoff, O4_ckval, G5_super,
2665                         // branch to this on success:
2666                         store_element);
2667     // ======== end loop ========
2668 
2669     // It was a real error; we must depend on the caller to finish the job.
2670     // Register G1 has number of *remaining* oops, O2 number of *total* oops.
2671     // Emit GC store barriers for the oops we have copied (O2 minus G1),
2672     // and report their number to the caller.
2673     __ BIND(fail);
2674     __ subcc(O2_count, G1_remain, O2_count);
2675     __ brx(Assembler::zero, false, Assembler::pt, done);
2676     __ delayed()->not1(O2_count, O0);   // report (-1^K) to caller
2677 
2678     __ BIND(do_card_marks);
2679     gen_write_ref_array_post_barrier(O1_to, O2_count, O3);   // store check on O1[0..O2]
2680 
2681     __ BIND(done);
2682     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, O3, O4);
2683     __ retl();
    __ delayed()->nop();             // return value in O0
2685 
2686     return start;
2687   }
2688 
2689 
2690   //  Generate 'unsafe' array copy stub
2691   //  Though just as safe as the other stubs, it takes an unscaled
2692   //  size_t argument instead of an element count.
2693   //
2694   // Arguments for generated stub:
2695   //      from:  O0
2696   //      to:    O1
2697   //      count: O2 byte count, treated as ssize_t, can be zero
2698   //
2699   // Examines the alignment of the operands and dispatches
2700   // to a long, int, short, or byte copy loop.
2701   //
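  // Dispatch sketch (C-like, illustrative only):
  //   bits = (uintptr_t)from | (uintptr_t)to | (uintptr_t)count;
  //   if ((bits & (BytesPerLong  - 1)) == 0) { count >>= LogBytesPerLong;  goto long_copy;  }
  //   if ((bits & (BytesPerInt   - 1)) == 0) { count >>= LogBytesPerInt;   goto int_copy;   }
  //   if ((bits & (BytesPerShort - 1)) == 0) { count >>= LogBytesPerShort; goto short_copy; }
  //   goto byte_copy;
  //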
2702   address generate_unsafe_copy(const char* name,
2703                                address byte_copy_entry,
2704                                address short_copy_entry,
2705                                address int_copy_entry,
2706                                address long_copy_entry) {
2707 
2708     const Register O0_from   = O0;      // source array address
2709     const Register O1_to     = O1;      // destination array address
2710     const Register O2_count  = O2;      // elements count
2711 
2712     const Register G1_bits   = G1;      // test copy of low bits
2713 
2714     __ align(CodeEntryAlignment);
2715     StubCodeMark mark(this, "StubRoutines", name);
2716     address start = __ pc();
2717 
2718     // bump this on entry, not on exit:
2719     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr, G1, G3);
2720 
2721     __ or3(O0_from, O1_to, G1_bits);
2722     __ or3(O2_count,       G1_bits, G1_bits);
2723 
2724     __ btst(BytesPerLong-1, G1_bits);
2725     __ br(Assembler::zero, true, Assembler::pt,
2726           long_copy_entry, relocInfo::runtime_call_type);
2727     // scale the count on the way out:
2728     __ delayed()->srax(O2_count, LogBytesPerLong, O2_count);
2729 
2730     __ btst(BytesPerInt-1, G1_bits);
2731     __ br(Assembler::zero, true, Assembler::pt,
2732           int_copy_entry, relocInfo::runtime_call_type);
2733     // scale the count on the way out:
2734     __ delayed()->srax(O2_count, LogBytesPerInt, O2_count);
2735 
2736     __ btst(BytesPerShort-1, G1_bits);
2737     __ br(Assembler::zero, true, Assembler::pt,
2738           short_copy_entry, relocInfo::runtime_call_type);
2739     // scale the count on the way out:
2740     __ delayed()->srax(O2_count, LogBytesPerShort, O2_count);
2741 
2742     __ br(Assembler::always, false, Assembler::pt,
2743           byte_copy_entry, relocInfo::runtime_call_type);
2744     __ delayed()->nop();
2745 
2746     return start;
2747   }
2748 
2749 
2750   // Perform range checks on the proposed arraycopy.
2751   // Kills the two temps, but nothing else.
2752   // Also, clean the sign bits of src_pos and dst_pos.
2753   void arraycopy_range_checks(Register src,     // source array oop (O0)
2754                               Register src_pos, // source position (O1)
                              Register dst,     // destination array oop (O2)
2756                               Register dst_pos, // destination position (O3)
2757                               Register length,  // length of copy (O4)
2758                               Register temp1, Register temp2,
2759                               Label& L_failed) {
2760     BLOCK_COMMENT("arraycopy_range_checks:");
2761 
2762     //  if (src_pos + length > arrayOop(src)->length() ) FAIL;
2763 
2764     const Register array_length = temp1;  // scratch
2765     const Register end_pos      = temp2;  // scratch
2766 
2767     // Note:  This next instruction may be in the delay slot of a branch:
2768     __ add(length, src_pos, end_pos);  // src_pos + length
2769     __ lduw(src, arrayOopDesc::length_offset_in_bytes(), array_length);
2770     __ cmp(end_pos, array_length);
2771     __ br(Assembler::greater, false, Assembler::pn, L_failed);
2772 
2773     //  if (dst_pos + length > arrayOop(dst)->length() ) FAIL;
2774     __ delayed()->add(length, dst_pos, end_pos); // dst_pos + length
2775     __ lduw(dst, arrayOopDesc::length_offset_in_bytes(), array_length);
2776     __ cmp(end_pos, array_length);
2777     __ br(Assembler::greater, false, Assembler::pn, L_failed);
2778 
2779     // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
2780     // Move with sign extension can be used since they are positive.
2781     __ delayed()->signx(src_pos, src_pos);
2782     __ signx(dst_pos, dst_pos);
2783 
2784     BLOCK_COMMENT("arraycopy_range_checks done");
2785   }
2786 
2787 
2788   //
2789   //  Generate generic array copy stubs
2790   //
2791   //  Input:
2792   //    O0    -  src oop
2793   //    O1    -  src_pos
2794   //    O2    -  dst oop
2795   //    O3    -  dst_pos
2796   //    O4    -  element count
2797   //
2798   //  Output:
2799   //    O0 ==  0  -  success
2800   //    O0 == -1  -  need to call System.arraycopy
2801   //
2802   address generate_generic_copy(const char *name,
2803                                 address entry_jbyte_arraycopy,
2804                                 address entry_jshort_arraycopy,
2805                                 address entry_jint_arraycopy,
2806                                 address entry_oop_arraycopy,
2807                                 address entry_jlong_arraycopy,
2808                                 address entry_checkcast_arraycopy) {
2809     Label L_failed, L_objArray;
2810 
2811     // Input registers
2812     const Register src      = O0;  // source array oop
2813     const Register src_pos  = O1;  // source position
2814     const Register dst      = O2;  // destination array oop
2815     const Register dst_pos  = O3;  // destination position
2816     const Register length   = O4;  // elements count
2817 
2818     // registers used as temp
2819     const Register G3_src_klass = G3; // source array klass
2820     const Register G4_dst_klass = G4; // destination array klass
    const Register G5_lh        = G5; // layout helper
2822     const Register O5_temp      = O5;
2823 
2824     __ align(CodeEntryAlignment);
2825     StubCodeMark mark(this, "StubRoutines", name);
2826     address start = __ pc();
2827 
2828     // bump this on entry, not on exit:
2829     inc_counter_np(SharedRuntime::_generic_array_copy_ctr, G1, G3);
2830 
2831     // In principle, the int arguments could be dirty.
2832     //assert_clean_int(src_pos, G1);
2833     //assert_clean_int(dst_pos, G1);
2834     //assert_clean_int(length, G1);
2835 
2836     //-----------------------------------------------------------------------
2837     // Assembler stubs will be used for this call to arraycopy
2838     // if the following conditions are met:
2839     //
2840     // (1) src and dst must not be null.
2841     // (2) src_pos must not be negative.
2842     // (3) dst_pos must not be negative.
2843     // (4) length  must not be negative.
2844     // (5) src klass and dst klass should be the same and not NULL.
2845     // (6) src and dst should be arrays.
2846     // (7) src_pos + length must not exceed length of src.
2847     // (8) dst_pos + length must not exceed length of dst.
2848     BLOCK_COMMENT("arraycopy initial argument checks");
2849 
2850     //  if (src == NULL) return -1;
2851     __ br_null(src, false, Assembler::pn, L_failed, false);
2852 
2853     //  if (src_pos < 0) return -1;
2854     __ delayed()->tst(src_pos);
2855     __ br(Assembler::negative, false, Assembler::pn, L_failed);
2856     __ delayed()->nop();
2857 
2858     //  if (dst == NULL) return -1;
2859     __ br_null(dst, false, Assembler::pn, L_failed, false);
2860 
2861     //  if (dst_pos < 0) return -1;
2862     __ delayed()->tst(dst_pos);
2863     __ br(Assembler::negative, false, Assembler::pn, L_failed);
2864 
2865     //  if (length < 0) return -1;
2866     __ delayed()->tst(length);
2867     __ br(Assembler::negative, false, Assembler::pn, L_failed);
2868 
2869     BLOCK_COMMENT("arraycopy argument klass checks");
2870     //  get src->klass()
2871     if (UseCompressedOops) {
2872       __ delayed()->nop(); // ??? not good
2873       __ load_klass(src, G3_src_klass);
2874     } else {
2875       __ delayed()->ld_ptr(src, oopDesc::klass_offset_in_bytes(), G3_src_klass);
2876     }
2877 
2878 #ifdef ASSERT
2879     //  assert(src->klass() != NULL);
2880     BLOCK_COMMENT("assert klasses not null");
2881     { Label L_a, L_b;
2882       __ br_notnull(G3_src_klass, false, Assembler::pt, L_b); // it is broken if klass is NULL
2883       __ bind(L_a);
2884       __ stop("broken null klass");
2885       __ bind(L_b);
2886       __ load_klass(dst, G4_dst_klass);
2887       __ br_null(G4_dst_klass, false, Assembler::pn, L_a, false); // this would be broken also
2888       __ delayed()->mov(G0, G4_dst_klass);      // scribble the temp
2889       BLOCK_COMMENT("assert done");
2890     }
2891 #endif
2892 
2893     // Load layout helper
2894     //
2895     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2896     // 32        30    24            16              8     2                 0
2897     //
2898     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2899     //
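    // Decoding sketch (C-like, illustrative only), using the same constants as
    // the code below:
    //   if (lh == Klass::array_layout_helper(T_OBJECT))  -> take the objArray path;
    //   header_size_in_bytes = (lh >> Klass::_lh_header_size_shift)
    //                          & Klass::_lh_header_size_mask;
    //   log2_element_size    =  lh & Klass::_lh_log2_element_size_mask;
    //   non-arrays have lh >= Klass::_lh_neutral_value and are rejected.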
2900 
2901     int lh_offset = klassOopDesc::header_size() * HeapWordSize +
2902                     Klass::layout_helper_offset_in_bytes();
2903 
2904     // Load the layout helper as a 32-bit signed value; subsequent cmp/br on it test icc.
2905     __ lduw(G3_src_klass, lh_offset, G5_lh);
2906 
2907     if (UseCompressedOops) {
2908       __ load_klass(dst, G4_dst_klass);
2909     }
2910     // Handle objArrays completely differently...
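         // (All objArrays share a single layout helper value, since they have the same
         //  tag, header size, element type and log2 element size, so one 32-bit compare
         //  against Klass::array_layout_helper(T_OBJECT) is enough to recognize them.)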
2911     juint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2912     __ set(objArray_lh, O5_temp);
2913     __ cmp(G5_lh,       O5_temp);
2914     __ br(Assembler::equal, false, Assembler::pt, L_objArray);
2915     if (UseCompressedOops) {
2916       __ delayed()->nop();
2917     } else {
2918       __ delayed()->ld_ptr(dst, oopDesc::klass_offset_in_bytes(), G4_dst_klass);
2919     }
2920 
2921     //  if (src->klass() != dst->klass()) return -1;
2922     __ cmp_and_brx(G3_src_klass, G4_dst_klass, Assembler::notEqual, false, Assembler::pn, L_failed);
2923 
2924     //  if (!src->is_Array()) return -1;
2925     __ cmp(G5_lh, Klass::_lh_neutral_value); // < 0
2926     __ br(Assembler::greaterEqual, false, Assembler::pn, L_failed);
2927 
2928     // At this point, it is known to be a typeArray (array_tag 0x3).
2929 #ifdef ASSERT
2930     __ delayed()->nop();
2931     { Label L;
2932       jint lh_prim_tag_in_place = (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2933       __ set(lh_prim_tag_in_place, O5_temp);
2934       __ cmp(G5_lh,                O5_temp);
2935       __ br(Assembler::greaterEqual, false, Assembler::pt, L);
2936       __ delayed()->nop();
2937       __ stop("must be a primitive array");
2938       __ bind(L);
2939     }
2940 #else
2941     __ delayed();                               // match next insn to prev branch
2942 #endif
2943 
2944     arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
2945                            O5_temp, G4_dst_klass, L_failed);
2946 
2947     // typeArrayKlass
2948     //
2949     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2950     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2951     //
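         // Worked example (illustrative only): for a jshort array log2elemsize is 1,
         // so src_pos = 5 contributes 5 << 1 = 10 bytes beyond the array header.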
2952 
2953     const Register G4_offset = G4_dst_klass;    // array offset
2954     const Register G3_elsize = G3_src_klass;    // log2 element size
2955 
2956     __ srl(G5_lh, Klass::_lh_header_size_shift, G4_offset);
2957     __ and3(G4_offset, Klass::_lh_header_size_mask, G4_offset); // array_offset
2958     __ add(src, G4_offset, src);       // src array offset
2959     __ add(dst, G4_offset, dst);       // dst array offset
2960     __ and3(G5_lh, Klass::_lh_log2_element_size_mask, G3_elsize); // log2 element size
2961 
2962     // the following registers must be set before the jump to the corresponding stub
2963     const Register from     = O0;  // source array address
2964     const Register to       = O1;  // destination array address
2965     const Register count    = O2;  // elements count
2966 
2967     // 'from', 'to', 'count' registers must be set in this order
2968     // since they alias 'src', 'src_pos', 'dst' (the same O0, O1, O2).
2969 
2970     BLOCK_COMMENT("scale indexes to element size");
2971     __ sll_ptr(src_pos, G3_elsize, src_pos);
2972     __ sll_ptr(dst_pos, G3_elsize, dst_pos);
2973     __ add(src, src_pos, from);       // src_addr
2974     __ add(dst, dst_pos, to);         // dst_addr
2975 
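         // Dispatch sketch for the branches below (log2 element size -> copy stub):
         //   0 -> jbyte, 1 (LogBytesPerShort) -> jshort, 2 (LogBytesPerInt) -> jint,
         //   3 (LogBytesPerLong) -> jlong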
2976     BLOCK_COMMENT("choose copy loop based on element size");
2977     __ cmp(G3_elsize, 0);
2978     __ br(Assembler::equal, true, Assembler::pt, entry_jbyte_arraycopy);
2979     __ delayed()->signx(length, count); // length
2980 
2981     __ cmp(G3_elsize, LogBytesPerShort);
2982     __ br(Assembler::equal, true, Assembler::pt, entry_jshort_arraycopy);
2983     __ delayed()->signx(length, count); // length
2984 
2985     __ cmp(G3_elsize, LogBytesPerInt);
2986     __ br(Assembler::equal, true, Assembler::pt, entry_jint_arraycopy);
2987     __ delayed()->signx(length, count); // length
2988 #ifdef ASSERT
2989     { Label L;
2990       __ cmp_and_br(G3_elsize, LogBytesPerLong, Assembler::equal, false, Assembler::pt, L);
2991       __ stop("must be long copy, but elsize is wrong");
2992       __ bind(L);
2993     }
2994 #endif
2995     __ br(Assembler::always, false, Assembler::pt, entry_jlong_arraycopy);
2996     __ delayed()->signx(length, count); // length
2997 
2998     // objArrayKlass
2999   __ BIND(L_objArray);
3000     // live at this point:  G3_src_klass, G4_dst_klass, src[_pos], dst[_pos], length
3001 
3002     Label L_plain_copy, L_checkcast_copy;
3003     //  test array classes for subtyping
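         // Two possible paths from here:
         //   - klasses are identical -> fall through to the plain (unchecked) oop copy
         //   - klasses differ        -> L_checkcast_copy, which type-checks each element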
3004     __ cmp(G3_src_klass, G4_dst_klass);         // usual case is exact equality
3005     __ brx(Assembler::notEqual, true, Assembler::pn, L_checkcast_copy);
3006     __ delayed()->lduw(G4_dst_klass, lh_offset, O5_temp); // hoisted from below
3007 
3008     // Identically typed arrays can be copied without element-wise checks.
3009     arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
3010                            O5_temp, G5_lh, L_failed);
3011 
3012     __ add(src, arrayOopDesc::base_offset_in_bytes(T_OBJECT), src); //src offset
3013     __ add(dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT), dst); //dst offset
3014     __ sll_ptr(src_pos, LogBytesPerHeapOop, src_pos);
3015     __ sll_ptr(dst_pos, LogBytesPerHeapOop, dst_pos);
3016     __ add(src, src_pos, from);       // src_addr
3017     __ add(dst, dst_pos, to);         // dst_addr
3018   __ BIND(L_plain_copy);
3019     __ br(Assembler::always, false, Assembler::pt, entry_oop_arraycopy);
3020     __ delayed()->signx(length, count); // length
3021 
3022   __ BIND(L_checkcast_copy);
3023     // live at this point:  G3_src_klass, G4_dst_klass
3024     {
3025       // Before looking at dst.length, make sure dst is also an objArray.
3026       // lduw(G4_dst_klass, lh_offset, O5_temp); // hoisted to delay slot
3027       __ cmp(G5_lh,                    O5_temp);
3028       __ br(Assembler::notEqual, false, Assembler::pn, L_failed);
3029 
3030       // It is safe to examine both src.length and dst.length.
3031       __ delayed();                             // match next insn to prev branch
3032       arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
3033                              O5_temp, G5_lh, L_failed);
3034 
3035       // Marshal the base address arguments now, freeing registers.
3036       __ add(src, arrayOopDesc::base_offset_in_bytes(T_OBJECT), src); //src offset
3037       __ add(dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT), dst); //dst offset
3038       __ sll_ptr(src_pos, LogBytesPerHeapOop, src_pos);
3039       __ sll_ptr(dst_pos, LogBytesPerHeapOop, dst_pos);
3040       __ add(src, src_pos, from);               // src_addr
3041       __ add(dst, dst_pos, to);                 // dst_addr
3042       __ signx(length, count);                  // length (reloaded)
3043 
3044       Register sco_temp = O3;                   // this register is free now
3045       assert_different_registers(from, to, count, sco_temp,
3046                                  G4_dst_klass, G3_src_klass);
3047 
3048       // Generate the type check.
3049       int sco_offset = (klassOopDesc::header_size() * HeapWordSize +
3050                         Klass::super_check_offset_offset_in_bytes());
3051       __ lduw(G4_dst_klass, sco_offset, sco_temp);
3052       generate_type_check(G3_src_klass, sco_temp, G4_dst_klass,
3053                           O5_temp, L_plain_copy);
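           // If the source array klass is a subtype of the destination array klass,
           // generate_type_check branches back to L_plain_copy (no per-element checks
           // are needed); otherwise we fall through and set up the checkcast copy.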
3054 
3055       // Fetch destination element klass from the objArrayKlass header.
3056       int ek_offset = (klassOopDesc::header_size() * HeapWordSize +
3057                        objArrayKlass::element_klass_offset_in_bytes());
3058 
3059       // the checkcast_copy loop needs two extra arguments:
3060       __ ld_ptr(G4_dst_klass, ek_offset, O4);   // dest elem klass
3061       // lduw(O4, sco_offset, O3);              // sco of elem klass
3062 
3063       __ br(Assembler::always, false, Assembler::pt, entry_checkcast_arraycopy);
3064       __ delayed()->lduw(O4, sco_offset, O3);
3065     }
3066 
3067   __ BIND(L_failed);
3068     __ retl();
3069     __ delayed()->sub(G0, 1, O0); // return -1
3070     return start;
3071   }
3072 
3073   void generate_arraycopy_stubs() {
3074     address entry;
3075     address entry_jbyte_arraycopy;
3076     address entry_jshort_arraycopy;
3077     address entry_jint_arraycopy;
3078     address entry_oop_arraycopy;
3079     address entry_jlong_arraycopy;
3080     address entry_checkcast_arraycopy;
3081 
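         // Naming sketch (as used by the generator calls below):
         //  - the first boolean argument selects the "aligned" flavor; the "arrayof_"
         //    entries may assume a HeapWord-aligned start address, the plain ones may not.
         //  - "disjoint" copies assume non-overlapping ranges; the conjoint ("arraycopy")
         //    versions also handle overlapping ranges.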
3082     //*** jbyte
3083     // Always need aligned and unaligned versions
3084     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
3085                                                                                   "jbyte_disjoint_arraycopy");
3086     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
3087                                                                                   &entry_jbyte_arraycopy,
3088                                                                                   "jbyte_arraycopy");
3089     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
3090                                                                                   "arrayof_jbyte_disjoint_arraycopy");
3091     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
3092                                                                                   "arrayof_jbyte_arraycopy");
3093 
3094     //*** jshort
3095     // Always need aligned and unaligned versions
3096     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
3097                                                                                     "jshort_disjoint_arraycopy");
3098     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
3099                                                                                     &entry_jshort_arraycopy,
3100                                                                                     "jshort_arraycopy");
3101     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
3102                                                                                     "arrayof_jshort_disjoint_arraycopy");
3103     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
3104                                                                                     "arrayof_jshort_arraycopy");
3105 
3106     //*** jint
3107     // Aligned versions
3108     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
3109                                                                                 "arrayof_jint_disjoint_arraycopy");
3110     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
3111                                                                                 "arrayof_jint_arraycopy");
3112 #ifdef _LP64
3113     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
3114     // entry_jint_arraycopy always points to the unaligned version (notice that we overwrite it).
3115     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
3116                                                                                 "jint_disjoint_arraycopy");
3117     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
3118                                                                                 &entry_jint_arraycopy,
3119                                                                                 "jint_arraycopy");
3120 #else
3121     // In 32-bit VMs jints are always HeapWordSize aligned, so always use the aligned version
3122     // (in fact, in 32-bit mode there is always a pre-loop even in the aligned version,
3123     //  because it uses 64-bit loads/stores, so the aligned flag is effectively ignored).
3124     StubRoutines::_jint_disjoint_arraycopy = StubRoutines::_arrayof_jint_disjoint_arraycopy;
3125     StubRoutines::_jint_arraycopy          = StubRoutines::_arrayof_jint_arraycopy;
3126 #endif
3127 
3128 
3129     //*** jlong
3130     // It is always aligned
3131     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
3132                                                                                   "arrayof_jlong_disjoint_arraycopy");
3133     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
3134                                                                                   "arrayof_jlong_arraycopy");
3135     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
3136     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
3137 
3138 
3139     //*** oops
3140     // Aligned versions
3141     StubRoutines::_arrayof_oop_disjoint_arraycopy        = generate_disjoint_oop_copy(true, &entry,
3142                                                                                       "arrayof_oop_disjoint_arraycopy");
3143     StubRoutines::_arrayof_oop_arraycopy                 = generate_conjoint_oop_copy(true, entry, &entry_oop_arraycopy,
3144                                                                                       "arrayof_oop_arraycopy");
3145     // Aligned versions without pre-barriers
3146     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(true, &entry,
3147                                                                                       "arrayof_oop_disjoint_arraycopy_uninit",
3148                                                                                       /*dest_uninitialized*/true);
3149     StubRoutines::_arrayof_oop_arraycopy_uninit          = generate_conjoint_oop_copy(true, entry, NULL,
3150                                                                                       "arrayof_oop_arraycopy_uninit",
3151                                                                                       /*dest_uninitialized*/true);
3152 #ifdef _LP64
3153     if (UseCompressedOops) {
3154       // With compressed oops we need unaligned versions; note that we overwrite entry_oop_arraycopy.
3155       StubRoutines::_oop_disjoint_arraycopy            = generate_disjoint_oop_copy(false, &entry,
3156                                                                                     "oop_disjoint_arraycopy");
3157       StubRoutines::_oop_arraycopy                     = generate_conjoint_oop_copy(false, entry, &entry_oop_arraycopy,
3158                                                                                     "oop_arraycopy");
3159       // Unaligned versions without pre-barriers
3160       StubRoutines::_oop_disjoint_arraycopy_uninit     = generate_disjoint_oop_copy(false, &entry,
3161                                                                                     "oop_disjoint_arraycopy_uninit",
3162                                                                                     /*dest_uninitialized*/true);
3163       StubRoutines::_oop_arraycopy_uninit              = generate_conjoint_oop_copy(false, entry, NULL,
3164                                                                                     "oop_arraycopy_uninit",
3165                                                                                     /*dest_uninitialized*/true);
3166     } else
3167 #endif
3168     {
3169       // oop arraycopy is always aligned on 32-bit, and on 64-bit when compressed oops are not used
3170       StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
3171       StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
3172       StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
3173       StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
3174     }
3175 
3176     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
3177     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
3178                                                                         /*dest_uninitialized*/true);
3179 
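         // The unsafe and generic stubs are thin front ends: after their own checks they
         // tail-jump into the type-specific copy loops captured above in the entry_* locals.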
3180     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
3181                                                               entry_jbyte_arraycopy,
3182                                                               entry_jshort_arraycopy,
3183                                                               entry_jint_arraycopy,
3184                                                               entry_jlong_arraycopy);
3185     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
3186                                                                entry_jbyte_arraycopy,
3187                                                                entry_jshort_arraycopy,
3188                                                                entry_jint_arraycopy,
3189                                                                entry_oop_arraycopy,
3190                                                                entry_jlong_arraycopy,
3191                                                                entry_checkcast_arraycopy);
3192 
3193     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
3194     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
3195     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
3196     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
3197     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
3198     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
3199   }
3200 
3201   void generate_initial() {
3202     // Generates the initial stubs and initializes the entry points
3203 
3204     //------------------------------------------------------------------------------------------------------------------------
3205     // entry points that exist in all platforms
3206     // Note: This is code that could be shared among different platforms - however the benefit seems to be smaller than
3207     //       the disadvantage of having a much more complicated generator structure. See also comment in stubRoutines.hpp.
3208     StubRoutines::_forward_exception_entry                 = generate_forward_exception();
3209 
3210     StubRoutines::_call_stub_entry                         = generate_call_stub(StubRoutines::_call_stub_return_address);
3211     StubRoutines::_catch_exception_entry                   = generate_catch_exception();
3212 
3213     //------------------------------------------------------------------------------------------------------------------------
3214     // entry points that are platform specific
3215     StubRoutines::Sparc::_test_stop_entry                  = generate_test_stop();
3216 
3217     StubRoutines::Sparc::_stop_subroutine_entry            = generate_stop_subroutine();
3218     StubRoutines::Sparc::_flush_callers_register_windows_entry = generate_flush_callers_register_windows();
3219 
3220 #if !defined(COMPILER2) && !defined(_LP64)
3221     StubRoutines::_atomic_xchg_entry         = generate_atomic_xchg();
3222     StubRoutines::_atomic_cmpxchg_entry      = generate_atomic_cmpxchg();
3223     StubRoutines::_atomic_add_entry          = generate_atomic_add();
3224     StubRoutines::_atomic_xchg_ptr_entry     = StubRoutines::_atomic_xchg_entry;
3225     StubRoutines::_atomic_cmpxchg_ptr_entry  = StubRoutines::_atomic_cmpxchg_entry;
3226     StubRoutines::_atomic_cmpxchg_long_entry = generate_atomic_cmpxchg_long();
3227     StubRoutines::_atomic_add_ptr_entry      = StubRoutines::_atomic_add_entry;
3228 #endif  // !COMPILER2 && !_LP64
3229 
3230     // Build this early so it's available for the interpreter.  The
3231     // stub expects the required and actual type to already be in O1
3232     // and O2 respectively.
3233     StubRoutines::_throw_WrongMethodTypeException_entry =
3234       generate_throw_exception("WrongMethodTypeException throw_exception",
3235                                CAST_FROM_FN_PTR(address, SharedRuntime::throw_WrongMethodTypeException),
3236                                false, G5_method_type, G3_method_handle);
3237   }
3238 
3239 
3240   void generate_all() {
3241     // Generates all stubs and initializes the entry points
3242 
3243     // Generate partial_subtype_check first here since its code depends on
3244     // UseZeroBaseCompressedOops which is defined after heap initialization.
3245     StubRoutines::Sparc::_partial_subtype_check                = generate_partial_subtype_check();
3246     // These entry points require SharedInfo::stack0 to be set up in non-core builds
3247     StubRoutines::_throw_AbstractMethodError_entry         = generate_throw_exception("AbstractMethodError throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError),  false);
3248     StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError),  false);
3249     StubRoutines::_throw_ArithmeticException_entry         = generate_throw_exception("ArithmeticException throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_ArithmeticException),  true);
3250     StubRoutines::_throw_NullPointerException_entry        = generate_throw_exception("NullPointerException throw_exception",         CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException), true);
3251     StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call), false);
3252     StubRoutines::_throw_StackOverflowError_entry          = generate_throw_exception("StackOverflowError throw_exception",           CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError),   false);
3253 
3254     StubRoutines::_handler_for_unsafe_access_entry =
3255       generate_handler_for_unsafe_access();
3256 
3257     // support for verify_oop (must happen after universe_init)
3258     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop_subroutine();
3259 
3260     // arraycopy stubs used by compilers
3261     generate_arraycopy_stubs();
3262 
3263     // Don't initialize the platform math functions since sparc
3264     // doesn't have intrinsics for these operations.
3265   }
3266 
3267 
3268  public:
3269   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
3270     // replace the standard masm with a special one:
3271     _masm = new MacroAssembler(code);
3272 
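         // Different seeds, presumably so stubs emitted by the two generation passes get
         // visibly distinct numbers in the debug prolog data (see stub_prolog below).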
3273     _stub_count = !all ? 0x100 : 0x200;
3274     if (all) {
3275       generate_all();
3276     } else {
3277       generate_initial();
3278     }
3279 
3280     // make sure this stub is available for all local calls
3281     if (_atomic_add_stub.is_unbound()) {
3282       // generate a second time, if necessary
3283       (void) generate_atomic_add();
3284     }
3285   }
3286 
3287 
3288  private:
3289   int _stub_count;
3290   void stub_prolog(StubCodeDesc* cdesc) {
3291     # ifdef ASSERT
3292       // put extra information in the stub code, to make it more readable
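           // Emitted debug data (sketch): on 64-bit, the high 32 bits of the StubCodeDesc*
           // first, then its low 32 bits, then a running stub count.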
3293 #ifdef _LP64
3294       // Write the high part of the address
3295       // [RGV] Check if there is a dependency on the size of this prolog
3296       __ emit_data((intptr_t)cdesc >> 32,    relocInfo::none);
3297 #endif
3298       __ emit_data((intptr_t)cdesc,    relocInfo::none);
3299       __ emit_data(++_stub_count, relocInfo::none);
3300     # endif
3301     align(true);
3302   }
3303 
3304   void align(bool at_header = false) {
3305     // %%%%% move this constant somewhere else
3306     // UltraSPARC cache line size is 8 instructions:
3307     const unsigned int icache_line_size = 32;
3308     const unsigned int icache_half_line_size = 16;
3309 
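         // At a stub header, pad with zero data words out to a full 32-byte icache line;
         // elsewhere, pad with nops to the next 16-byte (half-line) boundary.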
3310     if (at_header) {
3311       while ((intptr_t)(__ pc()) % icache_line_size != 0) {
3312         __ emit_data(0, relocInfo::none);
3313       }
3314     } else {
3315       while ((intptr_t)(__ pc()) % icache_half_line_size != 0) {
3316         __ nop();
3317       }
3318     }
3319   }
3320 
3321 }; // end class declaration
3322 
3323 void StubGenerator_generate(CodeBuffer* code, bool all) {
3324   StubGenerator g(code, all);
3325 }