/*
 * Copyright (c) 1997, 2012, Oracle and/or its affiliates. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/assembler.hpp"
#include "assembler_sparc.inline.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_sparc.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "utilities/top.hpp"
#ifdef TARGET_OS_FAMILY_linux
# include "thread_linux.inline.hpp"
#endif
#ifdef TARGET_OS_FAMILY_solaris
# include "thread_solaris.inline.hpp"
#endif
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp.

#define __ _masm->

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Note: The register L7 is used as L7_thread_cache, and may not be used
//       any other way within this module.
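
// For reference, a minimal sketch of how the shorthand above reads in the
// generators below (illustration only, nothing here is compiled):
//
//   __ save_frame(0);    // expands to: _masm->save_frame(0);
//   __ BIND(L_exit);     // expands to: _masm->bind(L_exit);
//                        //             _masm->block_comment("L_exit:");
//
// so each stub generator is just a sequence of MacroAssembler calls that
// emit SPARC instructions into the current CodeBuffer.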


static const Register& Lstub_temp = L2;

// -------------------------------------------------------------------------------------------------------------------------
// Stub Code definitions

static address handle_unsafe_access() {
  JavaThread* thread = JavaThread::current();
  address pc  = thread->saved_exception_pc();
  address npc = thread->saved_exception_npc();
  // pc is the instruction which we must emulate
  // doing a no-op is fine: return garbage from the load

  // request an async exception
  thread->set_pending_unsafe_access_error();

  // return address of next instruction to execute
  return npc;
}

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(a,b,c) (0)
#else
#define inc_counter_np(counter, t1, t2) \
  BLOCK_COMMENT("inc_counter " #counter); \
  __ inc_counter(&counter, t1, t2);
#endif

  //----------------------------------------------------------------------------------------------------
  // Call stubs are used to call Java from C

  address generate_call_stub(address& return_pc) {
    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    // Incoming arguments:
    //
    // o0         : call wrapper address
    // o1         : result (address)
    // o2         : result type
    // o3         : method
    // o4         : (interpreter) entry point
    // o5         : parameters (address)
    // [sp + 0x5c]: parameter size (in words)
    // [sp + 0x60]: thread
    //
    // +---------------+ <--- sp + 0
    // |               |
    // . reg save area .
    // |               |
    // +---------------+ <--- sp + 0x40
    // |               |
    // . extra 7 slots .
    // |               |
    // +---------------+ <--- sp + 0x5c
    // |  param. size  |
    // +---------------+ <--- sp + 0x60
    // |    thread     |
    // +---------------+
    // |               |

    // note: if the link argument position changes, adjust
    //       the code in frame::entry_frame_call_wrapper()

    const Argument link           = Argument(0, false); // used only for GC
    const Argument result         = Argument(1, false);
    const Argument result_type    = Argument(2, false);
    const Argument method         = Argument(3, false);
    const Argument entry_point    = Argument(4, false);
    const Argument parameters     = Argument(5, false);
    const Argument parameter_size = Argument(6, false);
    const Argument thread         = Argument(7, false);

    // setup thread register
    __ ld_ptr(thread.as_address(), G2_thread);
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    { const Register t = G3_scratch;
      Label L;
      __ ld_ptr(G2_thread, in_bytes(Thread::pending_exception_offset()), t);
      __ br_null_short(t, Assembler::pt, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ bind(L);
    }
#endif

    // create activation frame & allocate space for parameters
    { const Register t = G3_scratch;
      __ ld_ptr(parameter_size.as_address(), t);            // get parameter size (in words)
      __ add(t, frame::memory_parameter_word_sp_offset, t); // add space for save area (in words)
      __ round_to(t, WordsPerLong);                         // make sure it is a multiple of 2 (in words)
      __ sll(t, Interpreter::logStackElementSize, t);       // compute number of bytes
      __ neg(t);                                            // negate so it can be used with save
      __ save(SP, t, SP);                                   // setup new frame
    }

    // +---------------+ <--- sp + 0
    // |               |
    // . reg save area .
    // |               |
    // +---------------+ <--- sp + 0x40
    // |               |
    // . extra 7 slots .
    // |               |
    // +---------------+ <--- sp + 0x5c
    // |  empty slot   |      (only if parameter size is even)
    // +---------------+
    // |               |
    // .  parameters   .
    // |               |
    // +---------------+ <--- fp + 0
    // |               |
    // . reg save area .
    // |               |
    // +---------------+ <--- fp + 0x40
    // |               |
    // . extra 7 slots .
    // |               |
    // +---------------+ <--- fp + 0x5c
    // |  param. size  |
    // +---------------+ <--- fp + 0x60
    // |    thread     |
    // +---------------+
    // |               |

    // pass parameters if any
    BLOCK_COMMENT("pass parameters if any");
    { const Register src = parameters.as_in().as_register();
      const Register dst = Lentry_args;
      const Register tmp = G3_scratch;
      const Register cnt = G4_scratch;

      // test if any parameters & setup of Lentry_args
      Label exit;
      __ ld_ptr(parameter_size.as_in().as_address(), cnt); // parameter counter
      __ add( FP, STACK_BIAS, dst );
      __ cmp_zero_and_br(Assembler::zero, cnt, exit);
      __ delayed()->sub(dst, BytesPerWord, dst);           // setup Lentry_args

      // copy parameters if any
      Label loop;
      __ BIND(loop);
      // Store parameter value
      __ ld_ptr(src, 0, tmp);
      __ add(src, BytesPerWord, src);
      __ st_ptr(tmp, dst, 0);
      __ deccc(cnt);
      __ br(Assembler::greater, false, Assembler::pt, loop);
      __ delayed()->sub(dst, Interpreter::stackElementSize, dst);

      // done
      __ BIND(exit);
    }

    // setup parameters, method & call Java function
#ifdef ASSERT
    // layout_activation_impl checks its notion of saved SP against
    // this register, so if this changes update it as well.
    const Register saved_SP = Lscratch;
    __ mov(SP, saved_SP);                               // keep track of SP before call
#endif

    // setup parameters
    const Register t = G3_scratch;
    __ ld_ptr(parameter_size.as_in().as_address(), t);  // get parameter size (in words)
    __ sll(t, Interpreter::logStackElementSize, t);     // compute number of bytes
    __ sub(FP, t, Gargs);                               // setup parameter pointer
#ifdef _LP64
    __ add( Gargs, STACK_BIAS, Gargs );                 // Account for LP64 stack bias
#endif
    __ mov(SP, O5_savedSP);


    // do the call
    //
    // the following registers must be set up:
    //
    // G2_thread
    // G5_method
    // Gargs
    BLOCK_COMMENT("call Java function");
    __ jmpl(entry_point.as_in().as_register(), G0, O7);
    __ delayed()->mov(method.as_in().as_register(), G5_method);   // setup method

    BLOCK_COMMENT("call_stub_return_address:");
    return_pc = __ pc();

    // The callee, if it wasn't interpreted, can return with SP changed so
    // we can no longer assert that SP is unchanged.

    // store result depending on type
    // (everything that is not T_OBJECT, T_LONG, T_FLOAT, or T_DOUBLE
    //  is treated as T_INT)
    { const Register addr = result     .as_in().as_register();
      const Register type = result_type.as_in().as_register();
      Label is_long, is_float, is_double, is_object, exit;
      __            cmp(type, T_OBJECT);  __ br(Assembler::equal, false, Assembler::pn, is_object);
      __ delayed()->cmp(type, T_FLOAT);   __ br(Assembler::equal, false, Assembler::pn, is_float);
      __ delayed()->cmp(type, T_DOUBLE);  __ br(Assembler::equal, false, Assembler::pn, is_double);
      __ delayed()->cmp(type, T_LONG);    __ br(Assembler::equal, false, Assembler::pn, is_long);
      __ delayed()->nop();

      // store int result
      __ st(O0, addr, G0);

      __ BIND(exit);
      __ ret();
      __ delayed()->restore();

      __ BIND(is_object);
      __ ba(exit);
      __ delayed()->st_ptr(O0, addr, G0);

      __ BIND(is_float);
      __ ba(exit);
      __ delayed()->stf(FloatRegisterImpl::S, F0, addr, G0);

      __ BIND(is_double);
      __ ba(exit);
      __ delayed()->stf(FloatRegisterImpl::D, F0, addr, G0);

      __ BIND(is_long);
#ifdef _LP64
      __ ba(exit);
      __ delayed()->st_long(O0, addr, G0);      // store entire long
#else
#if defined(COMPILER2)
      // All return values are where we want them, except for Longs.  C2 returns
      // longs in G1 in the 32-bit build whereas the interpreter wants them in O0/O1.
      // Since the interpreter will return longs in G1 and O0/O1 in the 32-bit
      // build we simply always use G1.
      // Note: I tried to make c2 return longs in O0/O1 and G1 so we wouldn't have to
      // do this here.  Unfortunately if we did a rethrow we'd see a MachEpilog node
      // first which would move g1 -> O0/O1 and destroy the exception we were throwing.

      __ ba(exit);
      __ delayed()->stx(G1, addr, G0);  // store entire long
#else
      __ st(O1, addr, BytesPerInt);
      __ ba(exit);
      __ delayed()->st(O0, addr, G0);
#endif /* COMPILER2 */
#endif /* _LP64 */
    }
    return start;
  }
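
  // For orientation, the C++ side enters the stub above through a function
  // pointer whose shape mirrors the "incoming arguments" list at the top of
  // generate_call_stub().  A minimal sketch (see stubRoutines.hpp for the
  // authoritative typedef; the parameter names here are illustrative):
  //
  //   typedef void (*CallStub)(address   link,          // call wrapper (O0)
  //                            intptr_t* result,        // result buffer (O1)
  //                            BasicType result_type,   // tag dispatched above (O2)
  //                            Method*   method,        // callee (O3)
  //                            address   entry_point,   // interpreter entry (O4)
  //                            intptr_t* parameters,    // argument words (O5)
  //                            int       size_of_parameters,
  //                            TRAPS);                  // current thread
  //
  // JavaCalls::call_helper() invokes StubRoutines::call_stub() with these
  // arguments, and the stub stores the Java result through 'result' using
  // the type dispatch generated above.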


  //----------------------------------------------------------------------------------------------------
  // Return point for a Java call if there's an exception thrown in Java code.
  // The exception is caught and transformed into a pending exception stored in
  // JavaThread that can be tested from within the VM.
  //
  // Oexception: exception oop

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");

    address start = __ pc();
    // verify that thread corresponds
    __ verify_thread();

    const Register& temp_reg = Gtemp;
    Address pending_exception_addr    (G2_thread, Thread::pending_exception_offset());
    Address exception_file_offset_addr(G2_thread, Thread::exception_file_offset   ());
    Address exception_line_offset_addr(G2_thread, Thread::exception_line_offset   ());

    // set pending exception
    __ verify_oop(Oexception);
    __ st_ptr(Oexception, pending_exception_addr);
    __ set((intptr_t)__FILE__, temp_reg);
    __ st_ptr(temp_reg, exception_file_offset_addr);
    __ set((intptr_t)__LINE__, temp_reg);
    __ st(temp_reg, exception_line_offset_addr);

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL, "must have been generated before");

    AddressLiteral stub_ret(StubRoutines::_call_stub_return_address);
    __ jump_to(stub_ret, temp_reg);
    __ delayed()->nop();

    return start;
  }


  //----------------------------------------------------------------------------------------------------
  // Continuation point for runtime calls returning with a pending exception
  // The pending exception check happened in the runtime or native call stub
  // The pending exception in Thread is converted into a Java-level exception
  //
  // Contract with Java-level exception handler: O0 = exception
  //                                             O1 = throwing pc

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward_exception");
    address start = __ pc();

    // Upon entry, O7 has the return address returning into Java
    // (interpreted or compiled) code; i.e. the return address
    // becomes the throwing pc.

    const Register& handler_reg = Gtemp;

    Address exception_addr(G2_thread, Thread::pending_exception_offset());

#ifdef ASSERT
    // make sure that this code is only executed if there is a pending exception
    { Label L;
      __ ld_ptr(exception_addr, Gtemp);
      __ br_notnull_short(Gtemp, Assembler::pt, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into handler_reg
    __ get_thread();
    __ ld_ptr(exception_addr, Oexception);
    __ verify_oop(Oexception);
    __ save_frame(0);             // compensates for compiler weakness
    __ add(O7->after_save(), frame::pc_return_offset, Lscratch); // save the issuing PC
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(L7_thread_cache, CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), G2_thread, Lscratch);
    __ mov(O0, handler_reg);
    __ restore();                 // compensates for compiler weakness

    __ ld_ptr(exception_addr, Oexception);
    __ add(O7, frame::pc_return_offset, Oissuing_pc); // save the issuing PC

#ifdef ASSERT
    // make sure exception is set
    { Label L;
      __ br_notnull_short(Oexception, Assembler::pt, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif
    // jump to exception handler
    __ jmp(handler_reg, 0);
    // clear pending exception
    __ delayed()->st_ptr(G0, exception_addr);

    return start;
  }


  //------------------------------------------------------------------------------------------------------------------------
  // Continuation point for throwing of implicit exceptions that are not handled in
  // the current activation. Fabricates an exception oop and initiates normal
  // exception dispatching in this frame. Only callee-saved registers are preserved
  // (through the normal register window / RegisterMap handling).
  // If the compiler needs all registers to be preserved between the fault
  // point and the exception handler then it must assume responsibility for that in
  // AbstractCompiler::continuation_for_implicit_null_exception or
  // continuation_for_implicit_division_by_zero_exception. All other implicit
  // exceptions (e.g., NullPointerException or AbstractMethodError on entry) are
  // either at call sites or otherwise assume that stack unwinding will be initiated,
  // so caller-saved registers are assumed volatile by the compiler.

  // Note that we generate only this stub into a RuntimeStub, because it needs to be
  // properly traversed and ignored during GC, so we change the meaning of the "__"
  // macro within this method.
#undef __
#define __ masm->

  address generate_throw_exception(const char* name, address runtime_entry,
                                   Register arg1 = noreg, Register arg2 = noreg) {
#ifdef ASSERT
    int insts_size = VerifyThread ? 1 * K : 600;
#else
    int insts_size = VerifyThread ? 1 * K : 256;
#endif /* ASSERT */
    int locs_size = 32;

    CodeBuffer code(name, insts_size, locs_size);
    MacroAssembler* masm = new MacroAssembler(&code);

    __ verify_thread();

    // This is an inlined and slightly modified version of call_VM
    // which has the ability to fetch the return PC out of thread-local storage
    __ assert_not_delayed();

    // Note that we always push a frame because on the SPARC
    // architecture, for all of our implicit exception kinds at call
    // sites, the implicit exception is taken before the callee frame
    // is pushed.
    __ save_frame(0);

    int frame_complete = __ offset();

    // Note that we always have a runtime stub frame on the top of stack by this point
    Register last_java_sp = SP;
    // 64-bit last_java_sp is biased!
    __ set_last_Java_frame(last_java_sp, G0);
    if (VerifyThread) __ mov(G2_thread, O0); // about to be smashed; pass early
    __ save_thread(noreg);
    if (arg1 != noreg) {
      assert(arg2 != O1, "clobbered");
      __ mov(arg1, O1);
    }
    if (arg2 != noreg) {
      __ mov(arg2, O2);
    }
    // do the call
    BLOCK_COMMENT("call runtime_entry");
    __ call(runtime_entry, relocInfo::runtime_call_type);
    if (!VerifyThread)
      __ delayed()->mov(G2_thread, O0);  // pass thread as first argument
    else
      __ delayed()->nop();               // (thread already passed)
    __ restore_thread(noreg);
    __ reset_last_Java_frame();

    // check for pending exceptions. use Gtemp as scratch register.
#ifdef ASSERT
    Label L;

    Address exception_addr(G2_thread, Thread::pending_exception_offset());
    Register scratch_reg = Gtemp;
    __ ld_ptr(exception_addr, scratch_reg);
    __ br_notnull_short(scratch_reg, Assembler::pt, L);
    __ should_not_reach_here();
    __ bind(L);
#endif // ASSERT
    BLOCK_COMMENT("call forward_exception_entry");
    __ call(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type);
    // we use O7 linkage so that forward_exception_entry has the issuing PC
    __ delayed()->restore();

    RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, masm->total_frame_size_in_bytes(0), NULL, false);
    return stub->entry_point();
  }

#undef __
#define __ _masm->
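
  // How this generator is typically wired up (a sketch of the
  // generate_all()-style initialization used elsewhere in this file; the
  // exact set of throw stubs registered may differ by JDK version):
  //
  //   StubRoutines::_throw_AbstractMethodError_entry =
  //       generate_throw_exception("AbstractMethodError throw_exception",
  //                                CAST_FROM_FN_PTR(address,
  //                                    SharedRuntime::throw_AbstractMethodError));
  //
  // Each call produces a small RuntimeStub that enters the VM, asks the
  // runtime entry to post the exception, and then falls into
  // forward_exception_entry to dispatch it.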

  // Generate a routine that sets all the registers so we
  // can tell if the stop routine prints them correctly.
  address generate_test_stop() {
    StubCodeMark mark(this, "StubRoutines", "test_stop");
    address start = __ pc();

    int i;

    __ save_frame(0);

    static jfloat zero = 0.0, one = 1.0;

    // put addr in L0, then load through L0 to F0
    __ set((intptr_t)&zero, L0);  __ ldf( FloatRegisterImpl::S, L0, 0, F0);
    __ set((intptr_t)&one,  L0);  __ ldf( FloatRegisterImpl::S, L0, 0, F1); // 1.0 to F1

    // use add to put 2..18 in F2..F18
    for ( i = 2;  i <= 18;  ++i ) {
      __ fadd( FloatRegisterImpl::S, F1, as_FloatRegister(i-1),  as_FloatRegister(i));
    }

    // Now put double 2 in F16, double 18 in F18
    __ ftof( FloatRegisterImpl::S, FloatRegisterImpl::D, F2,  F16 );
    __ ftof( FloatRegisterImpl::S, FloatRegisterImpl::D, F18, F18 );

    // use add to put 20..32 in F20..F32
    for (i = 20;  i < 32;  i += 2) {
      __ fadd( FloatRegisterImpl::D, F16, as_FloatRegister(i-2),  as_FloatRegister(i));
    }

    // put 0..7 in i's, 8..15 in l's, 16..23 in o's, 24..31 in g's
    for ( i = 0;  i < 8;  ++i ) {
      if (i < 6) {
        __ set(     i, as_iRegister(i));
        __ set(16 + i, as_oRegister(i));
        __ set(24 + i, as_gRegister(i));
      }
      __ set( 8 + i, as_lRegister(i));
    }

    __ stop("testing stop");


    __ ret();
    __ delayed()->restore();

    return start;
  }


  address generate_stop_subroutine() {
    StubCodeMark mark(this, "StubRoutines", "stop_subroutine");
    address start = __ pc();

    __ stop_subroutine();

    return start;
  }

  address generate_flush_callers_register_windows() {
    StubCodeMark mark(this, "StubRoutines", "flush_callers_register_windows");
    address start = __ pc();

    __ flush_windows();
    __ retl(false);
    __ delayed()->add( FP, STACK_BIAS, O0 );
    // The returned value must be a stack pointer whose register save area
    // is flushed, and will stay flushed while the caller executes.

    return start;
  }

  // Helper functions for v8 atomic operations.
  //
  void get_v8_oop_lock_ptr(Register lock_ptr_reg, Register mark_oop_reg, Register scratch_reg) {
    if (mark_oop_reg == noreg) {
      address lock_ptr = (address)StubRoutines::Sparc::atomic_memory_operation_lock_addr();
      __ set((intptr_t)lock_ptr, lock_ptr_reg);
    } else {
      assert(scratch_reg != noreg, "just checking");
      address lock_ptr = (address)StubRoutines::Sparc::_v8_oop_lock_cache;
      __ set((intptr_t)lock_ptr, lock_ptr_reg);
      __ and3(mark_oop_reg, StubRoutines::Sparc::v8_oop_lock_mask_in_place, scratch_reg);
      __ add(lock_ptr_reg, scratch_reg, lock_ptr_reg);
    }
  }

  void generate_v8_lock_prologue(Register lock_reg, Register lock_ptr_reg, Register yield_reg, Label& retry, Label& dontyield, Register mark_oop_reg = noreg, Register scratch_reg = noreg) {

    get_v8_oop_lock_ptr(lock_ptr_reg, mark_oop_reg, scratch_reg);
    __ set(StubRoutines::Sparc::locked, lock_reg);
    // Initialize yield counter
    __ mov(G0, yield_reg);

    __ BIND(retry);
    __ cmp_and_br_short(yield_reg, V8AtomicOperationUnderLockSpinCount, Assembler::less, Assembler::pt, dontyield);

    // This code can only be called from inside the VM; this
    // stub is only invoked from Atomic::add().  We do not
    // want to use call_VM, because _last_java_sp and such
    // must already be set.
    //
    // Save the regs and make space for a C call
    __ save(SP, -96, SP);
    __ save_all_globals_into_locals();
    BLOCK_COMMENT("call os::naked_sleep");
    __ call(CAST_FROM_FN_PTR(address, os::naked_sleep));
    __ delayed()->nop();
    __ restore_globals_from_locals();
    __ restore();
    // reset the counter
    __ mov(G0, yield_reg);

    __ BIND(dontyield);

    // try to get lock
    __ swap(lock_ptr_reg, 0, lock_reg);

    // did we get the lock?
    __ cmp(lock_reg, StubRoutines::Sparc::unlocked);
    __ br(Assembler::notEqual, true, Assembler::pn, retry);
    __ delayed()->add(yield_reg, 1, yield_reg);

    // yes, got lock.  do the operation here.
  }

  void generate_v8_lock_epilogue(Register lock_reg, Register lock_ptr_reg, Register yield_reg, Label& retry, Label& dontyield, Register mark_oop_reg = noreg, Register scratch_reg = noreg) {
    __ st(lock_reg, lock_ptr_reg, 0);  // unlock
  }

  // Support for jint Atomic::xchg(jint exchange_value, volatile jint* dest).
  //
  // Arguments :
  //
  //      exchange_value: O0
  //      dest:           O1
  //
  // Results:
  //
  //     O0: the value previously stored in dest
  //
  address generate_atomic_xchg() {
    StubCodeMark mark(this, "StubRoutines", "atomic_xchg");
    address start = __ pc();

    if (UseCASForSwap) {
      // Use CAS instead of swap, just in case the MP hardware
      // prefers to work with just one kind of synch. instruction.
      Label retry;
      __ BIND(retry);
      __ mov(O0, O3);       // scratch copy of exchange value
      __ ld(O1, 0, O2);     // observe the previous value
      // try to replace O2 with O3
      __ cas_under_lock(O1, O2, O3,
          (address)StubRoutines::Sparc::atomic_memory_operation_lock_addr(), false);
      __ cmp_and_br_short(O2, O3, Assembler::notEqual, Assembler::pn, retry);

      __ retl(false);
      __ delayed()->mov(O2, O0);  // report previous value to caller

    } else {
      if (VM_Version::v9_instructions_work()) {
        __ retl(false);
        __ delayed()->swap(O1, 0, O0);
      } else {
        const Register& lock_reg     = O2;
        const Register& lock_ptr_reg = O3;
        const Register& yield_reg    = O4;

        Label retry;
        Label dontyield;

        generate_v8_lock_prologue(lock_reg, lock_ptr_reg, yield_reg, retry, dontyield);
        // got the lock, do the swap
        __ swap(O1, 0, O0);

        generate_v8_lock_epilogue(lock_reg, lock_ptr_reg, yield_reg, retry, dontyield);
        __ retl(false);
        __ delayed()->nop();
      }
    }

    return start;
  }


  // Support for jint Atomic::cmpxchg(jint exchange_value, volatile jint* dest, jint compare_value)
  //
  // Arguments :
  //
  //      exchange_value: O0
  //      dest:           O1
  //      compare_value:  O2
  //
  // Results:
  //
  //     O0: the value previously stored in dest
  //
  // Overwrites (v8): O3,O4,O5
  //
  address generate_atomic_cmpxchg() {
    StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg");
    address start = __ pc();

    // cmpxchg(dest, compare_value, exchange_value)
    __ cas_under_lock(O1, O2, O0,
        (address)StubRoutines::Sparc::atomic_memory_operation_lock_addr(), false);
    __ retl(false);
    __ delayed()->nop();

    return start;
  }

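  // Semantics of the stub above, as C pseudocode (a descriptive sketch only;
  // the real callers are the Atomic:: wrappers in the runtime):
  //
  //   jint atomic_cmpxchg(jint exchange_value, volatile jint* dest,
  //                       jint compare_value) {
  //     jint old = *dest;                   // performed atomically
  //     if (old == compare_value) *dest = exchange_value;
  //     return old;                         // caller compares to compare_value
  //   }
  //
  // On V9 hardware cas_under_lock can use a single compare-and-swap
  // instruction; on V8 it falls back to the spin lock built by
  // generate_v8_lock_prologue() above.
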
  // Support for jlong Atomic::cmpxchg(jlong exchange_value, volatile jlong *dest, jlong compare_value)
  //
  // Arguments :
  //
  //      exchange_value: O1:O0
  //      dest:           O2
  //      compare_value:  O4:O3
  //
  // Results:
  //
  //     O1:O0: the value previously stored in dest
  //
  // This only works on V9; on V8 we don't generate any
  // code and just return NULL.
  //
  // Overwrites: G1,G2,G3
  //
  address generate_atomic_cmpxchg_long() {
    StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_long");
    address start = __ pc();

    if (!VM_Version::supports_cx8())
      return NULL;
    __ sllx(O0, 32, O0);        // high word of exchange_value into bits 63..32
    __ srl(O1, 0, O1);          // zero-extend its low word
    __ or3(O0, O1, O0);         // O0 holds 64-bit value from exchange_value
    __ sllx(O3, 32, O3);
    __ srl(O4, 0, O4);
    __ or3(O3, O4, O3);         // O3 holds 64-bit value from compare_value
    __ casx(O2, O3, O0);        // compare *dest with O3; if equal swap in O0
    __ srl(O0, 0, O1);          // unpacked return value in O1:O0
    __ retl(false);
    __ delayed()->srlx(O0, 32, O0);

    return start;
  }


  // Support for jint Atomic::add(jint add_value, volatile jint* dest).
  //
  // Arguments :
  //
  //      add_value: O0   (e.g., +1 or -1)
  //      dest:      O1
  //
  // Results:
  //
  //     O0: the new value stored in dest
  //
  // Overwrites (v9): O3
  // Overwrites (v8): O3,O4,O5
  //
  address generate_atomic_add() {
    StubCodeMark mark(this, "StubRoutines", "atomic_add");
    address start = __ pc();
    __ BIND(_atomic_add_stub);

    if (VM_Version::v9_instructions_work()) {
      Label retry;
      __ BIND(retry);

      __ lduw(O1, 0, O2);
      __ add(O0, O2, O3);
      __ cas(O1, O2, O3);
      __ cmp_and_br_short(O2, O3, Assembler::notEqual, Assembler::pn, retry);
      __ retl(false);
      __ delayed()->add(O0, O2, O0); // note that cas made O2==O3
    } else {
      const Register& lock_reg     = O2;
      const Register& lock_ptr_reg = O3;
      const Register& value_reg    = O4;
      const Register& yield_reg    = O5;

      Label retry;
      Label dontyield;

      generate_v8_lock_prologue(lock_reg, lock_ptr_reg, yield_reg, retry, dontyield);
      // got lock, do the increment
      __ ld(O1, 0, value_reg);
      __ add(O0, value_reg, value_reg);
      __ st(value_reg, O1, 0);

      // %%% only for RMO and PSO
      __ membar(Assembler::StoreStore);

      generate_v8_lock_epilogue(lock_reg, lock_ptr_reg, yield_reg, retry, dontyield);

      __ retl(false);
      __ delayed()->mov(value_reg, O0);
    }

    return start;
  }
  Label _atomic_add_stub;  // called from other stubs


  //------------------------------------------------------------------------------------------------------------------------
  // The following routine generates a subroutine to throw an asynchronous
  // UnknownError when an unsafe access gets a fault that could not be
  // reasonably prevented by the programmer.  (Example: SIGBUS/OBJERR.)
  //
  // Arguments :
  //
  //      trapping PC:    O7
  //
  // Results:
  //     posts an asynchronous exception, skips the trapping instruction
  //

  address generate_handler_for_unsafe_access() {
    StubCodeMark mark(this, "StubRoutines", "handler_for_unsafe_access");
    address start = __ pc();

    const int preserve_register_words = (64 * 2);
    Address preserve_addr(FP, (-preserve_register_words * wordSize) + STACK_BIAS);

    Register Lthread = L7_thread_cache;
    int i;

    __ save_frame(0);
    __ mov(G1, L1);
    __ mov(G2, L2);
    __ mov(G3, L3);
    __ mov(G4, L4);
    __ mov(G5, L5);
    for (i = 0; i < (VM_Version::v9_instructions_work() ? 64 : 32); i += 2) {
      __ stf(FloatRegisterImpl::D, as_FloatRegister(i), preserve_addr, i * wordSize);
    }

    address entry_point = CAST_FROM_FN_PTR(address, handle_unsafe_access);
    BLOCK_COMMENT("call handle_unsafe_access");
    __ call(entry_point, relocInfo::runtime_call_type);
    __ delayed()->nop();

    __ mov(L1, G1);
    __ mov(L2, G2);
    __ mov(L3, G3);
    __ mov(L4, G4);
    __ mov(L5, G5);
    for (i = 0; i < (VM_Version::v9_instructions_work() ? 64 : 32); i += 2) {
      __ ldf(FloatRegisterImpl::D, preserve_addr, as_FloatRegister(i), i * wordSize);
    }

    __ verify_thread();

    __ jmp(O0, 0);
    __ delayed()->restore();

    return start;
  }


  // Support for uint StubRoutine::Sparc::partial_subtype_check( Klass sub, Klass super );
  // Arguments :
  //
  //      ret  : O0, returned
  //      icc/xcc: set as O0 (depending on wordSize)
  //      sub  : O1, argument, not changed
  //      super: O2, argument, not changed
  //      raddr: O7, blown by call
  address generate_partial_subtype_check() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "partial_subtype_check");
    address start = __ pc();
    Label miss;

#if defined(COMPILER2) && !defined(_LP64)
    // Do not use a 'save' because it blows the 64-bit O registers.
    __ add(SP,-4*wordSize,SP);  // Make space for 4 temps (stack must be 2 words aligned)
    __ st_ptr(L0,SP,(frame::register_save_words+0)*wordSize);
    __ st_ptr(L1,SP,(frame::register_save_words+1)*wordSize);
    __ st_ptr(L2,SP,(frame::register_save_words+2)*wordSize);
    __ st_ptr(L3,SP,(frame::register_save_words+3)*wordSize);
    Register Rret   = O0;
    Register Rsub   = O1;
    Register Rsuper = O2;
#else
    __ save_frame(0);
    Register Rret   = I0;
    Register Rsub   = I1;
    Register Rsuper = I2;
#endif

    Register L0_ary_len = L0;
    Register L1_ary_ptr = L1;
    Register L2_super   = L2;
    Register L3_index   = L3;

    __ check_klass_subtype_slow_path(Rsub, Rsuper,
                                     L0, L1, L2, L3,
                                     NULL, &miss);

    // Match falls through here.
    __ addcc(G0,0,Rret);        // set Z flags, Z result

#if defined(COMPILER2) && !defined(_LP64)
    __ ld_ptr(SP,(frame::register_save_words+0)*wordSize,L0);
    __ ld_ptr(SP,(frame::register_save_words+1)*wordSize,L1);
    __ ld_ptr(SP,(frame::register_save_words+2)*wordSize,L2);
    __ ld_ptr(SP,(frame::register_save_words+3)*wordSize,L3);
    __ retl();                  // Result in Rret is zero; flags set to Z
    __ delayed()->add(SP,4*wordSize,SP);
#else
    __ ret();                   // Result in Rret is zero; flags set to Z
    __ delayed()->restore();
#endif

    __ BIND(miss);
    __ addcc(G0,1,Rret);        // set NZ flags, NZ result

#if defined(COMPILER2) && !defined(_LP64)
    __ ld_ptr(SP,(frame::register_save_words+0)*wordSize,L0);
    __ ld_ptr(SP,(frame::register_save_words+1)*wordSize,L1);
    __ ld_ptr(SP,(frame::register_save_words+2)*wordSize,L2);
    __ ld_ptr(SP,(frame::register_save_words+3)*wordSize,L3);
    __ retl();                  // Result in Rret is != 0; flags set to NZ
    __ delayed()->add(SP,4*wordSize,SP);
#else
    __ ret();                   // Result in Rret is != 0; flags set to NZ
    __ delayed()->restore();
#endif

    return start;
  }


  // Called from MacroAssembler::verify_oop
  //
  address generate_verify_oop_subroutine() {
    StubCodeMark mark(this, "StubRoutines", "verify_oop_stub");

    address start = __ pc();

    __ verify_oop_subroutine();

    return start;
  }


  //
  // Verify that a register contains a clean 32-bit positive value
  // (high 32 bits are 0) so it could be used in 64-bit shifts (sllx, srax).
  //
  //  Input:
  //    Rint  -  32-bit value
  //    Rtmp  -  scratch
  //
  void assert_clean_int(Register Rint, Register Rtmp) {
#if defined(ASSERT) && defined(_LP64)
    __ signx(Rint, Rtmp);
    __ cmp(Rint, Rtmp);
    __ breakpoint_trap(Assembler::notEqual, Assembler::xcc);
#endif
  }

  //
  // Generate overlap test for array copy stubs
  //
  //  Input:
  //    O0    -  array1
  //    O1    -  array2
  //    O2    -  element count
  //
  //  Kills temps:  O3, O4
  //
  void array_overlap_test(address no_overlap_target, int log2_elem_size) {
    assert(no_overlap_target != NULL, "must be generated");
    array_overlap_test(no_overlap_target, NULL, log2_elem_size);
  }
  void array_overlap_test(Label& L_no_overlap, int log2_elem_size) {
    array_overlap_test(NULL, &L_no_overlap, log2_elem_size);
  }
  void array_overlap_test(address no_overlap_target, Label* NOLp, int log2_elem_size) {
    const Register from       = O0;
    const Register to         = O1;
    const Register count      = O2;
    const Register to_from    = O3; // to - from
    const Register byte_count = O4; // count << log2_elem_size

    __ subcc(to, from, to_from);
    __ sll_ptr(count, log2_elem_size, byte_count);
    if (NOLp == NULL)
      __ brx(Assembler::lessEqualUnsigned, false, Assembler::pt, no_overlap_target);
    else
      __ brx(Assembler::lessEqualUnsigned, false, Assembler::pt, (*NOLp));
    __ delayed()->cmp(to_from, byte_count);
    if (NOLp == NULL)
      __ brx(Assembler::greaterEqualUnsigned, false, Assembler::pt, no_overlap_target);
    else
      __ brx(Assembler::greaterEqualUnsigned, false, Assembler::pt, (*NOLp));
    __ delayed()->nop();
  }
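
  // A note on the overlap test above (added commentary, not in the original
  // comments): with to_from = to - from, the two unsigned branches encode
  //
  //   to <= from                 -- forward copy cannot clobber unread source
  //   to - from >= byte_count    -- regions are disjoint (to >= from + size)
  //
  // Only when  from < to < from + byte_count  does control fall through to
  // the backward (conjoint) copy code.  For example, copying 8 ints from
  // address 0x100 to 0x108 gives to_from = 8 < byte_count = 32, so the
  // overlapping tail must be copied from the end.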

  //
  //  Generate pre-write barrier for array.
  //
  //  Input:
  //     addr     - register containing starting address
  //     count    - register containing element count
  //     tmp      - scratch register
  //
  //  The input registers are overwritten.
  //
  void gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
      case BarrierSet::G1SATBCT:
      case BarrierSet::G1SATBCTLogging:
        // With G1, don't generate the call if we statically know that the target is uninitialized
        if (!dest_uninitialized) {
          __ save_frame(0);
          // Save the necessary global regs... will be used after.
          if (addr->is_global()) {
            __ mov(addr, L0);
          }
          if (count->is_global()) {
            __ mov(count, L1);
          }
          __ mov(addr->after_save(), O0);
          // Get the count into O1
          __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre));
          __ delayed()->mov(count->after_save(), O1);
          if (addr->is_global()) {
            __ mov(L0, addr);
          }
          if (count->is_global()) {
            __ mov(L1, count);
          }
          __ restore();
        }
        break;
      case BarrierSet::CardTableModRef:
      case BarrierSet::CardTableExtension:
      case BarrierSet::ModRef:
        break;
      default:
        ShouldNotReachHere();
    }
  }
  //
  //  Generate post-write barrier for array.
  //
  //  Input:
  //     addr     - register containing starting address
  //     count    - register containing element count
  //     tmp      - scratch register
  //
  //  The input registers are overwritten.
  //
  void gen_write_ref_array_post_barrier(Register addr, Register count,
                                        Register tmp) {
    BarrierSet* bs = Universe::heap()->barrier_set();

    switch (bs->kind()) {
      case BarrierSet::G1SATBCT:
      case BarrierSet::G1SATBCTLogging:
        {
          // Get some new fresh output registers.
          __ save_frame(0);
          __ mov(addr->after_save(), O0);
          __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post));
          __ delayed()->mov(count->after_save(), O1);
          __ restore();
        }
        break;
      case BarrierSet::CardTableModRef:
      case BarrierSet::CardTableExtension:
        {
          CardTableModRefBS* ct = (CardTableModRefBS*)bs;
          assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
          assert_different_registers(addr, count, tmp);

          Label L_loop;

          __ sll_ptr(count, LogBytesPerHeapOop, count);
          __ sub(count, BytesPerHeapOop, count);
          __ add(count, addr, count);
          // Use two shifts to clear out those low order two bits! (Cannot opt. into 1.)
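          // (Added commentary:) at this point 'addr' is the address of the
          // first oop stored and 'count' the address of the last one.  The
          // two shifts below turn both addresses into card indices, so
          // count - addr becomes the number of cards to dirty minus one, and
          // the loop stores a zero byte ("dirty") into each entry of
          // byte_map_base[first_card .. last_card].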
          __ srl_ptr(addr, CardTableModRefBS::card_shift, addr);
          __ srl_ptr(count, CardTableModRefBS::card_shift, count);
          __ sub(count, addr, count);
          AddressLiteral rs(ct->byte_map_base);
          __ set(rs, tmp);
          __ BIND(L_loop);
          __ stb(G0, tmp, addr);
          __ subcc(count, 1, count);
          __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
          __ delayed()->add(addr, 1, addr);
        }
        break;
      case BarrierSet::ModRef:
        break;
      default:
        ShouldNotReachHere();
    }
  }

  //
  // Generate main code for disjoint arraycopy
  //
  typedef void (StubGenerator::*CopyLoopFunc)(Register from, Register to, Register count, int count_dec,
                                              Label& L_loop, bool use_prefetch, bool use_bis);

  void disjoint_copy_core(Register from, Register to, Register count, int log2_elem_size,
                          int iter_size, CopyLoopFunc copy_loop_func) {
    Label L_copy;

    assert(log2_elem_size <= 3, "the following code should be changed");
    int count_dec = 16>>log2_elem_size;

    int prefetch_dist = MAX2(ArraycopySrcPrefetchDistance, ArraycopyDstPrefetchDistance);
    assert(prefetch_dist < 4096, "invalid value");
    prefetch_dist = (prefetch_dist + (iter_size-1)) & (-iter_size); // round up to one iteration copy size
    int prefetch_count = (prefetch_dist >> log2_elem_size); // elements count

    if (UseBlockCopy) {
      Label L_block_copy, L_block_copy_prefetch, L_skip_block_copy;

      // 64 bytes tail + bytes copied in one loop iteration
      int tail_size = 64 + iter_size;
      int block_copy_count = (MAX2(tail_size, (int)BlockCopyLowLimit)) >> log2_elem_size;
      // Use BIS copy only for big arrays since it requires membar.
      __ set(block_copy_count, O4);
      __ cmp_and_br_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_skip_block_copy);
      // This code is for disjoint source and destination:
      //   to <= from || to >= from+count
      // but BIS will stomp over 'from' if (to > from-tail_size && to <= from)
      __ sub(from, to, O4);
      __ srax(O4, 4, O4); // divide by 16 since the following short branch has only 5 bits for imm.
      __ cmp_and_br_short(O4, (tail_size>>4), Assembler::lessEqualUnsigned, Assembler::pn, L_skip_block_copy);

      __ wrasi(G0, Assembler::ASI_ST_BLKINIT_PRIMARY);
      // BIS should not be used to copy tail (64 bytes+iter_size)
      // to avoid zeroing of following values.
      __ sub(count, (tail_size>>log2_elem_size), count); // count is still positive >= 0

      if (prefetch_count > 0) { // rounded up to one iteration count
        // Do prefetching only if copy size is bigger
        // than prefetch distance.
        __ set(prefetch_count, O4);
        __ cmp_and_brx_short(count, O4, Assembler::less, Assembler::pt, L_block_copy);
        __ sub(count, prefetch_count, count);

        (this->*copy_loop_func)(from, to, count, count_dec, L_block_copy_prefetch, true, true);
        __ add(count, prefetch_count, count); // restore count

      } // prefetch_count > 0

      (this->*copy_loop_func)(from, to, count, count_dec, L_block_copy, false, true);
      __ add(count, (tail_size>>log2_elem_size), count); // restore count

      __ wrasi(G0, Assembler::ASI_PRIMARY_NOFAULT);
      // BIS needs membar.
      __ membar(Assembler::StoreLoad);
      // Copy tail
      __ ba_short(L_copy);

      __ BIND(L_skip_block_copy);
    } // UseBlockCopy

    if (prefetch_count > 0) { // rounded up to one iteration count
      // Do prefetching only if copy size is bigger
      // than prefetch distance.
      __ set(prefetch_count, O4);
      __ cmp_and_brx_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_copy);
      __ sub(count, prefetch_count, count);

      Label L_copy_prefetch;
      (this->*copy_loop_func)(from, to, count, count_dec, L_copy_prefetch, true, false);
      __ add(count, prefetch_count, count); // restore count

    } // prefetch_count > 0

    (this->*copy_loop_func)(from, to, count, count_dec, L_copy, false, false);
  }



  //
  // Helper methods for copy_16_bytes_forward_with_shift()
  //
  void copy_16_bytes_shift_loop(Register from, Register to, Register count, int count_dec,
                                Label& L_loop, bool use_prefetch, bool use_bis) {

    const Register left_shift  = G1; // left  shift bit counter
    const Register right_shift = G5; // right shift bit counter

    __ align(OptoLoopAlignment);
    __ BIND(L_loop);
    if (use_prefetch) {
      if (ArraycopySrcPrefetchDistance > 0) {
        __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads);
      }
      if (ArraycopyDstPrefetchDistance > 0) {
        __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads);
      }
    }
    __ ldx(from, 0, O4);
    __ ldx(from, 8, G4);
    __ inc(to, 16);
    __ inc(from, 16);
    __ deccc(count, count_dec); // Can we do next iteration after this one?
    __ srlx(O4, right_shift, G3);
    __ bset(G3, O3);
    __ sllx(O4, left_shift,  O4);
    __ srlx(G4, right_shift, G3);
    __ bset(G3, O4);
    if (use_bis) {
      __ stxa(O3, to, -16);
      __ stxa(O4, to, -8);
    } else {
      __ stx(O3, to, -16);
      __ stx(O4, to, -8);
    }
    __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
    __ delayed()->sllx(G4, left_shift,  O3);
  }

  // Copy big chunks forward with shift
  //
  // Inputs:
  //   from      - source arrays
  //   to        - destination array aligned to 8-bytes
  //   count     - elements count to copy >= the count equivalent to 16 bytes
  //   count_dec - elements count's decrement equivalent to 16 bytes
  //   L_copy_bytes - copy exit label
  //
  void copy_16_bytes_forward_with_shift(Register from, Register to,
                     Register count, int log2_elem_size, Label& L_copy_bytes) {
    Label L_aligned_copy, L_copy_last_bytes;
    assert(log2_elem_size <= 3, "the following code should be changed");
    int count_dec = 16>>log2_elem_size;

    // if both arrays have the same alignment mod 8, do 8 bytes aligned copy
    __ andcc(from, 7, G1); // misaligned bytes
    __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
    __ delayed()->nop();

    const Register left_shift  = G1; // left  shift bit counter
    const Register right_shift = G5; // right shift bit counter

    __ sll(G1, LogBitsPerByte, left_shift);
    __ mov(64, right_shift);
    __ sub(right_shift, left_shift, right_shift);

    //
    // Load 2 aligned 8-bytes chunks and use one from previous iteration
    // to form 2 aligned 8-bytes chunks to store.
    //
    __ dec(count, count_dec);   // Pre-decrement 'count'
    __ andn(from, 7, from);     // Align address
    __ ldx(from, 0, O3);
    __ inc(from, 8);
    __ sllx(O3, left_shift,  O3);

    disjoint_copy_core(from, to, count, log2_elem_size, 16, &StubGenerator::copy_16_bytes_shift_loop);

    __ inccc(count, count_dec>>1 ); // + 8 bytes
    __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
    __ delayed()->inc(count, count_dec>>1); // restore 'count'

    // copy 8 bytes, part of them already loaded in O3
    __ ldx(from, 0, O4);
    __ inc(to, 8);
    __ inc(from, 8);
    __ srlx(O4, right_shift, G3);
    __ bset(O3, G3);
    __ stx(G3, to, -8);

    __ BIND(L_copy_last_bytes);
    __ srl(right_shift, LogBitsPerByte, right_shift); // misaligned bytes
    __ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
    __ delayed()->sub(from, right_shift, from);       // restore address

    __ BIND(L_aligned_copy);
  }

  // Copy big chunks backward with shift
  //
  // Inputs:
  //   end_from  - source arrays end address
  //   end_to    - destination array end address aligned to 8-bytes
  //   count     - elements count to copy >= the count equivalent to 16 bytes
  //   count_dec - elements count's decrement equivalent to 16 bytes
  //   L_aligned_copy - aligned copy exit label
  //   L_copy_bytes   - copy exit label
  //
  void copy_16_bytes_backward_with_shift(Register end_from, Register end_to,
                     Register count, int count_dec,
                     Label& L_aligned_copy, Label& L_copy_bytes) {
    Label L_loop, L_copy_last_bytes;

    // if both arrays have the same alignment mod 8, do 8 bytes aligned copy
    __ andcc(end_from, 7, G1); // misaligned bytes
    __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
    __ delayed()->deccc(count, count_dec); // Pre-decrement 'count'

    const Register left_shift  = G1; // left  shift bit counter
    const Register right_shift = G5; // right shift bit counter

    __ sll(G1, LogBitsPerByte, left_shift);
    __ mov(64, right_shift);
    __ sub(right_shift, left_shift, right_shift);

    //
    // Load 2 aligned 8-bytes chunks and use one from previous iteration
    // to form 2 aligned 8-bytes chunks to store.
    //
    __ andn(end_from, 7, end_from);     // Align address
    __ ldx(end_from, 0, O3);
    __ align(OptoLoopAlignment);
    __ BIND(L_loop);
    __ ldx(end_from, -8, O4);
    __ deccc(count, count_dec); // Can we do next iteration after this one?
    __ ldx(end_from, -16, G4);
    __ dec(end_to, 16);
    __ dec(end_from, 16);
    __ srlx(O3, right_shift, O3);
    __ sllx(O4, left_shift,  G3);
    __ bset(G3, O3);
    __ stx(O3, end_to, 8);
    __ srlx(O4, right_shift, O4);
    __ sllx(G4, left_shift,  G3);
    __ bset(G3, O4);
    __ stx(O4, end_to, 0);
    __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
    __ delayed()->mov(G4, O3);

    __ inccc(count, count_dec>>1 ); // + 8 bytes
    __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
    __ delayed()->inc(count, count_dec>>1); // restore 'count'

    // copy 8 bytes, part of them already loaded in O3
    __ ldx(end_from, -8, O4);
    __ dec(end_to, 8);
    __ dec(end_from, 8);
    __ srlx(O3, right_shift, O3);
    __ sllx(O4, left_shift, G3);
    __ bset(O3, G3);
    __ stx(G3, end_to, 0);

    __ BIND(L_copy_last_bytes);
    __ srl(left_shift, LogBitsPerByte, left_shift); // misaligned bytes
    __ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
    __ delayed()->add(end_from, left_shift, end_from); // restore address
  }

  //
  //  Generate stub for disjoint byte copy.  If "aligned" is true, the
  //  "from" and "to" addresses are assumed to be heapword aligned.
  //
  // Arguments for generated stub:
  //      from:  O0
  //      to:    O1
  //      count: O2 treated as signed
  //
  address generate_disjoint_byte_copy(bool aligned, address *entry, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_skip_alignment, L_align;
    Label L_copy_byte, L_copy_byte_loop, L_exit;

    const Register from      = O0;   // source array address
    const Register to        = O1;   // destination array address
    const Register count     = O2;   // elements count
    const Register offset    = O5;   // offset from start of arrays
    // O3, O4, G3, G4 are used as temp registers

    assert_clean_int(count, O3);     // Make sure 'count' is clean int.

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // for short arrays, just do single element copy
    __ cmp(count, 23); // 16 + 7
    __ brx(Assembler::less, false, Assembler::pn, L_copy_byte);
    __ delayed()->mov(G0, offset);

    if (aligned) {
      // 'aligned' == true when it is known statically during compilation
      // of this arraycopy call site that both 'from' and 'to' addresses
      // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
      //
      // Aligned arrays have 4 bytes alignment in 32-bit VMs
      // and 8 bytes in 64-bit VMs.
      // So we do it only for the 32-bit VM.
      //
#ifndef _LP64
      // copy a 4-bytes word if necessary to align 'to' to 8 bytes
      __ andcc(to, 7, G0);
      __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment);
      __ delayed()->ld(from, 0, O3);
      __ inc(from, 4);
      __ inc(to, 4);
      __ dec(count, 4);
      __ st(O3, to, -4);
      __ BIND(L_skip_alignment);
#endif
    } else {
      // copy bytes to align 'to' on 8 byte boundary
      __ andcc(to, 7, G1); // misaligned bytes
      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
      __ delayed()->neg(G1);
      __ inc(G1, 8);       // bytes need to copy to next 8-bytes alignment
      __ sub(count, G1, count);
      __ BIND(L_align);
      __ ldub(from, 0, O3);
      __ deccc(G1);
      __ inc(from);
      __ stb(O3, to, 0);
      __ br(Assembler::notZero, false, Assembler::pt, L_align);
      __ delayed()->inc(to);
      __ BIND(L_skip_alignment);
    }
#ifdef _LP64
    if (!aligned)
#endif
    {
      // Copy with shift 16 bytes per iteration if arrays do not have
      // the same alignment mod 8, otherwise fall through to the next
      // code for aligned copy.
      // The compare above (count >= 23) guarantees 'count' >= 16 bytes.
      // Also jump over aligned copy after the copy with shift completed.

      copy_16_bytes_forward_with_shift(from, to, count, 0, L_copy_byte);
    }

    // Both arrays are 8 bytes aligned, copy 16 bytes at a time
    __ and3(count, 7, G4); // Save count
    __ srl(count, 3, count);
    generate_disjoint_long_copy_core(aligned);
    __ mov(G4, count);     // Restore count

    // copy trailing bytes
    __ BIND(L_copy_byte);
    __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
    __ align(OptoLoopAlignment);
    __ BIND(L_copy_byte_loop);
    __ ldub(from, offset, O3);
    __ deccc(count);
    __ stb(O3, to, offset);
    __ brx(Assembler::notZero, false, Assembler::pt, L_copy_byte_loop);
    __ delayed()->inc(offset);

    __ BIND(L_exit);
    // O3, O4 are used as temp registers
    inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr, O3, O4);
    __ retl();
    __ delayed()->mov(G0, O0); // return 0
    return start;
  }

  //
  //  Generate stub for conjoint byte copy.  If "aligned" is true, the
  //  "from" and "to" addresses are assumed to be heapword aligned.
  //
  // Arguments for generated stub:
  //      from:  O0
  //      to:    O1
  //      count: O2 treated as signed
  //
  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                      address *entry, const char *name) {
    // Do reverse copy.

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_skip_alignment, L_align, L_aligned_copy;
    Label L_copy_byte, L_copy_byte_loop, L_exit;

    const Register from      = O0;   // source array address
    const Register to        = O1;   // destination array address
    const Register count     = O2;   // elements count
    const Register end_from  = from; // source array end address
    const Register end_to    = to;   // destination array end address

    assert_clean_int(count, O3);     // Make sure 'count' is clean int.
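
    // (Added note:) a conjoint copy must handle overlapping regions.  When
    // 'to' lies inside [from, from + count), a forward loop would overwrite
    // source bytes before reading them -- e.g. copying bytes 0..7 to 1..8
    // forward would propagate byte 0 through the whole range.  The stub
    // therefore tests for overlap below and, in the overlapping case,
    // copies from the last element down to the first.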

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    array_overlap_test(nooverlap_target, 0);

    __ add(to, count, end_to);       // offset after last copied element

    // for short arrays, just do single element copy
    __ cmp(count, 23); // 16 + 7
    __ brx(Assembler::less, false, Assembler::pn, L_copy_byte);
    __ delayed()->add(from, count, end_from);

    {
      // Align end of arrays since they could be not aligned even
      // when the arrays themselves are aligned.

      // copy bytes to align 'end_to' on 8 byte boundary
      __ andcc(end_to, 7, G1); // misaligned bytes
      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
      __ delayed()->nop();
      __ sub(count, G1, count);
      __ BIND(L_align);
      __ dec(end_from);
      __ dec(end_to);
      __ ldub(end_from, 0, O3);
      __ deccc(G1);
      __ brx(Assembler::notZero, false, Assembler::pt, L_align);
      __ delayed()->stb(O3, end_to, 0);
      __ BIND(L_skip_alignment);
    }
#ifdef _LP64
    if (aligned) {
      // Both arrays are aligned to 8-bytes in 64-bit VMs.
      // The 'count' is decremented in copy_16_bytes_backward_with_shift()
      // only in the unaligned case.
      __ dec(count, 16);
    } else
#endif
    {
      // Copy with shift 16 bytes per iteration if arrays do not have
      // the same alignment mod 8, otherwise jump to the next
      // code for aligned copy (and subtracting 16 from 'count' before jump).
      // The compare above (count >= 23) guarantees 'count' >= 16 bytes.
      // Also jump over aligned copy after the copy with shift completed.

      copy_16_bytes_backward_with_shift(end_from, end_to, count, 16,
                                        L_aligned_copy, L_copy_byte);
    }
    // copy 16 elements (16 bytes) at a time
    __ align(OptoLoopAlignment);
    __ BIND(L_aligned_copy);
    __ dec(end_from, 16);
    __ ldx(end_from, 8, O3);
    __ ldx(end_from, 0, O4);
    __ dec(end_to, 16);
    __ deccc(count, 16);
    __ stx(O3, end_to, 8);
    __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
    __ delayed()->stx(O4, end_to, 0);
    __ inc(count, 16);

    // copy 1 element (1 byte) at a time
    __ BIND(L_copy_byte);
    __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
    __ align(OptoLoopAlignment);
    __ BIND(L_copy_byte_loop);
    __ dec(end_from);
    __ dec(end_to);
    __ ldub(end_from, 0, O4);
    __ deccc(count);
    __ brx(Assembler::greater, false, Assembler::pt, L_copy_byte_loop);
    __ delayed()->stb(O4, end_to, 0);

    __ BIND(L_exit);
    // O3, O4 are used as temp registers
    inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr, O3, O4);
    __ retl();
    __ delayed()->mov(G0, O0); // return 0
    return start;
  }

  //
  //  Generate stub for disjoint short copy.  If "aligned" is true, the
  //  "from" and "to" addresses are assumed to be heapword aligned.
  //
  // Arguments for generated stub:
  //      from:  O0
  //      to:    O1
  //      count: O2 treated as signed
  //
  address generate_disjoint_short_copy(bool aligned, address *entry, const char * name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Label L_skip_alignment, L_skip_alignment2;
    Label L_copy_2_bytes, L_copy_2_bytes_loop, L_exit;

    const Register from      = O0;   // source array address
    const Register to        = O1;   // destination array address
    const Register count     = O2;   // elements count
    const Register offset    = O5;   // offset from start of arrays
    // O3, O4, G3, G4 are used as temp registers

    assert_clean_int(count, O3);     // Make sure 'count' is clean int.

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // for short arrays, just do single element copy
    __ cmp(count, 11); // 8 + 3  (22 bytes)
    __ brx(Assembler::less, false, Assembler::pn, L_copy_2_bytes);
    __ delayed()->mov(G0, offset);

    if (aligned) {
      // 'aligned' == true when it is known statically during compilation
      // of this arraycopy call site that both 'from' and 'to' addresses
      // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
      //
      // Aligned arrays have 4 bytes alignment in 32-bit VMs
      // and 8 bytes in 64-bit VMs.
      //
#ifndef _LP64
      // copy a 2-element word if necessary to align 'to' to 8 bytes
      __ andcc(to, 7, G0);
      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
      __ delayed()->ld(from, 0, O3);
      __ inc(from, 4);
      __ inc(to, 4);
      __ dec(count, 2);
      __ st(O3, to, -4);
      __ BIND(L_skip_alignment);
#endif
    } else {
      // copy 1 element if necessary to align 'to' on a 4-byte boundary
      __ andcc(to, 3, G0);
      __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
      __ delayed()->lduh(from, 0, O3);
      __ inc(from, 2);
      __ inc(to, 2);
      __ dec(count);
      __ sth(O3, to, -2);
      __ BIND(L_skip_alignment);

      // copy 2 elements to align 'to' on an 8 byte boundary
      __ andcc(to, 7, G0);
      __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment2);
      __ delayed()->lduh(from, 0, O3);
      __ dec(count, 2);
      __ lduh(from, 2, O4);
      __ inc(from, 4);
      __ inc(to, 4);
      __ sth(O3, to, -4);
      __ sth(O4, to, -2);
      __ BIND(L_skip_alignment2);
    }
#ifdef _LP64
    if (!aligned)
#endif
    {
      // Copy with shift 16 bytes per iteration if arrays do not have
      // the same alignment mod 8, otherwise fall through to the next
      // code for aligned copy.
      // The compare above (count >= 11) guarantees 'count' >= 16 bytes.
      // Also jump over aligned copy after the copy with shift completed.
1680 1681 copy_16_bytes_forward_with_shift(from, to, count, 1, L_copy_2_bytes); 1682 } 1683 1684 // Both arrays are 8-byte aligned; copy 16 bytes at a time 1685 __ and3(count, 3, G4); // Save 1686 __ srl(count, 2, count); 1687 generate_disjoint_long_copy_core(aligned); 1688 __ mov(G4, count); // restore 1689 1690 // copy 1 element at a time 1691 __ BIND(L_copy_2_bytes); 1692 __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit); 1693 __ align(OptoLoopAlignment); 1694 __ BIND(L_copy_2_bytes_loop); 1695 __ lduh(from, offset, O3); 1696 __ deccc(count); 1697 __ sth(O3, to, offset); 1698 __ brx(Assembler::notZero, false, Assembler::pt, L_copy_2_bytes_loop); 1699 __ delayed()->inc(offset, 2); 1700 1701 __ BIND(L_exit); 1702 // O3, O4 are used as temp registers 1703 inc_counter_np(SharedRuntime::_jshort_array_copy_ctr, O3, O4); 1704 __ retl(); 1705 __ delayed()->mov(G0, O0); // return 0 1706 return start; 1707 } 1708 1709 // 1710 // Generate stub for array fill (byte, short, or int). If "aligned" is true, the 1711 // "to" address is assumed to be heapword aligned. 1712 // 1713 // Arguments for generated stub: 1714 // to: O0 1715 // value: O1 1716 // count: O2 treated as signed 1717 // 1718 address generate_fill(BasicType t, bool aligned, const char* name) { 1719 __ align(CodeEntryAlignment); 1720 StubCodeMark mark(this, "StubRoutines", name); 1721 address start = __ pc(); 1722 1723 const Register to = O0; // destination array address 1724 const Register value = O1; // fill value 1725 const Register count = O2; // elements count 1726 // O3 is used as a temp register 1727 1728 assert_clean_int(count, O3); // Make sure 'count' is clean int. 1729 1730 Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte; 1731 Label L_fill_2_bytes, L_fill_elements, L_fill_32_bytes; 1732 1733 int shift = -1; 1734 switch (t) { 1735 case T_BYTE: 1736 shift = 2; 1737 break; 1738 case T_SHORT: 1739 shift = 1; 1740 break; 1741 case T_INT: 1742 shift = 0; 1743 break; 1744 default: ShouldNotReachHere(); 1745 } 1746 1747 BLOCK_COMMENT("Entry:"); 1748 1749 if (t == T_BYTE) { 1750 // Zero extend value 1751 __ and3(value, 0xff, value); 1752 __ sllx(value, 8, O3); 1753 __ or3(value, O3, value); 1754 } 1755 if (t == T_SHORT) { 1756 // Zero extend value 1757 __ sllx(value, 48, value); 1758 __ srlx(value, 48, value); 1759 } 1760 if (t == T_BYTE || t == T_SHORT) { 1761 __ sllx(value, 16, O3); 1762 __ or3(value, O3, value); 1763 } 1764 1765 __ cmp(count, 2<<shift); // Short arrays (< 8 bytes) fill by element 1766 __ brx(Assembler::lessUnsigned, false, Assembler::pn, L_fill_elements); // use unsigned cmp 1767 __ delayed()->andcc(count, 1, G0); 1768 1769 if (!aligned && (t == T_BYTE || t == T_SHORT)) { 1770 // align destination address on a 4-byte boundary 1771 if (t == T_BYTE) { 1772 // One-byte misalignment happens only for byte arrays 1773 __ andcc(to, 1, G0); 1774 __ br(Assembler::zero, false, Assembler::pt, L_skip_align1); 1775 __ delayed()->nop(); 1776 __ stb(value, to, 0); 1777 __ inc(to, 1); 1778 __ dec(count, 1); 1779 __ BIND(L_skip_align1); 1780 } 1781 // Two-byte misalignment happens only for byte and short (char) arrays 1782 __ andcc(to, 2, G0); 1783 __ br(Assembler::zero, false, Assembler::pt, L_skip_align2); 1784 __ delayed()->nop(); 1785 __ sth(value, to, 0); 1786 __ inc(to, 2); 1787 __ dec(count, 1 << (shift - 1)); 1788 __ BIND(L_skip_align2); 1789 } 1790 #ifdef _LP64 1791 if (!aligned) { 1792 #endif 1793 // align to 8 bytes; we know we are 4-byte aligned to start 1794 __ andcc(to, 7, G0); 1795 __ br(Assembler::zero,
false, Assembler::pt, L_fill_32_bytes); 1796 __ delayed()->nop(); 1797 __ stw(value, to, 0); 1798 __ inc(to, 4); 1799 __ dec(count, 1 << shift); 1800 __ BIND(L_fill_32_bytes); 1801 #ifdef _LP64 1802 } 1803 #endif 1804 1805 if (t == T_INT) { 1806 // Zero extend value 1807 __ srl(value, 0, value); 1808 } 1809 if (t == T_BYTE || t == T_SHORT || t == T_INT) { 1810 __ sllx(value, 32, O3); 1811 __ or3(value, O3, value); 1812 } 1813 1814 Label L_check_fill_8_bytes; 1815 // Fill 32-byte chunks 1816 __ subcc(count, 8 << shift, count); 1817 __ brx(Assembler::less, false, Assembler::pt, L_check_fill_8_bytes); 1818 __ delayed()->nop(); 1819 1820 Label L_fill_32_bytes_loop, L_fill_4_bytes; 1821 __ align(16); 1822 __ BIND(L_fill_32_bytes_loop); 1823 1824 __ stx(value, to, 0); 1825 __ stx(value, to, 8); 1826 __ stx(value, to, 16); 1827 __ stx(value, to, 24); 1828 1829 __ subcc(count, 8 << shift, count); 1830 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_fill_32_bytes_loop); 1831 __ delayed()->add(to, 32, to); 1832 1833 __ BIND(L_check_fill_8_bytes); 1834 __ addcc(count, 8 << shift, count); 1835 __ brx(Assembler::zero, false, Assembler::pn, L_exit); 1836 __ delayed()->subcc(count, 1 << (shift + 1), count); 1837 __ brx(Assembler::less, false, Assembler::pn, L_fill_4_bytes); 1838 __ delayed()->andcc(count, 1<<shift, G0); 1839 1840 // 1841 // length is too short, just fill 8 bytes at a time 1842 // 1843 Label L_fill_8_bytes_loop; 1844 __ BIND(L_fill_8_bytes_loop); 1845 __ stx(value, to, 0); 1846 __ subcc(count, 1 << (shift + 1), count); 1847 __ brx(Assembler::greaterEqual, false, Assembler::pn, L_fill_8_bytes_loop); 1848 __ delayed()->add(to, 8, to); 1849 1850 // fill trailing 4 bytes 1851 __ andcc(count, 1<<shift, G0); // in delay slot of branches 1852 if (t == T_INT) { 1853 __ BIND(L_fill_elements); 1854 } 1855 __ BIND(L_fill_4_bytes); 1856 __ brx(Assembler::zero, false, Assembler::pt, L_fill_2_bytes); 1857 if (t == T_BYTE || t == T_SHORT) { 1858 __ delayed()->andcc(count, 1<<(shift-1), G0); 1859 } else { 1860 __ delayed()->nop(); 1861 } 1862 __ stw(value, to, 0); 1863 if (t == T_BYTE || t == T_SHORT) { 1864 __ inc(to, 4); 1865 // fill trailing 2 bytes 1866 __ andcc(count, 1<<(shift-1), G0); // in delay slot of branches 1867 __ BIND(L_fill_2_bytes); 1868 __ brx(Assembler::zero, false, Assembler::pt, L_fill_byte); 1869 __ delayed()->andcc(count, 1, count); 1870 __ sth(value, to, 0); 1871 if (t == T_BYTE) { 1872 __ inc(to, 2); 1873 // fill trailing byte 1874 __ andcc(count, 1, count); // in delay slot of branches 1875 __ BIND(L_fill_byte); 1876 __ brx(Assembler::zero, false, Assembler::pt, L_exit); 1877 __ delayed()->nop(); 1878 __ stb(value, to, 0); 1879 } else { 1880 __ BIND(L_fill_byte); 1881 } 1882 } else { 1883 __ BIND(L_fill_2_bytes); 1884 } 1885 __ BIND(L_exit); 1886 __ retl(); 1887 __ delayed()->nop(); 1888 1889 // Handle fills of fewer than 8 bytes. Int is handled elsewhere.
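    // The per-type element fills below follow this outline (an illustrative
    // C-like sketch only; 'count' is in elements and 'value' has already
    // been replicated across the word):
    //
    //   if (count & 1) { store one element;   advance 'to'; }
    //   if (count & 2) { store two elements;  advance 'to'; }
    //   if (count & 4) { store four elements; }               // byte fill only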
1890 if (t == T_BYTE) { 1891 __ BIND(L_fill_elements); 1892 Label L_fill_2, L_fill_4; 1893 // in delay slot __ andcc(count, 1, G0); 1894 __ brx(Assembler::zero, false, Assembler::pt, L_fill_2); 1895 __ delayed()->andcc(count, 2, G0); 1896 __ stb(value, to, 0); 1897 __ inc(to, 1); 1898 __ BIND(L_fill_2); 1899 __ brx(Assembler::zero, false, Assembler::pt, L_fill_4); 1900 __ delayed()->andcc(count, 4, G0); 1901 __ stb(value, to, 0); 1902 __ stb(value, to, 1); 1903 __ inc(to, 2); 1904 __ BIND(L_fill_4); 1905 __ brx(Assembler::zero, false, Assembler::pt, L_exit); 1906 __ delayed()->nop(); 1907 __ stb(value, to, 0); 1908 __ stb(value, to, 1); 1909 __ stb(value, to, 2); 1910 __ retl(); 1911 __ delayed()->stb(value, to, 3); 1912 } 1913 1914 if (t == T_SHORT) { 1915 Label L_fill_2; 1916 __ BIND(L_fill_elements); 1917 // in delay slot __ andcc(count, 1, G0); 1918 __ brx(Assembler::zero, false, Assembler::pt, L_fill_2); 1919 __ delayed()->andcc(count, 2, G0); 1920 __ sth(value, to, 0); 1921 __ inc(to, 2); 1922 __ BIND(L_fill_2); 1923 __ brx(Assembler::zero, false, Assembler::pt, L_exit); 1924 __ delayed()->nop(); 1925 __ sth(value, to, 0); 1926 __ retl(); 1927 __ delayed()->sth(value, to, 2); 1928 } 1929 return start; 1930 } 1931 1932 // 1933 // Generate stub for conjoint short copy. If "aligned" is true, the 1934 // "from" and "to" addresses are assumed to be heapword aligned. 1935 // 1936 // Arguments for generated stub: 1937 // from: O0 1938 // to: O1 1939 // count: O2 treated as signed 1940 // 1941 address generate_conjoint_short_copy(bool aligned, address nooverlap_target, 1942 address *entry, const char *name) { 1943 // Do reverse copy. 1944 1945 __ align(CodeEntryAlignment); 1946 StubCodeMark mark(this, "StubRoutines", name); 1947 address start = __ pc(); 1948 1949 Label L_skip_alignment, L_skip_alignment2, L_aligned_copy; 1950 Label L_copy_2_bytes, L_copy_2_bytes_loop, L_exit; 1951 1952 const Register from = O0; // source array address 1953 const Register to = O1; // destination array address 1954 const Register count = O2; // elements count 1955 const Register end_from = from; // source array end address 1956 const Register end_to = to; // destination array end address 1957 1958 const Register byte_count = O3; // byte count to copy 1959 1960 assert_clean_int(count, O3); // Make sure 'count' is clean int. 1961 1962 if (entry != NULL) { 1963 *entry = __ pc(); 1964 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1965 BLOCK_COMMENT("Entry:"); 1966 } 1967 1968 array_overlap_test(nooverlap_target, 1); 1969 1970 __ sllx(count, LogBytesPerShort, byte_count); 1971 __ add(to, byte_count, end_to); // offset after last copied element 1972 1973 // for short arrays, just do single element copy 1974 __ cmp(count, 11); // 8 + 3 (22 bytes) 1975 __ brx(Assembler::less, false, Assembler::pn, L_copy_2_bytes); 1976 __ delayed()->add(from, byte_count, end_from); 1977 1978 { 1979 // Align the ends of the arrays, since they may not be aligned even 1980 // when the arrays themselves are aligned.
1981 1982 // copy 1 element if necessary to align 'end_to' on a 4-byte boundary 1983 __ andcc(end_to, 3, G0); 1984 __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment); 1985 __ delayed()->lduh(end_from, -2, O3); 1986 __ dec(end_from, 2); 1987 __ dec(end_to, 2); 1988 __ dec(count); 1989 __ sth(O3, end_to, 0); 1990 __ BIND(L_skip_alignment); 1991 1992 // copy 2 elements to align 'end_to' on an 8-byte boundary 1993 __ andcc(end_to, 7, G0); 1994 __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment2); 1995 __ delayed()->lduh(end_from, -2, O3); 1996 __ dec(count, 2); 1997 __ lduh(end_from, -4, O4); 1998 __ dec(end_from, 4); 1999 __ dec(end_to, 4); 2000 __ sth(O3, end_to, 2); 2001 __ sth(O4, end_to, 0); 2002 __ BIND(L_skip_alignment2); 2003 } 2004 #ifdef _LP64 2005 if (aligned) { 2006 // Both arrays are aligned to 8 bytes in the 64-bit VM. 2007 // The 'count' is decremented in copy_16_bytes_backward_with_shift() 2008 // in the unaligned case. 2009 __ dec(count, 8); 2010 } else 2011 #endif 2012 { 2013 // Copy with shift 16 bytes per iteration if the arrays do not have 2014 // the same alignment mod 8; otherwise jump to the aligned-copy code 2015 // below (subtracting 8 from 'count' before the jump). 2016 // The compare above (count >= 11) guarantees 'count' >= 16 bytes. 2017 // The shifted copy also jumps over the aligned copy once it completes. 2018 2019 copy_16_bytes_backward_with_shift(end_from, end_to, count, 8, 2020 L_aligned_copy, L_copy_2_bytes); 2021 } 2022 // copy 8 elements (16 bytes) at a time 2023 __ align(OptoLoopAlignment); 2024 __ BIND(L_aligned_copy); 2025 __ dec(end_from, 16); 2026 __ ldx(end_from, 8, O3); 2027 __ ldx(end_from, 0, O4); 2028 __ dec(end_to, 16); 2029 __ deccc(count, 8); 2030 __ stx(O3, end_to, 8); 2031 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy); 2032 __ delayed()->stx(O4, end_to, 0); 2033 __ inc(count, 8); 2034 2035 // copy 1 element (2 bytes) at a time 2036 __ BIND(L_copy_2_bytes); 2037 __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit); 2038 __ BIND(L_copy_2_bytes_loop); 2039 __ dec(end_from, 2); 2040 __ dec(end_to, 2); 2041 __ lduh(end_from, 0, O4); 2042 __ deccc(count); 2043 __ brx(Assembler::greater, false, Assembler::pt, L_copy_2_bytes_loop); 2044 __ delayed()->sth(O4, end_to, 0); 2045 2046 __ BIND(L_exit); 2047 // O3, O4 are used as temp registers 2048 inc_counter_np(SharedRuntime::_jshort_array_copy_ctr, O3, O4); 2049 __ retl(); 2050 __ delayed()->mov(G0, O0); // return 0 2051 return start; 2052 } 2053 2054 // 2055 // Helper methods for generate_disjoint_int_copy_core() 2056 // 2057 void copy_16_bytes_loop(Register from, Register to, Register count, int count_dec, 2058 Label& L_loop, bool use_prefetch, bool use_bis) { 2059 2060 __ align(OptoLoopAlignment); 2061 __ BIND(L_loop); 2062 if (use_prefetch) { 2063 if (ArraycopySrcPrefetchDistance > 0) { 2064 __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads); 2065 } 2066 if (ArraycopyDstPrefetchDistance > 0) { 2067 __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads); 2068 } 2069 } 2070 __ ldx(from, 4, O4); 2071 __ ldx(from, 12, G4); 2072 __ inc(to, 16); 2073 __ inc(from, 16); 2074 __ deccc(count, 4); // Can we do next iteration after this one?
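    // The next six instructions merge halves of the two fresh 8-byte loads
    // with the half carried over in O3 from the previous iteration (SPARC is
    // big-endian). Roughly:
    //   O3 = O3 | (O4 >> 32);   O4 = (O4 << 32) | (G4 >> 32);
    // and the delay slot of the branch below reloads O3 = G4 << 32 for the
    // next pass.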
2075 2076 __ srlx(O4, 32, G3); 2077 __ bset(G3, O3); 2078 __ sllx(O4, 32, O4); 2079 __ srlx(G4, 32, G3); 2080 __ bset(G3, O4); 2081 if (use_bis) { 2082 __ stxa(O3, to, -16); 2083 __ stxa(O4, to, -8); 2084 } else { 2085 __ stx(O3, to, -16); 2086 __ stx(O4, to, -8); 2087 } 2088 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop); 2089 __ delayed()->sllx(G4, 32, O3); 2090 2091 } 2092 2093 // 2094 // Generate core code for disjoint int copy (and oop copy on 32-bit). 2095 // If "aligned" is true, the "from" and "to" addresses are assumed 2096 // to be heapword aligned. 2097 // 2098 // Arguments: 2099 // from: O0 2100 // to: O1 2101 // count: O2 treated as signed 2102 // 2103 void generate_disjoint_int_copy_core(bool aligned) { 2104 2105 Label L_skip_alignment, L_aligned_copy; 2106 Label L_copy_4_bytes, L_copy_4_bytes_loop, L_exit; 2107 2108 const Register from = O0; // source array address 2109 const Register to = O1; // destination array address 2110 const Register count = O2; // elements count 2111 const Register offset = O5; // offset from start of arrays 2112 // O3, O4, G3, G4 are used as temp registers 2113 2114 // 'aligned' == true when it is known statically during compilation 2115 // of this arraycopy call site that both 'from' and 'to' addresses 2116 // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()). 2117 // 2118 // Aligned arrays have 4-byte alignment in the 32-bit VM 2119 // and 8-byte alignment in the 64-bit VM. 2120 // 2121 #ifdef _LP64 2122 if (!aligned) 2123 #endif 2124 { 2125 // The next check could be put under 'ifndef' since the code in 2126 // generate_disjoint_long_copy_core() has its own checks and sets 'offset'. 2127 2128 // for short arrays, just do single element copy 2129 __ cmp(count, 5); // 4 + 1 (20 bytes) 2130 __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_4_bytes); 2131 __ delayed()->mov(G0, offset); 2132 2133 // copy 1 element to align 'to' on an 8-byte boundary 2134 __ andcc(to, 7, G0); 2135 __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment); 2136 __ delayed()->ld(from, 0, O3); 2137 __ inc(from, 4); 2138 __ inc(to, 4); 2139 __ dec(count); 2140 __ st(O3, to, -4); 2141 __ BIND(L_skip_alignment); 2142 2143 // if the arrays have the same alignment mod 8, do a 4-element copy 2144 __ andcc(from, 7, G0); 2145 __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy); 2146 __ delayed()->ld(from, 0, O3); 2147 2148 // 2149 // Load 2 aligned 8-byte chunks and use one from the previous iteration 2150 // to form 2 aligned 8-byte chunks to store. 2151 // 2152 // copy_16_bytes_forward_with_shift() is not used here since this 2153 // code is more efficient.
2154 2155 // copy with shift 4 elements (16 bytes) at a time 2156 __ dec(count, 4); // The cmp at the beginning guarantees count >= 4 2157 __ sllx(O3, 32, O3); 2158 2159 disjoint_copy_core(from, to, count, 2, 16, copy_16_bytes_loop); 2160 2161 __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes); 2162 __ delayed()->inc(count, 4); // restore 'count' 2163 2164 __ BIND(L_aligned_copy); 2165 } // !aligned 2166 2167 // copy 4 elements (16 bytes) at a time 2168 __ and3(count, 1, G4); // Save 2169 __ srl(count, 1, count); 2170 generate_disjoint_long_copy_core(aligned); 2171 __ mov(G4, count); // Restore 2172 2173 // copy 1 element at a time 2174 __ BIND(L_copy_4_bytes); 2175 __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit); 2176 __ BIND(L_copy_4_bytes_loop); 2177 __ ld(from, offset, O3); 2178 __ deccc(count); 2179 __ st(O3, to, offset); 2180 __ brx(Assembler::notZero, false, Assembler::pt, L_copy_4_bytes_loop); 2181 __ delayed()->inc(offset, 4); 2182 __ BIND(L_exit); 2183 } 2184 2185 // 2186 // Generate stub for disjoint int copy. If "aligned" is true, the 2187 // "from" and "to" addresses are assumed to be heapword aligned. 2188 // 2189 // Arguments for generated stub: 2190 // from: O0 2191 // to: O1 2192 // count: O2 treated as signed 2193 // 2194 address generate_disjoint_int_copy(bool aligned, address *entry, const char *name) { 2195 __ align(CodeEntryAlignment); 2196 StubCodeMark mark(this, "StubRoutines", name); 2197 address start = __ pc(); 2198 2199 const Register count = O2; 2200 assert_clean_int(count, O3); // Make sure 'count' is clean int. 2201 2202 if (entry != NULL) { 2203 *entry = __ pc(); 2204 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 2205 BLOCK_COMMENT("Entry:"); 2206 } 2207 2208 generate_disjoint_int_copy_core(aligned); 2209 2210 // O3, O4 are used as temp registers 2211 inc_counter_np(SharedRuntime::_jint_array_copy_ctr, O3, O4); 2212 __ retl(); 2213 __ delayed()->mov(G0, O0); // return 0 2214 return start; 2215 } 2216 2217 // 2218 // Generate core code for conjoint int copy (and oop copy on 32-bit). 2219 // If "aligned" is true, the "from" and "to" addresses are assumed 2220 // to be heapword aligned. 2221 // 2222 // Arguments: 2223 // from: O0 2224 // to: O1 2225 // count: O2 treated as signed 2226 // 2227 void generate_conjoint_int_copy_core(bool aligned) { 2228 // Do reverse copy.
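    // In outline (a sketch of what the code below does):
    //   end_from = from + count*4;  end_to = to + count*4;
    //   copy single elements backward until 'end_to' is 8-byte aligned;
    //   if 'end_from' is then 8-byte aligned too, copy 16-byte chunks backward;
    //   otherwise copy 16-byte chunks backward, merging shifted word halves;
    //   finish with single 4-byte elements.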
2229 2230 Label L_skip_alignment, L_aligned_copy; 2231 Label L_copy_16_bytes, L_copy_4_bytes, L_copy_4_bytes_loop, L_exit; 2232 2233 const Register from = O0; // source array address 2234 const Register to = O1; // destination array address 2235 const Register count = O2; // elements count 2236 const Register end_from = from; // source array end address 2237 const Register end_to = to; // destination array end address 2238 // O3, O4, O5, G3 are used as temp registers 2239 2240 const Register byte_count = O3; // byte count to copy 2241 2242 __ sllx(count, LogBytesPerInt, byte_count); 2243 __ add(to, byte_count, end_to); // offset after last copied element 2244 2245 __ cmp(count, 5); // for short arrays, just do single element copy 2246 __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_4_bytes); 2247 __ delayed()->add(from, byte_count, end_from); 2248 2249 // copy 1 element to align 'to' on an 8-byte boundary 2250 __ andcc(end_to, 7, G0); 2251 __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment); 2252 __ delayed()->nop(); 2253 __ dec(count); 2254 __ dec(end_from, 4); 2255 __ dec(end_to, 4); 2256 __ ld(end_from, 0, O4); 2257 __ st(O4, end_to, 0); 2258 __ BIND(L_skip_alignment); 2259 2260 // Check if 'end_from' and 'end_to' have the same alignment. 2261 __ andcc(end_from, 7, G0); 2262 __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy); 2263 __ delayed()->dec(count, 4); // The cmp at the start guarantees count >= 4 2264 2265 // copy with shift 4 elements (16 bytes) at a time 2266 // 2267 // Load 2 aligned 8-byte chunks and use one from the previous iteration 2268 // to form 2 aligned 8-byte chunks to store. 2269 // 2270 __ ldx(end_from, -4, O3); 2271 __ align(OptoLoopAlignment); 2272 __ BIND(L_copy_16_bytes); 2273 __ ldx(end_from, -12, O4); 2274 __ deccc(count, 4); 2275 __ ldx(end_from, -20, O5); 2276 __ dec(end_to, 16); 2277 __ dec(end_from, 16); 2278 __ srlx(O3, 32, O3); 2279 __ sllx(O4, 32, G3); 2280 __ bset(G3, O3); 2281 __ stx(O3, end_to, 8); 2282 __ srlx(O4, 32, O4); 2283 __ sllx(O5, 32, G3); 2284 __ bset(O4, G3); 2285 __ stx(G3, end_to, 0); 2286 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes); 2287 __ delayed()->mov(O5, O3); 2288 2289 __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes); 2290 __ delayed()->inc(count, 4); 2291 2292 // copy 4 elements (16 bytes) at a time 2293 __ align(OptoLoopAlignment); 2294 __ BIND(L_aligned_copy); 2295 __ dec(end_from, 16); 2296 __ ldx(end_from, 8, O3); 2297 __ ldx(end_from, 0, O4); 2298 __ dec(end_to, 16); 2299 __ deccc(count, 4); 2300 __ stx(O3, end_to, 8); 2301 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy); 2302 __ delayed()->stx(O4, end_to, 0); 2303 __ inc(count, 4); 2304 2305 // copy 1 element (4 bytes) at a time 2306 __ BIND(L_copy_4_bytes); 2307 __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit); 2308 __ BIND(L_copy_4_bytes_loop); 2309 __ dec(end_from, 4); 2310 __ dec(end_to, 4); 2311 __ ld(end_from, 0, O4); 2312 __ deccc(count); 2313 __ brx(Assembler::greater, false, Assembler::pt, L_copy_4_bytes_loop); 2314 __ delayed()->st(O4, end_to, 0); 2315 __ BIND(L_exit); 2316 } 2317 2318 // 2319 // Generate stub for conjoint int copy. If "aligned" is true, the 2320 // "from" and "to" addresses are assumed to be heapword aligned.
2321 // 2322 // Arguments for generated stub: 2323 // from: O0 2324 // to: O1 2325 // count: O2 treated as signed 2326 // 2327 address generate_conjoint_int_copy(bool aligned, address nooverlap_target, 2328 address *entry, const char *name) { 2329 __ align(CodeEntryAlignment); 2330 StubCodeMark mark(this, "StubRoutines", name); 2331 address start = __ pc(); 2332 2333 assert_clean_int(O2, O3); // Make sure 'count' is clean int. 2334 2335 if (entry != NULL) { 2336 *entry = __ pc(); 2337 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 2338 BLOCK_COMMENT("Entry:"); 2339 } 2340 2341 array_overlap_test(nooverlap_target, 2); 2342 2343 generate_conjoint_int_copy_core(aligned); 2344 2345 // O3, O4 are used as temp registers 2346 inc_counter_np(SharedRuntime::_jint_array_copy_ctr, O3, O4); 2347 __ retl(); 2348 __ delayed()->mov(G0, O0); // return 0 2349 return start; 2350 } 2351 2352 // 2353 // Helper methods for generate_disjoint_long_copy_core() 2354 // 2355 void copy_64_bytes_loop(Register from, Register to, Register count, int count_dec, 2356 Label& L_loop, bool use_prefetch, bool use_bis) { 2357 __ align(OptoLoopAlignment); 2358 __ BIND(L_loop); 2359 for (int off = 0; off < 64; off += 16) { 2360 if (use_prefetch && (off & 31) == 0) { 2361 if (ArraycopySrcPrefetchDistance > 0) { 2362 __ prefetch(from, ArraycopySrcPrefetchDistance+off, Assembler::severalReads); 2363 } 2364 if (ArraycopyDstPrefetchDistance > 0) { 2365 __ prefetch(to, ArraycopyDstPrefetchDistance+off, Assembler::severalWritesAndPossiblyReads); 2366 } 2367 } 2368 __ ldx(from, off+0, O4); 2369 __ ldx(from, off+8, O5); 2370 if (use_bis) { 2371 __ stxa(O4, to, off+0); 2372 __ stxa(O5, to, off+8); 2373 } else { 2374 __ stx(O4, to, off+0); 2375 __ stx(O5, to, off+8); 2376 } 2377 } 2378 __ deccc(count, 8); 2379 __ inc(from, 64); 2380 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop); 2381 __ delayed()->inc(to, 64); 2382 } 2383 2384 // 2385 // Generate core code for disjoint long copy (and oop copy on 64-bit). 2386 // "aligned" is ignored, because we must make the stronger 2387 // assumption that both addresses are always 64-bit aligned. 
2388 // 2389 // Arguments: 2390 // from: O0 2391 // to: O1 2392 // count: O2 treated as signed 2393 // 2394 // count -= 2; 2395 // if ( count >= 0 ) { // >= 2 elements 2396 // if ( count > 6) { // >= 8 elements 2397 // count -= 6; // original count - 8 2398 // do { 2399 // copy_8_elements; 2400 // count -= 8; 2401 // } while ( count >= 0 ); 2402 // count += 6; 2403 // } 2404 // if ( count >= 0 ) { // >= 2 elements 2405 // do { 2406 // copy_2_elements; 2407 // } while ( (count=count-2) >= 0 ); 2408 // } 2409 // } 2410 // count += 2; 2411 // if ( count != 0 ) { // 1 element left 2412 // copy_1_element; 2413 // } 2414 // 2415 void generate_disjoint_long_copy_core(bool aligned) { 2416 Label L_copy_8_bytes, L_copy_16_bytes, L_exit; 2417 const Register from = O0; // source array address 2418 const Register to = O1; // destination array address 2419 const Register count = O2; // elements count 2420 const Register offset0 = O4; // element offset 2421 const Register offset8 = O5; // next element offset 2422 2423 __ deccc(count, 2); 2424 __ mov(G0, offset0); // offset from start of arrays (0) 2425 __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes ); 2426 __ delayed()->add(offset0, 8, offset8); 2427 2428 // Copy by 64 bytes chunks 2429 2430 const Register from64 = O3; // source address 2431 const Register to64 = G3; // destination address 2432 __ subcc(count, 6, O3); 2433 __ brx(Assembler::negative, false, Assembler::pt, L_copy_16_bytes ); 2434 __ delayed()->mov(to, to64); 2435 // Now we can use O4(offset0), O5(offset8) as temps 2436 __ mov(O3, count); 2437 // count >= 0 (original count - 8) 2438 __ mov(from, from64); 2439 2440 disjoint_copy_core(from64, to64, count, 3, 64, copy_64_bytes_loop); 2441 2442 // Restore O4(offset0), O5(offset8) 2443 __ sub(from64, from, offset0); 2444 __ inccc(count, 6); // restore count 2445 __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes ); 2446 __ delayed()->add(offset0, 8, offset8); 2447 2448 // Copy by 16 bytes chunks 2449 __ align(OptoLoopAlignment); 2450 __ BIND(L_copy_16_bytes); 2451 __ ldx(from, offset0, O3); 2452 __ ldx(from, offset8, G3); 2453 __ deccc(count, 2); 2454 __ stx(O3, to, offset0); 2455 __ inc(offset0, 16); 2456 __ stx(G3, to, offset8); 2457 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes); 2458 __ delayed()->inc(offset8, 16); 2459 2460 // Copy last 8 bytes 2461 __ BIND(L_copy_8_bytes); 2462 __ inccc(count, 2); 2463 __ brx(Assembler::zero, true, Assembler::pn, L_exit ); 2464 __ delayed()->mov(offset0, offset8); // Set O5 used by other stubs 2465 __ ldx(from, offset0, O3); 2466 __ stx(O3, to, offset0); 2467 __ BIND(L_exit); 2468 } 2469 2470 // 2471 // Generate stub for disjoint long copy. 2472 // "aligned" is ignored, because we must make the stronger 2473 // assumption that both addresses are always 64-bit aligned. 2474 // 2475 // Arguments for generated stub: 2476 // from: O0 2477 // to: O1 2478 // count: O2 treated as signed 2479 // 2480 address generate_disjoint_long_copy(bool aligned, address *entry, const char *name) { 2481 __ align(CodeEntryAlignment); 2482 StubCodeMark mark(this, "StubRoutines", name); 2483 address start = __ pc(); 2484 2485 assert_clean_int(O2, O3); // Make sure 'count' is clean int. 
2486 2487 if (entry != NULL) { 2488 *entry = __ pc(); 2489 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 2490 BLOCK_COMMENT("Entry:"); 2491 } 2492 2493 generate_disjoint_long_copy_core(aligned); 2494 2495 // O3, O4 are used as temp registers 2496 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr, O3, O4); 2497 __ retl(); 2498 __ delayed()->mov(G0, O0); // return 0 2499 return start; 2500 } 2501 2502 // 2503 // Generate core code for conjoint long copy (and oop copy on 64-bit). 2504 // "aligned" is ignored, because we must make the stronger 2505 // assumption that both addresses are always 64-bit aligned. 2506 // 2507 // Arguments: 2508 // from: O0 2509 // to: O1 2510 // count: O2 treated as signed 2511 // 2512 void generate_conjoint_long_copy_core(bool aligned) { 2513 // Do reverse copy. 2514 Label L_copy_8_bytes, L_copy_16_bytes, L_exit; 2515 const Register from = O0; // source array address 2516 const Register to = O1; // destination array address 2517 const Register count = O2; // elements count 2518 const Register offset8 = O4; // element offset 2519 const Register offset0 = O5; // previous element offset 2520 2521 __ subcc(count, 1, count); 2522 __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_8_bytes ); 2523 __ delayed()->sllx(count, LogBytesPerLong, offset8); 2524 __ sub(offset8, 8, offset0); 2525 __ align(OptoLoopAlignment); 2526 __ BIND(L_copy_16_bytes); 2527 __ ldx(from, offset8, O2); 2528 __ ldx(from, offset0, O3); 2529 __ stx(O2, to, offset8); 2530 __ deccc(offset8, 16); // use offset8 as counter 2531 __ stx(O3, to, offset0); 2532 __ brx(Assembler::greater, false, Assembler::pt, L_copy_16_bytes); 2533 __ delayed()->dec(offset0, 16); 2534 2535 __ BIND(L_copy_8_bytes); 2536 __ brx(Assembler::negative, false, Assembler::pn, L_exit ); 2537 __ delayed()->nop(); 2538 __ ldx(from, 0, O3); 2539 __ stx(O3, to, 0); 2540 __ BIND(L_exit); 2541 } 2542 2543 // Generate stub for conjoint long copy. 2544 // "aligned" is ignored, because we must make the stronger 2545 // assumption that both addresses are always 64-bit aligned. 2546 // 2547 // Arguments for generated stub: 2548 // from: O0 2549 // to: O1 2550 // count: O2 treated as signed 2551 // 2552 address generate_conjoint_long_copy(bool aligned, address nooverlap_target, 2553 address *entry, const char *name) { 2554 __ align(CodeEntryAlignment); 2555 StubCodeMark mark(this, "StubRoutines", name); 2556 address start = __ pc(); 2557 2558 assert(aligned, "Should always be aligned"); 2559 2560 assert_clean_int(O2, O3); // Make sure 'count' is clean int. 2561 2562 if (entry != NULL) { 2563 *entry = __ pc(); 2564 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 2565 BLOCK_COMMENT("Entry:"); 2566 } 2567 2568 array_overlap_test(nooverlap_target, 3); 2569 2570 generate_conjoint_long_copy_core(aligned); 2571 2572 // O3, O4 are used as temp registers 2573 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr, O3, O4); 2574 __ retl(); 2575 __ delayed()->mov(G0, O0); // return 0 2576 return start; 2577 } 2578 2579 // Generate stub for disjoint oop copy. If "aligned" is true, the 2580 // "from" and "to" addresses are assumed to be heapword aligned. 
2581 // 2582 // Arguments for generated stub: 2583 // from: O0 2584 // to: O1 2585 // count: O2 treated as signed 2586 // 2587 address generate_disjoint_oop_copy(bool aligned, address *entry, const char *name, 2588 bool dest_uninitialized = false) { 2589 2590 const Register from = O0; // source array address 2591 const Register to = O1; // destination array address 2592 const Register count = O2; // elements count 2593 2594 __ align(CodeEntryAlignment); 2595 StubCodeMark mark(this, "StubRoutines", name); 2596 address start = __ pc(); 2597 2598 assert_clean_int(count, O3); // Make sure 'count' is clean int. 2599 2600 if (entry != NULL) { 2601 *entry = __ pc(); 2602 // caller can pass a 64-bit byte count here 2603 BLOCK_COMMENT("Entry:"); 2604 } 2605 2606 // save arguments for barrier generation 2607 __ mov(to, G1); 2608 __ mov(count, G5); 2609 gen_write_ref_array_pre_barrier(G1, G5, dest_uninitialized); 2610 #ifdef _LP64 2611 assert_clean_int(count, O3); // Make sure 'count' is clean int. 2612 if (UseCompressedOops) { 2613 generate_disjoint_int_copy_core(aligned); 2614 } else { 2615 generate_disjoint_long_copy_core(aligned); 2616 } 2617 #else 2618 generate_disjoint_int_copy_core(aligned); 2619 #endif 2620 // O0 is used as temp register 2621 gen_write_ref_array_post_barrier(G1, G5, O0); 2622 2623 // O3, O4 are used as temp registers 2624 inc_counter_np(SharedRuntime::_oop_array_copy_ctr, O3, O4); 2625 __ retl(); 2626 __ delayed()->mov(G0, O0); // return 0 2627 return start; 2628 } 2629 2630 // Generate stub for conjoint oop copy. If "aligned" is true, the 2631 // "from" and "to" addresses are assumed to be heapword aligned. 2632 // 2633 // Arguments for generated stub: 2634 // from: O0 2635 // to: O1 2636 // count: O2 treated as signed 2637 // 2638 address generate_conjoint_oop_copy(bool aligned, address nooverlap_target, 2639 address *entry, const char *name, 2640 bool dest_uninitialized = false) { 2641 2642 const Register from = O0; // source array address 2643 const Register to = O1; // destination array address 2644 const Register count = O2; // elements count 2645 2646 __ align(CodeEntryAlignment); 2647 StubCodeMark mark(this, "StubRoutines", name); 2648 address start = __ pc(); 2649 2650 assert_clean_int(count, O3); // Make sure 'count' is clean int. 2651 2652 if (entry != NULL) { 2653 *entry = __ pc(); 2654 // caller can pass a 64-bit byte count here 2655 BLOCK_COMMENT("Entry:"); 2656 } 2657 2658 array_overlap_test(nooverlap_target, LogBytesPerHeapOop); 2659 2660 // save arguments for barrier generation 2661 __ mov(to, G1); 2662 __ mov(count, G5); 2663 gen_write_ref_array_pre_barrier(G1, G5, dest_uninitialized); 2664 2665 #ifdef _LP64 2666 if (UseCompressedOops) { 2667 generate_conjoint_int_copy_core(aligned); 2668 } else { 2669 generate_conjoint_long_copy_core(aligned); 2670 } 2671 #else 2672 generate_conjoint_int_copy_core(aligned); 2673 #endif 2674 2675 // O0 is used as temp register 2676 gen_write_ref_array_post_barrier(G1, G5, O0); 2677 2678 // O3, O4 are used as temp registers 2679 inc_counter_np(SharedRuntime::_oop_array_copy_ctr, O3, O4); 2680 __ retl(); 2681 __ delayed()->mov(G0, O0); // return 0 2682 return start; 2683 } 2684 2685 2686 // Helper for generating a dynamic type check. 2687 // Smashes only the given temp registers. 
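  // In outline (a hedged sketch of the check emitted below; see
  // MacroAssembler::check_klass_subtype_fast_path/_slow_path for the real
  // protocol):
  //   if (sub_klass == super_klass) goto L_success;                // exact match
  //   if (*(sub_klass + super_check_offset) == super_klass) goto L_success;
  //   otherwise scan sub_klass's secondary supers in a fresh register
  //   window (slow path); fall through on a miss.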
2688 void generate_type_check(Register sub_klass, 2689 Register super_check_offset, 2690 Register super_klass, 2691 Register temp, 2692 Label& L_success) { 2693 assert_different_registers(sub_klass, super_check_offset, super_klass, temp); 2694 2695 BLOCK_COMMENT("type_check:"); 2696 2697 Label L_miss, L_pop_to_miss; 2698 2699 assert_clean_int(super_check_offset, temp); 2700 2701 __ check_klass_subtype_fast_path(sub_klass, super_klass, temp, noreg, 2702 &L_success, &L_miss, NULL, 2703 super_check_offset); 2704 2705 BLOCK_COMMENT("type_check_slow_path:"); 2706 __ save_frame(0); 2707 __ check_klass_subtype_slow_path(sub_klass->after_save(), 2708 super_klass->after_save(), 2709 L0, L1, L2, L4, 2710 NULL, &L_pop_to_miss); 2711 __ ba(L_success); 2712 __ delayed()->restore(); 2713 2714 __ bind(L_pop_to_miss); 2715 __ restore(); 2716 2717 // Fall through on failure! 2718 __ BIND(L_miss); 2719 } 2720 2721 2722 // Generate stub for checked oop copy. 2723 // 2724 // Arguments for generated stub: 2725 // from: O0 2726 // to: O1 2727 // count: O2 treated as signed 2728 // ckoff: O3 (super_check_offset) 2729 // ckval: O4 (super_klass) 2730 // ret: O0 zero for success; (-1^K) where K is partial transfer count 2731 // 2732 address generate_checkcast_copy(const char *name, address *entry, bool dest_uninitialized = false) { 2733 2734 const Register O0_from = O0; // source array address 2735 const Register O1_to = O1; // destination array address 2736 const Register O2_count = O2; // elements count 2737 const Register O3_ckoff = O3; // super_check_offset 2738 const Register O4_ckval = O4; // super_klass 2739 2740 const Register O5_offset = O5; // loop var, with stride wordSize 2741 const Register G1_remain = G1; // loop var, with stride -1 2742 const Register G3_oop = G3; // actual oop copied 2743 const Register G4_klass = G4; // oop._klass 2744 const Register G5_super = G5; // oop._klass._primary_supers[ckval] 2745 2746 __ align(CodeEntryAlignment); 2747 StubCodeMark mark(this, "StubRoutines", name); 2748 address start = __ pc(); 2749 2750 #ifdef ASSERT 2751 // We sometimes save a frame (see generate_type_check below). 2752 // If this will cause trouble, let's fail now instead of later. 2753 __ save_frame(0); 2754 __ restore(); 2755 #endif 2756 2757 assert_clean_int(O2_count, G1); // Make sure 'count' is clean int. 2758 2759 #ifdef ASSERT 2760 // caller guarantees that the arrays really are different 2761 // otherwise, we would have to make conjoint checks 2762 { Label L; 2763 __ mov(O3, G1); // spill: overlap test smashes O3 2764 __ mov(O4, G4); // spill: overlap test smashes O4 2765 array_overlap_test(L, LogBytesPerHeapOop); 2766 __ stop("checkcast_copy within a single array"); 2767 __ bind(L); 2768 __ mov(G1, O3); 2769 __ mov(G4, O4); 2770 } 2771 #endif //ASSERT 2772 2773 if (entry != NULL) { 2774 *entry = __ pc(); 2775 // caller can pass a 64-bit byte count here (from generic stub) 2776 BLOCK_COMMENT("Entry:"); 2777 } 2778 gen_write_ref_array_pre_barrier(O1_to, O2_count, dest_uninitialized); 2779 2780 Label load_element, store_element, do_card_marks, fail, done; 2781 __ addcc(O2_count, 0, G1_remain); // initialize loop index, and test it 2782 __ brx(Assembler::notZero, false, Assembler::pt, load_element); 2783 __ delayed()->mov(G0, O5_offset); // offset from start of arrays 2784 2785 // Empty array: Nothing to do. 
2786 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, O3, O4); 2787 __ retl(); 2788 __ delayed()->set(0, O0); // return 0 on (trivial) success 2789 2790 // ======== begin loop ======== 2791 // (Loop is rotated; its entry is load_element.) 2792 // Loop variables: 2793 // (O5 = 0; ; O5 += wordSize) --- offset from src, dest arrays 2794 // (O2 = len; O2 != 0; O2--) --- number of oops *remaining* 2795 // G3, G4, G5 --- current oop, oop.klass, oop.klass.super 2796 __ align(OptoLoopAlignment); 2797 2798 __ BIND(store_element); 2799 __ deccc(G1_remain); // decrement the count 2800 __ store_heap_oop(G3_oop, O1_to, O5_offset); // store the oop 2801 __ inc(O5_offset, heapOopSize); // step to next offset 2802 __ brx(Assembler::zero, true, Assembler::pt, do_card_marks); 2803 __ delayed()->set(0, O0); // return 0 on success 2804 2805 // ======== loop entry is here ======== 2806 __ BIND(load_element); 2807 __ load_heap_oop(O0_from, O5_offset, G3_oop); // load the oop 2808 __ br_null_short(G3_oop, Assembler::pt, store_element); 2809 2810 __ load_klass(G3_oop, G4_klass); // query the object klass 2811 2812 generate_type_check(G4_klass, O3_ckoff, O4_ckval, G5_super, 2813 // branch to this on success: 2814 store_element); 2815 // ======== end loop ======== 2816 2817 // It was a real error; we must depend on the caller to finish the job. 2818 // Register G1 has number of *remaining* oops, O2 number of *total* oops. 2819 // Emit GC store barriers for the oops we have copied (O2 minus G1), 2820 // and report their number to the caller. 2821 __ BIND(fail); 2822 __ subcc(O2_count, G1_remain, O2_count); 2823 __ brx(Assembler::zero, false, Assembler::pt, done); 2824 __ delayed()->not1(O2_count, O0); // report (-1^K) to caller 2825 2826 __ BIND(do_card_marks); 2827 gen_write_ref_array_post_barrier(O1_to, O2_count, O3); // store check on O1[0..O2] 2828 2829 __ BIND(done); 2830 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, O3, O4); 2831 __ retl(); 2832 __ delayed()->nop(); // return value in O0 2833 2834 return start; 2835 } 2836 2837 2838 // Generate 'unsafe' array copy stub 2839 // Though just as safe as the other stubs, it takes an unscaled 2840 // size_t argument instead of an element count. 2841 // 2842 // Arguments for generated stub: 2843 // from: O0 2844 // to: O1 2845 // count: O2 byte count, treated as ssize_t, can be zero 2846 // 2847 // Examines the alignment of the operands and dispatches 2848 // to a long, int, short, or byte copy loop.
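  // Dispatch outline (a C-like sketch of the stub below):
  //   bits = (uintptr_t)from | (uintptr_t)to | (size_t)count;
  //   if ((bits & (BytesPerLong-1))  == 0) tail-call the jlong copy (count >> 3);
  //   if ((bits & (BytesPerInt-1))   == 0) tail-call the jint copy  (count >> 2);
  //   if ((bits & (BytesPerShort-1)) == 0) tail-call the jshort copy(count >> 1);
  //   otherwise                            tail-call the jbyte copy (count).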
2849 // 2850 address generate_unsafe_copy(const char* name, 2851 address byte_copy_entry, 2852 address short_copy_entry, 2853 address int_copy_entry, 2854 address long_copy_entry) { 2855 2856 const Register O0_from = O0; // source array address 2857 const Register O1_to = O1; // destination array address 2858 const Register O2_count = O2; // byte count (scaled to elements on dispatch) 2859 2860 const Register G1_bits = G1; // test copy of low bits 2861 2862 __ align(CodeEntryAlignment); 2863 StubCodeMark mark(this, "StubRoutines", name); 2864 address start = __ pc(); 2865 2866 // bump this on entry, not on exit: 2867 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr, G1, G3); 2868 2869 __ or3(O0_from, O1_to, G1_bits); 2870 __ or3(O2_count, G1_bits, G1_bits); 2871 2872 __ btst(BytesPerLong-1, G1_bits); 2873 __ br(Assembler::zero, true, Assembler::pt, 2874 long_copy_entry, relocInfo::runtime_call_type); 2875 // scale the count on the way out: 2876 __ delayed()->srax(O2_count, LogBytesPerLong, O2_count); 2877 2878 __ btst(BytesPerInt-1, G1_bits); 2879 __ br(Assembler::zero, true, Assembler::pt, 2880 int_copy_entry, relocInfo::runtime_call_type); 2881 // scale the count on the way out: 2882 __ delayed()->srax(O2_count, LogBytesPerInt, O2_count); 2883 2884 __ btst(BytesPerShort-1, G1_bits); 2885 __ br(Assembler::zero, true, Assembler::pt, 2886 short_copy_entry, relocInfo::runtime_call_type); 2887 // scale the count on the way out: 2888 __ delayed()->srax(O2_count, LogBytesPerShort, O2_count); 2889 2890 __ br(Assembler::always, false, Assembler::pt, 2891 byte_copy_entry, relocInfo::runtime_call_type); 2892 __ delayed()->nop(); 2893 2894 return start; 2895 } 2896 2897 2898 // Perform range checks on the proposed arraycopy. 2899 // Kills the two temps, but nothing else. 2900 // Also, clean the sign bits of src_pos and dst_pos. 2901 void arraycopy_range_checks(Register src, // source array oop (O0) 2902 Register src_pos, // source position (O1) 2903 Register dst, // destination array oop (O2) 2904 Register dst_pos, // destination position (O3) 2905 Register length, // length of copy (O4) 2906 Register temp1, Register temp2, 2907 Label& L_failed) { 2908 BLOCK_COMMENT("arraycopy_range_checks:"); 2909 2910 // if (src_pos + length > arrayOop(src)->length() ) FAIL; 2911 2912 const Register array_length = temp1; // scratch 2913 const Register end_pos = temp2; // scratch 2914 2915 // Note: This next instruction may be in the delay slot of a branch: 2916 __ add(length, src_pos, end_pos); // src_pos + length 2917 __ lduw(src, arrayOopDesc::length_offset_in_bytes(), array_length); 2918 __ cmp(end_pos, array_length); 2919 __ br(Assembler::greater, false, Assembler::pn, L_failed); 2920 2921 // if (dst_pos + length > arrayOop(dst)->length() ) FAIL; 2922 __ delayed()->add(length, dst_pos, end_pos); // dst_pos + length 2923 __ lduw(dst, arrayOopDesc::length_offset_in_bytes(), array_length); 2924 __ cmp(end_pos, array_length); 2925 __ br(Assembler::greater, false, Assembler::pn, L_failed); 2926 2927 // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'. 2928 // Move with sign extension can be used since they are positive.
2929 __ delayed()->signx(src_pos, src_pos); 2930 __ signx(dst_pos, dst_pos); 2931 2932 BLOCK_COMMENT("arraycopy_range_checks done"); 2933 } 2934 2935 2936 // 2937 // Generate generic array copy stubs 2938 // 2939 // Input: 2940 // O0 - src oop 2941 // O1 - src_pos 2942 // O2 - dst oop 2943 // O3 - dst_pos 2944 // O4 - element count 2945 // 2946 // Output: 2947 // O0 == 0 - success 2948 // O0 == -1 - need to call System.arraycopy 2949 // 2950 address generate_generic_copy(const char *name, 2951 address entry_jbyte_arraycopy, 2952 address entry_jshort_arraycopy, 2953 address entry_jint_arraycopy, 2954 address entry_oop_arraycopy, 2955 address entry_jlong_arraycopy, 2956 address entry_checkcast_arraycopy) { 2957 Label L_failed, L_objArray; 2958 2959 // Input registers 2960 const Register src = O0; // source array oop 2961 const Register src_pos = O1; // source position 2962 const Register dst = O2; // destination array oop 2963 const Register dst_pos = O3; // destination position 2964 const Register length = O4; // elements count 2965 2966 // registers used as temp 2967 const Register G3_src_klass = G3; // source array klass 2968 const Register G4_dst_klass = G4; // destination array klass 2969 const Register G5_lh = G5; // layout helper 2970 const Register O5_temp = O5; 2971 2972 __ align(CodeEntryAlignment); 2973 StubCodeMark mark(this, "StubRoutines", name); 2974 address start = __ pc(); 2975 2976 // bump this on entry, not on exit: 2977 inc_counter_np(SharedRuntime::_generic_array_copy_ctr, G1, G3); 2978 2979 // In principle, the int arguments could be dirty. 2980 //assert_clean_int(src_pos, G1); 2981 //assert_clean_int(dst_pos, G1); 2982 //assert_clean_int(length, G1); 2983 2984 //----------------------------------------------------------------------- 2985 // Assembler stubs will be used for this call to arraycopy 2986 // if the following conditions are met: 2987 // 2988 // (1) src and dst must not be null. 2989 // (2) src_pos must not be negative. 2990 // (3) dst_pos must not be negative. 2991 // (4) length must not be negative. 2992 // (5) src klass and dst klass should be the same and not NULL. 2993 // (6) src and dst should be arrays. 2994 // (7) src_pos + length must not exceed length of src. 2995 // (8) dst_pos + length must not exceed length of dst. 2996 BLOCK_COMMENT("arraycopy initial argument checks"); 2997 2998 // if (src == NULL) return -1; 2999 __ br_null(src, false, Assembler::pn, L_failed); 3000 3001 // if (src_pos < 0) return -1; 3002 __ delayed()->tst(src_pos); 3003 __ br(Assembler::negative, false, Assembler::pn, L_failed); 3004 __ delayed()->nop(); 3005 3006 // if (dst == NULL) return -1; 3007 __ br_null(dst, false, Assembler::pn, L_failed); 3008 3009 // if (dst_pos < 0) return -1; 3010 __ delayed()->tst(dst_pos); 3011 __ br(Assembler::negative, false, Assembler::pn, L_failed); 3012 3013 // if (length < 0) return -1; 3014 __ delayed()->tst(length); 3015 __ br(Assembler::negative, false, Assembler::pn, L_failed); 3016 3017 BLOCK_COMMENT("arraycopy argument klass checks"); 3018 // get src->klass() 3019 if (UseCompressedKlassPointers) { 3020 __ delayed()->nop(); // load_klass expands to several instructions, so it cannot go in the delay slot 3021 __ load_klass(src, G3_src_klass); 3022 } else { 3023 __ delayed()->ld_ptr(src, oopDesc::klass_offset_in_bytes(), G3_src_klass); 3024 } 3025 3026 #ifdef ASSERT 3027 // assert(src->klass() != NULL); 3028 BLOCK_COMMENT("assert klasses not null"); 3029 { Label L_a, L_b; 3030 __ br_notnull_short(G3_src_klass, Assembler::pt, L_b); // it is broken if klass is NULL 3031 __ bind(L_a); 3032 __ stop("broken null klass"); 3033 __ bind(L_b); 3034 __ load_klass(dst, G4_dst_klass); 3035 __ br_null(G4_dst_klass, false, Assembler::pn, L_a); // this would be broken also 3036 __ delayed()->mov(G0, G4_dst_klass); // scribble the temp 3037 BLOCK_COMMENT("assert done"); 3038 } 3039 #endif 3040 3041 // Load layout helper 3042 // 3043 // |array_tag| | header_size | element_type | |log2_element_size| 3044 // 32 30 24 16 8 2 0 3045 // 3046 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 3047 // 3048 3049 int lh_offset = in_bytes(Klass::layout_helper_offset()); 3050 3051 // Load the 32-bit signed value. Use the br() instruction with it to check icc. 3052 __ lduw(G3_src_klass, lh_offset, G5_lh); 3053 3054 if (UseCompressedKlassPointers) { 3055 __ load_klass(dst, G4_dst_klass); 3056 } 3057 // Handle objArrays completely differently... 3058 juint objArray_lh = Klass::array_layout_helper(T_OBJECT); 3059 __ set(objArray_lh, O5_temp); 3060 __ cmp(G5_lh, O5_temp); 3061 __ br(Assembler::equal, false, Assembler::pt, L_objArray); 3062 if (UseCompressedKlassPointers) { 3063 __ delayed()->nop(); 3064 } else { 3065 __ delayed()->ld_ptr(dst, oopDesc::klass_offset_in_bytes(), G4_dst_klass); 3066 } 3067 3068 // if (src->klass() != dst->klass()) return -1; 3069 __ cmp_and_brx_short(G3_src_klass, G4_dst_klass, Assembler::notEqual, Assembler::pn, L_failed); 3070 3071 // if (!src->is_Array()) return -1; 3072 __ cmp(G5_lh, Klass::_lh_neutral_value); // < 0 3073 __ br(Assembler::greaterEqual, false, Assembler::pn, L_failed); 3074 3075 // At this point, it is known to be a typeArray (array_tag 0x3).
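    // The typeArray path below decodes the layout helper roughly as follows
    // (an outline of the code that comes next, not new logic):
    //   offset = (lh >> _lh_header_size_shift) & _lh_header_size_mask;
    //   elsize = lh & _lh_log2_element_size_mask;     // log2(element size)
    //   from = src + offset + (src_pos << elsize);
    //   to   = dst + offset + (dst_pos << elsize);
    // then dispatches on 'elsize' to the byte/short/int/long copy entry.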
3076 #ifdef ASSERT 3077 __ delayed()->nop(); 3078 { Label L; 3079 jint lh_prim_tag_in_place = (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 3080 __ set(lh_prim_tag_in_place, O5_temp); 3081 __ cmp(G5_lh, O5_temp); 3082 __ br(Assembler::greaterEqual, false, Assembler::pt, L); 3083 __ delayed()->nop(); 3084 __ stop("must be a primitive array"); 3085 __ bind(L); 3086 } 3087 #else 3088 __ delayed(); // match next insn to prev branch 3089 #endif 3090 3091 arraycopy_range_checks(src, src_pos, dst, dst_pos, length, 3092 O5_temp, G4_dst_klass, L_failed); 3093 3094 // TypeArrayKlass 3095 // 3096 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 3097 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 3098 // 3099 3100 const Register G4_offset = G4_dst_klass; // array offset 3101 const Register G3_elsize = G3_src_klass; // log2 element size 3102 3103 __ srl(G5_lh, Klass::_lh_header_size_shift, G4_offset); 3104 __ and3(G4_offset, Klass::_lh_header_size_mask, G4_offset); // array_offset 3105 __ add(src, G4_offset, src); // src array offset 3106 __ add(dst, G4_offset, dst); // dst array offset 3107 __ and3(G5_lh, Klass::_lh_log2_element_size_mask, G3_elsize); // log2 element size 3108 3109 // next registers should be set before the jump to corresponding stub 3110 const Register from = O0; // source array address 3111 const Register to = O1; // destination array address 3112 const Register count = O2; // elements count 3113 3114 // 'from', 'to', 'count' registers should be set in this order 3115 // since they are the same as 'src', 'src_pos', 'dst'. 3116 3117 BLOCK_COMMENT("scale indexes to element size"); 3118 __ sll_ptr(src_pos, G3_elsize, src_pos); 3119 __ sll_ptr(dst_pos, G3_elsize, dst_pos); 3120 __ add(src, src_pos, from); // src_addr 3121 __ add(dst, dst_pos, to); // dst_addr 3122 3123 BLOCK_COMMENT("choose copy loop based on element size"); 3124 __ cmp(G3_elsize, 0); 3125 __ br(Assembler::equal, true, Assembler::pt, entry_jbyte_arraycopy); 3126 __ delayed()->signx(length, count); // length 3127 3128 __ cmp(G3_elsize, LogBytesPerShort); 3129 __ br(Assembler::equal, true, Assembler::pt, entry_jshort_arraycopy); 3130 __ delayed()->signx(length, count); // length 3131 3132 __ cmp(G3_elsize, LogBytesPerInt); 3133 __ br(Assembler::equal, true, Assembler::pt, entry_jint_arraycopy); 3134 __ delayed()->signx(length, count); // length 3135 #ifdef ASSERT 3136 { Label L; 3137 __ cmp_and_br_short(G3_elsize, LogBytesPerLong, Assembler::equal, Assembler::pt, L); 3138 __ stop("must be long copy, but elsize is wrong"); 3139 __ bind(L); 3140 } 3141 #endif 3142 __ br(Assembler::always, false, Assembler::pt, entry_jlong_arraycopy); 3143 __ delayed()->signx(length, count); // length 3144 3145 // ObjArrayKlass 3146 __ BIND(L_objArray); 3147 // live at this point: G3_src_klass, G4_dst_klass, src[_pos], dst[_pos], length 3148 3149 Label L_plain_copy, L_checkcast_copy; 3150 // test array classes for subtyping 3151 __ cmp(G3_src_klass, G4_dst_klass); // usual case is exact equality 3152 __ brx(Assembler::notEqual, true, Assembler::pn, L_checkcast_copy); 3153 __ delayed()->lduw(G4_dst_klass, lh_offset, O5_temp); // hoisted from below 3154 3155 // Identically typed arrays can be copied without element-wise checks. 
3156 arraycopy_range_checks(src, src_pos, dst, dst_pos, length, 3157 O5_temp, G5_lh, L_failed); 3158 3159 __ add(src, arrayOopDesc::base_offset_in_bytes(T_OBJECT), src); //src offset 3160 __ add(dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT), dst); //dst offset 3161 __ sll_ptr(src_pos, LogBytesPerHeapOop, src_pos); 3162 __ sll_ptr(dst_pos, LogBytesPerHeapOop, dst_pos); 3163 __ add(src, src_pos, from); // src_addr 3164 __ add(dst, dst_pos, to); // dst_addr 3165 __ BIND(L_plain_copy); 3166 __ br(Assembler::always, false, Assembler::pt, entry_oop_arraycopy); 3167 __ delayed()->signx(length, count); // length 3168 3169 __ BIND(L_checkcast_copy); 3170 // live at this point: G3_src_klass, G4_dst_klass 3171 { 3172 // Before looking at dst.length, make sure dst is also an objArray. 3173 // lduw(G4_dst_klass, lh_offset, O5_temp); // hoisted to delay slot 3174 __ cmp(G5_lh, O5_temp); 3175 __ br(Assembler::notEqual, false, Assembler::pn, L_failed); 3176 3177 // It is safe to examine both src.length and dst.length. 3178 __ delayed(); // match next insn to prev branch 3179 arraycopy_range_checks(src, src_pos, dst, dst_pos, length, 3180 O5_temp, G5_lh, L_failed); 3181 3182 // Marshal the base address arguments now, freeing registers. 3183 __ add(src, arrayOopDesc::base_offset_in_bytes(T_OBJECT), src); //src offset 3184 __ add(dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT), dst); //dst offset 3185 __ sll_ptr(src_pos, LogBytesPerHeapOop, src_pos); 3186 __ sll_ptr(dst_pos, LogBytesPerHeapOop, dst_pos); 3187 __ add(src, src_pos, from); // src_addr 3188 __ add(dst, dst_pos, to); // dst_addr 3189 __ signx(length, count); // length (reloaded) 3190 3191 Register sco_temp = O3; // this register is free now 3192 assert_different_registers(from, to, count, sco_temp, 3193 G4_dst_klass, G3_src_klass); 3194 3195 // Generate the type check. 3196 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 3197 __ lduw(G4_dst_klass, sco_offset, sco_temp); 3198 generate_type_check(G3_src_klass, sco_temp, G4_dst_klass, 3199 O5_temp, L_plain_copy); 3200 3201 // Fetch destination element klass from the ObjArrayKlass header. 3202 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 3203 3204 // the checkcast_copy loop needs two extra arguments: 3205 __ ld_ptr(G4_dst_klass, ek_offset, O4); // dest elem klass 3206 // lduw(O4, sco_offset, O3); // sco of elem klass 3207 3208 __ br(Assembler::always, false, Assembler::pt, entry_checkcast_arraycopy); 3209 __ delayed()->lduw(O4, sco_offset, O3); 3210 } 3211 3212 __ BIND(L_failed); 3213 __ retl(); 3214 __ delayed()->sub(G0, 1, O0); // return -1 3215 return start; 3216 } 3217 3218 // 3219 // Generate stub for heap zeroing. 3220 // "to" address is aligned to jlong (8 bytes). 
3221 // 3222 // Arguments for generated stub: 3223 // to: O0 3224 // count: O1 treated as signed (count of HeapWords) 3225 // count could be 0 3226 // 3227 address generate_zero_aligned_words(const char* name) { 3228 __ align(CodeEntryAlignment); 3229 StubCodeMark mark(this, "StubRoutines", name); 3230 address start = __ pc(); 3231 3232 const Register to = O0; // destination address 3233 const Register count = O1; // HeapWords count 3234 const Register temp = O2; // scratch 3235 3236 Label Ldone; 3237 __ sllx(count, LogHeapWordSize, count); // to bytes count 3238 // Use BIS for zeroing 3239 __ bis_zeroing(to, count, temp, Ldone); 3240 __ bind(Ldone); 3241 __ retl(); 3242 __ delayed()->nop(); 3243 return start; 3244 } 3245 3246 void generate_arraycopy_stubs() { 3247 address entry; 3248 address entry_jbyte_arraycopy; 3249 address entry_jshort_arraycopy; 3250 address entry_jint_arraycopy; 3251 address entry_oop_arraycopy; 3252 address entry_jlong_arraycopy; 3253 address entry_checkcast_arraycopy; 3254 3255 //*** jbyte 3256 // Always need aligned and unaligned versions 3257 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 3258 "jbyte_disjoint_arraycopy"); 3259 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 3260 &entry_jbyte_arraycopy, 3261 "jbyte_arraycopy"); 3262 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 3263 "arrayof_jbyte_disjoint_arraycopy"); 3264 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL, 3265 "arrayof_jbyte_arraycopy"); 3266 3267 //*** jshort 3268 // Always need aligned and unaligned versions 3269 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 3270 "jshort_disjoint_arraycopy"); 3271 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 3272 &entry_jshort_arraycopy, 3273 "jshort_arraycopy"); 3274 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, 3275 "arrayof_jshort_disjoint_arraycopy"); 3276 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL, 3277 "arrayof_jshort_arraycopy"); 3278 3279 //*** jint 3280 // Aligned versions 3281 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 3282 "arrayof_jint_disjoint_arraycopy"); 3283 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 3284 "arrayof_jint_arraycopy"); 3285 #ifdef _LP64 3286 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 3287 // entry_jint_arraycopy always points to the unaligned version (notice that we overwrite it). 3288 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 3289 "jint_disjoint_arraycopy"); 3290 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 3291 &entry_jint_arraycopy, 3292 "jint_arraycopy"); 3293 #else 3294 // In the 32-bit VM jints are always HeapWordSize aligned, so always use the aligned version 3295 // (in fact in the 32-bit VM we always have a pre-loop part even in the aligned version, 3296 // because it uses 64-bit loads/stores, so the aligned flag is actually ignored).
3297 StubRoutines::_jint_disjoint_arraycopy = StubRoutines::_arrayof_jint_disjoint_arraycopy; 3298 StubRoutines::_jint_arraycopy = StubRoutines::_arrayof_jint_arraycopy; 3299 #endif 3300 3301 3302 //*** jlong 3303 // It is always aligned 3304 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 3305 "arrayof_jlong_disjoint_arraycopy"); 3306 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 3307 "arrayof_jlong_arraycopy"); 3308 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 3309 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 3310 3311 3312 //*** oops 3313 // Aligned versions 3314 StubRoutines::_arrayof_oop_disjoint_arraycopy = generate_disjoint_oop_copy(true, &entry, 3315 "arrayof_oop_disjoint_arraycopy"); 3316 StubRoutines::_arrayof_oop_arraycopy = generate_conjoint_oop_copy(true, entry, &entry_oop_arraycopy, 3317 "arrayof_oop_arraycopy"); 3318 // Aligned versions without pre-barriers 3319 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(true, &entry, 3320 "arrayof_oop_disjoint_arraycopy_uninit", 3321 /*dest_uninitialized*/true); 3322 StubRoutines::_arrayof_oop_arraycopy_uninit = generate_conjoint_oop_copy(true, entry, NULL, 3323 "arrayof_oop_arraycopy_uninit", 3324 /*dest_uninitialized*/true); 3325 #ifdef _LP64 3326 if (UseCompressedOops) { 3327 // With compressed oops we need unaligned versions, notice that we overwrite entry_oop_arraycopy. 3328 StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_oop_copy(false, &entry, 3329 "oop_disjoint_arraycopy"); 3330 StubRoutines::_oop_arraycopy = generate_conjoint_oop_copy(false, entry, &entry_oop_arraycopy, 3331 "oop_arraycopy"); 3332 // Unaligned versions without pre-barriers 3333 StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(false, &entry, 3334 "oop_disjoint_arraycopy_uninit", 3335 /*dest_uninitialized*/true); 3336 StubRoutines::_oop_arraycopy_uninit = generate_conjoint_oop_copy(false, entry, NULL, 3337 "oop_arraycopy_uninit", 3338 /*dest_uninitialized*/true); 3339 } else 3340 #endif 3341 { 3342 // oop arraycopy is always aligned on 32bit and 64bit without compressed oops 3343 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 3344 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 3345 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 3346 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 3347 } 3348 3349 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 3350 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, 3351 /*dest_uninitialized*/true); 3352 3353 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 3354 entry_jbyte_arraycopy, 3355 entry_jshort_arraycopy, 3356 entry_jint_arraycopy, 3357 entry_jlong_arraycopy); 3358 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 3359 entry_jbyte_arraycopy, 3360 entry_jshort_arraycopy, 3361 entry_jint_arraycopy, 3362 entry_oop_arraycopy, 3363 entry_jlong_arraycopy, 3364 entry_checkcast_arraycopy); 3365 3366 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 3367 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 3368 

  void generate_initial() {
    // Generates the initial stubs and initializes the entry points

    //------------------------------------------------------------------------------------------------------------------------
    // entry points that exist in all platforms
    // Note: This is code that could be shared among different platforms - however the benefit seems to be smaller
    // than the disadvantage of having a much more complicated generator structure. See also comment in stubRoutines.hpp.
    StubRoutines::_forward_exception_entry = generate_forward_exception();

    StubRoutines::_call_stub_entry         = generate_call_stub(StubRoutines::_call_stub_return_address);
    StubRoutines::_catch_exception_entry   = generate_catch_exception();

    //------------------------------------------------------------------------------------------------------------------------
    // entry points that are platform specific
    StubRoutines::Sparc::_test_stop_entry = generate_test_stop();

    StubRoutines::Sparc::_stop_subroutine_entry = generate_stop_subroutine();
    StubRoutines::Sparc::_flush_callers_register_windows_entry = generate_flush_callers_register_windows();

#if !defined(COMPILER2) && !defined(_LP64)
    StubRoutines::_atomic_xchg_entry         = generate_atomic_xchg();
    StubRoutines::_atomic_cmpxchg_entry      = generate_atomic_cmpxchg();
    StubRoutines::_atomic_add_entry          = generate_atomic_add();
    StubRoutines::_atomic_xchg_ptr_entry     = StubRoutines::_atomic_xchg_entry;
    StubRoutines::_atomic_cmpxchg_ptr_entry  = StubRoutines::_atomic_cmpxchg_entry;
    StubRoutines::_atomic_cmpxchg_long_entry = generate_atomic_cmpxchg_long();
    StubRoutines::_atomic_add_ptr_entry      = StubRoutines::_atomic_add_entry;
#endif  // COMPILER2 !=> _LP64

    // Build this early so it's available for the interpreter.
    StubRoutines::_throw_StackOverflowError_entry =
      generate_throw_exception("StackOverflowError throw_exception",
                               CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError));
  }


  void generate_all() {
    // Generates the remaining stubs and initializes their entry points

    // Generate partial_subtype_check first here since its code depends on
    // UseZeroBaseCompressedOops which is defined after heap initialization.
    StubRoutines::Sparc::_partial_subtype_check = generate_partial_subtype_check();
    // These entry points require SharedInfo::stack0 to be set up in non-core builds
    StubRoutines::_throw_AbstractMethodError_entry =
      generate_throw_exception("AbstractMethodError throw_exception",
                               CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError));
    StubRoutines::_throw_IncompatibleClassChangeError_entry =
      generate_throw_exception("IncompatibleClassChangeError throw_exception",
                               CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError));
    StubRoutines::_throw_NullPointerException_at_call_entry =
      generate_throw_exception("NullPointerException at call throw_exception",
                               CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call));

    StubRoutines::_handler_for_unsafe_access_entry =
      generate_handler_for_unsafe_access();

    // support for verify_oop (must happen after universe_init)
    StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop_subroutine();

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    // Don't initialize the platform math functions since sparc
    // doesn't have intrinsics for these operations.
  }
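
  // Editor's note: a sketch of the two-phase generation protocol, assuming
  // the standard driver in stubRoutines.cpp (see also the comment in
  // stubRoutines.hpp). The sequence shown is illustrative:
  //
  //   // early in VM startup, before universe/heap initialization:
  //   StubGenerator_generate(code1, /*all=*/false);   // runs generate_initial()
  //   // after universe_init(), once flags like UseCompressedOops and
  //   // UseZeroBaseCompressedOops are finalized:
  //   StubGenerator_generate(code2, /*all=*/true);    // runs generate_all()
  //
  // This ordering is why generate_all() can safely emit stubs, such as
  // partial_subtype_check and the oop arraycopy family, that depend on
  // heap-dependent flags.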

 public:
  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
    // replace the standard masm with a special one:
    _masm = new MacroAssembler(code);

    _stub_count = !all ? 0x100 : 0x200;
    if (all) {
      generate_all();
    } else {
      generate_initial();
    }

    // make sure this stub is available for all local calls
    if (_atomic_add_stub.is_unbound()) {
      // generate a second time, if necessary
      (void) generate_atomic_add();
    }
  }


 private:
  int _stub_count;
  void stub_prolog(StubCodeDesc* cdesc) {
# ifdef ASSERT
    // put extra information in the stub code, to make it more readable
#ifdef _LP64
    // Write the high part of the address
    // [RGV] Check if there is a dependency on the size of this prolog
    __ emit_data((intptr_t)cdesc >> 32, relocInfo::none);
#endif
    __ emit_data((intptr_t)cdesc, relocInfo::none);
    __ emit_data(++_stub_count, relocInfo::none);
# endif
    align(true);
  }

  void align(bool at_header = false) {
    // %%%%% move this constant somewhere else
    // UltraSPARC cache line size is 8 instructions:
    const unsigned int icache_line_size = 32;
    const unsigned int icache_half_line_size = 16;

    if (at_header) {
      while ((intptr_t)(__ pc()) % icache_line_size != 0) {
        __ emit_data(0, relocInfo::none);
      }
    } else {
      while ((intptr_t)(__ pc()) % icache_half_line_size != 0) {
        __ nop();
      }
    }
  }

}; // end class declaration

void StubGenerator_generate(CodeBuffer* code, bool all) {
  StubGenerator g(code, all);
}
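
// Editor's note on StubGenerator::align() above: padding proceeds one
// 4-byte word at a time until the pc reaches a 32-byte line boundary (at
// stub headers) or a 16-byte half-line boundary (elsewhere). The number of
// padding slots can be worked out directly; a hypothetical helper:
//
//   unsigned int padding_words(uintptr_t pc, unsigned int line_size) {
//     unsigned int rem = (unsigned int)(pc % line_size);
//     return rem == 0 ? 0 : (line_size - rem) / sizeof(int);
//   }
//
// For example, pc % 32 == 24 leaves 8 bytes to the next line, i.e. two
// 4-byte padding slots.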