1 /*
   2  * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/macroAssembler.inline.hpp"
  27 #include "gc/shared/cardTable.hpp"
  28 #include "gc/shared/cardTableModRefBS.hpp"
  29 #include "interpreter/interpreter.hpp"
  30 #include "nativeInst_sparc.hpp"
  31 #include "oops/instanceOop.hpp"
  32 #include "oops/method.hpp"
  33 #include "oops/objArrayKlass.hpp"
  34 #include "oops/oop.inline.hpp"
  35 #include "prims/methodHandles.hpp"
  36 #include "runtime/frame.inline.hpp"
  37 #include "runtime/handles.inline.hpp"
  38 #include "runtime/sharedRuntime.hpp"
  39 #include "runtime/stubCodeGenerator.hpp"
  40 #include "runtime/stubRoutines.hpp"
  41 #include "runtime/thread.inline.hpp"
  42 #ifdef COMPILER2
  43 #include "opto/runtime.hpp"
  44 #endif
  45 
  46 // Declaration and definition of StubGenerator (no .hpp file).
  47 // For a more detailed description of the stub routine structure
  48 // see the comment in stubRoutines.hpp.
  49 
  50 #define __ _masm->
  51 
  52 #ifdef PRODUCT
  53 #define BLOCK_COMMENT(str) /* nothing */
  54 #else
  55 #define BLOCK_COMMENT(str) __ block_comment(str)
  56 #endif
  57 
  58 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  59 
  60 // Note:  The register L7 is used as L7_thread_cache, and may not be used
  61 //        any other way within this module.
  62 
  63 static const Register& Lstub_temp = L2;
  64 
  65 // -------------------------------------------------------------------------------------------------------------------------
  66 // Stub Code definitions
  67 
  68 class StubGenerator: public StubCodeGenerator {
  69  private:
  70 
  71 #ifdef PRODUCT
  72 #define inc_counter_np(a,b,c)
  73 #else
  74 #define inc_counter_np(counter, t1, t2) \
  75   BLOCK_COMMENT("inc_counter " #counter); \
  76   __ inc_counter(&counter, t1, t2);
  77 #endif
  78 
  79   //----------------------------------------------------------------------------------------------------
  80   // Call stubs are used to call Java from C
  81 
  82   address generate_call_stub(address& return_pc) {
  83     StubCodeMark mark(this, "StubRoutines", "call_stub");
  84     address start = __ pc();
  85 
  86     // Incoming arguments:
  87     //
  88     // o0         : call wrapper address
  89     // o1         : result (address)
  90     // o2         : result type
  91     // o3         : method
  92     // o4         : (interpreter) entry point
  93     // o5         : parameters (address)
  94     // [sp + 0x5c]: parameter size (in words)
  95     // [sp + 0x60]: thread
  96     //
  97     // +---------------+ <--- sp + 0
  98     // |               |
  99     // . reg save area .
 100     // |               |
 101     // +---------------+ <--- sp + 0x40
 102     // |               |
 103     // . extra 7 slots .
 104     // |               |
 105     // +---------------+ <--- sp + 0x5c
 106     // |  param. size  |
 107     // +---------------+ <--- sp + 0x60
 108     // |    thread     |
 109     // +---------------+
 110     // |               |
 111 
 112     // note: if the link argument position changes, adjust
 113     //       the code in frame::entry_frame_call_wrapper()
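    // For reference (illustration only; see stubRoutines.hpp for the
    // authoritative declaration): the C side enters this stub through a
    // function pointer of roughly this shape, which is what the Argument
    // slots below correspond to, one by one:
    //   typedef void (*CallStub)(address link, intptr_t* result,
    //                            BasicType result_type, Method* method,
    //                            address entry_point, intptr_t* parameters,
    //                            int size_of_parameters, TRAPS);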
 114 
 115     const Argument link           = Argument(0, false); // used only for GC
 116     const Argument result         = Argument(1, false);
 117     const Argument result_type    = Argument(2, false);
 118     const Argument method         = Argument(3, false);
 119     const Argument entry_point    = Argument(4, false);
 120     const Argument parameters     = Argument(5, false);
 121     const Argument parameter_size = Argument(6, false);
 122     const Argument thread         = Argument(7, false);
 123 
 124     // setup thread register
 125     __ ld_ptr(thread.as_address(), G2_thread);
 126     __ reinit_heapbase();
 127 
 128 #ifdef ASSERT
 129     // make sure we have no pending exceptions
 130     { const Register t = G3_scratch;
 131       Label L;
 132       __ ld_ptr(G2_thread, in_bytes(Thread::pending_exception_offset()), t);
 133       __ br_null_short(t, Assembler::pt, L);
 134       __ stop("StubRoutines::call_stub: entered with pending exception");
 135       __ bind(L);
 136     }
 137 #endif
 138 
 139     // create activation frame & allocate space for parameters
 140     { const Register t = G3_scratch;
 141       __ ld_ptr(parameter_size.as_address(), t);                // get parameter size (in words)
 142       __ add(t, frame::memory_parameter_word_sp_offset, t);     // add space for save area (in words)
 143       __ round_to(t, WordsPerLong);                             // make sure it is multiple of 2 (in words)
 144       __ sll(t, Interpreter::logStackElementSize, t);           // compute number of bytes
 145       __ neg(t);                                                // negate so it can be used with save
 146       __ save(SP, t, SP);                                       // setup new frame
 147     }
 148 
 149     // +---------------+ <--- sp + 0
 150     // |               |
 151     // . reg save area .
 152     // |               |
 153     // +---------------+ <--- sp + 0x40
 154     // |               |
 155     // . extra 7 slots .
 156     // |               |
 157     // +---------------+ <--- sp + 0x5c
 158     // |  empty slot   |      (only if parameter size is even)
 159     // +---------------+
 160     // |               |
 161     // .  parameters   .
 162     // |               |
 163     // +---------------+ <--- fp + 0
 164     // |               |
 165     // . reg save area .
 166     // |               |
 167     // +---------------+ <--- fp + 0x40
 168     // |               |
 169     // . extra 7 slots .
 170     // |               |
 171     // +---------------+ <--- fp + 0x5c
 172     // |  param. size  |
 173     // +---------------+ <--- fp + 0x60
 174     // |    thread     |
 175     // +---------------+
 176     // |               |
 177 
 178     // pass parameters if any
 179     BLOCK_COMMENT("pass parameters if any");
 180     { const Register src = parameters.as_in().as_register();
 181       const Register dst = Lentry_args;
 182       const Register tmp = G3_scratch;
 183       const Register cnt = G4_scratch;
 184 
 185       // test if any parameters & setup of Lentry_args
 186       Label exit;
 187       __ ld_ptr(parameter_size.as_in().as_address(), cnt);      // parameter counter
 188       __ add( FP, STACK_BIAS, dst );
 189       __ cmp_zero_and_br(Assembler::zero, cnt, exit);
 190       __ delayed()->sub(dst, BytesPerWord, dst);                 // setup Lentry_args
 191 
 192       // copy parameters if any
 193       Label loop;
 194       __ BIND(loop);
 195       // Store parameter value
 196       __ ld_ptr(src, 0, tmp);
 197       __ add(src, BytesPerWord, src);
 198       __ st_ptr(tmp, dst, 0);
 199       __ deccc(cnt);
 200       __ br(Assembler::greater, false, Assembler::pt, loop);
 201       __ delayed()->sub(dst, Interpreter::stackElementSize, dst);
 202 
 203       // done
 204       __ BIND(exit);
 205     }
 206 
 207     // setup parameters, method & call Java function
 208 #ifdef ASSERT
    // layout_activation_impl checks its notion of saved SP against
    // this register, so if this changes, update that code as well.
 211     const Register saved_SP = Lscratch;
 212     __ mov(SP, saved_SP);                               // keep track of SP before call
 213 #endif
 214 
 215     // setup parameters
 216     const Register t = G3_scratch;
 217     __ ld_ptr(parameter_size.as_in().as_address(), t); // get parameter size (in words)
 218     __ sll(t, Interpreter::logStackElementSize, t);    // compute number of bytes
 219     __ sub(FP, t, Gargs);                              // setup parameter pointer
 220     __ add( Gargs, STACK_BIAS, Gargs );                // Account for LP64 stack bias
 221     __ mov(SP, O5_savedSP);
 222 
 223 
 224     // do the call
 225     //
    // the following registers must be set up:
 227     //
 228     // G2_thread
 229     // G5_method
 230     // Gargs
 231     BLOCK_COMMENT("call Java function");
 232     __ jmpl(entry_point.as_in().as_register(), G0, O7);
 233     __ delayed()->mov(method.as_in().as_register(), G5_method);   // setup method
 234 
 235     BLOCK_COMMENT("call_stub_return_address:");
 236     return_pc = __ pc();
 237 
    // The callee, if it wasn't interpreted, can return with SP changed, so
    // we can no longer assert that SP is unchanged.
 240 
 241     // store result depending on type
 242     // (everything that is not T_OBJECT, T_LONG, T_FLOAT, or T_DOUBLE
 243     //  is treated as T_INT)
 244     { const Register addr = result     .as_in().as_register();
 245       const Register type = result_type.as_in().as_register();
 246       Label is_long, is_float, is_double, is_object, exit;
 247       __            cmp(type, T_OBJECT);  __ br(Assembler::equal, false, Assembler::pn, is_object);
 248       __ delayed()->cmp(type, T_FLOAT);   __ br(Assembler::equal, false, Assembler::pn, is_float);
 249       __ delayed()->cmp(type, T_DOUBLE);  __ br(Assembler::equal, false, Assembler::pn, is_double);
 250       __ delayed()->cmp(type, T_LONG);    __ br(Assembler::equal, false, Assembler::pn, is_long);
 251       __ delayed()->nop();
 252 
 253       // store int result
 254       __ st(O0, addr, G0);
 255 
 256       __ BIND(exit);
 257       __ ret();
 258       __ delayed()->restore();
 259 
 260       __ BIND(is_object);
 261       __ ba(exit);
 262       __ delayed()->st_ptr(O0, addr, G0);
 263 
 264       __ BIND(is_float);
 265       __ ba(exit);
 266       __ delayed()->stf(FloatRegisterImpl::S, F0, addr, G0);
 267 
 268       __ BIND(is_double);
 269       __ ba(exit);
 270       __ delayed()->stf(FloatRegisterImpl::D, F0, addr, G0);
 271 
 272       __ BIND(is_long);
 273       __ ba(exit);
 274       __ delayed()->st_long(O0, addr, G0);      // store entire long
 275      }
 276      return start;
 277   }
 278 
 279 
 280   //----------------------------------------------------------------------------------------------------
 281   // Return point for a Java call if there's an exception thrown in Java code.
 282   // The exception is caught and transformed into a pending exception stored in
 283   // JavaThread that can be tested from within the VM.
 284   //
 285   // Oexception: exception oop
 286 
 287   address generate_catch_exception() {
 288     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 289 
 290     address start = __ pc();
 291     // verify that thread corresponds
 292     __ verify_thread();
 293 
 294     const Register& temp_reg = Gtemp;
 295     Address pending_exception_addr    (G2_thread, Thread::pending_exception_offset());
 296     Address exception_file_offset_addr(G2_thread, Thread::exception_file_offset   ());
 297     Address exception_line_offset_addr(G2_thread, Thread::exception_line_offset   ());
 298 
 299     // set pending exception
 300     __ verify_oop(Oexception);
 301     __ st_ptr(Oexception, pending_exception_addr);
 302     __ set((intptr_t)__FILE__, temp_reg);
 303     __ st_ptr(temp_reg, exception_file_offset_addr);
 304     __ set((intptr_t)__LINE__, temp_reg);
 305     __ st(temp_reg, exception_line_offset_addr);
 306 
 307     // complete return to VM
 308     assert(StubRoutines::_call_stub_return_address != NULL, "must have been generated before");
 309 
 310     AddressLiteral stub_ret(StubRoutines::_call_stub_return_address);
 311     __ jump_to(stub_ret, temp_reg);
 312     __ delayed()->nop();
 313 
 314     return start;
 315   }
 316 
 317 
 318   //----------------------------------------------------------------------------------------------------
 319   // Continuation point for runtime calls returning with a pending exception
 320   // The pending exception check happened in the runtime or native call stub
 321   // The pending exception in Thread is converted into a Java-level exception
 322   //
 323   // Contract with Java-level exception handler: O0 = exception
 324   //                                             O1 = throwing pc
 325 
 326   address generate_forward_exception() {
 327     StubCodeMark mark(this, "StubRoutines", "forward_exception");
 328     address start = __ pc();
 329 
 330     // Upon entry, O7 has the return address returning into Java
 331     // (interpreted or compiled) code; i.e. the return address
 332     // becomes the throwing pc.
 333 
 334     const Register& handler_reg = Gtemp;
 335 
 336     Address exception_addr(G2_thread, Thread::pending_exception_offset());
 337 
 338 #ifdef ASSERT
 339     // make sure that this code is only executed if there is a pending exception
 340     { Label L;
 341       __ ld_ptr(exception_addr, Gtemp);
 342       __ br_notnull_short(Gtemp, Assembler::pt, L);
 343       __ stop("StubRoutines::forward exception: no pending exception (1)");
 344       __ bind(L);
 345     }
 346 #endif
 347 
 348     // compute exception handler into handler_reg
 349     __ get_thread();
 350     __ ld_ptr(exception_addr, Oexception);
 351     __ verify_oop(Oexception);
 352     __ save_frame(0);             // compensates for compiler weakness
 353     __ add(O7->after_save(), frame::pc_return_offset, Lscratch); // save the issuing PC
 354     BLOCK_COMMENT("call exception_handler_for_return_address");
 355     __ call_VM_leaf(L7_thread_cache, CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), G2_thread, Lscratch);
 356     __ mov(O0, handler_reg);
 357     __ restore();                 // compensates for compiler weakness
 358 
 359     __ ld_ptr(exception_addr, Oexception);
 360     __ add(O7, frame::pc_return_offset, Oissuing_pc); // save the issuing PC
 361 
 362 #ifdef ASSERT
 363     // make sure exception is set
 364     { Label L;
 365       __ br_notnull_short(Oexception, Assembler::pt, L);
 366       __ stop("StubRoutines::forward exception: no pending exception (2)");
 367       __ bind(L);
 368     }
 369 #endif
 370     // jump to exception handler
 371     __ jmp(handler_reg, 0);
 372     // clear pending exception
 373     __ delayed()->st_ptr(G0, exception_addr);
 374 
 375     return start;
 376   }
 377 
 378   // Safefetch stubs.
 379   void generate_safefetch(const char* name, int size, address* entry,
 380                           address* fault_pc, address* continuation_pc) {
 381     // safefetch signatures:
 382     //   int      SafeFetch32(int*      adr, int      errValue);
 383     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
 384     //
 385     // arguments:
 386     //   o0 = adr
 387     //   o1 = errValue
 388     //
 389     // result:
 390     //   o0  = *adr or errValue
 391 
 392     StubCodeMark mark(this, "StubRoutines", name);
 393 
 394     // Entry point, pc or function descriptor.
 395     __ align(CodeEntryAlignment);
 396     *entry = __ pc();
 397 
 398     __ mov(O0, G1);  // g1 = o0
 399     __ mov(O1, O0);  // o0 = o1
    // Load *adr into O0; this load may fault.
 401     *fault_pc = __ pc();
 402     switch (size) {
 403       case 4:
 404         // int32_t
 405         __ ldsw(G1, 0, O0);  // o0 = [g1]
 406         break;
 407       case 8:
 408         // int64_t
 409         __ ldx(G1, 0, O0);   // o0 = [g1]
 410         break;
 411       default:
 412         ShouldNotReachHere();
 413     }
 414 
 415     // return errValue or *adr
 416     *continuation_pc = __ pc();
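    // If the load above faults, the signal handler resumes execution at this
    // continuation point; O0 still holds errValue (moved from O1 above), so
    // the stub simply returns the default value instead of faulting.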
 417     // By convention with the trap handler we ensure there is a non-CTI
 418     // instruction in the trap shadow.
 419     __ nop();
 420     __ retl();
 421     __ delayed()->nop();
 422   }
 423 
 424   //------------------------------------------------------------------------------------------------------------------------
 425   // Continuation point for throwing of implicit exceptions that are not handled in
 426   // the current activation. Fabricates an exception oop and initiates normal
 427   // exception dispatching in this frame. Only callee-saved registers are preserved
 428   // (through the normal register window / RegisterMap handling).
 429   // If the compiler needs all registers to be preserved between the fault
 430   // point and the exception handler then it must assume responsibility for that in
 431   // AbstractCompiler::continuation_for_implicit_null_exception or
 432   // continuation_for_implicit_division_by_zero_exception. All other implicit
 433   // exceptions (e.g., NullPointerException or AbstractMethodError on entry) are
 434   // either at call sites or otherwise assume that stack unwinding will be initiated,
 435   // so caller saved registers were assumed volatile in the compiler.
 436 
 437   // Note that we generate only this stub into a RuntimeStub, because it needs to be
 438   // properly traversed and ignored during GC, so we change the meaning of the "__"
 439   // macro within this method.
 440 #undef __
 441 #define __ masm->
 442 
 443   address generate_throw_exception(const char* name, address runtime_entry,
 444                                    Register arg1 = noreg, Register arg2 = noreg) {
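    // Overview: the generated stub saves a frame, calls
    // runtime_entry(thread, arg1, arg2) to create and post the pending
    // exception, and then branches to StubRoutines::forward_exception_entry()
    // (restoring its frame in the delay slot) so the exception is dispatched
    // to the caller's handler with O7 as the issuing PC.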
 445 #ifdef ASSERT
 446     int insts_size = VerifyThread ? 1 * K : 600;
 447 #else
 448     int insts_size = VerifyThread ? 1 * K : 256;
 449 #endif /* ASSERT */
 450     int locs_size  = 32;
 451 
 452     CodeBuffer      code(name, insts_size, locs_size);
 453     MacroAssembler* masm = new MacroAssembler(&code);
 454 
 455     __ verify_thread();
 456 
 457     // This is an inlined and slightly modified version of call_VM
 458     // which has the ability to fetch the return PC out of thread-local storage
 459     __ assert_not_delayed();
 460 
 461     // Note that we always push a frame because on the SPARC
 462     // architecture, for all of our implicit exception kinds at call
 463     // sites, the implicit exception is taken before the callee frame
 464     // is pushed.
 465     __ save_frame(0);
 466 
 467     int frame_complete = __ offset();
 468 
 469     // Note that we always have a runtime stub frame on the top of stack by this point
 470     Register last_java_sp = SP;
 471     // 64-bit last_java_sp is biased!
 472     __ set_last_Java_frame(last_java_sp, G0);
 473     if (VerifyThread)  __ mov(G2_thread, O0); // about to be smashed; pass early
 474     __ save_thread(noreg);
 475     if (arg1 != noreg) {
 476       assert(arg2 != O1, "clobbered");
 477       __ mov(arg1, O1);
 478     }
 479     if (arg2 != noreg) {
 480       __ mov(arg2, O2);
 481     }
 482     // do the call
 483     BLOCK_COMMENT("call runtime_entry");
 484     __ call(runtime_entry, relocInfo::runtime_call_type);
 485     if (!VerifyThread)
 486       __ delayed()->mov(G2_thread, O0);  // pass thread as first argument
 487     else
 488       __ delayed()->nop();             // (thread already passed)
 489     __ restore_thread(noreg);
 490     __ reset_last_Java_frame();
 491 
 492     // check for pending exceptions. use Gtemp as scratch register.
 493 #ifdef ASSERT
 494     Label L;
 495 
 496     Address exception_addr(G2_thread, Thread::pending_exception_offset());
 497     Register scratch_reg = Gtemp;
 498     __ ld_ptr(exception_addr, scratch_reg);
 499     __ br_notnull_short(scratch_reg, Assembler::pt, L);
 500     __ should_not_reach_here();
 501     __ bind(L);
 502 #endif // ASSERT
 503     BLOCK_COMMENT("call forward_exception_entry");
 504     __ call(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type);
 505     // we use O7 linkage so that forward_exception_entry has the issuing PC
 506     __ delayed()->restore();
 507 
 508     RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, masm->total_frame_size_in_bytes(0), NULL, false);
 509     return stub->entry_point();
 510   }
 511 
 512 #undef __
 513 #define __ _masm->
 514 
 515 
 516   // Generate a routine that sets all the registers so we
 517   // can tell if the stop routine prints them correctly.
 518   address generate_test_stop() {
 519     StubCodeMark mark(this, "StubRoutines", "test_stop");
 520     address start = __ pc();
 521 
 522     int i;
 523 
 524     __ save_frame(0);
 525 
 526     static jfloat zero = 0.0, one = 1.0;
 527 
 528     // put addr in L0, then load through L0 to F0
 529     __ set((intptr_t)&zero, L0);  __ ldf( FloatRegisterImpl::S, L0, 0, F0);
 530     __ set((intptr_t)&one,  L0);  __ ldf( FloatRegisterImpl::S, L0, 0, F1); // 1.0 to F1
 531 
 532     // use add to put 2..18 in F2..F18
 533     for ( i = 2;  i <= 18;  ++i ) {
 534       __ fadd( FloatRegisterImpl::S, F1, as_FloatRegister(i-1),  as_FloatRegister(i));
 535     }
 536 
 537     // Now put double 2 in F16, double 18 in F18
 538     __ ftof( FloatRegisterImpl::S, FloatRegisterImpl::D, F2, F16 );
 539     __ ftof( FloatRegisterImpl::S, FloatRegisterImpl::D, F18, F18 );
 540 
 541     // use add to put 20..32 in F20..F32
 542     for (i = 20; i < 32; i += 2) {
 543       __ fadd( FloatRegisterImpl::D, F16, as_FloatRegister(i-2),  as_FloatRegister(i));
 544     }
 545 
 546     // put 0..7 in i's, 8..15 in l's, 16..23 in o's, 24..31 in g's
 547     for ( i = 0; i < 8; ++i ) {
 548       if (i < 6) {
 549         __ set(     i, as_iRegister(i));
 550         __ set(16 + i, as_oRegister(i));
 551         __ set(24 + i, as_gRegister(i));
 552       }
 553       __ set( 8 + i, as_lRegister(i));
 554     }
 555 
 556     __ stop("testing stop");
 557 
 558 
 559     __ ret();
 560     __ delayed()->restore();
 561 
 562     return start;
 563   }
 564 
 565 
 566   address generate_stop_subroutine() {
 567     StubCodeMark mark(this, "StubRoutines", "stop_subroutine");
 568     address start = __ pc();
 569 
 570     __ stop_subroutine();
 571 
 572     return start;
 573   }
 574 
 575   address generate_flush_callers_register_windows() {
 576     StubCodeMark mark(this, "StubRoutines", "flush_callers_register_windows");
 577     address start = __ pc();
 578 
 579     __ flushw();
 580     __ retl(false);
 581     __ delayed()->add( FP, STACK_BIAS, O0 );
 582     // The returned value must be a stack pointer whose register save area
 583     // is flushed, and will stay flushed while the caller executes.
 584 
 585     return start;
 586   }
 587 
 588   // Support for jint Atomic::xchg(jint exchange_value, volatile jint* dest).
 589   //
 590   // Arguments:
 591   //
 592   //      exchange_value: O0
 593   //      dest:           O1
 594   //
 595   // Results:
 596   //
 597   //     O0: the value previously stored in dest
 598   //
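  // A rough C sketch of the CAS-based path below (illustration only; cas()
  // here stands for an atomic compare-and-swap returning whether it succeeded):
  //
  //   jint atomic_xchg(jint exchange_value /*O0*/, volatile jint* dest /*O1*/) {
  //     jint old;
  //     do { old = *dest; } while (!cas(dest, old, exchange_value));
  //     return old;
  //   }
  //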
 599   address generate_atomic_xchg() {
 600     StubCodeMark mark(this, "StubRoutines", "atomic_xchg");
 601     address start = __ pc();
 602 
 603     if (UseCASForSwap) {
 604       // Use CAS instead of swap, just in case the MP hardware
 605       // prefers to work with just one kind of synch. instruction.
 606       Label retry;
 607       __ BIND(retry);
 608       __ mov(O0, O3);       // scratch copy of exchange value
 609       __ ld(O1, 0, O2);     // observe the previous value
 610       // try to replace O2 with O3
 611       __ cas(O1, O2, O3);
 612       __ cmp_and_br_short(O2, O3, Assembler::notEqual, Assembler::pn, retry);
 613 
 614       __ retl(false);
 615       __ delayed()->mov(O2, O0);  // report previous value to caller
 616     } else {
 617       __ retl(false);
 618       __ delayed()->swap(O1, 0, O0);
 619     }
 620 
 621     return start;
 622   }
 623 
 624 
 625   // Support for jint Atomic::cmpxchg(jint exchange_value, volatile jint* dest, jint compare_value)
 626   //
 627   // Arguments:
 628   //
 629   //      exchange_value: O0
 630   //      dest:           O1
 631   //      compare_value:  O2
 632   //
 633   // Results:
 634   //
 635   //     O0: the value previously stored in dest
 636   //
 637   address generate_atomic_cmpxchg() {
 638     StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg");
 639     address start = __ pc();
 640 
 641     // cmpxchg(dest, compare_value, exchange_value)
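    // cas(rs1, rs2, rd) compares rs2 with the word at [rs1] and, if they are
    // equal, swaps rd with that word; rd always ends up holding the previous
    // memory value, which is exactly the required return value in O0.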
 642     __ cas(O1, O2, O0);
 643     __ retl(false);
 644     __ delayed()->nop();
 645 
 646     return start;
 647   }
 648 
 649   // Support for jlong Atomic::cmpxchg(jlong exchange_value, volatile jlong *dest, jlong compare_value)
 650   //
 651   // Arguments:
 652   //
 653   //      exchange_value: O1:O0
 654   //      dest:           O2
 655   //      compare_value:  O4:O3
 656   //
 657   // Results:
 658   //
 659   //     O1:O0: the value previously stored in dest
 660   //
 661   // Overwrites: G1,G2,G3
 662   //
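  // Packing example (illustration only): the word in O0 supplies the high
  // 32 bits and the word in O1 the low 32 bits, so with O0 = 0x00000001 and
  // O1 = 0x00000002 the sllx/srl/or3 sequence below leaves 0x0000000100000002
  // in O0; compare_value is packed into O3 from O3/O4 the same way before the casx.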
 663   address generate_atomic_cmpxchg_long() {
 664     StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_long");
 665     address start = __ pc();
 666 
 667     __ sllx(O0, 32, O0);
 668     __ srl(O1, 0, O1);
    __ or3(O0,O1,O0);      // O0 holds 64-bit value from exchange_value
    __ sllx(O3, 32, O3);
    __ srl(O4, 0, O4);
    __ or3(O3,O4,O3);     // O3 holds 64-bit value from compare_value
 673     __ casx(O2, O3, O0);
 674     __ srl(O0, 0, O1);    // unpacked return value in O1:O0
 675     __ retl(false);
 676     __ delayed()->srlx(O0, 32, O0);
 677 
 678     return start;
 679   }
 680 
 681 
 682   // Support for jint Atomic::add(jint add_value, volatile jint* dest).
 683   //
 684   // Arguments:
 685   //
 686   //      add_value: O0   (e.g., +1 or -1)
 687   //      dest:      O1
 688   //
 689   // Results:
 690   //
 691   //     O0: the new value stored in dest
 692   //
 693   // Overwrites: O3
 694   //
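  // A rough C sketch of the retry loop below (illustration only; cas() stands
  // for an atomic compare-and-swap returning whether it succeeded):
  //
  //   jint atomic_add(jint add_value /*O0*/, volatile jint* dest /*O1*/) {
  //     jint old, updated;
  //     do { old = *dest; updated = old + add_value; } while (!cas(dest, old, updated));
  //     return updated;
  //   }
  //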
 695   address generate_atomic_add() {
 696     StubCodeMark mark(this, "StubRoutines", "atomic_add");
 697     address start = __ pc();
 698     __ BIND(_atomic_add_stub);
 699 
    Label retry;
 701     __ BIND(retry);
 702 
 703     __ lduw(O1, 0, O2);
 704     __ add(O0, O2, O3);
 705     __ cas(O1, O2, O3);
 706     __ cmp_and_br_short(O2, O3, Assembler::notEqual, Assembler::pn, retry);
 707     __ retl(false);
 708     __ delayed()->add(O0, O2, O0); // note that cas made O2==O3
 709 
 710     return start;
 711   }
 712   Label _atomic_add_stub;  // called from other stubs
 713 
 714 
  // Support for uint StubRoutines::Sparc::partial_subtype_check( Klass sub, Klass super );
 716   // Arguments :
 717   //
 718   //      ret  : O0, returned
 719   //      icc/xcc: set as O0 (depending on wordSize)
 720   //      sub  : O1, argument, not changed
 721   //      super: O2, argument, not changed
 722   //      raddr: O7, blown by call
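  //
  // The result is reported both as a value in O0 and in the condition codes
  // (Z for a hit, NZ for a miss, via the addcc instructions below), so a
  // compiled caller can branch on icc/xcc directly without another compare.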
 723   address generate_partial_subtype_check() {
 724     __ align(CodeEntryAlignment);
 725     StubCodeMark mark(this, "StubRoutines", "partial_subtype_check");
 726     address start = __ pc();
 727     Label miss;
 728 
 729     __ save_frame(0);
 730     Register Rret   = I0;
 731     Register Rsub   = I1;
 732     Register Rsuper = I2;
 733 
 734     Register L0_ary_len = L0;
 735     Register L1_ary_ptr = L1;
 736     Register L2_super   = L2;
 737     Register L3_index   = L3;
 738 
 739     __ check_klass_subtype_slow_path(Rsub, Rsuper,
 740                                      L0, L1, L2, L3,
 741                                      NULL, &miss);
 742 
 743     // Match falls through here.
 744     __ addcc(G0,0,Rret);        // set Z flags, Z result
 745 
 746     __ ret();                   // Result in Rret is zero; flags set to Z
 747     __ delayed()->restore();
 748 
 749     __ BIND(miss);
 750     __ addcc(G0,1,Rret);        // set NZ flags, NZ result
 751 
 752     __ ret();                   // Result in Rret is != 0; flags set to NZ
 753     __ delayed()->restore();
 754 
 755     return start;
 756   }
 757 
 758 
 759   // Called from MacroAssembler::verify_oop
 760   //
 761   address generate_verify_oop_subroutine() {
 762     StubCodeMark mark(this, "StubRoutines", "verify_oop_stub");
 763 
 764     address start = __ pc();
 765 
 766     __ verify_oop_subroutine();
 767 
 768     return start;
 769   }
 770 
 771 
 772   //
  // Verify that a register contains a clean 32-bit positive value
  // (high 32 bits are 0) so it can be used in 64-bit shifts (sllx, srax).
  //
  //  Input:
  //    Rint  -  32-bit value
  //    Rtmp  -  scratch
 779   //
 780   void assert_clean_int(Register Rint, Register Rtmp) {
 781   #if defined(ASSERT)
 782     __ signx(Rint, Rtmp);
 783     __ cmp(Rint, Rtmp);
 784     __ breakpoint_trap(Assembler::notEqual, Assembler::xcc);
 785   #endif
 786   }
 787 
 788   //
 789   //  Generate overlap test for array copy stubs
 790   //
 791   //  Input:
 792   //    O0    -  array1
 793   //    O1    -  array2
 794   //    O2    -  element count
 795   //
 796   //  Kills temps:  O3, O4
 797   //
 798   void array_overlap_test(address no_overlap_target, int log2_elem_size) {
 799     assert(no_overlap_target != NULL, "must be generated");
 800     array_overlap_test(no_overlap_target, NULL, log2_elem_size);
 801   }
 802   void array_overlap_test(Label& L_no_overlap, int log2_elem_size) {
 803     array_overlap_test(NULL, &L_no_overlap, log2_elem_size);
 804   }
 805   void array_overlap_test(address no_overlap_target, Label* NOLp, int log2_elem_size) {
 806     const Register from       = O0;
 807     const Register to         = O1;
 808     const Register count      = O2;
 809     const Register to_from    = O3; // to - from
 810     const Register byte_count = O4; // count << log2_elem_size
 811 
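      // A forward (disjoint-style) copy is safe when the destination does not
      // overlap the source region ahead of it, i.e. when to <= from or when
      // to - from >= count << log2_elem_size; the two branches below take the
      // no-overlap path in exactly those cases.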
 812       __ subcc(to, from, to_from);
 813       __ sll_ptr(count, log2_elem_size, byte_count);
 814       if (NOLp == NULL)
 815         __ brx(Assembler::lessEqualUnsigned, false, Assembler::pt, no_overlap_target);
 816       else
 817         __ brx(Assembler::lessEqualUnsigned, false, Assembler::pt, (*NOLp));
 818       __ delayed()->cmp(to_from, byte_count);
 819       if (NOLp == NULL)
 820         __ brx(Assembler::greaterEqualUnsigned, false, Assembler::pt, no_overlap_target);
 821       else
 822         __ brx(Assembler::greaterEqualUnsigned, false, Assembler::pt, (*NOLp));
 823       __ delayed()->nop();
 824   }
 825 
 826   // Generate code for an array load barrier
 827   //
 828   //     addr    -  starting address
 829   //     count   -  element count
 830   //
 831   //     Destroy no registers!
 832   //
 833   void gen_load_ref_array_barrier(Register addr, Register count) {
 834     BarrierSet* bs = Universe::heap()->barrier_set();
 835     switch (bs->kind()) {
 836       case BarrierSet::Z:
 837         __ save_frame_and_mov(0, addr, O0, count, O1);
 838         // Save the necessary global regs... will be used after.
 839         __ call(CAST_FROM_FN_PTR(address, static_cast<void (*)(volatile oop*, size_t)>(ZBarrier::load_barrier_on_oop_array)));
 840         __ delayed()->nop();
 841         __ restore();
 842         break;
 843       case BarrierSet::G1BarrierSet:
 844       case BarrierSet::CardTableModRef:
 845         // No barrier
 846         break;
 847       default:
 848         ShouldNotReachHere();
 849         break;
 850     }
 851   }
 852 
 853   //
 854   //  Generate pre-write barrier for array.
 855   //
 856   //  Input:
 857   //     addr     - register containing starting address
 858   //     count    - register containing element count
 859   //     tmp      - scratch register
 860   //
 861   //  The input registers are overwritten.
 862   //
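  //  For G1 this enqueues the oops that are about to be overwritten (the SATB
  //  snapshot) by calling BarrierSet::static_write_ref_array_pre, unless
  //  concurrent marking is inactive or the destination is known to be
  //  uninitialized; the other barrier sets handled here need no pre-barrier.
  //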
 863   void gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
 864     BarrierSet* bs = Universe::heap()->barrier_set();
 865     switch (bs->kind()) {
 866       case BarrierSet::G1BarrierSet:
        // With G1, don't generate the call if we statically know that the target is uninitialized
 868         if (!dest_uninitialized) {
 869           Register tmp = O5;
 870           assert_different_registers(addr, count, tmp);
 871           Label filtered;
 872           // Is marking active?
 873           if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) {
 874             __ ld(G2, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), tmp);
 875           } else {
 876             guarantee(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1,
 877                       "Assumption");
 878             __ ldsb(G2, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), tmp);
 879           }
 880           // Is marking active?
 881           __ cmp_and_br_short(tmp, G0, Assembler::equal, Assembler::pt, filtered);
 882 
 883           __ save_frame(0);
 884           // Save the necessary global regs... will be used after.
 885           if (addr->is_global()) {
 886             __ mov(addr, L0);
 887           }
 888           if (count->is_global()) {
 889             __ mov(count, L1);
 890           }
 891           __ mov(addr->after_save(), O0);
 892           // Get the count into O1
 893           __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre));
 894           __ delayed()->mov(count->after_save(), O1);
 895           if (addr->is_global()) {
 896             __ mov(L0, addr);
 897           }
 898           if (count->is_global()) {
 899             __ mov(L1, count);
 900           }
 901           __ restore();
 902 
 903           __ bind(filtered);
 904           DEBUG_ONLY(__ set(0xDEADC0DE, tmp);) // we have killed tmp
 905         }
 906         break;
 907       case BarrierSet::CardTableModRef:
 908       case BarrierSet::Z:
 909         break;
 910       default:
 911         ShouldNotReachHere();
 912     }
 913   }
 914   //
 915   //  Generate post-write barrier for array.
 916   //
 917   //  Input:
 918   //     addr     - register containing starting address
 919   //     count    - register containing element count
 920   //     tmp      - scratch register
 921   //
 922   //  The input registers are overwritten.
 923   //
 924   void gen_write_ref_array_post_barrier(Register addr, Register count,
 925                                         Register tmp) {
 926     BarrierSet* bs = Universe::heap()->barrier_set();
 927 
 928     switch (bs->kind()) {
 929       case BarrierSet::G1BarrierSet:
 930         {
 931           // Get some new fresh output registers.
 932           __ save_frame(0);
 933           __ mov(addr->after_save(), O0);
 934           __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post));
 935           __ delayed()->mov(count->after_save(), O1);
 936           __ restore();
 937         }
 938         break;
 939       case BarrierSet::CardTableModRef:
 940         {
 941           CardTableModRefBS* ctbs = barrier_set_cast<CardTableModRefBS>(bs);
 942           CardTable* ct = ctbs->card_table();
 943           assert(sizeof(*ct->byte_map_base()) == sizeof(jbyte), "adjust this code");
 944           assert_different_registers(addr, count, tmp);
 945 
 946           Label L_loop, L_done;
 947 
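          // The code below converts [addr, addr + count * heapOopSize) into an
          // inclusive range of card indices: addr and the address of the last
          // oop are both shifted right by card_shift, and count becomes the
          // number of card bytes to dirty minus one; the loop then stores a
          // zero ("dirty") byte at byte_map_base (in tmp) plus each card index.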
 948           __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_done); // zero count - nothing to do
 949 
 950           __ sll_ptr(count, LogBytesPerHeapOop, count);
 951           __ sub(count, BytesPerHeapOop, count);
 952           __ add(count, addr, count);
 953           // Use two shifts to clear out those low order two bits! (Cannot opt. into 1.)
 954           __ srl_ptr(addr, CardTable::card_shift, addr);
 955           __ srl_ptr(count, CardTable::card_shift, count);
 956           __ sub(count, addr, count);
 957           AddressLiteral rs(ct->byte_map_base());
 958           __ set(rs, tmp);
 959         __ BIND(L_loop);
 960           __ stb(G0, tmp, addr);
 961           __ subcc(count, 1, count);
 962           __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
 963           __ delayed()->add(addr, 1, addr);
 964         __ BIND(L_done);
 965         }
 966         break;
 967       case BarrierSet::ModRef:
 968       case BarrierSet::Z:
 969         break;
 970       default:
 971         ShouldNotReachHere();
 972     }
 973   }
 974 
 975   //
 976   // Generate main code for disjoint arraycopy
 977   //
 978   typedef void (StubGenerator::*CopyLoopFunc)(Register from, Register to, Register count, int count_dec,
 979                                               Label& L_loop, bool use_prefetch, bool use_bis);
 980 
 981   void disjoint_copy_core(Register from, Register to, Register count, int log2_elem_size,
 982                           int iter_size, StubGenerator::CopyLoopFunc copy_loop_func) {
 983     Label L_copy;
 984 
 985     assert(log2_elem_size <= 3, "the following code should be changed");
 986     int count_dec = 16>>log2_elem_size;
 987 
 988     int prefetch_dist = MAX2(ArraycopySrcPrefetchDistance, ArraycopyDstPrefetchDistance);
 989     assert(prefetch_dist < 4096, "invalid value");
 990     prefetch_dist = (prefetch_dist + (iter_size-1)) & (-iter_size); // round up to one iteration copy size
 991     int prefetch_count = (prefetch_dist >> log2_elem_size); // elements count
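    // Worked example (illustration only, with assumed flag values): with
    // log2_elem_size = 0, iter_size = 16 and a prefetch distance of 100 bytes,
    // prefetch_dist is rounded up to 112 (the next multiple of 16) and
    // prefetch_count is 112 elements; the prefetching loops below are used
    // only when 'count' is at least that large.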
 992 
 993     if (UseBlockCopy) {
 994       Label L_block_copy, L_block_copy_prefetch, L_skip_block_copy;
 995 
 996       // 64 bytes tail + bytes copied in one loop iteration
 997       int tail_size = 64 + iter_size;
 998       int block_copy_count = (MAX2(tail_size, (int)BlockCopyLowLimit)) >> log2_elem_size;
 999       // Use BIS copy only for big arrays since it requires membar.
1000       __ set(block_copy_count, O4);
1001       __ cmp_and_br_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_skip_block_copy);
1002       // This code is for disjoint source and destination:
1003       //   to <= from || to >= from+count
1004       // but BIS will stomp over 'from' if (to > from-tail_size && to <= from)
1005       __ sub(from, to, O4);
      __ srax(O4, 4, O4); // divide by 16 since the following short branch has only 5 bits for the immediate.
1007       __ cmp_and_br_short(O4, (tail_size>>4), Assembler::lessEqualUnsigned, Assembler::pn, L_skip_block_copy);
1008 
1009       __ wrasi(G0, Assembler::ASI_ST_BLKINIT_PRIMARY);
1010       // BIS should not be used to copy tail (64 bytes+iter_size)
1011       // to avoid zeroing of following values.
1012       __ sub(count, (tail_size>>log2_elem_size), count); // count is still positive >= 0
1013 
1014       if (prefetch_count > 0) { // rounded up to one iteration count
1015         // Do prefetching only if copy size is bigger
1016         // than prefetch distance.
1017         __ set(prefetch_count, O4);
1018         __ cmp_and_brx_short(count, O4, Assembler::less, Assembler::pt, L_block_copy);
1019         __ sub(count, O4, count);
1020 
1021         (this->*copy_loop_func)(from, to, count, count_dec, L_block_copy_prefetch, true, true);
1022         __ set(prefetch_count, O4);
1023         __ add(count, O4, count);
1024 
1025       } // prefetch_count > 0
1026 
1027       (this->*copy_loop_func)(from, to, count, count_dec, L_block_copy, false, true);
1028       __ add(count, (tail_size>>log2_elem_size), count); // restore count
1029 
1030       __ wrasi(G0, Assembler::ASI_PRIMARY_NOFAULT);
1031       // BIS needs membar.
1032       __ membar(Assembler::StoreLoad);
1033       // Copy tail
1034       __ ba_short(L_copy);
1035 
1036       __ BIND(L_skip_block_copy);
1037     } // UseBlockCopy
1038 
1039     if (prefetch_count > 0) { // rounded up to one iteration count
1040       // Do prefetching only if copy size is bigger
1041       // than prefetch distance.
1042       __ set(prefetch_count, O4);
1043       __ cmp_and_brx_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_copy);
1044       __ sub(count, O4, count);
1045 
1046       Label L_copy_prefetch;
1047       (this->*copy_loop_func)(from, to, count, count_dec, L_copy_prefetch, true, false);
1048       __ set(prefetch_count, O4);
1049       __ add(count, O4, count);
1050 
1051     } // prefetch_count > 0
1052 
1053     (this->*copy_loop_func)(from, to, count, count_dec, L_copy, false, false);
1054   }
1055 
1056 
1057 
1058   //
1059   // Helper methods for copy_16_bytes_forward_with_shift()
1060   //
1061   void copy_16_bytes_shift_loop(Register from, Register to, Register count, int count_dec,
1062                                 Label& L_loop, bool use_prefetch, bool use_bis) {
1063 
1064     const Register left_shift  = G1; // left  shift bit counter
1065     const Register right_shift = G5; // right shift bit counter
1066 
1067     __ align(OptoLoopAlignment);
1068     __ BIND(L_loop);
1069     if (use_prefetch) {
1070       if (ArraycopySrcPrefetchDistance > 0) {
1071         __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads);
1072       }
1073       if (ArraycopyDstPrefetchDistance > 0) {
1074         __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads);
1075       }
1076     }
1077     __ ldx(from, 0, O4);
1078     __ ldx(from, 8, G4);
1079     __ inc(to, 16);
1080     __ inc(from, 16);
1081     __ deccc(count, count_dec); // Can we do next iteration after this one?
1082     __ srlx(O4, right_shift, G3);
1083     __ bset(G3, O3);
1084     __ sllx(O4, left_shift,  O4);
1085     __ srlx(G4, right_shift, G3);
1086     __ bset(G3, O4);
1087     if (use_bis) {
1088       __ stxa(O3, to, -16);
1089       __ stxa(O4, to, -8);
1090     } else {
1091       __ stx(O3, to, -16);
1092       __ stx(O4, to, -8);
1093     }
1094     __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
1095     __ delayed()->sllx(G4, left_shift,  O3);
1096   }
1097 
1098   // Copy big chunks forward with shift
1099   //
1100   // Inputs:
  //   from      - source array address
1102   //   to        - destination array aligned to 8-bytes
1103   //   count     - elements count to copy >= the count equivalent to 16 bytes
1104   //   count_dec - elements count's decrement equivalent to 16 bytes
1105   //   L_copy_bytes - copy exit label
1106   //
1107   void copy_16_bytes_forward_with_shift(Register from, Register to,
1108                      Register count, int log2_elem_size, Label& L_copy_bytes) {
1109     Label L_aligned_copy, L_copy_last_bytes;
1110     assert(log2_elem_size <= 3, "the following code should be changed");
1111     int count_dec = 16>>log2_elem_size;
1112 
1113     // if both arrays have the same alignment mod 8, do 8 bytes aligned copy
1114     __ andcc(from, 7, G1); // misaligned bytes
1115     __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
1116     __ delayed()->nop();
1117 
1118     const Register left_shift  = G1; // left  shift bit counter
1119     const Register right_shift = G5; // right shift bit counter
1120 
1121     __ sll(G1, LogBitsPerByte, left_shift);
1122     __ mov(64, right_shift);
1123     __ sub(right_shift, left_shift, right_shift);
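    // Example (illustration only): if 'from' is misaligned by 3 bytes,
    // left_shift is 24 and right_shift is 40; each aligned 8-byte store is
    // then assembled from the tail of one aligned doubleword and the head of
    // the next, lined up by the sllx/srlx pairs in the copy loop.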
1124 
1125     //
1126     // Load 2 aligned 8-bytes chunks and use one from previous iteration
1127     // to form 2 aligned 8-bytes chunks to store.
1128     //
1129     __ dec(count, count_dec);   // Pre-decrement 'count'
1130     __ andn(from, 7, from);     // Align address
1131     __ ldx(from, 0, O3);
1132     __ inc(from, 8);
1133     __ sllx(O3, left_shift,  O3);
1134 
1135     disjoint_copy_core(from, to, count, log2_elem_size, 16, &StubGenerator::copy_16_bytes_shift_loop);
1136 
1137     __ inccc(count, count_dec>>1 ); // + 8 bytes
1138     __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
1139     __ delayed()->inc(count, count_dec>>1); // restore 'count'
1140 
1141     // copy 8 bytes, part of them already loaded in O3
1142     __ ldx(from, 0, O4);
1143     __ inc(to, 8);
1144     __ inc(from, 8);
1145     __ srlx(O4, right_shift, G3);
1146     __ bset(O3, G3);
1147     __ stx(G3, to, -8);
1148 
1149     __ BIND(L_copy_last_bytes);
1150     __ srl(right_shift, LogBitsPerByte, right_shift); // misaligned bytes
1151     __ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
1152     __ delayed()->sub(from, right_shift, from);       // restore address
1153 
1154     __ BIND(L_aligned_copy);
1155   }
1156 
1157   // Copy big chunks backward with shift
1158   //
1159   // Inputs:
  //   end_from  - source array end address
1161   //   end_to    - destination array end address aligned to 8-bytes
1162   //   count     - elements count to copy >= the count equivalent to 16 bytes
1163   //   count_dec - elements count's decrement equivalent to 16 bytes
1164   //   L_aligned_copy - aligned copy exit label
1165   //   L_copy_bytes   - copy exit label
1166   //
1167   void copy_16_bytes_backward_with_shift(Register end_from, Register end_to,
1168                      Register count, int count_dec,
1169                      Label& L_aligned_copy, Label& L_copy_bytes) {
1170     Label L_loop, L_copy_last_bytes;
1171 
1172     // if both arrays have the same alignment mod 8, do 8 bytes aligned copy
1173       __ andcc(end_from, 7, G1); // misaligned bytes
1174       __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
1175       __ delayed()->deccc(count, count_dec); // Pre-decrement 'count'
1176 
1177     const Register left_shift  = G1; // left  shift bit counter
1178     const Register right_shift = G5; // right shift bit counter
1179 
1180       __ sll(G1, LogBitsPerByte, left_shift);
1181       __ mov(64, right_shift);
1182       __ sub(right_shift, left_shift, right_shift);
1183 
1184     //
1185     // Load 2 aligned 8-bytes chunks and use one from previous iteration
1186     // to form 2 aligned 8-bytes chunks to store.
1187     //
1188       __ andn(end_from, 7, end_from);     // Align address
1189       __ ldx(end_from, 0, O3);
1190       __ align(OptoLoopAlignment);
1191     __ BIND(L_loop);
1192       __ ldx(end_from, -8, O4);
1193       __ deccc(count, count_dec); // Can we do next iteration after this one?
1194       __ ldx(end_from, -16, G4);
1195       __ dec(end_to, 16);
1196       __ dec(end_from, 16);
1197       __ srlx(O3, right_shift, O3);
1198       __ sllx(O4, left_shift,  G3);
1199       __ bset(G3, O3);
1200       __ stx(O3, end_to, 8);
1201       __ srlx(O4, right_shift, O4);
1202       __ sllx(G4, left_shift,  G3);
1203       __ bset(G3, O4);
1204       __ stx(O4, end_to, 0);
1205       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
1206       __ delayed()->mov(G4, O3);
1207 
1208       __ inccc(count, count_dec>>1 ); // + 8 bytes
1209       __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
1210       __ delayed()->inc(count, count_dec>>1); // restore 'count'
1211 
1212       // copy 8 bytes, part of them already loaded in O3
1213       __ ldx(end_from, -8, O4);
1214       __ dec(end_to, 8);
1215       __ dec(end_from, 8);
1216       __ srlx(O3, right_shift, O3);
1217       __ sllx(O4, left_shift,  G3);
1218       __ bset(O3, G3);
1219       __ stx(G3, end_to, 0);
1220 
1221     __ BIND(L_copy_last_bytes);
1222       __ srl(left_shift, LogBitsPerByte, left_shift);    // misaligned bytes
1223       __ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
1224       __ delayed()->add(end_from, left_shift, end_from); // restore address
1225   }
1226 
1227   //
1228   //  Generate stub for disjoint byte copy.  If "aligned" is true, the
1229   //  "from" and "to" addresses are assumed to be heapword aligned.
1230   //
1231   // Arguments for generated stub:
1232   //      from:  O0
1233   //      to:    O1
1234   //      count: O2 treated as signed
1235   //
1236   address generate_disjoint_byte_copy(bool aligned, address *entry, const char *name) {
1237     __ align(CodeEntryAlignment);
1238     StubCodeMark mark(this, "StubRoutines", name);
1239     address start = __ pc();
1240 
1241     Label L_skip_alignment, L_align;
1242     Label L_copy_byte, L_copy_byte_loop, L_exit;
1243 
1244     const Register from      = O0;   // source array address
1245     const Register to        = O1;   // destination array address
1246     const Register count     = O2;   // elements count
1247     const Register offset    = O5;   // offset from start of arrays
1248     // O3, O4, G3, G4 are used as temp registers
1249 
1250     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
1251 
1252     if (entry != NULL) {
1253       *entry = __ pc();
1254       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1255       BLOCK_COMMENT("Entry:");
1256     }
1257 
1258     // for short arrays, just do single element copy
1259     __ cmp(count, 23); // 16 + 7
1260     __ brx(Assembler::less, false, Assembler::pn, L_copy_byte);
1261     __ delayed()->mov(G0, offset);
1262 
1263     if (aligned) {
1264       // 'aligned' == true when it is known statically during compilation
1265       // of this arraycopy call site that both 'from' and 'to' addresses
1266       // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
1267       //
      // Aligned arrays have 4-byte alignment in the 32-bit VM
      // and 8-byte alignment in the 64-bit VM, so extra alignment
      // work is needed only in the 32-bit VM (nothing to do here).
      //
1271     } else {
1272       // copy bytes to align 'to' on 8 byte boundary
1273       __ andcc(to, 7, G1); // misaligned bytes
1274       __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1275       __ delayed()->neg(G1);
1276       __ inc(G1, 8);       // bytes need to copy to next 8-bytes alignment
1277       __ sub(count, G1, count);
1278     __ BIND(L_align);
1279       __ ldub(from, 0, O3);
1280       __ deccc(G1);
1281       __ inc(from);
1282       __ stb(O3, to, 0);
1283       __ br(Assembler::notZero, false, Assembler::pt, L_align);
1284       __ delayed()->inc(to);
1285     __ BIND(L_skip_alignment);
1286     }
1287     if (!aligned) {
1288       // Copy with shift 16 bytes per iteration if arrays do not have
1289       // the same alignment mod 8, otherwise fall through to the next
1290       // code for aligned copy.
      // The compare above (count >= 23) guarantees 'count' >= 16 bytes.
      // Also jump over aligned copy after the copy with shift has completed.
1293 
1294       copy_16_bytes_forward_with_shift(from, to, count, 0, L_copy_byte);
1295     }
1296 
    // Both arrays are 8-byte aligned; copy 16 bytes at a time
1298       __ and3(count, 7, G4); // Save count
1299       __ srl(count, 3, count);
1300      generate_disjoint_long_copy_core(aligned);
1301       __ mov(G4, count);     // Restore count
1302 
    // copy trailing bytes
1304     __ BIND(L_copy_byte);
1305       __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
1306       __ align(OptoLoopAlignment);
1307     __ BIND(L_copy_byte_loop);
1308       __ ldub(from, offset, O3);
1309       __ deccc(count);
1310       __ stb(O3, to, offset);
1311       __ brx(Assembler::notZero, false, Assembler::pt, L_copy_byte_loop);
1312       __ delayed()->inc(offset);
1313 
1314     __ BIND(L_exit);
1315       // O3, O4 are used as temp registers
1316       inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr, O3, O4);
1317       __ retl();
1318       __ delayed()->mov(G0, O0); // return 0
1319     return start;
1320   }
1321 
1322   //
1323   //  Generate stub for conjoint byte copy.  If "aligned" is true, the
1324   //  "from" and "to" addresses are assumed to be heapword aligned.
1325   //
1326   // Arguments for generated stub:
1327   //      from:  O0
1328   //      to:    O1
1329   //      count: O2 treated as signed
1330   //
1331   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1332                                       address *entry, const char *name) {
1333     // Do reverse copy.
1334 
1335     __ align(CodeEntryAlignment);
1336     StubCodeMark mark(this, "StubRoutines", name);
1337     address start = __ pc();
1338 
1339     Label L_skip_alignment, L_align, L_aligned_copy;
1340     Label L_copy_byte, L_copy_byte_loop, L_exit;
1341 
1342     const Register from      = O0;   // source array address
1343     const Register to        = O1;   // destination array address
1344     const Register count     = O2;   // elements count
1345     const Register end_from  = from; // source array end address
1346     const Register end_to    = to;   // destination array end address
1347 
1348     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
1349 
1350     if (entry != NULL) {
1351       *entry = __ pc();
1352       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1353       BLOCK_COMMENT("Entry:");
1354     }
1355 
1356     array_overlap_test(nooverlap_target, 0);
1357 
1358     __ add(to, count, end_to);       // offset after last copied element
1359 
1360     // for short arrays, just do single element copy
1361     __ cmp(count, 23); // 16 + 7
1362     __ brx(Assembler::less, false, Assembler::pn, L_copy_byte);
1363     __ delayed()->add(from, count, end_from);
1364 
1365     {
      // Align the ends of the arrays since they may be unaligned even
      // when the arrays themselves are aligned.
1368 
1369       // copy bytes to align 'end_to' on 8 byte boundary
1370       __ andcc(end_to, 7, G1); // misaligned bytes
1371       __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1372       __ delayed()->nop();
1373       __ sub(count, G1, count);
1374     __ BIND(L_align);
1375       __ dec(end_from);
1376       __ dec(end_to);
1377       __ ldub(end_from, 0, O3);
1378       __ deccc(G1);
1379       __ brx(Assembler::notZero, false, Assembler::pt, L_align);
1380       __ delayed()->stb(O3, end_to, 0);
1381     __ BIND(L_skip_alignment);
1382     }
1383     if (aligned) {
1384       // Both arrays are aligned to 8-bytes in 64-bits VM.
1385       // The 'count' is decremented in copy_16_bytes_backward_with_shift()
1386       // in unaligned case.
1387       __ dec(count, 16);
1388     } else {
1389       // Copy with shift 16 bytes per iteration if arrays do not have
1390       // the same alignment mod 8, otherwise jump to the next
      // code for aligned copy (and subtracting 16 from 'count' before the jump).
      // The compare above (count >= 23) guarantees 'count' >= 16 bytes.
      // Also jump over aligned copy after the copy with shift has completed.
1394 
1395       copy_16_bytes_backward_with_shift(end_from, end_to, count, 16,
1396                                         L_aligned_copy, L_copy_byte);
1397     }
1398     // copy 4 elements (16 bytes) at a time
1399       __ align(OptoLoopAlignment);
1400     __ BIND(L_aligned_copy);
1401       __ dec(end_from, 16);
1402       __ ldx(end_from, 8, O3);
1403       __ ldx(end_from, 0, O4);
1404       __ dec(end_to, 16);
1405       __ deccc(count, 16);
1406       __ stx(O3, end_to, 8);
1407       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
1408       __ delayed()->stx(O4, end_to, 0);
1409       __ inc(count, 16);
1410 
    // copy 1 element (1 byte) at a time
1412     __ BIND(L_copy_byte);
1413       __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
1414       __ align(OptoLoopAlignment);
1415     __ BIND(L_copy_byte_loop);
1416       __ dec(end_from);
1417       __ dec(end_to);
1418       __ ldub(end_from, 0, O4);
1419       __ deccc(count);
1420       __ brx(Assembler::greater, false, Assembler::pt, L_copy_byte_loop);
1421       __ delayed()->stb(O4, end_to, 0);
1422 
1423     __ BIND(L_exit);
1424     // O3, O4 are used as temp registers
1425     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr, O3, O4);
1426     __ retl();
1427     __ delayed()->mov(G0, O0); // return 0
1428     return start;
1429   }
1430 
1431   //
1432   //  Generate stub for disjoint short copy.  If "aligned" is true, the
1433   //  "from" and "to" addresses are assumed to be heapword aligned.
1434   //
1435   // Arguments for generated stub:
1436   //      from:  O0
1437   //      to:    O1
1438   //      count: O2 treated as signed
1439   //
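       //  Roughly (illustrative sketch of the flow below): if count < 11 copy
       //  element by element; otherwise copy up to 3 leading elements so that
       //  'to' becomes 8-byte aligned, then either copy 16 bytes per iteration
       //  with shifts (when 'from' and 'to' differ in alignment mod 8) or hand
       //  (count / 4) 8-byte words to the long-copy core, and finally copy the
       //  remaining 0..3 elements one at a time.
       //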
1440   address generate_disjoint_short_copy(bool aligned, address *entry, const char * name) {
1441     __ align(CodeEntryAlignment);
1442     StubCodeMark mark(this, "StubRoutines", name);
1443     address start = __ pc();
1444 
1445     Label L_skip_alignment, L_skip_alignment2;
1446     Label L_copy_2_bytes, L_copy_2_bytes_loop, L_exit;
1447 
1448     const Register from      = O0;   // source array address
1449     const Register to        = O1;   // destination array address
1450     const Register count     = O2;   // elements count
1451     const Register offset    = O5;   // offset from start of arrays
1452     // O3, O4, G3, G4 are used as temp registers
1453 
1454     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
1455 
1456     if (entry != NULL) {
1457       *entry = __ pc();
1458       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1459       BLOCK_COMMENT("Entry:");
1460     }
1461 
1462     // for short arrays, just do single element copy
1463     __ cmp(count, 11); // 8 + 3  (22 bytes)
1464     __ brx(Assembler::less, false, Assembler::pn, L_copy_2_bytes);
1465     __ delayed()->mov(G0, offset);
1466 
1467     if (aligned) {
1468       // 'aligned' == true when it is known statically during compilation
1469       // of this arraycopy call site that both 'from' and 'to' addresses
1470       // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
1471       //
1472       // Aligned arrays have 4-byte alignment in the 32-bit VM
1473       // and 8-byte alignment in the 64-bit VM.
1474       //
1475     } else {
1476       // copy 1 element if necessary to align 'to' on a 4-byte boundary
1477       __ andcc(to, 3, G0);
1478       __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1479       __ delayed()->lduh(from, 0, O3);
1480       __ inc(from, 2);
1481       __ inc(to, 2);
1482       __ dec(count);
1483       __ sth(O3, to, -2);
1484     __ BIND(L_skip_alignment);
1485 
1486       // copy 2 elements to align 'to' on an 8 byte boundary
1487       __ andcc(to, 7, G0);
1488       __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment2);
1489       __ delayed()->lduh(from, 0, O3);
1490       __ dec(count, 2);
1491       __ lduh(from, 2, O4);
1492       __ inc(from, 4);
1493       __ inc(to, 4);
1494       __ sth(O3, to, -4);
1495       __ sth(O4, to, -2);
1496     __ BIND(L_skip_alignment2);
1497     }
1498     if (!aligned) {
1499       // Copy with shift 16 bytes per iteration if arrays do not have
1500       // the same alignment mod 8, otherwise fall through to the next
1501       // code for aligned copy.
1502       // The compare above (count >= 11) guarantees 'count' >= 16 bytes.
1503       // Also jump over the aligned copy once the copy with shift has completed.
1504 
1505       copy_16_bytes_forward_with_shift(from, to, count, 1, L_copy_2_bytes);
1506     }
1507 
1508     // Both arrays are 8-byte aligned; copy 16 bytes at a time
1509       __ and3(count, 3, G4);   // save the 0..3 leftover elements
1510       __ srl(count, 2, count); // convert to a count of 8-byte words
1511       generate_disjoint_long_copy_core(aligned);
1512       __ mov(G4, count);       // restore the leftover element count
1513 
1514     // copy 1 element at a time
1515     __ BIND(L_copy_2_bytes);
1516       __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
1517       __ align(OptoLoopAlignment);
1518     __ BIND(L_copy_2_bytes_loop);
1519       __ lduh(from, offset, O3);
1520       __ deccc(count);
1521       __ sth(O3, to, offset);
1522       __ brx(Assembler::notZero, false, Assembler::pt, L_copy_2_bytes_loop);
1523       __ delayed()->inc(offset, 2);
1524 
1525     __ BIND(L_exit);
1526       // O3, O4 are used as temp registers
1527       inc_counter_np(SharedRuntime::_jshort_array_copy_ctr, O3, O4);
1528       __ retl();
1529       __ delayed()->mov(G0, O0); // return 0
1530     return start;
1531   }
1532 
1533   //
1534   //  Generate stub for array fill (byte, short or int).  If "aligned" is true, the
1535   //  "to" address is assumed to be heapword aligned.
1536   //
1537   // Arguments for generated stub:
1538   //      to:    O0
1539   //      value: O1
1540   //      count: O2 treated as signed
1541   //
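       //  Roughly (illustrative sketch of the main path below):
       //    replicate 'value' across a 64-bit register;
       //    if the fill is shorter than 8 bytes, fill element by element;
       //    align 'to' to 8 bytes, storing leading bytes/halfwords/words as needed;
       //    while (count >= 32 bytes) store 4 x 8-byte words;
       //    while (count >= 8 bytes)  store 1 x 8-byte word;
       //    store the trailing 4, 2 and 1 bytes as required by the element type;
       //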
1542   address generate_fill(BasicType t, bool aligned, const char* name) {
1543     __ align(CodeEntryAlignment);
1544     StubCodeMark mark(this, "StubRoutines", name);
1545     address start = __ pc();
1546 
1547     const Register to        = O0;   // destination array address
1548     const Register value     = O1;   // fill value
1549     const Register count     = O2;   // elements count
1550     // O3 is used as a temp register
1551 
1552     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
1553 
1554     Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
1555     Label L_fill_2_bytes, L_fill_elements, L_fill_32_bytes;
1556 
1557     int shift = -1;
1558     switch (t) {
1559       case T_BYTE:
1560         shift = 2;
1561         break;
1562       case T_SHORT:
1563         shift = 1;
1564         break;
1565       case T_INT:
1566         shift = 0;
1567         break;
1568       default: ShouldNotReachHere();
1569     }
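         // 'shift' is log2 of the number of elements per 32-bit word, so the
         // element counts below scale to fixed byte sizes: '8 << shift' elements
         // is always 32 bytes and '1 << shift' elements is always 4 bytes.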
1570 
1571     BLOCK_COMMENT("Entry:");
1572 
1573     if (t == T_BYTE) {
1574       // Zero extend value and replicate the byte into 16 bits
1575       __ and3(value, 0xff, value);
1576       __ sllx(value, 8, O3);
1577       __ or3(value, O3, value);
1578     }
1579     if (t == T_SHORT) {
1580       // Zero extend value
1581       __ sllx(value, 48, value);
1582       __ srlx(value, 48, value);
1583     }
1584     if (t == T_BYTE || t == T_SHORT) {
1585       __ sllx(value, 16, O3);
1586       __ or3(value, O3, value);
1587     }
1588 
1589     __ cmp(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
1590     __ brx(Assembler::lessUnsigned, false, Assembler::pn, L_fill_elements); // use unsigned cmp
1591     __ delayed()->andcc(count, 1, G0);
1592 
1593     if (!aligned && (t == T_BYTE || t == T_SHORT)) {
1594       // align the destination address on a 4-byte boundary
1595       if (t == T_BYTE) {
1596         // One byte misalignment happens only for byte arrays
1597         __ andcc(to, 1, G0);
1598         __ br(Assembler::zero, false, Assembler::pt, L_skip_align1);
1599         __ delayed()->nop();
1600         __ stb(value, to, 0);
1601         __ inc(to, 1);
1602         __ dec(count, 1);
1603         __ BIND(L_skip_align1);
1604       }
1605       // Two bytes misalignment happens only for byte and short (char) arrays
1606       __ andcc(to, 2, G0);
1607       __ br(Assembler::zero, false, Assembler::pt, L_skip_align2);
1608       __ delayed()->nop();
1609       __ sth(value, to, 0);
1610       __ inc(to, 2);
1611       __ dec(count, 1 << (shift - 1));
1612       __ BIND(L_skip_align2);
1613     }
1614     if (!aligned) {
1615       // align to 8 bytes; we know we are 4-byte aligned to start
1616       __ andcc(to, 7, G0);
1617       __ br(Assembler::zero, false, Assembler::pt, L_fill_32_bytes);
1618       __ delayed()->nop();
1619       __ stw(value, to, 0);
1620       __ inc(to, 4);
1621       __ dec(count, 1 << shift);
1622       __ BIND(L_fill_32_bytes);
1623     }
1624 
1625     if (t == T_INT) {
1626       // Zero extend value
1627       __ srl(value, 0, value);
1628     }
1629     if (t == T_BYTE || t == T_SHORT || t == T_INT) {
1630       __ sllx(value, 32, O3);
1631       __ or3(value, O3, value);
1632     }
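         // At this point 'value' holds the fill pattern replicated across all
         // 64 bits (e.g. a T_BYTE value of 0xAB has become 0xABABABABABABABAB),
         // so the 8-byte stores below write the correct pattern for any type.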
1633 
1634     Label L_check_fill_8_bytes;
1635     // Fill 32-byte chunks
1636     __ subcc(count, 8 << shift, count);
1637     __ brx(Assembler::less, false, Assembler::pt, L_check_fill_8_bytes);
1638     __ delayed()->nop();
1639 
1640     Label L_fill_32_bytes_loop, L_fill_4_bytes;
1641     __ align(16);
1642     __ BIND(L_fill_32_bytes_loop);
1643 
1644     __ stx(value, to, 0);
1645     __ stx(value, to, 8);
1646     __ stx(value, to, 16);
1647     __ stx(value, to, 24);
1648 
1649     __ subcc(count, 8 << shift, count);
1650     __ brx(Assembler::greaterEqual, false, Assembler::pt, L_fill_32_bytes_loop);
1651     __ delayed()->add(to, 32, to);
1652 
1653     __ BIND(L_check_fill_8_bytes);
1654     __ addcc(count, 8 << shift, count);
1655     __ brx(Assembler::zero, false, Assembler::pn, L_exit);
1656     __ delayed()->subcc(count, 1 << (shift + 1), count);
1657     __ brx(Assembler::less, false, Assembler::pn, L_fill_4_bytes);
1658     __ delayed()->andcc(count, 1<<shift, G0);
1659 
1660     //
1661     // length is too short, just fill 8 bytes at a time
1662     //
1663     Label L_fill_8_bytes_loop;
1664     __ BIND(L_fill_8_bytes_loop);
1665     __ stx(value, to, 0);
1666     __ subcc(count, 1 << (shift + 1), count);
1667     __ brx(Assembler::greaterEqual, false, Assembler::pn, L_fill_8_bytes_loop);
1668     __ delayed()->add(to, 8, to);
1669 
1670     // fill trailing 4 bytes
1671     __ andcc(count, 1<<shift, G0);  // in delay slot of branches
1672     if (t == T_INT) {
1673       __ BIND(L_fill_elements);
1674     }
1675     __ BIND(L_fill_4_bytes);
1676     __ brx(Assembler::zero, false, Assembler::pt, L_fill_2_bytes);
1677     if (t == T_BYTE || t == T_SHORT) {
1678       __ delayed()->andcc(count, 1<<(shift-1), G0);
1679     } else {
1680       __ delayed()->nop();
1681     }
1682     __ stw(value, to, 0);
1683     if (t == T_BYTE || t == T_SHORT) {
1684       __ inc(to, 4);
1685       // fill trailing 2 bytes
1686       __ andcc(count, 1<<(shift-1), G0); // in delay slot of branches
1687       __ BIND(L_fill_2_bytes);
1688       __ brx(Assembler::zero, false, Assembler::pt, L_fill_byte);
1689       __ delayed()->andcc(count, 1, count);
1690       __ sth(value, to, 0);
1691       if (t == T_BYTE) {
1692         __ inc(to, 2);
1693         // fill trailing byte
1694         __ andcc(count, 1, count);  // in delay slot of branches
1695         __ BIND(L_fill_byte);
1696         __ brx(Assembler::zero, false, Assembler::pt, L_exit);
1697         __ delayed()->nop();
1698         __ stb(value, to, 0);
1699       } else {
1700         __ BIND(L_fill_byte);
1701       }
1702     } else {
1703       __ BIND(L_fill_2_bytes);
1704     }
1705     __ BIND(L_exit);
1706     __ retl();
1707     __ delayed()->nop();
1708 
1709     // Handle fills of less than 8 bytes.  Int is handled elsewhere.
1710     if (t == T_BYTE) {
1711       __ BIND(L_fill_elements);
1712       Label L_fill_2, L_fill_4;
1713       // in delay slot __ andcc(count, 1, G0);
1714       __ brx(Assembler::zero, false, Assembler::pt, L_fill_2);
1715       __ delayed()->andcc(count, 2, G0);
1716       __ stb(value, to, 0);
1717       __ inc(to, 1);
1718       __ BIND(L_fill_2);
1719       __ brx(Assembler::zero, false, Assembler::pt, L_fill_4);
1720       __ delayed()->andcc(count, 4, G0);
1721       __ stb(value, to, 0);
1722       __ stb(value, to, 1);
1723       __ inc(to, 2);
1724       __ BIND(L_fill_4);
1725       __ brx(Assembler::zero, false, Assembler::pt, L_exit);
1726       __ delayed()->nop();
1727       __ stb(value, to, 0);
1728       __ stb(value, to, 1);
1729       __ stb(value, to, 2);
1730       __ retl();
1731       __ delayed()->stb(value, to, 3);
1732     }
1733 
1734     if (t == T_SHORT) {
1735       Label L_fill_2;
1736       __ BIND(L_fill_elements);
1737       // in delay slot __ andcc(count, 1, G0);
1738       __ brx(Assembler::zero, false, Assembler::pt, L_fill_2);
1739       __ delayed()->andcc(count, 2, G0);
1740       __ sth(value, to, 0);
1741       __ inc(to, 2);
1742       __ BIND(L_fill_2);
1743       __ brx(Assembler::zero, false, Assembler::pt, L_exit);
1744       __ delayed()->nop();
1745       __ sth(value, to, 0);
1746       __ retl();
1747       __ delayed()->sth(value, to, 2);
1748     }
1749     return start;
1750   }
1751 
1752   //
1753   //  Generate stub for conjoint short copy.  If "aligned" is true, the
1754   //  "from" and "to" addresses are assumed to be heapword aligned.
1755   //
1756   // Arguments for generated stub:
1757   //      from:  O0
1758   //      to:    O1
1759   //      count: O2 treated as signed
1760   //
1761   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1762                                        address *entry, const char *name) {
1763     // Do reverse copy.
1764 
1765     __ align(CodeEntryAlignment);
1766     StubCodeMark mark(this, "StubRoutines", name);
1767     address start = __ pc();
1768 
1769     Label L_skip_alignment, L_skip_alignment2, L_aligned_copy;
1770     Label L_copy_2_bytes, L_copy_2_bytes_loop, L_exit;
1771 
1772     const Register from      = O0;   // source array address
1773     const Register to        = O1;   // destination array address
1774     const Register count     = O2;   // elements count
1775     const Register end_from  = from; // source array end address
1776     const Register end_to    = to;   // destination array end address
1777 
1778     const Register byte_count = O3;  // bytes count to copy
1779 
1780     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
1781 
1782     if (entry != NULL) {
1783       *entry = __ pc();
1784       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1785       BLOCK_COMMENT("Entry:");
1786     }
1787 
1788     array_overlap_test(nooverlap_target, 1);
1789 
1790     __ sllx(count, LogBytesPerShort, byte_count);
1791     __ add(to, byte_count, end_to);  // offset after last copied element
1792 
1793     // for short arrays, just do single element copy
1794     __ cmp(count, 11); // 8 + 3  (22 bytes)
1795     __ brx(Assembler::less, false, Assembler::pn, L_copy_2_bytes);
1796     __ delayed()->add(from, byte_count, end_from);
1797 
1798     {
1799       // Align the ends of the arrays since they may not be aligned even
1800       // when the arrays themselves are aligned.
1801 
1802       // copy 1 element if necessary to align 'end_to' on a 4-byte boundary
1803       __ andcc(end_to, 3, G0);
1804       __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1805       __ delayed()->lduh(end_from, -2, O3);
1806       __ dec(end_from, 2);
1807       __ dec(end_to, 2);
1808       __ dec(count);
1809       __ sth(O3, end_to, 0);
1810     __ BIND(L_skip_alignment);
1811 
1812       // copy 2 elements to align 'end_to' on an 8 byte boundary
1813       __ andcc(end_to, 7, G0);
1814       __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment2);
1815       __ delayed()->lduh(end_from, -2, O3);
1816       __ dec(count, 2);
1817       __ lduh(end_from, -4, O4);
1818       __ dec(end_from, 4);
1819       __ dec(end_to, 4);
1820       __ sth(O3, end_to, 2);
1821       __ sth(O4, end_to, 0);
1822     __ BIND(L_skip_alignment2);
1823     }
1824     if (aligned) {
1825       // Both arrays are aligned to 8-bytes in 64-bits VM.
1826       // The 'count' is decremented in copy_16_bytes_backward_with_shift()
1827       // in unaligned case.
1828       __ dec(count, 8);
1829     } else {
1830       // Copy with shift 16 bytes per iteration if arrays do not have
1831       // the same alignment mod 8, otherwise jump to the next
1832       // code for aligned copy (subtracting 8 from 'count' before the jump).
1833       // The compare above (count >= 11) guarantees 'count' >= 16 bytes.
1834       // Also jump over the aligned copy once the copy with shift has completed.
1835 
1836       copy_16_bytes_backward_with_shift(end_from, end_to, count, 8,
1837                                         L_aligned_copy, L_copy_2_bytes);
1838     }
1839     // copy 8 elements (16 bytes) at a time
1840       __ align(OptoLoopAlignment);
1841     __ BIND(L_aligned_copy);
1842       __ dec(end_from, 16);
1843       __ ldx(end_from, 8, O3);
1844       __ ldx(end_from, 0, O4);
1845       __ dec(end_to, 16);
1846       __ deccc(count, 8);
1847       __ stx(O3, end_to, 8);
1848       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
1849       __ delayed()->stx(O4, end_to, 0);
1850       __ inc(count, 8);
1851 
1852     // copy 1 element (2 bytes) at a time
1853     __ BIND(L_copy_2_bytes);
1854       __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
1855     __ BIND(L_copy_2_bytes_loop);
1856       __ dec(end_from, 2);
1857       __ dec(end_to, 2);
1858       __ lduh(end_from, 0, O4);
1859       __ deccc(count);
1860       __ brx(Assembler::greater, false, Assembler::pt, L_copy_2_bytes_loop);
1861       __ delayed()->sth(O4, end_to, 0);
1862 
1863     __ BIND(L_exit);
1864     // O3, O4 are used as temp registers
1865     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr, O3, O4);
1866     __ retl();
1867     __ delayed()->mov(G0, O0); // return 0
1868     return start;
1869   }
1870 
1871   //
1872   // Helper methods for generate_disjoint_int_copy_core()
1873   //
1874   void copy_16_bytes_loop(Register from, Register to, Register count, int count_dec,
1875                           Label& L_loop, bool use_prefetch, bool use_bis) {
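         // Illustrative note: this loop copies 16 bytes (4 ints) per iteration
         // for the case where 'to' is 8-byte aligned but 'from' is only 4-byte
         // aligned; each aligned 8-byte load from 'from' is split with shifts and
         // recombined with the word carried over in O3 from the previous
         // iteration, so that every store to 'to' is an aligned 8-byte store.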
1876 
1877     __ align(OptoLoopAlignment);
1878     __ BIND(L_loop);
1879     if (use_prefetch) {
1880       if (ArraycopySrcPrefetchDistance > 0) {
1881         __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads);
1882       }
1883       if (ArraycopyDstPrefetchDistance > 0) {
1884         __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads);
1885       }
1886     }
1887     __ ldx(from, 4, O4);
1888     __ ldx(from, 12, G4);
1889     __ inc(to, 16);
1890     __ inc(from, 16);
1891     __ deccc(count, 4); // Can we do next iteration after this one?
1892 
1893     __ srlx(O4, 32, G3);
1894     __ bset(G3, O3);
1895     __ sllx(O4, 32, O4);
1896     __ srlx(G4, 32, G3);
1897     __ bset(G3, O4);
1898     if (use_bis) {
1899       __ stxa(O3, to, -16);
1900       __ stxa(O4, to, -8);
1901     } else {
1902       __ stx(O3, to, -16);
1903       __ stx(O4, to, -8);
1904     }
1905     __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
1906     __ delayed()->sllx(G4, 32,  O3);
1907 
1908   }
1909 
1910   //
1911   //  Generate core code for disjoint int copy (and oop copy on 32-bit).
1912   //  If "aligned" is true, the "from" and "to" addresses are assumed
1913   //  to be heapword aligned.
1914   //
1915   // Arguments:
1916   //      from:  O0
1917   //      to:    O1
1918   //      count: O2 treated as signed
1919   //
1920   void generate_disjoint_int_copy_core(bool aligned) {
1921 
1922     Label L_skip_alignment, L_aligned_copy;
1923     Label L_copy_4_bytes, L_copy_4_bytes_loop, L_exit;
1924 
1925     const Register from      = O0;   // source array address
1926     const Register to        = O1;   // destination array address
1927     const Register count     = O2;   // elements count
1928     const Register offset    = O5;   // offset from start of arrays
1929     // O3, O4, G3, G4 are used as temp registers
1930 
1931     // 'aligned' == true when it is known statically during compilation
1932     // of this arraycopy call site that both 'from' and 'to' addresses
1933     // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
1934     //
1935     // Aligned arrays have 4-byte alignment in the 32-bit VM
1936     // and 8-byte alignment in the 64-bit VM.
1937     //
1938     if (!aligned) {
1939       // The next check could be put under 'ifndef' since the code in
1940       // generate_disjoint_long_copy_core() has its own checks and sets 'offset'.
1941 
1942       // for short arrays, just do single element copy
1943       __ cmp(count, 5); // 4 + 1 (20 bytes)
1944       __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_4_bytes);
1945       __ delayed()->mov(G0, offset);
1946 
1947       // copy 1 element to align 'to' on an 8 byte boundary
1948       __ andcc(to, 7, G0);
1949       __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1950       __ delayed()->ld(from, 0, O3);
1951       __ inc(from, 4);
1952       __ inc(to, 4);
1953       __ dec(count);
1954       __ st(O3, to, -4);
1955     __ BIND(L_skip_alignment);
1956 
1957     // if the arrays have the same alignment mod 8, do the 4-element copy
1958       __ andcc(from, 7, G0);
1959       __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
1960       __ delayed()->ld(from, 0, O3);
1961 
1962     //
1963     // Load 2 aligned 8-byte chunks and use one from the previous iteration
1964     // to form 2 aligned 8-byte chunks to store.
1965     //
1966     // copy_16_bytes_forward_with_shift() is not used here since this
1967     // code is more efficient.
1968 
1969     // copy with shift 4 elements (16 bytes) at a time
1970       __ dec(count, 4);   // The cmp at the beginning guarantees count >= 4
1971       __ sllx(O3, 32,  O3);
1972 
1973       disjoint_copy_core(from, to, count, 2, 16, &StubGenerator::copy_16_bytes_loop);
1974 
1975       __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes);
1976       __ delayed()->inc(count, 4); // restore 'count'
1977 
1978     __ BIND(L_aligned_copy);
1979     } // !aligned
1980 
1981     // copy 4 elements (16 bytes) at a time
1982       __ and3(count, 1, G4);   // save the possible odd element
1983       __ srl(count, 1, count); // convert to a count of 8-byte words
1984       generate_disjoint_long_copy_core(aligned);
1985       __ mov(G4, count);       // restore the leftover element count
1986 
1987     // copy 1 element at a time
1988     __ BIND(L_copy_4_bytes);
1989       __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
1990     __ BIND(L_copy_4_bytes_loop);
1991       __ ld(from, offset, O3);
1992       __ deccc(count);
1993       __ st(O3, to, offset);
1994       __ brx(Assembler::notZero, false, Assembler::pt, L_copy_4_bytes_loop);
1995       __ delayed()->inc(offset, 4);
1996     __ BIND(L_exit);
1997   }
1998 
1999   //
2000   //  Generate stub for disjoint int copy.  If "aligned" is true, the
2001   //  "from" and "to" addresses are assumed to be heapword aligned.
2002   //
2003   // Arguments for generated stub:
2004   //      from:  O0
2005   //      to:    O1
2006   //      count: O2 treated as signed
2007   //
2008   address generate_disjoint_int_copy(bool aligned, address *entry, const char *name) {
2009     __ align(CodeEntryAlignment);
2010     StubCodeMark mark(this, "StubRoutines", name);
2011     address start = __ pc();
2012 
2013     const Register count = O2;
2014     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
2015 
2016     if (entry != NULL) {
2017       *entry = __ pc();
2018       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2019       BLOCK_COMMENT("Entry:");
2020     }
2021 
2022     generate_disjoint_int_copy_core(aligned);
2023 
2024     // O3, O4 are used as temp registers
2025     inc_counter_np(SharedRuntime::_jint_array_copy_ctr, O3, O4);
2026     __ retl();
2027     __ delayed()->mov(G0, O0); // return 0
2028     return start;
2029   }
2030 
2031   //
2032   //  Generate core code for conjoint int copy (and oop copy on 32-bit).
2033   //  If "aligned" is true, the "from" and "to" addresses are assumed
2034   //  to be heapword aligned.
2035   //
2036   // Arguments:
2037   //      from:  O0
2038   //      to:    O1
2039   //      count: O2 treated as signed
2040   //
2041   void generate_conjoint_int_copy_core(bool aligned) {
2042     // Do reverse copy.
2043 
2044     Label L_skip_alignment, L_aligned_copy;
2045     Label L_copy_16_bytes,  L_copy_4_bytes, L_copy_4_bytes_loop, L_exit;
2046 
2047     const Register from      = O0;   // source array address
2048     const Register to        = O1;   // destination array address
2049     const Register count     = O2;   // elements count
2050     const Register end_from  = from; // source array end address
2051     const Register end_to    = to;   // destination array end address
2052     // O3, O4, O5, G3 are used as temp registers
2053 
2054     const Register byte_count = O3;  // bytes count to copy
2055 
2056       __ sllx(count, LogBytesPerInt, byte_count);
2057       __ add(to, byte_count, end_to); // offset after last copied element
2058 
2059       __ cmp(count, 5); // for short arrays, just do single element copy
2060       __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_4_bytes);
2061       __ delayed()->add(from, byte_count, end_from);
2062 
2063     // copy 1 element to align 'to' on an 8 byte boundary
2064       __ andcc(end_to, 7, G0);
2065       __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
2066       __ delayed()->nop();
2067       __ dec(count);
2068       __ dec(end_from, 4);
2069       __ dec(end_to,   4);
2070       __ ld(end_from, 0, O4);
2071       __ st(O4, end_to, 0);
2072     __ BIND(L_skip_alignment);
2073 
2074     // Check if 'end_from' and 'end_to' have the same alignment.
2075       __ andcc(end_from, 7, G0);
2076       __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
2077       __ delayed()->dec(count, 4); // The cmp at the start guarantees count >= 4
2078 
2079     // copy with shift 4 elements (16 bytes) at a time
2080     //
2081     // Load 2 aligned 8-byte chunks and use one from the previous iteration
2082     // to form 2 aligned 8-byte chunks to store.
2083     //
2084       __ ldx(end_from, -4, O3);
2085       __ align(OptoLoopAlignment);
2086     __ BIND(L_copy_16_bytes);
2087       __ ldx(end_from, -12, O4);
2088       __ deccc(count, 4);
2089       __ ldx(end_from, -20, O5);
2090       __ dec(end_to, 16);
2091       __ dec(end_from, 16);
2092       __ srlx(O3, 32, O3);
2093       __ sllx(O4, 32, G3);
2094       __ bset(G3, O3);
2095       __ stx(O3, end_to, 8);
2096       __ srlx(O4, 32, O4);
2097       __ sllx(O5, 32, G3);
2098       __ bset(O4, G3);
2099       __ stx(G3, end_to, 0);
2100       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes);
2101       __ delayed()->mov(O5, O3);
2102 
2103       __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes);
2104       __ delayed()->inc(count, 4);
2105 
2106     // copy 4 elements (16 bytes) at a time
2107       __ align(OptoLoopAlignment);
2108     __ BIND(L_aligned_copy);
2109       __ dec(end_from, 16);
2110       __ ldx(end_from, 8, O3);
2111       __ ldx(end_from, 0, O4);
2112       __ dec(end_to, 16);
2113       __ deccc(count, 4);
2114       __ stx(O3, end_to, 8);
2115       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
2116       __ delayed()->stx(O4, end_to, 0);
2117       __ inc(count, 4);
2118 
2119     // copy 1 element (4 bytes) at a time
2120     __ BIND(L_copy_4_bytes);
2121       __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
2122     __ BIND(L_copy_4_bytes_loop);
2123       __ dec(end_from, 4);
2124       __ dec(end_to, 4);
2125       __ ld(end_from, 0, O4);
2126       __ deccc(count);
2127       __ brx(Assembler::greater, false, Assembler::pt, L_copy_4_bytes_loop);
2128       __ delayed()->st(O4, end_to, 0);
2129     __ BIND(L_exit);
2130   }
2131 
2132   //
2133   //  Generate stub for conjoint int copy.  If "aligned" is true, the
2134   //  "from" and "to" addresses are assumed to be heapword aligned.
2135   //
2136   // Arguments for generated stub:
2137   //      from:  O0
2138   //      to:    O1
2139   //      count: O2 treated as signed
2140   //
2141   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
2142                                      address *entry, const char *name) {
2143     __ align(CodeEntryAlignment);
2144     StubCodeMark mark(this, "StubRoutines", name);
2145     address start = __ pc();
2146 
2147     assert_clean_int(O2, O3);     // Make sure 'count' is clean int.
2148 
2149     if (entry != NULL) {
2150       *entry = __ pc();
2151       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2152       BLOCK_COMMENT("Entry:");
2153     }
2154 
2155     array_overlap_test(nooverlap_target, 2);
2156 
2157     generate_conjoint_int_copy_core(aligned);
2158 
2159     // O3, O4 are used as temp registers
2160     inc_counter_np(SharedRuntime::_jint_array_copy_ctr, O3, O4);
2161     __ retl();
2162     __ delayed()->mov(G0, O0); // return 0
2163     return start;
2164   }
2165 
2166   //
2167   // Helper methods for generate_disjoint_long_copy_core()
2168   //
2169   void copy_64_bytes_loop(Register from, Register to, Register count, int count_dec,
2170                           Label& L_loop, bool use_prefetch, bool use_bis) {
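         // Illustrative note: copies 64 bytes (8 longs) per iteration as four
         // pairs of aligned 8-byte loads and stores, optionally prefetching
         // ahead and using block-init stores (stxa) when 'use_bis' is set.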
2171     __ align(OptoLoopAlignment);
2172     __ BIND(L_loop);
2173     for (int off = 0; off < 64; off += 16) {
2174       if (use_prefetch && (off & 31) == 0) {
2175         if (ArraycopySrcPrefetchDistance > 0) {
2176           __ prefetch(from, ArraycopySrcPrefetchDistance+off, Assembler::severalReads);
2177         }
2178         if (ArraycopyDstPrefetchDistance > 0) {
2179           __ prefetch(to, ArraycopyDstPrefetchDistance+off, Assembler::severalWritesAndPossiblyReads);
2180         }
2181       }
2182       __ ldx(from,  off+0, O4);
2183       __ ldx(from,  off+8, O5);
2184       if (use_bis) {
2185         __ stxa(O4, to,  off+0);
2186         __ stxa(O5, to,  off+8);
2187       } else {
2188         __ stx(O4, to,  off+0);
2189         __ stx(O5, to,  off+8);
2190       }
2191     }
2192     __ deccc(count, 8);
2193     __ inc(from, 64);
2194     __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
2195     __ delayed()->inc(to, 64);
2196   }
2197 
2198   //
2199   //  Generate core code for disjoint long copy (and oop copy on 64-bit).
2200   //  "aligned" is ignored, because we must make the stronger
2201   //  assumption that both addresses are always 64-bit aligned.
2202   //
2203   // Arguments:
2204   //      from:  O0
2205   //      to:    O1
2206   //      count: O2 treated as signed
2207   //
2208   // count -= 2;
2209   // if ( count >= 0 ) { // >= 2 elements
2210   //   if ( count > 6) { // >= 8 elements
2211   //     count -= 6; // original count - 8
2212   //     do {
2213   //       copy_8_elements;
2214   //       count -= 8;
2215   //     } while ( count >= 0 );
2216   //     count += 6;
2217   //   }
2218   //   if ( count >= 0 ) { // >= 2 elements
2219   //     do {
2220   //       copy_2_elements;
2221   //     } while ( (count=count-2) >= 0 );
2222   //   }
2223   // }
2224   // count += 2;
2225   // if ( count != 0 ) { // 1 element left
2226   //   copy_1_element;
2227   // }
2228   //
2229   void generate_disjoint_long_copy_core(bool aligned) {
2230     Label L_copy_8_bytes, L_copy_16_bytes, L_exit;
2231     const Register from    = O0;  // source array address
2232     const Register to      = O1;  // destination array address
2233     const Register count   = O2;  // elements count
2234     const Register offset0 = O4;  // element offset
2235     const Register offset8 = O5;  // next element offset
2236 
2237     __ deccc(count, 2);
2238     __ mov(G0, offset0);   // offset from start of arrays (0)
2239     __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
2240     __ delayed()->add(offset0, 8, offset8);
2241 
2242     // Copy in 64-byte chunks
2243 
2244     const Register from64 = O3;  // source address
2245     const Register to64   = G3;  // destination address
2246     __ subcc(count, 6, O3);
2247     __ brx(Assembler::negative, false, Assembler::pt, L_copy_16_bytes );
2248     __ delayed()->mov(to,   to64);
2249     // Now we can use O4(offset0), O5(offset8) as temps
2250     __ mov(O3, count);
2251     // count >= 0 (original count - 8)
2252     __ mov(from, from64);
2253 
2254     disjoint_copy_core(from64, to64, count, 3, 64, &StubGenerator::copy_64_bytes_loop);
2255 
2256       // Restore O4(offset0), O5(offset8)
2257       __ sub(from64, from, offset0);
2258       __ inccc(count, 6); // restore count
2259       __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
2260       __ delayed()->add(offset0, 8, offset8);
2261 
2262       // Copy in 16-byte chunks
2263       __ align(OptoLoopAlignment);
2264     __ BIND(L_copy_16_bytes);
2265       __ ldx(from, offset0, O3);
2266       __ ldx(from, offset8, G3);
2267       __ deccc(count, 2);
2268       __ stx(O3, to, offset0);
2269       __ inc(offset0, 16);
2270       __ stx(G3, to, offset8);
2271       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes);
2272       __ delayed()->inc(offset8, 16);
2273 
2274       // Copy last 8 bytes
2275     __ BIND(L_copy_8_bytes);
2276       __ inccc(count, 2);
2277       __ brx(Assembler::zero, true, Assembler::pn, L_exit );
2278       __ delayed()->mov(offset0, offset8); // Set O5 used by other stubs
2279       __ ldx(from, offset0, O3);
2280       __ stx(O3, to, offset0);
2281     __ BIND(L_exit);
2282   }
2283 
2284   //
2285   //  Generate stub for disjoint long copy.
2286   //  "aligned" is ignored, because we must make the stronger
2287   //  assumption that both addresses are always 64-bit aligned.
2288   //
2289   // Arguments for generated stub:
2290   //      from:  O0
2291   //      to:    O1
2292   //      count: O2 treated as signed
2293   //
2294   address generate_disjoint_long_copy(bool aligned, address *entry, const char *name) {
2295     __ align(CodeEntryAlignment);
2296     StubCodeMark mark(this, "StubRoutines", name);
2297     address start = __ pc();
2298 
2299     assert_clean_int(O2, O3);     // Make sure 'count' is clean int.
2300 
2301     if (entry != NULL) {
2302       *entry = __ pc();
2303       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2304       BLOCK_COMMENT("Entry:");
2305     }
2306 
2307     generate_disjoint_long_copy_core(aligned);
2308 
2309     // O3, O4 are used as temp registers
2310     inc_counter_np(SharedRuntime::_jlong_array_copy_ctr, O3, O4);
2311     __ retl();
2312     __ delayed()->mov(G0, O0); // return 0
2313     return start;
2314   }
2315 
2316   //
2317   //  Generate core code for conjoint long copy (and oop copy on 64-bit).
2318   //  "aligned" is ignored, because we must make the stronger
2319   //  assumption that both addresses are always 64-bit aligned.
2320   //
2321   // Arguments:
2322   //      from:  O0
2323   //      to:    O1
2324   //      count: O2 treated as signed
2325   //
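       // Roughly (illustrative sketch, mirroring the code below):
       //
       // i = (count - 1) * 8;          // byte offset of the last element
       // if ( count > 1 ) {            // >= 2 elements
       //   do {
       //     copy elements at byte offsets i and i - 8;
       //     i -= 16;
       //   } while ( i > 0 );
       // }
       // if ( i >= 0 ) {               // one element (at offset 0) left
       //   copy the element at offset 0;
       // }
       //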
2326   void generate_conjoint_long_copy_core(bool aligned) {
2327     // Do reverse copy.
2328     Label L_copy_8_bytes, L_copy_16_bytes, L_exit;
2329     const Register from    = O0;  // source array address
2330     const Register to      = O1;  // destination array address
2331     const Register count   = O2;  // elements count
2332     const Register offset8 = O4;  // element offset
2333     const Register offset0 = O5;  // previous element offset
2334 
2335       __ subcc(count, 1, count);
2336       __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_8_bytes );
2337       __ delayed()->sllx(count, LogBytesPerLong, offset8);
2338       __ sub(offset8, 8, offset0);
2339       __ align(OptoLoopAlignment);
2340     __ BIND(L_copy_16_bytes);
2341       __ ldx(from, offset8, O2);
2342       __ ldx(from, offset0, O3);
2343       __ stx(O2, to, offset8);
2344       __ deccc(offset8, 16);      // use offset8 as counter
2345       __ stx(O3, to, offset0);
2346       __ brx(Assembler::greater, false, Assembler::pt, L_copy_16_bytes);
2347       __ delayed()->dec(offset0, 16);
2348 
2349     __ BIND(L_copy_8_bytes);
2350       __ brx(Assembler::negative, false, Assembler::pn, L_exit );
2351       __ delayed()->nop();
2352       __ ldx(from, 0, O3);
2353       __ stx(O3, to, 0);
2354     __ BIND(L_exit);
2355   }
2356 
2357   //  Generate stub for conjoint long copy.
2358   //  "aligned" is ignored, because we must make the stronger
2359   //  assumption that both addresses are always 64-bit aligned.
2360   //
2361   // Arguments for generated stub:
2362   //      from:  O0
2363   //      to:    O1
2364   //      count: O2 treated as signed
2365   //
2366   address generate_conjoint_long_copy(bool aligned, address nooverlap_target,
2367                                       address *entry, const char *name) {
2368     __ align(CodeEntryAlignment);
2369     StubCodeMark mark(this, "StubRoutines", name);
2370     address start = __ pc();
2371 
2372     assert(aligned, "Should always be aligned");
2373 
2374     assert_clean_int(O2, O3);     // Make sure 'count' is clean int.
2375 
2376     if (entry != NULL) {
2377       *entry = __ pc();
2378       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2379       BLOCK_COMMENT("Entry:");
2380     }
2381 
2382     array_overlap_test(nooverlap_target, 3);
2383 
2384     generate_conjoint_long_copy_core(aligned);
2385 
2386     // O3, O4 are used as temp registers
2387     inc_counter_np(SharedRuntime::_jlong_array_copy_ctr, O3, O4);
2388     __ retl();
2389     __ delayed()->mov(G0, O0); // return 0
2390     return start;
2391   }
2392 
2393   //  Generate stub for disjoint oop copy.  If "aligned" is true, the
2394   //  "from" and "to" addresses are assumed to be heapword aligned.
2395   //
2396   // Arguments for generated stub:
2397   //      from:  O0
2398   //      to:    O1
2399   //      count: O2 treated as signed
2400   //
2401   address generate_disjoint_oop_copy(bool aligned, address *entry, const char *name,
2402                                      bool dest_uninitialized = false) {
2403 
2404     const Register from  = O0;  // source array address
2405     const Register to    = O1;  // destination array address
2406     const Register count = O2;  // elements count
2407 
2408     __ align(CodeEntryAlignment);
2409     StubCodeMark mark(this, "StubRoutines", name);
2410     address start = __ pc();
2411 
2412     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
2413 
2414     if (entry != NULL) {
2415       *entry = __ pc();
2416       // caller can pass a 64-bit byte count here
2417       BLOCK_COMMENT("Entry:");
2418     }
2419 
2420     // save arguments for barrier generation
2421     if (UseZGC) {
2422       __ mov(from, G1);
2423     } else {
2424       __ mov(to, G1);
2425     }
2426     __ mov(count, G5);
2427     gen_load_ref_array_barrier(G1, G5);
2428     gen_write_ref_array_pre_barrier(G1, G5, dest_uninitialized);
2429     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
2430     if (UseCompressedOops) {
2431       generate_disjoint_int_copy_core(aligned);
2432     } else {
2433       generate_disjoint_long_copy_core(aligned);
2434     }
2435     // O0 is used as temp register
2436     gen_write_ref_array_post_barrier(G1, G5, O0);
2437 
2438     // O3, O4 are used as temp registers
2439     inc_counter_np(SharedRuntime::_oop_array_copy_ctr, O3, O4);
2440     __ retl();
2441     __ delayed()->mov(G0, O0); // return 0
2442     return start;
2443   }
2444 
2445   //  Generate stub for conjoint oop copy.  If "aligned" is true, the
2446   //  "from" and "to" addresses are assumed to be heapword aligned.
2447   //
2448   // Arguments for generated stub:
2449   //      from:  O0
2450   //      to:    O1
2451   //      count: O2 treated as signed
2452   //
2453   address generate_conjoint_oop_copy(bool aligned, address nooverlap_target,
2454                                      address *entry, const char *name,
2455                                      bool dest_uninitialized = false) {
2456 
2457     const Register from  = O0;  // source array address
2458     const Register to    = O1;  // destination array address
2459     const Register count = O2;  // elements count
2460 
2461     __ align(CodeEntryAlignment);
2462     StubCodeMark mark(this, "StubRoutines", name);
2463     address start = __ pc();
2464 
2465     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
2466 
2467     if (entry != NULL) {
2468       *entry = __ pc();
2469       // caller can pass a 64-bit byte count here
2470       BLOCK_COMMENT("Entry:");
2471     }
2472 
2473     array_overlap_test(nooverlap_target, LogBytesPerHeapOop);
2474 
2475     // save arguments for barrier generation
2476     if (UseZGC) {
2477       __ mov(from, G1);
2478     } else {
2479       __ mov(to, G1);
2480     }
2481     __ mov(count, G5);
2482     gen_load_ref_array_barrier(G1, G5);
2483     gen_write_ref_array_pre_barrier(G1, G5, dest_uninitialized);
2484 
2485     if (UseCompressedOops) {
2486       generate_conjoint_int_copy_core(aligned);
2487     } else {
2488       generate_conjoint_long_copy_core(aligned);
2489     }
2490 
2491     // O0 is used as temp register
2492     gen_write_ref_array_post_barrier(G1, G5, O0);
2493 
2494     // O3, O4 are used as temp registers
2495     inc_counter_np(SharedRuntime::_oop_array_copy_ctr, O3, O4);
2496     __ retl();
2497     __ delayed()->mov(G0, O0); // return 0
2498     return start;
2499   }
2500 
2501 
2502   // Helper for generating a dynamic type check.
2503   // Smashes only the given temp registers.
2504   void generate_type_check(Register sub_klass,
2505                            Register super_check_offset,
2506                            Register super_klass,
2507                            Register temp,
2508                            Label& L_success) {
2509     assert_different_registers(sub_klass, super_check_offset, super_klass, temp);
2510 
2511     BLOCK_COMMENT("type_check:");
2512 
2513     Label L_miss, L_pop_to_miss;
2514 
2515     assert_clean_int(super_check_offset, temp);
2516 
2517     __ check_klass_subtype_fast_path(sub_klass, super_klass, temp, noreg,
2518                                      &L_success, &L_miss, NULL,
2519                                      super_check_offset);
2520 
2521     BLOCK_COMMENT("type_check_slow_path:");
2522     __ save_frame(0);
2523     __ check_klass_subtype_slow_path(sub_klass->after_save(),
2524                                      super_klass->after_save(),
2525                                      L0, L1, L2, L4,
2526                                      NULL, &L_pop_to_miss);
2527     __ ba(L_success);
2528     __ delayed()->restore();
2529 
2530     __ bind(L_pop_to_miss);
2531     __ restore();
2532 
2533     // Fall through on failure!
2534     __ BIND(L_miss);
2535   }
2536 
2537 
2538   //  Generate stub for checked oop copy.
2539   //
2540   // Arguments for generated stub:
2541   //      from:  O0
2542   //      to:    O1
2543   //      count: O2 treated as signed
2544   //      ckoff: O3 (super_check_offset)
2545   //      ckval: O4 (super_klass)
2546   //      ret:   O0 zero for success; (-1^K) where K is partial transfer count
2547   //
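       // Roughly (illustrative sketch of the loop below):
       //
       //   for (remain = count, offset = 0; remain != 0; remain--, offset += heapOopSize) {
       //     oop = from[offset];
       //     if (oop != NULL && !oop->klass()->is_subtype_of(ckval)) break;  // type check failed
       //     to[offset] = oop;
       //   }
       //   emit the post barrier for the (count - remain) elements actually copied;
       //   return (remain == 0) ? 0 : ~(count - remain);
       //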
2548   address generate_checkcast_copy(const char *name, address *entry, bool dest_uninitialized = false) {
2549 
2550     const Register O0_from   = O0;      // source array address
2551     const Register O1_to     = O1;      // destination array address
2552     const Register O2_count  = O2;      // elements count
2553     const Register O3_ckoff  = O3;      // super_check_offset
2554     const Register O4_ckval  = O4;      // super_klass
2555 
2556     const Register O5_offset = O5;      // loop var, with stride wordSize
2557     const Register G1_remain = G1;      // loop var, with stride -1
2558     const Register G3_oop    = G3;      // actual oop copied
2559     const Register G4_klass  = G4;      // oop._klass
2560     const Register G5_super  = G5;      // oop._klass._primary_supers[ckval]
2561 
2562     __ align(CodeEntryAlignment);
2563     StubCodeMark mark(this, "StubRoutines", name);
2564     address start = __ pc();
2565 
2566 #ifdef ASSERT
2567     // We sometimes save a frame (see generate_type_check below).
2568     // If this will cause trouble, let's fail now instead of later.
2569     __ save_frame(0);
2570     __ restore();
2571 #endif
2572 
2573     assert_clean_int(O2_count, G1);     // Make sure 'count' is clean int.
2574 
2575 #ifdef ASSERT
2576     // caller guarantees that the arrays really are different
2577     // otherwise, we would have to make conjoint checks
2578     { Label L;
2579       __ mov(O3, G1);           // spill: overlap test smashes O3
2580       __ mov(O4, G4);           // spill: overlap test smashes O4
2581       array_overlap_test(L, LogBytesPerHeapOop);
2582       __ stop("checkcast_copy within a single array");
2583       __ bind(L);
2584       __ mov(G1, O3);
2585       __ mov(G4, O4);
2586     }
2587 #endif //ASSERT
2588 
2589     if (entry != NULL) {
2590       *entry = __ pc();
2591       // caller can pass a 64-bit byte count here (from generic stub)
2592       BLOCK_COMMENT("Entry:");
2593     }
2594     gen_load_ref_array_barrier(O0_from, O2_count);
2595     gen_write_ref_array_pre_barrier(O1_to, O2_count, dest_uninitialized);
2596 
2597     Label load_element, store_element, do_card_marks, fail, done;
2598     __ addcc(O2_count, 0, G1_remain);   // initialize loop index, and test it
2599     __ brx(Assembler::notZero, false, Assembler::pt, load_element);
2600     __ delayed()->mov(G0, O5_offset);   // offset from start of arrays
2601 
2602     // Empty array:  Nothing to do.
2603     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, O3, O4);
2604     __ retl();
2605     __ delayed()->set(0, O0);           // return 0 on (trivial) success
2606 
2607     // ======== begin loop ========
2608     // (Loop is rotated; its entry is load_element.)
2609     // Loop variables:
2610     //   (O5 = 0; ; O5 += wordSize) --- offset from src, dest arrays
2611     //   (O2 = len; O2 != 0; O2--) --- number of oops *remaining*
2612     //   G3, G4, G5 --- current oop, oop.klass, oop.klass.super
2613     __ align(OptoLoopAlignment);
2614 
2615     __ BIND(store_element);
2616     __ deccc(G1_remain);                // decrement the count
2617     __ store_heap_oop(G3_oop, O1_to, O5_offset); // store the oop
2618     __ inc(O5_offset, heapOopSize);     // step to next offset
2619     __ brx(Assembler::zero, true, Assembler::pt, do_card_marks);
2620     __ delayed()->set(0, O0);           // return 0 on success
2621 
2622     // ======== loop entry is here ========
2623     __ BIND(load_element);
2624     __ load_heap_oop(O0_from, O5_offset, G3_oop);  // load the oop
2625     __ br_null_short(G3_oop, Assembler::pt, store_element);
2626 
2627     __ load_klass(G3_oop, G4_klass); // query the object klass
2628 
2629     generate_type_check(G4_klass, O3_ckoff, O4_ckval, G5_super,
2630                         // branch to this on success:
2631                         store_element);
2632     // ======== end loop ========
2633 
2634     // It was a real error; we must depend on the caller to finish the job.
2635     // Register G1 has number of *remaining* oops, O2 number of *total* oops.
2636     // Emit GC store barriers for the oops we have copied (O2 minus G1),
2637     // and report their number to the caller.
2638     __ BIND(fail);
2639     __ subcc(O2_count, G1_remain, O2_count);
2640     __ brx(Assembler::zero, false, Assembler::pt, done);
2641     __ delayed()->not1(O2_count, O0);   // report (-1^K) to caller
2642 
2643     __ BIND(do_card_marks);
2644     gen_write_ref_array_post_barrier(O1_to, O2_count, O3);   // store check on O1[0..O2]
2645 
2646     __ BIND(done);
2647     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, O3, O4);
2648     __ retl();
2649     __ delayed()->nop();             // return value in O0
2650 
2651     return start;
2652   }
2653 
2654 
2655   //  Generate 'unsafe' array copy stub
2656   //  Though just as safe as the other stubs, it takes an unscaled
2657   //  size_t argument instead of an element count.
2658   //
2659   // Arguments for generated stub:
2660   //      from:  O0
2661   //      to:    O1
2662   //      count: O2 byte count, treated as ssize_t, can be zero
2663   //
2664   // Examines the alignment of the operands and dispatches
2665   // to a long, int, short, or byte copy loop.
2666   //
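       // Roughly (illustrative sketch of the dispatch below):
       //
       //   bits = from | to | byte_count;
       //   if      ((bits & 7) == 0) tail-call the long  copy stub with count = byte_count >> 3;
       //   else if ((bits & 3) == 0) tail-call the int   copy stub with count = byte_count >> 2;
       //   else if ((bits & 1) == 0) tail-call the short copy stub with count = byte_count >> 1;
       //   else                      tail-call the byte  copy stub with count = byte_count;
       //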
2667   address generate_unsafe_copy(const char* name,
2668                                address byte_copy_entry,
2669                                address short_copy_entry,
2670                                address int_copy_entry,
2671                                address long_copy_entry) {
2672 
2673     const Register O0_from   = O0;      // source array address
2674     const Register O1_to     = O1;      // destination array address
2675     const Register O2_count  = O2;      // elements count
2676 
2677     const Register G1_bits   = G1;      // test copy of low bits
2678 
2679     __ align(CodeEntryAlignment);
2680     StubCodeMark mark(this, "StubRoutines", name);
2681     address start = __ pc();
2682 
2683     // bump this on entry, not on exit:
2684     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr, G1, G3);
2685 
2686     __ or3(O0_from, O1_to, G1_bits);
2687     __ or3(O2_count,       G1_bits, G1_bits);
2688 
2689     __ btst(BytesPerLong-1, G1_bits);
2690     __ br(Assembler::zero, true, Assembler::pt,
2691           long_copy_entry, relocInfo::runtime_call_type);
2692     // scale the count on the way out:
2693     __ delayed()->srax(O2_count, LogBytesPerLong, O2_count);
2694 
2695     __ btst(BytesPerInt-1, G1_bits);
2696     __ br(Assembler::zero, true, Assembler::pt,
2697           int_copy_entry, relocInfo::runtime_call_type);
2698     // scale the count on the way out:
2699     __ delayed()->srax(O2_count, LogBytesPerInt, O2_count);
2700 
2701     __ btst(BytesPerShort-1, G1_bits);
2702     __ br(Assembler::zero, true, Assembler::pt,
2703           short_copy_entry, relocInfo::runtime_call_type);
2704     // scale the count on the way out:
2705     __ delayed()->srax(O2_count, LogBytesPerShort, O2_count);
2706 
2707     __ br(Assembler::always, false, Assembler::pt,
2708           byte_copy_entry, relocInfo::runtime_call_type);
2709     __ delayed()->nop();
2710 
2711     return start;
2712   }
2713 
2714 
2715   // Perform range checks on the proposed arraycopy.
2716   // Kills the two temps, but nothing else.
2717   // Also, clean the sign bits of src_pos and dst_pos.
2718   void arraycopy_range_checks(Register src,     // source array oop (O0)
2719                               Register src_pos, // source position (O1)
2720                               Register dst,     // destination array oop (O2)
2721                               Register dst_pos, // destination position (O3)
2722                               Register length,  // length of copy (O4)
2723                               Register temp1, Register temp2,
2724                               Label& L_failed) {
2725     BLOCK_COMMENT("arraycopy_range_checks:");
2726 
2727     //  if (src_pos + length > arrayOop(src)->length() ) FAIL;
2728 
2729     const Register array_length = temp1;  // scratch
2730     const Register end_pos      = temp2;  // scratch
2731 
2732     // Note:  This next instruction may be in the delay slot of a branch:
2733     __ add(length, src_pos, end_pos);  // src_pos + length
2734     __ lduw(src, arrayOopDesc::length_offset_in_bytes(), array_length);
2735     __ cmp(end_pos, array_length);
2736     __ br(Assembler::greater, false, Assembler::pn, L_failed);
2737 
2738     //  if (dst_pos + length > arrayOop(dst)->length() ) FAIL;
2739     __ delayed()->add(length, dst_pos, end_pos); // dst_pos + length
2740     __ lduw(dst, arrayOopDesc::length_offset_in_bytes(), array_length);
2741     __ cmp(end_pos, array_length);
2742     __ br(Assembler::greater, false, Assembler::pn, L_failed);
2743 
2744     // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
2745     // Move with sign extension can be used since they are positive.
2746     __ delayed()->signx(src_pos, src_pos);
2747     __ signx(dst_pos, dst_pos);
2748 
2749     BLOCK_COMMENT("arraycopy_range_checks done");
2750   }
2751 
2752 
2753   //
2754   //  Generate generic array copy stubs
2755   //
2756   //  Input:
2757   //    O0    -  src oop
2758   //    O1    -  src_pos
2759   //    O2    -  dst oop
2760   //    O3    -  dst_pos
2761   //    O4    -  element count
2762   //
2763   //  Output:
2764   //    O0 ==  0  -  success
2765   //    O0 == -1  -  need to call System.arraycopy
2766   //
2767   address generate_generic_copy(const char *name,
2768                                 address entry_jbyte_arraycopy,
2769                                 address entry_jshort_arraycopy,
2770                                 address entry_jint_arraycopy,
2771                                 address entry_oop_arraycopy,
2772                                 address entry_jlong_arraycopy,
2773                                 address entry_checkcast_arraycopy) {
2774     Label L_failed, L_objArray;
2775 
2776     // Input registers
2777     const Register src      = O0;  // source array oop
2778     const Register src_pos  = O1;  // source position
2779     const Register dst      = O2;  // destination array oop
2780     const Register dst_pos  = O3;  // destination position
2781     const Register length   = O4;  // elements count
2782 
2783     // registers used as temp
2784     const Register G3_src_klass = G3; // source array klass
2785     const Register G4_dst_klass = G4; // destination array klass
2786     const Register G5_lh        = G5; // layout helper
2787     const Register O5_temp      = O5;
2788 
2789     __ align(CodeEntryAlignment);
2790     StubCodeMark mark(this, "StubRoutines", name);
2791     address start = __ pc();
2792 
2793     // bump this on entry, not on exit:
2794     inc_counter_np(SharedRuntime::_generic_array_copy_ctr, G1, G3);
2795 
2796     // In principle, the int arguments could be dirty.
2797     //assert_clean_int(src_pos, G1);
2798     //assert_clean_int(dst_pos, G1);
2799     //assert_clean_int(length, G1);
2800 
2801     //-----------------------------------------------------------------------
2802     // Assembler stubs will be used for this call to arraycopy
2803     // if the following conditions are met:
2804     //
2805     // (1) src and dst must not be null.
2806     // (2) src_pos must not be negative.
2807     // (3) dst_pos must not be negative.
2808     // (4) length  must not be negative.
2809     // (5) src klass and dst klass should be the same and not NULL.
2810     // (6) src and dst should be arrays.
2811     // (7) src_pos + length must not exceed length of src.
2812     // (8) dst_pos + length must not exceed length of dst.
2813     BLOCK_COMMENT("arraycopy initial argument checks");
2814 
2815     //  if (src == NULL) return -1;
2816     __ br_null(src, false, Assembler::pn, L_failed);
2817 
2818     //  if (src_pos < 0) return -1;
2819     __ delayed()->tst(src_pos);
2820     __ br(Assembler::negative, false, Assembler::pn, L_failed);
2821     __ delayed()->nop();
2822 
2823     //  if (dst == NULL) return -1;
2824     __ br_null(dst, false, Assembler::pn, L_failed);
2825 
2826     //  if (dst_pos < 0) return -1;
2827     __ delayed()->tst(dst_pos);
2828     __ br(Assembler::negative, false, Assembler::pn, L_failed);
2829 
2830     //  if (length < 0) return -1;
2831     __ delayed()->tst(length);
2832     __ br(Assembler::negative, false, Assembler::pn, L_failed);
2833 
2834     BLOCK_COMMENT("arraycopy argument klass checks");
2835     //  get src->klass()
2836     if (UseCompressedClassPointers) {
2837       __ delayed()->nop(); // ??? not good
2838       __ load_klass(src, G3_src_klass);
2839     } else {
2840       __ delayed()->ld_ptr(src, oopDesc::klass_offset_in_bytes(), G3_src_klass);
2841     }
2842 
2843 #ifdef ASSERT
2844     //  assert(src->klass() != NULL);
2845     BLOCK_COMMENT("assert klasses not null");
2846     { Label L_a, L_b;
2847       __ br_notnull_short(G3_src_klass, Assembler::pt, L_b); // it is broken if klass is NULL
2848       __ bind(L_a);
2849       __ stop("broken null klass");
2850       __ bind(L_b);
2851       __ load_klass(dst, G4_dst_klass);
2852       __ br_null(G4_dst_klass, false, Assembler::pn, L_a); // this would be broken also
2853       __ delayed()->mov(G0, G4_dst_klass);      // scribble the temp
2854       BLOCK_COMMENT("assert done");
2855     }
2856 #endif
2857 
2858     // Load layout helper
2859     //
2860     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2861     // 32        30    24            16              8     2                 0
2862     //
2863     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2864     //
2865 
2866     int lh_offset = in_bytes(Klass::layout_helper_offset());
2867 
2868     // Load the 32-bit signed value. Use the br() instruction with it to check icc.
2869     __ lduw(G3_src_klass, lh_offset, G5_lh);
2870 
2871     if (UseCompressedClassPointers) {
2872       __ load_klass(dst, G4_dst_klass);
2873     }
2874     // Handle objArrays completely differently...
2875     juint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2876     __ set(objArray_lh, O5_temp);
2877     __ cmp(G5_lh,       O5_temp);
2878     __ br(Assembler::equal, false, Assembler::pt, L_objArray);
2879     if (UseCompressedClassPointers) {
2880       __ delayed()->nop();
2881     } else {
2882       __ delayed()->ld_ptr(dst, oopDesc::klass_offset_in_bytes(), G4_dst_klass);
2883     }
2884 
2885     //  if (src->klass() != dst->klass()) return -1;
2886     __ cmp_and_brx_short(G3_src_klass, G4_dst_klass, Assembler::notEqual, Assembler::pn, L_failed);
2887 
2888     //  if (!src->is_Array()) return -1;
2889     __ cmp(G5_lh, Klass::_lh_neutral_value); // < 0
2890     __ br(Assembler::greaterEqual, false, Assembler::pn, L_failed);
2891 
2892     // At this point, it is known to be a typeArray (array_tag 0x3).
2893 #ifdef ASSERT
2894     __ delayed()->nop();
2895     { Label L;
2896       jint lh_prim_tag_in_place = (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2897       __ set(lh_prim_tag_in_place, O5_temp);
2898       __ cmp(G5_lh,                O5_temp);
2899       __ br(Assembler::greaterEqual, false, Assembler::pt, L);
2900       __ delayed()->nop();
2901       __ stop("must be a primitive array");
2902       __ bind(L);
2903     }
2904 #else
2905     __ delayed();                               // match next insn to prev branch
2906 #endif
2907 
2908     arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
2909                            O5_temp, G4_dst_klass, L_failed);
2910 
2911     // TypeArrayKlass
2912     //
2913     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2914     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2915     //
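         // For example (illustrative only): copying jints with src_pos == 3 yields
         //   src_addr = src + header_size + (3 << LogBytesPerInt)
         // which is what the shift/add sequence below computes from G5_lh.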
2916 
2917     const Register G4_offset = G4_dst_klass;    // array offset
2918     const Register G3_elsize = G3_src_klass;    // log2 element size
2919 
2920     __ srl(G5_lh, Klass::_lh_header_size_shift, G4_offset);
2921     __ and3(G4_offset, Klass::_lh_header_size_mask, G4_offset); // array_offset
2922     __ add(src, G4_offset, src);       // src array offset
2923     __ add(dst, G4_offset, dst);       // dst array offset
2924     __ and3(G5_lh, Klass::_lh_log2_element_size_mask, G3_elsize); // log2 element size
2925 
2926     // the following registers must be set before the jump to the corresponding stub
2927     const Register from     = O0;  // source array address
2928     const Register to       = O1;  // destination array address
2929     const Register count    = O2;  // elements count
2930 
2931     // 'from', 'to', 'count' registers should be set in this order
2932     // since they are the same as 'src', 'src_pos', 'dst'.
2933 
2934     BLOCK_COMMENT("scale indexes to element size");
2935     __ sll_ptr(src_pos, G3_elsize, src_pos);
2936     __ sll_ptr(dst_pos, G3_elsize, dst_pos);
2937     __ add(src, src_pos, from);       // src_addr
2938     __ add(dst, dst_pos, to);         // dst_addr
2939 
2940     BLOCK_COMMENT("choose copy loop based on element size");
2941     __ cmp(G3_elsize, 0);
2942     __ br(Assembler::equal, true, Assembler::pt, entry_jbyte_arraycopy);
2943     __ delayed()->signx(length, count); // length
2944 
2945     __ cmp(G3_elsize, LogBytesPerShort);
2946     __ br(Assembler::equal, true, Assembler::pt, entry_jshort_arraycopy);
2947     __ delayed()->signx(length, count); // length
2948 
2949     __ cmp(G3_elsize, LogBytesPerInt);
2950     __ br(Assembler::equal, true, Assembler::pt, entry_jint_arraycopy);
2951     __ delayed()->signx(length, count); // length
2952 #ifdef ASSERT
2953     { Label L;
2954       __ cmp_and_br_short(G3_elsize, LogBytesPerLong, Assembler::equal, Assembler::pt, L);
2955       __ stop("must be long copy, but elsize is wrong");
2956       __ bind(L);
2957     }
2958 #endif
2959     __ br(Assembler::always, false, Assembler::pt, entry_jlong_arraycopy);
2960     __ delayed()->signx(length, count); // length
2961 
2962     // ObjArrayKlass
2963   __ BIND(L_objArray);
2964     // live at this point:  G3_src_klass, G4_dst_klass, src[_pos], dst[_pos], length
2965 
2966     Label L_plain_copy, L_checkcast_copy;
2967     //  test array classes for subtyping
2968     __ cmp(G3_src_klass, G4_dst_klass);         // usual case is exact equality
2969     __ brx(Assembler::notEqual, true, Assembler::pn, L_checkcast_copy);
2970     __ delayed()->lduw(G4_dst_klass, lh_offset, O5_temp); // hoisted from below
2971 
2972     // Identically typed arrays can be copied without element-wise checks.
2973     arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
2974                            O5_temp, G5_lh, L_failed);
2975 
2976     __ add(src, arrayOopDesc::base_offset_in_bytes(T_OBJECT), src); //src offset
2977     __ add(dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT), dst); //dst offset
2978     __ sll_ptr(src_pos, LogBytesPerHeapOop, src_pos);
2979     __ sll_ptr(dst_pos, LogBytesPerHeapOop, dst_pos);
2980     __ add(src, src_pos, from);       // src_addr
2981     __ add(dst, dst_pos, to);         // dst_addr
2982   __ BIND(L_plain_copy);
2983     __ br(Assembler::always, false, Assembler::pt, entry_oop_arraycopy);
2984     __ delayed()->signx(length, count); // length
2985 
2986   __ BIND(L_checkcast_copy);
2987     // live at this point:  G3_src_klass, G4_dst_klass
2988     {
2989       // Before looking at dst.length, make sure dst is also an objArray.
2990       // lduw(G4_dst_klass, lh_offset, O5_temp); // hoisted to delay slot
2991       __ cmp(G5_lh,                    O5_temp);
2992       __ br(Assembler::notEqual, false, Assembler::pn, L_failed);
2993 
2994       // It is safe to examine both src.length and dst.length.
2995       __ delayed();                             // match next insn to prev branch
2996       arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
2997                              O5_temp, G5_lh, L_failed);
2998 
2999       // Marshal the base address arguments now, freeing registers.
3000       __ add(src, arrayOopDesc::base_offset_in_bytes(T_OBJECT), src); //src offset
3001       __ add(dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT), dst); //dst offset
3002       __ sll_ptr(src_pos, LogBytesPerHeapOop, src_pos);
3003       __ sll_ptr(dst_pos, LogBytesPerHeapOop, dst_pos);
3004       __ add(src, src_pos, from);               // src_addr
3005       __ add(dst, dst_pos, to);                 // dst_addr
3006       __ signx(length, count);                  // length (reloaded)
3007 
3008       Register sco_temp = O3;                   // this register is free now
3009       assert_different_registers(from, to, count, sco_temp,
3010                                  G4_dst_klass, G3_src_klass);
3011 
3012       // Generate the type check.
3013       int sco_offset = in_bytes(Klass::super_check_offset_offset());
3014       __ lduw(G4_dst_klass, sco_offset, sco_temp);
3015       generate_type_check(G3_src_klass, sco_temp, G4_dst_klass,
3016                           O5_temp, L_plain_copy);
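           // If G3_src_klass is a subtype of G4_dst_klass, generate_type_check branches
           // to L_plain_copy and the copy needs no per-element checks; otherwise we fall
           // through and set up the element-wise checkcast copy.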
3017 
3018       // Fetch destination element klass from the ObjArrayKlass header.
3019       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
3020 
3021       // the checkcast_copy loop needs two extra arguments:
3022       __ ld_ptr(G4_dst_klass, ek_offset, O4);   // dest elem klass
3023       // lduw(O4, sco_offset, O3);              // sco of elem klass
3024 
3025       __ br(Assembler::always, false, Assembler::pt, entry_checkcast_arraycopy);
3026       __ delayed()->lduw(O4, sco_offset, O3);
3027     }
3028 
3029   __ BIND(L_failed);
3030     __ retl();
3031     __ delayed()->sub(G0, 1, O0); // return -1
3032     return start;
3033   }
3034 
3035   //
3036   //  Generate stub for heap zeroing.
3037   //  "to" address is aligned to jlong (8 bytes).
3038   //
3039   // Arguments for generated stub:
3040   //      to:    O0
3041   //      count: O1 treated as signed (count of HeapWords)
3042   //             count could be 0
3043   //
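       // The stub converts the HeapWord count to a byte count and defers to
       // MacroAssembler::bis_zeroing, which uses block-initializing stores so the
       // zeroed cache lines do not have to be fetched from memory first.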
3044   address generate_zero_aligned_words(const char* name) {
3045     __ align(CodeEntryAlignment);
3046     StubCodeMark mark(this, "StubRoutines", name);
3047     address start = __ pc();
3048 
3049     const Register to    = O0;   // destination address
3050     const Register count = O1;   // HeapWords count
3051     const Register temp  = O2;   // scratch
3052 
3053     Label Ldone;
3054     __ sllx(count, LogHeapWordSize, count); // to bytes count
3055     // Use BIS for zeroing
3056     __ bis_zeroing(to, count, temp, Ldone);
3057     __ bind(Ldone);
3058     __ retl();
3059     __ delayed()->nop();
3060     return start;
3061   }
3062 
3063   void generate_arraycopy_stubs() {
3064     address entry;
3065     address entry_jbyte_arraycopy;
3066     address entry_jshort_arraycopy;
3067     address entry_jint_arraycopy;
3068     address entry_oop_arraycopy;
3069     address entry_jlong_arraycopy;
3070     address entry_checkcast_arraycopy;
3071 
3072     //*** jbyte
3073     // Always need aligned and unaligned versions
3074     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
3075                                                                                   "jbyte_disjoint_arraycopy");
3076     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
3077                                                                                   &entry_jbyte_arraycopy,
3078                                                                                   "jbyte_arraycopy");
3079     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
3080                                                                                   "arrayof_jbyte_disjoint_arraycopy");
3081     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
3082                                                                                   "arrayof_jbyte_arraycopy");
3083 
3084     //*** jshort
3085     // Always need aligned and unaligned versions
3086     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
3087                                                                                     "jshort_disjoint_arraycopy");
3088     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
3089                                                                                     &entry_jshort_arraycopy,
3090                                                                                     "jshort_arraycopy");
3091     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
3092                                                                                     "arrayof_jshort_disjoint_arraycopy");
3093     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
3094                                                                                     "arrayof_jshort_arraycopy");
3095 
3096     //*** jint
3097     // Aligned versions
3098     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
3099                                                                                 "arrayof_jint_disjoint_arraycopy");
3100     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
3101                                                                                 "arrayof_jint_arraycopy");
3102     // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
3103     // entry_jint_arraycopy always points to the unaligned version (notice that we overwrite it).
3104     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
3105                                                                                 "jint_disjoint_arraycopy");
3106     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
3107                                                                                 &entry_jint_arraycopy,
3108                                                                                 "jint_arraycopy");
3109 
3110     //*** jlong
3111     // jlong copies are always 8-byte aligned, so the arrayof versions also serve as the plain versions
3112     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
3113                                                                                   "arrayof_jlong_disjoint_arraycopy");
3114     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
3115                                                                                   "arrayof_jlong_arraycopy");
3116     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
3117     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
3118 
3119 
3120     //*** oops
3121     // Aligned versions
3122     StubRoutines::_arrayof_oop_disjoint_arraycopy        = generate_disjoint_oop_copy(true, &entry,
3123                                                                                       "arrayof_oop_disjoint_arraycopy");
3124     StubRoutines::_arrayof_oop_arraycopy                 = generate_conjoint_oop_copy(true, entry, &entry_oop_arraycopy,
3125                                                                                       "arrayof_oop_arraycopy");
3126     // Aligned versions without pre-barriers
3127     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(true, &entry,
3128                                                                                       "arrayof_oop_disjoint_arraycopy_uninit",
3129                                                                                       /*dest_uninitialized*/true);
3130     StubRoutines::_arrayof_oop_arraycopy_uninit          = generate_conjoint_oop_copy(true, entry, NULL,
3131                                                                                       "arrayof_oop_arraycopy_uninit",
3132                                                                                       /*dest_uninitialized*/true);
3133     if (UseCompressedOops) {
3134       // With compressed oops we need unaligned versions, notice that we overwrite entry_oop_arraycopy.
3135       StubRoutines::_oop_disjoint_arraycopy            = generate_disjoint_oop_copy(false, &entry,
3136                                                                                     "oop_disjoint_arraycopy");
3137       StubRoutines::_oop_arraycopy                     = generate_conjoint_oop_copy(false, entry, &entry_oop_arraycopy,
3138                                                                                     "oop_arraycopy");
3139       // Unaligned versions without pre-barriers
3140       StubRoutines::_oop_disjoint_arraycopy_uninit     = generate_disjoint_oop_copy(false, &entry,
3141                                                                                     "oop_disjoint_arraycopy_uninit",
3142                                                                                     /*dest_uninitialized*/true);
3143       StubRoutines::_oop_arraycopy_uninit              = generate_conjoint_oop_copy(false, entry, NULL,
3144                                                                                     "oop_arraycopy_uninit",
3145                                                                                     /*dest_uninitialized*/true);
3146     } else {
3147       // oop arraycopy is always aligned on 32bit and 64bit without compressed oops
3148       StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
3149       StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
3150       StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
3151       StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
3152     }
3153 
3154     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
3155     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
3156                                                                         /*dest_uninitialized*/true);
3157 
3158     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
3159                                                               entry_jbyte_arraycopy,
3160                                                               entry_jshort_arraycopy,
3161                                                               entry_jint_arraycopy,
3162                                                               entry_jlong_arraycopy);
3163     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
3164                                                                entry_jbyte_arraycopy,
3165                                                                entry_jshort_arraycopy,
3166                                                                entry_jint_arraycopy,
3167                                                                entry_oop_arraycopy,
3168                                                                entry_jlong_arraycopy,
3169                                                                entry_checkcast_arraycopy);
3170 
3171     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
3172     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
3173     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
3174     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
3175     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
3176     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
3177 
3178     if (UseBlockZeroing) {
3179       StubRoutines::_zero_aligned_words = generate_zero_aligned_words("zero_aligned_words");
3180     }
3181   }
3182 
3183   address generate_aescrypt_encryptBlock() {
3184     // required since we read the expanded key 'int' array starting at its first element without further alignment checks
3185     assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
3186            "the following code assumes that first element of an int array is aligned to 8 bytes");
3187     __ align(CodeEntryAlignment);
3188     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
3189     Label L_load_misaligned_input, L_load_expanded_key, L_doLast128bit, L_storeOutput, L_store_misaligned_output;
3190     address start = __ pc();
3191     Register from = O0; // source byte array
3192     Register to = O1;   // destination byte array
3193     Register key = O2;  // expanded key array
3194     const Register keylen = O4; // reg for storing expanded key array length
3195 
3196     // read expanded key length
3197     __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
3198 
3199     // Method to address arbitrary alignment for load instructions:
3200     // Check last 3 bits of 'from' address to see if it is aligned to 8-byte boundary
3201     // If zero/aligned then continue with double FP load instructions
3202     // If not zero/mis-aligned then alignaddr will set GSR.align with number of bytes to skip during faligndata
3203     // alignaddr will also convert arbitrary aligned 'from' address to nearest 8-byte aligned address
3204     // load 3 * 8-byte components (to read 16 bytes input) in 3 different FP regs starting at this aligned address
3205     // faligndata will then extract (based on GSR.align value) the appropriate 8 bytes from the 2 source regs
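         //
         // Illustrative example (comment only): if (from & 7) == 3, alignaddr rounds
         // 'from' down to the preceding 8-byte boundary and sets GSR.align to 3; the
         // three 8-byte loads then cover bytes 0..23 of that block, and the two
         // faligndata instructions extract bytes 3..10 and 11..18, i.e. the 16 input bytes.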
3206 
3207     // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
3208     __ andcc(from, 7, G0);
3209     __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input);
3210     __ delayed()->alignaddr(from, G0, from);
3211 
3212     // aligned case: load input into F54-F56
3213     __ ldf(FloatRegisterImpl::D, from, 0, F54);
3214     __ ldf(FloatRegisterImpl::D, from, 8, F56);
3215     __ ba_short(L_load_expanded_key);
3216 
3217     __ BIND(L_load_misaligned_input);
3218     __ ldf(FloatRegisterImpl::D, from, 0, F54);
3219     __ ldf(FloatRegisterImpl::D, from, 8, F56);
3220     __ ldf(FloatRegisterImpl::D, from, 16, F58);
3221     __ faligndata(F54, F56, F54);
3222     __ faligndata(F56, F58, F56);
3223 
3224     __ BIND(L_load_expanded_key);
3225     // Since we load expanded key buffers starting first element, 8-byte alignment is guaranteed
3226     for ( int i = 0;  i <= 38; i += 2 ) {
3227       __ ldf(FloatRegisterImpl::D, key, i*4, as_FloatRegister(i));
3228     }
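         // F0..F38 now hold the first 40 expanded-key words (round keys 0..9);
         // the additional words needed for 192- and 256-bit keys are loaded below.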
3229 
3230     // perform cipher transformation
3231     __ fxor(FloatRegisterImpl::D, F0, F54, F54);
3232     __ fxor(FloatRegisterImpl::D, F2, F56, F56);
3233     // rounds 1 through 8
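         // (two rounds per iteration: the first eround01/eround23 pair maps the state
         //  (F54,F56) -> (F58,F60), the second pair maps it back to (F54,F56))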
3234     for ( int i = 4;  i <= 28; i += 8 ) {
3235       __ aes_eround01(as_FloatRegister(i), F54, F56, F58);
3236       __ aes_eround23(as_FloatRegister(i+2), F54, F56, F60);
3237       __ aes_eround01(as_FloatRegister(i+4), F58, F60, F54);
3238       __ aes_eround23(as_FloatRegister(i+6), F58, F60, F56);
3239     }
3240     __ aes_eround01(F36, F54, F56, F58); //round 9
3241     __ aes_eround23(F38, F54, F56, F60);
3242 
3243     // 128-bit original key size
3244     __ cmp_and_brx_short(keylen, 44, Assembler::equal, Assembler::pt, L_doLast128bit);
3245 
3246     for ( int i = 40;  i <= 50; i += 2 ) {
3247       __ ldf(FloatRegisterImpl::D, key, i*4, as_FloatRegister(i) );
3248     }
3249     __ aes_eround01(F40, F58, F60, F54); //round 10
3250     __ aes_eround23(F42, F58, F60, F56);
3251     __ aes_eround01(F44, F54, F56, F58); //round 11
3252     __ aes_eround23(F46, F54, F56, F60);
3253 
3254     // 192-bit original key size
3255     __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pt, L_storeOutput);
3256 
3257     __ ldf(FloatRegisterImpl::D, key, 208, F52);
3258     __ aes_eround01(F48, F58, F60, F54); //round 12
3259     __ aes_eround23(F50, F58, F60, F56);
3260     __ ldf(FloatRegisterImpl::D, key, 216, F46);
3261     __ ldf(FloatRegisterImpl::D, key, 224, F48);
3262     __ ldf(FloatRegisterImpl::D, key, 232, F50);
3263     __ aes_eround01(F52, F54, F56, F58); //round 13
3264     __ aes_eround23(F46, F54, F56, F60);
3265     __ ba_short(L_storeOutput);
3266 
3267     __ BIND(L_doLast128bit);
3268     __ ldf(FloatRegisterImpl::D, key, 160, F48);
3269     __ ldf(FloatRegisterImpl::D, key, 168, F50);
3270 
3271     __ BIND(L_storeOutput);
3272     // perform last round of encryption common for all key sizes
3273     __ aes_eround01_l(F48, F58, F60, F54); //last round
3274     __ aes_eround23_l(F50, F58, F60, F56);
3275 
3276     // Method to address arbitrary alignment for store instructions:
3277     // Check last 3 bits of 'dest' address to see if it is aligned to 8-byte boundary
3278     // If zero/aligned then continue with double FP store instructions
3279     // If not zero/mis-aligned then edge8n will generate edge mask in result reg (O3 in below case)
3280     // Example: If dest address is 0x07 and nearest 8-byte aligned address is 0x00 then edge mask will be 00000001
3281     // Compute (8-n) where n is # of bytes skipped by partial store(stpartialf) inst from edge mask, n=7 in this case
3282     // We get the value of n from the andcc that checks 'dest' alignment. n is available in O5 in below case.
3283     // Set GSR.align to (8-n) using alignaddr
3284     // Circular byte shift store values by n places so that the original bytes are at correct position for stpartialf
3285     // Set the arbitrarily aligned 'dest' address to nearest 8-byte aligned address
3286     // Store (partial) the original first (8-n) bytes starting at the original 'dest' address
3287     // Negate the edge mask so that the subsequent stpartialf can store the original (8-n+1)th through 8th bytes at the appropriate address
3288     // We need to execute this process for both the 8-byte result values
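         //
         // Illustrative example (comment only): for a dest ending in 0x7 we have n == 7,
         // the edge mask is 00000001 and GSR.align is set to 8-n == 1, so each faligndata
         // below rotates its register left by one byte; the first pair of stpartialf
         // instructions then writes the original leading byte of each half at its
         // unaligned target address, and after negating the mask the second pair writes
         // the remaining bytes into the following aligned dwords.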
3289 
3290     // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
3291     __ andcc(to, 7, O5);
3292     __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output);
3293     __ delayed()->edge8n(to, G0, O3);
3294 
3295     // aligned case: store output into the destination array
3296     __ stf(FloatRegisterImpl::D, F54, to, 0);
3297     __ retl();
3298     __ delayed()->stf(FloatRegisterImpl::D, F56, to, 8);
3299 
3300     __ BIND(L_store_misaligned_output);
3301     __ add(to, 8, O4);
3302     __ mov(8, O2);
3303     __ sub(O2, O5, O2);
3304     __ alignaddr(O2, G0, O2);
3305     __ faligndata(F54, F54, F54);
3306     __ faligndata(F56, F56, F56);
3307     __ and3(to, -8, to);
3308     __ and3(O4, -8, O4);
3309     __ stpartialf(to, O3, F54, Assembler::ASI_PST8_PRIMARY);
3310     __ stpartialf(O4, O3, F56, Assembler::ASI_PST8_PRIMARY);
3311     __ add(to, 8, to);
3312     __ add(O4, 8, O4);
3313     __ orn(G0, O3, O3);
3314     __ stpartialf(to, O3, F54, Assembler::ASI_PST8_PRIMARY);
3315     __ retl();
3316     __ delayed()->stpartialf(O4, O3, F56, Assembler::ASI_PST8_PRIMARY);
3317 
3318     return start;
3319   }
3320 
3321   address generate_aescrypt_decryptBlock() {
3322     assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
3323            "the following code assumes that first element of an int array is aligned to 8 bytes");
3324     // required since we also read the original key 'byte' array in the decryption stubs
3325     assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0,
3326            "the following code assumes that first element of a byte array is aligned to 8 bytes");
3327     __ align(CodeEntryAlignment);
3328     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
3329     address start = __ pc();
3330     Label L_load_misaligned_input, L_load_original_key, L_expand192bit, L_expand256bit, L_reload_misaligned_input;
3331     Label L_256bit_transform, L_common_transform, L_store_misaligned_output;
3332     Register from = O0; // source byte array
3333     Register to = O1;   // destination byte array
3334     Register key = O2;  // expanded key array
3335     Register original_key = O3;  // original key array only required during decryption
3336     const Register keylen = O4;  // reg for storing expanded key array length
3337 
3338     // read expanded key array length
3339     __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
3340 
3341     // save 'from' since we may need to recheck alignment in case of 256-bit decryption
3342     __ mov(from, G1);
3343 
3344     // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
3345     __ andcc(from, 7, G0);
3346     __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input);
3347     __ delayed()->alignaddr(from, G0, from);
3348 
3349     // aligned case: load input into F52-F54
3350     __ ldf(FloatRegisterImpl::D, from, 0, F52);
3351     __ ldf(FloatRegisterImpl::D, from, 8, F54);
3352     __ ba_short(L_load_original_key);
3353 
3354     __ BIND(L_load_misaligned_input);
3355     __ ldf(FloatRegisterImpl::D, from, 0, F52);
3356     __ ldf(FloatRegisterImpl::D, from, 8, F54);
3357     __ ldf(FloatRegisterImpl::D, from, 16, F56);
3358     __ faligndata(F52, F54, F52);
3359     __ faligndata(F54, F56, F54);
3360 
3361     __ BIND(L_load_original_key);
3362     // load original key from SunJCE expanded decryption key
3363     // Since we load original key buffer starting first element, 8-byte alignment is guaranteed
3364     for ( int i = 0;  i <= 3; i++ ) {
3365       __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
3366     }
3367 
3368     // 256-bit original key size
3369     __ cmp_and_brx_short(keylen, 60, Assembler::equal, Assembler::pn, L_expand256bit);
3370 
3371     // 192-bit original key size
3372     __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_expand192bit);
3373 
3374     // 128-bit original key size
3375     // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
3376     for ( int i = 0;  i <= 36; i += 4 ) {
3377       __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+2), i/4, as_FloatRegister(i+4));
3378       __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+4), as_FloatRegister(i+6));
3379     }
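         // F0..F3 hold the original 4 key words; the loop above generated the remaining
         // 40 expanded words in F4..F42, i.e. the full 44-word 128-bit key schedule.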
3380 
3381     // perform 128-bit key specific inverse cipher transformation
3382     __ fxor(FloatRegisterImpl::D, F42, F54, F54);
3383     __ fxor(FloatRegisterImpl::D, F40, F52, F52);
3384     __ ba_short(L_common_transform);
3385 
3386     __ BIND(L_expand192bit);
3387 
3388     // start loading rest of the 192-bit key
3389     __ ldf(FloatRegisterImpl::S, original_key, 16, F4);
3390     __ ldf(FloatRegisterImpl::S, original_key, 20, F5);
3391 
3392     // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
3393     for ( int i = 0;  i <= 36; i += 6 ) {
3394       __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+4), i/6, as_FloatRegister(i+6));
3395       __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+6), as_FloatRegister(i+8));
3396       __ aes_kexpand2(as_FloatRegister(i+4), as_FloatRegister(i+8), as_FloatRegister(i+10));
3397     }
3398     __ aes_kexpand1(F42, F46, 7, F48);
3399     __ aes_kexpand2(F44, F48, F50);
3400 
3401     // perform 192-bit key specific inverse cipher transformation
3402     __ fxor(FloatRegisterImpl::D, F50, F54, F54);
3403     __ fxor(FloatRegisterImpl::D, F48, F52, F52);
3404     __ aes_dround23(F46, F52, F54, F58);
3405     __ aes_dround01(F44, F52, F54, F56);
3406     __ aes_dround23(F42, F56, F58, F54);
3407     __ aes_dround01(F40, F56, F58, F52);
3408     __ ba_short(L_common_transform);
3409 
3410     __ BIND(L_expand256bit);
3411 
3412     // load rest of the 256-bit key
3413     for ( int i = 4;  i <= 7; i++ ) {
3414       __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
3415     }
3416 
3417     // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
3418     for ( int i = 0;  i <= 40; i += 8 ) {
3419       __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+6), i/8, as_FloatRegister(i+8));
3420       __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+8), as_FloatRegister(i+10));
3421       __ aes_kexpand0(as_FloatRegister(i+4), as_FloatRegister(i+10), as_FloatRegister(i+12));
3422       __ aes_kexpand2(as_FloatRegister(i+6), as_FloatRegister(i+12), as_FloatRegister(i+14));
3423     }
3424     __ aes_kexpand1(F48, F54, 6, F56);
3425     __ aes_kexpand2(F50, F56, F58);
3426 
3427     for ( int i = 0;  i <= 6; i += 2 ) {
3428       __ fsrc2(FloatRegisterImpl::D, as_FloatRegister(58-i), as_FloatRegister(i));
3429     }
3430 
3431     // reload original 'from' address
3432     __ mov(G1, from);
3433 
3434     // re-check 8-byte alignment
3435     __ andcc(from, 7, G0);
3436     __ br(Assembler::notZero, true, Assembler::pn, L_reload_misaligned_input);
3437     __ delayed()->alignaddr(from, G0, from);
3438 
3439     // aligned case: load input into F52-F54
3440     __ ldf(FloatRegisterImpl::D, from, 0, F52);
3441     __ ldf(FloatRegisterImpl::D, from, 8, F54);
3442     __ ba_short(L_256bit_transform);
3443 
3444     __ BIND(L_reload_misaligned_input);
3445     __ ldf(FloatRegisterImpl::D, from, 0, F52);
3446     __ ldf(FloatRegisterImpl::D, from, 8, F54);
3447     __ ldf(FloatRegisterImpl::D, from, 16, F56);
3448     __ faligndata(F52, F54, F52);
3449     __ faligndata(F54, F56, F54);
3450 
3451     // perform 256-bit key specific inverse cipher transformation
3452     __ BIND(L_256bit_transform);
3453     __ fxor(FloatRegisterImpl::D, F0, F54, F54);
3454     __ fxor(FloatRegisterImpl::D, F2, F52, F52);
3455     __ aes_dround23(F4, F52, F54, F58);
3456     __ aes_dround01(F6, F52, F54, F56);
3457     __ aes_dround23(F50, F56, F58, F54);
3458     __ aes_dround01(F48, F56, F58, F52);
3459     __ aes_dround23(F46, F52, F54, F58);
3460     __ aes_dround01(F44, F52, F54, F56);
3461     __ aes_dround23(F42, F56, F58, F54);
3462     __ aes_dround01(F40, F56, F58, F52);
3463 
3464     for ( int i = 0;  i <= 7; i++ ) {
3465       __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
3466     }
3467 
3468     // perform inverse cipher transformations common for all key sizes
3469     __ BIND(L_common_transform);
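         // two inverse rounds per iteration, walking the key schedule downwards;
         // the final iteration (i == 6) uses the dround*_l forms for the last round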
3470     for ( int i = 38;  i >= 6; i -= 8 ) {
3471       __ aes_dround23(as_FloatRegister(i), F52, F54, F58);
3472       __ aes_dround01(as_FloatRegister(i-2), F52, F54, F56);
3473       if ( i != 6) {
3474         __ aes_dround23(as_FloatRegister(i-4), F56, F58, F54);
3475         __ aes_dround01(as_FloatRegister(i-6), F56, F58, F52);
3476       } else {
3477         __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F54);
3478         __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F52);
3479       }
3480     }
3481 
3482     // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
3483     __ andcc(to, 7, O5);
3484     __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output);
3485     __ delayed()->edge8n(to, G0, O3);
3486 
3487     // aligned case: store output into the destination array
3488     __ stf(FloatRegisterImpl::D, F52, to, 0);
3489     __ retl();
3490     __ delayed()->stf(FloatRegisterImpl::D, F54, to, 8);
3491 
3492     __ BIND(L_store_misaligned_output);
3493     __ add(to, 8, O4);
3494     __ mov(8, O2);
3495     __ sub(O2, O5, O2);
3496     __ alignaddr(O2, G0, O2);
3497     __ faligndata(F52, F52, F52);
3498     __ faligndata(F54, F54, F54);
3499     __ and3(to, -8, to);
3500     __ and3(O4, -8, O4);
3501     __ stpartialf(to, O3, F52, Assembler::ASI_PST8_PRIMARY);
3502     __ stpartialf(O4, O3, F54, Assembler::ASI_PST8_PRIMARY);
3503     __ add(to, 8, to);
3504     __ add(O4, 8, O4);
3505     __ orn(G0, O3, O3);
3506     __ stpartialf(to, O3, F52, Assembler::ASI_PST8_PRIMARY);
3507     __ retl();
3508     __ delayed()->stpartialf(O4, O3, F54, Assembler::ASI_PST8_PRIMARY);
3509 
3510     return start;
3511   }
3512 
3513   address generate_cipherBlockChaining_encryptAESCrypt() {
3514     assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
3515            "the following code assumes that first element of an int array is aligned to 8 bytes");
3516     assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0,
3517            "the following code assumes that first element of a byte array is aligned to 8 bytes");
3518     __ align(CodeEntryAlignment);
3519     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
3520     Label L_cbcenc128, L_load_misaligned_input_128bit, L_128bit_transform, L_store_misaligned_output_128bit;
3521     Label L_check_loop_end_128bit, L_cbcenc192, L_load_misaligned_input_192bit, L_192bit_transform;
3522     Label L_store_misaligned_output_192bit, L_check_loop_end_192bit, L_cbcenc256, L_load_misaligned_input_256bit;
3523     Label L_256bit_transform, L_store_misaligned_output_256bit, L_check_loop_end_256bit;
3524     address start = __ pc();
3525     Register from = I0; // source byte array
3526     Register to = I1;   // destination byte array
3527     Register key = I2;  // expanded key array
3528     Register rvec = I3; // init vector
3529     const Register len_reg = I4; // cipher length
3530     const Register keylen = I5;  // reg for storing expanded key array length
3531 
3532     __ save_frame(0);
3533     // save cipher len to return in the end
3534     __ mov(len_reg, L0);
3535 
3536     // read expanded key length
3537     __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
3538 
3539     // load initial vector, 8-byte alignment is guaranteed
3540     __ ldf(FloatRegisterImpl::D, rvec, 0, F60);
3541     __ ldf(FloatRegisterImpl::D, rvec, 8, F62);
3542     // load key, 8-byte alignment is guaranteed
3543     __ ldx(key,0,G1);
3544     __ ldx(key,8,G5);
3545 
3546     // start loading expanded key, 8-byte alignment is guaranteed
3547     for ( int i = 0, j = 16;  i <= 38; i += 2, j += 8 ) {
3548       __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i));
3549     }
3550 
3551     // 128-bit original key size
3552     __ cmp_and_brx_short(keylen, 44, Assembler::equal, Assembler::pt, L_cbcenc128);
3553 
3554     for ( int i = 40, j = 176;  i <= 46; i += 2, j += 8 ) {
3555       __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i));
3556     }
3557 
3558     // 192-bit original key size
3559     __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pt, L_cbcenc192);
3560 
3561     for ( int i = 48, j = 208;  i <= 54; i += 2, j += 8 ) {
3562       __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i));
3563     }
3564 
3565     // 256-bit original key size
3566     __ ba_short(L_cbcenc256);
3567 
3568     __ align(OptoLoopAlignment);
3569     __ BIND(L_cbcenc128);
3570     // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
3571     __ andcc(from, 7, G0);
3572     __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_128bit);
3573     __ delayed()->mov(from, L1); // save original 'from' address before alignaddr
3574 
3575     // aligned case: load input into G3 and G4
3576     __ ldx(from,0,G3);
3577     __ ldx(from,8,G4);
3578     __ ba_short(L_128bit_transform);
3579 
3580     __ BIND(L_load_misaligned_input_128bit);
3581     // can clobber F48, F50 and F52 as they are not used in 128 and 192-bit key encryption
3582     __ alignaddr(from, G0, from);
3583     __ ldf(FloatRegisterImpl::D, from, 0, F48);
3584     __ ldf(FloatRegisterImpl::D, from, 8, F50);
3585     __ ldf(FloatRegisterImpl::D, from, 16, F52);
3586     __ faligndata(F48, F50, F48);
3587     __ faligndata(F50, F52, F50);
3588     __ movdtox(F48, G3);
3589     __ movdtox(F50, G4);
3590     __ mov(L1, from);
3591 
3592     __ BIND(L_128bit_transform);
3593     __ xor3(G1,G3,G3);
3594     __ xor3(G5,G4,G4);
3595     __ movxtod(G3,F56);
3596     __ movxtod(G4,F58);
3597     __ fxor(FloatRegisterImpl::D, F60, F56, F60);
3598     __ fxor(FloatRegisterImpl::D, F62, F58, F62);
3599 
3600     // TEN_EROUNDS
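         // (ten rounds for a 128-bit key; the i == 32 iteration uses the eround*_l
         //  forms to apply the final-round transformation)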
3601     for ( int i = 0;  i <= 32; i += 8 ) {
3602       __ aes_eround01(as_FloatRegister(i), F60, F62, F56);
3603       __ aes_eround23(as_FloatRegister(i+2), F60, F62, F58);
3604       if (i != 32 ) {
3605         __ aes_eround01(as_FloatRegister(i+4), F56, F58, F60);
3606         __ aes_eround23(as_FloatRegister(i+6), F56, F58, F62);
3607       } else {
3608         __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60);
3609         __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62);
3610       }
3611     }
3612 
3613     // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
3614     __ andcc(to, 7, L1);
3615     __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_128bit);
3616     __ delayed()->edge8n(to, G0, L2);
3617 
3618     // aligned case: store output into the destination array
3619     __ stf(FloatRegisterImpl::D, F60, to, 0);
3620     __ stf(FloatRegisterImpl::D, F62, to, 8);
3621     __ ba_short(L_check_loop_end_128bit);
3622 
3623     __ BIND(L_store_misaligned_output_128bit);
3624     __ add(to, 8, L3);
3625     __ mov(8, L4);
3626     __ sub(L4, L1, L4);
3627     __ alignaddr(L4, G0, L4);
3628     // save cipher text before circular right shift
3629     // as it needs to be stored as iv for next block (see code before next retl)
3630     __ movdtox(F60, L6);
3631     __ movdtox(F62, L7);
3632     __ faligndata(F60, F60, F60);
3633     __ faligndata(F62, F62, F62);
3634     __ mov(to, L5);
3635     __ and3(to, -8, to);
3636     __ and3(L3, -8, L3);
3637     __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
3638     __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
3639     __ add(to, 8, to);
3640     __ add(L3, 8, L3);
3641     __ orn(G0, L2, L2);
3642     __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
3643     __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
3644     __ mov(L5, to);
3645     __ movxtod(L6, F60);
3646     __ movxtod(L7, F62);
3647 
3648     __ BIND(L_check_loop_end_128bit);
3649     __ add(from, 16, from);
3650     __ add(to, 16, to);
3651     __ subcc(len_reg, 16, len_reg);
3652     __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc128);
3653     __ delayed()->nop();
3654     // re-init initial vector for next block, 8-byte alignment is guaranteed
3655     __ stf(FloatRegisterImpl::D, F60, rvec, 0);
3656     __ stf(FloatRegisterImpl::D, F62, rvec, 8);
3657     __ mov(L0, I0);
3658     __ ret();
3659     __ delayed()->restore();
3660 
3661     __ align(OptoLoopAlignment);
3662     __ BIND(L_cbcenc192);
3663     // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
3664     __ andcc(from, 7, G0);
3665     __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_192bit);
3666     __ delayed()->mov(from, L1); // save original 'from' address before alignaddr
3667 
3668     // aligned case: load input into G3 and G4
3669     __ ldx(from,0,G3);
3670     __ ldx(from,8,G4);
3671     __ ba_short(L_192bit_transform);
3672 
3673     __ BIND(L_load_misaligned_input_192bit);
3674     // can clobber F48, F50 and F52 as they are not used in 128 and 192-bit key encryption
3675     __ alignaddr(from, G0, from);
3676     __ ldf(FloatRegisterImpl::D, from, 0, F48);
3677     __ ldf(FloatRegisterImpl::D, from, 8, F50);
3678     __ ldf(FloatRegisterImpl::D, from, 16, F52);
3679     __ faligndata(F48, F50, F48);
3680     __ faligndata(F50, F52, F50);
3681     __ movdtox(F48, G3);
3682     __ movdtox(F50, G4);
3683     __ mov(L1, from);
3684 
3685     __ BIND(L_192bit_transform);
3686     __ xor3(G1,G3,G3);
3687     __ xor3(G5,G4,G4);
3688     __ movxtod(G3,F56);
3689     __ movxtod(G4,F58);
3690     __ fxor(FloatRegisterImpl::D, F60, F56, F60);
3691     __ fxor(FloatRegisterImpl::D, F62, F58, F62);
3692 
3693     // TWELVE_EROUNDS
3694     for ( int i = 0;  i <= 40; i += 8 ) {
3695       __ aes_eround01(as_FloatRegister(i), F60, F62, F56);
3696       __ aes_eround23(as_FloatRegister(i+2), F60, F62, F58);
3697       if (i != 40 ) {
3698         __ aes_eround01(as_FloatRegister(i+4), F56, F58, F60);
3699         __ aes_eround23(as_FloatRegister(i+6), F56, F58, F62);
3700       } else {
3701         __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60);
3702         __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62);
3703       }
3704     }
3705 
3706     // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
3707     __ andcc(to, 7, L1);
3708     __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_192bit);
3709     __ delayed()->edge8n(to, G0, L2);
3710 
3711     // aligned case: store output into the destination array
3712     __ stf(FloatRegisterImpl::D, F60, to, 0);
3713     __ stf(FloatRegisterImpl::D, F62, to, 8);
3714     __ ba_short(L_check_loop_end_192bit);
3715 
3716     __ BIND(L_store_misaligned_output_192bit);
3717     __ add(to, 8, L3);
3718     __ mov(8, L4);
3719     __ sub(L4, L1, L4);
3720     __ alignaddr(L4, G0, L4);
3721     __ movdtox(F60, L6);
3722     __ movdtox(F62, L7);
3723     __ faligndata(F60, F60, F60);
3724     __ faligndata(F62, F62, F62);
3725     __ mov(to, L5);
3726     __ and3(to, -8, to);
3727     __ and3(L3, -8, L3);
3728     __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
3729     __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
3730     __ add(to, 8, to);
3731     __ add(L3, 8, L3);
3732     __ orn(G0, L2, L2);
3733     __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
3734     __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
3735     __ mov(L5, to);
3736     __ movxtod(L6, F60);
3737     __ movxtod(L7, F62);
3738 
3739     __ BIND(L_check_loop_end_192bit);
3740     __ add(from, 16, from);
3741     __ subcc(len_reg, 16, len_reg);
3742     __ add(to, 16, to);
3743     __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc192);
3744     __ delayed()->nop();
3745     // re-init initial vector for next block, 8-byte alignment is guaranteed
3746     __ stf(FloatRegisterImpl::D, F60, rvec, 0);
3747     __ stf(FloatRegisterImpl::D, F62, rvec, 8);
3748     __ mov(L0, I0);
3749     __ ret();
3750     __ delayed()->restore();
3751 
3752     __ align(OptoLoopAlignment);
3753     __ BIND(L_cbcenc256);
3754     // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
3755     __ andcc(from, 7, G0);
3756     __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_256bit);
3757     __ delayed()->mov(from, L1); // save original 'from' address before alignaddr
3758 
3759     // aligned case: load input into G3 and G4
3760     __ ldx(from,0,G3);
3761     __ ldx(from,8,G4);
3762     __ ba_short(L_256bit_transform);
3763 
3764     __ BIND(L_load_misaligned_input_256bit);
3765     // cannot clobber F48, F50 and F52. F56, F58 can be used though
3766     __ alignaddr(from, G0, from);
3767     __ movdtox(F60, L2); // save F60 before overwriting
3768     __ ldf(FloatRegisterImpl::D, from, 0, F56);
3769     __ ldf(FloatRegisterImpl::D, from, 8, F58);
3770     __ ldf(FloatRegisterImpl::D, from, 16, F60);
3771     __ faligndata(F56, F58, F56);
3772     __ faligndata(F58, F60, F58);
3773     __ movdtox(F56, G3);
3774     __ movdtox(F58, G4);
3775     __ mov(L1, from);
3776     __ movxtod(L2, F60);
3777 
3778     __ BIND(L_256bit_transform);
3779     __ xor3(G1,G3,G3);
3780     __ xor3(G5,G4,G4);
3781     __ movxtod(G3,F56);
3782     __ movxtod(G4,F58);
3783     __ fxor(FloatRegisterImpl::D, F60, F56, F60);
3784     __ fxor(FloatRegisterImpl::D, F62, F58, F62);
3785 
3786     // FOURTEEN_EROUNDS
3787     for ( int i = 0;  i <= 48; i += 8 ) {
3788       __ aes_eround01(as_FloatRegister(i), F60, F62, F56);
3789       __ aes_eround23(as_FloatRegister(i+2), F60, F62, F58);
3790       if (i != 48 ) {
3791         __ aes_eround01(as_FloatRegister(i+4), F56, F58, F60);
3792         __ aes_eround23(as_FloatRegister(i+6), F56, F58, F62);
3793       } else {
3794         __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60);
3795         __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62);
3796       }
3797     }
3798 
3799     // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
3800     __ andcc(to, 7, L1);
3801     __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_256bit);
3802     __ delayed()->edge8n(to, G0, L2);
3803 
3804     // aligned case: store output into the destination array
3805     __ stf(FloatRegisterImpl::D, F60, to, 0);
3806     __ stf(FloatRegisterImpl::D, F62, to, 8);
3807     __ ba_short(L_check_loop_end_256bit);
3808 
3809     __ BIND(L_store_misaligned_output_256bit);
3810     __ add(to, 8, L3);
3811     __ mov(8, L4);
3812     __ sub(L4, L1, L4);
3813     __ alignaddr(L4, G0, L4);
3814     __ movdtox(F60, L6);
3815     __ movdtox(F62, L7);
3816     __ faligndata(F60, F60, F60);
3817     __ faligndata(F62, F62, F62);
3818     __ mov(to, L5);
3819     __ and3(to, -8, to);
3820     __ and3(L3, -8, L3);
3821     __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
3822     __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
3823     __ add(to, 8, to);
3824     __ add(L3, 8, L3);
3825     __ orn(G0, L2, L2);
3826     __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
3827     __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
3828     __ mov(L5, to);
3829     __ movxtod(L6, F60);
3830     __ movxtod(L7, F62);
3831 
3832     __ BIND(L_check_loop_end_256bit);
3833     __ add(from, 16, from);
3834     __ subcc(len_reg, 16, len_reg);
3835     __ add(to, 16, to);
3836     __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc256);
3837     __ delayed()->nop();
3838     // re-init initial vector for next block, 8-byte alignment is guaranteed
3839     __ stf(FloatRegisterImpl::D, F60, rvec, 0);
3840     __ stf(FloatRegisterImpl::D, F62, rvec, 8);
3841     __ mov(L0, I0);
3842     __ ret();
3843     __ delayed()->restore();
3844 
3845     return start;
3846   }
3847 
3848   address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
3849     assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
3850            "the following code assumes that first element of an int array is aligned to 8 bytes");
3851     assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0,
3852            "the following code assumes that first element of a byte array is aligned to 8 bytes");
3853     __ align(CodeEntryAlignment);
3854     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
3855     Label L_cbcdec_end, L_expand192bit, L_expand256bit, L_dec_first_block_start;
3856     Label L_dec_first_block128, L_dec_first_block192, L_dec_next2_blocks128, L_dec_next2_blocks192, L_dec_next2_blocks256;
3857     Label L_load_misaligned_input_first_block, L_transform_first_block, L_load_misaligned_next2_blocks128, L_transform_next2_blocks128;
3858     Label L_load_misaligned_next2_blocks192, L_transform_next2_blocks192, L_load_misaligned_next2_blocks256, L_transform_next2_blocks256;
3859     Label L_store_misaligned_output_first_block, L_check_decrypt_end, L_store_misaligned_output_next2_blocks128;
3860     Label L_check_decrypt_loop_end128, L_store_misaligned_output_next2_blocks192, L_check_decrypt_loop_end192;
3861     Label L_store_misaligned_output_next2_blocks256, L_check_decrypt_loop_end256;
3862     address start = __ pc();
3863     Register from = I0; // source byte array
3864     Register to = I1;   // destination byte array
3865     Register key = I2;  // expanded key array
3866     Register rvec = I3; // init vector
3867     const Register len_reg = I4; // cipher length
3868     const Register original_key = I5;  // original key array only required during decryption
3869     const Register keylen = L6;  // reg for storing expanded key array length
3870 
3871     __ save_frame(0); // args are read from the I* registers since we save a frame at the beginning
3872     // save cipher len to return in the end
3873     __ mov(len_reg, L7);
3874 
3875     // load original key from SunJCE expanded decryption key
3876     // Since we load original key buffer starting first element, 8-byte alignment is guaranteed
3877     for ( int i = 0;  i <= 3; i++ ) {
3878       __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
3879     }
3880 
3881     // load initial vector, 8-byte alignment is guaranteed
3882     __ ldx(rvec,0,L0);
3883     __ ldx(rvec,8,L1);
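         // L0/L1 hold the IV and are updated to the previous ciphertext block as
         // decryption proceeds (needed for the CBC xor after each block decrypt)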
3884 
3885     // read expanded key array length
3886     __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
3887 
3888     // 256-bit original key size
3889     __ cmp_and_brx_short(keylen, 60, Assembler::equal, Assembler::pn, L_expand256bit);
3890 
3891     // 192-bit original key size
3892     __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_expand192bit);
3893 
3894     // 128-bit original key size
3895     // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
3896     for ( int i = 0;  i <= 36; i += 4 ) {
3897       __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+2), i/4, as_FloatRegister(i+4));
3898       __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+4), as_FloatRegister(i+6));
3899     }
3900 
3901     // load expanded key[last-1] and key[last] elements
3902     __ movdtox(F40,L2);
3903     __ movdtox(F42,L3);
3904 
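         // If the cipher length is an odd number of 16-byte blocks (len & 16 != 0),
         // decrypt the first block by itself so that the two-blocks-per-iteration
         // loops below always see an even number of remaining blocks.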
3905     __ and3(len_reg, 16, L4);
3906     __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks128);
3907     __ nop();
3908 
3909     __ ba_short(L_dec_first_block_start);
3910 
3911     __ BIND(L_expand192bit);
3912     // load rest of the 192-bit key
3913     __ ldf(FloatRegisterImpl::S, original_key, 16, F4);
3914     __ ldf(FloatRegisterImpl::S, original_key, 20, F5);
3915 
3916     // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
3917     for ( int i = 0;  i <= 36; i += 6 ) {
3918       __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+4), i/6, as_FloatRegister(i+6));
3919       __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+6), as_FloatRegister(i+8));
3920       __ aes_kexpand2(as_FloatRegister(i+4), as_FloatRegister(i+8), as_FloatRegister(i+10));
3921     }
3922     __ aes_kexpand1(F42, F46, 7, F48);
3923     __ aes_kexpand2(F44, F48, F50);
3924 
3925     // load expanded key[last-1] and key[last] elements
3926     __ movdtox(F48,L2);
3927     __ movdtox(F50,L3);
3928 
3929     __ and3(len_reg, 16, L4);
3930     __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks192);
3931     __ nop();
3932 
3933     __ ba_short(L_dec_first_block_start);
3934 
3935     __ BIND(L_expand256bit);
3936     // load rest of the 256-bit key
3937     for ( int i = 4;  i <= 7; i++ ) {
3938       __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
3939     }
3940 
3941     // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
3942     for ( int i = 0;  i <= 40; i += 8 ) {
3943       __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+6), i/8, as_FloatRegister(i+8));
3944       __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+8), as_FloatRegister(i+10));
3945       __ aes_kexpand0(as_FloatRegister(i+4), as_FloatRegister(i+10), as_FloatRegister(i+12));
3946       __ aes_kexpand2(as_FloatRegister(i+6), as_FloatRegister(i+12), as_FloatRegister(i+14));
3947     }
3948     __ aes_kexpand1(F48, F54, 6, F56);
3949     __ aes_kexpand2(F50, F56, F58);
3950 
3951     // load expanded key[last-1] and key[last] elements
3952     __ movdtox(F56,L2);
3953     __ movdtox(F58,L3);
3954 
3955     __ and3(len_reg, 16, L4);
3956     __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks256);
3957 
3958     __ BIND(L_dec_first_block_start);
3959     // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
3960     __ andcc(from, 7, G0);
3961     __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_first_block);
3962     __ delayed()->mov(from, G1); // save original 'from' address before alignaddr
3963 
3964     // aligned case: load input into L4 and L5
3965     __ ldx(from,0,L4);
3966     __ ldx(from,8,L5);
3967     __ ba_short(L_transform_first_block);
3968 
3969     __ BIND(L_load_misaligned_input_first_block);
3970     __ alignaddr(from, G0, from);
3971     // F58, F60, F62 can be clobbered
3972     __ ldf(FloatRegisterImpl::D, from, 0, F58);
3973     __ ldf(FloatRegisterImpl::D, from, 8, F60);
3974     __ ldf(FloatRegisterImpl::D, from, 16, F62);
3975     __ faligndata(F58, F60, F58);
3976     __ faligndata(F60, F62, F60);
3977     __ movdtox(F58, L4);
3978     __ movdtox(F60, L5);
3979     __ mov(G1, from);
3980 
3981     __ BIND(L_transform_first_block);
3982     __ xor3(L2,L4,G1);
3983     __ movxtod(G1,F60);
3984     __ xor3(L3,L5,G1);
3985     __ movxtod(G1,F62);
3986 
3987     // 128-bit original key size
3988     __ cmp_and_brx_short(keylen, 44, Assembler::equal, Assembler::pn, L_dec_first_block128);
3989 
3990     // 192-bit original key size
3991     __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_dec_first_block192);
3992 
3993     __ aes_dround23(F54, F60, F62, F58);
3994     __ aes_dround01(F52, F60, F62, F56);
3995     __ aes_dround23(F50, F56, F58, F62);
3996     __ aes_dround01(F48, F56, F58, F60);
3997 
3998     __ BIND(L_dec_first_block192);
3999     __ aes_dround23(F46, F60, F62, F58);
4000     __ aes_dround01(F44, F60, F62, F56);
4001     __ aes_dround23(F42, F56, F58, F62);
4002     __ aes_dround01(F40, F56, F58, F60);
4003 
4004     __ BIND(L_dec_first_block128);
4005     for ( int i = 38;  i >= 6; i -= 8 ) {
4006       __ aes_dround23(as_FloatRegister(i), F60, F62, F58);
4007       __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56);
4008       if ( i != 6) {
4009         __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62);
4010         __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60);
4011       } else {
4012         __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F62);
4013         __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F60);
4014       }
4015     }
4016 
4017     __ movxtod(L0,F56);
4018     __ movxtod(L1,F58);
4019     __ mov(L4,L0);
4020     __ mov(L5,L1);
4021     __ fxor(FloatRegisterImpl::D, F56, F60, F60);
4022     __ fxor(FloatRegisterImpl::D, F58, F62, F62);
4023 
4024     // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
4025     __ andcc(to, 7, G1);
4026     __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_first_block);
4027     __ delayed()->edge8n(to, G0, G2);
4028 
4029     // aligned case: store output into the destination array
4030     __ stf(FloatRegisterImpl::D, F60, to, 0);
4031     __ stf(FloatRegisterImpl::D, F62, to, 8);
4032     __ ba_short(L_check_decrypt_end);
4033 
4034     __ BIND(L_store_misaligned_output_first_block);
4035     __ add(to, 8, G3);
4036     __ mov(8, G4);
4037     __ sub(G4, G1, G4);
4038     __ alignaddr(G4, G0, G4);
4039     __ faligndata(F60, F60, F60);
4040     __ faligndata(F62, F62, F62);
4041     __ mov(to, G1);
4042     __ and3(to, -8, to);
4043     __ and3(G3, -8, G3);
4044     __ stpartialf(to, G2, F60, Assembler::ASI_PST8_PRIMARY);
4045     __ stpartialf(G3, G2, F62, Assembler::ASI_PST8_PRIMARY);
4046     __ add(to, 8, to);
4047     __ add(G3, 8, G3);
4048     __ orn(G0, G2, G2);
4049     __ stpartialf(to, G2, F60, Assembler::ASI_PST8_PRIMARY);
4050     __ stpartialf(G3, G2, F62, Assembler::ASI_PST8_PRIMARY);
4051     __ mov(G1, to);
4052 
4053     __ BIND(L_check_decrypt_end);
4054     __ add(from, 16, from);
4055     __ add(to, 16, to);
4056     __ subcc(len_reg, 16, len_reg);
4057     __ br(Assembler::equal, false, Assembler::pt, L_cbcdec_end);
4058     __ delayed()->nop();
4059 
4060     // 256-bit original key size
4061     __ cmp_and_brx_short(keylen, 60, Assembler::equal, Assembler::pn, L_dec_next2_blocks256);
4062 
4063     // 192-bit original key size
4064     __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_dec_next2_blocks192);
4065 
4066     __ align(OptoLoopAlignment);
4067     __ BIND(L_dec_next2_blocks128);
4068     __ nop();
4069 
4070     // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
4071     __ andcc(from, 7, G0);
4072     __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks128);
4073     __ delayed()->mov(from, G1); // save original 'from' address before alignaddr
4074 
4075     // aligned case: load input into G4, G5, L4 and L5
4076     __ ldx(from,0,G4);
4077     __ ldx(from,8,G5);
4078     __ ldx(from,16,L4);
4079     __ ldx(from,24,L5);
4080     __ ba_short(L_transform_next2_blocks128);
4081 
4082     __ BIND(L_load_misaligned_next2_blocks128);
4083     __ alignaddr(from, G0, from);
4084     // F40, F42, F58, F60, F62 can be clobbered
4085     __ ldf(FloatRegisterImpl::D, from, 0, F40);
4086     __ ldf(FloatRegisterImpl::D, from, 8, F42);
4087     __ ldf(FloatRegisterImpl::D, from, 16, F60);
4088     __ ldf(FloatRegisterImpl::D, from, 24, F62);
4089     __ ldf(FloatRegisterImpl::D, from, 32, F58);
4090     __ faligndata(F40, F42, F40);
4091     __ faligndata(F42, F60, F42);
4092     __ faligndata(F60, F62, F60);
4093     __ faligndata(F62, F58, F62);
4094     __ movdtox(F40, G4);
4095     __ movdtox(F42, G5);
4096     __ movdtox(F60, L4);
4097     __ movdtox(F62, L5);
4098     __ mov(G1, from);
4099 
4100     __ BIND(L_transform_next2_blocks128);
4101     // F40:F42 used for first 16-bytes
4102     __ xor3(L2,G4,G1);
4103     __ movxtod(G1,F40);
4104     __ xor3(L3,G5,G1);
4105     __ movxtod(G1,F42);
4106 
4107     // F60:F62 used for next 16-bytes
4108     __ xor3(L2,L4,G1);
4109     __ movxtod(G1,F60);
4110     __ xor3(L3,L5,G1);
4111     __ movxtod(G1,F62);
4112 
4113     for ( int i = 38;  i >= 6; i -= 8 ) {
4114       __ aes_dround23(as_FloatRegister(i), F40, F42, F44);
4115       __ aes_dround01(as_FloatRegister(i-2), F40, F42, F46);
4116       __ aes_dround23(as_FloatRegister(i), F60, F62, F58);
4117       __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56);
4118       if (i != 6 ) {
4119         __ aes_dround23(as_FloatRegister(i-4), F46, F44, F42);
4120         __ aes_dround01(as_FloatRegister(i-6), F46, F44, F40);
4121         __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62);
4122         __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60);
4123       } else {
4124         __ aes_dround23_l(as_FloatRegister(i-4), F46, F44, F42);
4125         __ aes_dround01_l(as_FloatRegister(i-6), F46, F44, F40);
4126         __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F62);
4127         __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F60);
4128       }
4129     }
4130 
4131     __ movxtod(L0,F46);
4132     __ movxtod(L1,F44);
4133     __ fxor(FloatRegisterImpl::D, F46, F40, F40);
4134     __ fxor(FloatRegisterImpl::D, F44, F42, F42);
4135 
4136     __ movxtod(G4,F56);
4137     __ movxtod(G5,F58);
4138     __ mov(L4,L0);
4139     __ mov(L5,L1);
4140     __ fxor(FloatRegisterImpl::D, F56, F60, F60);
4141     __ fxor(FloatRegisterImpl::D, F58, F62, F62);
4142 
4143     // For a mis-aligned store of the 32 bytes of result we can:
4144     // circular right-shift all 4 FP registers so that the 'head' and 'tail'
4145     // parts that need to be stored starting at the mis-aligned address end up in one FP reg;
4146     // the other 3 FP regs can then be stored using regular 8-byte stores,
4147     // and the edge + partial-store mechanism is used to store the 'head' and 'tail' parts.
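         // Illustrative sketch (not generated code), for destination offset a = to & 7, a != 0:
         // after alignaddr sets the shift to 8 - a, the four faligndata ops circularly shift the
         // 32-byte result right by a bytes, leaving the 'head' (first 8 - a output bytes) and the
         // 'tail' (last a output bytes) together in one FP register; that register is stored
         // twice with stpartialf, using the edge8n mask and its complement, while the other
         // three registers are written with ordinary 8-byte stf stores.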
4148 
4149     // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
4150     __ andcc(to, 7, G1);
4151     __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks128);
4152     __ delayed()->edge8n(to, G0, G2);
4153 
4154     // aligned case: store output into the destination array
4155     __ stf(FloatRegisterImpl::D, F40, to, 0);
4156     __ stf(FloatRegisterImpl::D, F42, to, 8);
4157     __ stf(FloatRegisterImpl::D, F60, to, 16);
4158     __ stf(FloatRegisterImpl::D, F62, to, 24);
4159     __ ba_short(L_check_decrypt_loop_end128);
4160 
4161     __ BIND(L_store_misaligned_output_next2_blocks128);
4162     __ mov(8, G4);
4163     __ sub(G4, G1, G4);
4164     __ alignaddr(G4, G0, G4);
4165     __ faligndata(F40, F42, F56); // F56 can be clobbered
4166     __ faligndata(F42, F60, F42);
4167     __ faligndata(F60, F62, F60);
4168     __ faligndata(F62, F40, F40);
4169     __ mov(to, G1);
4170     __ and3(to, -8, to);
4171     __ stpartialf(to, G2, F40, Assembler::ASI_PST8_PRIMARY);
4172     __ stf(FloatRegisterImpl::D, F56, to, 8);
4173     __ stf(FloatRegisterImpl::D, F42, to, 16);
4174     __ stf(FloatRegisterImpl::D, F60, to, 24);
4175     __ add(to, 32, to);
4176     __ orn(G0, G2, G2);
4177     __ stpartialf(to, G2, F40, Assembler::ASI_PST8_PRIMARY);
4178     __ mov(G1, to);
4179 
4180     __ BIND(L_check_decrypt_loop_end128);
4181     __ add(from, 32, from);
4182     __ add(to, 32, to);
4183     __ subcc(len_reg, 32, len_reg);
4184     __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks128);
4185     __ delayed()->nop();
4186     __ ba_short(L_cbcdec_end);
4187 
4188     __ align(OptoLoopAlignment);
4189     __ BIND(L_dec_next2_blocks192);
4190     __ nop();
4191 
4192     // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
4193     __ andcc(from, 7, G0);
4194     __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks192);
4195     __ delayed()->mov(from, G1); // save original 'from' address before alignaddr
4196 
4197     // aligned case: load input into G4, G5, L4 and L5
4198     __ ldx(from,0,G4);
4199     __ ldx(from,8,G5);
4200     __ ldx(from,16,L4);
4201     __ ldx(from,24,L5);
4202     __ ba_short(L_transform_next2_blocks192);
4203 
4204     __ BIND(L_load_misaligned_next2_blocks192);
4205     __ alignaddr(from, G0, from);
4206     // F48, F50, F52, F60, F62 can be clobbered
4207     __ ldf(FloatRegisterImpl::D, from, 0, F48);
4208     __ ldf(FloatRegisterImpl::D, from, 8, F50);
4209     __ ldf(FloatRegisterImpl::D, from, 16, F60);
4210     __ ldf(FloatRegisterImpl::D, from, 24, F62);
4211     __ ldf(FloatRegisterImpl::D, from, 32, F52);
4212     __ faligndata(F48, F50, F48);
4213     __ faligndata(F50, F60, F50);
4214     __ faligndata(F60, F62, F60);
4215     __ faligndata(F62, F52, F62);
4216     __ movdtox(F48, G4);
4217     __ movdtox(F50, G5);
4218     __ movdtox(F60, L4);
4219     __ movdtox(F62, L5);
4220     __ mov(G1, from);
4221 
4222     __ BIND(L_transform_next2_blocks192);
4223     // F48:F50 used for first 16-bytes
4224     __ xor3(L2,G4,G1);
4225     __ movxtod(G1,F48);
4226     __ xor3(L3,G5,G1);
4227     __ movxtod(G1,F50);
4228 
4229     // F60:F62 used for next 16-bytes
4230     __ xor3(L2,L4,G1);
4231     __ movxtod(G1,F60);
4232     __ xor3(L3,L5,G1);
4233     __ movxtod(G1,F62);
4234 
4235     for ( int i = 46;  i >= 6; i -= 8 ) {
4236       __ aes_dround23(as_FloatRegister(i), F48, F50, F52);
4237       __ aes_dround01(as_FloatRegister(i-2), F48, F50, F54);
4238       __ aes_dround23(as_FloatRegister(i), F60, F62, F58);
4239       __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56);
4240       if (i != 6 ) {
4241         __ aes_dround23(as_FloatRegister(i-4), F54, F52, F50);
4242         __ aes_dround01(as_FloatRegister(i-6), F54, F52, F48);
4243         __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62);
4244         __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60);
4245       } else {
4246         __ aes_dround23_l(as_FloatRegister(i-4), F54, F52, F50);
4247         __ aes_dround01_l(as_FloatRegister(i-6), F54, F52, F48);
4248         __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F62);
4249         __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F60);
4250       }
4251     }
4252 
4253     __ movxtod(L0,F54);
4254     __ movxtod(L1,F52);
4255     __ fxor(FloatRegisterImpl::D, F54, F48, F48);
4256     __ fxor(FloatRegisterImpl::D, F52, F50, F50);
4257 
4258     __ movxtod(G4,F56);
4259     __ movxtod(G5,F58);
4260     __ mov(L4,L0);
4261     __ mov(L5,L1);
4262     __ fxor(FloatRegisterImpl::D, F56, F60, F60);
4263     __ fxor(FloatRegisterImpl::D, F58, F62, F62);
4264 
4265     // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
4266     __ andcc(to, 7, G1);
4267     __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks192);
4268     __ delayed()->edge8n(to, G0, G2);
4269 
4270     // aligned case: store output into the destination array
4271     __ stf(FloatRegisterImpl::D, F48, to, 0);
4272     __ stf(FloatRegisterImpl::D, F50, to, 8);
4273     __ stf(FloatRegisterImpl::D, F60, to, 16);
4274     __ stf(FloatRegisterImpl::D, F62, to, 24);
4275     __ ba_short(L_check_decrypt_loop_end192);
4276 
4277     __ BIND(L_store_misaligned_output_next2_blocks192);
4278     __ mov(8, G4);
4279     __ sub(G4, G1, G4);
4280     __ alignaddr(G4, G0, G4);
4281     __ faligndata(F48, F50, F56); // F56 can be clobbered
4282     __ faligndata(F50, F60, F50);
4283     __ faligndata(F60, F62, F60);
4284     __ faligndata(F62, F48, F48);
4285     __ mov(to, G1);
4286     __ and3(to, -8, to);
4287     __ stpartialf(to, G2, F48, Assembler::ASI_PST8_PRIMARY);
4288     __ stf(FloatRegisterImpl::D, F56, to, 8);
4289     __ stf(FloatRegisterImpl::D, F50, to, 16);
4290     __ stf(FloatRegisterImpl::D, F60, to, 24);
4291     __ add(to, 32, to);
4292     __ orn(G0, G2, G2);
4293     __ stpartialf(to, G2, F48, Assembler::ASI_PST8_PRIMARY);
4294     __ mov(G1, to);
4295 
4296     __ BIND(L_check_decrypt_loop_end192);
4297     __ add(from, 32, from);
4298     __ add(to, 32, to);
4299     __ subcc(len_reg, 32, len_reg);
4300     __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks192);
4301     __ delayed()->nop();
4302     __ ba_short(L_cbcdec_end);
4303 
4304     __ align(OptoLoopAlignment);
4305     __ BIND(L_dec_next2_blocks256);
4306     __ nop();
4307 
4308     // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
4309     __ andcc(from, 7, G0);
4310     __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks256);
4311     __ delayed()->mov(from, G1); // save original 'from' address before alignaddr
4312 
4313     // aligned case: load input into G4, G5, L4 and L5
4314     __ ldx(from,0,G4);
4315     __ ldx(from,8,G5);
4316     __ ldx(from,16,L4);
4317     __ ldx(from,24,L5);
4318     __ ba_short(L_transform_next2_blocks256);
4319 
4320     __ BIND(L_load_misaligned_next2_blocks256);
4321     __ alignaddr(from, G0, from);
4322     // F0, F2, F4, F60, F62 can be clobbered
4323     __ ldf(FloatRegisterImpl::D, from, 0, F0);
4324     __ ldf(FloatRegisterImpl::D, from, 8, F2);
4325     __ ldf(FloatRegisterImpl::D, from, 16, F60);
4326     __ ldf(FloatRegisterImpl::D, from, 24, F62);
4327     __ ldf(FloatRegisterImpl::D, from, 32, F4);
4328     __ faligndata(F0, F2, F0);
4329     __ faligndata(F2, F60, F2);
4330     __ faligndata(F60, F62, F60);
4331     __ faligndata(F62, F4, F62);
4332     __ movdtox(F0, G4);
4333     __ movdtox(F2, G5);
4334     __ movdtox(F60, L4);
4335     __ movdtox(F62, L5);
4336     __ mov(G1, from);
4337 
4338     __ BIND(L_transform_next2_blocks256);
4339     // F0:F2 used for first 16-bytes
4340     __ xor3(L2,G4,G1);
4341     __ movxtod(G1,F0);
4342     __ xor3(L3,G5,G1);
4343     __ movxtod(G1,F2);
4344 
4345     // F60:F62 used for next 16-bytes
4346     __ xor3(L2,L4,G1);
4347     __ movxtod(G1,F60);
4348     __ xor3(L3,L5,G1);
4349     __ movxtod(G1,F62);
4350 
4351     __ aes_dround23(F54, F0, F2, F4);
4352     __ aes_dround01(F52, F0, F2, F6);
4353     __ aes_dround23(F54, F60, F62, F58);
4354     __ aes_dround01(F52, F60, F62, F56);
4355     __ aes_dround23(F50, F6, F4, F2);
4356     __ aes_dround01(F48, F6, F4, F0);
4357     __ aes_dround23(F50, F56, F58, F62);
4358     __ aes_dround01(F48, F56, F58, F60);
4359     // save F48:F54 in temp registers
4360     __ movdtox(F54,G2);
4361     __ movdtox(F52,G3);
4362     __ movdtox(F50,G6);
4363     __ movdtox(F48,G1);
4364     for ( int i = 46;  i >= 14; i -= 8 ) {
4365       __ aes_dround23(as_FloatRegister(i), F0, F2, F4);
4366       __ aes_dround01(as_FloatRegister(i-2), F0, F2, F6);
4367       __ aes_dround23(as_FloatRegister(i), F60, F62, F58);
4368       __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56);
4369       __ aes_dround23(as_FloatRegister(i-4), F6, F4, F2);
4370       __ aes_dround01(as_FloatRegister(i-6), F6, F4, F0);
4371       __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62);
4372       __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60);
4373     }
4374     // init F48:F54 with F0:F6 values (original key)
4375     __ ldf(FloatRegisterImpl::D, original_key, 0, F48);
4376     __ ldf(FloatRegisterImpl::D, original_key, 8, F50);
4377     __ ldf(FloatRegisterImpl::D, original_key, 16, F52);
4378     __ ldf(FloatRegisterImpl::D, original_key, 24, F54);
4379     __ aes_dround23(F54, F0, F2, F4);
4380     __ aes_dround01(F52, F0, F2, F6);
4381     __ aes_dround23(F54, F60, F62, F58);
4382     __ aes_dround01(F52, F60, F62, F56);
4383     __ aes_dround23_l(F50, F6, F4, F2);
4384     __ aes_dround01_l(F48, F6, F4, F0);
4385     __ aes_dround23_l(F50, F56, F58, F62);
4386     __ aes_dround01_l(F48, F56, F58, F60);
4387     // re-init F48:F54 with their original values
4388     __ movxtod(G2,F54);
4389     __ movxtod(G3,F52);
4390     __ movxtod(G6,F50);
4391     __ movxtod(G1,F48);
4392 
4393     __ movxtod(L0,F6);
4394     __ movxtod(L1,F4);
4395     __ fxor(FloatRegisterImpl::D, F6, F0, F0);
4396     __ fxor(FloatRegisterImpl::D, F4, F2, F2);
4397 
4398     __ movxtod(G4,F56);
4399     __ movxtod(G5,F58);
4400     __ mov(L4,L0);
4401     __ mov(L5,L1);
4402     __ fxor(FloatRegisterImpl::D, F56, F60, F60);
4403     __ fxor(FloatRegisterImpl::D, F58, F62, F62);
4404 
4405     // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
4406     __ andcc(to, 7, G1);
4407     __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks256);
4408     __ delayed()->edge8n(to, G0, G2);
4409 
4410     // aligned case: store output into the destination array
4411     __ stf(FloatRegisterImpl::D, F0, to, 0);
4412     __ stf(FloatRegisterImpl::D, F2, to, 8);
4413     __ stf(FloatRegisterImpl::D, F60, to, 16);
4414     __ stf(FloatRegisterImpl::D, F62, to, 24);
4415     __ ba_short(L_check_decrypt_loop_end256);
4416 
4417     __ BIND(L_store_misaligned_output_next2_blocks256);
4418     __ mov(8, G4);
4419     __ sub(G4, G1, G4);
4420     __ alignaddr(G4, G0, G4);
4421     __ faligndata(F0, F2, F56); // F56 can be clobbered
4422     __ faligndata(F2, F60, F2);
4423     __ faligndata(F60, F62, F60);
4424     __ faligndata(F62, F0, F0);
4425     __ mov(to, G1);
4426     __ and3(to, -8, to);
4427     __ stpartialf(to, G2, F0, Assembler::ASI_PST8_PRIMARY);
4428     __ stf(FloatRegisterImpl::D, F56, to, 8);
4429     __ stf(FloatRegisterImpl::D, F2, to, 16);
4430     __ stf(FloatRegisterImpl::D, F60, to, 24);
4431     __ add(to, 32, to);
4432     __ orn(G0, G2, G2);
4433     __ stpartialf(to, G2, F0, Assembler::ASI_PST8_PRIMARY);
4434     __ mov(G1, to);
4435 
4436     __ BIND(L_check_decrypt_loop_end256);
4437     __ add(from, 32, from);
4438     __ add(to, 32, to);
4439     __ subcc(len_reg, 32, len_reg);
4440     __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks256);
4441     __ delayed()->nop();
4442 
4443     __ BIND(L_cbcdec_end);
4444     // re-init initial vector for next block, 8-byte alignment is guaranteed
4445     __ stx(L0, rvec, 0);
4446     __ stx(L1, rvec, 8);
4447     __ mov(L7, I0);
4448     __ ret();
4449     __ delayed()->restore();
4450 
4451     return start;
4452   }
4453 
4454   address generate_sha1_implCompress(bool multi_block, const char *name) {
4455     __ align(CodeEntryAlignment);
4456     StubCodeMark mark(this, "StubRoutines", name);
4457     address start = __ pc();
4458 
4459     Label L_sha1_loop, L_sha1_unaligned_input, L_sha1_unaligned_input_loop;
4460     int i;
4461 
4462     Register buf   = O0; // byte[] source+offset
4463     Register state = O1; // int[]  SHA.state
4464     Register ofs   = O2; // int    offset
4465     Register limit = O3; // int    limit
4466 
4467     // load state into F0-F4
4468     for (i = 0; i < 5; i++) {
4469       __ ldf(FloatRegisterImpl::S, state, i*4, as_FloatRegister(i));
4470     }
4471 
4472     __ andcc(buf, 7, G0);
4473     __ br(Assembler::notZero, false, Assembler::pn, L_sha1_unaligned_input);
4474     __ delayed()->nop();
4475 
4476     __ BIND(L_sha1_loop);
4477     // load buf into F8-F22
4478     for (i = 0; i < 8; i++) {
4479       __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 8));
4480     }
4481     __ sha1();
4482     if (multi_block) {
4483       __ add(ofs, 64, ofs);
4484       __ add(buf, 64, buf);
4485       __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha1_loop);
4486       __ mov(ofs, O0); // to be returned
4487     }
4488 
4489     // store F0-F4 into state and return
4490     for (i = 0; i < 4; i++) {
4491       __ stf(FloatRegisterImpl::S, as_FloatRegister(i), state, i*4);
4492     }
4493     __ retl();
4494     __ delayed()->stf(FloatRegisterImpl::S, F4, state, 0x10);
4495 
4496     __ BIND(L_sha1_unaligned_input);
4497     __ alignaddr(buf, G0, buf);
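         // Note (descriptive only): alignaddr rounds 'buf' down to an 8-byte boundary and
         // records the original low-order address bits in GSR.align; the loop below loads
         // one extra doubleword and merges adjacent pairs with faligndata to reconstruct
         // the 64 input bytes as if they had been read from the unaligned address.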
4498 
4499     __ BIND(L_sha1_unaligned_input_loop);
4500     // load buf into F8-F24 (one extra doubleword for re-alignment)
4501     for (i = 0; i < 9; i++) {
4502       __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 8));
4503     }
4504     for (i = 0; i < 8; i++) {
4505       __ faligndata(as_FloatRegister(i*2 + 8), as_FloatRegister(i*2 + 10), as_FloatRegister(i*2 + 8));
4506     }
4507     __ sha1();
4508     if (multi_block) {
4509       __ add(ofs, 64, ofs);
4510       __ add(buf, 64, buf);
4511       __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha1_unaligned_input_loop);
4512       __ mov(ofs, O0); // to be returned
4513     }
4514 
4515     // store F0-F4 into state and return
4516     for (i = 0; i < 4; i++) {
4517       __ stf(FloatRegisterImpl::S, as_FloatRegister(i), state, i*4);
4518     }
4519     __ retl();
4520     __ delayed()->stf(FloatRegisterImpl::S, F4, state, 0x10);
4521 
4522     return start;
4523   }
4524 
4525   address generate_sha256_implCompress(bool multi_block, const char *name) {
4526     __ align(CodeEntryAlignment);
4527     StubCodeMark mark(this, "StubRoutines", name);
4528     address start = __ pc();
4529 
4530     Label L_sha256_loop, L_sha256_unaligned_input, L_sha256_unaligned_input_loop;
4531     int i;
4532 
4533     Register buf   = O0; // byte[] source+offset
4534     Register state = O1; // int[]  SHA2.state
4535     Register ofs   = O2; // int    offset
4536     Register limit = O3; // int    limit
4537 
4538     // load state into F0-F7
4539     for (i = 0; i < 8; i++) {
4540       __ ldf(FloatRegisterImpl::S, state, i*4, as_FloatRegister(i));
4541     }
4542 
4543     __ andcc(buf, 7, G0);
4544     __ br(Assembler::notZero, false, Assembler::pn, L_sha256_unaligned_input);
4545     __ delayed()->nop();
4546 
4547     __ BIND(L_sha256_loop);
4548     // load buf into F8-F22
4549     for (i = 0; i < 8; i++) {
4550       __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 8));
4551     }
4552     __ sha256();
4553     if (multi_block) {
4554       __ add(ofs, 64, ofs);
4555       __ add(buf, 64, buf);
4556       __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha256_loop);
4557       __ mov(ofs, O0); // to be returned
4558     }
4559 
4560     // store F0-F7 into state and return
4561     for (i = 0; i < 7; i++) {
4562       __ stf(FloatRegisterImpl::S, as_FloatRegister(i), state, i*4);
4563     }
4564     __ retl();
4565     __ delayed()->stf(FloatRegisterImpl::S, F7, state, 0x1c);
4566 
4567     __ BIND(L_sha256_unaligned_input);
4568     __ alignaddr(buf, G0, buf);
4569 
4570     __ BIND(L_sha256_unaligned_input_loop);
4571     // load buf into F8-F24 (one extra doubleword for re-alignment)
4572     for (i = 0; i < 9; i++) {
4573       __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 8));
4574     }
4575     for (i = 0; i < 8; i++) {
4576       __ faligndata(as_FloatRegister(i*2 + 8), as_FloatRegister(i*2 + 10), as_FloatRegister(i*2 + 8));
4577     }
4578     __ sha256();
4579     if (multi_block) {
4580       __ add(ofs, 64, ofs);
4581       __ add(buf, 64, buf);
4582       __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha256_unaligned_input_loop);
4583       __ mov(ofs, O0); // to be returned
4584     }
4585 
4586     // store F0-F7 into state and return
4587     for (i = 0; i < 7; i++) {
4588       __ stf(FloatRegisterImpl::S, as_FloatRegister(i), state, i*4);
4589     }
4590     __ retl();
4591     __ delayed()->stf(FloatRegisterImpl::S, F7, state, 0x1c);
4592 
4593     return start;
4594   }
4595 
4596   address generate_sha512_implCompress(bool multi_block, const char *name) {
4597     __ align(CodeEntryAlignment);
4598     StubCodeMark mark(this, "StubRoutines", name);
4599     address start = __ pc();
4600 
4601     Label L_sha512_loop, L_sha512_unaligned_input, L_sha512_unaligned_input_loop;
4602     int i;
4603 
4604     Register buf   = O0; // byte[] source+offset
4605     Register state = O1; // long[] SHA5.state
4606     Register ofs   = O2; // int    offset
4607     Register limit = O3; // int    limit
4608 
4609     // load state into F0-F14
4610     for (i = 0; i < 8; i++) {
4611       __ ldf(FloatRegisterImpl::D, state, i*8, as_FloatRegister(i*2));
4612     }
4613 
4614     __ andcc(buf, 7, G0);
4615     __ br(Assembler::notZero, false, Assembler::pn, L_sha512_unaligned_input);
4616     __ delayed()->nop();
4617 
4618     __ BIND(L_sha512_loop);
4619     // load buf into F16-F46
4620     for (i = 0; i < 16; i++) {
4621       __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 16));
4622     }
4623     __ sha512();
4624     if (multi_block) {
4625       __ add(ofs, 128, ofs);
4626       __ add(buf, 128, buf);
4627       __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha512_loop);
4628       __ mov(ofs, O0); // to be returned
4629     }
4630 
4631     // store F0-F14 into state and return
4632     for (i = 0; i < 7; i++) {
4633       __ stf(FloatRegisterImpl::D, as_FloatRegister(i*2), state, i*8);
4634     }
4635     __ retl();
4636     __ delayed()->stf(FloatRegisterImpl::D, F14, state, 0x38);
4637 
4638     __ BIND(L_sha512_unaligned_input);
4639     __ alignaddr(buf, G0, buf);
4640 
4641     __ BIND(L_sha512_unaligned_input_loop);
4642     // load buf into F16-F48 (one extra doubleword for re-alignment)
4643     for (i = 0; i < 17; i++) {
4644       __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 16));
4645     }
4646     for (i = 0; i < 16; i++) {
4647       __ faligndata(as_FloatRegister(i*2 + 16), as_FloatRegister(i*2 + 18), as_FloatRegister(i*2 + 16));
4648     }
4649     __ sha512();
4650     if (multi_block) {
4651       __ add(ofs, 128, ofs);
4652       __ add(buf, 128, buf);
4653       __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha512_unaligned_input_loop);
4654       __ mov(ofs, O0); // to be returned
4655     }
4656 
4657     // store F0-F14 into state and return
4658     for (i = 0; i < 7; i++) {
4659       __ stf(FloatRegisterImpl::D, as_FloatRegister(i*2), state, i*8);
4660     }
4661     __ retl();
4662     __ delayed()->stf(FloatRegisterImpl::D, F14, state, 0x38);
4663 
4664     return start;
4665   }
4666 
4667   /* Single and multi-block ghash operations */
4668   address generate_ghash_processBlocks() {
4669       __ align(CodeEntryAlignment);
4670       Label L_ghash_loop, L_aligned, L_main;
4671       StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4672       address start = __ pc();
4673 
4674       Register state = I0;
4675       Register subkeyH = I1;
4676       Register data = I2;
4677       Register len = I3;
4678 
4679       __ save_frame(0);
4680 
4681       __ ldx(state, 0, O0);
4682       __ ldx(state, 8, O1);
4683 
4684       // Loop label for multiblock operations
4685       __ BIND(L_ghash_loop);
4686 
4687       // Check if 'data' is unaligned
4688       __ andcc(data, 7, G1);
4689       __ br(Assembler::zero, false, Assembler::pt, L_aligned);
4690       __ delayed()->nop();
4691 
4692       Register left_shift = L1;
4693       Register right_shift = L2;
4694       Register data_ptr = L3;
4695 
4696       // Get left and right shift values in bits
4697       __ sll(G1, LogBitsPerByte, left_shift);
4698       __ mov(64, right_shift);
4699       __ sub(right_shift, left_shift, right_shift);
4700 
4701       // Align to read 'data'
4702       __ sub(data, G1, data_ptr);
4703 
4704       // Load first 8 bytes of 'data'
4705       __ ldx(data_ptr, 0, O4);
4706       __ sllx(O4, left_shift, O4);
4707       __ ldx(data_ptr, 8, O5);
4708       __ srlx(O5, right_shift, G4);
4709       __ bset(G4, O4);
4710 
4711       // Load second 8 bytes of 'data'
4712       __ sllx(O5, left_shift, O5);
4713       __ ldx(data_ptr, 16, G4);
4714       __ srlx(G4, right_shift, G4);
4715       __ ba(L_main);
4716       __ delayed()->bset(G4, O5);
4717 
4718       // If 'data' is aligned, load normally
4719       __ BIND(L_aligned);
4720       __ ldx(data, 0, O4);
4721       __ ldx(data, 8, O5);
4722 
4723       __ BIND(L_main);
4724       __ ldx(subkeyH, 0, O2);
4725       __ ldx(subkeyH, 8, O3);
4726 
4727       __ xor3(O0, O4, O0);
4728       __ xor3(O1, O5, O1);
4729 
4730       __ xmulxhi(O0, O3, G3);
4731       __ xmulx(O0, O2, O5);
4732       __ xmulxhi(O1, O2, G4);
4733       __ xmulxhi(O1, O3, G5);
4734       __ xmulx(O0, O3, G1);
4735       __ xmulx(O1, O3, G2);
4736       __ xmulx(O1, O2, O3);
4737       __ xmulxhi(O0, O2, O4);
4738 
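           // 0xE1 << 56 is the standard GHASH reduction constant (the GCM field polynomial
           // x^128 + x^7 + x^2 + x + 1 in GCM's reflected bit order), used below to fold the
           // carry-less product back into 128 bits.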
4739       __ mov(0xE1, O0);
4740       __ sllx(O0, 56, O0);
4741 
4742       __ xor3(O5, G3, O5);
4743       __ xor3(O5, G4, O5);
4744       __ xor3(G5, G1, G1);
4745       __ xor3(G1, O3, G1);
4746       __ srlx(G2, 63, O1);
4747       __ srlx(G1, 63, G3);
4748       __ sllx(G2, 63, O3);
4749       __ sllx(G2, 58, O2);
4750       __ xor3(O3, O2, O2);
4751 
4752       __ sllx(G1, 1, G1);
4753       __ or3(G1, O1, G1);
4754 
4755       __ xor3(G1, O2, G1);
4756 
4757       __ sllx(G2, 1, G2);
4758 
4759       __ xmulxhi(G1, O0, O1);
4760       __ xmulx(G1, O0, O2);
4761       __ xmulxhi(G2, O0, O3);
4762       __ xmulx(G2, O0, G1);
4763 
4764       __ xor3(O4, O1, O4);
4765       __ xor3(O5, O2, O5);
4766       __ xor3(O5, O3, O5);
4767 
4768       __ sllx(O4, 1, O2);
4769       __ srlx(O5, 63, O3);
4770 
4771       __ or3(O2, O3, O0);
4772 
4773       __ sllx(O5, 1, O1);
4774       __ srlx(G1, 63, O2);
4775       __ or3(O1, O2, O1);
4776       __ xor3(O1, G3, O1);
4777 
4778       __ deccc(len);
4779       __ br(Assembler::notZero, true, Assembler::pt, L_ghash_loop);
4780       __ delayed()->add(data, 16, data);
4781 
4782       __ stx(O0, I0, 0);
4783       __ stx(O1, I0, 8);
4784 
4785       __ ret();
4786       __ delayed()->restore();
4787 
4788       return start;
4789   }
4790 
4791   /**
4792    *  Arguments:
4793    *
4794    * Inputs:
4795    *   O0   - int   crc
4796    *   O1   - byte* buf
4797    *   O2   - int   len
4798    *   O3   - int*  table
4799    *
4800    * Output:
4801    *   O0   - int crc result
4802    */
4803   address generate_updateBytesCRC32C() {
4804     assert(UseCRC32CIntrinsics, "need CRC32C instruction");
4805 
4806     __ align(CodeEntryAlignment);
4807     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
4808     address start = __ pc();
4809 
4810     const Register crc   = O0;  // crc
4811     const Register buf   = O1;  // source java byte array address
4812     const Register len   = O2;  // number of bytes
4813     const Register table = O3;  // byteTable
4814 
4815     __ kernel_crc32c(crc, buf, len, table);
4816 
4817     __ retl();
4818     __ delayed()->nop();
4819 
4820     return start;
4821   }
4822 
4823 #define ADLER32_NUM_TEMPS 16
4824 
4825   /**
4826    *  Arguments:
4827    *
4828    * Inputs:
4829    *   O0   - int   adler
4830    *   O1   - byte* buff
4831    *   O2   - int   len
4832    *
4833    * Output:
4834    *   O0   - int adler result
4835    */
4836   address generate_updateBytesAdler32() {
4837     __ align(CodeEntryAlignment);
4838     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
4839     address start = __ pc();
4840 
4841     Label L_cleanup_loop, L_cleanup_loop_check;
4842     Label L_main_loop_check, L_main_loop, L_inner_loop, L_inner_loop_check;
4843     Label L_nmax_check_done;
4844 
4845     // Aliases
4846     Register s1     = O0;
4847     Register s2     = O3;
4848     Register buff   = O1;
4849     Register len    = O2;
4850     Register temp[ADLER32_NUM_TEMPS] = {L0, L1, L2, L3, L4, L5, L6, L7, I0, I1, I2, I3, I4, I5, G3, I7};
4851 
4852     // Max number of bytes we can process before having to take the mod
4853     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
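         // (Worked check: for n = 5552, 255*5552*5553/2 + 5553*65520 = 4294690200 <= 2^32-1,
         //  while n = 5553 already exceeds it.)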
4854     unsigned long NMAX = 0x15B0;
4855 
4856     // Zero-out the upper bits of len
4857     __ clruwu(len);
4858 
4859     // Create the mask 0xFFFF
4860     __ set64(0x00FFFF, O4, O5); // O5 is the temp register
4861 
4862     // s1 is initialized to the lower 16 bits of adler
4863     // s2 is initialized to the upper 16 bits of adler
4864     __ srlx(O0, 16, O5); // adler >> 16
4865     __ and3(O0, O4, s1); // s1  = (adler & 0xFFFF)
4866     __ and3(O5, O4, s2); // s2  = ((adler >> 16) & 0xFFFF)
4867 
4868     // The pipelined loop needs at least 16 elements for one iteration.
4869     // It checks this itself, but it is more efficient to skip straight to the cleanup loop.
4870     // Set up the constant for the cutoff check.
4871     __ mov(15, O4);
4872 
4873     // Check if we are above the cutoff, if not go to the cleanup loop immediately
4874     __ cmp_and_br_short(len, O4, Assembler::lessEqualUnsigned, Assembler::pt, L_cleanup_loop_check);
4875 
4876     // Free up some registers for our use
4877     for (int i = 0; i < ADLER32_NUM_TEMPS; i++) {
4878       __ movxtod(temp[i], as_FloatRegister(2*i));
4879     }
4880 
4881     // Loop maintenance stuff is done at the end of the loop, so skip to there
4882     __ ba_short(L_main_loop_check);
4883 
4884     __ BIND(L_main_loop);
4885 
4886     // Prologue for inner loop
4887     __ ldub(buff, 0, L0);
4888     __ dec(O5);
4889 
4890     for (int i = 1; i < 8; i++) {
4891       __ ldub(buff, i, temp[i]);
4892     }
4893 
4894     __ inc(buff, 8);
4895 
4896     // Inner loop processes 16 elements at a time; it might never execute if only 16 elements
4897     // are left to be processed by the outer loop
4898     __ ba_short(L_inner_loop_check);
4899 
4900     __ BIND(L_inner_loop);
4901 
4902     for (int i = 0; i < 8; i++) {
4903       __ ldub(buff, (2*i), temp[(8+(2*i)) % ADLER32_NUM_TEMPS]);
4904       __ add(s1, temp[i], s1);
4905       __ ldub(buff, (2*i)+1, temp[(8+(2*i)+1) % ADLER32_NUM_TEMPS]);
4906       __ add(s2, s1, s2);
4907     }
4908 
4909     // Original temp 0-7 used and new loads to temp 0-7 issued
4910     // temp 8-15 ready to be consumed
4911     __ add(s1, I0, s1);
4912     __ dec(O5);
4913     __ add(s2, s1, s2);
4914     __ add(s1, I1, s1);
4915     __ inc(buff, 16);
4916     __ add(s2, s1, s2);
4917 
4918     for (int i = 0; i < 6; i++) {
4919       __ add(s1, temp[10+i], s1);
4920       __ add(s2, s1, s2);
4921     }
4922 
4923     __ BIND(L_inner_loop_check);
4924     __ nop();
4925     __ cmp_and_br_short(O5, 0, Assembler::notEqual, Assembler::pt, L_inner_loop);
4926 
4927     // Epilogue
4928     for (int i = 0; i < 4; i++) {
4929       __ ldub(buff, (2*i), temp[8+(2*i)]);
4930       __ add(s1, temp[i], s1);
4931       __ ldub(buff, (2*i)+1, temp[8+(2*i)+1]);
4932       __ add(s2, s1, s2);
4933     }
4934 
4935     __ add(s1, temp[4], s1);
4936     __ inc(buff, 8);
4937 
4938     for (int i = 0; i < 11; i++) {
4939       __ add(s2, s1, s2);
4940       __ add(s1, temp[5+i], s1);
4941     }
4942 
4943     __ add(s2, s1, s2);
4944 
4945     // Take the mod for s1 and s2
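         // 0xFFF1 = 65521 is the Adler-32 modulus (the largest prime below 2^16); the
         // remainder is computed as s - (s / 65521) * 65521 via udivx/mulx/sub.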
4946     __ set64(0xFFF1, L0, L1);
4947     __ udivx(s1, L0, L1);
4948     __ udivx(s2, L0, L2);
4949     __ mulx(L0, L1, L1);
4950     __ mulx(L0, L2, L2);
4951     __ sub(s1, L1, s1);
4952     __ sub(s2, L2, s2);
4953 
4954     // Make sure there is something left to process
4955     __ BIND(L_main_loop_check);
4956     __ set64(NMAX, L0, L1);
4957     // k = len < NMAX ? len : NMAX
4958     __ cmp_and_br_short(len, L0, Assembler::greaterEqualUnsigned, Assembler::pt, L_nmax_check_done);
4959     __ andn(len, 0x0F, L0); // only loop a multiple of 16 times
4960     __ BIND(L_nmax_check_done);
4961     __ mov(L0, O5);
4962     __ sub(len, L0, len); // len -= k
4963 
4964     __ srlx(O5, 4, O5); // k / 16, i.e. the number of 16-element chunks
4965     __ cmp_and_br_short(O5, 0, Assembler::notEqual, Assembler::pt, L_main_loop);
4966 
4967     // Restore anything we used, take the mod one last time, combine and return
4968     // Restore any registers we saved
4969     for (int i = 0; i < ADLER32_NUM_TEMPS; i++) {
4970       __ movdtox(as_FloatRegister(2*i), temp[i]);
4971     }
4972 
4973     // There might be nothing left to process
4974     __ ba_short(L_cleanup_loop_check);
4975 
4976     __ BIND(L_cleanup_loop);
4977     __ ldub(buff, 0, O4); // load single byte from buffer
4978     __ inc(buff); // buff++
4979     __ add(s1, O4, s1); // s1 += *buff++;
4980     __ dec(len); // len--
4981     __ add(s1, s2, s2); // s2 += s1;
4982     __ BIND(L_cleanup_loop_check);
4983     __ nop();
4984     __ cmp_and_br_short(len, 0, Assembler::notEqual, Assembler::pt, L_cleanup_loop);
4985 
4986     // Take the mod one last time
4987     __ set64(0xFFF1, O1, O2);
4988     __ udivx(s1, O1, O2);
4989     __ udivx(s2, O1, O5);
4990     __ mulx(O1, O2, O2);
4991     __ mulx(O1, O5, O5);
4992     __ sub(s1, O2, s1);
4993     __ sub(s2, O5, s2);
4994 
4995     // Combine lower bits and higher bits
4996     __ sllx(s2, 16, s2); // s2 = s2 << 16
4997     __ or3(s1, s2, s1);  // adler = s2 | s1
4998     // Final return value is in O0
4999     __ retl();
5000     __ delayed()->nop();
5001 
5002     return start;
5003   }
5004 
5005   /**
5006    *  Arguments:
5007    *
5008    * Inputs:
5009    *   O0   - int   crc
5010    *   O1   - byte* buf
5011    *   O2   - int   len
5012    *   O3   - int*  table
5013    *
5014    * Output:
5015    *   O0   - int crc result
5016    */
5017   address generate_updateBytesCRC32() {
5018     assert(UseCRC32Intrinsics, "need VIS3 instructions");
5019 
5020     __ align(CodeEntryAlignment);
5021     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
5022     address start = __ pc();
5023 
5024     const Register crc   = O0; // crc
5025     const Register buf   = O1; // source java byte array address
5026     const Register len   = O2; // length
5027     const Register table = O3; // crc_table address (reuse register)
5028 
5029     __ kernel_crc32(crc, buf, len, table);
5030 
5031     __ retl();
5032     __ delayed()->nop();
5033 
5034     return start;
5035   }
5036 
5037   /**
5038    * Arguments:
5039    *
5040    * Inputs:
5041    *   I0   - int* x-addr
5042    *   I1   - int  x-len
5043    *   I2   - int* y-addr
5044    *   I3   - int  y-len
5045    *   I4   - int* z-addr   (output vector)
5046    *   I5   - int  z-len
5047    */
5048   address generate_multiplyToLen() {
5049     assert(UseMultiplyToLenIntrinsic, "need VIS3 instructions");
5050 
5051     __ align(CodeEntryAlignment);
5052     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
5053     address start = __ pc();
5054 
5055     __ save_frame(0);
5056 
5057     const Register xptr = I0; // input address
5058     const Register xlen = I1; // ...and length in 32b-words
5059     const Register yptr = I2; // input address
5060     const Register ylen = I3; // ...and length in 32b-words
5061     const Register zptr = I4; // output address
5062     const Register zlen = I5; // ...and length in 32b-words
5063 
5064     /* The minimal "limb" representation suggests that odd-length vectors are as
5065      * likely as even-length ones. This in turn suggests that we need to cope
5066      * with odd/even length arrays and data not aligned properly for 64-bit read
5067      * and write operations. We thus use a number of different kernels:
5068      *
5069      *   if (is_even(x.len) && is_even(y.len))
5070      *      if (is_align64(x) && is_align64(y) && is_align64(z))
5071      *         if (x.len == y.len && 16 <= x.len && x.len <= 64)
5072      *            memv_mult_mpmul(...)
5073      *         else
5074      *            memv_mult_64x64(...)
5075      *      else
5076      *         memv_mult_64x64u(...)
5077      *   else
5078      *      memv_mult_32x32(...)
5079      *
5080      * Here we assume VIS3 support (for 'umulxhi', 'addxc' and 'addxccc').
5081      * In case CBCOND instructions are supported, we will use 'cxbX'. If the
5082      * MPMUL instruction is supported, we will generate a kernel using 'mpmul'
5083      * (for vectors with proper characteristics).
5084      */
5085     const Register tmp0 = L0;
5086     const Register tmp1 = L1;
5087 
5088     Label L_mult_32x32;
5089     Label L_mult_64x64u;
5090     Label L_mult_64x64;
5091     Label L_exit;
5092 
5093     if_both_even(xlen, ylen, tmp0, false, L_mult_32x32);
5094     if_all3_aligned(xptr, yptr, zptr, tmp1, 64, false, L_mult_64x64u);
5095 
5096     if (UseMPMUL) {
5097       if_eq(xlen, ylen, false, L_mult_64x64);
5098       if_in_rng(xlen, 16, 64, tmp0, tmp1, false, L_mult_64x64);
5099 
5100       // 1. Multiply naturally aligned 64b-datums using a generic 'mpmul' kernel,
5101       //    operating on equal length vectors of size [16..64].
5102       gen_mult_mpmul(xlen, xptr, yptr, zptr, L_exit);
5103     }
5104 
5105     // 2. Multiply naturally aligned 64-bit datums (64x64).
5106     __ bind(L_mult_64x64);
5107     gen_mult_64x64(xptr, xlen, yptr, ylen, zptr, zlen, L_exit);
5108 
5109     // 3. Multiply unaligned 64-bit datums (64x64).
5110     __ bind(L_mult_64x64u);
5111     gen_mult_64x64_unaligned(xptr, xlen, yptr, ylen, zptr, zlen, L_exit);
5112 
5113     // 4. Multiply naturally aligned 32-bit datums (32x32).
5114     __ bind(L_mult_32x32);
5115     gen_mult_32x32(xptr, xlen, yptr, ylen, zptr, zlen, L_exit);
5116 
5117     __ bind(L_exit);
5118     __ ret();
5119     __ delayed()->restore();
5120 
5121     return start;
5122   }
5123 
5124   // Additional helper functions used by multiplyToLen generation.
5125 
5126   void if_both_even(Register r1, Register r2, Register tmp, bool iseven, Label &L)
5127   {
5128     __ or3(r1, r2, tmp);
5129     __ andcc(tmp, 0x1, tmp);
5130     __ br_icc_zero(iseven, Assembler::pn, L);
5131   }
5132 
5133   void if_all3_aligned(Register r1, Register r2, Register r3,
5134                        Register tmp, uint align, bool isalign, Label &L)
5135   {
5136     __ or3(r1, r2, tmp);
5137     __ or3(r3, tmp, tmp);
5138     __ andcc(tmp, (align - 1), tmp);
5139     __ br_icc_zero(isalign, Assembler::pn, L);
5140   }
5141 
5142   void if_eq(Register x, Register y, bool iseq, Label &L)
5143   {
5144     Assembler::Condition cf = (iseq ? Assembler::equal : Assembler::notEqual);
5145     __ cmp_and_br_short(x, y, cf, Assembler::pt, L);
5146   }
5147 
5148   void if_in_rng(Register x, int lb, int ub, Register t1, Register t2, bool inrng, Label &L)
5149   {
5150     assert(Assembler::is_simm13(lb), "Small ints only!");
5151     assert(Assembler::is_simm13(ub), "Small ints only!");
5152     // Compute (x - lb) * (ub - x) >= 0
5153     // NOTE: With the local use of this routine, we rely on small integers to
5154     //       guarantee that we do not overflow in the multiplication.
5155     __ add(G0, ub, t2);
5156     __ sub(x, lb, t1);
5157     __ sub(t2, x, t2);
5158     __ mulx(t1, t2, t1);
5159     Assembler::Condition cf = (inrng ? Assembler::greaterEqual : Assembler::less);
5160     __ cmp_and_br_short(t1, G0, cf, Assembler::pt, L);
5161   }
5162 
5163   void ldd_entry(Register base, Register offs, FloatRegister dest)
5164   {
5165     __ ldd(base, offs, dest);
5166     __ inc(offs, 8);
5167   }
5168 
5169   void ldx_entry(Register base, Register offs, Register dest)
5170   {
5171     __ ldx(base, offs, dest);
5172     __ inc(offs, 8);
5173   }
5174 
5175   void mpmul_entry(int m, Label &next)
5176   {
5177     __ mpmul(m);
5178     __ cbcond(Assembler::equal, Assembler::icc, G0, G0, next);
5179   }
5180 
5181   void stx_entry(Label &L, Register r1, Register r2, Register base, Register offs)
5182   {
5183     __ bind(L);
5184     __ stx(r1, base, offs);
5185     __ inc(offs, 8);
5186     __ stx(r2, base, offs);
5187     __ inc(offs, 8);
5188   }
5189 
5190   void offs_entry(Label &Lbl0, Label &Lbl1)
5191   {
5192     assert(Lbl0.is_bound(), "must be");
5193     assert(Lbl1.is_bound(), "must be");
5194 
5195     int offset = Lbl0.loc_pos() - Lbl1.loc_pos();
5196 
5197     __ emit_data(offset);
5198   }
5199 
5200   /* Generate the actual multiplication kernels for BigInteger vectors:
5201    *
5202    *   1. gen_mult_mpmul(...)
5203    *
5204    *   2. gen_mult_64x64(...)
5205    *
5206    *   3. gen_mult_64x64_unaligned(...)
5207    *
5208    *   4. gen_mult_32x32(...)
5209    */
5210   void gen_mult_mpmul(Register len, Register xptr, Register yptr, Register zptr,
5211                       Label &L_exit)
5212   {
5213     const Register zero = G0;
5214     const Register gxp  = G1;   // Need to use global registers across RWs.
5215     const Register gyp  = G2;
5216     const Register gzp  = G3;
5217     const Register disp = G4;
5218     const Register offs = G5;
5219 
5220     __ mov(xptr, gxp);
5221     __ mov(yptr, gyp);
5222     __ mov(zptr, gzp);
5223 
5224     /* Compute jump vector entry:
5225      *
5226      *   1. mpmul input size (0..31) x 64b
5227      *   2. vector input size in 32b limbs (even number)
5228      *   3. branch entries in reverse order (31..0), using two
5229      *      instructions per entry (2 * 4 bytes).
5230      *
5231      *   displacement = byte_offset(bra_offset(len))
5232      *                = byte_offset((64 - len)/2)
5233      *                = 8 * (64 - len)/2
5234      *                = 4 * (64 - len)
5235      */
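         // Illustrative example (not generated code): len == 64 (32 doublewords) gives
         // disp == 0, so all 32 load entries below execute; len == 16 gives
         // disp == 4 * (64 - 16) == 192, skipping the first 24 entries (each entry is two
         // 4-byte instructions) and executing only the last 8.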
5236     Register temp = I5;         // Alright to use input regs. in first batch.
5237 
5238     __ sub(zero, len, temp);
5239     __ add(temp, 64, temp);
5240     __ sllx(temp, 2, disp);     // disp := (64 - len) << 2
5241 
5242     // Dispatch relative current PC, into instruction table below.
5243     __ rdpc(temp);
5244     __ add(temp, 16, temp);
5245     __ jmp(temp, disp);
5246     __ delayed()->clr(offs);
5247 
5248     ldd_entry(gxp, offs, F22);
5249     ldd_entry(gxp, offs, F20);
5250     ldd_entry(gxp, offs, F18);
5251     ldd_entry(gxp, offs, F16);
5252     ldd_entry(gxp, offs, F14);
5253     ldd_entry(gxp, offs, F12);
5254     ldd_entry(gxp, offs, F10);
5255     ldd_entry(gxp, offs, F8);
5256     ldd_entry(gxp, offs, F6);
5257     ldd_entry(gxp, offs, F4);
5258     ldx_entry(gxp, offs, I5);
5259     ldx_entry(gxp, offs, I4);
5260     ldx_entry(gxp, offs, I3);
5261     ldx_entry(gxp, offs, I2);
5262     ldx_entry(gxp, offs, I1);
5263     ldx_entry(gxp, offs, I0);
5264     ldx_entry(gxp, offs, L7);
5265     ldx_entry(gxp, offs, L6);
5266     ldx_entry(gxp, offs, L5);
5267     ldx_entry(gxp, offs, L4);
5268     ldx_entry(gxp, offs, L3);
5269     ldx_entry(gxp, offs, L2);
5270     ldx_entry(gxp, offs, L1);
5271     ldx_entry(gxp, offs, L0);
5272     ldd_entry(gxp, offs, F2);
5273     ldd_entry(gxp, offs, F0);
5274     ldx_entry(gxp, offs, O5);
5275     ldx_entry(gxp, offs, O4);
5276     ldx_entry(gxp, offs, O3);
5277     ldx_entry(gxp, offs, O2);
5278     ldx_entry(gxp, offs, O1);
5279     ldx_entry(gxp, offs, O0);
5280 
5281     __ save(SP, -176, SP);
5282 
5283     const Register addr = gxp;  // Alright to reuse 'gxp'.
5284 
5285     // Dispatch relative current PC, into instruction table below.
5286     __ rdpc(addr);
5287     __ add(addr, 16, addr);
5288     __ jmp(addr, disp);
5289     __ delayed()->clr(offs);
5290 
5291     ldd_entry(gyp, offs, F58);
5292     ldd_entry(gyp, offs, F56);
5293     ldd_entry(gyp, offs, F54);
5294     ldd_entry(gyp, offs, F52);
5295     ldd_entry(gyp, offs, F50);
5296     ldd_entry(gyp, offs, F48);
5297     ldd_entry(gyp, offs, F46);
5298     ldd_entry(gyp, offs, F44);
5299     ldd_entry(gyp, offs, F42);
5300     ldd_entry(gyp, offs, F40);
5301     ldd_entry(gyp, offs, F38);
5302     ldd_entry(gyp, offs, F36);
5303     ldd_entry(gyp, offs, F34);
5304     ldd_entry(gyp, offs, F32);
5305     ldd_entry(gyp, offs, F30);
5306     ldd_entry(gyp, offs, F28);
5307     ldd_entry(gyp, offs, F26);
5308     ldd_entry(gyp, offs, F24);
5309     ldx_entry(gyp, offs, O5);
5310     ldx_entry(gyp, offs, O4);
5311     ldx_entry(gyp, offs, O3);
5312     ldx_entry(gyp, offs, O2);
5313     ldx_entry(gyp, offs, O1);
5314     ldx_entry(gyp, offs, O0);
5315     ldx_entry(gyp, offs, L7);
5316     ldx_entry(gyp, offs, L6);
5317     ldx_entry(gyp, offs, L5);
5318     ldx_entry(gyp, offs, L4);
5319     ldx_entry(gyp, offs, L3);
5320     ldx_entry(gyp, offs, L2);
5321     ldx_entry(gyp, offs, L1);
5322     ldx_entry(gyp, offs, L0);
5323 
5324     __ save(SP, -176, SP);
5325     __ save(SP, -176, SP);
5326     __ save(SP, -176, SP);
5327     __ save(SP, -176, SP);
5328     __ save(SP, -176, SP);
5329 
5330     Label L_mpmul_restore_4, L_mpmul_restore_3, L_mpmul_restore_2;
5331     Label L_mpmul_restore_1, L_mpmul_restore_0;
5332 
5333     // Dispatch relative current PC, into instruction table below.
5334     __ rdpc(addr);
5335     __ add(addr, 16, addr);
5336     __ jmp(addr, disp);
5337     __ delayed()->clr(offs);
5338 
5339     mpmul_entry(31, L_mpmul_restore_0);
5340     mpmul_entry(30, L_mpmul_restore_0);
5341     mpmul_entry(29, L_mpmul_restore_0);
5342     mpmul_entry(28, L_mpmul_restore_0);
5343     mpmul_entry(27, L_mpmul_restore_1);
5344     mpmul_entry(26, L_mpmul_restore_1);
5345     mpmul_entry(25, L_mpmul_restore_1);
5346     mpmul_entry(24, L_mpmul_restore_1);
5347     mpmul_entry(23, L_mpmul_restore_1);
5348     mpmul_entry(22, L_mpmul_restore_1);
5349     mpmul_entry(21, L_mpmul_restore_1);
5350     mpmul_entry(20, L_mpmul_restore_2);
5351     mpmul_entry(19, L_mpmul_restore_2);
5352     mpmul_entry(18, L_mpmul_restore_2);
5353     mpmul_entry(17, L_mpmul_restore_2);
5354     mpmul_entry(16, L_mpmul_restore_2);
5355     mpmul_entry(15, L_mpmul_restore_2);
5356     mpmul_entry(14, L_mpmul_restore_2);
5357     mpmul_entry(13, L_mpmul_restore_3);
5358     mpmul_entry(12, L_mpmul_restore_3);
5359     mpmul_entry(11, L_mpmul_restore_3);
5360     mpmul_entry(10, L_mpmul_restore_3);
5361     mpmul_entry( 9, L_mpmul_restore_3);
5362     mpmul_entry( 8, L_mpmul_restore_3);
5363     mpmul_entry( 7, L_mpmul_restore_3);
5364     mpmul_entry( 6, L_mpmul_restore_4);
5365     mpmul_entry( 5, L_mpmul_restore_4);
5366     mpmul_entry( 4, L_mpmul_restore_4);
5367     mpmul_entry( 3, L_mpmul_restore_4);
5368     mpmul_entry( 2, L_mpmul_restore_4);
5369     mpmul_entry( 1, L_mpmul_restore_4);
5370     mpmul_entry( 0, L_mpmul_restore_4);
5371 
5372     Label L_z31, L_z30, L_z29, L_z28, L_z27, L_z26, L_z25, L_z24;
5373     Label L_z23, L_z22, L_z21, L_z20, L_z19, L_z18, L_z17, L_z16;
5374     Label L_z15, L_z14, L_z13, L_z12, L_z11, L_z10, L_z09, L_z08;
5375     Label L_z07, L_z06, L_z05, L_z04, L_z03, L_z02, L_z01, L_z00;
5376 
5377     Label L_zst_base;    // Store sequence base address.
5378     __ bind(L_zst_base);
5379 
5380     stx_entry(L_z31, L7, L6, gzp, offs);
5381     stx_entry(L_z30, L5, L4, gzp, offs);
5382     stx_entry(L_z29, L3, L2, gzp, offs);
5383     stx_entry(L_z28, L1, L0, gzp, offs);
5384     __ restore();
5385     stx_entry(L_z27, O5, O4, gzp, offs);
5386     stx_entry(L_z26, O3, O2, gzp, offs);
5387     stx_entry(L_z25, O1, O0, gzp, offs);
5388     stx_entry(L_z24, L7, L6, gzp, offs);
5389     stx_entry(L_z23, L5, L4, gzp, offs);
5390     stx_entry(L_z22, L3, L2, gzp, offs);
5391     stx_entry(L_z21, L1, L0, gzp, offs);
5392     __ restore();
5393     stx_entry(L_z20, O5, O4, gzp, offs);
5394     stx_entry(L_z19, O3, O2, gzp, offs);
5395     stx_entry(L_z18, O1, O0, gzp, offs);
5396     stx_entry(L_z17, L7, L6, gzp, offs);
5397     stx_entry(L_z16, L5, L4, gzp, offs);
5398     stx_entry(L_z15, L3, L2, gzp, offs);
5399     stx_entry(L_z14, L1, L0, gzp, offs);
5400     __ restore();
5401     stx_entry(L_z13, O5, O4, gzp, offs);
5402     stx_entry(L_z12, O3, O2, gzp, offs);
5403     stx_entry(L_z11, O1, O0, gzp, offs);
5404     stx_entry(L_z10, L7, L6, gzp, offs);
5405     stx_entry(L_z09, L5, L4, gzp, offs);
5406     stx_entry(L_z08, L3, L2, gzp, offs);
5407     stx_entry(L_z07, L1, L0, gzp, offs);
5408     __ restore();
5409     stx_entry(L_z06, O5, O4, gzp, offs);
5410     stx_entry(L_z05, O3, O2, gzp, offs);
5411     stx_entry(L_z04, O1, O0, gzp, offs);
5412     stx_entry(L_z03, L7, L6, gzp, offs);
5413     stx_entry(L_z02, L5, L4, gzp, offs);
5414     stx_entry(L_z01, L3, L2, gzp, offs);
5415     stx_entry(L_z00, L1, L0, gzp, offs);
5416 
5417     __ restore();
5418     __ restore();
5419     // Exit out of 'mpmul' routine, back to multiplyToLen.
5420     __ ba_short(L_exit);
5421 
5422     Label L_zst_offs;
5423     __ bind(L_zst_offs);
5424 
5425     offs_entry(L_z31, L_zst_base);  // index 31: 2048x2048
5426     offs_entry(L_z30, L_zst_base);
5427     offs_entry(L_z29, L_zst_base);
5428     offs_entry(L_z28, L_zst_base);
5429     offs_entry(L_z27, L_zst_base);
5430     offs_entry(L_z26, L_zst_base);
5431     offs_entry(L_z25, L_zst_base);
5432     offs_entry(L_z24, L_zst_base);
5433     offs_entry(L_z23, L_zst_base);
5434     offs_entry(L_z22, L_zst_base);
5435     offs_entry(L_z21, L_zst_base);
5436     offs_entry(L_z20, L_zst_base);
5437     offs_entry(L_z19, L_zst_base);
5438     offs_entry(L_z18, L_zst_base);
5439     offs_entry(L_z17, L_zst_base);
5440     offs_entry(L_z16, L_zst_base);
5441     offs_entry(L_z15, L_zst_base);
5442     offs_entry(L_z14, L_zst_base);
5443     offs_entry(L_z13, L_zst_base);
5444     offs_entry(L_z12, L_zst_base);
5445     offs_entry(L_z11, L_zst_base);
5446     offs_entry(L_z10, L_zst_base);
5447     offs_entry(L_z09, L_zst_base);
5448     offs_entry(L_z08, L_zst_base);
5449     offs_entry(L_z07, L_zst_base);
5450     offs_entry(L_z06, L_zst_base);
5451     offs_entry(L_z05, L_zst_base);
5452     offs_entry(L_z04, L_zst_base);
5453     offs_entry(L_z03, L_zst_base);
5454     offs_entry(L_z02, L_zst_base);
5455     offs_entry(L_z01, L_zst_base);
5456     offs_entry(L_z00, L_zst_base);  // index  0:   64x64
5457 
5458     __ bind(L_mpmul_restore_4);
5459     __ restore();
5460     __ bind(L_mpmul_restore_3);
5461     __ restore();
5462     __ bind(L_mpmul_restore_2);
5463     __ restore();
5464     __ bind(L_mpmul_restore_1);
5465     __ restore();
5466     __ bind(L_mpmul_restore_0);
5467 
5468     // Dispatch via offset vector entry, into z-store sequence.
5469     Label L_zst_rdpc;
5470     __ bind(L_zst_rdpc);
5471 
5472     assert(L_zst_base.is_bound(), "must be");
5473     assert(L_zst_offs.is_bound(), "must be");
5474     assert(L_zst_rdpc.is_bound(), "must be");
5475 
5476     int dbase = L_zst_rdpc.loc_pos() - L_zst_base.loc_pos();
5477     int doffs = L_zst_rdpc.loc_pos() - L_zst_offs.loc_pos();
5478 
5479     temp = gyp;   // Alright to reuse 'gyp'.
5480 
5481     __ rdpc(addr);
5482     __ sub(addr, doffs, temp);
5483     __ srlx(disp, 1, disp);     // offset table entries are 4 bytes (vs. 8-byte load entries)
5484     __ lduw(temp, disp, offs);
5485     __ sub(addr, dbase, temp);
5486     __ jmp(temp, offs);
5487     __ delayed()->clr(offs);
5488   }
5489 
5490   void gen_mult_64x64(Register xp, Register xn,
5491                       Register yp, Register yn,
5492                       Register zp, Register zn, Label &L_exit)
5493   {
5494     // Assuming that a stack frame has already been created, i.e. local and
5495     // output registers are available for immediate use.
5496 
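         // Rough C-level sketch of the code generated below (schoolbook multiplication over
         // 64-bit limbs, walking from the least-significant end, i.e. the highest index,
         // downward; xn/yn/zn are the adjusted limb indices; illustrative only):
         //
         //   u64 c = 0;
         //   for (int i = xn, k = zn; i >= 0; i--, k--) {   // first pass: z = x * y[yn]
         //     p = x[i] * y[yn];                            // 128-bit product
         //     z[k] = lo(p) + c;  c = hi(p) + carry;
         //   }
         //   z[k] = c;
         //   for (int j = yn - 1; j >= 0; j--) {            // then multiply-accumulate the rest of y
         //     c = 0;
         //     for (int i = xn, k = --zn; i >= 0; i--, k--) {
         //       p = x[i] * y[j];
         //       z[k] = z[k] + lo(p) + c;  c = hi(p) + carries;
         //     }
         //     z[k] = c;
         //   }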
5497     const Register ri = L0;     // Outer loop index, xv[i]
5498     const Register rj = L1;     // Inner loop index, yv[j]
5499     const Register rk = L2;     // Output loop index, zv[k]
5500     const Register rx = L4;     // x-vector datum [i]
5501     const Register ry = L5;     // y-vector datum [j]
5502     const Register rz = L6;     // z-vector datum [k]
5503     const Register rc = L7;     // carry over (to z-vector datum [k-1])
5504 
5505     const Register lop = O0;    // lo-64b product
5506     const Register hip = O1;    // hi-64b product
5507 
5508     const Register zero = G0;
5509 
5510     Label L_loop_i,  L_exit_loop_i;
5511     Label L_loop_j;
5512     Label L_loop_i2, L_exit_loop_i2;
5513 
5514     __ srlx(xn, 1, xn);         // index for u32 to u64 ditto
5515     __ srlx(yn, 1, yn);         // index for u32 to u64 ditto
5516     __ srlx(zn, 1, zn);         // index for u32 to u64 ditto
5517     __ dec(xn);                 // Adjust [0..(N/2)-1]
5518     __ dec(yn);
5519     __ dec(zn);
5520     __ clr(rc);                 // u64 c = 0
5521     __ sllx(xn, 3, ri);         // int i = xn (byte offset i = 8*xn)
5522     __ sllx(yn, 3, rj);         // int j = yn (byte offset j = 8*yn)
5523     __ sllx(zn, 3, rk);         // int k = zn (byte offset k = 8*zn)
5524     __ ldx(yp, rj, ry);         // u64 y = yp[yn]
5525 
5526     // for (int i = xn; i >= 0; i--)
5527     __ bind(L_loop_i);
5528 
5529     __ cmp_and_br_short(ri, 0,  // i >= 0
5530                         Assembler::less, Assembler::pn, L_exit_loop_i);
5531     __ ldx(xp, ri, rx);         // x = xp[i]
5532     __ mulx(rx, ry, lop);       // lo-64b-part of result 64x64
5533     __ umulxhi(rx, ry, hip);    // hi-64b-part of result 64x64
5534     __ addcc(rc, lop, lop);     // Accumulate lower order bits (producing carry)
5535     __ addxc(hip, zero, rc);    // carry over to next datum [k-1]
5536     __ stx(lop, zp, rk);        // z[k] = lop
5537     __ dec(rk, 8);              // k--
5538     __ dec(ri, 8);              // i--
5539     __ ba_short(L_loop_i);
5540 
5541     __ bind(L_exit_loop_i);
5542     __ stx(rc, zp, rk);         // z[k] = c
5543 
5544     // for (int j = yn - 1; j >= 0; j--)
5545     __ sllx(yn, 3, rj);         // int j = yn - 1 (byte offset j = 8*yn)
5546     __ dec(rj, 8);
5547 
5548     __ bind(L_loop_j);
5549 
5550     __ cmp_and_br_short(rj, 0,  // j >= 0
5551                         Assembler::less, Assembler::pn, L_exit);
5552     __ clr(rc);                 // u64 c = 0
5553     __ ldx(yp, rj, ry);         // u64 y = yp[j]
5554 
5555     // for (int i = xn, k = --zn; i >= 0; i--)
5556     __ dec(zn);                 // --zn
5557     __ sllx(xn, 3, ri);         // int i = xn (byte offset i = 8*xn)
5558     __ sllx(zn, 3, rk);         // int k = zn (byte offset k = 8*zn)
5559 
5560     __ bind(L_loop_i2);
5561 
5562     __ cmp_and_br_short(ri, 0,  // i >= 0
5563                         Assembler::less, Assembler::pn, L_exit_loop_i2);
5564     __ ldx(xp, ri, rx);         // x = xp[i]
5565     __ ldx(zp, rk, rz);         // z = zp[k], accumulator
5566     __ mulx(rx, ry, lop);       // lo-64b-part of result 64x64
5567     __ umulxhi(rx, ry, hip);    // hi-64b-part of result 64x64
5568     __ addcc(rz, rc, rz);       // Accumulate lower order bits,
5569     __ addxc(hip, zero, rc);    // Accumulate higher order bits to carry
5570     __ addcc(rz, lop, rz);      //    z += lo(p) + c
5571     __ addxc(rc, zero, rc);
5572     __ stx(rz, zp, rk);         // zp[k] = z
5573     __ dec(rk, 8);              // k--
5574     __ dec(ri, 8);              // i--
5575     __ ba_short(L_loop_i2);
5576 
5577     __ bind(L_exit_loop_i2);
5578     __ stx(rc, zp, rk);         // z[k] = c
5579     __ dec(rj, 8);              // j--
5580     __ ba_short(L_loop_j);
5581   }
5582 
5583   void gen_mult_64x64_unaligned(Register xp, Register xn,
5584                                 Register yp, Register yn,
5585                                 Register zp, Register zn, Label &L_exit)
5586   {
5587     // Assuming that a stack frame has already been created, i.e. local and
5588     // output registers are available for use.
5589 
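         // Same schoolbook algorithm as gen_mult_64x64() above, but the operands
         // are only assumed to be 32-bit aligned, so every u64 limb is assembled
         // from, and split back into, two u32 halves (the more significant half
         // at the lower address).  Roughly, as a C-level sketch (illustration
         // only; load64/store64 are just shorthand for the lduw/sllx/or3 and
         // srlx/stw/stw sequences used below):
         //
         //   uint64_t load64(const uint32_t* p) {
         //     return ((uint64_t)p[0] << 32) | p[1];
         //   }
         //   void store64(uint32_t* p, uint64_t v) {
         //     p[0] = (uint32_t)(v >> 32);
         //     p[1] = (uint32_t)v;
         //   }
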
5590     const Register xpc = L0;    // Outer loop cursor, xp[i]
5591     const Register ypc = L1;    // Inner loop cursor, yp[j]
5592     const Register zpc = L2;    // Output loop cursor, zp[k]
5593     const Register rx  = L4;    // x-vector datum [i]
5594     const Register ry  = L5;    // y-vector datum [j]
5595     const Register rz  = L6;    // z-vector datum [k]
5596     const Register rc  = L7;    // carry over (to z-vector datum [k-1])
5597     const Register rt  = O2;
5598 
5599     const Register lop = O0;    // lo-64b product
5600     const Register hip = O1;    // hi-64b product
5601 
5602     const Register zero = G0;
5603 
5604     Label L_loop_i,  L_exit_loop_i;
5605     Label L_loop_j;
5606     Label L_loop_i2, L_exit_loop_i2;
5607 
5608     __ srlx(xn, 1, xn);         // u32 length -> u64 limb count
5609     __ srlx(yn, 1, yn);         //   ditto
5610     __ srlx(zn, 1, zn);         //   ditto
5611     __ dec(xn);                 // Adjust [0..(N/2)-1]
5612     __ dec(yn);
5613     __ dec(zn);
5614     __ clr(rc);                 // u64 c = 0
5615     __ sllx(xn, 3, xpc);        // u32* xpc = &xp[xn] (byte offset 8*xn)
5616     __ add(xp, xpc, xpc);
5617     __ sllx(yn, 3, ypc);        // u32* ypc = &yp[yn] (byte offset 8*yn)
5618     __ add(yp, ypc, ypc);
5619     __ sllx(zn, 3, zpc);        // u32* zpc = &zp[zn] (byte offset 8*zn)
5620     __ add(zp, zpc, zpc);
5621     __ lduw(ypc, 0, rt);        // u64 y = yp[yn]
5622     __ lduw(ypc, 4, ry);        //   ...
5623     __ sllx(rt, 32, rt);
5624     __ or3(rt, ry, ry);
5625 
5626     // for (int i = xn; i >= 0; i--)
5627     __ bind(L_loop_i);
5628 
5629     __ cmp_and_brx_short(xpc, xp, // i >= 0
5630                          Assembler::lessUnsigned, Assembler::pn, L_exit_loop_i);
5631     __ lduw(xpc, 0, rt);        // u64 x = xp[i]
5632     __ lduw(xpc, 4, rx);        //   ...
5633     __ sllx(rt, 32, rt);
5634     __ or3(rt, rx, rx);
5635     __ mulx(rx, ry, lop);       // lo-64b-part of result 64x64
5636     __ umulxhi(rx, ry, hip);    // hi-64b-part of result 64x64
5637     __ addcc(rc, lop, lop);     // Accumulate lower order bits (producing carry)
5638     __ addxc(hip, zero, rc);    // carry over to next datum [k-1]
5639     __ srlx(lop, 32, rt);
5640     __ stw(rt, zpc, 0);         // z[k] = lop
5641     __ stw(lop, zpc, 4);        //   ...
5642     __ dec(zpc, 8);             // k-- (zpc--)
5643     __ dec(xpc, 8);             // i-- (xpc--)
5644     __ ba_short(L_loop_i);
5645 
5646     __ bind(L_exit_loop_i);
5647     __ srlx(rc, 32, rt);
5648     __ stw(rt, zpc, 0);         // z[k] = c
5649     __ stw(rc, zpc, 4);
5650 
5651     // for (int j = yn - 1; j >= 0; j--)
5652     __ sllx(yn, 3, ypc);        // u32* ypc = &yp[yn] (byte offset 8*yn)
5653     __ add(yp, ypc, ypc);
5654     __ dec(ypc, 8);             // yn - 1 (ypc--)
5655 
5656     __ bind(L_loop_j);
5657 
5658     __ cmp_and_brx_short(ypc, yp, // j >= 0
5659                          Assembler::lessUnsigned, Assembler::pn, L_exit);
5660     __ clr(rc);                 // u64 c = 0
5661     __ lduw(ypc, 0, rt);        // u64 y = yp[j] (= *ypc)
5662     __ lduw(ypc, 4, ry);        //   ...
5663     __ sllx(rt, 32, rt);
5664     __ or3(rt, ry, ry);
5665 
5666     // for (int i = xn, k = --zn; i >= 0; i--)
5667     __ sllx(xn, 3, xpc);        // u32* xpc = &xp[xn] (byte offset 8*xn)
5668     __ add(xp, xpc, xpc);
5669     __ dec(zn);                 // --zn
5670     __ sllx(zn, 3, zpc);        // u32* zpc = &zp[zn] (byte offset 8*zn)
5671     __ add(zp, zpc, zpc);
5672 
5673     __ bind(L_loop_i2);
5674 
5675     __ cmp_and_brx_short(xpc, xp, // i >= 0
5676                          Assembler::lessUnsigned, Assembler::pn, L_exit_loop_i2);
5677     __ lduw(xpc, 0, rt);        // u64 x = xp[i] (= *xpc)
5678     __ lduw(xpc, 4, rx);        //   ...
5679     __ sllx(rt, 32, rt);
5680     __ or3(rt, rx, rx);
5681 
5682     __ lduw(zpc, 0, rt);        // u64 z = zp[k] (= *zpc)
5683     __ lduw(zpc, 4, rz);        //   ...
5684     __ sllx(rt, 32, rt);
5685     __ or3(rt, rz, rz);
5686 
5687     __ mulx(rx, ry, lop);       // lo-64b-part of result 64x64
5688     __ umulxhi(rx, ry, hip);    // hi-64b-part of result 64x64
5689     __ addcc(rz, rc, rz);       // Accumulate lower order bits...
5690     __ addxc(hip, zero, rc);    // Accumulate higher order bits to carry
5691     __ addcc(rz, lop, rz);      // ... z += lo(p) + c
5692     __ addxccc(rc, zero, rc);
5693     __ srlx(rz, 32, rt);
5694     __ stw(rt, zpc, 0);         // zp[k] = z    (*zpc = z)
5695     __ stw(rz, zpc, 4);
5696     __ dec(zpc, 8);             // k-- (zpc--)
5697     __ dec(xpc, 8);             // i-- (xpc--)
5698     __ ba_short(L_loop_i2);
5699 
5700     __ bind(L_exit_loop_i2);
5701     __ srlx(rc, 32, rt);
5702     __ stw(rt, zpc, 0);         // z[k] = c
5703     __ stw(rc, zpc, 4);
5704     __ dec(ypc, 8);             // j-- (ypc--)
5705     __ ba_short(L_loop_j);
5706   }
5707 
5708   void gen_mult_32x32(Register xp, Register xn,
5709                       Register yp, Register yn,
5710                       Register zp, Register zn, Label &L_exit)
5711   {
5712     // Assuming that a stack frame has already been created, i.e. local and
5713     // output registers are available for use.
5714 
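         // The 32x32 variant of the schoolbook loop above keeps each partial sum
         // as a 64-bit value plus a possible overflow into "bit 65", which is
         // folded back into the carry for the next, more significant word.
         // Roughly, one accumulation step as a C-level sketch (illustration only):
         //
         //   uint64_t p64 = (uint64_t)x * y;      // mulx:  full 64-bit product
         //   uint64_t z65 = c + p64;              // addcc: may wrap, sets carry
         //   uint64_t c65 = (z65 < p64);          // addxc: materialise bit 65
         //   c = (c65 << 32) + (z65 >> 32);       // sllx/srlx/add: next carry
         //   zp[k--] = (uint32_t)z65;             // stw:   low 32 bits become z[k]
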
5715     const Register ri = L0;     // Outer loop index, xv[i]
5716     const Register rj = L1;     // Inner loop index, yv[j]
5717     const Register rk = L2;     // Output loop index, zv[k]
5718     const Register rx = L4;     // x-vector datum [i]
5719     const Register ry = L5;     // y-vector datum [j]
5720     const Register rz = L6;     // z-vector datum [k]
5721     const Register rc = L7;     // carry over (to z-vector datum [k-1])
5722 
5723     const Register p64 = O0;    // 64b product
5724     const Register z65 = O1;    // carry+64b accumulator
5725     const Register c65 = O2;    // carry at bit 65
5726     const Register c33 = O2;    // carry at bit 33 (after shift)
5727 
5728     const Register zero = G0;
5729 
5730     Label L_loop_i,  L_exit_loop_i;
5731     Label L_loop_j;
5732     Label L_loop_i2, L_exit_loop_i2;
5733 
5734     __ dec(xn);                 // Adjust [0..N-1]
5735     __ dec(yn);
5736     __ dec(zn);
5737     __ clr(rc);                 // u32 c = 0
5738     __ sllx(xn, 2, ri);         // int i = xn (byte offset i = 4*xn)
5739     __ sllx(yn, 2, rj);         // int j = yn (byte offset j = 4*yn)
5740     __ sllx(zn, 2, rk);         // int k = zn (byte offset k = 4*zn)
5741     __ lduw(yp, rj, ry);        // u32 y = yp[yn]
5742 
5743     // for (int i = xn; i >= 0; i--)
5744     __ bind(L_loop_i);
5745 
5746     __ cmp_and_br_short(ri, 0,  // i >= 0
5747                         Assembler::less, Assembler::pn, L_exit_loop_i);
5748     __ lduw(xp, ri, rx);        // x = xp[i]
5749     __ mulx(rx, ry, p64);       // 64b result of 32x32
5750     __ addcc(rc, p64, z65);     // Accumulate to 65 bits (producing carry)
5751     __ addxc(zero, zero, c65);  // Materialise carry (in bit 65) into lsb,
5752     __ sllx(c65, 32, c33);      // and shift into bit 33
5753     __ srlx(z65, 32, rc);       // carry = c33 | hi(z65) >> 32
5754     __ add(c33, rc, rc);        // carry over to next datum [k-1]
5755     __ stw(z65, zp, rk);        // z[k] = lo(z65)
5756     __ dec(rk, 4);              // k--
5757     __ dec(ri, 4);              // i--
5758     __ ba_short(L_loop_i);
5759 
5760     __ bind(L_exit_loop_i);
5761     __ stw(rc, zp, rk);         // z[k] = c
5762 
5763     // for (int j = yn - 1; j >= 0; j--)
5764     __ sllx(yn, 2, rj);         // int j = yn - 1 (byte offset j = 4*yn)
5765     __ dec(rj, 4);
5766 
5767     __ bind(L_loop_j);
5768 
5769     __ cmp_and_br_short(rj, 0,  // j >= 0
5770                         Assembler::less, Assembler::pn, L_exit);
5771     __ clr(rc);                 // u32 c = 0
5772     __ lduw(yp, rj, ry);        // u32 y = yp[j]
5773 
5774     // for (int i = xn, k = --zn; i >= 0; i--)
5775     __ dec(zn);                 // --zn
5776     __ sllx(xn, 2, ri);         // int i = xn (byte offset i = 4*xn)
5777     __ sllx(zn, 2, rk);         // int k = zn (byte offset k = 4*zn)
5778 
5779     __ bind(L_loop_i2);
5780 
5781     __ cmp_and_br_short(ri, 0,  // i >= 0
5782                         Assembler::less, Assembler::pn, L_exit_loop_i2);
5783     __ lduw(xp, ri, rx);        // x = xp[i]
5784     __ lduw(zp, rk, rz);        // z = zp[k], accumulator
5785     __ mulx(rx, ry, p64);       // 64b result of 32x32
5786     __ add(rz, rc, rz);         // Accumulate lower order bits,
5787     __ addcc(rz, p64, z65);     //   z += lo(p64) + c
5788     __ addxc(zero, zero, c65);  // Materialise carry (in bit 65) into lsb,
5789     __ sllx(c65, 32, c33);      // and shift into bit 33
5790     __ srlx(z65, 32, rc);       // carry = c33 | hi(z65) >> 32
5791     __ add(c33, rc, rc);        // carry over to next datum [k-1]
5792     __ stw(z65, zp, rk);        // zp[k] = lo(z65)
5793     __ dec(rk, 4);              // k--
5794     __ dec(ri, 4);              // i--
5795     __ ba_short(L_loop_i2);
5796 
5797     __ bind(L_exit_loop_i2);
5798     __ stw(rc, zp, rk);         // z[k] = c
5799     __ dec(rj, 4);              // j--
5800     __ ba_short(L_loop_j);
5801   }
5802 
5803 
5804   void generate_initial() {
5805     // Generates the initial stubs and initializes the entry points
5806 
5807     //------------------------------------------------------------------------------------------------------------------------
5808     // entry points that exist in all platforms
5809     // Note: This code could be shared among different platforms; however, the benefit seems smaller than
5810     //       the cost of a much more complicated generator structure. See also the comment in stubRoutines.hpp.
5811     StubRoutines::_forward_exception_entry                 = generate_forward_exception();
5812 
5813     StubRoutines::_call_stub_entry                         = generate_call_stub(StubRoutines::_call_stub_return_address);
5814     StubRoutines::_catch_exception_entry                   = generate_catch_exception();
5815 
5816     //------------------------------------------------------------------------------------------------------------------------
5817     // entry points that are platform specific
5818     StubRoutines::Sparc::_test_stop_entry                  = generate_test_stop();
5819 
5820     StubRoutines::Sparc::_stop_subroutine_entry            = generate_stop_subroutine();
5821     StubRoutines::Sparc::_flush_callers_register_windows_entry = generate_flush_callers_register_windows();
5822 
5823     // Build this early so it's available for the interpreter.
5824     StubRoutines::_throw_StackOverflowError_entry =
5825             generate_throw_exception("StackOverflowError throw_exception",
5826             CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError));
5827     StubRoutines::_throw_delayed_StackOverflowError_entry =
5828             generate_throw_exception("delayed StackOverflowError throw_exception",
5829             CAST_FROM_FN_PTR(address, SharedRuntime::throw_delayed_StackOverflowError));
5830 
5831     if (UseCRC32Intrinsics) {
5832       // Set the table address before generating the stub that uses it.
5833       StubRoutines::_crc_table_adr = (address)StubRoutines::Sparc::_crc_table;
5834       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
5835     }
5836 
5837     if (UseCRC32CIntrinsics) {
5838       // Set the table address before generating the stub that uses it.
5839       StubRoutines::_crc32c_table_addr = (address)StubRoutines::Sparc::_crc32c_table;
5840       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
5841     }
5842   }
5843 
5844 
5845   void generate_all() {
5846     // Generates the remaining stubs and initializes the entry points
5847 
5848     // Generate partial_subtype_check first here since its code depends on
5849     // UseZeroBaseCompressedOops which is defined after heap initialization.
5850     StubRoutines::Sparc::_partial_subtype_check                = generate_partial_subtype_check();
5851     // These entry points require SharedInfo::stack0 to be set up in non-core builds
5852     StubRoutines::_throw_AbstractMethodError_entry         = generate_throw_exception("AbstractMethodError throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError));
5853     StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError));
5854     StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call));
5855 
5856     // support for verify_oop (must happen after universe_init)
5857     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop_subroutine();
5858 
5859     // arraycopy stubs used by compilers
5860     generate_arraycopy_stubs();
5861 
5862     // Don't initialize the platform math functions since sparc
5863     // doesn't have intrinsics for these operations.
5864 
5865     // Safefetch stubs.
5866     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
5867                                                        &StubRoutines::_safefetch32_fault_pc,
5868                                                        &StubRoutines::_safefetch32_continuation_pc);
5869     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
5870                                                        &StubRoutines::_safefetchN_fault_pc,
5871                                                        &StubRoutines::_safefetchN_continuation_pc);
5872 
5873     // generate AES intrinsics code
5874     if (UseAESIntrinsics) {
5875       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
5876       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
5877       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
5878       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
5879     }
5880     // generate GHASH intrinsics code
5881     if (UseGHASHIntrinsics) {
5882       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
5883     }
5884 
5885     // generate SHA1/SHA256/SHA512 intrinsics code
5886     if (UseSHA1Intrinsics) {
5887       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
5888       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
5889     }
5890     if (UseSHA256Intrinsics) {
5891       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
5892       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
5893     }
5894     if (UseSHA512Intrinsics) {
5895       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
5896       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
5897     }
5898     // generate Adler32 intrinsics code
5899     if (UseAdler32Intrinsics) {
5900       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
5901     }
5902 
5903 #ifdef COMPILER2
5904     // Intrinsics supported by C2 only:
5905     if (UseMultiplyToLenIntrinsic) {
5906       StubRoutines::_multiplyToLen = generate_multiplyToLen();
5907     }
5908 #endif // COMPILER2
5909   }
5910 
5911  public:
5912   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
5913     // replace the standard masm with a special one:
5914     _masm = new MacroAssembler(code);
5915 
5916     _stub_count = !all ? 0x100 : 0x200;
5917     if (all) {
5918       generate_all();
5919     } else {
5920       generate_initial();
5921     }
5922 
5923     // make sure this stub is available for all local calls
5924     if (_atomic_add_stub.is_unbound()) {
5925       // generate a second time, if necessary
5926       (void) generate_atomic_add();
5927     }
5928   }
5929 
5930 
5931  private:
5932   int _stub_count;
5933   void stub_prolog(StubCodeDesc* cdesc) {
5934     # ifdef ASSERT
5935       // Put extra information in the stub code, to make it more readable:
5936       // the StubCodeDesc address (high word, then low word) and a running stub count.
5937       // [RGV] Check if there is a dependency on the size of this prolog.
5938       __ emit_data((intptr_t)cdesc >> 32, relocInfo::none);
5939       __ emit_data((intptr_t)cdesc,       relocInfo::none);
5940       __ emit_data(++_stub_count, relocInfo::none);
5941     # endif
5942     align(true);
5943   }
5944 
5945   void align(bool at_header = false) {
5946     // %%%%% move this constant somewhere else
5947     // UltraSPARC cache line size is 8 instructions:
5948     const unsigned int icache_line_size = 32;
5949     const unsigned int icache_half_line_size = 16;
5950 
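         // At a stub header, pad with zero data words up to a full 32-byte line
         // (e.g. a pc at offset 20 within a line needs three more 4-byte words);
         // elsewhere, pad with nops up to a 16-byte half-line boundary.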
5951     if (at_header) {
5952       while ((intptr_t)(__ pc()) % icache_line_size != 0) {
5953         __ emit_data(0, relocInfo::none);
5954       }
5955     } else {
5956       while ((intptr_t)(__ pc()) % icache_half_line_size != 0) {
5957         __ nop();
5958       }
5959     }
5960   }
5961 
5962 }; // end class declaration
5963 
5964 void StubGenerator_generate(CodeBuffer* code, bool all) {
5965   StubGenerator g(code, all);
5966 }