1 /*
   2  * Copyright (c) 1997, 2018, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/macroAssembler.inline.hpp"
  27 #include "gc/shared/cardTable.hpp"
  28 #include "gc/shared/cardTableModRefBS.hpp"
  29 #include "interpreter/interpreter.hpp"
  30 #include "nativeInst_sparc.hpp"
  31 #include "oops/instanceOop.hpp"
  32 #include "oops/method.hpp"
  33 #include "oops/objArrayKlass.hpp"
  34 #include "oops/oop.inline.hpp"
  35 #include "prims/methodHandles.hpp"
  36 #include "runtime/frame.inline.hpp"
  37 #include "runtime/handles.inline.hpp"
  38 #include "runtime/sharedRuntime.hpp"
  39 #include "runtime/stubCodeGenerator.hpp"
  40 #include "runtime/stubRoutines.hpp"
  41 #include "runtime/thread.inline.hpp"
  42 #ifdef COMPILER2
  43 #include "opto/runtime.hpp"
  44 #endif
  45 
  46 // Declaration and definition of StubGenerator (no .hpp file).
  47 // For a more detailed description of the stub routine structure
  48 // see the comment in stubRoutines.hpp.
  49 
  50 #define __ _masm->
  51 
  52 #ifdef PRODUCT
  53 #define BLOCK_COMMENT(str) /* nothing */
  54 #else
  55 #define BLOCK_COMMENT(str) __ block_comment(str)
  56 #endif
  57 
  58 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  59 
  60 // Note:  The register L7 is used as L7_thread_cache, and may not be used
  61 //        any other way within this module.
  62 
  63 static const Register& Lstub_temp = L2;
  64 
  65 // -------------------------------------------------------------------------------------------------------------------------
  66 // Stub Code definitions
  67 
  68 class StubGenerator: public StubCodeGenerator {
  69  private:
  70 
  71 #ifdef PRODUCT
  72 #define inc_counter_np(a,b,c)
  73 #else
  74 #define inc_counter_np(counter, t1, t2) \
  75   BLOCK_COMMENT("inc_counter " #counter); \
  76   __ inc_counter(&counter, t1, t2);
  77 #endif
  78 
  79   //----------------------------------------------------------------------------------------------------
  80   // Call stubs are used to call Java from C
  81 
  82   address generate_call_stub(address& return_pc) {
  83     StubCodeMark mark(this, "StubRoutines", "call_stub");
  84     address start = __ pc();
  85 
  86     // Incoming arguments:
  87     //
  88     // o0         : call wrapper address
  89     // o1         : result (address)
  90     // o2         : result type
  91     // o3         : method
  92     // o4         : (interpreter) entry point
  93     // o5         : parameters (address)
  94     // [sp + 0x5c]: parameter size (in words)
  95     // [sp + 0x60]: thread
  96     //
  97     // +---------------+ <--- sp + 0
  98     // |               |
  99     // . reg save area .
 100     // |               |
 101     // +---------------+ <--- sp + 0x40
 102     // |               |
 103     // . extra 7 slots .
 104     // |               |
 105     // +---------------+ <--- sp + 0x5c
 106     // |  param. size  |
 107     // +---------------+ <--- sp + 0x60
 108     // |    thread     |
 109     // +---------------+
 110     // |               |
 111 
 112     // note: if the link argument position changes, adjust
 113     //       the code in frame::entry_frame_call_wrapper()
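    // For reference, the VM invokes this stub through the CallStub function
    // pointer type declared in stubRoutines.hpp, roughly of the form
    //
    //   typedef void (*CallStub)(address   link,
    //                            intptr_t* result,
    //                            BasicType result_type,
    //                            Method*   method,
    //                            address   entry_point,
    //                            intptr_t* parameters,
    //                            int       size_of_parameters,
    //                            TRAPS);
    //
    // so the first six arguments arrive in o0..o5 and the last two are passed
    // on the caller's stack, matching the layout sketched above.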
 114 
 115     const Argument link           = Argument(0, false); // used only for GC
 116     const Argument result         = Argument(1, false);
 117     const Argument result_type    = Argument(2, false);
 118     const Argument method         = Argument(3, false);
 119     const Argument entry_point    = Argument(4, false);
 120     const Argument parameters     = Argument(5, false);
 121     const Argument parameter_size = Argument(6, false);
 122     const Argument thread         = Argument(7, false);
 123 
 124     // setup thread register
 125     __ ld_ptr(thread.as_address(), G2_thread);
 126     __ reinit_heapbase();
 127 
 128 #ifdef ASSERT
 129     // make sure we have no pending exceptions
 130     { const Register t = G3_scratch;
 131       Label L;
 132       __ ld_ptr(G2_thread, in_bytes(Thread::pending_exception_offset()), t);
 133       __ br_null_short(t, Assembler::pt, L);
 134       __ stop("StubRoutines::call_stub: entered with pending exception");
 135       __ bind(L);
 136     }
 137 #endif
 138 
 139     // create activation frame & allocate space for parameters
 140     { const Register t = G3_scratch;
 141       __ ld_ptr(parameter_size.as_address(), t);                // get parameter size (in words)
 142       __ add(t, frame::memory_parameter_word_sp_offset, t);     // add space for save area (in words)
 143       __ round_to(t, WordsPerLong);                             // make sure it is multiple of 2 (in words)
 144       __ sll(t, Interpreter::logStackElementSize, t);           // compute number of bytes
 145       __ neg(t);                                                // negate so it can be used with save
 146       __ save(SP, t, SP);                                       // setup new frame
 147     }
 148 
 149     // +---------------+ <--- sp + 0
 150     // |               |
 151     // . reg save area .
 152     // |               |
 153     // +---------------+ <--- sp + 0x40
 154     // |               |
 155     // . extra 7 slots .
 156     // |               |
 157     // +---------------+ <--- sp + 0x5c
 158     // |  empty slot   |      (only if parameter size is even)
 159     // +---------------+
 160     // |               |
 161     // .  parameters   .
 162     // |               |
 163     // +---------------+ <--- fp + 0
 164     // |               |
 165     // . reg save area .
 166     // |               |
 167     // +---------------+ <--- fp + 0x40
 168     // |               |
 169     // . extra 7 slots .
 170     // |               |
 171     // +---------------+ <--- fp + 0x5c
 172     // |  param. size  |
 173     // +---------------+ <--- fp + 0x60
 174     // |    thread     |
 175     // +---------------+
 176     // |               |
 177 
 178     // pass parameters if any
 179     BLOCK_COMMENT("pass parameters if any");
 180     { const Register src = parameters.as_in().as_register();
 181       const Register dst = Lentry_args;
 182       const Register tmp = G3_scratch;
 183       const Register cnt = G4_scratch;
 184 
 185       // test if any parameters & setup of Lentry_args
 186       Label exit;
 187       __ ld_ptr(parameter_size.as_in().as_address(), cnt);      // parameter counter
 188       __ add( FP, STACK_BIAS, dst );
 189       __ cmp_zero_and_br(Assembler::zero, cnt, exit);
 190       __ delayed()->sub(dst, BytesPerWord, dst);                 // setup Lentry_args
 191 
 192       // copy parameters if any
 193       Label loop;
 194       __ BIND(loop);
 195       // Store parameter value
 196       __ ld_ptr(src, 0, tmp);
 197       __ add(src, BytesPerWord, src);
 198       __ st_ptr(tmp, dst, 0);
 199       __ deccc(cnt);
 200       __ br(Assembler::greater, false, Assembler::pt, loop);
 201       __ delayed()->sub(dst, Interpreter::stackElementSize, dst);
 202 
 203       // done
 204       __ BIND(exit);
 205     }
 206 
 207     // setup parameters, method & call Java function
 208 #ifdef ASSERT
 209     // layout_activation_impl checks its notion of saved SP against
 210     // this register, so if this changes, update it as well.
 211     const Register saved_SP = Lscratch;
 212     __ mov(SP, saved_SP);                               // keep track of SP before call
 213 #endif
 214 
 215     // setup parameters
 216     const Register t = G3_scratch;
 217     __ ld_ptr(parameter_size.as_in().as_address(), t); // get parameter size (in words)
 218     __ sll(t, Interpreter::logStackElementSize, t);    // compute number of bytes
 219     __ sub(FP, t, Gargs);                              // setup parameter pointer
 220     __ add( Gargs, STACK_BIAS, Gargs );                // Account for LP64 stack bias
 221     __ mov(SP, O5_savedSP);
 222 
 223 
 224     // do the call
 225     //
 226     // the following registers must be set up:
 227     //
 228     // G2_thread
 229     // G5_method
 230     // Gargs
 231     BLOCK_COMMENT("call Java function");
 232     __ jmpl(entry_point.as_in().as_register(), G0, O7);
 233     __ delayed()->mov(method.as_in().as_register(), G5_method);   // setup method
 234 
 235     BLOCK_COMMENT("call_stub_return_address:");
 236     return_pc = __ pc();
 237 
 238     // The callee, if it wasn't interpreted, can return with SP changed, so
 239     // we can no longer assert that SP is unchanged.
 240 
 241     // store result depending on type
 242     // (everything that is not T_OBJECT, T_LONG, T_FLOAT, or T_DOUBLE
 243     //  is treated as T_INT)
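    // In C-like terms the dispatch below behaves roughly as follows, where
    // 'addr' and 'type' name the registers declared just below:
    //
    //   switch (type) {
    //     case T_OBJECT: *(oop*)     addr = O0; break;
    //     case T_FLOAT:  *(jfloat*)  addr = F0; break;
    //     case T_DOUBLE: *(jdouble*) addr = F0; break;   // F0:F1 register pair
    //     case T_LONG:   *(jlong*)   addr = O0; break;   // entire 64-bit register
    //     default:       *(jint*)    addr = O0; break;   // everything else
    //   }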
 244     { const Register addr = result     .as_in().as_register();
 245       const Register type = result_type.as_in().as_register();
 246       Label is_long, is_float, is_double, is_object, exit;
 247       __            cmp(type, T_OBJECT);  __ br(Assembler::equal, false, Assembler::pn, is_object);
 248       __ delayed()->cmp(type, T_FLOAT);   __ br(Assembler::equal, false, Assembler::pn, is_float);
 249       __ delayed()->cmp(type, T_DOUBLE);  __ br(Assembler::equal, false, Assembler::pn, is_double);
 250       __ delayed()->cmp(type, T_LONG);    __ br(Assembler::equal, false, Assembler::pn, is_long);
 251       __ delayed()->nop();
 252 
 253       // store int result
 254       __ st(O0, addr, G0);
 255 
 256       __ BIND(exit);
 257       __ ret();
 258       __ delayed()->restore();
 259 
 260       __ BIND(is_object);
 261       __ ba(exit);
 262       __ delayed()->st_ptr(O0, addr, G0);
 263 
 264       __ BIND(is_float);
 265       __ ba(exit);
 266       __ delayed()->stf(FloatRegisterImpl::S, F0, addr, G0);
 267 
 268       __ BIND(is_double);
 269       __ ba(exit);
 270       __ delayed()->stf(FloatRegisterImpl::D, F0, addr, G0);
 271 
 272       __ BIND(is_long);
 273       __ ba(exit);
 274       __ delayed()->st_long(O0, addr, G0);      // store entire long
 275      }
 276      return start;
 277   }
 278 
 279 
 280   //----------------------------------------------------------------------------------------------------
 281   // Return point for a Java call if there's an exception thrown in Java code.
 282   // The exception is caught and transformed into a pending exception stored in
 283   // JavaThread that can be tested from within the VM.
 284   //
 285   // Oexception: exception oop
 286 
 287   address generate_catch_exception() {
 288     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 289 
 290     address start = __ pc();
 291     // verify that thread corresponds
 292     __ verify_thread();
 293 
 294     const Register& temp_reg = Gtemp;
 295     Address pending_exception_addr    (G2_thread, Thread::pending_exception_offset());
 296     Address exception_file_offset_addr(G2_thread, Thread::exception_file_offset   ());
 297     Address exception_line_offset_addr(G2_thread, Thread::exception_line_offset   ());
 298 
 299     // set pending exception
 300     __ verify_oop(Oexception);
 301     __ st_ptr(Oexception, pending_exception_addr);
 302     __ set((intptr_t)__FILE__, temp_reg);
 303     __ st_ptr(temp_reg, exception_file_offset_addr);
 304     __ set((intptr_t)__LINE__, temp_reg);
 305     __ st(temp_reg, exception_line_offset_addr);
 306 
 307     // complete return to VM
 308     assert(StubRoutines::_call_stub_return_address != NULL, "must have been generated before");
 309 
 310     AddressLiteral stub_ret(StubRoutines::_call_stub_return_address);
 311     __ jump_to(stub_ret, temp_reg);
 312     __ delayed()->nop();
 313 
 314     return start;
 315   }
 316 
 317 
 318   //----------------------------------------------------------------------------------------------------
 319   // Continuation point for runtime calls returning with a pending exception
 320   // The pending exception check happened in the runtime or native call stub
 321   // The pending exception in Thread is converted into a Java-level exception
 322   //
 323   // Contract with Java-level exception handler: O0 = exception
 324   //                                             O1 = throwing pc
 325 
 326   address generate_forward_exception() {
 327     StubCodeMark mark(this, "StubRoutines", "forward_exception");
 328     address start = __ pc();
 329 
 330     // Upon entry, O7 has the return address returning into Java
 331     // (interpreted or compiled) code; i.e. the return address
 332     // becomes the throwing pc.
 333 
 334     const Register& handler_reg = Gtemp;
 335 
 336     Address exception_addr(G2_thread, Thread::pending_exception_offset());
 337 
 338 #ifdef ASSERT
 339     // make sure that this code is only executed if there is a pending exception
 340     { Label L;
 341       __ ld_ptr(exception_addr, Gtemp);
 342       __ br_notnull_short(Gtemp, Assembler::pt, L);
 343       __ stop("StubRoutines::forward exception: no pending exception (1)");
 344       __ bind(L);
 345     }
 346 #endif
 347 
 348     // compute exception handler into handler_reg
 349     __ get_thread();
 350     __ ld_ptr(exception_addr, Oexception);
 351     __ verify_oop(Oexception);
 352     __ save_frame(0);             // compensates for compiler weakness
 353     __ add(O7->after_save(), frame::pc_return_offset, Lscratch); // save the issuing PC
 354     BLOCK_COMMENT("call exception_handler_for_return_address");
 355     __ call_VM_leaf(L7_thread_cache, CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), G2_thread, Lscratch);
 356     __ mov(O0, handler_reg);
 357     __ restore();                 // compensates for compiler weakness
 358 
 359     __ ld_ptr(exception_addr, Oexception);
 360     __ add(O7, frame::pc_return_offset, Oissuing_pc); // save the issuing PC
 361 
 362 #ifdef ASSERT
 363     // make sure exception is set
 364     { Label L;
 365       __ br_notnull_short(Oexception, Assembler::pt, L);
 366       __ stop("StubRoutines::forward exception: no pending exception (2)");
 367       __ bind(L);
 368     }
 369 #endif
 370     // jump to exception handler
 371     __ jmp(handler_reg, 0);
 372     // clear pending exception
 373     __ delayed()->st_ptr(G0, exception_addr);
 374 
 375     return start;
 376   }
 377 
 378   // Safefetch stubs.
 379   void generate_safefetch(const char* name, int size, address* entry,
 380                           address* fault_pc, address* continuation_pc) {
 381     // safefetch signatures:
 382     //   int      SafeFetch32(int*      adr, int      errValue);
 383     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
 384     //
 385     // arguments:
 386     //   o0 = adr
 387     //   o1 = errValue
 388     //
 389     // result:
 390     //   o0  = *adr or errValue
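    // A typical use of these stubs elsewhere in the VM, sketched here with an
    // arbitrary sentinel value, is to probe memory that may be unmapped
    // without risking a crash:
    //
    //   int v = SafeFetch32(adr, 0xBAD);     // returns 0xBAD if 'adr' faults
    //   bool readable = (v != 0xBAD);
    //
    // The fault_pc/continuation_pc pair recorded below lets the signal handler
    // resume execution at the continuation point with errValue as the result.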
 391 
 392     StubCodeMark mark(this, "StubRoutines", name);
 393 
 394     // Entry point, pc or function descriptor.
 395     __ align(CodeEntryAlignment);
 396     *entry = __ pc();
 397 
 398     __ mov(O0, G1);  // g1 = o0
 399     __ mov(O1, O0);  // o0 = o1
 400     // Load *adr into O0; this load may fault.
 401     *fault_pc = __ pc();
 402     switch (size) {
 403       case 4:
 404         // int32_t
 405         __ ldsw(G1, 0, O0);  // o0 = [g1]
 406         break;
 407       case 8:
 408         // int64_t
 409         __ ldx(G1, 0, O0);   // o0 = [g1]
 410         break;
 411       default:
 412         ShouldNotReachHere();
 413     }
 414 
 415     // return errValue or *adr
 416     *continuation_pc = __ pc();
 417     // By convention with the trap handler we ensure there is a non-CTI
 418     // instruction in the trap shadow.
 419     __ nop();
 420     __ retl();
 421     __ delayed()->nop();
 422   }
 423 
 424   //------------------------------------------------------------------------------------------------------------------------
 425   // Continuation point for throwing of implicit exceptions that are not handled in
 426   // the current activation. Fabricates an exception oop and initiates normal
 427   // exception dispatching in this frame. Only callee-saved registers are preserved
 428   // (through the normal register window / RegisterMap handling).
 429   // If the compiler needs all registers to be preserved between the fault
 430   // point and the exception handler then it must assume responsibility for that in
 431   // AbstractCompiler::continuation_for_implicit_null_exception or
 432   // continuation_for_implicit_division_by_zero_exception. All other implicit
 433   // exceptions (e.g., NullPointerException or AbstractMethodError on entry) are
 434   // either at call sites or otherwise assume that stack unwinding will be initiated,
 435   // so caller saved registers were assumed volatile in the compiler.
 436 
 437   // Note that we generate only this stub into a RuntimeStub, because it needs to be
 438   // properly traversed and ignored during GC, so we change the meaning of the "__"
 439   // macro within this method.
 440 #undef __
 441 #define __ masm->
 442 
 443   address generate_throw_exception(const char* name, address runtime_entry,
 444                                    Register arg1 = noreg, Register arg2 = noreg) {
 445 #ifdef ASSERT
 446     int insts_size = VerifyThread ? 1 * K : 600;
 447 #else
 448     int insts_size = VerifyThread ? 1 * K : 256;
 449 #endif /* ASSERT */
 450     int locs_size  = 32;
 451 
 452     CodeBuffer      code(name, insts_size, locs_size);
 453     MacroAssembler* masm = new MacroAssembler(&code);
 454 
 455     __ verify_thread();
 456 
 457     // This is an inlined and slightly modified version of call_VM
 458     // which has the ability to fetch the return PC out of thread-local storage
 459     __ assert_not_delayed();
 460 
 461     // Note that we always push a frame because on the SPARC
 462     // architecture, for all of our implicit exception kinds at call
 463     // sites, the implicit exception is taken before the callee frame
 464     // is pushed.
 465     __ save_frame(0);
 466 
 467     int frame_complete = __ offset();
 468 
 469     // Note that we always have a runtime stub frame on the top of stack by this point
 470     Register last_java_sp = SP;
 471     // 64-bit last_java_sp is biased!
 472     __ set_last_Java_frame(last_java_sp, G0);
 473     if (VerifyThread)  __ mov(G2_thread, O0); // about to be smashed; pass early
 474     __ save_thread(noreg);
 475     if (arg1 != noreg) {
 476       assert(arg2 != O1, "clobbered");
 477       __ mov(arg1, O1);
 478     }
 479     if (arg2 != noreg) {
 480       __ mov(arg2, O2);
 481     }
 482     // do the call
 483     BLOCK_COMMENT("call runtime_entry");
 484     __ call(runtime_entry, relocInfo::runtime_call_type);
 485     if (!VerifyThread)
 486       __ delayed()->mov(G2_thread, O0);  // pass thread as first argument
 487     else
 488       __ delayed()->nop();             // (thread already passed)
 489     __ restore_thread(noreg);
 490     __ reset_last_Java_frame();
 491 
 492     // check for pending exceptions. use Gtemp as scratch register.
 493 #ifdef ASSERT
 494     Label L;
 495 
 496     Address exception_addr(G2_thread, Thread::pending_exception_offset());
 497     Register scratch_reg = Gtemp;
 498     __ ld_ptr(exception_addr, scratch_reg);
 499     __ br_notnull_short(scratch_reg, Assembler::pt, L);
 500     __ should_not_reach_here();
 501     __ bind(L);
 502 #endif // ASSERT
 503     BLOCK_COMMENT("call forward_exception_entry");
 504     __ call(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type);
 505     // we use O7 linkage so that forward_exception_entry has the issuing PC
 506     __ delayed()->restore();
 507 
 508     RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, masm->total_frame_size_in_bytes(0), NULL, false);
 509     return stub->entry_point();
 510   }
 511 
 512 #undef __
 513 #define __ _masm->
 514 
 515 
 516   // Generate a routine that sets all the registers so we
 517   // can tell if the stop routine prints them correctly.
 518   address generate_test_stop() {
 519     StubCodeMark mark(this, "StubRoutines", "test_stop");
 520     address start = __ pc();
 521 
 522     int i;
 523 
 524     __ save_frame(0);
 525 
 526     static jfloat zero = 0.0, one = 1.0;
 527 
 528     // put addr in L0, then load through L0 to F0
 529     __ set((intptr_t)&zero, L0);  __ ldf( FloatRegisterImpl::S, L0, 0, F0);
 530     __ set((intptr_t)&one,  L0);  __ ldf( FloatRegisterImpl::S, L0, 0, F1); // 1.0 to F1
 531 
 532     // use add to put 2..18 in F2..F18
 533     for ( i = 2;  i <= 18;  ++i ) {
 534       __ fadd( FloatRegisterImpl::S, F1, as_FloatRegister(i-1),  as_FloatRegister(i));
 535     }
 536 
 537     // Now put double 2 in F16, double 18 in F18
 538     __ ftof( FloatRegisterImpl::S, FloatRegisterImpl::D, F2, F16 );
 539     __ ftof( FloatRegisterImpl::S, FloatRegisterImpl::D, F18, F18 );
 540 
 541     // use add to put 20..32 in F20..F32
 542     for (i = 20; i < 32; i += 2) {
 543       __ fadd( FloatRegisterImpl::D, F16, as_FloatRegister(i-2),  as_FloatRegister(i));
 544     }
 545 
 546     // put 0..7 in i's, 8..15 in l's, 16..23 in o's, 24..31 in g's
 547     for ( i = 0; i < 8; ++i ) {
 548       if (i < 6) {
 549         __ set(     i, as_iRegister(i));
 550         __ set(16 + i, as_oRegister(i));
 551         __ set(24 + i, as_gRegister(i));
 552       }
 553       __ set( 8 + i, as_lRegister(i));
 554     }
 555 
 556     __ stop("testing stop");
 557 
 558 
 559     __ ret();
 560     __ delayed()->restore();
 561 
 562     return start;
 563   }
 564 
 565 
 566   address generate_stop_subroutine() {
 567     StubCodeMark mark(this, "StubRoutines", "stop_subroutine");
 568     address start = __ pc();
 569 
 570     __ stop_subroutine();
 571 
 572     return start;
 573   }
 574 
 575   address generate_flush_callers_register_windows() {
 576     StubCodeMark mark(this, "StubRoutines", "flush_callers_register_windows");
 577     address start = __ pc();
 578 
 579     __ flushw();
 580     __ retl(false);
 581     __ delayed()->add( FP, STACK_BIAS, O0 );
 582     // The returned value must be a stack pointer whose register save area
 583     // is flushed, and will stay flushed while the caller executes.
 584 
 585     return start;
 586   }
 587 
 588   // Support for jint Atomic::xchg(jint exchange_value, volatile jint* dest).
 589   //
 590   // Arguments:
 591   //
 592   //      exchange_value: O0
 593   //      dest:           O1
 594   //
 595   // Results:
 596   //
 597   //     O0: the value previously stored in dest
 598   //
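  // In C-like terms the stub performs, atomically (a sketch of the contract,
  // not of the actual instruction sequence):
  //
  //   jint old = *dest;
  //   *dest    = exchange_value;
  //   return old;                  // returned in O0
  //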
 599   address generate_atomic_xchg() {
 600     StubCodeMark mark(this, "StubRoutines", "atomic_xchg");
 601     address start = __ pc();
 602 
 603     if (UseCASForSwap) {
 604       // Use CAS instead of swap, just in case the MP hardware
 605       // prefers to work with just one kind of synch. instruction.
 606       Label retry;
 607       __ BIND(retry);
 608       __ mov(O0, O3);       // scratch copy of exchange value
 609       __ ld(O1, 0, O2);     // observe the previous value
 610       // try to replace O2 with O3
 611       __ cas(O1, O2, O3);
 612       __ cmp_and_br_short(O2, O3, Assembler::notEqual, Assembler::pn, retry);
 613 
 614       __ retl(false);
 615       __ delayed()->mov(O2, O0);  // report previous value to caller
 616     } else {
 617       __ retl(false);
 618       __ delayed()->swap(O1, 0, O0);
 619     }
 620 
 621     return start;
 622   }
 623 
 624 
 625   // Support for jint Atomic::cmpxchg(jint exchange_value, volatile jint* dest, jint compare_value)
 626   //
 627   // Arguments:
 628   //
 629   //      exchange_value: O0
 630   //      dest:           O1
 631   //      compare_value:  O2
 632   //
 633   // Results:
 634   //
 635   //     O0: the value previously stored in dest
 636   //
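  // The contract, as a C-like sketch of one atomic step:
  //
  //   jint old = *dest;
  //   if (old == compare_value) *dest = exchange_value;
  //   return old;                  // returned in O0; equals compare_value on success
  //
  // which maps directly onto the single cas instruction used below.
  //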
 637   address generate_atomic_cmpxchg() {
 638     StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg");
 639     address start = __ pc();
 640 
 641     // cmpxchg(dest, compare_value, exchange_value)
 642     __ cas(O1, O2, O0);
 643     __ retl(false);
 644     __ delayed()->nop();
 645 
 646     return start;
 647   }
 648 
 649   // Support for jlong Atomic::cmpxchg(jlong exchange_value, volatile jlong *dest, jlong compare_value)
 650   //
 651   // Arguments:
 652   //
 653   //      exchange_value: O1:O0
 654   //      dest:           O2
 655   //      compare_value:  O4:O3
 656   //
 657   // Results:
 658   //
 659   //     O1:O0: the value previously stored in dest
 660   //
 661   // Overwrites: G1,G2,G3
 662   //
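  // The two 32-bit register halves are first packed into 64-bit values,
  // roughly (using the register assignment from the comment above):
  //
  //   jlong exchange = ((jlong)O0 << 32) | (juint)O1;   // new value for casx
  //   jlong compare  = ((jlong)O3 << 32) | (juint)O4;   // expected value
  //
  // casx then compares 'compare' with *dest (O2) and, on a match, stores
  // 'exchange'; the 64-bit old value is unpacked back into O1:O0 on return.
  //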
 663   address generate_atomic_cmpxchg_long() {
 664     StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_long");
 665     address start = __ pc();
 666 
 667     __ sllx(O0, 32, O0);
 668     __ srl(O1, 0, O1);
 669     __ or3(O0,O1,O0);      // O0 holds 64-bit value from exchange_value
 670     __ sllx(O3, 32, O3);
 671     __ srl(O4, 0, O4);
 672     __ or3(O3,O4,O3);     // O3 holds 64-bit value from compare_value
 673     __ casx(O2, O3, O0);
 674     __ srl(O0, 0, O1);    // unpacked return value in O1:O0
 675     __ retl(false);
 676     __ delayed()->srlx(O0, 32, O0);
 677 
 678     return start;
 679   }
 680 
 681 
 682   // Support for jint Atomic::add(jint add_value, volatile jint* dest).
 683   //
 684   // Arguments:
 685   //
 686   //      add_value: O0   (e.g., +1 or -1)
 687   //      dest:      O1
 688   //
 689   // Results:
 690   //
 691   //     O0: the new value stored in dest
 692   //
 693   // Overwrites: O3
 694   //
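  // Note that, unlike xchg/cmpxchg above, this stub returns the *new* value.
  // A C-like sketch of the retry loop generated below, where cas() stands for
  // the hardware compare-and-swap returning the previous memory value:
  //
  //   for (;;) {
  //     jint old = *dest;
  //     if (cas(dest, old, old + add_value) == old)
  //       return old + add_value;          // returned in O0
  //   }
  //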
 695   address generate_atomic_add() {
 696     StubCodeMark mark(this, "StubRoutines", "atomic_add");
 697     address start = __ pc();
 698     __ BIND(_atomic_add_stub);
 699 
 700     Label retry;
 701     __ BIND(retry);
 702 
 703     __ lduw(O1, 0, O2);
 704     __ add(O0, O2, O3);
 705     __ cas(O1, O2, O3);
 706     __ cmp_and_br_short(O2, O3, Assembler::notEqual, Assembler::pn, retry);
 707     __ retl(false);
 708     __ delayed()->add(O0, O2, O0); // note that cas made O2==O3
 709 
 710     return start;
 711   }
 712   Label _atomic_add_stub;  // called from other stubs
 713 
 714 
 715   // Support for uint StubRoutine::Sparc::partial_subtype_check( Klass sub, Klass super );
 716   // Arguments :
 717   //
 718   //      ret  : O0, returned
 719   //      icc/xcc: set as O0 (depending on wordSize)
 720   //      sub  : O1, argument, not changed
 721   //      super: O2, argument, not changed
 722   //      raddr: O7, blown by call
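  //
  //  Callers branch on the condition codes set here rather than on the value
  //  itself; in effect the stub computes (is_subtype_of being a hypothetical
  //  name for the slow-path secondary-supers scan):
  //
  //      ret = is_subtype_of(sub, super) ? 0 : 1;
  //
  //  so zero (and the Z flag) means the subtype check succeeded.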
 723   address generate_partial_subtype_check() {
 724     __ align(CodeEntryAlignment);
 725     StubCodeMark mark(this, "StubRoutines", "partial_subtype_check");
 726     address start = __ pc();
 727     Label miss;
 728 
 729     __ save_frame(0);
 730     Register Rret   = I0;
 731     Register Rsub   = I1;
 732     Register Rsuper = I2;
 733 
 734     Register L0_ary_len = L0;
 735     Register L1_ary_ptr = L1;
 736     Register L2_super   = L2;
 737     Register L3_index   = L3;
 738 
 739     __ check_klass_subtype_slow_path(Rsub, Rsuper,
 740                                      L0, L1, L2, L3,
 741                                      NULL, &miss);
 742 
 743     // Match falls through here.
 744     __ addcc(G0,0,Rret);        // set Z flags, Z result
 745 
 746     __ ret();                   // Result in Rret is zero; flags set to Z
 747     __ delayed()->restore();
 748 
 749     __ BIND(miss);
 750     __ addcc(G0,1,Rret);        // set NZ flags, NZ result
 751 
 752     __ ret();                   // Result in Rret is != 0; flags set to NZ
 753     __ delayed()->restore();
 754 
 755     return start;
 756   }
 757 
 758 
 759   // Called from MacroAssembler::verify_oop
 760   //
 761   address generate_verify_oop_subroutine() {
 762     StubCodeMark mark(this, "StubRoutines", "verify_oop_stub");
 763 
 764     address start = __ pc();
 765 
 766     __ verify_oop_subroutine();
 767 
 768     return start;
 769   }
 770 
 771 
 772   //
 773   // Verify that a register contains a clean 32-bit positive value
 774   // (high 32 bits are 0) so it can be used in 64-bit shifts (sllx, srax).
 775   //
 776   //  Input:
 777   //    Rint  -  32-bits value
 778   //    Rtmp  -  scratch
 779   //
 780   void assert_clean_int(Register Rint, Register Rtmp) {
 781   #if defined(ASSERT)
 782     __ signx(Rint, Rtmp);
 783     __ cmp(Rint, Rtmp);
 784     __ breakpoint_trap(Assembler::notEqual, Assembler::xcc);
 785   #endif
 786   }
 787 
 788   //
 789   //  Generate overlap test for array copy stubs
 790   //
 791   //  Input:
 792   //    O0    -  array1
 793   //    O1    -  array2
 794   //    O2    -  element count
 795   //
 796   //  Kills temps:  O3, O4
 797   //
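  //
  //  The generated branches implement, roughly, in C-like terms:
  //
  //    if ((uintptr_t)to <= (uintptr_t)from ||
  //        (uintptr_t)(to - from) >= ((uintptr_t)count << log2_elem_size))
  //      goto no_overlap;     // a plain forward (disjoint) copy is safe
  //
  //  i.e. a backward copy is only used when the destination starts strictly
  //  inside the source range.
  //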
 798   void array_overlap_test(address no_overlap_target, int log2_elem_size) {
 799     assert(no_overlap_target != NULL, "must be generated");
 800     array_overlap_test(no_overlap_target, NULL, log2_elem_size);
 801   }
 802   void array_overlap_test(Label& L_no_overlap, int log2_elem_size) {
 803     array_overlap_test(NULL, &L_no_overlap, log2_elem_size);
 804   }
 805   void array_overlap_test(address no_overlap_target, Label* NOLp, int log2_elem_size) {
 806     const Register from       = O0;
 807     const Register to         = O1;
 808     const Register count      = O2;
 809     const Register to_from    = O3; // to - from
 810     const Register byte_count = O4; // count << log2_elem_size
 811 
 812       __ subcc(to, from, to_from);
 813       __ sll_ptr(count, log2_elem_size, byte_count);
 814       if (NOLp == NULL)
 815         __ brx(Assembler::lessEqualUnsigned, false, Assembler::pt, no_overlap_target);
 816       else
 817         __ brx(Assembler::lessEqualUnsigned, false, Assembler::pt, (*NOLp));
 818       __ delayed()->cmp(to_from, byte_count);
 819       if (NOLp == NULL)
 820         __ brx(Assembler::greaterEqualUnsigned, false, Assembler::pt, no_overlap_target);
 821       else
 822         __ brx(Assembler::greaterEqualUnsigned, false, Assembler::pt, (*NOLp));
 823       __ delayed()->nop();
 824   }
 825 
 826   //
 827   //  Generate pre-write barrier for array.
 828   //
 829   //  Input:
 830   //     addr     - register containing starting address
 831   //     count    - register containing element count
 832   //     tmp      - scratch register
 833   //
 834   //  The input registers are overwritten.
 835   //
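  //  For G1 (and with dest_uninitialized == false) the generated code behaves
  //  roughly like:
  //
  //    if (thread->satb_mark_queue().is_active()) {
  //      BarrierSet::static_write_ref_array_pre(addr, count);   // runtime call
  //    }
  //
  //  Card-table collectors need no pre-barrier, hence the empty case below.
  //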
 836   void gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
 837     BarrierSet* bs = Universe::heap()->barrier_set();
 838     switch (bs->kind()) {
 839       case BarrierSet::G1BarrierSet:
 840         // With G1, don't generate the call if we statically know that the target is uninitialized
 841         if (!dest_uninitialized) {
 842           Register tmp = O5;
 843           assert_different_registers(addr, count, tmp);
 844           Label filtered;
 845           // Is marking active?
 846           if (in_bytes(SATBMarkQueue::byte_width_of_active()) == 4) {
 847             __ ld(G2, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), tmp);
 848           } else {
 849             guarantee(in_bytes(SATBMarkQueue::byte_width_of_active()) == 1,
 850                       "Assumption");
 851             __ ldsb(G2, in_bytes(JavaThread::satb_mark_queue_offset() + SATBMarkQueue::byte_offset_of_active()), tmp);
 852           }
 853           // Is marking active?
 854           __ cmp_and_br_short(tmp, G0, Assembler::equal, Assembler::pt, filtered);
 855 
 856           __ save_frame(0);
 857           // Save the necessary global regs... will be used after.
 858           if (addr->is_global()) {
 859             __ mov(addr, L0);
 860           }
 861           if (count->is_global()) {
 862             __ mov(count, L1);
 863           }
 864           __ mov(addr->after_save(), O0);
 865           // Get the count into O1
 866           __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre));
 867           __ delayed()->mov(count->after_save(), O1);
 868           if (addr->is_global()) {
 869             __ mov(L0, addr);
 870           }
 871           if (count->is_global()) {
 872             __ mov(L1, count);
 873           }
 874           __ restore();
 875 
 876           __ bind(filtered);
 877           DEBUG_ONLY(__ set(0xDEADC0DE, tmp);) // we have killed tmp
 878         }
 879         break;
 880       case BarrierSet::CardTableModRef:
 881         break;
 882       default:
 883         ShouldNotReachHere();
 884     }
 885   }
 886   //
 887   //  Generate post-write barrier for array.
 888   //
 889   //  Input:
 890   //     addr     - register containing starting address
 891   //     count    - register containing element count
 892   //     tmp      - scratch register
 893   //
 894   //  The input registers are overwritten.
 895   //
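  //  For the card-table case the generated loop dirties every card spanned by
  //  the oop range [addr, addr + count * BytesPerHeapOop), roughly:
  //
  //    jbyte* base = ct->byte_map_base();
  //    for (uintptr_t card =  (uintptr_t)first_oop_addr >> CardTable::card_shift;
  //                   card <= (uintptr_t)last_oop_addr  >> CardTable::card_shift;
  //                   card++)
  //      base[card] = 0;    // the stb of G0 below, i.e. the dirty value
  //
  //  For G1 the work is delegated to a runtime call instead.
  //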
 896   void gen_write_ref_array_post_barrier(Register addr, Register count,
 897                                         Register tmp) {
 898     BarrierSet* bs = Universe::heap()->barrier_set();
 899 
 900     switch (bs->kind()) {
 901       case BarrierSet::G1BarrierSet:
 902         {
 903           // Get some new fresh output registers.
 904           __ save_frame(0);
 905           __ mov(addr->after_save(), O0);
 906           __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post));
 907           __ delayed()->mov(count->after_save(), O1);
 908           __ restore();
 909         }
 910         break;
 911       case BarrierSet::CardTableModRef:
 912         {
 913           CardTableModRefBS* ctbs = barrier_set_cast<CardTableModRefBS>(bs);
 914           CardTable* ct = ctbs->card_table();
 915           assert(sizeof(*ct->byte_map_base()) == sizeof(jbyte), "adjust this code");
 916           assert_different_registers(addr, count, tmp);
 917 
 918           Label L_loop, L_done;
 919 
 920           __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_done); // zero count - nothing to do
 921 
 922           __ sll_ptr(count, LogBytesPerHeapOop, count);
 923           __ sub(count, BytesPerHeapOop, count);
 924           __ add(count, addr, count);
 925           // Use two shifts to clear out those low order two bits! (Cannot opt. into 1.)
 926           __ srl_ptr(addr, CardTable::card_shift, addr);
 927           __ srl_ptr(count, CardTable::card_shift, count);
 928           __ sub(count, addr, count);
 929           AddressLiteral rs(ct->byte_map_base());
 930           __ set(rs, tmp);
 931         __ BIND(L_loop);
 932           __ stb(G0, tmp, addr);
 933           __ subcc(count, 1, count);
 934           __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
 935           __ delayed()->add(addr, 1, addr);
 936         __ BIND(L_done);
 937         }
 938         break;
 939       case BarrierSet::ModRef:
 940         break;
 941       default:
 942         ShouldNotReachHere();
 943     }
 944   }
 945 
 946   //
 947   // Generate main code for disjoint arraycopy
 948   //
 949   typedef void (StubGenerator::*CopyLoopFunc)(Register from, Register to, Register count, int count_dec,
 950                                               Label& L_loop, bool use_prefetch, bool use_bis);
 951 
 952   void disjoint_copy_core(Register from, Register to, Register count, int log2_elem_size,
 953                           int iter_size, StubGenerator::CopyLoopFunc copy_loop_func) {
 954     Label L_copy;
 955 
 956     assert(log2_elem_size <= 3, "the following code should be changed");
 957     int count_dec = 16>>log2_elem_size;
 958 
 959     int prefetch_dist = MAX2(ArraycopySrcPrefetchDistance, ArraycopyDstPrefetchDistance);
 960     assert(prefetch_dist < 4096, "invalid value");
 961     prefetch_dist = (prefetch_dist + (iter_size-1)) & (-iter_size); // round up to one iteration copy size
 962     int prefetch_count = (prefetch_dist >> log2_elem_size); // elements count
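    // Worked example with illustrative numbers: for iter_size == 16 and a
    // combined prefetch distance of 100 bytes, prefetch_dist rounds up to
    // (100 + 15) & -16 == 112, and for 4-byte elements (log2_elem_size == 2)
    // prefetch_count == 112 >> 2 == 28 elements.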
 963 
 964     if (UseBlockCopy) {
 965       Label L_block_copy, L_block_copy_prefetch, L_skip_block_copy;
 966 
 967       // 64 bytes tail + bytes copied in one loop iteration
 968       int tail_size = 64 + iter_size;
 969       int block_copy_count = (MAX2(tail_size, (int)BlockCopyLowLimit)) >> log2_elem_size;
 970       // Use BIS copy only for big arrays since it requires membar.
 971       __ set(block_copy_count, O4);
 972       __ cmp_and_br_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_skip_block_copy);
 973       // This code is for disjoint source and destination:
 974       //   to <= from || to >= from+count
 975       // but BIS will stomp over 'from' if (to > from-tail_size && to <= from)
 976       __ sub(from, to, O4);
 977       __ srax(O4, 4, O4); // divide by 16 since the following short branch has only 5 bits for the immediate.
 978       __ cmp_and_br_short(O4, (tail_size>>4), Assembler::lessEqualUnsigned, Assembler::pn, L_skip_block_copy);
 979 
 980       __ wrasi(G0, Assembler::ASI_ST_BLKINIT_PRIMARY);
 981       // BIS should not be used to copy tail (64 bytes+iter_size)
 982       // to avoid zeroing of following values.
 983       __ sub(count, (tail_size>>log2_elem_size), count); // count is still positive >= 0
 984 
 985       if (prefetch_count > 0) { // rounded up to one iteration count
 986         // Do prefetching only if copy size is bigger
 987         // than prefetch distance.
 988         __ set(prefetch_count, O4);
 989         __ cmp_and_brx_short(count, O4, Assembler::less, Assembler::pt, L_block_copy);
 990         __ sub(count, O4, count);
 991 
 992         (this->*copy_loop_func)(from, to, count, count_dec, L_block_copy_prefetch, true, true);
 993         __ set(prefetch_count, O4);
 994         __ add(count, O4, count);
 995 
 996       } // prefetch_count > 0
 997 
 998       (this->*copy_loop_func)(from, to, count, count_dec, L_block_copy, false, true);
 999       __ add(count, (tail_size>>log2_elem_size), count); // restore count
1000 
1001       __ wrasi(G0, Assembler::ASI_PRIMARY_NOFAULT);
1002       // BIS needs membar.
1003       __ membar(Assembler::StoreLoad);
1004       // Copy tail
1005       __ ba_short(L_copy);
1006 
1007       __ BIND(L_skip_block_copy);
1008     } // UseBlockCopy
1009 
1010     if (prefetch_count > 0) { // rounded up to one iteration count
1011       // Do prefetching only if copy size is bigger
1012       // than prefetch distance.
1013       __ set(prefetch_count, O4);
1014       __ cmp_and_brx_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_copy);
1015       __ sub(count, O4, count);
1016 
1017       Label L_copy_prefetch;
1018       (this->*copy_loop_func)(from, to, count, count_dec, L_copy_prefetch, true, false);
1019       __ set(prefetch_count, O4);
1020       __ add(count, O4, count);
1021 
1022     } // prefetch_count > 0
1023 
1024     (this->*copy_loop_func)(from, to, count, count_dec, L_copy, false, false);
1025   }
1026 
1027 
1028 
1029   //
1030   // Helper methods for copy_16_bytes_forward_with_shift()
1031   //
1032   void copy_16_bytes_shift_loop(Register from, Register to, Register count, int count_dec,
1033                                 Label& L_loop, bool use_prefetch, bool use_bis) {
1034 
1035     const Register left_shift  = G1; // left  shift bit counter
1036     const Register right_shift = G5; // right shift bit counter
1037 
1038     __ align(OptoLoopAlignment);
1039     __ BIND(L_loop);
1040     if (use_prefetch) {
1041       if (ArraycopySrcPrefetchDistance > 0) {
1042         __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads);
1043       }
1044       if (ArraycopyDstPrefetchDistance > 0) {
1045         __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads);
1046       }
1047     }
1048     __ ldx(from, 0, O4);
1049     __ ldx(from, 8, G4);
1050     __ inc(to, 16);
1051     __ inc(from, 16);
1052     __ deccc(count, count_dec); // Can we do next iteration after this one?
1053     __ srlx(O4, right_shift, G3);
1054     __ bset(G3, O3);
1055     __ sllx(O4, left_shift,  O4);
1056     __ srlx(G4, right_shift, G3);
1057     __ bset(G3, O4);
1058     if (use_bis) {
1059       __ stxa(O3, to, -16);
1060       __ stxa(O4, to, -8);
1061     } else {
1062       __ stx(O3, to, -16);
1063       __ stx(O4, to, -8);
1064     }
1065     __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
1066     __ delayed()->sllx(G4, left_shift,  O3);
1067   }
1068 
1069   // Copy big chunks forward with shift
1070   //
1071   // Inputs:
1072   //   from      - source array address
1073   //   to        - destination array address, aligned to 8 bytes
1074   //   count     - element count to copy, at least the count equivalent to 16 bytes
1075   //   log2_elem_size - log2 of the element size (the 16-byte count decrement is derived from it)
1076   //   L_copy_bytes - copy exit label
1077   //
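  //  Example with an illustrative misalignment: if 'from' is 3 bytes past an
  //  8-byte boundary, the code below aligns it down, giving left_shift == 24
  //  and right_shift == 40.  Each iteration then rebuilds two destination
  //  double-words from three consecutive aligned loads, roughly:
  //
  //    dst[i]   = (prev   << 24) | (src[i]   >> 40);
  //    dst[i+1] = (src[i] << 24) | (src[i+1] >> 40);
  //
  //  where 'prev' is the aligned word carried over from the previous iteration
  //  (big-endian byte order makes the shift roles work out this way).
  //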
1078   void copy_16_bytes_forward_with_shift(Register from, Register to,
1079                      Register count, int log2_elem_size, Label& L_copy_bytes) {
1080     Label L_aligned_copy, L_copy_last_bytes;
1081     assert(log2_elem_size <= 3, "the following code should be changed");
1082     int count_dec = 16>>log2_elem_size;
1083 
1084     // if both arrays have the same alignment mod 8, do 8 bytes aligned copy
1085     __ andcc(from, 7, G1); // misaligned bytes
1086     __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
1087     __ delayed()->nop();
1088 
1089     const Register left_shift  = G1; // left  shift bit counter
1090     const Register right_shift = G5; // right shift bit counter
1091 
1092     __ sll(G1, LogBitsPerByte, left_shift);
1093     __ mov(64, right_shift);
1094     __ sub(right_shift, left_shift, right_shift);
1095 
1096     //
1097     // Load 2 aligned 8-bytes chunks and use one from previous iteration
1098     // to form 2 aligned 8-bytes chunks to store.
1099     //
1100     __ dec(count, count_dec);   // Pre-decrement 'count'
1101     __ andn(from, 7, from);     // Align address
1102     __ ldx(from, 0, O3);
1103     __ inc(from, 8);
1104     __ sllx(O3, left_shift,  O3);
1105 
1106     disjoint_copy_core(from, to, count, log2_elem_size, 16, &StubGenerator::copy_16_bytes_shift_loop);
1107 
1108     __ inccc(count, count_dec>>1 ); // + 8 bytes
1109     __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
1110     __ delayed()->inc(count, count_dec>>1); // restore 'count'
1111 
1112     // copy 8 bytes, part of them already loaded in O3
1113     __ ldx(from, 0, O4);
1114     __ inc(to, 8);
1115     __ inc(from, 8);
1116     __ srlx(O4, right_shift, G3);
1117     __ bset(O3, G3);
1118     __ stx(G3, to, -8);
1119 
1120     __ BIND(L_copy_last_bytes);
1121     __ srl(right_shift, LogBitsPerByte, right_shift); // misaligned bytes
1122     __ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
1123     __ delayed()->sub(from, right_shift, from);       // restore address
1124 
1125     __ BIND(L_aligned_copy);
1126   }
1127 
1128   // Copy big chunks backward with shift
1129   //
1130   // Inputs:
1131   //   end_from  - source array end address
1132   //   end_to    - destination array end address, aligned to 8 bytes
1133   //   count     - element count to copy, at least the count equivalent to 16 bytes
1134   //   count_dec - element count decrement equivalent to 16 bytes
1135   //   L_aligned_copy - aligned copy exit label
1136   //   L_copy_bytes   - copy exit label
1137   //
1138   void copy_16_bytes_backward_with_shift(Register end_from, Register end_to,
1139                      Register count, int count_dec,
1140                      Label& L_aligned_copy, Label& L_copy_bytes) {
1141     Label L_loop, L_copy_last_bytes;
1142 
1143     // if both arrays have the same alignment mod 8, do 8 bytes aligned copy
1144       __ andcc(end_from, 7, G1); // misaligned bytes
1145       __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
1146       __ delayed()->deccc(count, count_dec); // Pre-decrement 'count'
1147 
1148     const Register left_shift  = G1; // left  shift bit counter
1149     const Register right_shift = G5; // right shift bit counter
1150 
1151       __ sll(G1, LogBitsPerByte, left_shift);
1152       __ mov(64, right_shift);
1153       __ sub(right_shift, left_shift, right_shift);
1154 
1155     //
1156     // Load 2 aligned 8-bytes chunks and use one from previous iteration
1157     // to form 2 aligned 8-bytes chunks to store.
1158     //
1159       __ andn(end_from, 7, end_from);     // Align address
1160       __ ldx(end_from, 0, O3);
1161       __ align(OptoLoopAlignment);
1162     __ BIND(L_loop);
1163       __ ldx(end_from, -8, O4);
1164       __ deccc(count, count_dec); // Can we do next iteration after this one?
1165       __ ldx(end_from, -16, G4);
1166       __ dec(end_to, 16);
1167       __ dec(end_from, 16);
1168       __ srlx(O3, right_shift, O3);
1169       __ sllx(O4, left_shift,  G3);
1170       __ bset(G3, O3);
1171       __ stx(O3, end_to, 8);
1172       __ srlx(O4, right_shift, O4);
1173       __ sllx(G4, left_shift,  G3);
1174       __ bset(G3, O4);
1175       __ stx(O4, end_to, 0);
1176       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
1177       __ delayed()->mov(G4, O3);
1178 
1179       __ inccc(count, count_dec>>1 ); // + 8 bytes
1180       __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
1181       __ delayed()->inc(count, count_dec>>1); // restore 'count'
1182 
1183       // copy 8 bytes, part of them already loaded in O3
1184       __ ldx(end_from, -8, O4);
1185       __ dec(end_to, 8);
1186       __ dec(end_from, 8);
1187       __ srlx(O3, right_shift, O3);
1188       __ sllx(O4, left_shift,  G3);
1189       __ bset(O3, G3);
1190       __ stx(G3, end_to, 0);
1191 
1192     __ BIND(L_copy_last_bytes);
1193       __ srl(left_shift, LogBitsPerByte, left_shift);    // misaligned bytes
1194       __ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
1195       __ delayed()->add(end_from, left_shift, end_from); // restore address
1196   }
1197 
1198   //
1199   //  Generate stub for disjoint byte copy.  If "aligned" is true, the
1200   //  "from" and "to" addresses are assumed to be heapword aligned.
1201   //
1202   // Arguments for generated stub:
1203   //      from:  O0
1204   //      to:    O1
1205   //      count: O2 treated as signed
1206   //
1207   address generate_disjoint_byte_copy(bool aligned, address *entry, const char *name) {
1208     __ align(CodeEntryAlignment);
1209     StubCodeMark mark(this, "StubRoutines", name);
1210     address start = __ pc();
1211 
1212     Label L_skip_alignment, L_align;
1213     Label L_copy_byte, L_copy_byte_loop, L_exit;
1214 
1215     const Register from      = O0;   // source array address
1216     const Register to        = O1;   // destination array address
1217     const Register count     = O2;   // elements count
1218     const Register offset    = O5;   // offset from start of arrays
1219     // O3, O4, G3, G4 are used as temp registers
1220 
1221     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
1222 
1223     if (entry != NULL) {
1224       *entry = __ pc();
1225       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1226       BLOCK_COMMENT("Entry:");
1227     }
1228 
1229     // for short arrays, just do single element copy
1230     __ cmp(count, 23); // 16 + 7
1231     __ brx(Assembler::less, false, Assembler::pn, L_copy_byte);
1232     __ delayed()->mov(G0, offset);
1233 
1234     if (aligned) {
1235       // 'aligned' == true when it is known statically during compilation
1236       // of this arraycopy call site that both 'from' and 'to' addresses
1237       // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
1238       //
1239       // Aligned arrays have 4-byte alignment in the 32-bit VM
1240       // and 8-byte alignment in the 64-bit VM, so this alignment step is only needed in the 32-bit VM.
1241       //
1242     } else {
1243       // copy bytes to align 'to' on 8 byte boundary
1244       __ andcc(to, 7, G1); // misaligned bytes
1245       __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1246       __ delayed()->neg(G1);
1247       __ inc(G1, 8);       // bytes needed to reach the next 8-byte alignment
1248       __ sub(count, G1, count);
1249     __ BIND(L_align);
1250       __ ldub(from, 0, O3);
1251       __ deccc(G1);
1252       __ inc(from);
1253       __ stb(O3, to, 0);
1254       __ br(Assembler::notZero, false, Assembler::pt, L_align);
1255       __ delayed()->inc(to);
1256     __ BIND(L_skip_alignment);
1257     }
1258     if (!aligned) {
1259       // Copy with shift 16 bytes per iteration if arrays do not have
1260       // the same alignment mod 8, otherwise fall through to the next
1261       // code for aligned copy.
1262       // The compare above (count >= 23) guarantees 'count' >= 16 bytes.
1263       // Also jump over aligned copy after the copy with shift completed.
1264 
1265       copy_16_bytes_forward_with_shift(from, to, count, 0, L_copy_byte);
1266     }
1267 
1268     // Both arrays are 8-byte aligned, copy 16 bytes at a time
1269       __ and3(count, 7, G4); // Save count
1270       __ srl(count, 3, count);
1271      generate_disjoint_long_copy_core(aligned);
1272       __ mov(G4, count);     // Restore count
1273 
1274     // copy trailing bytes
1275     __ BIND(L_copy_byte);
1276       __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
1277       __ align(OptoLoopAlignment);
1278     __ BIND(L_copy_byte_loop);
1279       __ ldub(from, offset, O3);
1280       __ deccc(count);
1281       __ stb(O3, to, offset);
1282       __ brx(Assembler::notZero, false, Assembler::pt, L_copy_byte_loop);
1283       __ delayed()->inc(offset);
1284 
1285     __ BIND(L_exit);
1286       // O3, O4 are used as temp registers
1287       inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr, O3, O4);
1288       __ retl();
1289       __ delayed()->mov(G0, O0); // return 0
1290     return start;
1291   }
1292 
1293   //
1294   //  Generate stub for conjoint byte copy.  If "aligned" is true, the
1295   //  "from" and "to" addresses are assumed to be heapword aligned.
1296   //
1297   // Arguments for generated stub:
1298   //      from:  O0
1299   //      to:    O1
1300   //      count: O2 treated as signed
1301   //
1302   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1303                                       address *entry, const char *name) {
1304     // Do reverse copy.
1305 
1306     __ align(CodeEntryAlignment);
1307     StubCodeMark mark(this, "StubRoutines", name);
1308     address start = __ pc();
1309 
1310     Label L_skip_alignment, L_align, L_aligned_copy;
1311     Label L_copy_byte, L_copy_byte_loop, L_exit;
1312 
1313     const Register from      = O0;   // source array address
1314     const Register to        = O1;   // destination array address
1315     const Register count     = O2;   // elements count
1316     const Register end_from  = from; // source array end address
1317     const Register end_to    = to;   // destination array end address
1318 
1319     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
1320 
1321     if (entry != NULL) {
1322       *entry = __ pc();
1323       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1324       BLOCK_COMMENT("Entry:");
1325     }
1326 
1327     array_overlap_test(nooverlap_target, 0);
1328 
1329     __ add(to, count, end_to);       // offset after last copied element
1330 
1331     // for short arrays, just do single element copy
1332     __ cmp(count, 23); // 16 + 7
1333     __ brx(Assembler::less, false, Assembler::pn, L_copy_byte);
1334     __ delayed()->add(from, count, end_from);
1335 
1336     {
1337       // Align the end of the arrays since it may not be aligned even
1338       // when the arrays themselves are aligned.
1339 
1340       // copy bytes to align 'end_to' on 8 byte boundary
1341       __ andcc(end_to, 7, G1); // misaligned bytes
1342       __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1343       __ delayed()->nop();
1344       __ sub(count, G1, count);
1345     __ BIND(L_align);
1346       __ dec(end_from);
1347       __ dec(end_to);
1348       __ ldub(end_from, 0, O3);
1349       __ deccc(G1);
1350       __ brx(Assembler::notZero, false, Assembler::pt, L_align);
1351       __ delayed()->stb(O3, end_to, 0);
1352     __ BIND(L_skip_alignment);
1353     }
1354     if (aligned) {
1355       // Both arrays are aligned to 8-bytes in 64-bits VM.
1356       // The 'count' is decremented in copy_16_bytes_backward_with_shift()
1357       // in unaligned case.
1358       __ dec(count, 16);
1359     } else {
1360       // Copy with shift 16 bytes per iteration if arrays do not have
1361       // the same alignment mod 8, otherwise jump to the next
1362       // code for aligned copy (subtracting 16 from 'count' before the jump).
1363       // The compare above (count >= 23) guarantees 'count' >= 16 bytes.
1364       // Also jump over aligned copy after the copy with shift completed.
1365 
1366       copy_16_bytes_backward_with_shift(end_from, end_to, count, 16,
1367                                         L_aligned_copy, L_copy_byte);
1368     }
1369     // copy 4 elements (16 bytes) at a time
1370       __ align(OptoLoopAlignment);
1371     __ BIND(L_aligned_copy);
1372       __ dec(end_from, 16);
1373       __ ldx(end_from, 8, O3);
1374       __ ldx(end_from, 0, O4);
1375       __ dec(end_to, 16);
1376       __ deccc(count, 16);
1377       __ stx(O3, end_to, 8);
1378       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
1379       __ delayed()->stx(O4, end_to, 0);
1380       __ inc(count, 16);
1381 
1382     // copy 1 element (1 byte) at a time
1383     __ BIND(L_copy_byte);
1384       __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
1385       __ align(OptoLoopAlignment);
1386     __ BIND(L_copy_byte_loop);
1387       __ dec(end_from);
1388       __ dec(end_to);
1389       __ ldub(end_from, 0, O4);
1390       __ deccc(count);
1391       __ brx(Assembler::greater, false, Assembler::pt, L_copy_byte_loop);
1392       __ delayed()->stb(O4, end_to, 0);
1393 
1394     __ BIND(L_exit);
1395     // O3, O4 are used as temp registers
1396     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr, O3, O4);
1397     __ retl();
1398     __ delayed()->mov(G0, O0); // return 0
1399     return start;
1400   }
1401 
1402   //
1403   //  Generate stub for disjoint short copy.  If "aligned" is true, the
1404   //  "from" and "to" addresses are assumed to be heapword aligned.
1405   //
1406   // Arguments for generated stub:
1407   //      from:  O0
1408   //      to:    O1
1409   //      count: O2 treated as signed
1410   //
1411   address generate_disjoint_short_copy(bool aligned, address *entry, const char * name) {
1412     __ align(CodeEntryAlignment);
1413     StubCodeMark mark(this, "StubRoutines", name);
1414     address start = __ pc();
1415 
1416     Label L_skip_alignment, L_skip_alignment2;
1417     Label L_copy_2_bytes, L_copy_2_bytes_loop, L_exit;
1418 
1419     const Register from      = O0;   // source array address
1420     const Register to        = O1;   // destination array address
1421     const Register count     = O2;   // elements count
1422     const Register offset    = O5;   // offset from start of arrays
1423     // O3, O4, G3, G4 are used as temp registers
1424 
1425     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
1426 
1427     if (entry != NULL) {
1428       *entry = __ pc();
1429       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1430       BLOCK_COMMENT("Entry:");
1431     }
1432 
1433     // for short arrays, just do single element copy
1434     __ cmp(count, 11); // 8 + 3  (22 bytes)
1435     __ brx(Assembler::less, false, Assembler::pn, L_copy_2_bytes);
1436     __ delayed()->mov(G0, offset);
1437 
1438     if (aligned) {
1439       // 'aligned' == true when it is known statically during compilation
1440       // of this arraycopy call site that both 'from' and 'to' addresses
1441       // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
1442       //
1443       // Aligned arrays have 4 byte alignment in a 32-bit VM
1444       // and 8 byte alignment in a 64-bit VM.
1445       //
1446     } else {
1447       // copy 1 element if necessary to align 'to' on a 4 byte boundary
1448       __ andcc(to, 3, G0);
1449       __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1450       __ delayed()->lduh(from, 0, O3);
1451       __ inc(from, 2);
1452       __ inc(to, 2);
1453       __ dec(count);
1454       __ sth(O3, to, -2);
1455     __ BIND(L_skip_alignment);
1456 
1457       // copy 2 elements to align 'to' on an 8 byte boundary
1458       __ andcc(to, 7, G0);
1459       __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment2);
1460       __ delayed()->lduh(from, 0, O3);
1461       __ dec(count, 2);
1462       __ lduh(from, 2, O4);
1463       __ inc(from, 4);
1464       __ inc(to, 4);
1465       __ sth(O3, to, -4);
1466       __ sth(O4, to, -2);
1467     __ BIND(L_skip_alignment2);
1468     }
1469     if (!aligned) {
1470       // Copy with shift 16 bytes per iteration if arrays do not have
1471       // the same alignment mod 8, otherwise fall through to the next
1472       // code for aligned copy.
1473       // The compare above (count >= 11) guarantees 'count' >= 16 bytes.
1474       // Also jump over the aligned copy after the copy with shift completes.
1475 
1476       copy_16_bytes_forward_with_shift(from, to, count, 1, L_copy_2_bytes);
1477     }
1478 
1479     // Both arrays are 8 byte aligned, copy 16 bytes at a time
1480       __ and3(count, 3, G4); // Save
1481       __ srl(count, 2, count);
1482      generate_disjoint_long_copy_core(aligned);
1483       __ mov(G4, count); // restore
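           // The and3/srl above split the short count into whole 8-byte words
           // (4 shorts each), which generate_disjoint_long_copy_core() copies,
           // and a 0..3 element remainder saved in G4 and restored here for the
           // element-by-element tail loop below.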
1484 
1485     // copy 1 element at a time
1486     __ BIND(L_copy_2_bytes);
1487       __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
1488       __ align(OptoLoopAlignment);
1489     __ BIND(L_copy_2_bytes_loop);
1490       __ lduh(from, offset, O3);
1491       __ deccc(count);
1492       __ sth(O3, to, offset);
1493       __ brx(Assembler::notZero, false, Assembler::pt, L_copy_2_bytes_loop);
1494       __ delayed()->inc(offset, 2);
1495 
1496     __ BIND(L_exit);
1497       // O3, O4 are used as temp registers
1498       inc_counter_np(SharedRuntime::_jshort_array_copy_ctr, O3, O4);
1499       __ retl();
1500       __ delayed()->mov(G0, O0); // return 0
1501     return start;
1502   }
1503 
1504   //
1505   //  Generate stub for disjoint fill (byte, short, or int).  If "aligned" is true, the
1506   //  "to" address is assumed to be heapword aligned.
1507   //
1508   // Arguments for generated stub:
1509   //      to:    O0
1510   //      value: O1
1511   //      count: O2 treated as signed
1512   //
1513   address generate_fill(BasicType t, bool aligned, const char* name) {
1514     __ align(CodeEntryAlignment);
1515     StubCodeMark mark(this, "StubRoutines", name);
1516     address start = __ pc();
1517 
1518     const Register to        = O0;   // destination array address
1519     const Register value     = O1;   // fill value
1520     const Register count     = O2;   // elements count
1521     // O3 is used as a temp register
1522 
1523     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
1524 
1525     Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
1526     Label L_fill_2_bytes, L_fill_elements, L_fill_32_bytes;
1527 
1528     int shift = -1;
1529     switch (t) {
1530       case T_BYTE:
1531         shift = 2;
1532         break;
1533       case T_SHORT:
1534         shift = 1;
1535         break;
1536       case T_INT:
1537         shift = 0;
1538         break;
1539       default: ShouldNotReachHere();
1540     }
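         // 'shift' is log2 of the number of elements per 32-bit word, so
         // (n << shift) converts a count of n words into an element count.
         // For example, 8 << shift is always 32 bytes worth of elements:
         // 32 bytes (T_BYTE, shift = 2), 16 shorts (T_SHORT, shift = 1),
         // or 8 ints (T_INT, shift = 0).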
1541 
1542     BLOCK_COMMENT("Entry:");
1543 
1544     if (t == T_BYTE) {
1545       // Zero extend value
1546       __ and3(value, 0xff, value);
1547       __ sllx(value, 8, O3);
1548       __ or3(value, O3, value);
1549     }
1550     if (t == T_SHORT) {
1551       // Zero extend value
1552       __ sllx(value, 48, value);
1553       __ srlx(value, 48, value);
1554     }
1555     if (t == T_BYTE || t == T_SHORT) {
1556       __ sllx(value, 16, O3);
1557       __ or3(value, O3, value);
1558     }
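         // For T_BYTE and T_SHORT, 'value' now holds the fill pattern replicated
         // across its low 32 bits.  For illustration: a T_BYTE fill of 0x41
         // becomes 0x4141 after the 8-bit fold and 0x41414141 after the 16-bit
         // fold above; a T_SHORT fill of 0x1234 becomes 0x12341234.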
1559 
1560     __ cmp(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
1561     __ brx(Assembler::lessUnsigned, false, Assembler::pn, L_fill_elements); // use unsigned cmp
1562     __ delayed()->andcc(count, 1, G0);
1563 
1564     if (!aligned && (t == T_BYTE || t == T_SHORT)) {
1565       // align destination address on a 4 byte boundary
1566       if (t == T_BYTE) {
1567         // One byte misalignment happens only for byte arrays
1568         __ andcc(to, 1, G0);
1569         __ br(Assembler::zero, false, Assembler::pt, L_skip_align1);
1570         __ delayed()->nop();
1571         __ stb(value, to, 0);
1572         __ inc(to, 1);
1573         __ dec(count, 1);
1574         __ BIND(L_skip_align1);
1575       }
1576       // Two bytes misalignment happens only for byte and short (char) arrays
1577       __ andcc(to, 2, G0);
1578       __ br(Assembler::zero, false, Assembler::pt, L_skip_align2);
1579       __ delayed()->nop();
1580       __ sth(value, to, 0);
1581       __ inc(to, 2);
1582       __ dec(count, 1 << (shift - 1));
1583       __ BIND(L_skip_align2);
1584     }
1585     if (!aligned) {
1586       // align to 8 bytes, we know we are 4 byte aligned to start
1587       __ andcc(to, 7, G0);
1588       __ br(Assembler::zero, false, Assembler::pt, L_fill_32_bytes);
1589       __ delayed()->nop();
1590       __ stw(value, to, 0);
1591       __ inc(to, 4);
1592       __ dec(count, 1 << shift);
1593       __ BIND(L_fill_32_bytes);
1594     }
1595 
1596     if (t == T_INT) {
1597       // Zero extend value
1598       __ srl(value, 0, value);
1599     }
1600     if (t == T_BYTE || t == T_SHORT || t == T_INT) {
1601       __ sllx(value, 32, O3);
1602       __ or3(value, O3, value);
1603     }
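         // 'value' now holds the pattern replicated across all 64 bits (e.g. a
         // byte fill of 0x41 is 0x4141414141414141), so the 8-byte stores below
         // write the fill value correctly regardless of element size.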
1604 
1605     Label L_check_fill_8_bytes;
1606     // Fill 32-byte chunks
1607     __ subcc(count, 8 << shift, count);
1608     __ brx(Assembler::less, false, Assembler::pt, L_check_fill_8_bytes);
1609     __ delayed()->nop();
1610 
1611     Label L_fill_32_bytes_loop, L_fill_4_bytes;
1612     __ align(16);
1613     __ BIND(L_fill_32_bytes_loop);
1614 
1615     __ stx(value, to, 0);
1616     __ stx(value, to, 8);
1617     __ stx(value, to, 16);
1618     __ stx(value, to, 24);
1619 
1620     __ subcc(count, 8 << shift, count);
1621     __ brx(Assembler::greaterEqual, false, Assembler::pt, L_fill_32_bytes_loop);
1622     __ delayed()->add(to, 32, to);
1623 
1624     __ BIND(L_check_fill_8_bytes);
1625     __ addcc(count, 8 << shift, count);
1626     __ brx(Assembler::zero, false, Assembler::pn, L_exit);
1627     __ delayed()->subcc(count, 1 << (shift + 1), count);
1628     __ brx(Assembler::less, false, Assembler::pn, L_fill_4_bytes);
1629     __ delayed()->andcc(count, 1<<shift, G0);
1630 
1631     //
1632     // length is too short, just fill 8 bytes at a time
1633     //
1634     Label L_fill_8_bytes_loop;
1635     __ BIND(L_fill_8_bytes_loop);
1636     __ stx(value, to, 0);
1637     __ subcc(count, 1 << (shift + 1), count);
1638     __ brx(Assembler::greaterEqual, false, Assembler::pn, L_fill_8_bytes_loop);
1639     __ delayed()->add(to, 8, to);
1640 
1641     // fill trailing 4 bytes
1642     __ andcc(count, 1<<shift, G0);  // in delay slot of branches
1643     if (t == T_INT) {
1644       __ BIND(L_fill_elements);
1645     }
1646     __ BIND(L_fill_4_bytes);
1647     __ brx(Assembler::zero, false, Assembler::pt, L_fill_2_bytes);
1648     if (t == T_BYTE || t == T_SHORT) {
1649       __ delayed()->andcc(count, 1<<(shift-1), G0);
1650     } else {
1651       __ delayed()->nop();
1652     }
1653     __ stw(value, to, 0);
1654     if (t == T_BYTE || t == T_SHORT) {
1655       __ inc(to, 4);
1656       // fill trailing 2 bytes
1657       __ andcc(count, 1<<(shift-1), G0); // in delay slot of branches
1658       __ BIND(L_fill_2_bytes);
1659       __ brx(Assembler::zero, false, Assembler::pt, L_fill_byte);
1660       __ delayed()->andcc(count, 1, count);
1661       __ sth(value, to, 0);
1662       if (t == T_BYTE) {
1663         __ inc(to, 2);
1664         // fill trailing byte
1665         __ andcc(count, 1, count);  // in delay slot of branches
1666         __ BIND(L_fill_byte);
1667         __ brx(Assembler::zero, false, Assembler::pt, L_exit);
1668         __ delayed()->nop();
1669         __ stb(value, to, 0);
1670       } else {
1671         __ BIND(L_fill_byte);
1672       }
1673     } else {
1674       __ BIND(L_fill_2_bytes);
1675     }
1676     __ BIND(L_exit);
1677     __ retl();
1678     __ delayed()->nop();
1679 
1680     // Handle fills of less than 8 bytes.  Int is handled elsewhere.
1681     if (t == T_BYTE) {
1682       __ BIND(L_fill_elements);
1683       Label L_fill_2, L_fill_4;
1684       // in delay slot __ andcc(count, 1, G0);
1685       __ brx(Assembler::zero, false, Assembler::pt, L_fill_2);
1686       __ delayed()->andcc(count, 2, G0);
1687       __ stb(value, to, 0);
1688       __ inc(to, 1);
1689       __ BIND(L_fill_2);
1690       __ brx(Assembler::zero, false, Assembler::pt, L_fill_4);
1691       __ delayed()->andcc(count, 4, G0);
1692       __ stb(value, to, 0);
1693       __ stb(value, to, 1);
1694       __ inc(to, 2);
1695       __ BIND(L_fill_4);
1696       __ brx(Assembler::zero, false, Assembler::pt, L_exit);
1697       __ delayed()->nop();
1698       __ stb(value, to, 0);
1699       __ stb(value, to, 1);
1700       __ stb(value, to, 2);
1701       __ retl();
1702       __ delayed()->stb(value, to, 3);
1703     }
1704 
1705     if (t == T_SHORT) {
1706       Label L_fill_2;
1707       __ BIND(L_fill_elements);
1708       // in delay slot __ andcc(count, 1, G0);
1709       __ brx(Assembler::zero, false, Assembler::pt, L_fill_2);
1710       __ delayed()->andcc(count, 2, G0);
1711       __ sth(value, to, 0);
1712       __ inc(to, 2);
1713       __ BIND(L_fill_2);
1714       __ brx(Assembler::zero, false, Assembler::pt, L_exit);
1715       __ delayed()->nop();
1716       __ sth(value, to, 0);
1717       __ retl();
1718       __ delayed()->sth(value, to, 2);
1719     }
1720     return start;
1721   }
1722 
1723   //
1724   //  Generate stub for conjoint short copy.  If "aligned" is true, the
1725   //  "from" and "to" addresses are assumed to be heapword aligned.
1726   //
1727   // Arguments for generated stub:
1728   //      from:  O0
1729   //      to:    O1
1730   //      count: O2 treated as signed
1731   //
1732   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1733                                        address *entry, const char *name) {
1734     // Do reverse copy.
1735 
1736     __ align(CodeEntryAlignment);
1737     StubCodeMark mark(this, "StubRoutines", name);
1738     address start = __ pc();
1739 
1740     Label L_skip_alignment, L_skip_alignment2, L_aligned_copy;
1741     Label L_copy_2_bytes, L_copy_2_bytes_loop, L_exit;
1742 
1743     const Register from      = O0;   // source array address
1744     const Register to        = O1;   // destination array address
1745     const Register count     = O2;   // elements count
1746     const Register end_from  = from; // source array end address
1747     const Register end_to    = to;   // destination array end address
1748 
1749     const Register byte_count = O3;  // bytes count to copy
1750 
1751     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
1752 
1753     if (entry != NULL) {
1754       *entry = __ pc();
1755       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1756       BLOCK_COMMENT("Entry:");
1757     }
1758 
1759     array_overlap_test(nooverlap_target, 1);
1760 
1761     __ sllx(count, LogBytesPerShort, byte_count);
1762     __ add(to, byte_count, end_to);  // offset after last copied element
1763 
1764     // for short arrays, just do single element copy
1765     __ cmp(count, 11); // 8 + 3  (22 bytes)
1766     __ brx(Assembler::less, false, Assembler::pn, L_copy_2_bytes);
1767     __ delayed()->add(from, byte_count, end_from);
1768 
1769     {
1770       // Align the ends of the arrays since they may not be aligned even
1771       // when the arrays themselves are aligned.
1772 
1773       // copy 1 element if necessary to align 'end_to' on a 4 byte boundary
1774       __ andcc(end_to, 3, G0);
1775       __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1776       __ delayed()->lduh(end_from, -2, O3);
1777       __ dec(end_from, 2);
1778       __ dec(end_to, 2);
1779       __ dec(count);
1780       __ sth(O3, end_to, 0);
1781     __ BIND(L_skip_alignment);
1782 
1783       // copy 2 elements to align 'end_to' on an 8 byte boundary
1784       __ andcc(end_to, 7, G0);
1785       __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment2);
1786       __ delayed()->lduh(end_from, -2, O3);
1787       __ dec(count, 2);
1788       __ lduh(end_from, -4, O4);
1789       __ dec(end_from, 4);
1790       __ dec(end_to, 4);
1791       __ sth(O3, end_to, 2);
1792       __ sth(O4, end_to, 0);
1793     __ BIND(L_skip_alignment2);
1794     }
1795     if (aligned) {
1796       // Both arrays are aligned to 8 bytes in a 64-bit VM.
1797       // The 'count' is decremented in copy_16_bytes_backward_with_shift()
1798       // in the unaligned case.
1799       __ dec(count, 8);
1800     } else {
1801       // Copy with shift 16 bytes per iteration if arrays do not have
1802       // the same alignment mod 8, otherwise jump to the next
1803       // code for aligned copy (and subtracting 8 from 'count' before the jump).
1804       // The compare above (count >= 11) guarantees 'count' >= 16 bytes.
1805       // Also jump over the aligned copy after the copy with shift completes.
1806 
1807       copy_16_bytes_backward_with_shift(end_from, end_to, count, 8,
1808                                         L_aligned_copy, L_copy_2_bytes);
1809     }
1810     // copy 4 elements (16 bytes) at a time
1811       __ align(OptoLoopAlignment);
1812     __ BIND(L_aligned_copy);
1813       __ dec(end_from, 16);
1814       __ ldx(end_from, 8, O3);
1815       __ ldx(end_from, 0, O4);
1816       __ dec(end_to, 16);
1817       __ deccc(count, 8);
1818       __ stx(O3, end_to, 8);
1819       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
1820       __ delayed()->stx(O4, end_to, 0);
1821       __ inc(count, 8);
1822 
1823     // copy 1 element (2 bytes) at a time
1824     __ BIND(L_copy_2_bytes);
1825       __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
1826     __ BIND(L_copy_2_bytes_loop);
1827       __ dec(end_from, 2);
1828       __ dec(end_to, 2);
1829       __ lduh(end_from, 0, O4);
1830       __ deccc(count);
1831       __ brx(Assembler::greater, false, Assembler::pt, L_copy_2_bytes_loop);
1832       __ delayed()->sth(O4, end_to, 0);
1833 
1834     __ BIND(L_exit);
1835     // O3, O4 are used as temp registers
1836     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr, O3, O4);
1837     __ retl();
1838     __ delayed()->mov(G0, O0); // return 0
1839     return start;
1840   }
1841 
1842   //
1843   // Helper methods for generate_disjoint_int_copy_core()
1844   //
1845   void copy_16_bytes_loop(Register from, Register to, Register count, int count_dec,
1846                           Label& L_loop, bool use_prefetch, bool use_bis) {
1847 
1848     __ align(OptoLoopAlignment);
1849     __ BIND(L_loop);
1850     if (use_prefetch) {
1851       if (ArraycopySrcPrefetchDistance > 0) {
1852         __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads);
1853       }
1854       if (ArraycopyDstPrefetchDistance > 0) {
1855         __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads);
1856       }
1857     }
1858     __ ldx(from, 4, O4);
1859     __ ldx(from, 12, G4);
1860     __ inc(to, 16);
1861     __ inc(from, 16);
1862     __ deccc(count, 4); // Can we do next iteration after this one?
1863 
1864     __ srlx(O4, 32, G3);
1865     __ bset(G3, O3);
1866     __ sllx(O4, 32, O4);
1867     __ srlx(G4, 32, G3);
1868     __ bset(G3, O4);
1869     if (use_bis) {
1870       __ stxa(O3, to, -16);
1871       __ stxa(O4, to, -8);
1872     } else {
1873       __ stx(O3, to, -16);
1874       __ stx(O4, to, -8);
1875     }
1876     __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
1877     __ delayed()->sllx(G4, 32,  O3);
1878 
1879   }
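       // Each iteration carries the last 32-bit word of the previous loads in the
       // upper half of O3 (the sllx in the branch delay slot) and merges it with
       // the leading word of the newly loaded 8 bytes, so a source that is offset
       // by 4 bytes from the destination's 8-byte alignment can still be written
       // with aligned 8-byte stx/stxa stores.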
1880 
1881   //
1882   //  Generate core code for disjoint int copy (and oop copy with compressed oops).
1883   //  If "aligned" is true, the "from" and "to" addresses are assumed
1884   //  to be heapword aligned.
1885   //
1886   // Arguments:
1887   //      from:  O0
1888   //      to:    O1
1889   //      count: O2 treated as signed
1890   //
1891   void generate_disjoint_int_copy_core(bool aligned) {
1892 
1893     Label L_skip_alignment, L_aligned_copy;
1894     Label L_copy_4_bytes, L_copy_4_bytes_loop, L_exit;
1895 
1896     const Register from      = O0;   // source array address
1897     const Register to        = O1;   // destination array address
1898     const Register count     = O2;   // elements count
1899     const Register offset    = O5;   // offset from start of arrays
1900     // O3, O4, G3, G4 are used as temp registers
1901 
1902     // 'aligned' == true when it is known statically during compilation
1903     // of this arraycopy call site that both 'from' and 'to' addresses
1904     // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
1905     //
1906     // Aligned arrays have 4 byte alignment in a 32-bit VM
1907     // and 8 byte alignment in a 64-bit VM.
1908     //
1909     if (!aligned) {
1910       // The next check could be put under 'ifndef' since the code in
1911       // generate_disjoint_long_copy_core() has its own checks and sets 'offset'.
1912 
1913       // for short arrays, just do single element copy
1914       __ cmp(count, 5); // 4 + 1 (20 bytes)
1915       __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_4_bytes);
1916       __ delayed()->mov(G0, offset);
1917 
1918       // copy 1 element to align 'to' on an 8 byte boundary
1919       __ andcc(to, 7, G0);
1920       __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1921       __ delayed()->ld(from, 0, O3);
1922       __ inc(from, 4);
1923       __ inc(to, 4);
1924       __ dec(count);
1925       __ st(O3, to, -4);
1926     __ BIND(L_skip_alignment);
1927 
1928     // if arrays have same alignment mod 8, do 4 elements copy
1929       __ andcc(from, 7, G0);
1930       __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
1931       __ delayed()->ld(from, 0, O3);
1932 
1933     //
1934     // Load 2 aligned 8-bytes chunks and use one from previous iteration
1935     // to form 2 aligned 8-bytes chunks to store.
1936     //
1937     // copy_16_bytes_forward_with_shift() is not used here since this
1938     // code is more optimal.
1939 
1940     // copy with shift 4 elements (16 bytes) at a time
1941       __ dec(count, 4);   // The cmp at the beginning guarantees count >= 4
1942       __ sllx(O3, 32,  O3);
1943 
1944       disjoint_copy_core(from, to, count, 2, 16, &StubGenerator::copy_16_bytes_loop);
1945 
1946       __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes);
1947       __ delayed()->inc(count, 4); // restore 'count'
1948 
1949     __ BIND(L_aligned_copy);
1950     } // !aligned
1951 
1952     // copy 4 elements (16 bytes) at a time
1953       __ and3(count, 1, G4); // Save
1954       __ srl(count, 1, count);
1955      generate_disjoint_long_copy_core(aligned);
1956       __ mov(G4, count);     // Restore
1957 
1958     // copy 1 element at a time
1959     __ BIND(L_copy_4_bytes);
1960       __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
1961     __ BIND(L_copy_4_bytes_loop);
1962       __ ld(from, offset, O3);
1963       __ deccc(count);
1964       __ st(O3, to, offset);
1965       __ brx(Assembler::notZero, false, Assembler::pt, L_copy_4_bytes_loop);
1966       __ delayed()->inc(offset, 4);
1967     __ BIND(L_exit);
1968   }
1969 
1970   //
1971   //  Generate stub for disjoint int copy.  If "aligned" is true, the
1972   //  "from" and "to" addresses are assumed to be heapword aligned.
1973   //
1974   // Arguments for generated stub:
1975   //      from:  O0
1976   //      to:    O1
1977   //      count: O2 treated as signed
1978   //
1979   address generate_disjoint_int_copy(bool aligned, address *entry, const char *name) {
1980     __ align(CodeEntryAlignment);
1981     StubCodeMark mark(this, "StubRoutines", name);
1982     address start = __ pc();
1983 
1984     const Register count = O2;
1985     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
1986 
1987     if (entry != NULL) {
1988       *entry = __ pc();
1989       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1990       BLOCK_COMMENT("Entry:");
1991     }
1992 
1993     generate_disjoint_int_copy_core(aligned);
1994 
1995     // O3, O4 are used as temp registers
1996     inc_counter_np(SharedRuntime::_jint_array_copy_ctr, O3, O4);
1997     __ retl();
1998     __ delayed()->mov(G0, O0); // return 0
1999     return start;
2000   }
2001 
2002   //
2003   //  Generate core code for conjoint int copy (and oop copy with compressed oops).
2004   //  If "aligned" is true, the "from" and "to" addresses are assumed
2005   //  to be heapword aligned.
2006   //
2007   // Arguments:
2008   //      from:  O0
2009   //      to:    O1
2010   //      count: O2 treated as signed
2011   //
2012   void generate_conjoint_int_copy_core(bool aligned) {
2013     // Do reverse copy.
2014 
2015     Label L_skip_alignment, L_aligned_copy;
2016     Label L_copy_16_bytes,  L_copy_4_bytes, L_copy_4_bytes_loop, L_exit;
2017 
2018     const Register from      = O0;   // source array address
2019     const Register to        = O1;   // destination array address
2020     const Register count     = O2;   // elements count
2021     const Register end_from  = from; // source array end address
2022     const Register end_to    = to;   // destination array end address
2023     // O3, O4, O5, G3 are used as temp registers
2024 
2025     const Register byte_count = O3;  // bytes count to copy
2026 
2027       __ sllx(count, LogBytesPerInt, byte_count);
2028       __ add(to, byte_count, end_to); // offset after last copied element
2029 
2030       __ cmp(count, 5); // for short arrays, just do single element copy
2031       __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_4_bytes);
2032       __ delayed()->add(from, byte_count, end_from);
2033 
2034     // copy 1 element to align 'to' on an 8 byte boundary
2035       __ andcc(end_to, 7, G0);
2036       __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
2037       __ delayed()->nop();
2038       __ dec(count);
2039       __ dec(end_from, 4);
2040       __ dec(end_to,   4);
2041       __ ld(end_from, 0, O4);
2042       __ st(O4, end_to, 0);
2043     __ BIND(L_skip_alignment);
2044 
2045     // Check if 'end_from' and 'end_to' have the same alignment.
2046       __ andcc(end_from, 7, G0);
2047       __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
2048       __ delayed()->dec(count, 4); // The cmp at the start guarantees cnt >= 4
2049 
2050     // copy with shift 4 elements (16 bytes) at a time
2051     //
2052     // Load 2 aligned 8-bytes chunks and use one from previous iteration
2053     // to form 2 aligned 8-bytes chunks to store.
2054     //
2055       __ ldx(end_from, -4, O3);
2056       __ align(OptoLoopAlignment);
2057     __ BIND(L_copy_16_bytes);
2058       __ ldx(end_from, -12, O4);
2059       __ deccc(count, 4);
2060       __ ldx(end_from, -20, O5);
2061       __ dec(end_to, 16);
2062       __ dec(end_from, 16);
2063       __ srlx(O3, 32, O3);
2064       __ sllx(O4, 32, G3);
2065       __ bset(G3, O3);
2066       __ stx(O3, end_to, 8);
2067       __ srlx(O4, 32, O4);
2068       __ sllx(O5, 32, G3);
2069       __ bset(O4, G3);
2070       __ stx(G3, end_to, 0);
2071       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes);
2072       __ delayed()->mov(O5, O3);
2073 
2074       __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes);
2075       __ delayed()->inc(count, 4);
2076 
2077     // copy 4 elements (16 bytes) at a time
2078       __ align(OptoLoopAlignment);
2079     __ BIND(L_aligned_copy);
2080       __ dec(end_from, 16);
2081       __ ldx(end_from, 8, O3);
2082       __ ldx(end_from, 0, O4);
2083       __ dec(end_to, 16);
2084       __ deccc(count, 4);
2085       __ stx(O3, end_to, 8);
2086       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
2087       __ delayed()->stx(O4, end_to, 0);
2088       __ inc(count, 4);
2089 
2090     // copy 1 element (4 bytes) at a time
2091     __ BIND(L_copy_4_bytes);
2092       __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
2093     __ BIND(L_copy_4_bytes_loop);
2094       __ dec(end_from, 4);
2095       __ dec(end_to, 4);
2096       __ ld(end_from, 0, O4);
2097       __ deccc(count);
2098       __ brx(Assembler::greater, false, Assembler::pt, L_copy_4_bytes_loop);
2099       __ delayed()->st(O4, end_to, 0);
2100     __ BIND(L_exit);
2101   }
2102 
2103   //
2104   //  Generate stub for conjoint int copy.  If "aligned" is true, the
2105   //  "from" and "to" addresses are assumed to be heapword aligned.
2106   //
2107   // Arguments for generated stub:
2108   //      from:  O0
2109   //      to:    O1
2110   //      count: O2 treated as signed
2111   //
2112   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
2113                                      address *entry, const char *name) {
2114     __ align(CodeEntryAlignment);
2115     StubCodeMark mark(this, "StubRoutines", name);
2116     address start = __ pc();
2117 
2118     assert_clean_int(O2, O3);     // Make sure 'count' is clean int.
2119 
2120     if (entry != NULL) {
2121       *entry = __ pc();
2122       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2123       BLOCK_COMMENT("Entry:");
2124     }
2125 
2126     array_overlap_test(nooverlap_target, 2);
2127 
2128     generate_conjoint_int_copy_core(aligned);
2129 
2130     // O3, O4 are used as temp registers
2131     inc_counter_np(SharedRuntime::_jint_array_copy_ctr, O3, O4);
2132     __ retl();
2133     __ delayed()->mov(G0, O0); // return 0
2134     return start;
2135   }
2136 
2137   //
2138   // Helper methods for generate_disjoint_long_copy_core()
2139   //
2140   void copy_64_bytes_loop(Register from, Register to, Register count, int count_dec,
2141                           Label& L_loop, bool use_prefetch, bool use_bis) {
2142     __ align(OptoLoopAlignment);
2143     __ BIND(L_loop);
2144     for (int off = 0; off < 64; off += 16) {
2145       if (use_prefetch && (off & 31) == 0) {
2146         if (ArraycopySrcPrefetchDistance > 0) {
2147           __ prefetch(from, ArraycopySrcPrefetchDistance+off, Assembler::severalReads);
2148         }
2149         if (ArraycopyDstPrefetchDistance > 0) {
2150           __ prefetch(to, ArraycopyDstPrefetchDistance+off, Assembler::severalWritesAndPossiblyReads);
2151         }
2152       }
2153       __ ldx(from,  off+0, O4);
2154       __ ldx(from,  off+8, O5);
2155       if (use_bis) {
2156         __ stxa(O4, to,  off+0);
2157         __ stxa(O5, to,  off+8);
2158       } else {
2159         __ stx(O4, to,  off+0);
2160         __ stx(O5, to,  off+8);
2161       }
2162     }
2163     __ deccc(count, 8);
2164     __ inc(from, 64);
2165     __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
2166     __ delayed()->inc(to, 64);
2167   }
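       // When 'use_bis' is set the stores go through stxa so they can be issued
       // as block-initializing (BIS) stores, which avoid reading the destination
       // cache lines before overwriting them; this helps large copies whose
       // destination is not already in the cache.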
2168 
2169   //
2170   //  Generate core code for disjoint long copy (and oop copy on 64-bit).
2171   //  "aligned" is ignored, because we must make the stronger
2172   //  assumption that both addresses are always 64-bit aligned.
2173   //
2174   // Arguments:
2175   //      from:  O0
2176   //      to:    O1
2177   //      count: O2 treated as signed
2178   //
2179   // count -= 2;
2180   // if ( count >= 0 ) { // >= 2 elements
2181   //   if ( count > 6) { // >= 8 elements
2182   //     count -= 6; // original count - 8
2183   //     do {
2184   //       copy_8_elements;
2185   //       count -= 8;
2186   //     } while ( count >= 0 );
2187   //     count += 6;
2188   //   }
2189   //   if ( count >= 0 ) { // >= 2 elements
2190   //     do {
2191   //       copy_2_elements;
2192   //     } while ( (count=count-2) >= 0 );
2193   //   }
2194   // }
2195   // count += 2;
2196   // if ( count != 0 ) { // 1 element left
2197   //   copy_1_element;
2198   // }
2199   //
2200   void generate_disjoint_long_copy_core(bool aligned) {
2201     Label L_copy_8_bytes, L_copy_16_bytes, L_exit;
2202     const Register from    = O0;  // source array address
2203     const Register to      = O1;  // destination array address
2204     const Register count   = O2;  // elements count
2205     const Register offset0 = O4;  // element offset
2206     const Register offset8 = O5;  // next element offset
2207 
2208     __ deccc(count, 2);
2209     __ mov(G0, offset0);   // offset from start of arrays (0)
2210     __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
2211     __ delayed()->add(offset0, 8, offset8);
2212 
2213     // Copy in 64 byte chunks
2214 
2215     const Register from64 = O3;  // source address
2216     const Register to64   = G3;  // destination address
2217     __ subcc(count, 6, O3);
2218     __ brx(Assembler::negative, false, Assembler::pt, L_copy_16_bytes );
2219     __ delayed()->mov(to,   to64);
2220     // Now we can use O4(offset0), O5(offset8) as temps
2221     __ mov(O3, count);
2222     // count >= 0 (original count - 8)
2223     __ mov(from, from64);
2224 
2225     disjoint_copy_core(from64, to64, count, 3, 64, &StubGenerator::copy_64_bytes_loop);
2226 
2227       // Restore O4(offset0), O5(offset8)
2228       __ sub(from64, from, offset0);
2229       __ inccc(count, 6); // restore count
2230       __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
2231       __ delayed()->add(offset0, 8, offset8);
2232 
2233       // Copy in 16 byte chunks
2234       __ align(OptoLoopAlignment);
2235     __ BIND(L_copy_16_bytes);
2236       __ ldx(from, offset0, O3);
2237       __ ldx(from, offset8, G3);
2238       __ deccc(count, 2);
2239       __ stx(O3, to, offset0);
2240       __ inc(offset0, 16);
2241       __ stx(G3, to, offset8);
2242       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes);
2243       __ delayed()->inc(offset8, 16);
2244 
2245       // Copy last 8 bytes
2246     __ BIND(L_copy_8_bytes);
2247       __ inccc(count, 2);
2248       __ brx(Assembler::zero, true, Assembler::pn, L_exit );
2249       __ delayed()->mov(offset0, offset8); // Set O5 used by other stubs
2250       __ ldx(from, offset0, O3);
2251       __ stx(O3, to, offset0);
2252     __ BIND(L_exit);
2253   }
2254 
2255   //
2256   //  Generate stub for disjoint long copy.
2257   //  "aligned" is ignored, because we must make the stronger
2258   //  assumption that both addresses are always 64-bit aligned.
2259   //
2260   // Arguments for generated stub:
2261   //      from:  O0
2262   //      to:    O1
2263   //      count: O2 treated as signed
2264   //
2265   address generate_disjoint_long_copy(bool aligned, address *entry, const char *name) {
2266     __ align(CodeEntryAlignment);
2267     StubCodeMark mark(this, "StubRoutines", name);
2268     address start = __ pc();
2269 
2270     assert_clean_int(O2, O3);     // Make sure 'count' is clean int.
2271 
2272     if (entry != NULL) {
2273       *entry = __ pc();
2274       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2275       BLOCK_COMMENT("Entry:");
2276     }
2277 
2278     generate_disjoint_long_copy_core(aligned);
2279 
2280     // O3, O4 are used as temp registers
2281     inc_counter_np(SharedRuntime::_jlong_array_copy_ctr, O3, O4);
2282     __ retl();
2283     __ delayed()->mov(G0, O0); // return 0
2284     return start;
2285   }
2286 
2287   //
2288   //  Generate core code for conjoint long copy (and oop copy on 64-bit).
2289   //  "aligned" is ignored, because we must make the stronger
2290   //  assumption that both addresses are always 64-bit aligned.
2291   //
2292   // Arguments:
2293   //      from:  O0
2294   //      to:    O1
2295   //      count: O2 treated as signed
2296   //
2297   void generate_conjoint_long_copy_core(bool aligned) {
2298     // Do reverse copy.
2299     Label L_copy_8_bytes, L_copy_16_bytes, L_exit;
2300     const Register from    = O0;  // source array address
2301     const Register to      = O1;  // destination array address
2302     const Register count   = O2;  // elements count
2303     const Register offset8 = O4;  // element offset
2304     const Register offset0 = O5;  // previous element offset
2305 
2306       __ subcc(count, 1, count);
2307       __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_8_bytes );
2308       __ delayed()->sllx(count, LogBytesPerLong, offset8);
2309       __ sub(offset8, 8, offset0);
2310       __ align(OptoLoopAlignment);
2311     __ BIND(L_copy_16_bytes);
2312       __ ldx(from, offset8, O2);
2313       __ ldx(from, offset0, O3);
2314       __ stx(O2, to, offset8);
2315       __ deccc(offset8, 16);      // use offset8 as counter
2316       __ stx(O3, to, offset0);
2317       __ brx(Assembler::greater, false, Assembler::pt, L_copy_16_bytes);
2318       __ delayed()->dec(offset0, 16);
2319 
2320     __ BIND(L_copy_8_bytes);
2321       __ brx(Assembler::negative, false, Assembler::pn, L_exit );
2322       __ delayed()->nop();
2323       __ ldx(from, 0, O3);
2324       __ stx(O3, to, 0);
2325     __ BIND(L_exit);
2326   }
2327 
2328   //  Generate stub for conjoint long copy.
2329   //  "aligned" is ignored, because we must make the stronger
2330   //  assumption that both addresses are always 64-bit aligned.
2331   //
2332   // Arguments for generated stub:
2333   //      from:  O0
2334   //      to:    O1
2335   //      count: O2 treated as signed
2336   //
2337   address generate_conjoint_long_copy(bool aligned, address nooverlap_target,
2338                                       address *entry, const char *name) {
2339     __ align(CodeEntryAlignment);
2340     StubCodeMark mark(this, "StubRoutines", name);
2341     address start = __ pc();
2342 
2343     assert(aligned, "Should always be aligned");
2344 
2345     assert_clean_int(O2, O3);     // Make sure 'count' is clean int.
2346 
2347     if (entry != NULL) {
2348       *entry = __ pc();
2349       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2350       BLOCK_COMMENT("Entry:");
2351     }
2352 
2353     array_overlap_test(nooverlap_target, 3);
2354 
2355     generate_conjoint_long_copy_core(aligned);
2356 
2357     // O3, O4 are used as temp registers
2358     inc_counter_np(SharedRuntime::_jlong_array_copy_ctr, O3, O4);
2359     __ retl();
2360     __ delayed()->mov(G0, O0); // return 0
2361     return start;
2362   }
2363 
2364   //  Generate stub for disjoint oop copy.  If "aligned" is true, the
2365   //  "from" and "to" addresses are assumed to be heapword aligned.
2366   //
2367   // Arguments for generated stub:
2368   //      from:  O0
2369   //      to:    O1
2370   //      count: O2 treated as signed
2371   //
2372   address generate_disjoint_oop_copy(bool aligned, address *entry, const char *name,
2373                                      bool dest_uninitialized = false) {
2374 
2375     const Register from  = O0;  // source array address
2376     const Register to    = O1;  // destination array address
2377     const Register count = O2;  // elements count
2378 
2379     __ align(CodeEntryAlignment);
2380     StubCodeMark mark(this, "StubRoutines", name);
2381     address start = __ pc();
2382 
2383     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
2384 
2385     if (entry != NULL) {
2386       *entry = __ pc();
2387       // caller can pass a 64-bit byte count here
2388       BLOCK_COMMENT("Entry:");
2389     }
2390 
2391     // save arguments for barrier generation
2392     __ mov(to, G1);
2393     __ mov(count, G5);
2394     gen_write_ref_array_pre_barrier(G1, G5, dest_uninitialized);
2395     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
2396     if (UseCompressedOops) {
2397       generate_disjoint_int_copy_core(aligned);
2398     } else {
2399       generate_disjoint_long_copy_core(aligned);
2400     }
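         // With UseCompressedOops each heap oop is a 4-byte narrow oop, so the
         // int copy core is reused; otherwise oops are full 8-byte words and the
         // long copy core is used.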
2401     // O0 is used as temp register
2402     gen_write_ref_array_post_barrier(G1, G5, O0);
2403 
2404     // O3, O4 are used as temp registers
2405     inc_counter_np(SharedRuntime::_oop_array_copy_ctr, O3, O4);
2406     __ retl();
2407     __ delayed()->mov(G0, O0); // return 0
2408     return start;
2409   }
2410 
2411   //  Generate stub for conjoint oop copy.  If "aligned" is true, the
2412   //  "from" and "to" addresses are assumed to be heapword aligned.
2413   //
2414   // Arguments for generated stub:
2415   //      from:  O0
2416   //      to:    O1
2417   //      count: O2 treated as signed
2418   //
2419   address generate_conjoint_oop_copy(bool aligned, address nooverlap_target,
2420                                      address *entry, const char *name,
2421                                      bool dest_uninitialized = false) {
2422 
2423     const Register from  = O0;  // source array address
2424     const Register to    = O1;  // destination array address
2425     const Register count = O2;  // elements count
2426 
2427     __ align(CodeEntryAlignment);
2428     StubCodeMark mark(this, "StubRoutines", name);
2429     address start = __ pc();
2430 
2431     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
2432 
2433     if (entry != NULL) {
2434       *entry = __ pc();
2435       // caller can pass a 64-bit byte count here
2436       BLOCK_COMMENT("Entry:");
2437     }
2438 
2439     array_overlap_test(nooverlap_target, LogBytesPerHeapOop);
2440 
2441     // save arguments for barrier generation
2442     __ mov(to, G1);
2443     __ mov(count, G5);
2444     gen_write_ref_array_pre_barrier(G1, G5, dest_uninitialized);
2445 
2446     if (UseCompressedOops) {
2447       generate_conjoint_int_copy_core(aligned);
2448     } else {
2449       generate_conjoint_long_copy_core(aligned);
2450     }
2451 
2452     // O0 is used as temp register
2453     gen_write_ref_array_post_barrier(G1, G5, O0);
2454 
2455     // O3, O4 are used as temp registers
2456     inc_counter_np(SharedRuntime::_oop_array_copy_ctr, O3, O4);
2457     __ retl();
2458     __ delayed()->mov(G0, O0); // return 0
2459     return start;
2460   }
2461 
2462 
2463   // Helper for generating a dynamic type check.
2464   // Smashes only the given temp registers.
2465   void generate_type_check(Register sub_klass,
2466                            Register super_check_offset,
2467                            Register super_klass,
2468                            Register temp,
2469                            Label& L_success) {
2470     assert_different_registers(sub_klass, super_check_offset, super_klass, temp);
2471 
2472     BLOCK_COMMENT("type_check:");
2473 
2474     Label L_miss, L_pop_to_miss;
2475 
2476     assert_clean_int(super_check_offset, temp);
2477 
2478     __ check_klass_subtype_fast_path(sub_klass, super_klass, temp, noreg,
2479                                      &L_success, &L_miss, NULL,
2480                                      super_check_offset);
2481 
2482     BLOCK_COMMENT("type_check_slow_path:");
2483     __ save_frame(0);
2484     __ check_klass_subtype_slow_path(sub_klass->after_save(),
2485                                      super_klass->after_save(),
2486                                      L0, L1, L2, L4,
2487                                      NULL, &L_pop_to_miss);
2488     __ ba(L_success);
2489     __ delayed()->restore();
2490 
2491     __ bind(L_pop_to_miss);
2492     __ restore();
2493 
2494     // Fall through on failure!
2495     __ BIND(L_miss);
2496   }
2497 
2498 
2499   //  Generate stub for checked oop copy.
2500   //
2501   // Arguments for generated stub:
2502   //      from:  O0
2503   //      to:    O1
2504   //      count: O2 treated as signed
2505   //      ckoff: O3 (super_check_offset)
2506   //      ckval: O4 (super_klass)
2507   //      ret:   O0 zero for success; (-1^K) where K is partial transfer count
2508   //
2509   address generate_checkcast_copy(const char *name, address *entry, bool dest_uninitialized = false) {
2510 
2511     const Register O0_from   = O0;      // source array address
2512     const Register O1_to     = O1;      // destination array address
2513     const Register O2_count  = O2;      // elements count
2514     const Register O3_ckoff  = O3;      // super_check_offset
2515     const Register O4_ckval  = O4;      // super_klass
2516 
2517     const Register O5_offset = O5;      // loop var, with stride wordSize
2518     const Register G1_remain = G1;      // loop var, with stride -1
2519     const Register G3_oop    = G3;      // actual oop copied
2520     const Register G4_klass  = G4;      // oop._klass
2521     const Register G5_super  = G5;      // oop._klass._primary_supers[ckval]
2522 
2523     __ align(CodeEntryAlignment);
2524     StubCodeMark mark(this, "StubRoutines", name);
2525     address start = __ pc();
2526 
2527 #ifdef ASSERT
2528     // We sometimes save a frame (see generate_type_check below).
2529     // If this will cause trouble, let's fail now instead of later.
2530     __ save_frame(0);
2531     __ restore();
2532 #endif
2533 
2534     assert_clean_int(O2_count, G1);     // Make sure 'count' is clean int.
2535 
2536 #ifdef ASSERT
2537     // caller guarantees that the arrays really are different
2538     // otherwise, we would have to make conjoint checks
2539     { Label L;
2540       __ mov(O3, G1);           // spill: overlap test smashes O3
2541       __ mov(O4, G4);           // spill: overlap test smashes O4
2542       array_overlap_test(L, LogBytesPerHeapOop);
2543       __ stop("checkcast_copy within a single array");
2544       __ bind(L);
2545       __ mov(G1, O3);
2546       __ mov(G4, O4);
2547     }
2548 #endif //ASSERT
2549 
2550     if (entry != NULL) {
2551       *entry = __ pc();
2552       // caller can pass a 64-bit byte count here (from generic stub)
2553       BLOCK_COMMENT("Entry:");
2554     }
2555     gen_write_ref_array_pre_barrier(O1_to, O2_count, dest_uninitialized);
2556 
2557     Label load_element, store_element, do_card_marks, fail, done;
2558     __ addcc(O2_count, 0, G1_remain);   // initialize loop index, and test it
2559     __ brx(Assembler::notZero, false, Assembler::pt, load_element);
2560     __ delayed()->mov(G0, O5_offset);   // offset from start of arrays
2561 
2562     // Empty array:  Nothing to do.
2563     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, O3, O4);
2564     __ retl();
2565     __ delayed()->set(0, O0);           // return 0 on (trivial) success
2566 
2567     // ======== begin loop ========
2568     // (Loop is rotated; its entry is load_element.)
2569     // Loop variables:
2570     //   (O5 = 0; ; O5 += wordSize) --- offset from src, dest arrays
2571     //   (O2 = len; O2 != 0; O2--) --- number of oops *remaining*
2572     //   G3, G4, G5 --- current oop, oop.klass, oop.klass.super
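         // The rotation places the store/decrement half of the loop first; the
         // loop is entered at load_element, a successful subtype check branches
         // back up to store_element, and a failed check falls through to the
         // error handling after the loop.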
2573     __ align(OptoLoopAlignment);
2574 
2575     __ BIND(store_element);
2576     __ deccc(G1_remain);                // decrement the count
2577     __ store_heap_oop(G3_oop, O1_to, O5_offset); // store the oop
2578     __ inc(O5_offset, heapOopSize);     // step to next offset
2579     __ brx(Assembler::zero, true, Assembler::pt, do_card_marks);
2580     __ delayed()->set(0, O0);           // return 0 on success
2581 
2582     // ======== loop entry is here ========
2583     __ BIND(load_element);
2584     __ load_heap_oop(O0_from, O5_offset, G3_oop);  // load the oop
2585     __ br_null_short(G3_oop, Assembler::pt, store_element);
2586 
2587     __ load_klass(G3_oop, G4_klass); // query the object klass
2588 
2589     generate_type_check(G4_klass, O3_ckoff, O4_ckval, G5_super,
2590                         // branch to this on success:
2591                         store_element);
2592     // ======== end loop ========
2593 
2594     // It was a real error; we must depend on the caller to finish the job.
2595     // Register G1 has number of *remaining* oops, O2 number of *total* oops.
2596     // Emit GC store barriers for the oops we have copied (O2 minus G1),
2597     // and report their number to the caller.
2598     __ BIND(fail);
2599     __ subcc(O2_count, G1_remain, O2_count);
2600     __ brx(Assembler::zero, false, Assembler::pt, done);
2601     __ delayed()->not1(O2_count, O0);   // report (-1^K) to caller
2602 
2603     __ BIND(do_card_marks);
2604     gen_write_ref_array_post_barrier(O1_to, O2_count, O3);   // store check on O1[0..O2]
2605 
2606     __ BIND(done);
2607     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, O3, O4);
2608     __ retl();
2609     __ delayed()->nop();             // return value in O0
2610 
2611     return start;
2612   }
2613 
2614 
2615   //  Generate 'unsafe' array copy stub
2616   //  Though just as safe as the other stubs, it takes an unscaled
2617   //  size_t argument instead of an element count.
2618   //
2619   // Arguments for generated stub:
2620   //      from:  O0
2621   //      to:    O1
2622   //      count: O2 byte count, treated as ssize_t, can be zero
2623   //
2624   // Examines the alignment of the operands and dispatches
2625   // to a long, int, short, or byte copy loop.
2626   //
2627   address generate_unsafe_copy(const char* name,
2628                                address byte_copy_entry,
2629                                address short_copy_entry,
2630                                address int_copy_entry,
2631                                address long_copy_entry) {
2632 
2633     const Register O0_from   = O0;      // source array address
2634     const Register O1_to     = O1;      // destination array address
2635     const Register O2_count  = O2;      // elements count
2636 
2637     const Register G1_bits   = G1;      // test copy of low bits
2638 
2639     __ align(CodeEntryAlignment);
2640     StubCodeMark mark(this, "StubRoutines", name);
2641     address start = __ pc();
2642 
2643     // bump this on entry, not on exit:
2644     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr, G1, G3);
2645 
2646     __ or3(O0_from, O1_to, G1_bits);
2647     __ or3(O2_count,       G1_bits, G1_bits);
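         // OR-ing the source address, destination address and byte count together
         // lets a single btst per size class check the alignment of all three at
         // once: a low bit is clear in G1_bits only if it is clear in every operand.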
2648 
2649     __ btst(BytesPerLong-1, G1_bits);
2650     __ br(Assembler::zero, true, Assembler::pt,
2651           long_copy_entry, relocInfo::runtime_call_type);
2652     // scale the count on the way out:
2653     __ delayed()->srax(O2_count, LogBytesPerLong, O2_count);
2654 
2655     __ btst(BytesPerInt-1, G1_bits);
2656     __ br(Assembler::zero, true, Assembler::pt,
2657           int_copy_entry, relocInfo::runtime_call_type);
2658     // scale the count on the way out:
2659     __ delayed()->srax(O2_count, LogBytesPerInt, O2_count);
2660 
2661     __ btst(BytesPerShort-1, G1_bits);
2662     __ br(Assembler::zero, true, Assembler::pt,
2663           short_copy_entry, relocInfo::runtime_call_type);
2664     // scale the count on the way out:
2665     __ delayed()->srax(O2_count, LogBytesPerShort, O2_count);
2666 
2667     __ br(Assembler::always, false, Assembler::pt,
2668           byte_copy_entry, relocInfo::runtime_call_type);
2669     __ delayed()->nop();
2670 
2671     return start;
2672   }
2673 
2674 
2675   // Perform range checks on the proposed arraycopy.
2676   // Kills the two temps, but nothing else.
2677   // Also, clean the sign bits of src_pos and dst_pos.
2678   void arraycopy_range_checks(Register src,     // source array oop (O0)
2679                               Register src_pos, // source position (O1)
2680                               Register dst,     // destination array oop (O2)
2681                               Register dst_pos, // destination position (O3)
2682                               Register length,  // length of copy (O4)
2683                               Register temp1, Register temp2,
2684                               Label& L_failed) {
2685     BLOCK_COMMENT("arraycopy_range_checks:");
2686 
2687     //  if (src_pos + length > arrayOop(src)->length() ) FAIL;
2688 
2689     const Register array_length = temp1;  // scratch
2690     const Register end_pos      = temp2;  // scratch
2691 
2692     // Note:  This next instruction may be in the delay slot of a branch:
2693     __ add(length, src_pos, end_pos);  // src_pos + length
2694     __ lduw(src, arrayOopDesc::length_offset_in_bytes(), array_length);
2695     __ cmp(end_pos, array_length);
2696     __ br(Assembler::greater, false, Assembler::pn, L_failed);
2697 
2698     //  if (dst_pos + length > arrayOop(dst)->length() ) FAIL;
2699     __ delayed()->add(length, dst_pos, end_pos); // dst_pos + length
2700     __ lduw(dst, arrayOopDesc::length_offset_in_bytes(), array_length);
2701     __ cmp(end_pos, array_length);
2702     __ br(Assembler::greater, false, Assembler::pn, L_failed);
2703 
2704     // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
2705     // A move with sign extension can be used since they are known to be positive.
2706     __ delayed()->signx(src_pos, src_pos);
2707     __ signx(dst_pos, dst_pos);
2708 
2709     BLOCK_COMMENT("arraycopy_range_checks done");
2710   }
2711 
2712 
2713   //
2714   //  Generate generic array copy stubs
2715   //
2716   //  Input:
2717   //    O0    -  src oop
2718   //    O1    -  src_pos
2719   //    O2    -  dst oop
2720   //    O3    -  dst_pos
2721   //    O4    -  element count
2722   //
2723   //  Output:
2724   //    O0 ==  0  -  success
2725   //    O0 == -1  -  need to call System.arraycopy
2726   //
2727   address generate_generic_copy(const char *name,
2728                                 address entry_jbyte_arraycopy,
2729                                 address entry_jshort_arraycopy,
2730                                 address entry_jint_arraycopy,
2731                                 address entry_oop_arraycopy,
2732                                 address entry_jlong_arraycopy,
2733                                 address entry_checkcast_arraycopy) {
2734     Label L_failed, L_objArray;
2735 
2736     // Input registers
2737     const Register src      = O0;  // source array oop
2738     const Register src_pos  = O1;  // source position
2739     const Register dst      = O2;  // destination array oop
2740     const Register dst_pos  = O3;  // destination position
2741     const Register length   = O4;  // elements count
2742 
2743     // registers used as temp
2744     const Register G3_src_klass = G3; // source array klass
2745     const Register G4_dst_klass = G4; // destination array klass
2746     const Register G5_lh        = G5; // layout helper
2747     const Register O5_temp      = O5;
2748 
2749     __ align(CodeEntryAlignment);
2750     StubCodeMark mark(this, "StubRoutines", name);
2751     address start = __ pc();
2752 
2753     // bump this on entry, not on exit:
2754     inc_counter_np(SharedRuntime::_generic_array_copy_ctr, G1, G3);
2755 
2756     // In principle, the int arguments could be dirty.
2757     //assert_clean_int(src_pos, G1);
2758     //assert_clean_int(dst_pos, G1);
2759     //assert_clean_int(length, G1);
2760 
2761     //-----------------------------------------------------------------------
2762     // Assembler stubs will be used for this call to arraycopy
2763     // if the following conditions are met:
2764     //
2765     // (1) src and dst must not be null.
2766     // (2) src_pos must not be negative.
2767     // (3) dst_pos must not be negative.
2768     // (4) length  must not be negative.
2769     // (5) src klass and dst klass should be the same and not NULL.
2770     // (6) src and dst should be arrays.
2771     // (7) src_pos + length must not exceed length of src.
2772     // (8) dst_pos + length must not exceed length of dst.
2773     BLOCK_COMMENT("arraycopy initial argument checks");
2774 
2775     //  if (src == NULL) return -1;
2776     __ br_null(src, false, Assembler::pn, L_failed);
2777 
2778     //  if (src_pos < 0) return -1;
2779     __ delayed()->tst(src_pos);
2780     __ br(Assembler::negative, false, Assembler::pn, L_failed);
2781     __ delayed()->nop();
2782 
2783     //  if (dst == NULL) return -1;
2784     __ br_null(dst, false, Assembler::pn, L_failed);
2785 
2786     //  if (dst_pos < 0) return -1;
2787     __ delayed()->tst(dst_pos);
2788     __ br(Assembler::negative, false, Assembler::pn, L_failed);
2789 
2790     //  if (length < 0) return -1;
2791     __ delayed()->tst(length);
2792     __ br(Assembler::negative, false, Assembler::pn, L_failed);
2793 
2794     BLOCK_COMMENT("arraycopy argument klass checks");
2795     //  get src->klass()
2796     if (UseCompressedClassPointers) {
2797       __ delayed()->nop(); // ??? not good
2798       __ load_klass(src, G3_src_klass);
2799     } else {
2800       __ delayed()->ld_ptr(src, oopDesc::klass_offset_in_bytes(), G3_src_klass);
2801     }
2802 
2803 #ifdef ASSERT
2804     //  assert(src->klass() != NULL);
2805     BLOCK_COMMENT("assert klasses not null");
2806     { Label L_a, L_b;
2807       __ br_notnull_short(G3_src_klass, Assembler::pt, L_b); // it is broken if klass is NULL
2808       __ bind(L_a);
2809       __ stop("broken null klass");
2810       __ bind(L_b);
2811       __ load_klass(dst, G4_dst_klass);
2812       __ br_null(G4_dst_klass, false, Assembler::pn, L_a); // this would be broken also
2813       __ delayed()->mov(G0, G4_dst_klass);      // scribble the temp
2814       BLOCK_COMMENT("assert done");
2815     }
2816 #endif
2817 
2818     // Load layout helper
2819     //
2820     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2821     // 32        30    24            16              8     2                 0
2822     //
2823     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2824     //
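         // For illustration, the code below recovers the fields as
         //   header_size       = (lh >> _lh_header_size_shift) & _lh_header_size_mask
         //   log2_element_size =  lh & _lh_log2_element_size_mask
         // and compares the whole word against array_layout_helper(T_OBJECT) to
         // recognize object arrays.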
2825 
2826     int lh_offset = in_bytes(Klass::layout_helper_offset());
2827 
2828     // Load the 32-bit signed value. Use the br() instruction with it to check icc.
2829     __ lduw(G3_src_klass, lh_offset, G5_lh);
2830 
2831     if (UseCompressedClassPointers) {
2832       __ load_klass(dst, G4_dst_klass);
2833     }
2834     // Handle objArrays completely differently...
2835     juint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2836     __ set(objArray_lh, O5_temp);
2837     __ cmp(G5_lh,       O5_temp);
2838     __ br(Assembler::equal, false, Assembler::pt, L_objArray);
2839     if (UseCompressedClassPointers) {
2840       __ delayed()->nop();
2841     } else {
2842       __ delayed()->ld_ptr(dst, oopDesc::klass_offset_in_bytes(), G4_dst_klass);
2843     }
2844 
2845     //  if (src->klass() != dst->klass()) return -1;
2846     __ cmp_and_brx_short(G3_src_klass, G4_dst_klass, Assembler::notEqual, Assembler::pn, L_failed);
2847 
2848     //  if (!src->is_Array()) return -1;
2849     __ cmp(G5_lh, Klass::_lh_neutral_value); // < 0
2850     __ br(Assembler::greaterEqual, false, Assembler::pn, L_failed);
2851 
2852     // At this point, it is known to be a typeArray (array_tag 0x3).
2853 #ifdef ASSERT
2854     __ delayed()->nop();
2855     { Label L;
2856       jint lh_prim_tag_in_place = (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2857       __ set(lh_prim_tag_in_place, O5_temp);
2858       __ cmp(G5_lh,                O5_temp);
2859       __ br(Assembler::greaterEqual, false, Assembler::pt, L);
2860       __ delayed()->nop();
2861       __ stop("must be a primitive array");
2862       __ bind(L);
2863     }
2864 #else
2865     __ delayed();                               // match next insn to prev branch
2866 #endif
2867 
2868     arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
2869                            O5_temp, G4_dst_klass, L_failed);
2870 
2871     // TypeArrayKlass
2872     //
2873     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2874     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
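     //
     // e.g. when copying jints (log2elemsize == 2) with src_pos == 3:
     //   src_addr = src + array_header_in_bytes() + (3 << 2) = src + header + 12 bytes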
2875     //
2876 
2877     const Register G4_offset = G4_dst_klass;    // array offset
2878     const Register G3_elsize = G3_src_klass;    // log2 element size
2879 
2880     __ srl(G5_lh, Klass::_lh_header_size_shift, G4_offset);
2881     __ and3(G4_offset, Klass::_lh_header_size_mask, G4_offset); // array_offset
2882     __ add(src, G4_offset, src);       // src array offset
2883     __ add(dst, G4_offset, dst);       // dst array offset
2884     __ and3(G5_lh, Klass::_lh_log2_element_size_mask, G3_elsize); // log2 element size
2885 
2886     // next registers should be set before the jump to corresponding stub
2887     const Register from     = O0;  // source array address
2888     const Register to       = O1;  // destination array address
2889     const Register count    = O2;  // elements count
2890 
2891     // 'from', 'to', 'count' registers should be set in this order
2892     // since they occupy the same registers as 'src', 'src_pos' and 'dst'.
2893 
2894     BLOCK_COMMENT("scale indexes to element size");
2895     __ sll_ptr(src_pos, G3_elsize, src_pos);
2896     __ sll_ptr(dst_pos, G3_elsize, dst_pos);
2897     __ add(src, src_pos, from);       // src_addr
2898     __ add(dst, dst_pos, to);         // dst_addr
2899 
2900     BLOCK_COMMENT("choose copy loop based on element size");
2901     __ cmp(G3_elsize, 0);
2902     __ br(Assembler::equal, true, Assembler::pt, entry_jbyte_arraycopy);
2903     __ delayed()->signx(length, count); // length
2904 
2905     __ cmp(G3_elsize, LogBytesPerShort);
2906     __ br(Assembler::equal, true, Assembler::pt, entry_jshort_arraycopy);
2907     __ delayed()->signx(length, count); // length
2908 
2909     __ cmp(G3_elsize, LogBytesPerInt);
2910     __ br(Assembler::equal, true, Assembler::pt, entry_jint_arraycopy);
2911     __ delayed()->signx(length, count); // length
2912 #ifdef ASSERT
2913     { Label L;
2914       __ cmp_and_br_short(G3_elsize, LogBytesPerLong, Assembler::equal, Assembler::pt, L);
2915       __ stop("must be long copy, but elsize is wrong");
2916       __ bind(L);
2917     }
2918 #endif
2919     __ br(Assembler::always, false, Assembler::pt, entry_jlong_arraycopy);
2920     __ delayed()->signx(length, count); // length
2921 
2922     // ObjArrayKlass
2923   __ BIND(L_objArray);
2924     // live at this point:  G3_src_klass, G4_dst_klass, src[_pos], dst[_pos], length
2925 
2926     Label L_plain_copy, L_checkcast_copy;
2927     //  test array classes for subtyping
2928     __ cmp(G3_src_klass, G4_dst_klass);         // usual case is exact equality
2929     __ brx(Assembler::notEqual, true, Assembler::pn, L_checkcast_copy);
2930     __ delayed()->lduw(G4_dst_klass, lh_offset, O5_temp); // hoisted from below
2931 
2932     // Identically typed arrays can be copied without element-wise checks.
2933     arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
2934                            O5_temp, G5_lh, L_failed);
2935 
2936     __ add(src, arrayOopDesc::base_offset_in_bytes(T_OBJECT), src); //src offset
2937     __ add(dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT), dst); //dst offset
2938     __ sll_ptr(src_pos, LogBytesPerHeapOop, src_pos);
2939     __ sll_ptr(dst_pos, LogBytesPerHeapOop, dst_pos);
2940     __ add(src, src_pos, from);       // src_addr
2941     __ add(dst, dst_pos, to);         // dst_addr
2942   __ BIND(L_plain_copy);
2943     __ br(Assembler::always, false, Assembler::pt, entry_oop_arraycopy);
2944     __ delayed()->signx(length, count); // length
2945 
2946   __ BIND(L_checkcast_copy);
2947     // live at this point:  G3_src_klass, G4_dst_klass
2948     {
2949       // Before looking at dst.length, make sure dst is also an objArray.
2950       // lduw(G4_dst_klass, lh_offset, O5_temp); // hoisted to delay slot
2951       __ cmp(G5_lh,                    O5_temp);
2952       __ br(Assembler::notEqual, false, Assembler::pn, L_failed);
2953 
2954       // It is safe to examine both src.length and dst.length.
2955       __ delayed();                             // match next insn to prev branch
2956       arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
2957                              O5_temp, G5_lh, L_failed);
2958 
2959       // Marshal the base address arguments now, freeing registers.
2960       __ add(src, arrayOopDesc::base_offset_in_bytes(T_OBJECT), src); //src offset
2961       __ add(dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT), dst); //dst offset
2962       __ sll_ptr(src_pos, LogBytesPerHeapOop, src_pos);
2963       __ sll_ptr(dst_pos, LogBytesPerHeapOop, dst_pos);
2964       __ add(src, src_pos, from);               // src_addr
2965       __ add(dst, dst_pos, to);                 // dst_addr
2966       __ signx(length, count);                  // length (reloaded)
2967 
2968       Register sco_temp = O3;                   // this register is free now
2969       assert_different_registers(from, to, count, sco_temp,
2970                                  G4_dst_klass, G3_src_klass);
2971 
2972       // Generate the type check.
2973       int sco_offset = in_bytes(Klass::super_check_offset_offset());
2974       __ lduw(G4_dst_klass, sco_offset, sco_temp);
2975       generate_type_check(G3_src_klass, sco_temp, G4_dst_klass,
2976                           O5_temp, L_plain_copy);
2977 
2978       // Fetch destination element klass from the ObjArrayKlass header.
2979       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2980 
2981       // the checkcast_copy loop needs two extra arguments:
2982       __ ld_ptr(G4_dst_klass, ek_offset, O4);   // dest elem klass
2983       // lduw(O4, sco_offset, O3);              // sco of elem klass
2984 
2985       __ br(Assembler::always, false, Assembler::pt, entry_checkcast_arraycopy);
2986       __ delayed()->lduw(O4, sco_offset, O3);
2987     }
2988 
2989   __ BIND(L_failed);
2990     __ retl();
2991     __ delayed()->sub(G0, 1, O0); // return -1
2992     return start;
2993   }
2994 
2995   //
2996   //  Generate stub for heap zeroing.
2997   //  "to" address is aligned to jlong (8 bytes).
2998   //
2999   // Arguments for generated stub:
3000   //      to:    O0
3001   //      count: O1 treated as signed (count of HeapWords)
3002   //             count could be 0
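       //
       // Note: on a 64-bit VM a HeapWord is 8 bytes (LogHeapWordSize == 3), so e.g.
       //       count == 4 is converted to 32 bytes before the BIS zeroing below.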
3003   //
3004   address generate_zero_aligned_words(const char* name) {
3005     __ align(CodeEntryAlignment);
3006     StubCodeMark mark(this, "StubRoutines", name);
3007     address start = __ pc();
3008 
3009     const Register to    = O0;   // destination address (start of area to zero)
3010     const Register count = O1;   // HeapWords count
3011     const Register temp  = O2;   // scratch
3012 
3013     Label Ldone;
3014     __ sllx(count, LogHeapWordSize, count); // to bytes count
3015     // Use BIS for zeroing
3016     __ bis_zeroing(to, count, temp, Ldone);
3017     __ bind(Ldone);
3018     __ retl();
3019     __ delayed()->nop();
3020     return start;
3021   }
3022 
3023   void generate_arraycopy_stubs() {
3024     address entry;
3025     address entry_jbyte_arraycopy;
3026     address entry_jshort_arraycopy;
3027     address entry_jint_arraycopy;
3028     address entry_oop_arraycopy;
3029     address entry_jlong_arraycopy;
3030     address entry_checkcast_arraycopy;
3031 
3032     //*** jbyte
3033     // Always need aligned and unaligned versions
3034     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
3035                                                                                   "jbyte_disjoint_arraycopy");
3036     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
3037                                                                                   &entry_jbyte_arraycopy,
3038                                                                                   "jbyte_arraycopy");
3039     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
3040                                                                                   "arrayof_jbyte_disjoint_arraycopy");
3041     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
3042                                                                                   "arrayof_jbyte_arraycopy");
3043 
3044     //*** jshort
3045     // Always need aligned and unaligned versions
3046     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
3047                                                                                     "jshort_disjoint_arraycopy");
3048     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
3049                                                                                     &entry_jshort_arraycopy,
3050                                                                                     "jshort_arraycopy");
3051     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
3052                                                                                     "arrayof_jshort_disjoint_arraycopy");
3053     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
3054                                                                                     "arrayof_jshort_arraycopy");
3055 
3056     //*** jint
3057     // Aligned versions
3058     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
3059                                                                                 "arrayof_jint_disjoint_arraycopy");
3060     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
3061                                                                                 "arrayof_jint_arraycopy");
3062     // On 64-bit we need both aligned and unaligned versions of jint arraycopy.
3063     // entry_jint_arraycopy always points to the unaligned version (notice that we overwrite it).
3064     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
3065                                                                                 "jint_disjoint_arraycopy");
3066     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
3067                                                                                 &entry_jint_arraycopy,
3068                                                                                 "jint_arraycopy");
3069 
3070     //*** jlong
3071     // It is always aligned
3072     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
3073                                                                                   "arrayof_jlong_disjoint_arraycopy");
3074     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
3075                                                                                   "arrayof_jlong_arraycopy");
3076     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
3077     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
3078 
3079 
3080     //*** oops
3081     // Aligned versions
3082     StubRoutines::_arrayof_oop_disjoint_arraycopy        = generate_disjoint_oop_copy(true, &entry,
3083                                                                                       "arrayof_oop_disjoint_arraycopy");
3084     StubRoutines::_arrayof_oop_arraycopy                 = generate_conjoint_oop_copy(true, entry, &entry_oop_arraycopy,
3085                                                                                       "arrayof_oop_arraycopy");
3086     // Aligned versions without pre-barriers
3087     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(true, &entry,
3088                                                                                       "arrayof_oop_disjoint_arraycopy_uninit",
3089                                                                                       /*dest_uninitialized*/true);
3090     StubRoutines::_arrayof_oop_arraycopy_uninit          = generate_conjoint_oop_copy(true, entry, NULL,
3091                                                                                       "arrayof_oop_arraycopy_uninit",
3092                                                                                       /*dest_uninitialized*/true);
3093     if (UseCompressedOops) {
3094       // With compressed oops we need unaligned versions, notice that we overwrite entry_oop_arraycopy.
3095       StubRoutines::_oop_disjoint_arraycopy            = generate_disjoint_oop_copy(false, &entry,
3096                                                                                     "oop_disjoint_arraycopy");
3097       StubRoutines::_oop_arraycopy                     = generate_conjoint_oop_copy(false, entry, &entry_oop_arraycopy,
3098                                                                                     "oop_arraycopy");
3099       // Unaligned versions without pre-barriers
3100       StubRoutines::_oop_disjoint_arraycopy_uninit     = generate_disjoint_oop_copy(false, &entry,
3101                                                                                     "oop_disjoint_arraycopy_uninit",
3102                                                                                     /*dest_uninitialized*/true);
3103       StubRoutines::_oop_arraycopy_uninit              = generate_conjoint_oop_copy(false, entry, NULL,
3104                                                                                     "oop_arraycopy_uninit",
3105                                                                                     /*dest_uninitialized*/true);
3106     } else {
3107       // oop arraycopy is always aligned, on both 32-bit and 64-bit, when compressed oops are not used
3108       StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
3109       StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
3110       StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
3111       StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
3112     }
3113 
3114     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
3115     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
3116                                                                         /*dest_uninitialized*/true);
3117 
3118     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
3119                                                               entry_jbyte_arraycopy,
3120                                                               entry_jshort_arraycopy,
3121                                                               entry_jint_arraycopy,
3122                                                               entry_jlong_arraycopy);
3123     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
3124                                                                entry_jbyte_arraycopy,
3125                                                                entry_jshort_arraycopy,
3126                                                                entry_jint_arraycopy,
3127                                                                entry_oop_arraycopy,
3128                                                                entry_jlong_arraycopy,
3129                                                                entry_checkcast_arraycopy);
3130 
3131     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
3132     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
3133     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
3134     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
3135     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
3136     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
3137 
3138     if (UseBlockZeroing) {
3139       StubRoutines::_zero_aligned_words = generate_zero_aligned_words("zero_aligned_words");
3140     }
3141   }
3142 
3143   address generate_aescrypt_encryptBlock() {
3144     // required since we read the expanded key 'int' array starting at its first element without further alignment handling
3145     assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
3146            "the following code assumes that first element of an int array is aligned to 8 bytes");
3147     __ align(CodeEntryAlignment);
3148     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
3149     Label L_load_misaligned_input, L_load_expanded_key, L_doLast128bit, L_storeOutput, L_store_misaligned_output;
3150     address start = __ pc();
3151     Register from = O0; // source byte array
3152     Register to = O1;   // destination byte array
3153     Register key = O2;  // expanded key array
3154     const Register keylen = O4; // reg for storing expanded key array length
3155 
3156     // read expanded key length
3157     __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
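     // Note: the expanded key length in ints is 4*(rounds+1): 44 for a 128-bit key,
     // 52 for a 192-bit key and 60 for a 256-bit key; the key-size checks below rely on this.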
3158 
3159     // Method to address arbitrary alignment for load instructions:
3160     // Check last 3 bits of 'from' address to see if it is aligned to 8-byte boundary
3161     // If zero/aligned then continue with double FP load instructions
3162     // If not zero/mis-aligned then alignaddr will set GSR.align with number of bytes to skip during faligndata
3163     // alignaddr will also convert arbitrary aligned 'from' address to nearest 8-byte aligned address
3164     // load 3 * 8-byte components (to read 16 bytes input) in 3 different FP regs starting at this aligned address
3165     // faligndata will then extract (based on GSR.align value) the appropriate 8 bytes from the 2 source regs
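     // For example, if 'from' ends in ...5, alignaddr leaves GSR.align == 5 and rounds 'from'
     // down to the previous 8-byte boundary; the two faligndata results are then bytes 5..12
     // and 13..20 of the aligned stream, i.e. exactly the 16 original input bytes.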
3166 
3167     // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
3168     __ andcc(from, 7, G0);
3169     __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input);
3170     __ delayed()->alignaddr(from, G0, from);
3171 
3172     // aligned case: load input into F54-F56
3173     __ ldf(FloatRegisterImpl::D, from, 0, F54);
3174     __ ldf(FloatRegisterImpl::D, from, 8, F56);
3175     __ ba_short(L_load_expanded_key);
3176 
3177     __ BIND(L_load_misaligned_input);
3178     __ ldf(FloatRegisterImpl::D, from, 0, F54);
3179     __ ldf(FloatRegisterImpl::D, from, 8, F56);
3180     __ ldf(FloatRegisterImpl::D, from, 16, F58);
3181     __ faligndata(F54, F56, F54);
3182     __ faligndata(F56, F58, F56);
3183 
3184     __ BIND(L_load_expanded_key);
3185     // Since we load the expanded key buffer starting at its first element, 8-byte alignment is guaranteed
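     // The loop below loads the first 40 ints of the schedule (the part common to all key
     // sizes) into F0..F38; the remaining round keys are loaded later as needed.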
3186     for ( int i = 0;  i <= 38; i += 2 ) {
3187       __ ldf(FloatRegisterImpl::D, key, i*4, as_FloatRegister(i));
3188     }
3189 
3190     // perform cipher transformation
3191     __ fxor(FloatRegisterImpl::D, F0, F54, F54);
3192     __ fxor(FloatRegisterImpl::D, F2, F56, F56);
3193     // rounds 1 through 8
3194     for ( int i = 4;  i <= 28; i += 8 ) {
3195       __ aes_eround01(as_FloatRegister(i), F54, F56, F58);
3196       __ aes_eround23(as_FloatRegister(i+2), F54, F56, F60);
3197       __ aes_eround01(as_FloatRegister(i+4), F58, F60, F54);
3198       __ aes_eround23(as_FloatRegister(i+6), F58, F60, F56);
3199     }
3200     __ aes_eround01(F36, F54, F56, F58); //round 9
3201     __ aes_eround23(F38, F54, F56, F60);
3202 
3203     // 128-bit original key size
3204     __ cmp_and_brx_short(keylen, 44, Assembler::equal, Assembler::pt, L_doLast128bit);
3205 
3206     for ( int i = 40;  i <= 50; i += 2 ) {
3207       __ ldf(FloatRegisterImpl::D, key, i*4, as_FloatRegister(i) );
3208     }
3209     __ aes_eround01(F40, F58, F60, F54); //round 10
3210     __ aes_eround23(F42, F58, F60, F56);
3211     __ aes_eround01(F44, F54, F56, F58); //round 11
3212     __ aes_eround23(F46, F54, F56, F60);
3213 
3214     // 192-bit original key size
3215     __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pt, L_storeOutput);
3216 
3217     __ ldf(FloatRegisterImpl::D, key, 208, F52);
3218     __ aes_eround01(F48, F58, F60, F54); //round 12
3219     __ aes_eround23(F50, F58, F60, F56);
3220     __ ldf(FloatRegisterImpl::D, key, 216, F46);
3221     __ ldf(FloatRegisterImpl::D, key, 224, F48);
3222     __ ldf(FloatRegisterImpl::D, key, 232, F50);
3223     __ aes_eround01(F52, F54, F56, F58); //round 13
3224     __ aes_eround23(F46, F54, F56, F60);
3225     __ ba_short(L_storeOutput);
3226 
3227     __ BIND(L_doLast128bit);
3228     __ ldf(FloatRegisterImpl::D, key, 160, F48);
3229     __ ldf(FloatRegisterImpl::D, key, 168, F50);
3230 
3231     __ BIND(L_storeOutput);
3232     // perform last round of encryption common for all key sizes
3233     __ aes_eround01_l(F48, F58, F60, F54); //last round
3234     __ aes_eround23_l(F50, F58, F60, F56);
3235 
3236     // Method to address arbitrary alignment for store instructions:
3237     // Check last 3 bits of 'dest' address to see if it is aligned to 8-byte boundary
3238     // If zero/aligned then continue with double FP store instructions
3239     // If not zero/mis-aligned then edge8n will generate edge mask in result reg (O3 in below case)
3240     // Example: If dest address is 0x07 and nearest 8-byte aligned address is 0x00 then edge mask will be 00000001
3241     // Compute (8-n) where n is # of bytes skipped by partial store(stpartialf) inst from edge mask, n=7 in this case
3242     // We get the value of n from the andcc that checks 'dest' alignment. n is available in O5 in below case.
3243     // Set GSR.align to (8-n) using alignaddr
3244     // Circular byte shift store values by n places so that the original bytes are at correct position for stpartialf
3245     // Set the arbitrarily aligned 'dest' address to nearest 8-byte aligned address
3246     // Store (partial) the original first (8-n) bytes starting at the original 'dest' address
3247     // Negate the edge mask so that the subsequent stpartialf can store the original (8-n-1)th through 8th bytes at appropriate address
3248     // We need to execute this process for both the 8-byte result values
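     // For example, if 'dest' ends in ...3 then n == 3, GSR.align is set to 8-3 == 5, each
     // 8-byte value is rotated right by 3 bytes, the first pair of stpartialf stores writes
     // the leading 5 bytes at the original 'dest', and the negated mask then writes the
     // remaining 3 bytes into the following doubleword.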
3249 
3250     // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
3251     __ andcc(to, 7, O5);
3252     __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output);
3253     __ delayed()->edge8n(to, G0, O3);
3254 
3255     // aligned case: store output into the destination array
3256     __ stf(FloatRegisterImpl::D, F54, to, 0);
3257     __ retl();
3258     __ delayed()->stf(FloatRegisterImpl::D, F56, to, 8);
3259 
3260     __ BIND(L_store_misaligned_output);
3261     __ add(to, 8, O4);
3262     __ mov(8, O2);
3263     __ sub(O2, O5, O2);
3264     __ alignaddr(O2, G0, O2);
3265     __ faligndata(F54, F54, F54);
3266     __ faligndata(F56, F56, F56);
3267     __ and3(to, -8, to);
3268     __ and3(O4, -8, O4);
3269     __ stpartialf(to, O3, F54, Assembler::ASI_PST8_PRIMARY);
3270     __ stpartialf(O4, O3, F56, Assembler::ASI_PST8_PRIMARY);
3271     __ add(to, 8, to);
3272     __ add(O4, 8, O4);
3273     __ orn(G0, O3, O3);
3274     __ stpartialf(to, O3, F54, Assembler::ASI_PST8_PRIMARY);
3275     __ retl();
3276     __ delayed()->stpartialf(O4, O3, F56, Assembler::ASI_PST8_PRIMARY);
3277 
3278     return start;
3279   }
3280 
3281   address generate_aescrypt_decryptBlock() {
3282     assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
3283            "the following code assumes that first element of an int array is aligned to 8 bytes");
3284     // required since we also read the original key 'byte' array in the decryption stubs
3285     assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0,
3286            "the following code assumes that first element of a byte array is aligned to 8 bytes");
3287     __ align(CodeEntryAlignment);
3288     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
3289     address start = __ pc();
3290     Label L_load_misaligned_input, L_load_original_key, L_expand192bit, L_expand256bit, L_reload_misaligned_input;
3291     Label L_256bit_transform, L_common_transform, L_store_misaligned_output;
3292     Register from = O0; // source byte array
3293     Register to = O1;   // destination byte array
3294     Register key = O2;  // expanded key array
3295     Register original_key = O3;  // original key array only required during decryption
3296     const Register keylen = O4;  // reg for storing expanded key array length
3297 
3298     // read expanded key array length
3299     __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
3300 
3301     // save 'from' since we may need to recheck alignment in case of 256-bit decryption
3302     __ mov(from, G1);
3303 
3304     // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
3305     __ andcc(from, 7, G0);
3306     __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input);
3307     __ delayed()->alignaddr(from, G0, from);
3308 
3309     // aligned case: load input into F52-F54
3310     __ ldf(FloatRegisterImpl::D, from, 0, F52);
3311     __ ldf(FloatRegisterImpl::D, from, 8, F54);
3312     __ ba_short(L_load_original_key);
3313 
3314     __ BIND(L_load_misaligned_input);
3315     __ ldf(FloatRegisterImpl::D, from, 0, F52);
3316     __ ldf(FloatRegisterImpl::D, from, 8, F54);
3317     __ ldf(FloatRegisterImpl::D, from, 16, F56);
3318     __ faligndata(F52, F54, F52);
3319     __ faligndata(F54, F56, F54);
3320 
3321     __ BIND(L_load_original_key);
3322     // load original key from SunJCE expanded decryption key
3323     // Since we load the original key buffer starting at its first element, 8-byte alignment is guaranteed
3324     for ( int i = 0;  i <= 3; i++ ) {
3325       __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
3326     }
3327 
3328     // 256-bit original key size
3329     __ cmp_and_brx_short(keylen, 60, Assembler::equal, Assembler::pn, L_expand256bit);
3330 
3331     // 192-bit original key size
3332     __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_expand192bit);
3333 
3334     // 128-bit original key size
3335     // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
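     // The kexpand1/kexpand2 pairs below regenerate the full 44-int encryption schedule in
     // F0..F42 from the 128-bit key already loaded in F0..F3.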
3336     for ( int i = 0;  i <= 36; i += 4 ) {
3337       __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+2), i/4, as_FloatRegister(i+4));
3338       __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+4), as_FloatRegister(i+6));
3339     }
3340 
3341     // perform 128-bit key specific inverse cipher transformation
3342     __ fxor(FloatRegisterImpl::D, F42, F54, F54);
3343     __ fxor(FloatRegisterImpl::D, F40, F52, F52);
3344     __ ba_short(L_common_transform);
3345 
3346     __ BIND(L_expand192bit);
3347 
3348     // start loading rest of the 192-bit key
3349     __ ldf(FloatRegisterImpl::S, original_key, 16, F4);
3350     __ ldf(FloatRegisterImpl::S, original_key, 20, F5);
3351 
3352     // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
3353     for ( int i = 0;  i <= 36; i += 6 ) {
3354       __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+4), i/6, as_FloatRegister(i+6));
3355       __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+6), as_FloatRegister(i+8));
3356       __ aes_kexpand2(as_FloatRegister(i+4), as_FloatRegister(i+8), as_FloatRegister(i+10));
3357     }
3358     __ aes_kexpand1(F42, F46, 7, F48);
3359     __ aes_kexpand2(F44, F48, F50);
3360 
3361     // perform 192-bit key specific inverse cipher transformation
3362     __ fxor(FloatRegisterImpl::D, F50, F54, F54);
3363     __ fxor(FloatRegisterImpl::D, F48, F52, F52);
3364     __ aes_dround23(F46, F52, F54, F58);
3365     __ aes_dround01(F44, F52, F54, F56);
3366     __ aes_dround23(F42, F56, F58, F54);
3367     __ aes_dround01(F40, F56, F58, F52);
3368     __ ba_short(L_common_transform);
3369 
3370     __ BIND(L_expand256bit);
3371 
3372     // load rest of the 256-bit key
3373     for ( int i = 4;  i <= 7; i++ ) {
3374       __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
3375     }
3376 
3377     // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
3378     for ( int i = 0;  i <= 40; i += 8 ) {
3379       __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+6), i/8, as_FloatRegister(i+8));
3380       __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+8), as_FloatRegister(i+10));
3381       __ aes_kexpand0(as_FloatRegister(i+4), as_FloatRegister(i+10), as_FloatRegister(i+12));
3382       __ aes_kexpand2(as_FloatRegister(i+6), as_FloatRegister(i+12), as_FloatRegister(i+14));
3383     }
3384     __ aes_kexpand1(F48, F54, 6, F56);
3385     __ aes_kexpand2(F50, F56, F58);
3386 
3387     for ( int i = 0;  i <= 6; i += 2 ) {
3388       __ fsrc2(FloatRegisterImpl::D, as_FloatRegister(58-i), as_FloatRegister(i));
3389     }
3390 
3391     // reload original 'from' address
3392     __ mov(G1, from);
3393 
3394     // re-check 8-byte alignment
3395     __ andcc(from, 7, G0);
3396     __ br(Assembler::notZero, true, Assembler::pn, L_reload_misaligned_input);
3397     __ delayed()->alignaddr(from, G0, from);
3398 
3399     // aligned case: load input into F52-F54
3400     __ ldf(FloatRegisterImpl::D, from, 0, F52);
3401     __ ldf(FloatRegisterImpl::D, from, 8, F54);
3402     __ ba_short(L_256bit_transform);
3403 
3404     __ BIND(L_reload_misaligned_input);
3405     __ ldf(FloatRegisterImpl::D, from, 0, F52);
3406     __ ldf(FloatRegisterImpl::D, from, 8, F54);
3407     __ ldf(FloatRegisterImpl::D, from, 16, F56);
3408     __ faligndata(F52, F54, F52);
3409     __ faligndata(F54, F56, F54);
3410 
3411     // perform 256-bit key specific inverse cipher transformation
3412     __ BIND(L_256bit_transform);
3413     __ fxor(FloatRegisterImpl::D, F0, F54, F54);
3414     __ fxor(FloatRegisterImpl::D, F2, F52, F52);
3415     __ aes_dround23(F4, F52, F54, F58);
3416     __ aes_dround01(F6, F52, F54, F56);
3417     __ aes_dround23(F50, F56, F58, F54);
3418     __ aes_dround01(F48, F56, F58, F52);
3419     __ aes_dround23(F46, F52, F54, F58);
3420     __ aes_dround01(F44, F52, F54, F56);
3421     __ aes_dround23(F42, F56, F58, F54);
3422     __ aes_dround01(F40, F56, F58, F52);
3423 
3424     for ( int i = 0;  i <= 7; i++ ) {
3425       __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
3426     }
3427 
3428     // perform inverse cipher transformations common for all key sizes
3429     __ BIND(L_common_transform);
3430     for ( int i = 38;  i >= 6; i -= 8 ) {
3431       __ aes_dround23(as_FloatRegister(i), F52, F54, F58);
3432       __ aes_dround01(as_FloatRegister(i-2), F52, F54, F56);
3433       if ( i != 6) {
3434         __ aes_dround23(as_FloatRegister(i-4), F56, F58, F54);
3435         __ aes_dround01(as_FloatRegister(i-6), F56, F58, F52);
3436       } else {
3437         __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F54);
3438         __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F52);
3439       }
3440     }
3441 
3442     // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
3443     __ andcc(to, 7, O5);
3444     __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output);
3445     __ delayed()->edge8n(to, G0, O3);
3446 
3447     // aligned case: store output into the destination array
3448     __ stf(FloatRegisterImpl::D, F52, to, 0);
3449     __ retl();
3450     __ delayed()->stf(FloatRegisterImpl::D, F54, to, 8);
3451 
3452     __ BIND(L_store_misaligned_output);
3453     __ add(to, 8, O4);
3454     __ mov(8, O2);
3455     __ sub(O2, O5, O2);
3456     __ alignaddr(O2, G0, O2);
3457     __ faligndata(F52, F52, F52);
3458     __ faligndata(F54, F54, F54);
3459     __ and3(to, -8, to);
3460     __ and3(O4, -8, O4);
3461     __ stpartialf(to, O3, F52, Assembler::ASI_PST8_PRIMARY);
3462     __ stpartialf(O4, O3, F54, Assembler::ASI_PST8_PRIMARY);
3463     __ add(to, 8, to);
3464     __ add(O4, 8, O4);
3465     __ orn(G0, O3, O3);
3466     __ stpartialf(to, O3, F52, Assembler::ASI_PST8_PRIMARY);
3467     __ retl();
3468     __ delayed()->stpartialf(O4, O3, F54, Assembler::ASI_PST8_PRIMARY);
3469 
3470     return start;
3471   }
3472 
3473   address generate_cipherBlockChaining_encryptAESCrypt() {
3474     assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
3475            "the following code assumes that first element of an int array is aligned to 8 bytes");
3476     assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0,
3477            "the following code assumes that first element of a byte array is aligned to 8 bytes");
3478     __ align(CodeEntryAlignment);
3479     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
3480     Label L_cbcenc128, L_load_misaligned_input_128bit, L_128bit_transform, L_store_misaligned_output_128bit;
3481     Label L_check_loop_end_128bit, L_cbcenc192, L_load_misaligned_input_192bit, L_192bit_transform;
3482     Label L_store_misaligned_output_192bit, L_check_loop_end_192bit, L_cbcenc256, L_load_misaligned_input_256bit;
3483     Label L_256bit_transform, L_store_misaligned_output_256bit, L_check_loop_end_256bit;
3484     address start = __ pc();
3485     Register from = I0; // source byte array
3486     Register to = I1;   // destination byte array
3487     Register key = I2;  // expanded key array
3488     Register rvec = I3; // init vector
3489     const Register len_reg = I4; // cipher length
3490     const Register keylen = I5;  // reg for storing expanded key array length
3491 
3492     __ save_frame(0);
3493     // save cipher len to return in the end
3494     __ mov(len_reg, L0);
3495 
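     // CBC encryption computes each 16-byte block as C_i = E_k(P_i xor C_{i-1}), where
     // C_{i-1} is kept in F60:F62 (the IV from rvec for the first block) and is written
     // back to rvec before returning.
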
3496     // read expanded key length
3497     __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
3498 
3499     // load initial vector, 8-byte alignment is guaranteed
3500     __ ldf(FloatRegisterImpl::D, rvec, 0, F60);
3501     __ ldf(FloatRegisterImpl::D, rvec, 8, F62);
3502     // load key, 8-byte alignment is guaranteed
3503     __ ldx(key,0,G1);
3504     __ ldx(key,8,G5);
3505 
3506     // start loading expanded key, 8-byte alignment is guaranteed
3507     for ( int i = 0, j = 16;  i <= 38; i += 2, j += 8 ) {
3508       __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i));
3509     }
3510 
3511     // 128-bit original key size
3512     __ cmp_and_brx_short(keylen, 44, Assembler::equal, Assembler::pt, L_cbcenc128);
3513 
3514     for ( int i = 40, j = 176;  i <= 46; i += 2, j += 8 ) {
3515       __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i));
3516     }
3517 
3518     // 192-bit original key size
3519     __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pt, L_cbcenc192);
3520 
3521     for ( int i = 48, j = 208;  i <= 54; i += 2, j += 8 ) {
3522       __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i));
3523     }
3524 
3525     // 256-bit original key size
3526     __ ba_short(L_cbcenc256);
3527 
3528     __ align(OptoLoopAlignment);
3529     __ BIND(L_cbcenc128);
3530     // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
3531     __ andcc(from, 7, G0);
3532     __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_128bit);
3533     __ delayed()->mov(from, L1); // save original 'from' address before alignaddr
3534 
3535     // aligned case: load input into G3 and G4
3536     __ ldx(from,0,G3);
3537     __ ldx(from,8,G4);
3538     __ ba_short(L_128bit_transform);
3539 
3540     __ BIND(L_load_misaligned_input_128bit);
3541     // can clobber F48, F50 and F52 as they are not used in 128 and 192-bit key encryption
3542     __ alignaddr(from, G0, from);
3543     __ ldf(FloatRegisterImpl::D, from, 0, F48);
3544     __ ldf(FloatRegisterImpl::D, from, 8, F50);
3545     __ ldf(FloatRegisterImpl::D, from, 16, F52);
3546     __ faligndata(F48, F50, F48);
3547     __ faligndata(F50, F52, F50);
3548     __ movdtox(F48, G3);
3549     __ movdtox(F50, G4);
3550     __ mov(L1, from);
3551 
3552     __ BIND(L_128bit_transform);
3553     __ xor3(G1,G3,G3);
3554     __ xor3(G5,G4,G4);
3555     __ movxtod(G3,F56);
3556     __ movxtod(G4,F58);
3557     __ fxor(FloatRegisterImpl::D, F60, F56, F60);
3558     __ fxor(FloatRegisterImpl::D, F62, F58, F62);
3559 
3560     // TEN_EROUNDS
3561     for ( int i = 0;  i <= 32; i += 8 ) {
3562       __ aes_eround01(as_FloatRegister(i), F60, F62, F56);
3563       __ aes_eround23(as_FloatRegister(i+2), F60, F62, F58);
3564       if (i != 32 ) {
3565         __ aes_eround01(as_FloatRegister(i+4), F56, F58, F60);
3566         __ aes_eround23(as_FloatRegister(i+6), F56, F58, F62);
3567       } else {
3568         __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60);
3569         __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62);
3570       }
3571     }
3572 
3573     // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
3574     __ andcc(to, 7, L1);
3575     __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_128bit);
3576     __ delayed()->edge8n(to, G0, L2);
3577 
3578     // aligned case: store output into the destination array
3579     __ stf(FloatRegisterImpl::D, F60, to, 0);
3580     __ stf(FloatRegisterImpl::D, F62, to, 8);
3581     __ ba_short(L_check_loop_end_128bit);
3582 
3583     __ BIND(L_store_misaligned_output_128bit);
3584     __ add(to, 8, L3);
3585     __ mov(8, L4);
3586     __ sub(L4, L1, L4);
3587     __ alignaddr(L4, G0, L4);
3588     // save cipher text before circular right shift
3589     // as it needs to be stored as iv for next block (see code before next retl)
3590     __ movdtox(F60, L6);
3591     __ movdtox(F62, L7);
3592     __ faligndata(F60, F60, F60);
3593     __ faligndata(F62, F62, F62);
3594     __ mov(to, L5);
3595     __ and3(to, -8, to);
3596     __ and3(L3, -8, L3);
3597     __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
3598     __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
3599     __ add(to, 8, to);
3600     __ add(L3, 8, L3);
3601     __ orn(G0, L2, L2);
3602     __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
3603     __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
3604     __ mov(L5, to);
3605     __ movxtod(L6, F60);
3606     __ movxtod(L7, F62);
3607 
3608     __ BIND(L_check_loop_end_128bit);
3609     __ add(from, 16, from);
3610     __ add(to, 16, to);
3611     __ subcc(len_reg, 16, len_reg);
3612     __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc128);
3613     __ delayed()->nop();
3614     // re-init initial vector for next block, 8-byte alignment is guaranteed
3615     __ stf(FloatRegisterImpl::D, F60, rvec, 0);
3616     __ stf(FloatRegisterImpl::D, F62, rvec, 8);
3617     __ mov(L0, I0);
3618     __ ret();
3619     __ delayed()->restore();
3620 
3621     __ align(OptoLoopAlignment);
3622     __ BIND(L_cbcenc192);
3623     // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
3624     __ andcc(from, 7, G0);
3625     __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_192bit);
3626     __ delayed()->mov(from, L1); // save original 'from' address before alignaddr
3627 
3628     // aligned case: load input into G3 and G4
3629     __ ldx(from,0,G3);
3630     __ ldx(from,8,G4);
3631     __ ba_short(L_192bit_transform);
3632 
3633     __ BIND(L_load_misaligned_input_192bit);
3634     // can clobber F48, F50 and F52 as they are not used in 128 and 192-bit key encryption
3635     __ alignaddr(from, G0, from);
3636     __ ldf(FloatRegisterImpl::D, from, 0, F48);
3637     __ ldf(FloatRegisterImpl::D, from, 8, F50);
3638     __ ldf(FloatRegisterImpl::D, from, 16, F52);
3639     __ faligndata(F48, F50, F48);
3640     __ faligndata(F50, F52, F50);
3641     __ movdtox(F48, G3);
3642     __ movdtox(F50, G4);
3643     __ mov(L1, from);
3644 
3645     __ BIND(L_192bit_transform);
3646     __ xor3(G1,G3,G3);
3647     __ xor3(G5,G4,G4);
3648     __ movxtod(G3,F56);
3649     __ movxtod(G4,F58);
3650     __ fxor(FloatRegisterImpl::D, F60, F56, F60);
3651     __ fxor(FloatRegisterImpl::D, F62, F58, F62);
3652 
3653     // TWELVE_EROUNDS
3654     for ( int i = 0;  i <= 40; i += 8 ) {
3655       __ aes_eround01(as_FloatRegister(i), F60, F62, F56);
3656       __ aes_eround23(as_FloatRegister(i+2), F60, F62, F58);
3657       if (i != 40 ) {
3658         __ aes_eround01(as_FloatRegister(i+4), F56, F58, F60);
3659         __ aes_eround23(as_FloatRegister(i+6), F56, F58, F62);
3660       } else {
3661         __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60);
3662         __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62);
3663       }
3664     }
3665 
3666     // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
3667     __ andcc(to, 7, L1);
3668     __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_192bit);
3669     __ delayed()->edge8n(to, G0, L2);
3670 
3671     // aligned case: store output into the destination array
3672     __ stf(FloatRegisterImpl::D, F60, to, 0);
3673     __ stf(FloatRegisterImpl::D, F62, to, 8);
3674     __ ba_short(L_check_loop_end_192bit);
3675 
3676     __ BIND(L_store_misaligned_output_192bit);
3677     __ add(to, 8, L3);
3678     __ mov(8, L4);
3679     __ sub(L4, L1, L4);
3680     __ alignaddr(L4, G0, L4);
3681     __ movdtox(F60, L6);
3682     __ movdtox(F62, L7);
3683     __ faligndata(F60, F60, F60);
3684     __ faligndata(F62, F62, F62);
3685     __ mov(to, L5);
3686     __ and3(to, -8, to);
3687     __ and3(L3, -8, L3);
3688     __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
3689     __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
3690     __ add(to, 8, to);
3691     __ add(L3, 8, L3);
3692     __ orn(G0, L2, L2);
3693     __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
3694     __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
3695     __ mov(L5, to);
3696     __ movxtod(L6, F60);
3697     __ movxtod(L7, F62);
3698 
3699     __ BIND(L_check_loop_end_192bit);
3700     __ add(from, 16, from);
3701     __ subcc(len_reg, 16, len_reg);
3702     __ add(to, 16, to);
3703     __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc192);
3704     __ delayed()->nop();
3705     // re-init initial vector for next block, 8-byte alignment is guaranteed
3706     __ stf(FloatRegisterImpl::D, F60, rvec, 0);
3707     __ stf(FloatRegisterImpl::D, F62, rvec, 8);
3708     __ mov(L0, I0);
3709     __ ret();
3710     __ delayed()->restore();
3711 
3712     __ align(OptoLoopAlignment);
3713     __ BIND(L_cbcenc256);
3714     // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
3715     __ andcc(from, 7, G0);
3716     __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_256bit);
3717     __ delayed()->mov(from, L1); // save original 'from' address before alignaddr
3718 
3719     // aligned case: load input into G3 and G4
3720     __ ldx(from,0,G3);
3721     __ ldx(from,8,G4);
3722     __ ba_short(L_256bit_transform);
3723 
3724     __ BIND(L_load_misaligned_input_256bit);
3725     // cannot clobber F48, F50 and F52. F56, F58 can be used though
3726     __ alignaddr(from, G0, from);
3727     __ movdtox(F60, L2); // save F60 before overwriting
3728     __ ldf(FloatRegisterImpl::D, from, 0, F56);
3729     __ ldf(FloatRegisterImpl::D, from, 8, F58);
3730     __ ldf(FloatRegisterImpl::D, from, 16, F60);
3731     __ faligndata(F56, F58, F56);
3732     __ faligndata(F58, F60, F58);
3733     __ movdtox(F56, G3);
3734     __ movdtox(F58, G4);
3735     __ mov(L1, from);
3736     __ movxtod(L2, F60);
3737 
3738     __ BIND(L_256bit_transform);
3739     __ xor3(G1,G3,G3);
3740     __ xor3(G5,G4,G4);
3741     __ movxtod(G3,F56);
3742     __ movxtod(G4,F58);
3743     __ fxor(FloatRegisterImpl::D, F60, F56, F60);
3744     __ fxor(FloatRegisterImpl::D, F62, F58, F62);
3745 
3746     // FOURTEEN_EROUNDS
3747     for ( int i = 0;  i <= 48; i += 8 ) {
3748       __ aes_eround01(as_FloatRegister(i), F60, F62, F56);
3749       __ aes_eround23(as_FloatRegister(i+2), F60, F62, F58);
3750       if (i != 48 ) {
3751         __ aes_eround01(as_FloatRegister(i+4), F56, F58, F60);
3752         __ aes_eround23(as_FloatRegister(i+6), F56, F58, F62);
3753       } else {
3754         __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60);
3755         __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62);
3756       }
3757     }
3758 
3759     // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
3760     __ andcc(to, 7, L1);
3761     __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_256bit);
3762     __ delayed()->edge8n(to, G0, L2);
3763 
3764     // aligned case: store output into the destination array
3765     __ stf(FloatRegisterImpl::D, F60, to, 0);
3766     __ stf(FloatRegisterImpl::D, F62, to, 8);
3767     __ ba_short(L_check_loop_end_256bit);
3768 
3769     __ BIND(L_store_misaligned_output_256bit);
3770     __ add(to, 8, L3);
3771     __ mov(8, L4);
3772     __ sub(L4, L1, L4);
3773     __ alignaddr(L4, G0, L4);
3774     __ movdtox(F60, L6);
3775     __ movdtox(F62, L7);
3776     __ faligndata(F60, F60, F60);
3777     __ faligndata(F62, F62, F62);
3778     __ mov(to, L5);
3779     __ and3(to, -8, to);
3780     __ and3(L3, -8, L3);
3781     __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
3782     __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
3783     __ add(to, 8, to);
3784     __ add(L3, 8, L3);
3785     __ orn(G0, L2, L2);
3786     __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
3787     __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
3788     __ mov(L5, to);
3789     __ movxtod(L6, F60);
3790     __ movxtod(L7, F62);
3791 
3792     __ BIND(L_check_loop_end_256bit);
3793     __ add(from, 16, from);
3794     __ subcc(len_reg, 16, len_reg);
3795     __ add(to, 16, to);
3796     __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc256);
3797     __ delayed()->nop();
3798     // re-init initial vector for next block, 8-byte alignment is guaranteed
3799     __ stf(FloatRegisterImpl::D, F60, rvec, 0);
3800     __ stf(FloatRegisterImpl::D, F62, rvec, 8);
3801     __ mov(L0, I0);
3802     __ ret();
3803     __ delayed()->restore();
3804 
3805     return start;
3806   }
3807 
3808   address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
3809     assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
3810            "the following code assumes that first element of an int array is aligned to 8 bytes");
3811     assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0,
3812            "the following code assumes that first element of a byte array is aligned to 8 bytes");
3813     __ align(CodeEntryAlignment);
3814     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
3815     Label L_cbcdec_end, L_expand192bit, L_expand256bit, L_dec_first_block_start;
3816     Label L_dec_first_block128, L_dec_first_block192, L_dec_next2_blocks128, L_dec_next2_blocks192, L_dec_next2_blocks256;
3817     Label L_load_misaligned_input_first_block, L_transform_first_block, L_load_misaligned_next2_blocks128, L_transform_next2_blocks128;
3818     Label L_load_misaligned_next2_blocks192, L_transform_next2_blocks192, L_load_misaligned_next2_blocks256, L_transform_next2_blocks256;
3819     Label L_store_misaligned_output_first_block, L_check_decrypt_end, L_store_misaligned_output_next2_blocks128;
3820     Label L_check_decrypt_loop_end128, L_store_misaligned_output_next2_blocks192, L_check_decrypt_loop_end192;
3821     Label L_store_misaligned_output_next2_blocks256, L_check_decrypt_loop_end256;
3822     address start = __ pc();
3823     Register from = I0; // source byte array
3824     Register to = I1;   // destination byte array
3825     Register key = I2;  // expanded key array
3826     Register rvec = I3; // init vector
3827     const Register len_reg = I4; // cipher length
3828     const Register original_key = I5;  // original key array only required during decryption
3829     const Register keylen = L6;  // reg for storing expanded key array length
3830 
3831     __ save_frame(0); // args are read from the I* registers since we save a new frame at the beginning
3832     // save cipher len to return in the end
3833     __ mov(len_reg, L7);
3834 
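     // CBC decryption computes each 16-byte block as P_i = D_k(C_i) xor C_{i-1}, where
     // C_{i-1} is kept in L0:L1 (initially the IV loaded from rvec).
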
3835     // load original key from SunJCE expanded decryption key
3836     // Since we load the original key buffer starting at its first element, 8-byte alignment is guaranteed
3837     for ( int i = 0;  i <= 3; i++ ) {
3838       __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
3839     }
3840 
3841     // load initial vector, 8-byte alignment is guaranteed
3842     __ ldx(rvec,0,L0);
3843     __ ldx(rvec,8,L1);
3844 
3845     // read expanded key array length
3846     __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
3847 
3848     // 256-bit original key size
3849     __ cmp_and_brx_short(keylen, 60, Assembler::equal, Assembler::pn, L_expand256bit);
3850 
3851     // 192-bit original key size
3852     __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_expand192bit);
3853 
3854     // 128-bit original key size
3855     // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
3856     for ( int i = 0;  i <= 36; i += 4 ) {
3857       __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+2), i/4, as_FloatRegister(i+4));
3858       __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+4), as_FloatRegister(i+6));
3859     }
3860 
3861     // load expanded key[last-1] and key[last] elements
3862     __ movdtox(F40,L2);
3863     __ movdtox(F42,L3);
3864 
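     // If the total length is an odd multiple of 16 bytes, decrypt a single block first so
     // that the remaining length is a multiple of 32 and the two-block loops can be used.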
3865     __ and3(len_reg, 16, L4);
3866     __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks128);
3867     __ nop();
3868 
3869     __ ba_short(L_dec_first_block_start);
3870 
3871     __ BIND(L_expand192bit);
3872     // load rest of the 192-bit key
3873     __ ldf(FloatRegisterImpl::S, original_key, 16, F4);
3874     __ ldf(FloatRegisterImpl::S, original_key, 20, F5);
3875 
3876     // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
3877     for ( int i = 0;  i <= 36; i += 6 ) {
3878       __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+4), i/6, as_FloatRegister(i+6));
3879       __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+6), as_FloatRegister(i+8));
3880       __ aes_kexpand2(as_FloatRegister(i+4), as_FloatRegister(i+8), as_FloatRegister(i+10));
3881     }
3882     __ aes_kexpand1(F42, F46, 7, F48);
3883     __ aes_kexpand2(F44, F48, F50);
3884 
3885     // load expanded key[last-1] and key[last] elements
3886     __ movdtox(F48,L2);
3887     __ movdtox(F50,L3);
3888 
3889     __ and3(len_reg, 16, L4);
3890     __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks192);
3891     __ nop();
3892 
3893     __ ba_short(L_dec_first_block_start);
3894 
3895     __ BIND(L_expand256bit);
3896     // load rest of the 256-bit key
3897     for ( int i = 4;  i <= 7; i++ ) {
3898       __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
3899     }
3900 
3901     // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
3902     for ( int i = 0;  i <= 40; i += 8 ) {
3903       __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+6), i/8, as_FloatRegister(i+8));
3904       __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+8), as_FloatRegister(i+10));
3905       __ aes_kexpand0(as_FloatRegister(i+4), as_FloatRegister(i+10), as_FloatRegister(i+12));
3906       __ aes_kexpand2(as_FloatRegister(i+6), as_FloatRegister(i+12), as_FloatRegister(i+14));
3907     }
3908     __ aes_kexpand1(F48, F54, 6, F56);
3909     __ aes_kexpand2(F50, F56, F58);
3910 
3911     // load expanded key[last-1] and key[last] elements
3912     __ movdtox(F56,L2);
3913     __ movdtox(F58,L3);
3914 
3915     __ and3(len_reg, 16, L4);
3916     __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks256);
3917 
3918     __ BIND(L_dec_first_block_start);
3919     // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
3920     __ andcc(from, 7, G0);
3921     __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_first_block);
3922     __ delayed()->mov(from, G1); // save original 'from' address before alignaddr
3923 
3924     // aligned case: load input into L4 and L5
3925     __ ldx(from,0,L4);
3926     __ ldx(from,8,L5);
3927     __ ba_short(L_transform_first_block);
3928 
3929     __ BIND(L_load_misaligned_input_first_block);
3930     __ alignaddr(from, G0, from);
3931     // F58, F60, F62 can be clobbered
3932     __ ldf(FloatRegisterImpl::D, from, 0, F58);
3933     __ ldf(FloatRegisterImpl::D, from, 8, F60);
3934     __ ldf(FloatRegisterImpl::D, from, 16, F62);
3935     __ faligndata(F58, F60, F58);
3936     __ faligndata(F60, F62, F60);
3937     __ movdtox(F58, L4);
3938     __ movdtox(F60, L5);
3939     __ mov(G1, from);
3940 
3941     __ BIND(L_transform_first_block);
3942     __ xor3(L2,L4,G1);
3943     __ movxtod(G1,F60);
3944     __ xor3(L3,L5,G1);
3945     __ movxtod(G1,F62);
3946 
3947     // 128-bit original key size
3948     __ cmp_and_brx_short(keylen, 44, Assembler::equal, Assembler::pn, L_dec_first_block128);
3949 
3950     // 192-bit original key size
3951     __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_dec_first_block192);
3952 
3953     __ aes_dround23(F54, F60, F62, F58);
3954     __ aes_dround01(F52, F60, F62, F56);
3955     __ aes_dround23(F50, F56, F58, F62);
3956     __ aes_dround01(F48, F56, F58, F60);
3957 
3958     __ BIND(L_dec_first_block192);
3959     __ aes_dround23(F46, F60, F62, F58);
3960     __ aes_dround01(F44, F60, F62, F56);
3961     __ aes_dround23(F42, F56, F58, F62);
3962     __ aes_dround01(F40, F56, F58, F60);
3963 
3964     __ BIND(L_dec_first_block128);
3965     for ( int i = 38;  i >= 6; i -= 8 ) {
3966       __ aes_dround23(as_FloatRegister(i), F60, F62, F58);
3967       __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56);
3968       if ( i != 6) {
3969         __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62);
3970         __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60);
3971       } else {
3972         __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F62);
3973         __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F60);
3974       }
3975     }
3976 
3977     __ movxtod(L0,F56);
3978     __ movxtod(L1,F58);
3979     __ mov(L4,L0);
3980     __ mov(L5,L1);
3981     __ fxor(FloatRegisterImpl::D, F56, F60, F60);
3982     __ fxor(FloatRegisterImpl::D, F58, F62, F62);
3983 
3984     // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
3985     __ andcc(to, 7, G1);
3986     __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_first_block);
3987     __ delayed()->edge8n(to, G0, G2);
3988 
3989     // aligned case: store output into the destination array
3990     __ stf(FloatRegisterImpl::D, F60, to, 0);
3991     __ stf(FloatRegisterImpl::D, F62, to, 8);
3992     __ ba_short(L_check_decrypt_end);
3993 
3994     __ BIND(L_store_misaligned_output_first_block);
3995     __ add(to, 8, G3);
3996     __ mov(8, G4);
3997     __ sub(G4, G1, G4);
3998     __ alignaddr(G4, G0, G4);
3999     __ faligndata(F60, F60, F60);
4000     __ faligndata(F62, F62, F62);
4001     __ mov(to, G1);
4002     __ and3(to, -8, to);
4003     __ and3(G3, -8, G3);
4004     __ stpartialf(to, G2, F60, Assembler::ASI_PST8_PRIMARY);
4005     __ stpartialf(G3, G2, F62, Assembler::ASI_PST8_PRIMARY);
4006     __ add(to, 8, to);
4007     __ add(G3, 8, G3);
4008     __ orn(G0, G2, G2);
4009     __ stpartialf(to, G2, F60, Assembler::ASI_PST8_PRIMARY);
4010     __ stpartialf(G3, G2, F62, Assembler::ASI_PST8_PRIMARY);
4011     __ mov(G1, to);
4012 
4013     __ BIND(L_check_decrypt_end);
4014     __ add(from, 16, from);
4015     __ add(to, 16, to);
4016     __ subcc(len_reg, 16, len_reg);
4017     __ br(Assembler::equal, false, Assembler::pt, L_cbcdec_end);
4018     __ delayed()->nop();
4019 
4020     // 256-bit original key size
4021     __ cmp_and_brx_short(keylen, 60, Assembler::equal, Assembler::pn, L_dec_next2_blocks256);
4022 
4023     // 192-bit original key size
4024     __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_dec_next2_blocks192);
4025 
4026     __ align(OptoLoopAlignment);
4027     __ BIND(L_dec_next2_blocks128);
4028     __ nop();
4029 
4030     // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
4031     __ andcc(from, 7, G0);
4032     __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks128);
4033     __ delayed()->mov(from, G1); // save original 'from' address before alignaddr
4034 
4035     // aligned case: load input into G4, G5, L4 and L5
4036     __ ldx(from,0,G4);
4037     __ ldx(from,8,G5);
4038     __ ldx(from,16,L4);
4039     __ ldx(from,24,L5);
4040     __ ba_short(L_transform_next2_blocks128);
4041 
4042     __ BIND(L_load_misaligned_next2_blocks128);
4043     __ alignaddr(from, G0, from);
4044     // F40, F42, F58, F60, F62 can be clobbered
4045     __ ldf(FloatRegisterImpl::D, from, 0, F40);
4046     __ ldf(FloatRegisterImpl::D, from, 8, F42);
4047     __ ldf(FloatRegisterImpl::D, from, 16, F60);
4048     __ ldf(FloatRegisterImpl::D, from, 24, F62);
4049     __ ldf(FloatRegisterImpl::D, from, 32, F58);
4050     __ faligndata(F40, F42, F40);
4051     __ faligndata(F42, F60, F42);
4052     __ faligndata(F60, F62, F60);
4053     __ faligndata(F62, F58, F62);
4054     __ movdtox(F40, G4);
4055     __ movdtox(F42, G5);
4056     __ movdtox(F60, L4);
4057     __ movdtox(F62, L5);
4058     __ mov(G1, from);
4059 
4060     __ BIND(L_transform_next2_blocks128);
4061     // F40:F42 used for first 16-bytes
4062     __ xor3(L2,G4,G1);
4063     __ movxtod(G1,F40);
4064     __ xor3(L3,G5,G1);
4065     __ movxtod(G1,F42);
4066 
4067     // F60:F62 used for next 16-bytes
4068     __ xor3(L2,L4,G1);
4069     __ movxtod(G1,F60);
4070     __ xor3(L3,L5,G1);
4071     __ movxtod(G1,F62);
4072 
4073     for ( int i = 38;  i >= 6; i -= 8 ) {
4074       __ aes_dround23(as_FloatRegister(i), F40, F42, F44);
4075       __ aes_dround01(as_FloatRegister(i-2), F40, F42, F46);
4076       __ aes_dround23(as_FloatRegister(i), F60, F62, F58);
4077       __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56);
4078       if (i != 6 ) {
4079         __ aes_dround23(as_FloatRegister(i-4), F46, F44, F42);
4080         __ aes_dround01(as_FloatRegister(i-6), F46, F44, F40);
4081         __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62);
4082         __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60);
4083       } else {
4084         __ aes_dround23_l(as_FloatRegister(i-4), F46, F44, F42);
4085         __ aes_dround01_l(as_FloatRegister(i-6), F46, F44, F40);
4086         __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F62);
4087         __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F60);
4088       }
4089     }
4090 
4091     __ movxtod(L0,F46);
4092     __ movxtod(L1,F44);
4093     __ fxor(FloatRegisterImpl::D, F46, F40, F40);
4094     __ fxor(FloatRegisterImpl::D, F44, F42, F42);
4095 
4096     __ movxtod(G4,F56);
4097     __ movxtod(G5,F58);
4098     __ mov(L4,L0);
4099     __ mov(L5,L1);
4100     __ fxor(FloatRegisterImpl::D, F56, F60, F60);
4101     __ fxor(FloatRegisterImpl::D, F58, F62, F62);
4102 
4103     // For a mis-aligned store of the 32 bytes of result we can:
4104     // circular right-shift all 4 FP registers so that the 'head' and 'tail'
4105     // parts that need to be stored starting at the mis-aligned address end up in one FP reg;
4106     // the other 3 FP regs can then be stored using regular 8-byte stores,
4107     // and the 'head' and 'tail' parts are stored using the edge + partial-store mechanism.
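    // A hedged sketch of that scheme (assuming the usual edge8n/stpartialf
    // semantics; the names below are illustrative only):
    //
    //   mask = edge8n(to, 0);                   // byte mask for the first aligned dword
    //   stpartialf((to & ~7),      mask, Fht);  // 'head' bytes (Fht holds head and tail)
    //   ... three normal 8-byte stores for the middle 24 bytes ...
    //   stpartialf((to & ~7) + 32, ~mask, Fht); // remaining 'tail' bytes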
4108 
4109     // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
4110     __ andcc(to, 7, G1);
4111     __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks128);
4112     __ delayed()->edge8n(to, G0, G2);
4113 
4114     // aligned case: store output into the destination array
4115     __ stf(FloatRegisterImpl::D, F40, to, 0);
4116     __ stf(FloatRegisterImpl::D, F42, to, 8);
4117     __ stf(FloatRegisterImpl::D, F60, to, 16);
4118     __ stf(FloatRegisterImpl::D, F62, to, 24);
4119     __ ba_short(L_check_decrypt_loop_end128);
4120 
4121     __ BIND(L_store_misaligned_output_next2_blocks128);
4122     __ mov(8, G4);
4123     __ sub(G4, G1, G4);
4124     __ alignaddr(G4, G0, G4);
4125     __ faligndata(F40, F42, F56); // F56 can be clobbered
4126     __ faligndata(F42, F60, F42);
4127     __ faligndata(F60, F62, F60);
4128     __ faligndata(F62, F40, F40);
4129     __ mov(to, G1);
4130     __ and3(to, -8, to);
4131     __ stpartialf(to, G2, F40, Assembler::ASI_PST8_PRIMARY);
4132     __ stf(FloatRegisterImpl::D, F56, to, 8);
4133     __ stf(FloatRegisterImpl::D, F42, to, 16);
4134     __ stf(FloatRegisterImpl::D, F60, to, 24);
4135     __ add(to, 32, to);
4136     __ orn(G0, G2, G2);
4137     __ stpartialf(to, G2, F40, Assembler::ASI_PST8_PRIMARY);
4138     __ mov(G1, to);
4139 
4140     __ BIND(L_check_decrypt_loop_end128);
4141     __ add(from, 32, from);
4142     __ add(to, 32, to);
4143     __ subcc(len_reg, 32, len_reg);
4144     __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks128);
4145     __ delayed()->nop();
4146     __ ba_short(L_cbcdec_end);
4147 
4148     __ align(OptoLoopAlignment);
4149     __ BIND(L_dec_next2_blocks192);
4150     __ nop();
4151 
4152     // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
4153     __ andcc(from, 7, G0);
4154     __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks192);
4155     __ delayed()->mov(from, G1); // save original 'from' address before alignaddr
4156 
4157     // aligned case: load input into G4, G5, L4 and L5
4158     __ ldx(from,0,G4);
4159     __ ldx(from,8,G5);
4160     __ ldx(from,16,L4);
4161     __ ldx(from,24,L5);
4162     __ ba_short(L_transform_next2_blocks192);
4163 
4164     __ BIND(L_load_misaligned_next2_blocks192);
4165     __ alignaddr(from, G0, from);
4166     // F48, F50, F52, F60, F62 can be clobbered
4167     __ ldf(FloatRegisterImpl::D, from, 0, F48);
4168     __ ldf(FloatRegisterImpl::D, from, 8, F50);
4169     __ ldf(FloatRegisterImpl::D, from, 16, F60);
4170     __ ldf(FloatRegisterImpl::D, from, 24, F62);
4171     __ ldf(FloatRegisterImpl::D, from, 32, F52);
4172     __ faligndata(F48, F50, F48);
4173     __ faligndata(F50, F60, F50);
4174     __ faligndata(F60, F62, F60);
4175     __ faligndata(F62, F52, F62);
4176     __ movdtox(F48, G4);
4177     __ movdtox(F50, G5);
4178     __ movdtox(F60, L4);
4179     __ movdtox(F62, L5);
4180     __ mov(G1, from);
4181 
4182     __ BIND(L_transform_next2_blocks192);
4183     // F48:F50 used for first 16-bytes
4184     __ xor3(L2,G4,G1);
4185     __ movxtod(G1,F48);
4186     __ xor3(L3,G5,G1);
4187     __ movxtod(G1,F50);
4188 
4189     // F60:F62 used for next 16-bytes
4190     __ xor3(L2,L4,G1);
4191     __ movxtod(G1,F60);
4192     __ xor3(L3,L5,G1);
4193     __ movxtod(G1,F62);
4194 
4195     for ( int i = 46;  i >= 6; i -= 8 ) {
4196       __ aes_dround23(as_FloatRegister(i), F48, F50, F52);
4197       __ aes_dround01(as_FloatRegister(i-2), F48, F50, F54);
4198       __ aes_dround23(as_FloatRegister(i), F60, F62, F58);
4199       __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56);
4200       if (i != 6 ) {
4201         __ aes_dround23(as_FloatRegister(i-4), F54, F52, F50);
4202         __ aes_dround01(as_FloatRegister(i-6), F54, F52, F48);
4203         __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62);
4204         __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60);
4205       } else {
4206         __ aes_dround23_l(as_FloatRegister(i-4), F54, F52, F50);
4207         __ aes_dround01_l(as_FloatRegister(i-6), F54, F52, F48);
4208         __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F62);
4209         __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F60);
4210       }
4211     }
4212 
4213     __ movxtod(L0,F54);
4214     __ movxtod(L1,F52);
4215     __ fxor(FloatRegisterImpl::D, F54, F48, F48);
4216     __ fxor(FloatRegisterImpl::D, F52, F50, F50);
4217 
4218     __ movxtod(G4,F56);
4219     __ movxtod(G5,F58);
4220     __ mov(L4,L0);
4221     __ mov(L5,L1);
4222     __ fxor(FloatRegisterImpl::D, F56, F60, F60);
4223     __ fxor(FloatRegisterImpl::D, F58, F62, F62);
4224 
4225     // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
4226     __ andcc(to, 7, G1);
4227     __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks192);
4228     __ delayed()->edge8n(to, G0, G2);
4229 
4230     // aligned case: store output into the destination array
4231     __ stf(FloatRegisterImpl::D, F48, to, 0);
4232     __ stf(FloatRegisterImpl::D, F50, to, 8);
4233     __ stf(FloatRegisterImpl::D, F60, to, 16);
4234     __ stf(FloatRegisterImpl::D, F62, to, 24);
4235     __ ba_short(L_check_decrypt_loop_end192);
4236 
4237     __ BIND(L_store_misaligned_output_next2_blocks192);
4238     __ mov(8, G4);
4239     __ sub(G4, G1, G4);
4240     __ alignaddr(G4, G0, G4);
4241     __ faligndata(F48, F50, F56); // F56 can be clobbered
4242     __ faligndata(F50, F60, F50);
4243     __ faligndata(F60, F62, F60);
4244     __ faligndata(F62, F48, F48);
4245     __ mov(to, G1);
4246     __ and3(to, -8, to);
4247     __ stpartialf(to, G2, F48, Assembler::ASI_PST8_PRIMARY);
4248     __ stf(FloatRegisterImpl::D, F56, to, 8);
4249     __ stf(FloatRegisterImpl::D, F50, to, 16);
4250     __ stf(FloatRegisterImpl::D, F60, to, 24);
4251     __ add(to, 32, to);
4252     __ orn(G0, G2, G2);
4253     __ stpartialf(to, G2, F48, Assembler::ASI_PST8_PRIMARY);
4254     __ mov(G1, to);
4255 
4256     __ BIND(L_check_decrypt_loop_end192);
4257     __ add(from, 32, from);
4258     __ add(to, 32, to);
4259     __ subcc(len_reg, 32, len_reg);
4260     __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks192);
4261     __ delayed()->nop();
4262     __ ba_short(L_cbcdec_end);
4263 
4264     __ align(OptoLoopAlignment);
4265     __ BIND(L_dec_next2_blocks256);
4266     __ nop();
4267 
4268     // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
4269     __ andcc(from, 7, G0);
4270     __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks256);
4271     __ delayed()->mov(from, G1); // save original 'from' address before alignaddr
4272 
4273     // aligned case: load input into G4, G5, L4 and L5
4274     __ ldx(from,0,G4);
4275     __ ldx(from,8,G5);
4276     __ ldx(from,16,L4);
4277     __ ldx(from,24,L5);
4278     __ ba_short(L_transform_next2_blocks256);
4279 
4280     __ BIND(L_load_misaligned_next2_blocks256);
4281     __ alignaddr(from, G0, from);
4282     // F0, F2, F4, F60, F62 can be clobbered
4283     __ ldf(FloatRegisterImpl::D, from, 0, F0);
4284     __ ldf(FloatRegisterImpl::D, from, 8, F2);
4285     __ ldf(FloatRegisterImpl::D, from, 16, F60);
4286     __ ldf(FloatRegisterImpl::D, from, 24, F62);
4287     __ ldf(FloatRegisterImpl::D, from, 32, F4);
4288     __ faligndata(F0, F2, F0);
4289     __ faligndata(F2, F60, F2);
4290     __ faligndata(F60, F62, F60);
4291     __ faligndata(F62, F4, F62);
4292     __ movdtox(F0, G4);
4293     __ movdtox(F2, G5);
4294     __ movdtox(F60, L4);
4295     __ movdtox(F62, L5);
4296     __ mov(G1, from);
4297 
4298     __ BIND(L_transform_next2_blocks256);
4299     // F0:F2 used for first 16-bytes
4300     __ xor3(L2,G4,G1);
4301     __ movxtod(G1,F0);
4302     __ xor3(L3,G5,G1);
4303     __ movxtod(G1,F2);
4304 
4305     // F60:F62 used for next 16-bytes
4306     __ xor3(L2,L4,G1);
4307     __ movxtod(G1,F60);
4308     __ xor3(L3,L5,G1);
4309     __ movxtod(G1,F62);
4310 
4311     __ aes_dround23(F54, F0, F2, F4);
4312     __ aes_dround01(F52, F0, F2, F6);
4313     __ aes_dround23(F54, F60, F62, F58);
4314     __ aes_dround01(F52, F60, F62, F56);
4315     __ aes_dround23(F50, F6, F4, F2);
4316     __ aes_dround01(F48, F6, F4, F0);
4317     __ aes_dround23(F50, F56, F58, F62);
4318     __ aes_dround01(F48, F56, F58, F60);
4319     // save F48:F54 in temp registers
4320     __ movdtox(F54,G2);
4321     __ movdtox(F52,G3);
4322     __ movdtox(F50,G6);
4323     __ movdtox(F48,G1);
4324     for ( int i = 46;  i >= 14; i -= 8 ) {
4325       __ aes_dround23(as_FloatRegister(i), F0, F2, F4);
4326       __ aes_dround01(as_FloatRegister(i-2), F0, F2, F6);
4327       __ aes_dround23(as_FloatRegister(i), F60, F62, F58);
4328       __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56);
4329       __ aes_dround23(as_FloatRegister(i-4), F6, F4, F2);
4330       __ aes_dround01(as_FloatRegister(i-6), F6, F4, F0);
4331       __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62);
4332       __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60);
4333     }
4334     // init F48:F54 with the first four 64-bit words of the original key
4335     __ ldf(FloatRegisterImpl::D, original_key, 0, F48);
4336     __ ldf(FloatRegisterImpl::D, original_key, 8, F50);
4337     __ ldf(FloatRegisterImpl::D, original_key, 16, F52);
4338     __ ldf(FloatRegisterImpl::D, original_key, 24, F54);
4339     __ aes_dround23(F54, F0, F2, F4);
4340     __ aes_dround01(F52, F0, F2, F6);
4341     __ aes_dround23(F54, F60, F62, F58);
4342     __ aes_dround01(F52, F60, F62, F56);
4343     __ aes_dround23_l(F50, F6, F4, F2);
4344     __ aes_dround01_l(F48, F6, F4, F0);
4345     __ aes_dround23_l(F50, F56, F58, F62);
4346     __ aes_dround01_l(F48, F56, F58, F60);
4347     // re-init F48:F54 with their original values
4348     __ movxtod(G2,F54);
4349     __ movxtod(G3,F52);
4350     __ movxtod(G6,F50);
4351     __ movxtod(G1,F48);
4352 
4353     __ movxtod(L0,F6);
4354     __ movxtod(L1,F4);
4355     __ fxor(FloatRegisterImpl::D, F6, F0, F0);
4356     __ fxor(FloatRegisterImpl::D, F4, F2, F2);
4357 
4358     __ movxtod(G4,F56);
4359     __ movxtod(G5,F58);
4360     __ mov(L4,L0);
4361     __ mov(L5,L1);
4362     __ fxor(FloatRegisterImpl::D, F56, F60, F60);
4363     __ fxor(FloatRegisterImpl::D, F58, F62, F62);
4364 
4365     // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
4366     __ andcc(to, 7, G1);
4367     __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks256);
4368     __ delayed()->edge8n(to, G0, G2);
4369 
4370     // aligned case: store output into the destination array
4371     __ stf(FloatRegisterImpl::D, F0, to, 0);
4372     __ stf(FloatRegisterImpl::D, F2, to, 8);
4373     __ stf(FloatRegisterImpl::D, F60, to, 16);
4374     __ stf(FloatRegisterImpl::D, F62, to, 24);
4375     __ ba_short(L_check_decrypt_loop_end256);
4376 
4377     __ BIND(L_store_misaligned_output_next2_blocks256);
4378     __ mov(8, G4);
4379     __ sub(G4, G1, G4);
4380     __ alignaddr(G4, G0, G4);
4381     __ faligndata(F0, F2, F56); // F56 can be clobbered
4382     __ faligndata(F2, F60, F2);
4383     __ faligndata(F60, F62, F60);
4384     __ faligndata(F62, F0, F0);
4385     __ mov(to, G1);
4386     __ and3(to, -8, to);
4387     __ stpartialf(to, G2, F0, Assembler::ASI_PST8_PRIMARY);
4388     __ stf(FloatRegisterImpl::D, F56, to, 8);
4389     __ stf(FloatRegisterImpl::D, F2, to, 16);
4390     __ stf(FloatRegisterImpl::D, F60, to, 24);
4391     __ add(to, 32, to);
4392     __ orn(G0, G2, G2);
4393     __ stpartialf(to, G2, F0, Assembler::ASI_PST8_PRIMARY);
4394     __ mov(G1, to);
4395 
4396     __ BIND(L_check_decrypt_loop_end256);
4397     __ add(from, 32, from);
4398     __ add(to, 32, to);
4399     __ subcc(len_reg, 32, len_reg);
4400     __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks256);
4401     __ delayed()->nop();
4402 
4403     __ BIND(L_cbcdec_end);
4404     // re-init initial vector for next block; 8-byte alignment is guaranteed
4405     __ stx(L0, rvec, 0);
4406     __ stx(L1, rvec, 8);
4407     __ mov(L7, I0);
4408     __ ret();
4409     __ delayed()->restore();
4410 
4411     return start;
4412   }
4413 
4414   address generate_sha1_implCompress(bool multi_block, const char *name) {
4415     __ align(CodeEntryAlignment);
4416     StubCodeMark mark(this, "StubRoutines", name);
4417     address start = __ pc();
4418 
4419     Label L_sha1_loop, L_sha1_unaligned_input, L_sha1_unaligned_input_loop;
4420     int i;
4421 
4422     Register buf   = O0; // byte[] source+offset
4423     Register state = O1; // int[]  SHA.state
4424     Register ofs   = O2; // int    offset
4425     Register limit = O3; // int    limit
4426 
4427     // load state into F0-F4
4428     for (i = 0; i < 5; i++) {
4429       __ ldf(FloatRegisterImpl::S, state, i*4, as_FloatRegister(i));
4430     }
4431 
4432     __ andcc(buf, 7, G0);
4433     __ br(Assembler::notZero, false, Assembler::pn, L_sha1_unaligned_input);
4434     __ delayed()->nop();
4435 
4436     __ BIND(L_sha1_loop);
4437     // load buf into F8-F22
4438     for (i = 0; i < 8; i++) {
4439       __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 8));
4440     }
4441     __ sha1();
4442     if (multi_block) {
4443       __ add(ofs, 64, ofs);
4444       __ add(buf, 64, buf);
4445       __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha1_loop);
4446       __ mov(ofs, O0); // to be returned
4447     }
4448 
4449     // store F0-F4 into state and return
4450     for (i = 0; i < 4; i++) {
4451       __ stf(FloatRegisterImpl::S, as_FloatRegister(i), state, i*4);
4452     }
4453     __ retl();
4454     __ delayed()->stf(FloatRegisterImpl::S, F4, state, 0x10);
4455 
4456     __ BIND(L_sha1_unaligned_input);
4457     __ alignaddr(buf, G0, buf);
4458 
4459     __ BIND(L_sha1_unaligned_input_loop);
4460     // load buf into F8-F22
4461     for (i = 0; i < 9; i++) {
4462       __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 8));
4463     }
4464     for (i = 0; i < 8; i++) {
4465       __ faligndata(as_FloatRegister(i*2 + 8), as_FloatRegister(i*2 + 10), as_FloatRegister(i*2 + 8));
4466     }
4467     __ sha1();
4468     if (multi_block) {
4469       __ add(ofs, 64, ofs);
4470       __ add(buf, 64, buf);
4471       __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha1_unaligned_input_loop);
4472       __ mov(ofs, O0); // to be returned
4473     }
4474 
4475     // store F0-F4 into state and return
4476     for (i = 0; i < 4; i++) {
4477       __ stf(FloatRegisterImpl::S, as_FloatRegister(i), state, i*4);
4478     }
4479     __ retl();
4480     __ delayed()->stf(FloatRegisterImpl::S, F4, state, 0x10);
4481 
4482     return start;
4483   }
4484 
4485   address generate_sha256_implCompress(bool multi_block, const char *name) {
4486     __ align(CodeEntryAlignment);
4487     StubCodeMark mark(this, "StubRoutines", name);
4488     address start = __ pc();
4489 
4490     Label L_sha256_loop, L_sha256_unaligned_input, L_sha256_unaligned_input_loop;
4491     int i;
4492 
4493     Register buf   = O0; // byte[] source+offset
4494     Register state = O1; // int[]  SHA2.state
4495     Register ofs   = O2; // int    offset
4496     Register limit = O3; // int    limit
4497 
4498     // load state into F0-F7
4499     for (i = 0; i < 8; i++) {
4500       __ ldf(FloatRegisterImpl::S, state, i*4, as_FloatRegister(i));
4501     }
4502 
4503     __ andcc(buf, 7, G0);
4504     __ br(Assembler::notZero, false, Assembler::pn, L_sha256_unaligned_input);
4505     __ delayed()->nop();
4506 
4507     __ BIND(L_sha256_loop);
4508     // load buf into F8-F22
4509     for (i = 0; i < 8; i++) {
4510       __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 8));
4511     }
4512     __ sha256();
4513     if (multi_block) {
4514       __ add(ofs, 64, ofs);
4515       __ add(buf, 64, buf);
4516       __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha256_loop);
4517       __ mov(ofs, O0); // to be returned
4518     }
4519 
4520     // store F0-F7 into state and return
4521     for (i = 0; i < 7; i++) {
4522       __ stf(FloatRegisterImpl::S, as_FloatRegister(i), state, i*4);
4523     }
4524     __ retl();
4525     __ delayed()->stf(FloatRegisterImpl::S, F7, state, 0x1c);
4526 
4527     __ BIND(L_sha256_unaligned_input);
4528     __ alignaddr(buf, G0, buf);
4529 
4530     __ BIND(L_sha256_unaligned_input_loop);
4531     // load buf into F8-F22
4532     for (i = 0; i < 9; i++) {
4533       __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 8));
4534     }
4535     for (i = 0; i < 8; i++) {
4536       __ faligndata(as_FloatRegister(i*2 + 8), as_FloatRegister(i*2 + 10), as_FloatRegister(i*2 + 8));
4537     }
4538     __ sha256();
4539     if (multi_block) {
4540       __ add(ofs, 64, ofs);
4541       __ add(buf, 64, buf);
4542       __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha256_unaligned_input_loop);
4543       __ mov(ofs, O0); // to be returned
4544     }
4545 
4546     // store F0-F7 into state and return
4547     for (i = 0; i < 7; i++) {
4548       __ stf(FloatRegisterImpl::S, as_FloatRegister(i), state, i*4);
4549     }
4550     __ retl();
4551     __ delayed()->stf(FloatRegisterImpl::S, F7, state, 0x1c);
4552 
4553     return start;
4554   }
4555 
4556   address generate_sha512_implCompress(bool multi_block, const char *name) {
4557     __ align(CodeEntryAlignment);
4558     StubCodeMark mark(this, "StubRoutines", name);
4559     address start = __ pc();
4560 
4561     Label L_sha512_loop, L_sha512_unaligned_input, L_sha512_unaligned_input_loop;
4562     int i;
4563 
4564     Register buf   = O0; // byte[] source+offset
4565     Register state = O1; // long[] SHA5.state
4566     Register ofs   = O2; // int    offset
4567     Register limit = O3; // int    limit
4568 
4569     // load state into F0-F14
4570     for (i = 0; i < 8; i++) {
4571       __ ldf(FloatRegisterImpl::D, state, i*8, as_FloatRegister(i*2));
4572     }
4573 
4574     __ andcc(buf, 7, G0);
4575     __ br(Assembler::notZero, false, Assembler::pn, L_sha512_unaligned_input);
4576     __ delayed()->nop();
4577 
4578     __ BIND(L_sha512_loop);
4579     // load buf into F16-F46
4580     for (i = 0; i < 16; i++) {
4581       __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 16));
4582     }
4583     __ sha512();
4584     if (multi_block) {
4585       __ add(ofs, 128, ofs);
4586       __ add(buf, 128, buf);
4587       __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha512_loop);
4588       __ mov(ofs, O0); // to be returned
4589     }
4590 
4591     // store F0-F14 into state and return
4592     for (i = 0; i < 7; i++) {
4593       __ stf(FloatRegisterImpl::D, as_FloatRegister(i*2), state, i*8);
4594     }
4595     __ retl();
4596     __ delayed()->stf(FloatRegisterImpl::D, F14, state, 0x38);
4597 
4598     __ BIND(L_sha512_unaligned_input);
4599     __ alignaddr(buf, G0, buf);
4600 
4601     __ BIND(L_sha512_unaligned_input_loop);
4602     // load buf into F16-F46
4603     for (i = 0; i < 17; i++) {
4604       __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 16));
4605     }
4606     for (i = 0; i < 16; i++) {
4607       __ faligndata(as_FloatRegister(i*2 + 16), as_FloatRegister(i*2 + 18), as_FloatRegister(i*2 + 16));
4608     }
4609     __ sha512();
4610     if (multi_block) {
4611       __ add(ofs, 128, ofs);
4612       __ add(buf, 128, buf);
4613       __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha512_unaligned_input_loop);
4614       __ mov(ofs, O0); // to be returned
4615     }
4616 
4617     // store F0-F14 into state and return
4618     for (i = 0; i < 7; i++) {
4619       __ stf(FloatRegisterImpl::D, as_FloatRegister(i*2), state, i*8);
4620     }
4621     __ retl();
4622     __ delayed()->stf(FloatRegisterImpl::D, F14, state, 0x38);
4623 
4624     return start;
4625   }
4626 
4627   /* Single and multi-block ghash operations */
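  //
  // Per 16-byte block the stub computes, in GF(2^128),
  //
  //   state = (state ^ data) * subkeyH   mod  x^128 + x^7 + x^2 + x + 1
  //
  // (a hedged summary of standard GHASH; the 0xE1 << 56 constant below is the
  // top word of that reduction polynomial in the bit order used here). The
  // 64x64 carry-less products come from the VIS3 xmulx/xmulxhi instructions,
  // and the shift/xor sequence that follows folds the 256-bit product back
  // into 128 bits.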
4628   address generate_ghash_processBlocks() {
4629       __ align(CodeEntryAlignment);
4630       Label L_ghash_loop, L_aligned, L_main;
4631       StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4632       address start = __ pc();
4633 
4634       Register state = I0;
4635       Register subkeyH = I1;
4636       Register data = I2;
4637       Register len = I3;
4638 
4639       __ save_frame(0);
4640 
4641       __ ldx(state, 0, O0);
4642       __ ldx(state, 8, O1);
4643 
4644       // Loop label for multiblock operations
4645       __ BIND(L_ghash_loop);
4646 
4647       // Check if 'data' is unaligned
4648       __ andcc(data, 7, G1);
4649       __ br(Assembler::zero, false, Assembler::pt, L_aligned);
4650       __ delayed()->nop();
4651 
4652       Register left_shift = L1;
4653       Register right_shift = L2;
4654       Register data_ptr = L3;
4655 
4656       // Get left and right shift values in bits
4657       __ sll(G1, LogBitsPerByte, left_shift);
4658       __ mov(64, right_shift);
4659       __ sub(right_shift, left_shift, right_shift);
4660 
4661       // Align to read 'data'
4662       __ sub(data, G1, data_ptr);
4663 
4664       // Load first 8 bytes of 'data'
4665       __ ldx(data_ptr, 0, O4);
4666       __ sllx(O4, left_shift, O4);
4667       __ ldx(data_ptr, 8, O5);
4668       __ srlx(O5, right_shift, G4);
4669       __ bset(G4, O4);
4670 
4671       // Load second 8 bytes of 'data'
4672       __ sllx(O5, left_shift, O5);
4673       __ ldx(data_ptr, 16, G4);
4674       __ srlx(G4, right_shift, G4);
4675       __ ba(L_main);
4676       __ delayed()->bset(G4, O5);
4677 
4678       // If 'data' is aligned, load normally
4679       __ BIND(L_aligned);
4680       __ ldx(data, 0, O4);
4681       __ ldx(data, 8, O5);
4682 
4683       __ BIND(L_main);
4684       __ ldx(subkeyH, 0, O2);
4685       __ ldx(subkeyH, 8, O3);
4686 
4687       __ xor3(O0, O4, O0);
4688       __ xor3(O1, O5, O1);
4689 
4690       __ xmulxhi(O0, O3, G3);
4691       __ xmulx(O0, O2, O5);
4692       __ xmulxhi(O1, O2, G4);
4693       __ xmulxhi(O1, O3, G5);
4694       __ xmulx(O0, O3, G1);
4695       __ xmulx(O1, O3, G2);
4696       __ xmulx(O1, O2, O3);
4697       __ xmulxhi(O0, O2, O4);
4698 
4699       __ mov(0xE1, O0);
4700       __ sllx(O0, 56, O0);
4701 
4702       __ xor3(O5, G3, O5);
4703       __ xor3(O5, G4, O5);
4704       __ xor3(G5, G1, G1);
4705       __ xor3(G1, O3, G1);
4706       __ srlx(G2, 63, O1);
4707       __ srlx(G1, 63, G3);
4708       __ sllx(G2, 63, O3);
4709       __ sllx(G2, 58, O2);
4710       __ xor3(O3, O2, O2);
4711 
4712       __ sllx(G1, 1, G1);
4713       __ or3(G1, O1, G1);
4714 
4715       __ xor3(G1, O2, G1);
4716 
4717       __ sllx(G2, 1, G2);
4718 
4719       __ xmulxhi(G1, O0, O1);
4720       __ xmulx(G1, O0, O2);
4721       __ xmulxhi(G2, O0, O3);
4722       __ xmulx(G2, O0, G1);
4723 
4724       __ xor3(O4, O1, O4);
4725       __ xor3(O5, O2, O5);
4726       __ xor3(O5, O3, O5);
4727 
4728       __ sllx(O4, 1, O2);
4729       __ srlx(O5, 63, O3);
4730 
4731       __ or3(O2, O3, O0);
4732 
4733       __ sllx(O5, 1, O1);
4734       __ srlx(G1, 63, O2);
4735       __ or3(O1, O2, O1);
4736       __ xor3(O1, G3, O1);
4737 
4738       __ deccc(len);
4739       __ br(Assembler::notZero, true, Assembler::pt, L_ghash_loop);
4740       __ delayed()->add(data, 16, data);
4741 
4742       __ stx(O0, I0, 0);
4743       __ stx(O1, I0, 8);
4744 
4745       __ ret();
4746       __ delayed()->restore();
4747 
4748       return start;
4749   }
4750 
4751   /**
4752    *  Arguments:
4753    *
4754    * Inputs:
4755    *   O0   - int   crc
4756    *   O1   - byte* buf
4757    *   O2   - int   len
4758    *   O3   - int*  table
4759    *
4760    * Output:
4761    *   O0   - int crc result
4762    */
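  // For reference, the per-byte update being accelerated here is CRC-32C
  // (Castagnoli), i.e. with the reflected polynomial 0x82F63B78 (a hedged
  // bitwise sketch, ignoring the usual initial/final inversion; the stub
  // itself delegates to kernel_crc32c):
  //
  //   crc ^= b;
  //   for (int k = 0; k < 8; k++)
  //     crc = (crc & 1) ? (crc >> 1) ^ 0x82F63B78 : (crc >> 1);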
4763   address generate_updateBytesCRC32C() {
4764     assert(UseCRC32CIntrinsics, "need CRC32C instruction");
4765 
4766     __ align(CodeEntryAlignment);
4767     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
4768     address start = __ pc();
4769 
4770     const Register crc   = O0;  // crc
4771     const Register buf   = O1;  // source java byte array address
4772     const Register len   = O2;  // number of bytes
4773     const Register table = O3;  // byteTable
4774 
4775     __ kernel_crc32c(crc, buf, len, table);
4776 
4777     __ retl();
4778     __ delayed()->nop();
4779 
4780     return start;
4781   }
4782 
4783 #define ADLER32_NUM_TEMPS 16
4784 
4785   /**
4786    *  Arguments:
4787    *
4788    * Inputs:
4789    *   O0   - int   adler
4790    *   O1   - byte* buff
4791    *   O2   - int   len
4792    *
4793    * Output:
4794    *   O0   - int adler result
4795    */
4796   address generate_updateBytesAdler32() {
4797     __ align(CodeEntryAlignment);
4798     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
4799     address start = __ pc();
4800 
4801     Label L_cleanup_loop, L_cleanup_loop_check;
4802     Label L_main_loop_check, L_main_loop, L_inner_loop, L_inner_loop_check;
4803     Label L_nmax_check_done;
4804 
4805     // Aliases
4806     Register s1     = O0;
4807     Register s2     = O3;
4808     Register buff   = O1;
4809     Register len    = O2;
4810     Register temp[ADLER32_NUM_TEMPS] = {L0, L1, L2, L3, L4, L5, L6, L7, I0, I1, I2, I3, I4, I5, G3, I7};
4811 
4812     // Max number of bytes we can process before having to take the mod
4813     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
4814     unsigned long NMAX = 0x15B0;
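    // For reference, the scalar recurrence being pipelined below is (a hedged
    // sketch; MOD_ADLER just names the 65521 == 0xFFF1 constant used later):
    //
    //   for (int i = 0; i < len; i++) {
    //     s1 = (s1 + buff[i]) % MOD_ADLER;
    //     s2 = (s2 + s1)      % MOD_ADLER;
    //   }
    //   adler = (s2 << 16) | s1;
    //
    // Deferring the '%' until up to NMAX bytes have been summed is safe because
    // NMAX is chosen (per the bound above) so that s2 cannot overflow 32 bits.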
4815 
4816     // Zero-out the upper bits of len
4817     __ clruwu(len);
4818 
4819     // Create the mask 0xFFFF
4820     __ set64(0x00FFFF, O4, O5); // O5 is the temp register
4821 
4822     // s1 is initialized to the lower 16 bits of adler
4823     // s2 is initialized to the upper 16 bits of adler
4824     __ srlx(O0, 16, O5); // adler >> 16
4825     __ and3(O0, O4, s1); // s1  = (adler & 0xFFFF)
4826     __ and3(O5, O4, s2); // s2  = ((adler >> 16) & 0xFFFF)
4827 
4828     // The pipelined loop needs at least 16 elements for one iteration.
4829     // It does check this itself, but it is more efficient to branch straight to the cleanup loop.
4830     // Set up the constant for the cutoff check.
4831     __ mov(15, O4);
4832 
4833     // Check if we are above the cutoff, if not go to the cleanup loop immediately
4834     __ cmp_and_br_short(len, O4, Assembler::lessEqualUnsigned, Assembler::pt, L_cleanup_loop_check);
4835 
4836     // Free up some registers for our use
4837     for (int i = 0; i < ADLER32_NUM_TEMPS; i++) {
4838       __ movxtod(temp[i], as_FloatRegister(2*i));
4839     }
4840 
4841     // Loop maintenance stuff is done at the end of the loop, so skip to there
4842     __ ba_short(L_main_loop_check);
4843 
4844     __ BIND(L_main_loop);
4845 
4846     // Prologue for inner loop
4847     __ ldub(buff, 0, L0);
4848     __ dec(O5);
4849 
4850     for (int i = 1; i < 8; i++) {
4851       __ ldub(buff, i, temp[i]);
4852     }
4853 
4854     __ inc(buff, 8);
4855 
4856     // The inner loop processes 16 elements at a time; it might never execute if only 16 elements
4857     // remain to be processed by the outer loop
4858     __ ba_short(L_inner_loop_check);
4859 
4860     __ BIND(L_inner_loop);
4861 
4862     for (int i = 0; i < 8; i++) {
4863       __ ldub(buff, (2*i), temp[(8+(2*i)) % ADLER32_NUM_TEMPS]);
4864       __ add(s1, temp[i], s1);
4865       __ ldub(buff, (2*i)+1, temp[(8+(2*i)+1) % ADLER32_NUM_TEMPS]);
4866       __ add(s2, s1, s2);
4867     }
4868 
4869     // Original temp 0-7 used and new loads to temp 0-7 issued
4870     // temp 8-15 ready to be consumed
4871     __ add(s1, I0, s1);
4872     __ dec(O5);
4873     __ add(s2, s1, s2);
4874     __ add(s1, I1, s1);
4875     __ inc(buff, 16);
4876     __ add(s2, s1, s2);
4877 
4878     for (int i = 0; i < 6; i++) {
4879       __ add(s1, temp[10+i], s1);
4880       __ add(s2, s1, s2);
4881     }
4882 
4883     __ BIND(L_inner_loop_check);
4884     __ nop();
4885     __ cmp_and_br_short(O5, 0, Assembler::notEqual, Assembler::pt, L_inner_loop);
4886 
4887     // Epilogue
4888     for (int i = 0; i < 4; i++) {
4889       __ ldub(buff, (2*i), temp[8+(2*i)]);
4890       __ add(s1, temp[i], s1);
4891       __ ldub(buff, (2*i)+1, temp[8+(2*i)+1]);
4892       __ add(s2, s1, s2);
4893     }
4894 
4895     __ add(s1, temp[4], s1);
4896     __ inc(buff, 8);
4897 
4898     for (int i = 0; i < 11; i++) {
4899       __ add(s2, s1, s2);
4900       __ add(s1, temp[5+i], s1);
4901     }
4902 
4903     __ add(s2, s1, s2);
4904 
4905     // Take the mod for s1 and s2
4906     __ set64(0xFFF1, L0, L1);
4907     __ udivx(s1, L0, L1);
4908     __ udivx(s2, L0, L2);
4909     __ mulx(L0, L1, L1);
4910     __ mulx(L0, L2, L2);
4911     __ sub(s1, L1, s1);
4912     __ sub(s2, L2, s2);
4913 
4914     // Make sure there is something left to process
4915     __ BIND(L_main_loop_check);
4916     __ set64(NMAX, L0, L1);
4917     // k = len < NMAX ? len : NMAX
4918     __ cmp_and_br_short(len, L0, Assembler::greaterEqualUnsigned, Assembler::pt, L_nmax_check_done);
4919     __ andn(len, 0x0F, L0); // only loop a multiple of 16 times
4920     __ BIND(L_nmax_check_done);
4921     __ mov(L0, O5);
4922     __ sub(len, L0, len); // len -= k
4923 
4924     __ srlx(O5, 4, O5); // multiples of 16
4925     __ cmp_and_br_short(O5, 0, Assembler::notEqual, Assembler::pt, L_main_loop);
4926 
4927     // Restore anything we used, take the mod one last time, combine and return
4928     // Restore any registers we saved
4929     for (int i = 0; i < ADLER32_NUM_TEMPS; i++) {
4930       __ movdtox(as_FloatRegister(2*i), temp[i]);
4931     }
4932 
4933     // There might be nothing left to process
4934     __ ba_short(L_cleanup_loop_check);
4935 
4936     __ BIND(L_cleanup_loop);
4937     __ ldub(buff, 0, O4); // load single byte from buffer
4938     __ inc(buff); // buff++
4939     __ add(s1, O4, s1); // s1 += *buff++;
4940     __ dec(len); // len--
4941     __ add(s1, s2, s2); // s2 += s1;
4942     __ BIND(L_cleanup_loop_check);
4943     __ nop();
4944     __ cmp_and_br_short(len, 0, Assembler::notEqual, Assembler::pt, L_cleanup_loop);
4945 
4946     // Take the mod one last time
4947     __ set64(0xFFF1, O1, O2);
4948     __ udivx(s1, O1, O2);
4949     __ udivx(s2, O1, O5);
4950     __ mulx(O1, O2, O2);
4951     __ mulx(O1, O5, O5);
4952     __ sub(s1, O2, s1);
4953     __ sub(s2, O5, s2);
4954 
4955     // Combine lower bits and higher bits
4956     __ sllx(s2, 16, s2); // s2 = s2 << 16
4957     __ or3(s1, s2, s1);  // adler = s2 | s1
4958     // Final return value is in O0
4959     __ retl();
4960     __ delayed()->nop();
4961 
4962     return start;
4963   }
4964 
4965   /**
4966    *  Arguments:
4967    *
4968    * Inputs:
4969    *   O0   - int   crc
4970    *   O1   - byte* buf
4971    *   O2   - int   len
4972    *   O3   - int*  table
4973    *
4974    * Output:
4975    *   O0   - int crc result
4976    */
4977   address generate_updateBytesCRC32() {
4978     assert(UseCRC32Intrinsics, "need VIS3 instructions");
4979 
4980     __ align(CodeEntryAlignment);
4981     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
4982     address start = __ pc();
4983 
4984     const Register crc   = O0; // crc
4985     const Register buf   = O1; // source java byte array address
4986     const Register len   = O2; // length
4987     const Register table = O3; // crc_table address (reuse register)
4988 
4989     __ kernel_crc32(crc, buf, len, table);
4990 
4991     __ retl();
4992     __ delayed()->nop();
4993 
4994     return start;
4995   }
4996 
4997   /**
4998    * Arguments:
4999    *
5000    * Inputs:
5001    *   I0   - int* x-addr
5002    *   I1   - int  x-len
5003    *   I2   - int* y-addr
5004    *   I3   - int  y-len
5005    *   I4   - int* z-addr   (output vector)
5006    *   I5   - int  z-len
5007    */
5008   address generate_multiplyToLen() {
5009     assert(UseMultiplyToLenIntrinsic, "need VIS3 instructions");
5010 
5011     __ align(CodeEntryAlignment);
5012     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
5013     address start = __ pc();
5014 
5015     __ save_frame(0);
5016 
5017     const Register xptr = I0; // input address
5018     const Register xlen = I1; // ...and length in 32b-words
5019     const Register yptr = I2; // input address
5020     const Register ylen = I3; // ...and length in 32b-words
5021     const Register zptr = I4; // output address
5022     const Register zlen = I5; // ...and length in 32b-words
5023 
5024     /* The minimal "limb" representation suggests that odd length vectors are as
5025      * likely as even length dittos. This in turn suggests that we need to cope
5026      * with odd/even length arrays and data not aligned properly for 64-bit read
5027      * and write operations. We thus use a number of different kernels:
5028      *
5029      *   if (is_even(x.len) && is_even(y.len))
5030      *      if (is_align64(x) && is_align64(y) && is_align64(z))
5031      *         if (x.len == y.len && 16 <= x.len && x.len <= 64)
5032      *            memv_mult_mpmul(...)
5033      *         else
5034      *            memv_mult_64x64(...)
5035      *      else
5036      *         memv_mult_64x64u(...)
5037      *   else
5038      *      memv_mult_32x32(...)
5039      *
5040      * Here we assume VIS3 support (for 'umulxhi', 'addxc' and 'addxccc').
5041      * In case CBCOND instructions are supported, we will use 'cxbX'. If the
5042      * MPMUL instruction is supported, we will generate a kernel using 'mpmul'
5043      * (for vectors with proper characteristics).
5044      */
5045     const Register tmp0 = L0;
5046     const Register tmp1 = L1;
5047 
5048     Label L_mult_32x32;
5049     Label L_mult_64x64u;
5050     Label L_mult_64x64;
5051     Label L_exit;
5052 
5053     if_both_even(xlen, ylen, tmp0, false, L_mult_32x32);
5054     if_all3_aligned(xptr, yptr, zptr, tmp1, 64, false, L_mult_64x64u);
5055 
5056     if (UseMPMUL) {
5057       if_eq(xlen, ylen, false, L_mult_64x64);
5058       if_in_rng(xlen, 16, 64, tmp0, tmp1, false, L_mult_64x64);
5059 
5060       // 1. Multiply naturally aligned 64b-datums using a generic 'mpmul' kernel,
5061       //    operating on equal length vectors of size [16..64].
5062       gen_mult_mpmul(xlen, xptr, yptr, zptr, L_exit);
5063     }
5064 
5065     // 2. Multiply naturally aligned 64-bit datums (64x64).
5066     __ bind(L_mult_64x64);
5067     gen_mult_64x64(xptr, xlen, yptr, ylen, zptr, zlen, L_exit);
5068 
5069     // 3. Multiply unaligned 64-bit datums (64x64).
5070     __ bind(L_mult_64x64u);
5071     gen_mult_64x64_unaligned(xptr, xlen, yptr, ylen, zptr, zlen, L_exit);
5072 
5073     // 4. Multiply naturally aligned 32-bit datums (32x32).
5074     __ bind(L_mult_32x32);
5075     gen_mult_32x32(xptr, xlen, yptr, ylen, zptr, zlen, L_exit);
5076 
5077     __ bind(L_exit);
5078     __ ret();
5079     __ delayed()->restore();
5080 
5081     return start;
5082   }
5083 
5084   // Additional help functions used by multiplyToLen generation.
5085 
5086   void if_both_even(Register r1, Register r2, Register tmp, bool iseven, Label &L)
5087   {
5088     __ or3(r1, r2, tmp);
5089     __ andcc(tmp, 0x1, tmp);
5090     __ br_icc_zero(iseven, Assembler::pn, L);
5091   }
5092 
5093   void if_all3_aligned(Register r1, Register r2, Register r3,
5094                        Register tmp, uint align, bool isalign, Label &L)
5095   {
5096     __ or3(r1, r2, tmp);
5097     __ or3(r3, tmp, tmp);
5098     __ andcc(tmp, (align - 1), tmp);
5099     __ br_icc_zero(isalign, Assembler::pn, L);
5100   }
5101 
5102   void if_eq(Register x, Register y, bool iseq, Label &L)
5103   {
5104     Assembler::Condition cf = (iseq ? Assembler::equal : Assembler::notEqual);
5105     __ cmp_and_br_short(x, y, cf, Assembler::pt, L);
5106   }
5107 
5108   void if_in_rng(Register x, int lb, int ub, Register t1, Register t2, bool inrng, Label &L)
5109   {
5110     assert(Assembler::is_simm13(lb), "Small ints only!");
5111     assert(Assembler::is_simm13(ub), "Small ints only!");
5112     // Compute (x - lb) * (ub - x) >= 0
5113     // NOTE: With the local use of this routine, we rely on small integers to
5114     //       guarantee that we do not overflow in the multiplication.
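    //       For example, with lb = 16 and ub = 64 (the values multiplyToLen uses):
    //         x = 20  ->  (20 - 16) * (64 - 20) =  4 * 44 =  176 >= 0  (in range)
    //         x = 70  ->  (70 - 16) * (64 - 70) = 54 * -6 = -324 <  0  (out of range)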
5115     __ add(G0, ub, t2);
5116     __ sub(x, lb, t1);
5117     __ sub(t2, x, t2);
5118     __ mulx(t1, t2, t1);
5119     Assembler::Condition cf = (inrng ? Assembler::greaterEqual : Assembler::less);
5120     __ cmp_and_br_short(t1, G0, cf, Assembler::pt, L);
5121   }
5122 
5123   void ldd_entry(Register base, Register offs, FloatRegister dest)
5124   {
5125     __ ldd(base, offs, dest);
5126     __ inc(offs, 8);
5127   }
5128 
5129   void ldx_entry(Register base, Register offs, Register dest)
5130   {
5131     __ ldx(base, offs, dest);
5132     __ inc(offs, 8);
5133   }
5134 
5135   void mpmul_entry(int m, Label &next)
5136   {
5137     __ mpmul(m);
5138     __ cbcond(Assembler::equal, Assembler::icc, G0, G0, next);
5139   }
5140 
5141   void stx_entry(Label &L, Register r1, Register r2, Register base, Register offs)
5142   {
5143     __ bind(L);
5144     __ stx(r1, base, offs);
5145     __ inc(offs, 8);
5146     __ stx(r2, base, offs);
5147     __ inc(offs, 8);
5148   }
5149 
5150   void offs_entry(Label &Lbl0, Label &Lbl1)
5151   {
5152     assert(Lbl0.is_bound(), "must be");
5153     assert(Lbl1.is_bound(), "must be");
5154 
5155     int offset = Lbl0.loc_pos() - Lbl1.loc_pos();
5156 
5157     __ emit_data(offset);
5158   }
5159 
5160   /* Generate the actual multiplication kernels for BigInteger vectors:
5161    *
5162    *   1. gen_mult_mpmul(...)
5163    *
5164    *   2. gen_mult_64x64(...)
5165    *
5166    *   3. gen_mult_64x64_unaligned(...)
5167    *
5168    *   4. gen_mult_32x32(...)
5169    */
5170   void gen_mult_mpmul(Register len, Register xptr, Register yptr, Register zptr,
5171                       Label &L_exit)
5172   {
5173     const Register zero = G0;
5174     const Register gxp  = G1;   // Need to use global registers across RWs.
5175     const Register gyp  = G2;
5176     const Register gzp  = G3;
5177     const Register disp = G4;
5178     const Register offs = G5;
5179 
5180     __ mov(xptr, gxp);
5181     __ mov(yptr, gyp);
5182     __ mov(zptr, gzp);
5183 
5184     /* Compute jump vector entry:
5185      *
5186      *   1. mpmul input size (0..31) x 64b
5187      *   2. vector input size in 32b limbs (even number)
5188      *   3. branch entries in reverse order (31..0), using two
5189      *      instructions per entry (2 * 4 bytes).
5190      *
5191      *   displacement = byte_offset(bra_offset(len))
5192      *                = byte_offset((64 - len)/2)
5193      *                = 8 * (64 - len)/2
5194      *                = 4 * (64 - len)
5195      */
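    // For example (a worked instance of the formula above): the maximum len == 64
    // gives disp == 0, so execution falls through all 32 load entries below, while
    // len == 16 gives disp == 4 * (64 - 16) == 192 bytes, i.e. 24 of the 8-byte
    // entries are skipped and only the last 8 doubleword loads are executed.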
5196     Register temp = I5;         // Alright to use input regs. in first batch.
5197 
5198     __ sub(zero, len, temp);
5199     __ add(temp, 64, temp);
5200     __ sllx(temp, 2, disp);     // disp := (64 - len) << 2
5201 
5202     // Dispatch relative current PC, into instruction table below.
5203     __ rdpc(temp);
5204     __ add(temp, 16, temp);
5205     __ jmp(temp, disp);
5206     __ delayed()->clr(offs);
5207 
5208     ldd_entry(gxp, offs, F22);
5209     ldd_entry(gxp, offs, F20);
5210     ldd_entry(gxp, offs, F18);
5211     ldd_entry(gxp, offs, F16);
5212     ldd_entry(gxp, offs, F14);
5213     ldd_entry(gxp, offs, F12);
5214     ldd_entry(gxp, offs, F10);
5215     ldd_entry(gxp, offs, F8);
5216     ldd_entry(gxp, offs, F6);
5217     ldd_entry(gxp, offs, F4);
5218     ldx_entry(gxp, offs, I5);
5219     ldx_entry(gxp, offs, I4);
5220     ldx_entry(gxp, offs, I3);
5221     ldx_entry(gxp, offs, I2);
5222     ldx_entry(gxp, offs, I1);
5223     ldx_entry(gxp, offs, I0);
5224     ldx_entry(gxp, offs, L7);
5225     ldx_entry(gxp, offs, L6);
5226     ldx_entry(gxp, offs, L5);
5227     ldx_entry(gxp, offs, L4);
5228     ldx_entry(gxp, offs, L3);
5229     ldx_entry(gxp, offs, L2);
5230     ldx_entry(gxp, offs, L1);
5231     ldx_entry(gxp, offs, L0);
5232     ldd_entry(gxp, offs, F2);
5233     ldd_entry(gxp, offs, F0);
5234     ldx_entry(gxp, offs, O5);
5235     ldx_entry(gxp, offs, O4);
5236     ldx_entry(gxp, offs, O3);
5237     ldx_entry(gxp, offs, O2);
5238     ldx_entry(gxp, offs, O1);
5239     ldx_entry(gxp, offs, O0);
5240 
5241     __ save(SP, -176, SP);
5242 
5243     const Register addr = gxp;  // Alright to reuse 'gxp'.
5244 
5245     // Dispatch relative current PC, into instruction table below.
5246     __ rdpc(addr);
5247     __ add(addr, 16, addr);
5248     __ jmp(addr, disp);
5249     __ delayed()->clr(offs);
5250 
5251     ldd_entry(gyp, offs, F58);
5252     ldd_entry(gyp, offs, F56);
5253     ldd_entry(gyp, offs, F54);
5254     ldd_entry(gyp, offs, F52);
5255     ldd_entry(gyp, offs, F50);
5256     ldd_entry(gyp, offs, F48);
5257     ldd_entry(gyp, offs, F46);
5258     ldd_entry(gyp, offs, F44);
5259     ldd_entry(gyp, offs, F42);
5260     ldd_entry(gyp, offs, F40);
5261     ldd_entry(gyp, offs, F38);
5262     ldd_entry(gyp, offs, F36);
5263     ldd_entry(gyp, offs, F34);
5264     ldd_entry(gyp, offs, F32);
5265     ldd_entry(gyp, offs, F30);
5266     ldd_entry(gyp, offs, F28);
5267     ldd_entry(gyp, offs, F26);
5268     ldd_entry(gyp, offs, F24);
5269     ldx_entry(gyp, offs, O5);
5270     ldx_entry(gyp, offs, O4);
5271     ldx_entry(gyp, offs, O3);
5272     ldx_entry(gyp, offs, O2);
5273     ldx_entry(gyp, offs, O1);
5274     ldx_entry(gyp, offs, O0);
5275     ldx_entry(gyp, offs, L7);
5276     ldx_entry(gyp, offs, L6);
5277     ldx_entry(gyp, offs, L5);
5278     ldx_entry(gyp, offs, L4);
5279     ldx_entry(gyp, offs, L3);
5280     ldx_entry(gyp, offs, L2);
5281     ldx_entry(gyp, offs, L1);
5282     ldx_entry(gyp, offs, L0);
5283 
5284     __ save(SP, -176, SP);
5285     __ save(SP, -176, SP);
5286     __ save(SP, -176, SP);
5287     __ save(SP, -176, SP);
5288     __ save(SP, -176, SP);
5289 
5290     Label L_mpmul_restore_4, L_mpmul_restore_3, L_mpmul_restore_2;
5291     Label L_mpmul_restore_1, L_mpmul_restore_0;
5292 
5293     // Dispatch relative current PC, into instruction table below.
5294     __ rdpc(addr);
5295     __ add(addr, 16, addr);
5296     __ jmp(addr, disp);
5297     __ delayed()->clr(offs);
5298 
5299     mpmul_entry(31, L_mpmul_restore_0);
5300     mpmul_entry(30, L_mpmul_restore_0);
5301     mpmul_entry(29, L_mpmul_restore_0);
5302     mpmul_entry(28, L_mpmul_restore_0);
5303     mpmul_entry(27, L_mpmul_restore_1);
5304     mpmul_entry(26, L_mpmul_restore_1);
5305     mpmul_entry(25, L_mpmul_restore_1);
5306     mpmul_entry(24, L_mpmul_restore_1);
5307     mpmul_entry(23, L_mpmul_restore_1);
5308     mpmul_entry(22, L_mpmul_restore_1);
5309     mpmul_entry(21, L_mpmul_restore_1);
5310     mpmul_entry(20, L_mpmul_restore_2);
5311     mpmul_entry(19, L_mpmul_restore_2);
5312     mpmul_entry(18, L_mpmul_restore_2);
5313     mpmul_entry(17, L_mpmul_restore_2);
5314     mpmul_entry(16, L_mpmul_restore_2);
5315     mpmul_entry(15, L_mpmul_restore_2);
5316     mpmul_entry(14, L_mpmul_restore_2);
5317     mpmul_entry(13, L_mpmul_restore_3);
5318     mpmul_entry(12, L_mpmul_restore_3);
5319     mpmul_entry(11, L_mpmul_restore_3);
5320     mpmul_entry(10, L_mpmul_restore_3);
5321     mpmul_entry( 9, L_mpmul_restore_3);
5322     mpmul_entry( 8, L_mpmul_restore_3);
5323     mpmul_entry( 7, L_mpmul_restore_3);
5324     mpmul_entry( 6, L_mpmul_restore_4);
5325     mpmul_entry( 5, L_mpmul_restore_4);
5326     mpmul_entry( 4, L_mpmul_restore_4);
5327     mpmul_entry( 3, L_mpmul_restore_4);
5328     mpmul_entry( 2, L_mpmul_restore_4);
5329     mpmul_entry( 1, L_mpmul_restore_4);
5330     mpmul_entry( 0, L_mpmul_restore_4);
5331 
5332     Label L_z31, L_z30, L_z29, L_z28, L_z27, L_z26, L_z25, L_z24;
5333     Label L_z23, L_z22, L_z21, L_z20, L_z19, L_z18, L_z17, L_z16;
5334     Label L_z15, L_z14, L_z13, L_z12, L_z11, L_z10, L_z09, L_z08;
5335     Label L_z07, L_z06, L_z05, L_z04, L_z03, L_z02, L_z01, L_z00;
5336 
5337     Label L_zst_base;    // Store sequence base address.
5338     __ bind(L_zst_base);
5339 
5340     stx_entry(L_z31, L7, L6, gzp, offs);
5341     stx_entry(L_z30, L5, L4, gzp, offs);
5342     stx_entry(L_z29, L3, L2, gzp, offs);
5343     stx_entry(L_z28, L1, L0, gzp, offs);
5344     __ restore();
5345     stx_entry(L_z27, O5, O4, gzp, offs);
5346     stx_entry(L_z26, O3, O2, gzp, offs);
5347     stx_entry(L_z25, O1, O0, gzp, offs);
5348     stx_entry(L_z24, L7, L6, gzp, offs);
5349     stx_entry(L_z23, L5, L4, gzp, offs);
5350     stx_entry(L_z22, L3, L2, gzp, offs);
5351     stx_entry(L_z21, L1, L0, gzp, offs);
5352     __ restore();
5353     stx_entry(L_z20, O5, O4, gzp, offs);
5354     stx_entry(L_z19, O3, O2, gzp, offs);
5355     stx_entry(L_z18, O1, O0, gzp, offs);
5356     stx_entry(L_z17, L7, L6, gzp, offs);
5357     stx_entry(L_z16, L5, L4, gzp, offs);
5358     stx_entry(L_z15, L3, L2, gzp, offs);
5359     stx_entry(L_z14, L1, L0, gzp, offs);
5360     __ restore();
5361     stx_entry(L_z13, O5, O4, gzp, offs);
5362     stx_entry(L_z12, O3, O2, gzp, offs);
5363     stx_entry(L_z11, O1, O0, gzp, offs);
5364     stx_entry(L_z10, L7, L6, gzp, offs);
5365     stx_entry(L_z09, L5, L4, gzp, offs);
5366     stx_entry(L_z08, L3, L2, gzp, offs);
5367     stx_entry(L_z07, L1, L0, gzp, offs);
5368     __ restore();
5369     stx_entry(L_z06, O5, O4, gzp, offs);
5370     stx_entry(L_z05, O3, O2, gzp, offs);
5371     stx_entry(L_z04, O1, O0, gzp, offs);
5372     stx_entry(L_z03, L7, L6, gzp, offs);
5373     stx_entry(L_z02, L5, L4, gzp, offs);
5374     stx_entry(L_z01, L3, L2, gzp, offs);
5375     stx_entry(L_z00, L1, L0, gzp, offs);
5376 
5377     __ restore();
5378     __ restore();
5379     // Exit out of 'mpmul' routine, back to multiplyToLen.
5380     __ ba_short(L_exit);
5381 
5382     Label L_zst_offs;
5383     __ bind(L_zst_offs);
5384 
5385     offs_entry(L_z31, L_zst_base);  // index 31: 2048x2048
5386     offs_entry(L_z30, L_zst_base);
5387     offs_entry(L_z29, L_zst_base);
5388     offs_entry(L_z28, L_zst_base);
5389     offs_entry(L_z27, L_zst_base);
5390     offs_entry(L_z26, L_zst_base);
5391     offs_entry(L_z25, L_zst_base);
5392     offs_entry(L_z24, L_zst_base);
5393     offs_entry(L_z23, L_zst_base);
5394     offs_entry(L_z22, L_zst_base);
5395     offs_entry(L_z21, L_zst_base);
5396     offs_entry(L_z20, L_zst_base);
5397     offs_entry(L_z19, L_zst_base);
5398     offs_entry(L_z18, L_zst_base);
5399     offs_entry(L_z17, L_zst_base);
5400     offs_entry(L_z16, L_zst_base);
5401     offs_entry(L_z15, L_zst_base);
5402     offs_entry(L_z14, L_zst_base);
5403     offs_entry(L_z13, L_zst_base);
5404     offs_entry(L_z12, L_zst_base);
5405     offs_entry(L_z11, L_zst_base);
5406     offs_entry(L_z10, L_zst_base);
5407     offs_entry(L_z09, L_zst_base);
5408     offs_entry(L_z08, L_zst_base);
5409     offs_entry(L_z07, L_zst_base);
5410     offs_entry(L_z06, L_zst_base);
5411     offs_entry(L_z05, L_zst_base);
5412     offs_entry(L_z04, L_zst_base);
5413     offs_entry(L_z03, L_zst_base);
5414     offs_entry(L_z02, L_zst_base);
5415     offs_entry(L_z01, L_zst_base);
5416     offs_entry(L_z00, L_zst_base);  // index  0:   64x64
5417 
5418     __ bind(L_mpmul_restore_4);
5419     __ restore();
5420     __ bind(L_mpmul_restore_3);
5421     __ restore();
5422     __ bind(L_mpmul_restore_2);
5423     __ restore();
5424     __ bind(L_mpmul_restore_1);
5425     __ restore();
5426     __ bind(L_mpmul_restore_0);
5427 
5428     // Dispatch via offset vector entry, into z-store sequence.
5429     Label L_zst_rdpc;
5430     __ bind(L_zst_rdpc);
5431 
5432     assert(L_zst_base.is_bound(), "must be");
5433     assert(L_zst_offs.is_bound(), "must be");
5434     assert(L_zst_rdpc.is_bound(), "must be");
5435 
5436     int dbase = L_zst_rdpc.loc_pos() - L_zst_base.loc_pos();
5437     int doffs = L_zst_rdpc.loc_pos() - L_zst_offs.loc_pos();
5438 
5439     temp = gyp;   // Alright to reuse 'gyp'.
5440 
5441     __ rdpc(addr);
5442     __ sub(addr, doffs, temp);
5443     __ srlx(disp, 1, disp);
5444     __ lduw(temp, disp, offs);
5445     __ sub(addr, dbase, temp);
5446     __ jmp(temp, offs);
5447     __ delayed()->clr(offs);
5448   }
5449 
5450   void gen_mult_64x64(Register xp, Register xn,
5451                       Register yp, Register yn,
5452                       Register zp, Register zn, Label &L_exit)
5453   {
5454     // Assuming that a stack frame has already been created, i.e. local and
5455     // output registers are available for immediate use.
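    // A hedged C-level sketch of the schoolbook multiply this kernel performs over
    // 64-bit limbs, most-significant limb first ('u128', 'lo64' and 'hi64' are
    // illustrative shorthands, not real types/helpers):
    //
    //   c = 0; y = yp[yn];
    //   for (i = xn, k = zn; i >= 0; i--, k--) {      // first partial-product row
    //     p = (u128) xp[i] * y + c;
    //     zp[k] = lo64(p); c = hi64(p);
    //   }
    //   zp[k] = c;
    //   for (j = yn - 1; j >= 0; j--) {               // remaining rows, accumulated into z
    //     c = 0; y = yp[j];
    //     for (i = xn, k = --zn; i >= 0; i--, k--) {
    //       p = (u128) xp[i] * y + zp[k] + c;
    //       zp[k] = lo64(p); c = hi64(p);
    //     }
    //     zp[k] = c;
    //   }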
5456 
5457     const Register ri = L0;     // Outer loop index, xv[i]
5458     const Register rj = L1;     // Inner loop index, yv[j]
5459     const Register rk = L2;     // Output loop index, zv[k]
5460     const Register rx = L4;     // x-vector datum [i]
5461     const Register ry = L5;     // y-vector datum [j]
5462     const Register rz = L6;     // z-vector datum [k]
5463     const Register rc = L7;     // carry over (to z-vector datum [k-1])
5464 
5465     const Register lop = O0;    // lo-64b product
5466     const Register hip = O1;    // hi-64b product
5467 
5468     const Register zero = G0;
5469 
5470     Label L_loop_i,  L_exit_loop_i;
5471     Label L_loop_j;
5472     Label L_loop_i2, L_exit_loop_i2;
5473 
5474     __ srlx(xn, 1, xn);         // index for u32 to u64 ditto
5475     __ srlx(yn, 1, yn);         // index for u32 to u64 ditto
5476     __ srlx(zn, 1, zn);         // index for u32 to u64 ditto
5477     __ dec(xn);                 // Adjust [0..(N/2)-1]
5478     __ dec(yn);
5479     __ dec(zn);
5480     __ clr(rc);                 // u64 c = 0
5481     __ sllx(xn, 3, ri);         // int i = xn (byte offset i = 8*xn)
5482     __ sllx(yn, 3, rj);         // int j = yn (byte offset j = 8*yn)
5483     __ sllx(zn, 3, rk);         // int k = zn (byte offset k = 8*zn)
5484     __ ldx(yp, rj, ry);         // u64 y = yp[yn]
5485 
5486     // for (int i = xn; i >= 0; i--)
5487     __ bind(L_loop_i);
5488 
5489     __ cmp_and_br_short(ri, 0,  // i >= 0
5490                         Assembler::less, Assembler::pn, L_exit_loop_i);
5491     __ ldx(xp, ri, rx);         // x = xp[i]
5492     __ mulx(rx, ry, lop);       // lo-64b-part of result 64x64
5493     __ umulxhi(rx, ry, hip);    // hi-64b-part of result 64x64
5494     __ addcc(rc, lop, lop);     // Accumulate lower order bits (producing carry)
5495     __ addxc(hip, zero, rc);    // carry over to next datum [k-1]
5496     __ stx(lop, zp, rk);        // z[k] = lop
5497     __ dec(rk, 8);              // k--
5498     __ dec(ri, 8);              // i--
5499     __ ba_short(L_loop_i);
5500 
5501     __ bind(L_exit_loop_i);
5502     __ stx(rc, zp, rk);         // z[k] = c
5503 
5504     // for (int j = yn - 1; j >= 0; j--)
5505     __ sllx(yn, 3, rj);         // int j = yn - 1 (byte offset j = 8*yn)
5506     __ dec(rj, 8);
5507 
5508     __ bind(L_loop_j);
5509 
5510     __ cmp_and_br_short(rj, 0,  // j >= 0
5511                         Assembler::less, Assembler::pn, L_exit);
5512     __ clr(rc);                 // u64 c = 0
5513     __ ldx(yp, rj, ry);         // u64 y = yp[j]
5514 
5515     // for (int i = xn, k = --zn; i >= 0; i--)
5516     __ dec(zn);                 // --zn
5517     __ sllx(xn, 3, ri);         // int i = xn (byte offset i = 8*xn)
5518     __ sllx(zn, 3, rk);         // int k = zn (byte offset k = 8*zn)
5519 
5520     __ bind(L_loop_i2);
5521 
5522     __ cmp_and_br_short(ri, 0,  // i >= 0
5523                         Assembler::less, Assembler::pn, L_exit_loop_i2);
5524     __ ldx(xp, ri, rx);         // x = xp[i]
5525     __ ldx(zp, rk, rz);         // z = zp[k], accumulator
5526     __ mulx(rx, ry, lop);       // lo-64b-part of result 64x64
5527     __ umulxhi(rx, ry, hip);    // hi-64b-part of result 64x64
5528     __ addcc(rz, rc, rz);       // Accumulate lower order bits,
5529     __ addxc(hip, zero, rc);    // Accumulate higher order bits to carry
5530     __ addcc(rz, lop, rz);      //    z += lo(p) + c
5531     __ addxc(rc, zero, rc);
5532     __ stx(rz, zp, rk);         // zp[k] = z
5533     __ dec(rk, 8);              // k--
5534     __ dec(ri, 8);              // i--
5535     __ ba_short(L_loop_i2);
5536 
5537     __ bind(L_exit_loop_i2);
5538     __ stx(rc, zp, rk);         // z[k] = c
5539     __ dec(rj, 8);              // j--
5540     __ ba_short(L_loop_j);
5541   }
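       // For reference, a rough C-level sketch of the schoolbook multiplication
       // emitted above (a documentation aid only, not compiled code; "u64"/"u128"
       // stand for 64/128-bit unsigned types, and the limb vectors are stored
       // most-significant word first, as in the generated stub):
       //
       //   u64 c = 0;
       //   int k = zn;
       //   for (int i = xn; i >= 0; i--, k--) {      // first row: z = x * y[yn]
       //     u128 p = (u128)x[i] * y[yn] + c;
       //     z[k] = (u64)p;                          // low 64 bits
       //     c    = (u64)(p >> 64);                  // carry into z[k-1]
       //   }
       //   z[k] = c;
       //   for (int j = yn - 1; j >= 0; j--) {       // remaining rows accumulate into z
       //     c = 0;
       //     k = --zn;
       //     for (int i = xn; i >= 0; i--, k--) {
       //       u128 p = (u128)x[i] * y[j] + z[k] + c;
       //       z[k] = (u64)p;
       //       c    = (u64)(p >> 64);
       //     }
       //     z[k] = c;
       //   }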
5542 
5543   void gen_mult_64x64_unaligned(Register xp, Register xn,
5544                                 Register yp, Register yn,
5545                                 Register zp, Register zn, Label &L_exit)
5546   {
5547     // Assuming that a stack frame has already been created, i.e. local and
5548     // output registers are available for use.
5549 
5550     const Register xpc = L0;    // Outer loop cursor, xp[i]
5551     const Register ypc = L1;    // Inner loop cursor, yp[j]
5552     const Register zpc = L2;    // Output loop cursor, zp[k]
5553     const Register rx  = L4;    // x-vector datum [i]
5554     const Register ry  = L5;    // y-vector datum [j]
5555     const Register rz  = L6;    // z-vector datum [k]
5556     const Register rc  = L7;    // carry over (to z-vector datum [k-1])
5557     const Register rt  = O2;
5558 
5559     const Register lop = O0;    // lo-64b product
5560     const Register hip = O1;    // hi-64b product
5561 
5562     const Register zero = G0;
5563 
5564     Label L_loop_i,  L_exit_loop_i;
5565     Label L_loop_j;
5566     Label L_loop_i2, L_exit_loop_i2;
5567 
5568     __ srlx(xn, 1, xn);         // convert u32 count to u64 limb count
5569     __ srlx(yn, 1, yn);         // ditto
5570     __ srlx(zn, 1, zn);         // ditto
5571     __ dec(xn);                 // Adjust [0..(N/2)-1]
5572     __ dec(yn);
5573     __ dec(zn);
5574     __ clr(rc);                 // u64 c = 0
5575     __ sllx(xn, 3, xpc);        // u32* xpc = &xp[xn] (byte offset 8*xn)
5576     __ add(xp, xpc, xpc);
5577     __ sllx(yn, 3, ypc);        // u32* ypc = &yp[yn] (byte offset 8*yn)
5578     __ add(yp, ypc, ypc);
5579     __ sllx(zn, 3, zpc);        // u32* zpc = &zp[zn] (byte offset 8*zn)
5580     __ add(zp, zpc, zpc);
5581     __ lduw(ypc, 0, rt);        // u64 y = yp[yn]
5582     __ lduw(ypc, 4, ry);        //   ...
5583     __ sllx(rt, 32, rt);
5584     __ or3(rt, ry, ry);
5585 
5586     // for (int i = xn; i >= 0; i--)
5587     __ bind(L_loop_i);
5588 
5589     __ cmp_and_brx_short(xpc, xp,// i >= 0
5590                          Assembler::lessUnsigned, Assembler::pn, L_exit_loop_i);
5591     __ lduw(xpc, 0, rt);        // u64 x = xp[i]
5592     __ lduw(xpc, 4, rx);        //   ...
5593     __ sllx(rt, 32, rt);
5594     __ or3(rt, rx, rx);
5595     __ mulx(rx, ry, lop);       // lo-64b-part of result 64x64
5596     __ umulxhi(rx, ry, hip);    // hi-64b-part of result 64x64
5597     __ addcc(rc, lop, lop);     // Accumulate lower order bits (producing carry)
5598     __ addxc(hip, zero, rc);    // carry over to next datum [k-1]
5599     __ srlx(lop, 32, rt);
5600     __ stw(rt, zpc, 0);         // z[k] = lop
5601     __ stw(lop, zpc, 4);        //   ...
5602     __ dec(zpc, 8);             // k-- (zpc--)
5603     __ dec(xpc, 8);             // i-- (xpc--)
5604     __ ba_short(L_loop_i);
5605 
5606     __ bind(L_exit_loop_i);
5607     __ srlx(rc, 32, rt);
5608     __ stw(rt, zpc, 0);         // z[k] = c
5609     __ stw(rc, zpc, 4);
5610 
5611     // for (int j = yn - 1; j >= 0; j--)
5612     __ sllx(yn, 3, ypc);        // u32* ypc = &yp[yn] (byte offset 8*yn)
5613     __ add(yp, ypc, ypc);
5614     __ dec(ypc, 8);             // yn - 1 (ypc--)
5615 
5616     __ bind(L_loop_j);
5617 
5618     __ cmp_and_brx_short(ypc, yp,// j >= 0
5619                          Assembler::lessUnsigned, Assembler::pn, L_exit);
5620     __ clr(rc);                 // u64 c = 0
5621     __ lduw(ypc, 0, rt);        // u64 y = yp[j] (= *ypc)
5622     __ lduw(ypc, 4, ry);        //   ...
5623     __ sllx(rt, 32, rt);
5624     __ or3(rt, ry, ry);
5625 
5626     // for (int i = xn, k = --zn; i >= 0; i--)
5627     __ sllx(xn, 3, xpc);        // u32* xpc = &xp[xn] (byte offset 8*xn)
5628     __ add(xp, xpc, xpc);
5629     __ dec(zn);                 // --zn
5630     __ sllx(zn, 3, zpc);        // u32* zpc = &zp[zn] (byte offset 8*zn)
5631     __ add(zp, zpc, zpc);
5632 
5633     __ bind(L_loop_i2);
5634 
5635     __ cmp_and_brx_short(xpc, xp,// i >= 0
5636                          Assembler::lessUnsigned, Assembler::pn, L_exit_loop_i2);
5637     __ lduw(xpc, 0, rt);        // u64 x = xp[i] (= *xpc)
5638     __ lduw(xpc, 4, rx);        //   ...
5639     __ sllx(rt, 32, rt);
5640     __ or3(rt, rx, rx);
5641 
5642     __ lduw(zpc, 0, rt);        // u64 z = zp[k] (= *zpc)
5643     __ lduw(zpc, 4, rz);        //   ...
5644     __ sllx(rt, 32, rt);
5645     __ or3(rt, rz, rz);
5646 
5647     __ mulx(rx, ry, lop);       // lo-64b-part of result 64x64
5648     __ umulxhi(rx, ry, hip);    // hi-64b-part of result 64x64
5649     __ addcc(rz, rc, rz);       // Accumulate lower order bits...
5650     __ addxc(hip, zero, rc);    // Accumulate higher order bits to carry
5651     __ addcc(rz, lop, rz);      // ... z += lo(p) + c
5652     __ addxccc(rc, zero, rc);
5653     __ srlx(rz, 32, rt);
5654     __ stw(rt, zpc, 0);         // zp[k] = z    (*zpc = z)
5655     __ stw(rz, zpc, 4);
5656     __ dec(zpc, 8);             // k-- (zpc--)
5657     __ dec(xpc, 8);             // i-- (xpc--)
5658     __ ba_short(L_loop_i2);
5659 
5660     __ bind(L_exit_loop_i2);
5661     __ srlx(rc, 32, rt);
5662     __ stw(rt, zpc, 0);         // z[k] = c
5663     __ stw(rc, zpc, 4);
5664     __ dec(ypc, 8);             // j-- (ypc--)
5665     __ ba_short(L_loop_j);
5666   }
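       // The unaligned variant above cannot use 8-byte ldx/stx accesses, so each
       // 64-bit limb is assembled from (or split into) two 4-byte words.  As a
       // documentation sketch only (big-endian word order; "u32"/"u64" are
       // unsigned 32/64-bit types):
       //
       //   u64  load64 (const u32* p)   { return ((u64)p[0] << 32) | p[1]; }
       //   void store64(u32* p, u64 v)  { p[0] = (u32)(v >> 32); p[1] = (u32)v; }
       //
       // which corresponds to the lduw/sllx/or3 and srlx/stw/stw sequences used
       // for every x[i], y[j] and z[k] access in the routine above.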
5667 
5668   void gen_mult_32x32(Register xp, Register xn,
5669                       Register yp, Register yn,
5670                       Register zp, Register zn, Label &L_exit)
5671   {
5672     // Assuming that a stack frame has already been created, i.e. local and
5673     // output registers are available for use.
5674 
5675     const Register ri = L0;     // Outer loop index, xv[i]
5676     const Register rj = L1;     // Inner loop index, yv[j]
5677     const Register rk = L2;     // Output loop index, zv[k]
5678     const Register rx = L4;     // x-vector datum [i]
5679     const Register ry = L5;     // y-vector datum [j]
5680     const Register rz = L6;     // z-vector datum [k]
5681     const Register rc = L7;     // carry over (to z-vector datum [k-1])
5682 
5683     const Register p64 = O0;    // 64b product
5684     const Register z65 = O1;    // carry+64b accumulator
5685     const Register c65 = O2;    // carry at bit 65
5686     const Register c33 = O2;    // carry at bit 33 (after shift)
5687 
5688     const Register zero = G0;
5689 
5690     Label L_loop_i,  L_exit_loop_i;
5691     Label L_loop_j;
5692     Label L_loop_i2, L_exit_loop_i2;
5693 
5694     __ dec(xn);                 // Adjust [0..N-1]
5695     __ dec(yn);
5696     __ dec(zn);
5697     __ clr(rc);                 // u32 c = 0
5698     __ sllx(xn, 2, ri);         // int i = xn (byte offset i = 4*xn)
5699     __ sllx(yn, 2, rj);         // int j = yn (byte offset j = 4*yn)
5700     __ sllx(zn, 2, rk);         // int k = zn (byte offset k = 4*zn)
5701     __ lduw(yp, rj, ry);        // u32 y = yp[yn]
5702 
5703     // for (int i = xn; i >= 0; i--)
5704     __ bind(L_loop_i);
5705 
5706     __ cmp_and_br_short(ri, 0,  // i >= 0
5707                         Assembler::less, Assembler::pn, L_exit_loop_i);
5708     __ lduw(xp, ri, rx);        // x = xp[i]
5709     __ mulx(rx, ry, p64);       // 64b result of 32x32
5710     __ addcc(rc, p64, z65);     // Accumulate to 65 bits (producing carry)
5711     __ addxc(zero, zero, c65);  // Materialise carry (in bit 65) into lsb,
5712     __ sllx(c65, 32, c33);      // and shift into bit 33
5713     __ srlx(z65, 32, rc);       // carry = c33 | hi(z65) >> 32
5714     __ add(c33, rc, rc);        // carry over to next datum [k-1]
5715     __ stw(z65, zp, rk);        // z[k] = lo(z65)
5716     __ dec(rk, 4);              // k--
5717     __ dec(ri, 4);              // i--
5718     __ ba_short(L_loop_i);
5719 
5720     __ bind(L_exit_loop_i);
5721     __ stw(rc, zp, rk);         // z[k] = c
5722 
5723     // for (int j = yn - 1; j >= 0; j--)
5724     __ sllx(yn, 2, rj);         // int j = yn - 1 (byte offset j = 4*yn)
5725     __ dec(rj, 4);
5726 
5727     __ bind(L_loop_j);
5728 
5729     __ cmp_and_br_short(rj, 0,  // j >= 0
5730                         Assembler::less, Assembler::pn, L_exit);
5731     __ clr(rc);                 // u32 c = 0
5732     __ lduw(yp, rj, ry);        // u32 y = yp[j]
5733 
5734     // for (int i = xn, k = --zn; i >= 0; i--)
5735     __ dec(zn);                 // --zn
5736     __ sllx(xn, 2, ri);         // int i = xn (byte offset i = 4*xn)
5737     __ sllx(zn, 2, rk);         // int k = zn (byte offset k = 4*zn)
5738 
5739     __ bind(L_loop_i2);
5740 
5741     __ cmp_and_br_short(ri, 0,  // i >= 0
5742                         Assembler::less, Assembler::pn, L_exit_loop_i2);
5743     __ lduw(xp, ri, rx);        // x = xp[i]
5744     __ lduw(zp, rk, rz);        // z = zp[k], accumulator
5745     __ mulx(rx, ry, p64);       // 64b result of 32x32
5746     __ add(rz, rc, rz);         // Accumulate lower order bits,
5747     __ addcc(rz, p64, z65);     //   z += lo(p64) + c
5748     __ addxc(zero, zero, c65);  // Materialise carry (in bit 65) into lsb,
5749     __ sllx(c65, 32, c33);      // and shift into bit 33
5750     __ srlx(z65, 32, rc);       // carry = c33 | hi(z65) >> 32
5751     __ add(c33, rc, rc);        // carry over to next datum [k-1]
5752     __ stw(z65, zp, rk);        // zp[k] = lo(z65)
5753     __ dec(rk, 4);              // k--
5754     __ dec(ri, 4);              // i--
5755     __ ba_short(L_loop_i2);
5756 
5757     __ bind(L_exit_loop_i2);
5758     __ stw(rc, zp, rk);         // z[k] = c
5759     __ dec(rj, 4);              // j--
5760     __ ba_short(L_loop_j);
5761   }
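       // For reference, one step of the accumulating 32x32 loop above computes
       // (sketch only; "u32"/"u64" are unsigned 32/64-bit types, and carry65 is
       // the bit that overflows out of the 64-bit accumulator register):
       //
       //   u64 p   = (u64)x[i] * y[j];       // mulx: full 64-bit product
       //   u64 acc = (u64)z[k] + c + p;      // may overflow into "bit 65"
       //   c    = ((u64)carry65 << 32) + (acc >> 32);  // carry into z[k-1]
       //   z[k] = (u32)acc;                  // stw keeps the low 32 bits
       //
       // addcc/addxc materialise carry65 and sllx moves it to bit 33, so the
       // next carry c is simply the 65-bit accumulator shifted right by 32 bits.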
5762 
5763 
5764   void generate_initial() {
5765     // Generates the initial stubs and initializes their entry points
5766 
5767     //------------------------------------------------------------------------------------------------------------------------
5768     // entry points that exist in all platforms
5769     // Note: This is code that could be shared among different platforms - however the benefit seems to be smaller than
5770     //       the disadvantage of having a much more complicated generator structure. See also comment in stubRoutines.hpp.
5771     StubRoutines::_forward_exception_entry                 = generate_forward_exception();
5772 
5773     StubRoutines::_call_stub_entry                         = generate_call_stub(StubRoutines::_call_stub_return_address);
5774     StubRoutines::_catch_exception_entry                   = generate_catch_exception();
5775 
5776     //------------------------------------------------------------------------------------------------------------------------
5777     // entry points that are platform specific
5778     StubRoutines::Sparc::_test_stop_entry                  = generate_test_stop();
5779 
5780     StubRoutines::Sparc::_stop_subroutine_entry            = generate_stop_subroutine();
5781     StubRoutines::Sparc::_flush_callers_register_windows_entry = generate_flush_callers_register_windows();
5782 
5783     // Build this early so it's available for the interpreter.
5784     StubRoutines::_throw_StackOverflowError_entry =
5785             generate_throw_exception("StackOverflowError throw_exception",
5786             CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError));
5787     StubRoutines::_throw_delayed_StackOverflowError_entry =
5788             generate_throw_exception("delayed StackOverflowError throw_exception",
5789             CAST_FROM_FN_PTR(address, SharedRuntime::throw_delayed_StackOverflowError));
5790 
5791     if (UseCRC32Intrinsics) {
5792       // set table address before generating the stubs that use it
5793       StubRoutines::_crc_table_adr = (address)StubRoutines::Sparc::_crc_table;
5794       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
5795     }
5796 
5797     if (UseCRC32CIntrinsics) {
5798       // set table address before generating the stubs that use it
5799       StubRoutines::_crc32c_table_addr = (address)StubRoutines::Sparc::_crc32c_table;
5800       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
5801     }
5802   }
5803 
5804 
5805   void generate_all() {
5806     // Generates the remaining stubs and initializes their entry points
5807 
5808     // Generate partial_subtype_check first here since its code depends on
5809     // UseZeroBaseCompressedOops which is defined after heap initialization.
5810     StubRoutines::Sparc::_partial_subtype_check                = generate_partial_subtype_check();
5811     // These entry points require SharedInfo::stack0 to be set up in non-core builds
5812     StubRoutines::_throw_AbstractMethodError_entry         = generate_throw_exception("AbstractMethodError throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError));
5813     StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError));
5814     StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call));
5815 
5816     // support for verify_oop (must happen after universe_init)
5817     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop_subroutine();
5818 
5819     // arraycopy stubs used by compilers
5820     generate_arraycopy_stubs();
5821 
5822     // Don't initialize the platform math functions since sparc
5823     // doesn't have intrinsics for these operations.
5824 
5825     // Safefetch stubs.
5826     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
5827                                                        &StubRoutines::_safefetch32_fault_pc,
5828                                                        &StubRoutines::_safefetch32_continuation_pc);
5829     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
5830                                                        &StubRoutines::_safefetchN_fault_pc,
5831                                                        &StubRoutines::_safefetchN_continuation_pc);
5832 
5833     // generate AES intrinsics code
5834     if (UseAESIntrinsics) {
5835       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
5836       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
5837       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
5838       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
5839     }
5840     // generate GHASH intrinsics code
5841     if (UseGHASHIntrinsics) {
5842       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
5843     }
5844 
5845     // generate SHA1/SHA256/SHA512 intrinsics code
5846     if (UseSHA1Intrinsics) {
5847       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
5848       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
5849     }
5850     if (UseSHA256Intrinsics) {
5851       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
5852       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
5853     }
5854     if (UseSHA512Intrinsics) {
5855       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
5856       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
5857     }
5858     // generate Adler32 intrinsics code
5859     if (UseAdler32Intrinsics) {
5860       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
5861     }
5862 
5863 #ifdef COMPILER2
5864     // Intrinsics supported by C2 only:
5865     if (UseMultiplyToLenIntrinsic) {
5866       StubRoutines::_multiplyToLen = generate_multiplyToLen();
5867     }
5868 #endif // COMPILER2
5869   }
5870 
5871  public:
5872   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
5873     // replace the standard masm with a special one:
5874     _masm = new MacroAssembler(code);
5875 
5876     _stub_count = !all ? 0x100 : 0x200;
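         // (distinct base values so the ASSERT-only stub numbering emitted by
         // stub_prolog() can tell the two generation passes apart)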
5877     if (all) {
5878       generate_all();
5879     } else {
5880       generate_initial();
5881     }
5882 
5883     // make sure this stub is available for all local calls
5884     if (_atomic_add_stub.is_unbound()) {
5885       // generate a second time, if necessary
5886       (void) generate_atomic_add();
5887     }
5888   }
5889 
5890 
5891  private:
5892   int _stub_count;
5893   void stub_prolog(StubCodeDesc* cdesc) {
5894     # ifdef ASSERT
5895       // put extra information in the stub code, to make it more readable
5896       // Write the high part of the address
5897       // [RGV] Check if there is a dependency on the size of this prolog
5898       __ emit_data((intptr_t)cdesc >> 32,    relocInfo::none);
5899       __ emit_data((intptr_t)cdesc,    relocInfo::none);
5900       __ emit_data(++_stub_count, relocInfo::none);
5901     # endif
5902     align(true);
5903   }
5904 
5905   void align(bool at_header = false) {
5906     // %%%%% move this constant somewhere else
5907     // UltraSPARC cache line size is 8 instructions:
5908     const unsigned int icache_line_size = 32;
5909     const unsigned int icache_half_line_size = 16;
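         // For example, if pc() is 20 bytes into a cache line, the header case
         // below emits three zero data words (12 bytes) to reach the next
         // 32-byte line boundary, while the non-header case pads with nops only
         // to the next 16-byte (half-line) boundary.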
5910 
5911     if (at_header) {
5912       while ((intptr_t)(__ pc()) % icache_line_size != 0) {
5913         __ emit_data(0, relocInfo::none);
5914       }
5915     } else {
5916       while ((intptr_t)(__ pc()) % icache_half_line_size != 0) {
5917         __ nop();
5918       }
5919     }
5920   }
5921 
5922 }; // end class declaration
5923 
5924 void StubGenerator_generate(CodeBuffer* code, bool all) {
5925   StubGenerator g(code, all);
5926 }