1 /*
   2  * Copyright (c) 1997, 2016, Oracle and/or its affiliates. All rights reserved.
   3  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
   4  *
   5  * This code is free software; you can redistribute it and/or modify it
   6  * under the terms of the GNU General Public License version 2 only, as
   7  * published by the Free Software Foundation.
   8  *
   9  * This code is distributed in the hope that it will be useful, but WITHOUT
  10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
  11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
  12  * version 2 for more details (a copy is included in the LICENSE file that
  13  * accompanied this code).
  14  *
  15  * You should have received a copy of the GNU General Public License version
  16  * 2 along with this work; if not, write to the Free Software Foundation,
  17  * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
  18  *
  19  * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
  20  * or visit www.oracle.com if you need additional information or have any
  21  * questions.
  22  *
  23  */
  24 
  25 #include "precompiled.hpp"
  26 #include "asm/macroAssembler.inline.hpp"
  27 #include "interpreter/interpreter.hpp"
  28 #include "nativeInst_sparc.hpp"
  29 #include "oops/instanceOop.hpp"
  30 #include "oops/method.hpp"
  31 #include "oops/objArrayKlass.hpp"
  32 #include "oops/oop.inline.hpp"
  33 #include "prims/methodHandles.hpp"
  34 #include "runtime/frame.inline.hpp"
  35 #include "runtime/handles.inline.hpp"
  36 #include "runtime/sharedRuntime.hpp"
  37 #include "runtime/stubCodeGenerator.hpp"
  38 #include "runtime/stubRoutines.hpp"
  39 #include "runtime/thread.inline.hpp"
  40 #ifdef COMPILER2
  41 #include "opto/runtime.hpp"
  42 #endif
  43 
  44 // Declaration and definition of StubGenerator (no .hpp file).
  45 // For a more detailed description of the stub routine structure
  46 // see the comment in stubRoutines.hpp.
  47 
  48 #define __ _masm->
  49 
  50 #ifdef PRODUCT
  51 #define BLOCK_COMMENT(str) /* nothing */
  52 #else
  53 #define BLOCK_COMMENT(str) __ block_comment(str)
  54 #endif
  55 
  56 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
  57 
  58 // Note:  The register L7 is used as L7_thread_cache, and must not be used
  59 //        in any other way within this module.
  60 
  61 
  62 static const Register& Lstub_temp = L2;
  63 
  64 // -------------------------------------------------------------------------------------------------------------------------
  65 // Stub Code definitions
  66 
  67 class StubGenerator: public StubCodeGenerator {
  68  private:
  69 
  70 #ifdef PRODUCT
  71 #define inc_counter_np(a,b,c)
  72 #else
  73 #define inc_counter_np(counter, t1, t2) \
  74   BLOCK_COMMENT("inc_counter " #counter); \
  75   __ inc_counter(&counter, t1, t2);
  76 #endif
  77 
  78   //----------------------------------------------------------------------------------------------------
  79   // Call stubs are used to call Java from C
  80 
  81   address generate_call_stub(address& return_pc) {
  82     StubCodeMark mark(this, "StubRoutines", "call_stub");
  83     address start = __ pc();
  84 
  85     // Incoming arguments:
  86     //
  87     // o0         : call wrapper address
  88     // o1         : result (address)
  89     // o2         : result type
  90     // o3         : method
  91     // o4         : (interpreter) entry point
  92     // o5         : parameters (address)
  93     // [sp + 0x5c]: parameter size (in words)
  94     // [sp + 0x60]: thread
  95     //
  96     // +---------------+ <--- sp + 0
  97     // |               |
  98     // . reg save area .
  99     // |               |
 100     // +---------------+ <--- sp + 0x40
 101     // |               |
 102     // . extra 7 slots .
 103     // |               |
 104     // +---------------+ <--- sp + 0x5c
 105     // |  param. size  |
 106     // +---------------+ <--- sp + 0x60
 107     // |    thread     |
 108     // +---------------+
 109     // |               |
 110 
 111     // note: if the link argument position changes, adjust
 112     //       the code in frame::entry_frame_call_wrapper()
 113 
 114     const Argument link           = Argument(0, false); // used only for GC
 115     const Argument result         = Argument(1, false);
 116     const Argument result_type    = Argument(2, false);
 117     const Argument method         = Argument(3, false);
 118     const Argument entry_point    = Argument(4, false);
 119     const Argument parameters     = Argument(5, false);
 120     const Argument parameter_size = Argument(6, false);
 121     const Argument thread         = Argument(7, false);
 122 
 123     // setup thread register
 124     __ ld_ptr(thread.as_address(), G2_thread);
 125     __ reinit_heapbase();
 126 
 127 #ifdef ASSERT
 128     // make sure we have no pending exceptions
 129     { const Register t = G3_scratch;
 130       Label L;
 131       __ ld_ptr(G2_thread, in_bytes(Thread::pending_exception_offset()), t);
 132       __ br_null_short(t, Assembler::pt, L);
 133       __ stop("StubRoutines::call_stub: entered with pending exception");
 134       __ bind(L);
 135     }
 136 #endif
 137 
 138     // create activation frame & allocate space for parameters
 139     { const Register t = G3_scratch;
 140       __ ld_ptr(parameter_size.as_address(), t);                // get parameter size (in words)
 141       __ add(t, frame::memory_parameter_word_sp_offset, t);     // add space for save area (in words)
 142       __ round_to(t, WordsPerLong);                             // make sure it is multiple of 2 (in words)
 143       __ sll(t, Interpreter::logStackElementSize, t);           // compute number of bytes
 144       __ neg(t);                                                // negate so it can be used with save
 145       __ save(SP, t, SP);                                       // setup new frame
 146     }
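     // Note: the SPARC 'save' above both rotates the register window and
     // allocates the new stack frame in a single instruction; since 't' was
     // negated, save(SP, t, SP) grows the stack by -t bytes.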
 147 
 148     // +---------------+ <--- sp + 0
 149     // |               |
 150     // . reg save area .
 151     // |               |
 152     // +---------------+ <--- sp + 0x40
 153     // |               |
 154     // . extra 7 slots .
 155     // |               |
 156     // +---------------+ <--- sp + 0x5c
 157     // |  empty slot   |      (only if parameter size is even)
 158     // +---------------+
 159     // |               |
 160     // .  parameters   .
 161     // |               |
 162     // +---------------+ <--- fp + 0
 163     // |               |
 164     // . reg save area .
 165     // |               |
 166     // +---------------+ <--- fp + 0x40
 167     // |               |
 168     // . extra 7 slots .
 169     // |               |
 170     // +---------------+ <--- fp + 0x5c
 171     // |  param. size  |
 172     // +---------------+ <--- fp + 0x60
 173     // |    thread     |
 174     // +---------------+
 175     // |               |
 176 
 177     // pass parameters if any
 178     BLOCK_COMMENT("pass parameters if any");
 179     { const Register src = parameters.as_in().as_register();
 180       const Register dst = Lentry_args;
 181       const Register tmp = G3_scratch;
 182       const Register cnt = G4_scratch;
 183 
 184       // test whether there are any parameters & set up Lentry_args
 185       Label exit;
 186       __ ld_ptr(parameter_size.as_in().as_address(), cnt);      // parameter counter
 187       __ add( FP, STACK_BIAS, dst );
 188       __ cmp_zero_and_br(Assembler::zero, cnt, exit);
 189       __ delayed()->sub(dst, BytesPerWord, dst);                 // setup Lentry_args
 190 
 191       // copy parameters if any
 192       Label loop;
 193       __ BIND(loop);
 194       // Store parameter value
 195       __ ld_ptr(src, 0, tmp);
 196       __ add(src, BytesPerWord, src);
 197       __ st_ptr(tmp, dst, 0);
 198       __ deccc(cnt);
 199       __ br(Assembler::greater, false, Assembler::pt, loop);
 200       __ delayed()->sub(dst, Interpreter::stackElementSize, dst);
 201 
 202       // done
 203       __ BIND(exit);
 204     }
 205 
 206     // setup parameters, method & call Java function
 207 #ifdef ASSERT
 208     // layout_activation_impl checks its notion of saved SP against
 209     // this register, so if this changes, update that code as well.
 210     const Register saved_SP = Lscratch;
 211     __ mov(SP, saved_SP);                               // keep track of SP before call
 212 #endif
 213 
 214     // setup parameters
 215     const Register t = G3_scratch;
 216     __ ld_ptr(parameter_size.as_in().as_address(), t); // get parameter size (in words)
 217     __ sll(t, Interpreter::logStackElementSize, t);    // compute number of bytes
 218     __ sub(FP, t, Gargs);                              // setup parameter pointer
 219 #ifdef _LP64
 220     __ add( Gargs, STACK_BIAS, Gargs );                // Account for LP64 stack bias
 221 #endif
 222     __ mov(SP, O5_savedSP);
 223 
 224 
 225     // do the call
 226     //
 227     // the following registers must be set up:
 228     //
 229     // G2_thread
 230     // G5_method
 231     // Gargs
 232     BLOCK_COMMENT("call Java function");
 233     __ jmpl(entry_point.as_in().as_register(), G0, O7);
 234     __ delayed()->mov(method.as_in().as_register(), G5_method);   // setup method
 235 
 236     BLOCK_COMMENT("call_stub_return_address:");
 237     return_pc = __ pc();
 238 
 239     // The callee, if it wasn't interpreted, can return with SP changed so
 240     // we can no longer assert that SP is unchanged.
 241 
 242     // store result depending on type
 243     // (everything that is not T_OBJECT, T_LONG, T_FLOAT, or T_DOUBLE
 244     //  is treated as T_INT)
 245     { const Register addr = result     .as_in().as_register();
 246       const Register type = result_type.as_in().as_register();
 247       Label is_long, is_float, is_double, is_object, exit;
 248       __            cmp(type, T_OBJECT);  __ br(Assembler::equal, false, Assembler::pn, is_object);
 249       __ delayed()->cmp(type, T_FLOAT);   __ br(Assembler::equal, false, Assembler::pn, is_float);
 250       __ delayed()->cmp(type, T_DOUBLE);  __ br(Assembler::equal, false, Assembler::pn, is_double);
 251       __ delayed()->cmp(type, T_LONG);    __ br(Assembler::equal, false, Assembler::pn, is_long);
 252       __ delayed()->nop();
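      // Each cmp after the first sits in the delay slot of the preceding branch,
      // so it executes even when that branch is taken; this is harmless because
      // the branch targets never examine the condition codes.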
 253 
 254       // store int result
 255       __ st(O0, addr, G0);
 256 
 257       __ BIND(exit);
 258       __ ret();
 259       __ delayed()->restore();
 260 
 261       __ BIND(is_object);
 262       __ ba(exit);
 263       __ delayed()->st_ptr(O0, addr, G0);
 264 
 265       __ BIND(is_float);
 266       __ ba(exit);
 267       __ delayed()->stf(FloatRegisterImpl::S, F0, addr, G0);
 268 
 269       __ BIND(is_double);
 270       __ ba(exit);
 271       __ delayed()->stf(FloatRegisterImpl::D, F0, addr, G0);
 272 
 273       __ BIND(is_long);
 274 #ifdef _LP64
 275       __ ba(exit);
 276       __ delayed()->st_long(O0, addr, G0);      // store entire long
 277 #else
 278 #if defined(COMPILER2)
 279   // All return values are where we want them, except for Longs.  C2 returns
 280   // longs in G1 in the 32-bit build whereas the interpreter wants them in O0/O1.
 281   // Since the interpreter will return longs in both G1 and O0/O1 in the 32-bit
 282   // build, we simply always use G1.
 283   // Note: I tried to make C2 return longs in O0/O1 and G1 so we wouldn't have to
 284   // do this here. Unfortunately, if we did a rethrow we'd see a MachEpilog node
 285   // first, which would move G1 -> O0/O1 and destroy the exception we were throwing.
 286 
 287       __ ba(exit);
 288       __ delayed()->stx(G1, addr, G0);  // store entire long
 289 #else
 290       __ st(O1, addr, BytesPerInt);
 291       __ ba(exit);
 292       __ delayed()->st(O0, addr, G0);
 293 #endif /* COMPILER2 */
 294 #endif /* _LP64 */
 295      }
 296      return start;
 297   }
 298 
 299 
 300   //----------------------------------------------------------------------------------------------------
 301   // Return point for a Java call if there's an exception thrown in Java code.
 302   // The exception is caught and transformed into a pending exception stored in
 303   // JavaThread that can be tested from within the VM.
 304   //
 305   // Oexception: exception oop
 306 
 307   address generate_catch_exception() {
 308     StubCodeMark mark(this, "StubRoutines", "catch_exception");
 309 
 310     address start = __ pc();
 311     // verify that thread corresponds
 312     __ verify_thread();
 313 
 314     const Register& temp_reg = Gtemp;
 315     Address pending_exception_addr    (G2_thread, Thread::pending_exception_offset());
 316     Address exception_file_offset_addr(G2_thread, Thread::exception_file_offset   ());
 317     Address exception_line_offset_addr(G2_thread, Thread::exception_line_offset   ());
 318 
 319     // set pending exception
 320     __ verify_oop(Oexception);
 321     __ st_ptr(Oexception, pending_exception_addr);
 322     __ set((intptr_t)__FILE__, temp_reg);
 323     __ st_ptr(temp_reg, exception_file_offset_addr);
 324     __ set((intptr_t)__LINE__, temp_reg);
 325     __ st(temp_reg, exception_line_offset_addr);
 326 
 327     // complete return to VM
 328     assert(StubRoutines::_call_stub_return_address != NULL, "must have been generated before");
 329 
 330     AddressLiteral stub_ret(StubRoutines::_call_stub_return_address);
 331     __ jump_to(stub_ret, temp_reg);
 332     __ delayed()->nop();
 333 
 334     return start;
 335   }
 336 
 337 
 338   //----------------------------------------------------------------------------------------------------
 339   // Continuation point for runtime calls returning with a pending exception
 340   // The pending exception check happened in the runtime or native call stub
 341   // The pending exception in Thread is converted into a Java-level exception
 342   //
 343   // Contract with Java-level exception handler: O0 = exception
 344   //                                             O1 = throwing pc
 345 
 346   address generate_forward_exception() {
 347     StubCodeMark mark(this, "StubRoutines", "forward_exception");
 348     address start = __ pc();
 349 
 350     // Upon entry, O7 has the return address returning into Java
 351     // (interpreted or compiled) code; i.e. the return address
 352     // becomes the throwing pc.
 353 
 354     const Register& handler_reg = Gtemp;
 355 
 356     Address exception_addr(G2_thread, Thread::pending_exception_offset());
 357 
 358 #ifdef ASSERT
 359     // make sure that this code is only executed if there is a pending exception
 360     { Label L;
 361       __ ld_ptr(exception_addr, Gtemp);
 362       __ br_notnull_short(Gtemp, Assembler::pt, L);
 363       __ stop("StubRoutines::forward exception: no pending exception (1)");
 364       __ bind(L);
 365     }
 366 #endif
 367 
 368     // compute exception handler into handler_reg
 369     __ get_thread();
 370     __ ld_ptr(exception_addr, Oexception);
 371     __ verify_oop(Oexception);
 372     __ save_frame(0);             // compensates for compiler weakness
 373     __ add(O7->after_save(), frame::pc_return_offset, Lscratch); // save the issuing PC
 374     BLOCK_COMMENT("call exception_handler_for_return_address");
 375     __ call_VM_leaf(L7_thread_cache, CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), G2_thread, Lscratch);
 376     __ mov(O0, handler_reg);
 377     __ restore();                 // compensates for compiler weakness
 378 
 379     __ ld_ptr(exception_addr, Oexception);
 380     __ add(O7, frame::pc_return_offset, Oissuing_pc); // save the issuing PC
 381 
 382 #ifdef ASSERT
 383     // make sure exception is set
 384     { Label L;
 385       __ br_notnull_short(Oexception, Assembler::pt, L);
 386       __ stop("StubRoutines::forward exception: no pending exception (2)");
 387       __ bind(L);
 388     }
 389 #endif
 390     // jump to exception handler
 391     __ jmp(handler_reg, 0);
 392     // clear pending exception
 393     __ delayed()->st_ptr(G0, exception_addr);
 394 
 395     return start;
 396   }
 397 
 398   // Safefetch stubs.
 399   void generate_safefetch(const char* name, int size, address* entry,
 400                           address* fault_pc, address* continuation_pc) {
 401     // safefetch signatures:
 402     //   int      SafeFetch32(int*      adr, int      errValue);
 403     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
 404     //
 405     // arguments:
 406     //   o0 = adr
 407     //   o1 = errValue
 408     //
 409     // result:
 410     //   o0  = *adr or errValue
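     //
     // If the load faults, the signal handler recognizes the faulting pc as
     // *fault_pc and resumes execution at *continuation_pc; at that point O0
     // still holds errValue, which was moved there before the load.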
 411 
 412     StubCodeMark mark(this, "StubRoutines", name);
 413 
 414     // Entry point, pc or function descriptor.
 415     __ align(CodeEntryAlignment);
 416     *entry = __ pc();
 417 
 418     __ mov(O0, G1);  // g1 = o0
 419     __ mov(O1, O0);  // o0 = o1
 420     // Load *adr into O0; this load may fault.
 421     *fault_pc = __ pc();
 422     switch (size) {
 423       case 4:
 424         // int32_t
 425         __ ldsw(G1, 0, O0);  // o0 = [g1]
 426         break;
 427       case 8:
 428         // int64_t
 429         __ ldx(G1, 0, O0);   // o0 = [g1]
 430         break;
 431       default:
 432         ShouldNotReachHere();
 433     }
 434 
 435     // return errValue or *adr
 436     *continuation_pc = __ pc();
 437     // By convention with the trap handler we ensure there is a non-CTI
 438     // instruction in the trap shadow.
 439     __ nop();
 440     __ retl();
 441     __ delayed()->nop();
 442   }
 443 
 444   //------------------------------------------------------------------------------------------------------------------------
 445   // Continuation point for throwing of implicit exceptions that are not handled in
 446   // the current activation. Fabricates an exception oop and initiates normal
 447   // exception dispatching in this frame. Only callee-saved registers are preserved
 448   // (through the normal register window / RegisterMap handling).
 449   // If the compiler needs all registers to be preserved between the fault
 450   // point and the exception handler then it must assume responsibility for that in
 451   // AbstractCompiler::continuation_for_implicit_null_exception or
 452   // continuation_for_implicit_division_by_zero_exception. All other implicit
 453   // exceptions (e.g., NullPointerException or AbstractMethodError on entry) are
 454   // either at call sites or otherwise assume that stack unwinding will be initiated,
 455   // so caller saved registers were assumed volatile in the compiler.
 456 
 457   // Note that we generate only this stub into a RuntimeStub, because it needs to be
 458   // properly traversed and ignored during GC, so we change the meaning of the "__"
 459   // macro within this method.
 460 #undef __
 461 #define __ masm->
 462 
 463   address generate_throw_exception(const char* name, address runtime_entry,
 464                                    Register arg1 = noreg, Register arg2 = noreg) {
 465 #ifdef ASSERT
 466     int insts_size = VerifyThread ? 1 * K : 600;
 467 #else
 468     int insts_size = VerifyThread ? 1 * K : 256;
 469 #endif /* ASSERT */
 470     int locs_size  = 32;
 471 
 472     CodeBuffer      code(name, insts_size, locs_size);
 473     MacroAssembler* masm = new MacroAssembler(&code);
 474 
 475     __ verify_thread();
 476 
 477     // This is an inlined and slightly modified version of call_VM
 478     // which has the ability to fetch the return PC out of thread-local storage
 479     __ assert_not_delayed();
 480 
 481     // Note that we always push a frame because on the SPARC
 482     // architecture, for all of our implicit exception kinds at call
 483     // sites, the implicit exception is taken before the callee frame
 484     // is pushed.
 485     __ save_frame(0);
 486 
 487     int frame_complete = __ offset();
 488 
 489     // Note that we always have a runtime stub frame on the top of stack by this point
 490     Register last_java_sp = SP;
 491     // 64-bit last_java_sp is biased!
 492     __ set_last_Java_frame(last_java_sp, G0);
 493     if (VerifyThread)  __ mov(G2_thread, O0); // about to be smashed; pass early
 494     __ save_thread(noreg);
 495     if (arg1 != noreg) {
 496       assert(arg2 != O1, "clobbered");
 497       __ mov(arg1, O1);
 498     }
 499     if (arg2 != noreg) {
 500       __ mov(arg2, O2);
 501     }
 502     // do the call
 503     BLOCK_COMMENT("call runtime_entry");
 504     __ call(runtime_entry, relocInfo::runtime_call_type);
 505     if (!VerifyThread)
 506       __ delayed()->mov(G2_thread, O0);  // pass thread as first argument
 507     else
 508       __ delayed()->nop();             // (thread already passed)
 509     __ restore_thread(noreg);
 510     __ reset_last_Java_frame();
 511 
 512     // check for pending exceptions. use Gtemp as scratch register.
 513 #ifdef ASSERT
 514     Label L;
 515 
 516     Address exception_addr(G2_thread, Thread::pending_exception_offset());
 517     Register scratch_reg = Gtemp;
 518     __ ld_ptr(exception_addr, scratch_reg);
 519     __ br_notnull_short(scratch_reg, Assembler::pt, L);
 520     __ should_not_reach_here();
 521     __ bind(L);
 522 #endif // ASSERT
 523     BLOCK_COMMENT("call forward_exception_entry");
 524     __ call(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type);
 525     // we use O7 linkage so that forward_exception_entry has the issuing PC
 526     __ delayed()->restore();
 527 
 528     RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, masm->total_frame_size_in_bytes(0), NULL, false);
 529     return stub->entry_point();
 530   }
 531 
 532 #undef __
 533 #define __ _masm->
 534 
 535 
 536   // Generate a routine that sets all the registers so we
 537   // can tell if the stop routine prints them correctly.
 538   address generate_test_stop() {
 539     StubCodeMark mark(this, "StubRoutines", "test_stop");
 540     address start = __ pc();
 541 
 542     int i;
 543 
 544     __ save_frame(0);
 545 
 546     static jfloat zero = 0.0, one = 1.0;
 547 
 548     // put addr in L0, then load through L0 to F0
 549     __ set((intptr_t)&zero, L0);  __ ldf( FloatRegisterImpl::S, L0, 0, F0);
 550     __ set((intptr_t)&one,  L0);  __ ldf( FloatRegisterImpl::S, L0, 0, F1); // 1.0 to F1
 551 
 552     // use add to put 2..18 in F2..F18
 553     for ( i = 2;  i <= 18;  ++i ) {
 554       __ fadd( FloatRegisterImpl::S, F1, as_FloatRegister(i-1),  as_FloatRegister(i));
 555     }
 556 
 557     // Now put double 2 in F16, double 18 in F18
 558     __ ftof( FloatRegisterImpl::S, FloatRegisterImpl::D, F2, F16 );
 559     __ ftof( FloatRegisterImpl::S, FloatRegisterImpl::D, F18, F18 );
 560 
 561     // use add to put 20..32 in F20..F32
 562     for (i = 20; i < 32; i += 2) {
 563       __ fadd( FloatRegisterImpl::D, F16, as_FloatRegister(i-2),  as_FloatRegister(i));
 564     }
 565 
 566     // put 0..7 in i's, 8..15 in l's, 16..23 in o's, 24..31 in g's
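     // Only registers 0..5 of the i, o and g groups are written: i6/i7 and o6/o7
     // hold the frame pointer, stack pointer and return addresses, while g6/g7
     // are reserved for the system. All eight locals are safe to clobber.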
 567     for ( i = 0; i < 8; ++i ) {
 568       if (i < 6) {
 569         __ set(     i, as_iRegister(i));
 570         __ set(16 + i, as_oRegister(i));
 571         __ set(24 + i, as_gRegister(i));
 572       }
 573       __ set( 8 + i, as_lRegister(i));
 574     }
 575 
 576     __ stop("testing stop");
 577 
 578 
 579     __ ret();
 580     __ delayed()->restore();
 581 
 582     return start;
 583   }
 584 
 585 
 586   address generate_stop_subroutine() {
 587     StubCodeMark mark(this, "StubRoutines", "stop_subroutine");
 588     address start = __ pc();
 589 
 590     __ stop_subroutine();
 591 
 592     return start;
 593   }
 594 
 595   address generate_flush_callers_register_windows() {
 596     StubCodeMark mark(this, "StubRoutines", "flush_callers_register_windows");
 597     address start = __ pc();
 598 
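     // 'flushw' spills every register window except the current one to its
     // stack save area, making the caller's window contents visible in memory.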
 599     __ flushw();
 600     __ retl(false);
 601     __ delayed()->add( FP, STACK_BIAS, O0 );
 602     // The returned value must be a stack pointer whose register save area
 603     // is flushed, and will stay flushed while the caller executes.
 604 
 605     return start;
 606   }
 607 
 608   // Support for jint Atomic::xchg(jint exchange_value, volatile jint* dest).
 609   //
 610   // Arguments:
 611   //
 612   //      exchange_value: O0
 613   //      dest:           O1
 614   //
 615   // Results:
 616   //
 617   //     O0: the value previously stored in dest
 618   //
 619   address generate_atomic_xchg() {
 620     StubCodeMark mark(this, "StubRoutines", "atomic_xchg");
 621     address start = __ pc();
 622 
 623     if (UseCASForSwap) {
 624       // Use CAS instead of swap, just in case the MP hardware
 625       // prefers to work with just one kind of synch. instruction.
 626       Label retry;
 627       __ BIND(retry);
 628       __ mov(O0, O3);       // scratch copy of exchange value
 629       __ ld(O1, 0, O2);     // observe the previous value
 630       // try to replace O2 with O3
 631       __ cas(O1, O2, O3);
 632       __ cmp_and_br_short(O2, O3, Assembler::notEqual, Assembler::pn, retry);
 633 
 634       __ retl(false);
 635       __ delayed()->mov(O2, O0);  // report previous value to caller
 636     } else {
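      // 'swap' atomically exchanges O0 with the 32-bit word at [O1 + 0],
      // so the previous value of *dest is returned in O0.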
 637       __ retl(false);
 638       __ delayed()->swap(O1, 0, O0);
 639     }
 640 
 641     return start;
 642   }
 643 
 644 
 645   // Support for jint Atomic::cmpxchg(jint exchange_value, volatile jint* dest, jint compare_value)
 646   //
 647   // Arguments:
 648   //
 649   //      exchange_value: O0
 650   //      dest:           O1
 651   //      compare_value:  O2
 652   //
 653   // Results:
 654   //
 655   //     O0: the value previously stored in dest
 656   //
 657   address generate_atomic_cmpxchg() {
 658     StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg");
 659     address start = __ pc();
 660 
 661     // cmpxchg(dest, compare_value, exchange_value)
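     // cas compares the word at [O1] with O2 and, if they match, stores the
     // exchange value (O0) there; O0 always ends up holding the previous
     // memory value, which is exactly the result this stub must return.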
 662     __ cas(O1, O2, O0);
 663     __ retl(false);
 664     __ delayed()->nop();
 665 
 666     return start;
 667   }
 668 
 669   // Support for jlong Atomic::cmpxchg(jlong exchange_value, volatile jlong *dest, jlong compare_value)
 670   //
 671   // Arguments:
 672   //
 673   //      exchange_value: O1:O0
 674   //      dest:           O2
 675   //      compare_value:  O4:O3
 676   //
 677   // Results:
 678   //
 679   //     O1:O0: the value previously stored in dest
 680   //
 681   // Overwrites: G1,G2,G3
 682   //
 683   address generate_atomic_cmpxchg_long() {
 684     StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_long");
 685     address start = __ pc();
 686 
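     // The 64-bit arguments arrive split across register pairs (see above), with
     // the high word in the lower-numbered register of each pair; rebuild each
     // value in a single 64-bit register before the casx.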
 687     __ sllx(O0, 32, O0);
 688     __ srl(O1, 0, O1);
 689     __ or3(O0,O1,O0);      // O0 holds the 64-bit exchange_value packed from O0:O1
 690     __ sllx(O3, 32, O3);
 691     __ srl(O4, 0, O4);
 692     __ or3(O3,O4,O3);     // O3 holds the 64-bit compare_value packed from O3:O4
 693     __ casx(O2, O3, O0);
 694     __ srl(O0, 0, O1);    // unpacked return value in O1:O0
 695     __ retl(false);
 696     __ delayed()->srlx(O0, 32, O0);
 697 
 698     return start;
 699   }
 700 
 701 
 702   // Support for jint Atomic::add(jint add_value, volatile jint* dest).
 703   //
 704   // Arguments:
 705   //
 706   //      add_value: O0   (e.g., +1 or -1)
 707   //      dest:      O1
 708   //
 709   // Results:
 710   //
 711   //     O0: the new value stored in dest
 712   //
 713   // Overwrites: O3
 714   //
 715   address generate_atomic_add() {
 716     StubCodeMark mark(this, "StubRoutines", "atomic_add");
 717     address start = __ pc();
 718     __ BIND(_atomic_add_stub);
 719 
 720     Label retry;
 721     __ BIND(retry);
 722 
 723     __ lduw(O1, 0, O2);
 724     __ add(O0, O2, O3);
 725     __ cas(O1, O2, O3);
 726     __ cmp_and_br_short(O2, O3, Assembler::notEqual, Assembler::pn, retry);
 727     __ retl(false);
 728     __ delayed()->add(O0, O2, O0); // note that cas made O2==O3
 729 
 730     return start;
 731   }
 732   Label _atomic_add_stub;  // called from other stubs
 733 
 734 
 735   // Support for uint StubRoutines::Sparc::partial_subtype_check( Klass sub, Klass super );
 736   // Arguments :
 737   //
 738   //      ret  : O0, returned
 739   //      icc/xcc: set as O0 (depending on wordSize)
 740   //      sub  : O1, argument, not changed
 741   //      super: O2, argument, not changed
 742   //      raddr: O7, blown by call
 743   address generate_partial_subtype_check() {
 744     __ align(CodeEntryAlignment);
 745     StubCodeMark mark(this, "StubRoutines", "partial_subtype_check");
 746     address start = __ pc();
 747     Label miss;
 748 
 749 #if defined(COMPILER2) && !defined(_LP64)
 750     // Do not use a 'save' because it blows the 64-bit O registers.
 751     __ add(SP,-4*wordSize,SP);  // Make space for 4 temps (stack must be 2 words aligned)
 752     __ st_ptr(L0,SP,(frame::register_save_words+0)*wordSize);
 753     __ st_ptr(L1,SP,(frame::register_save_words+1)*wordSize);
 754     __ st_ptr(L2,SP,(frame::register_save_words+2)*wordSize);
 755     __ st_ptr(L3,SP,(frame::register_save_words+3)*wordSize);
 756     Register Rret   = O0;
 757     Register Rsub   = O1;
 758     Register Rsuper = O2;
 759 #else
 760     __ save_frame(0);
 761     Register Rret   = I0;
 762     Register Rsub   = I1;
 763     Register Rsuper = I2;
 764 #endif
 765 
 766     Register L0_ary_len = L0;
 767     Register L1_ary_ptr = L1;
 768     Register L2_super   = L2;
 769     Register L3_index   = L3;
 770 
 771     __ check_klass_subtype_slow_path(Rsub, Rsuper,
 772                                      L0, L1, L2, L3,
 773                                      NULL, &miss);
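     // The slow path scans Rsub's secondary supers array for Rsuper; a hit
     // falls through to the code below, a miss branches to 'miss'.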
 774 
 775     // Match falls through here.
 776     __ addcc(G0,0,Rret);        // set Z flags, Z result
 777 
 778 #if defined(COMPILER2) && !defined(_LP64)
 779     __ ld_ptr(SP,(frame::register_save_words+0)*wordSize,L0);
 780     __ ld_ptr(SP,(frame::register_save_words+1)*wordSize,L1);
 781     __ ld_ptr(SP,(frame::register_save_words+2)*wordSize,L2);
 782     __ ld_ptr(SP,(frame::register_save_words+3)*wordSize,L3);
 783     __ retl();                  // Result in Rret is zero; flags set to Z
 784     __ delayed()->add(SP,4*wordSize,SP);
 785 #else
 786     __ ret();                   // Result in Rret is zero; flags set to Z
 787     __ delayed()->restore();
 788 #endif
 789 
 790     __ BIND(miss);
 791     __ addcc(G0,1,Rret);        // set NZ flags, NZ result
 792 
 793 #if defined(COMPILER2) && !defined(_LP64)
 794     __ ld_ptr(SP,(frame::register_save_words+0)*wordSize,L0);
 795     __ ld_ptr(SP,(frame::register_save_words+1)*wordSize,L1);
 796     __ ld_ptr(SP,(frame::register_save_words+2)*wordSize,L2);
 797     __ ld_ptr(SP,(frame::register_save_words+3)*wordSize,L3);
 798     __ retl();                  // Result in Rret is != 0; flags set to NZ
 799     __ delayed()->add(SP,4*wordSize,SP);
 800 #else
 801     __ ret();                   // Result in Rret is != 0; flags set to NZ
 802     __ delayed()->restore();
 803 #endif
 804 
 805     return start;
 806   }
 807 
 808 
 809   // Called from MacroAssembler::verify_oop
 810   //
 811   address generate_verify_oop_subroutine() {
 812     StubCodeMark mark(this, "StubRoutines", "verify_oop_stub");
 813 
 814     address start = __ pc();
 815 
 816     __ verify_oop_subroutine();
 817 
 818     return start;
 819   }
 820 
 821 
 822   //
 823   // Verify that a register contains a clean 32-bit positive value
 824   // (high 32 bits are 0) so it can be used in 64-bit shifts (sllx, srax).
 825   //
 826   //  Input:
 827   //    Rint  -  32-bit value
 828   //    Rtmp  -  scratch
 829   //
 830   void assert_clean_int(Register Rint, Register Rtmp) {
 831 #if defined(ASSERT) && defined(_LP64)
 832     __ signx(Rint, Rtmp);
 833     __ cmp(Rint, Rtmp);
 834     __ breakpoint_trap(Assembler::notEqual, Assembler::xcc);
 835 #endif
 836   }
 837 
 838   //
 839   //  Generate overlap test for array copy stubs
 840   //
 841   //  Input:
 842   //    O0    -  array1
 843   //    O1    -  array2
 844   //    O2    -  element count
 845   //
 846   //  Kills temps:  O3, O4
 847   //
 848   void array_overlap_test(address no_overlap_target, int log2_elem_size) {
 849     assert(no_overlap_target != NULL, "must be generated");
 850     array_overlap_test(no_overlap_target, NULL, log2_elem_size);
 851   }
 852   void array_overlap_test(Label& L_no_overlap, int log2_elem_size) {
 853     array_overlap_test(NULL, &L_no_overlap, log2_elem_size);
 854   }
 855   void array_overlap_test(address no_overlap_target, Label* NOLp, int log2_elem_size) {
 856     const Register from       = O0;
 857     const Register to         = O1;
 858     const Register count      = O2;
 859     const Register to_from    = O3; // to - from
 860     const Register byte_count = O4; // count << log2_elem_size
 861 
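      // A forward (disjoint) copy is safe if 'to' <= 'from' (unsigned) or if
      // 'to' - 'from' is at least the byte count; either case branches to the
      // no-overlap target. Otherwise the regions overlap and control falls
      // through to the backward copy that follows this test.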
 862       __ subcc(to, from, to_from);
 863       __ sll_ptr(count, log2_elem_size, byte_count);
 864       if (NOLp == NULL)
 865         __ brx(Assembler::lessEqualUnsigned, false, Assembler::pt, no_overlap_target);
 866       else
 867         __ brx(Assembler::lessEqualUnsigned, false, Assembler::pt, (*NOLp));
 868       __ delayed()->cmp(to_from, byte_count);
 869       if (NOLp == NULL)
 870         __ brx(Assembler::greaterEqualUnsigned, false, Assembler::pt, no_overlap_target);
 871       else
 872         __ brx(Assembler::greaterEqualUnsigned, false, Assembler::pt, (*NOLp));
 873       __ delayed()->nop();
 874   }
 875 
 876   //
 877   //  Generate pre-write barrier for array.
 878   //
 879   //  Input:
 880   //     addr     - register containing starting address
 881   //     count    - register containing element count
 882   //     tmp      - scratch register
 883   //
 884   //  The input registers are overwritten.
 885   //
 886   void gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
 887     BarrierSet* bs = Universe::heap()->barrier_set();
 888     switch (bs->kind()) {
 889       case BarrierSet::G1SATBCTLogging:
 890         // With G1, don't generate the call if we statically know that the target is uninitialized
 891         if (!dest_uninitialized) {
 892           __ save_frame(0);
 893           // Save the necessary global regs... will be used after.
 894           if (addr->is_global()) {
 895             __ mov(addr, L0);
 896           }
 897           if (count->is_global()) {
 898             __ mov(count, L1);
 899           }
 900           __ mov(addr->after_save(), O0);
 901           // Get the count into O1
 902           __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre));
 903           __ delayed()->mov(count->after_save(), O1);
 904           if (addr->is_global()) {
 905             __ mov(L0, addr);
 906           }
 907           if (count->is_global()) {
 908             __ mov(L1, count);
 909           }
 910           __ restore();
 911         }
 912         break;
 913       case BarrierSet::CardTableForRS:
 914       case BarrierSet::CardTableExtension:
 915       case BarrierSet::ModRef:
 916         break;
 917       default:
 918         ShouldNotReachHere();
 919     }
 920   }
 921   //
 922   //  Generate post-write barrier for array.
 923   //
 924   //  Input:
 925   //     addr     - register containing starting address
 926   //     count    - register containing element count
 927   //     tmp      - scratch register
 928   //
 929   //  The input registers are overwritten.
 930   //
 931   void gen_write_ref_array_post_barrier(Register addr, Register count,
 932                                         Register tmp) {
 933     BarrierSet* bs = Universe::heap()->barrier_set();
 934 
 935     switch (bs->kind()) {
 936       case BarrierSet::G1SATBCTLogging:
 937         {
 938           // Get some new fresh output registers.
 939           __ save_frame(0);
 940           __ mov(addr->after_save(), O0);
 941           __ call(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post));
 942           __ delayed()->mov(count->after_save(), O1);
 943           __ restore();
 944         }
 945         break;
 946       case BarrierSet::CardTableForRS:
 947       case BarrierSet::CardTableExtension:
 948         {
 949           CardTableModRefBS* ct = barrier_set_cast<CardTableModRefBS>(bs);
 950           assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");
 951           assert_different_registers(addr, count, tmp);
 952 
 953           Label L_loop;
 954 
 955           __ sll_ptr(count, LogBytesPerHeapOop, count);
 956           __ sub(count, BytesPerHeapOop, count);
 957           __ add(count, addr, count);
 958           // Use two shifts to clear out those low order two bits! (Cannot opt. into 1.)
 959           __ srl_ptr(addr, CardTableModRefBS::card_shift, addr);
 960           __ srl_ptr(count, CardTableModRefBS::card_shift, count);
 961           __ sub(count, addr, count);
 962           AddressLiteral rs(ct->byte_map_base);
 963           __ set(rs, tmp);
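          // At this point 'addr' holds the index of the first card, 'count' the
          // number of additional cards, and 'tmp' the card table base; the loop
          // below dirties (stores zero into) every card spanned by the copy.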
 964         __ BIND(L_loop);
 965           __ stb(G0, tmp, addr);
 966           __ subcc(count, 1, count);
 967           __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
 968           __ delayed()->add(addr, 1, addr);
 969         }
 970         break;
 971       case BarrierSet::ModRef:
 972         break;
 973       default:
 974         ShouldNotReachHere();
 975     }
 976   }
 977 
 978   //
 979   // Generate main code for disjoint arraycopy
 980   //
 981   typedef void (StubGenerator::*CopyLoopFunc)(Register from, Register to, Register count, int count_dec,
 982                                               Label& L_loop, bool use_prefetch, bool use_bis);
 983 
 984   void disjoint_copy_core(Register from, Register to, Register count, int log2_elem_size,
 985                           int iter_size, StubGenerator::CopyLoopFunc copy_loop_func) {
 986     Label L_copy;
 987 
 988     assert(log2_elem_size <= 3, "the following code should be changed");
 989     int count_dec = 16>>log2_elem_size;
 990 
 991     int prefetch_dist = MAX2(ArraycopySrcPrefetchDistance, ArraycopyDstPrefetchDistance);
 992     assert(prefetch_dist < 4096, "invalid value");
 993     prefetch_dist = (prefetch_dist + (iter_size-1)) & (-iter_size); // round up to one iteration copy size
 994     int prefetch_count = (prefetch_dist >> log2_elem_size); // elements count
 995 
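     // Block-initializing stores (BIS) allocate each cache line without first
     // reading it from memory, so the tail of the destination must be copied
     // with ordinary stores to avoid zeroing bytes beyond the copied region,
     // and a StoreLoad membar is required once the BIS stores are done.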
 996     if (UseBlockCopy) {
 997       Label L_block_copy, L_block_copy_prefetch, L_skip_block_copy;
 998 
 999       // 64 bytes tail + bytes copied in one loop iteration
1000       int tail_size = 64 + iter_size;
1001       int block_copy_count = (MAX2(tail_size, (int)BlockCopyLowLimit)) >> log2_elem_size;
1002       // Use BIS copy only for big arrays since it requires membar.
1003       __ set(block_copy_count, O4);
1004       __ cmp_and_br_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_skip_block_copy);
1005       // This code is for disjoint source and destination:
1006       //   to <= from || to >= from+count
1007       // but BIS will stomp over 'from' if (to > from-tail_size && to <= from)
1008       __ sub(from, to, O4);
1009       __ srax(O4, 4, O4); // divide by 16 since the following short branch has only 5 bits for the immediate.
1010       __ cmp_and_br_short(O4, (tail_size>>4), Assembler::lessEqualUnsigned, Assembler::pn, L_skip_block_copy);
1011 
1012       __ wrasi(G0, Assembler::ASI_ST_BLKINIT_PRIMARY);
1013       // BIS should not be used to copy tail (64 bytes+iter_size)
1014       // to avoid zeroing of following values.
1015       __ sub(count, (tail_size>>log2_elem_size), count); // count is still positive >= 0
1016 
1017       if (prefetch_count > 0) { // rounded up to one iteration count
1018         // Do prefetching only if copy size is bigger
1019         // than prefetch distance.
1020         __ set(prefetch_count, O4);
1021         __ cmp_and_brx_short(count, O4, Assembler::less, Assembler::pt, L_block_copy);
1022         __ sub(count, prefetch_count, count);
1023 
1024         (this->*copy_loop_func)(from, to, count, count_dec, L_block_copy_prefetch, true, true);
1025         __ add(count, prefetch_count, count); // restore count
1026 
1027       } // prefetch_count > 0
1028 
1029       (this->*copy_loop_func)(from, to, count, count_dec, L_block_copy, false, true);
1030       __ add(count, (tail_size>>log2_elem_size), count); // restore count
1031 
1032       __ wrasi(G0, Assembler::ASI_PRIMARY_NOFAULT);
1033       // BIS needs membar.
1034       __ membar(Assembler::StoreLoad);
1035       // Copy tail
1036       __ ba_short(L_copy);
1037 
1038       __ BIND(L_skip_block_copy);
1039     } // UseBlockCopy
1040 
1041     if (prefetch_count > 0) { // rounded up to one iteration count
1042       // Do prefetching only if copy size is bigger
1043       // than prefetch distance.
1044       __ set(prefetch_count, O4);
1045       __ cmp_and_brx_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_copy);
1046       __ sub(count, prefetch_count, count);
1047 
1048       Label L_copy_prefetch;
1049       (this->*copy_loop_func)(from, to, count, count_dec, L_copy_prefetch, true, false);
1050       __ add(count, prefetch_count, count); // restore count
1051 
1052     } // prefetch_count > 0
1053 
1054     (this->*copy_loop_func)(from, to, count, count_dec, L_copy, false, false);
1055   }
1056 
1057 
1058 
1059   //
1060   // Helper methods for copy_16_bytes_forward_with_shift()
1061   //
1062   void copy_16_bytes_shift_loop(Register from, Register to, Register count, int count_dec,
1063                                 Label& L_loop, bool use_prefetch, bool use_bis) {
1064 
1065     const Register left_shift  = G1; // left  shift bit counter
1066     const Register right_shift = G5; // right shift bit counter
1067 
1068     __ align(OptoLoopAlignment);
1069     __ BIND(L_loop);
1070     if (use_prefetch) {
1071       if (ArraycopySrcPrefetchDistance > 0) {
1072         __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads);
1073       }
1074       if (ArraycopyDstPrefetchDistance > 0) {
1075         __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads);
1076       }
1077     }
1078     __ ldx(from, 0, O4);
1079     __ ldx(from, 8, G4);
1080     __ inc(to, 16);
1081     __ inc(from, 16);
1082     __ deccc(count, count_dec); // Can we do next iteration after this one?
1083     __ srlx(O4, right_shift, G3);
1084     __ bset(G3, O3);
1085     __ sllx(O4, left_shift,  O4);
1086     __ srlx(G4, right_shift, G3);
1087     __ bset(G3, O4);
1088     if (use_bis) {
1089       __ stxa(O3, to, -16);
1090       __ stxa(O4, to, -8);
1091     } else {
1092       __ stx(O3, to, -16);
1093       __ stx(O4, to, -8);
1094     }
1095     __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
1096     __ delayed()->sllx(G4, left_shift,  O3);
1097   }
1098 
1099   // Copy big chunks forward with shift
1100   //
1101   // Inputs:
1102   //   from      - source array address
1103   //   to        - destination array address aligned to 8 bytes
1104   //   count     - element count to copy, at least the number of elements equivalent to 16 bytes
1105   //   count_dec - decrement of the element count equivalent to 16 bytes
1106   //   L_copy_bytes - copy exit label
1107   //
1108   void copy_16_bytes_forward_with_shift(Register from, Register to,
1109                      Register count, int log2_elem_size, Label& L_copy_bytes) {
1110     Label L_aligned_copy, L_copy_last_bytes;
1111     assert(log2_elem_size <= 3, "the following code should be changed");
1112     int count_dec = 16>>log2_elem_size;
1113 
1114     // if both arrays have the same alignment mod 8, do 8 bytes aligned copy
1115     __ andcc(from, 7, G1); // misaligned bytes
1116     __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
1117     __ delayed()->nop();
1118 
1119     const Register left_shift  = G1; // left  shift bit counter
1120     const Register right_shift = G5; // right shift bit counter
1121 
1122     __ sll(G1, LogBitsPerByte, left_shift);
1123     __ mov(64, right_shift);
1124     __ sub(right_shift, left_shift, right_shift);
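     // 'left_shift' is the source misalignment in bits and 'right_shift' is its
     // complement to 64: each aligned 8-byte load contributes its high-order
     // bytes (shifted right) to complete the current output word and its
     // low-order bytes (shifted left) to start the next one.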
1125 
1126     //
1127     // Load 2 aligned 8-bytes chunks and use one from previous iteration
1128     // to form 2 aligned 8-bytes chunks to store.
1129     //
1130     __ dec(count, count_dec);   // Pre-decrement 'count'
1131     __ andn(from, 7, from);     // Align address
1132     __ ldx(from, 0, O3);
1133     __ inc(from, 8);
1134     __ sllx(O3, left_shift,  O3);
1135 
1136     disjoint_copy_core(from, to, count, log2_elem_size, 16, &StubGenerator::copy_16_bytes_shift_loop);
1137 
1138     __ inccc(count, count_dec>>1 ); // + 8 bytes
1139     __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
1140     __ delayed()->inc(count, count_dec>>1); // restore 'count'
1141 
1142     // copy 8 bytes, part of them already loaded in O3
1143     __ ldx(from, 0, O4);
1144     __ inc(to, 8);
1145     __ inc(from, 8);
1146     __ srlx(O4, right_shift, G3);
1147     __ bset(O3, G3);
1148     __ stx(G3, to, -8);
1149 
1150     __ BIND(L_copy_last_bytes);
1151     __ srl(right_shift, LogBitsPerByte, right_shift); // misaligned bytes
1152     __ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
1153     __ delayed()->sub(from, right_shift, from);       // restore address
1154 
1155     __ BIND(L_aligned_copy);
1156   }
1157 
1158   // Copy big chunks backward with shift
1159   //
1160   // Inputs:
1161   //   end_from  - source array end address
1162   //   end_to    - destination array end address aligned to 8 bytes
1163   //   count     - element count to copy, at least the number of elements equivalent to 16 bytes
1164   //   count_dec - decrement of the element count equivalent to 16 bytes
1165   //   L_aligned_copy - aligned copy exit label
1166   //   L_copy_bytes   - copy exit label
1167   //
1168   void copy_16_bytes_backward_with_shift(Register end_from, Register end_to,
1169                      Register count, int count_dec,
1170                      Label& L_aligned_copy, Label& L_copy_bytes) {
1171     Label L_loop, L_copy_last_bytes;
1172 
1173     // if both arrays have the same alignment mod 8, do 8 bytes aligned copy
1174       __ andcc(end_from, 7, G1); // misaligned bytes
1175       __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
1176       __ delayed()->deccc(count, count_dec); // Pre-decrement 'count'
1177 
1178     const Register left_shift  = G1; // left  shift bit counter
1179     const Register right_shift = G5; // right shift bit counter
1180 
1181       __ sll(G1, LogBitsPerByte, left_shift);
1182       __ mov(64, right_shift);
1183       __ sub(right_shift, left_shift, right_shift);
1184 
1185     //
1186     // Load 2 aligned 8-bytes chunks and use one from previous iteration
1187     // to form 2 aligned 8-bytes chunks to store.
1188     //
1189       __ andn(end_from, 7, end_from);     // Align address
1190       __ ldx(end_from, 0, O3);
1191       __ align(OptoLoopAlignment);
1192     __ BIND(L_loop);
1193       __ ldx(end_from, -8, O4);
1194       __ deccc(count, count_dec); // Can we do next iteration after this one?
1195       __ ldx(end_from, -16, G4);
1196       __ dec(end_to, 16);
1197       __ dec(end_from, 16);
1198       __ srlx(O3, right_shift, O3);
1199       __ sllx(O4, left_shift,  G3);
1200       __ bset(G3, O3);
1201       __ stx(O3, end_to, 8);
1202       __ srlx(O4, right_shift, O4);
1203       __ sllx(G4, left_shift,  G3);
1204       __ bset(G3, O4);
1205       __ stx(O4, end_to, 0);
1206       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
1207       __ delayed()->mov(G4, O3);
1208 
1209       __ inccc(count, count_dec>>1 ); // + 8 bytes
1210       __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
1211       __ delayed()->inc(count, count_dec>>1); // restore 'count'
1212 
1213       // copy 8 bytes, part of them already loaded in O3
1214       __ ldx(end_from, -8, O4);
1215       __ dec(end_to, 8);
1216       __ dec(end_from, 8);
1217       __ srlx(O3, right_shift, O3);
1218       __ sllx(O4, left_shift,  G3);
1219       __ bset(O3, G3);
1220       __ stx(G3, end_to, 0);
1221 
1222     __ BIND(L_copy_last_bytes);
1223       __ srl(left_shift, LogBitsPerByte, left_shift);    // misaligned bytes
1224       __ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
1225       __ delayed()->add(end_from, left_shift, end_from); // restore address
1226   }
1227 
1228   //
1229   //  Generate stub for disjoint byte copy.  If "aligned" is true, the
1230   //  "from" and "to" addresses are assumed to be heapword aligned.
1231   //
1232   // Arguments for generated stub:
1233   //      from:  O0
1234   //      to:    O1
1235   //      count: O2 treated as signed
1236   //
1237   address generate_disjoint_byte_copy(bool aligned, address *entry, const char *name) {
1238     __ align(CodeEntryAlignment);
1239     StubCodeMark mark(this, "StubRoutines", name);
1240     address start = __ pc();
1241 
1242     Label L_skip_alignment, L_align;
1243     Label L_copy_byte, L_copy_byte_loop, L_exit;
1244 
1245     const Register from      = O0;   // source array address
1246     const Register to        = O1;   // destination array address
1247     const Register count     = O2;   // elements count
1248     const Register offset    = O5;   // offset from start of arrays
1249     // O3, O4, G3, G4 are used as temp registers
1250 
1251     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
1252 
1253     if (entry != NULL) {
1254       *entry = __ pc();
1255       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1256       BLOCK_COMMENT("Entry:");
1257     }
1258 
1259     // for short arrays, just do single element copy
1260     __ cmp(count, 23); // 16 + 7
1261     __ brx(Assembler::less, false, Assembler::pn, L_copy_byte);
1262     __ delayed()->mov(G0, offset);
1263 
1264     if (aligned) {
1265       // 'aligned' == true when it is known statically during compilation
1266       // of this arraycopy call site that both 'from' and 'to' addresses
1267       // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
1268       //
1269       // Aligned arrays have 4-byte alignment in the 32-bit VM
1270       // and 8-byte alignment in the 64-bit VM, so we do this only in the 32-bit VM.
1271       //
1272 #ifndef _LP64
1273       // copy a 4-byte word if necessary to align 'to' to 8 bytes
1274       __ andcc(to, 7, G0);
1275       __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment);
1276       __ delayed()->ld(from, 0, O3);
1277       __ inc(from, 4);
1278       __ inc(to, 4);
1279       __ dec(count, 4);
1280       __ st(O3, to, -4);
1281     __ BIND(L_skip_alignment);
1282 #endif
1283     } else {
1284       // copy bytes to align 'to' on 8 byte boundary
1285       __ andcc(to, 7, G1); // misaligned bytes
1286       __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1287       __ delayed()->neg(G1);
1288       __ inc(G1, 8);       // bytes needed to reach the next 8-byte alignment
1289       __ sub(count, G1, count);
1290     __ BIND(L_align);
1291       __ ldub(from, 0, O3);
1292       __ deccc(G1);
1293       __ inc(from);
1294       __ stb(O3, to, 0);
1295       __ br(Assembler::notZero, false, Assembler::pt, L_align);
1296       __ delayed()->inc(to);
1297     __ BIND(L_skip_alignment);
1298     }
1299 #ifdef _LP64
1300     if (!aligned)
1301 #endif
1302     {
1303       // Copy with shift 16 bytes per iteration if arrays do not have
1304       // the same alignment mod 8, otherwise fall through to the next
1305       // code for aligned copy.
1306       // The compare above (count >= 23) guarantees 'count' >= 16 bytes.
1307       // Also jump over aligned copy after the copy with shift completed.
1308 
1309       copy_16_bytes_forward_with_shift(from, to, count, 0, L_copy_byte);
1310     }
1311 
1312     // Both arrays are 8-byte aligned; copy 16 bytes at a time
1313       __ and3(count, 7, G4); // Save count
1314       __ srl(count, 3, count);
1315      generate_disjoint_long_copy_core(aligned);
1316       __ mov(G4, count);     // Restore count
1317 
1318     // copy trailing bytes
1319     __ BIND(L_copy_byte);
1320       __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
1321       __ align(OptoLoopAlignment);
1322     __ BIND(L_copy_byte_loop);
1323       __ ldub(from, offset, O3);
1324       __ deccc(count);
1325       __ stb(O3, to, offset);
1326       __ brx(Assembler::notZero, false, Assembler::pt, L_copy_byte_loop);
1327       __ delayed()->inc(offset);
1328 
1329     __ BIND(L_exit);
1330       // O3, O4 are used as temp registers
1331       inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr, O3, O4);
1332       __ retl();
1333       __ delayed()->mov(G0, O0); // return 0
1334     return start;
1335   }
1336 
1337   //
1338   //  Generate stub for conjoint byte copy.  If "aligned" is true, the
1339   //  "from" and "to" addresses are assumed to be heapword aligned.
1340   //
1341   // Arguments for generated stub:
1342   //      from:  O0
1343   //      to:    O1
1344   //      count: O2 treated as signed
1345   //
1346   address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1347                                       address *entry, const char *name) {
1348     // Do reverse copy.
1349 
1350     __ align(CodeEntryAlignment);
1351     StubCodeMark mark(this, "StubRoutines", name);
1352     address start = __ pc();
1353 
1354     Label L_skip_alignment, L_align, L_aligned_copy;
1355     Label L_copy_byte, L_copy_byte_loop, L_exit;
1356 
1357     const Register from      = O0;   // source array address
1358     const Register to        = O1;   // destination array address
1359     const Register count     = O2;   // elements count
1360     const Register end_from  = from; // source array end address
1361     const Register end_to    = to;   // destination array end address
1362 
1363     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
1364 
1365     if (entry != NULL) {
1366       *entry = __ pc();
1367       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1368       BLOCK_COMMENT("Entry:");
1369     }
1370 
1371     array_overlap_test(nooverlap_target, 0);
1372 
1373     __ add(to, count, end_to);       // offset after last copied element
1374 
1375     // for short arrays, just do single element copy
1376     __ cmp(count, 23); // 16 + 7
1377     __ brx(Assembler::less, false, Assembler::pn, L_copy_byte);
1378     __ delayed()->add(from, count, end_from);
1379 
1380     {
1381       // Align the end of the arrays since it may not be aligned even
1382       // when the arrays themselves are aligned.
1383 
1384       // copy bytes to align 'end_to' on 8 byte boundary
1385       __ andcc(end_to, 7, G1); // misaligned bytes
1386       __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1387       __ delayed()->nop();
1388       __ sub(count, G1, count);
1389     __ BIND(L_align);
1390       __ dec(end_from);
1391       __ dec(end_to);
1392       __ ldub(end_from, 0, O3);
1393       __ deccc(G1);
1394       __ brx(Assembler::notZero, false, Assembler::pt, L_align);
1395       __ delayed()->stb(O3, end_to, 0);
1396     __ BIND(L_skip_alignment);
1397     }
1398 #ifdef _LP64
1399     if (aligned) {
1400       // Both arrays are aligned to 8-bytes in 64-bits VM.
1401       // The 'count' is decremented in copy_16_bytes_backward_with_shift()
1402       // in unaligned case.
1403       __ dec(count, 16);
1404     } else
1405 #endif
1406     {
1407       // Copy with shift 16 bytes per iteration if arrays do not have
1408       // the same alignment mod 8, otherwise jump to the next
1409       // code for aligned copy (subtracting 16 from 'count' before the jump).
1410       // The compare above (count >= 23) guarantees 'count' >= 16 bytes.
1411       // Also jump over aligned copy after the copy with shift completed.
1412 
1413       copy_16_bytes_backward_with_shift(end_from, end_to, count, 16,
1414                                         L_aligned_copy, L_copy_byte);
1415     }
1416     // copy 16 elements (16 bytes) at a time
1417       __ align(OptoLoopAlignment);
1418     __ BIND(L_aligned_copy);
1419       __ dec(end_from, 16);
1420       __ ldx(end_from, 8, O3);
1421       __ ldx(end_from, 0, O4);
1422       __ dec(end_to, 16);
1423       __ deccc(count, 16);
1424       __ stx(O3, end_to, 8);
1425       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
1426       __ delayed()->stx(O4, end_to, 0);
1427       __ inc(count, 16);
1428 
1429     // copy 1 element (1 byte) at a time
1430     __ BIND(L_copy_byte);
1431       __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
1432       __ align(OptoLoopAlignment);
1433     __ BIND(L_copy_byte_loop);
1434       __ dec(end_from);
1435       __ dec(end_to);
1436       __ ldub(end_from, 0, O4);
1437       __ deccc(count);
1438       __ brx(Assembler::greater, false, Assembler::pt, L_copy_byte_loop);
1439       __ delayed()->stb(O4, end_to, 0);
1440 
1441     __ BIND(L_exit);
1442     // O3, O4 are used as temp registers
1443     inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr, O3, O4);
1444     __ retl();
1445     __ delayed()->mov(G0, O0); // return 0
1446     return start;
1447   }
1448 
1449   //
1450   //  Generate stub for disjoint short copy.  If "aligned" is true, the
1451   //  "from" and "to" addresses are assumed to be heapword aligned.
1452   //
1453   // Arguments for generated stub:
1454   //      from:  O0
1455   //      to:    O1
1456   //      count: O2 treated as signed
1457   //
1458   address generate_disjoint_short_copy(bool aligned, address *entry, const char * name) {
1459     __ align(CodeEntryAlignment);
1460     StubCodeMark mark(this, "StubRoutines", name);
1461     address start = __ pc();
1462 
1463     Label L_skip_alignment, L_skip_alignment2;
1464     Label L_copy_2_bytes, L_copy_2_bytes_loop, L_exit;
1465 
1466     const Register from      = O0;   // source array address
1467     const Register to        = O1;   // destination array address
1468     const Register count     = O2;   // elements count
1469     const Register offset    = O5;   // offset from start of arrays
1470     // O3, O4, G3, G4 are used as temp registers
1471 
1472     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
1473 
1474     if (entry != NULL) {
1475       *entry = __ pc();
1476       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1477       BLOCK_COMMENT("Entry:");
1478     }
1479 
1480     // for short arrays, just do single element copy
1481     __ cmp(count, 11); // 8 + 3  (22 bytes)
1482     __ brx(Assembler::less, false, Assembler::pn, L_copy_2_bytes);
1483     __ delayed()->mov(G0, offset);
1484 
1485     if (aligned) {
1486       // 'aligned' == true when it is known statically during compilation
1487       // of this arraycopy call site that both 'from' and 'to' addresses
1488       // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
1489       //
1490       // Aligned arrays have 4 bytes alignment in 32-bits VM
1491       // and 8 bytes - in 64-bits VM.
1492       //
1493 #ifndef _LP64
      // copy a 2-element word if necessary to align 'to' to 8 bytes
1495       __ andcc(to, 7, G0);
1496       __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1497       __ delayed()->ld(from, 0, O3);
1498       __ inc(from, 4);
1499       __ inc(to, 4);
1500       __ dec(count, 2);
1501       __ st(O3, to, -4);
1502     __ BIND(L_skip_alignment);
1503 #endif
1504     } else {
      // copy 1 element if necessary to align 'to' on a 4-byte boundary
1506       __ andcc(to, 3, G0);
1507       __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1508       __ delayed()->lduh(from, 0, O3);
1509       __ inc(from, 2);
1510       __ inc(to, 2);
1511       __ dec(count);
1512       __ sth(O3, to, -2);
1513     __ BIND(L_skip_alignment);
1514 
1515       // copy 2 elements to align 'to' on an 8 byte boundary
1516       __ andcc(to, 7, G0);
1517       __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment2);
1518       __ delayed()->lduh(from, 0, O3);
1519       __ dec(count, 2);
1520       __ lduh(from, 2, O4);
1521       __ inc(from, 4);
1522       __ inc(to, 4);
1523       __ sth(O3, to, -4);
1524       __ sth(O4, to, -2);
1525     __ BIND(L_skip_alignment2);
1526     }
1527 #ifdef _LP64
1528     if (!aligned)
1529 #endif
1530     {
      // Copy with shift 16 bytes per iteration if arrays do not have
      // the same alignment mod 8, otherwise fall through to the next
      // code for aligned copy.
      // The compare above (count >= 11) guarantees 'count' >= 16 bytes.
      // Also jump over the aligned copy after the copy with shift completes.
1536 
1537       copy_16_bytes_forward_with_shift(from, to, count, 1, L_copy_2_bytes);
1538     }
1539 
    // Both arrays are 8-byte aligned here; copy 16 bytes at a time
1541       __ and3(count, 3, G4); // Save
1542       __ srl(count, 2, count);
1543      generate_disjoint_long_copy_core(aligned);
1544       __ mov(G4, count); // restore
1545 
1546     // copy 1 element at a time
1547     __ BIND(L_copy_2_bytes);
1548       __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
1549       __ align(OptoLoopAlignment);
1550     __ BIND(L_copy_2_bytes_loop);
1551       __ lduh(from, offset, O3);
1552       __ deccc(count);
1553       __ sth(O3, to, offset);
1554       __ brx(Assembler::notZero, false, Assembler::pt, L_copy_2_bytes_loop);
1555       __ delayed()->inc(offset, 2);
1556 
1557     __ BIND(L_exit);
1558       // O3, O4 are used as temp registers
1559       inc_counter_np(SharedRuntime::_jshort_array_copy_ctr, O3, O4);
1560       __ retl();
1561       __ delayed()->mov(G0, O0); // return 0
1562     return start;
1563   }
1564 
1565   //
  //  Generate stub for array fill (used for byte, short and int elements).
  //  If "aligned" is true, the "to" address is assumed to be heapword aligned.
1568   //
1569   // Arguments for generated stub:
1570   //      to:    O0
1571   //      value: O1
1572   //      count: O2 treated as signed
1573   //
1574   address generate_fill(BasicType t, bool aligned, const char* name) {
1575     __ align(CodeEntryAlignment);
1576     StubCodeMark mark(this, "StubRoutines", name);
1577     address start = __ pc();
1578 
    const Register to        = O0;   // destination array address
1580     const Register value     = O1;   // fill value
1581     const Register count     = O2;   // elements count
1582     // O3 is used as a temp register
1583 
1584     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
1585 
1586     Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
1587     Label L_fill_2_bytes, L_fill_elements, L_fill_32_bytes;
1588 
1589     int shift = -1;
1590     switch (t) {
      case T_BYTE:
        shift = 2;
        break;
      case T_SHORT:
        shift = 1;
        break;
      case T_INT:
        shift = 0;
        break;
1600       default: ShouldNotReachHere();
1601     }
1602 
1603     BLOCK_COMMENT("Entry:");
1604 
1605     if (t == T_BYTE) {
1606       // Zero extend value
1607       __ and3(value, 0xff, value);
1608       __ sllx(value, 8, O3);
1609       __ or3(value, O3, value);
1610     }
1611     if (t == T_SHORT) {
1612       // Zero extend value
1613       __ sllx(value, 48, value);
1614       __ srlx(value, 48, value);
1615     }
1616     if (t == T_BYTE || t == T_SHORT) {
1617       __ sllx(value, 16, O3);
1618       __ or3(value, O3, value);
1619     }
1620 
1621     __ cmp(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
1622     __ brx(Assembler::lessUnsigned, false, Assembler::pn, L_fill_elements); // use unsigned cmp
1623     __ delayed()->andcc(count, 1, G0);
1624 
1625     if (!aligned && (t == T_BYTE || t == T_SHORT)) {
      // align destination address on a 4-byte boundary
1627       if (t == T_BYTE) {
1628         // One byte misalignment happens only for byte arrays
1629         __ andcc(to, 1, G0);
1630         __ br(Assembler::zero, false, Assembler::pt, L_skip_align1);
1631         __ delayed()->nop();
1632         __ stb(value, to, 0);
1633         __ inc(to, 1);
1634         __ dec(count, 1);
1635         __ BIND(L_skip_align1);
1636       }
1637       // Two bytes misalignment happens only for byte and short (char) arrays
1638       __ andcc(to, 2, G0);
1639       __ br(Assembler::zero, false, Assembler::pt, L_skip_align2);
1640       __ delayed()->nop();
1641       __ sth(value, to, 0);
1642       __ inc(to, 2);
1643       __ dec(count, 1 << (shift - 1));
1644       __ BIND(L_skip_align2);
1645     }
1646 #ifdef _LP64
1647     if (!aligned) {
1648 #endif
1649     // align to 8 bytes, we know we are 4 byte aligned to start
1650     __ andcc(to, 7, G0);
1651     __ br(Assembler::zero, false, Assembler::pt, L_fill_32_bytes);
1652     __ delayed()->nop();
1653     __ stw(value, to, 0);
1654     __ inc(to, 4);
1655     __ dec(count, 1 << shift);
1656     __ BIND(L_fill_32_bytes);
1657 #ifdef _LP64
1658     }
1659 #endif
1660 
1661     if (t == T_INT) {
1662       // Zero extend value
1663       __ srl(value, 0, value);
1664     }
1665     if (t == T_BYTE || t == T_SHORT || t == T_INT) {
1666       __ sllx(value, 32, O3);
1667       __ or3(value, O3, value);
1668     }
1669 
1670     Label L_check_fill_8_bytes;
1671     // Fill 32-byte chunks
1672     __ subcc(count, 8 << shift, count);
1673     __ brx(Assembler::less, false, Assembler::pt, L_check_fill_8_bytes);
1674     __ delayed()->nop();
1675 
1676     Label L_fill_32_bytes_loop, L_fill_4_bytes;
1677     __ align(16);
1678     __ BIND(L_fill_32_bytes_loop);
1679 
1680     __ stx(value, to, 0);
1681     __ stx(value, to, 8);
1682     __ stx(value, to, 16);
1683     __ stx(value, to, 24);
1684 
1685     __ subcc(count, 8 << shift, count);
1686     __ brx(Assembler::greaterEqual, false, Assembler::pt, L_fill_32_bytes_loop);
1687     __ delayed()->add(to, 32, to);
1688 
1689     __ BIND(L_check_fill_8_bytes);
1690     __ addcc(count, 8 << shift, count);
1691     __ brx(Assembler::zero, false, Assembler::pn, L_exit);
1692     __ delayed()->subcc(count, 1 << (shift + 1), count);
1693     __ brx(Assembler::less, false, Assembler::pn, L_fill_4_bytes);
1694     __ delayed()->andcc(count, 1<<shift, G0);
1695 
1696     //
1697     // length is too short, just fill 8 bytes at a time
1698     //
1699     Label L_fill_8_bytes_loop;
1700     __ BIND(L_fill_8_bytes_loop);
1701     __ stx(value, to, 0);
1702     __ subcc(count, 1 << (shift + 1), count);
1703     __ brx(Assembler::greaterEqual, false, Assembler::pn, L_fill_8_bytes_loop);
1704     __ delayed()->add(to, 8, to);
1705 
1706     // fill trailing 4 bytes
1707     __ andcc(count, 1<<shift, G0);  // in delay slot of branches
1708     if (t == T_INT) {
1709       __ BIND(L_fill_elements);
1710     }
1711     __ BIND(L_fill_4_bytes);
1712     __ brx(Assembler::zero, false, Assembler::pt, L_fill_2_bytes);
1713     if (t == T_BYTE || t == T_SHORT) {
1714       __ delayed()->andcc(count, 1<<(shift-1), G0);
1715     } else {
1716       __ delayed()->nop();
1717     }
1718     __ stw(value, to, 0);
1719     if (t == T_BYTE || t == T_SHORT) {
1720       __ inc(to, 4);
1721       // fill trailing 2 bytes
1722       __ andcc(count, 1<<(shift-1), G0); // in delay slot of branches
1723       __ BIND(L_fill_2_bytes);
1724       __ brx(Assembler::zero, false, Assembler::pt, L_fill_byte);
1725       __ delayed()->andcc(count, 1, count);
1726       __ sth(value, to, 0);
1727       if (t == T_BYTE) {
1728         __ inc(to, 2);
1729         // fill trailing byte
1730         __ andcc(count, 1, count);  // in delay slot of branches
1731         __ BIND(L_fill_byte);
1732         __ brx(Assembler::zero, false, Assembler::pt, L_exit);
1733         __ delayed()->nop();
1734         __ stb(value, to, 0);
1735       } else {
1736         __ BIND(L_fill_byte);
1737       }
1738     } else {
1739       __ BIND(L_fill_2_bytes);
1740     }
1741     __ BIND(L_exit);
1742     __ retl();
1743     __ delayed()->nop();
1744 
    // Handle fills of less than 8 bytes.  Int is handled elsewhere.
1746     if (t == T_BYTE) {
1747       __ BIND(L_fill_elements);
1748       Label L_fill_2, L_fill_4;
1749       // in delay slot __ andcc(count, 1, G0);
1750       __ brx(Assembler::zero, false, Assembler::pt, L_fill_2);
1751       __ delayed()->andcc(count, 2, G0);
1752       __ stb(value, to, 0);
1753       __ inc(to, 1);
1754       __ BIND(L_fill_2);
1755       __ brx(Assembler::zero, false, Assembler::pt, L_fill_4);
1756       __ delayed()->andcc(count, 4, G0);
1757       __ stb(value, to, 0);
1758       __ stb(value, to, 1);
1759       __ inc(to, 2);
1760       __ BIND(L_fill_4);
1761       __ brx(Assembler::zero, false, Assembler::pt, L_exit);
1762       __ delayed()->nop();
1763       __ stb(value, to, 0);
1764       __ stb(value, to, 1);
1765       __ stb(value, to, 2);
1766       __ retl();
1767       __ delayed()->stb(value, to, 3);
1768     }
1769 
1770     if (t == T_SHORT) {
1771       Label L_fill_2;
1772       __ BIND(L_fill_elements);
1773       // in delay slot __ andcc(count, 1, G0);
1774       __ brx(Assembler::zero, false, Assembler::pt, L_fill_2);
1775       __ delayed()->andcc(count, 2, G0);
1776       __ sth(value, to, 0);
1777       __ inc(to, 2);
1778       __ BIND(L_fill_2);
1779       __ brx(Assembler::zero, false, Assembler::pt, L_exit);
1780       __ delayed()->nop();
1781       __ sth(value, to, 0);
1782       __ retl();
1783       __ delayed()->sth(value, to, 2);
1784     }
1785     return start;
1786   }
1787 
1788   //
1789   //  Generate stub for conjoint short copy.  If "aligned" is true, the
1790   //  "from" and "to" addresses are assumed to be heapword aligned.
1791   //
1792   // Arguments for generated stub:
1793   //      from:  O0
1794   //      to:    O1
1795   //      count: O2 treated as signed
1796   //
1797   address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1798                                        address *entry, const char *name) {
1799     // Do reverse copy.
1800 
1801     __ align(CodeEntryAlignment);
1802     StubCodeMark mark(this, "StubRoutines", name);
1803     address start = __ pc();
1804 
1805     Label L_skip_alignment, L_skip_alignment2, L_aligned_copy;
1806     Label L_copy_2_bytes, L_copy_2_bytes_loop, L_exit;
1807 
1808     const Register from      = O0;   // source array address
1809     const Register to        = O1;   // destination array address
1810     const Register count     = O2;   // elements count
1811     const Register end_from  = from; // source array end address
1812     const Register end_to    = to;   // destination array end address
1813 
1814     const Register byte_count = O3;  // bytes count to copy
1815 
1816     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
1817 
1818     if (entry != NULL) {
1819       *entry = __ pc();
1820       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1821       BLOCK_COMMENT("Entry:");
1822     }
1823 
1824     array_overlap_test(nooverlap_target, 1);
1825 
1826     __ sllx(count, LogBytesPerShort, byte_count);
1827     __ add(to, byte_count, end_to);  // offset after last copied element
1828 
1829     // for short arrays, just do single element copy
1830     __ cmp(count, 11); // 8 + 3  (22 bytes)
1831     __ brx(Assembler::less, false, Assembler::pn, L_copy_2_bytes);
1832     __ delayed()->add(from, byte_count, end_from);
1833 
1834     {
      // Align the ends of the arrays since they may not be aligned even
      // when the arrays themselves are aligned.
1837 
      // copy 1 element if necessary to align 'end_to' on a 4-byte boundary
1839       __ andcc(end_to, 3, G0);
1840       __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1841       __ delayed()->lduh(end_from, -2, O3);
1842       __ dec(end_from, 2);
1843       __ dec(end_to, 2);
1844       __ dec(count);
1845       __ sth(O3, end_to, 0);
1846     __ BIND(L_skip_alignment);
1847 
1848       // copy 2 elements to align 'end_to' on an 8 byte boundary
1849       __ andcc(end_to, 7, G0);
1850       __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment2);
1851       __ delayed()->lduh(end_from, -2, O3);
1852       __ dec(count, 2);
1853       __ lduh(end_from, -4, O4);
1854       __ dec(end_from, 4);
1855       __ dec(end_to, 4);
1856       __ sth(O3, end_to, 2);
1857       __ sth(O4, end_to, 0);
1858     __ BIND(L_skip_alignment2);
1859     }
1860 #ifdef _LP64
1861     if (aligned) {
1862       // Both arrays are aligned to 8-bytes in 64-bits VM.
1863       // The 'count' is decremented in copy_16_bytes_backward_with_shift()
1864       // in unaligned case.
1865       __ dec(count, 8);
1866     } else
1867 #endif
1868     {
      // Copy with shift 16 bytes per iteration if arrays do not have
      // the same alignment mod 8, otherwise jump to the next
      // code for aligned copy (subtracting 8 from 'count' before the jump).
      // The compare above (count >= 11) guarantees 'count' >= 16 bytes.
      // Also jump over the aligned copy after the copy with shift completes.
1874 
1875       copy_16_bytes_backward_with_shift(end_from, end_to, count, 8,
1876                                         L_aligned_copy, L_copy_2_bytes);
1877     }
    // copy 8 elements (16 bytes) at a time
1879       __ align(OptoLoopAlignment);
1880     __ BIND(L_aligned_copy);
1881       __ dec(end_from, 16);
1882       __ ldx(end_from, 8, O3);
1883       __ ldx(end_from, 0, O4);
1884       __ dec(end_to, 16);
1885       __ deccc(count, 8);
1886       __ stx(O3, end_to, 8);
1887       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
1888       __ delayed()->stx(O4, end_to, 0);
1889       __ inc(count, 8);
1890 
1891     // copy 1 element (2 bytes) at a time
1892     __ BIND(L_copy_2_bytes);
1893       __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
1894     __ BIND(L_copy_2_bytes_loop);
1895       __ dec(end_from, 2);
1896       __ dec(end_to, 2);
1897       __ lduh(end_from, 0, O4);
1898       __ deccc(count);
1899       __ brx(Assembler::greater, false, Assembler::pt, L_copy_2_bytes_loop);
1900       __ delayed()->sth(O4, end_to, 0);
1901 
1902     __ BIND(L_exit);
1903     // O3, O4 are used as temp registers
1904     inc_counter_np(SharedRuntime::_jshort_array_copy_ctr, O3, O4);
1905     __ retl();
1906     __ delayed()->mov(G0, O0); // return 0
1907     return start;
1908   }
1909 
1910   //
1911   // Helper methods for generate_disjoint_int_copy_core()
1912   //
1913   void copy_16_bytes_loop(Register from, Register to, Register count, int count_dec,
1914                           Label& L_loop, bool use_prefetch, bool use_bis) {
1915 
1916     __ align(OptoLoopAlignment);
1917     __ BIND(L_loop);
1918     if (use_prefetch) {
1919       if (ArraycopySrcPrefetchDistance > 0) {
1920         __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads);
1921       }
1922       if (ArraycopyDstPrefetchDistance > 0) {
1923         __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads);
1924       }
1925     }
1926     __ ldx(from, 4, O4);
1927     __ ldx(from, 12, G4);
1928     __ inc(to, 16);
1929     __ inc(from, 16);
1930     __ deccc(count, 4); // Can we do next iteration after this one?
1931 
1932     __ srlx(O4, 32, G3);
1933     __ bset(G3, O3);
1934     __ sllx(O4, 32, O4);
1935     __ srlx(G4, 32, G3);
1936     __ bset(G3, O4);
1937     if (use_bis) {
1938       __ stxa(O3, to, -16);
1939       __ stxa(O4, to, -8);
1940     } else {
1941       __ stx(O3, to, -16);
1942       __ stx(O4, to, -8);
1943     }
1944     __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
1945     __ delayed()->sllx(G4, 32,  O3);
1946 
1947   }
1948 
1949   //
1950   //  Generate core code for disjoint int copy (and oop copy on 32-bit).
1951   //  If "aligned" is true, the "from" and "to" addresses are assumed
1952   //  to be heapword aligned.
1953   //
1954   // Arguments:
1955   //      from:  O0
1956   //      to:    O1
1957   //      count: O2 treated as signed
1958   //
1959   void generate_disjoint_int_copy_core(bool aligned) {
1960 
1961     Label L_skip_alignment, L_aligned_copy;
1962     Label L_copy_4_bytes, L_copy_4_bytes_loop, L_exit;
1963 
1964     const Register from      = O0;   // source array address
1965     const Register to        = O1;   // destination array address
1966     const Register count     = O2;   // elements count
1967     const Register offset    = O5;   // offset from start of arrays
1968     // O3, O4, G3, G4 are used as temp registers
1969 
1970     // 'aligned' == true when it is known statically during compilation
1971     // of this arraycopy call site that both 'from' and 'to' addresses
1972     // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
1973     //
1974     // Aligned arrays have 4 bytes alignment in 32-bits VM
1975     // and 8 bytes - in 64-bits VM.
1976     //
1977 #ifdef _LP64
1978     if (!aligned)
1979 #endif
1980     {
      // The next check could be put under 'ifndef' since the code in
      // generate_disjoint_long_copy_core() has its own checks and sets 'offset'.
1983 
1984       // for short arrays, just do single element copy
1985       __ cmp(count, 5); // 4 + 1 (20 bytes)
1986       __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_4_bytes);
1987       __ delayed()->mov(G0, offset);
1988 
1989       // copy 1 element to align 'to' on an 8 byte boundary
1990       __ andcc(to, 7, G0);
1991       __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1992       __ delayed()->ld(from, 0, O3);
1993       __ inc(from, 4);
1994       __ inc(to, 4);
1995       __ dec(count);
1996       __ st(O3, to, -4);
1997     __ BIND(L_skip_alignment);
1998 
    // if arrays have the same alignment mod 8, do a 4-element copy
2000       __ andcc(from, 7, G0);
2001       __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
2002       __ delayed()->ld(from, 0, O3);
2003 
2004     //
2005     // Load 2 aligned 8-bytes chunks and use one from previous iteration
2006     // to form 2 aligned 8-bytes chunks to store.
2007     //
2008     // copy_16_bytes_forward_with_shift() is not used here since this
2009     // code is more optimal.
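    //
    // Rough illustration (big-endian SPARC; 'from' == 4 mod 8, 'to' 8-byte aligned):
    //   w0, w1 = two aligned 8-byte loads from 'from';
    //   store #1 = (carried_32 << 32) | hi32(w0);
    //   store #2 = (lo32(w0)   << 32) | hi32(w1);
    //   carried_32 = lo32(w1);
    // which is the srlx/sllx/bset sequence in copy_16_bytes_loop() above.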
2010 
2011     // copy with shift 4 elements (16 bytes) at a time
      __ dec(count, 4);   // The cmp at the beginning guarantees count >= 4
2013       __ sllx(O3, 32,  O3);
2014 
2015       disjoint_copy_core(from, to, count, 2, 16, &StubGenerator::copy_16_bytes_loop);
2016 
2017       __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes);
2018       __ delayed()->inc(count, 4); // restore 'count'
2019 
2020     __ BIND(L_aligned_copy);
2021     } // !aligned
2022 
2023     // copy 4 elements (16 bytes) at a time
2024       __ and3(count, 1, G4); // Save
2025       __ srl(count, 1, count);
2026      generate_disjoint_long_copy_core(aligned);
2027       __ mov(G4, count);     // Restore
2028 
2029     // copy 1 element at a time
2030     __ BIND(L_copy_4_bytes);
2031       __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
2032     __ BIND(L_copy_4_bytes_loop);
2033       __ ld(from, offset, O3);
2034       __ deccc(count);
2035       __ st(O3, to, offset);
2036       __ brx(Assembler::notZero, false, Assembler::pt, L_copy_4_bytes_loop);
2037       __ delayed()->inc(offset, 4);
2038     __ BIND(L_exit);
2039   }
2040 
2041   //
2042   //  Generate stub for disjoint int copy.  If "aligned" is true, the
2043   //  "from" and "to" addresses are assumed to be heapword aligned.
2044   //
2045   // Arguments for generated stub:
2046   //      from:  O0
2047   //      to:    O1
2048   //      count: O2 treated as signed
2049   //
2050   address generate_disjoint_int_copy(bool aligned, address *entry, const char *name) {
2051     __ align(CodeEntryAlignment);
2052     StubCodeMark mark(this, "StubRoutines", name);
2053     address start = __ pc();
2054 
2055     const Register count = O2;
2056     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
2057 
2058     if (entry != NULL) {
2059       *entry = __ pc();
2060       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2061       BLOCK_COMMENT("Entry:");
2062     }
2063 
2064     generate_disjoint_int_copy_core(aligned);
2065 
2066     // O3, O4 are used as temp registers
2067     inc_counter_np(SharedRuntime::_jint_array_copy_ctr, O3, O4);
2068     __ retl();
2069     __ delayed()->mov(G0, O0); // return 0
2070     return start;
2071   }
2072 
2073   //
2074   //  Generate core code for conjoint int copy (and oop copy on 32-bit).
2075   //  If "aligned" is true, the "from" and "to" addresses are assumed
2076   //  to be heapword aligned.
2077   //
2078   // Arguments:
2079   //      from:  O0
2080   //      to:    O1
2081   //      count: O2 treated as signed
2082   //
2083   void generate_conjoint_int_copy_core(bool aligned) {
2084     // Do reverse copy.
2085 
2086     Label L_skip_alignment, L_aligned_copy;
2087     Label L_copy_16_bytes,  L_copy_4_bytes, L_copy_4_bytes_loop, L_exit;
2088 
2089     const Register from      = O0;   // source array address
2090     const Register to        = O1;   // destination array address
2091     const Register count     = O2;   // elements count
2092     const Register end_from  = from; // source array end address
2093     const Register end_to    = to;   // destination array end address
2094     // O3, O4, O5, G3 are used as temp registers
2095 
2096     const Register byte_count = O3;  // bytes count to copy
2097 
2098       __ sllx(count, LogBytesPerInt, byte_count);
2099       __ add(to, byte_count, end_to); // offset after last copied element
2100 
2101       __ cmp(count, 5); // for short arrays, just do single element copy
2102       __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_4_bytes);
2103       __ delayed()->add(from, byte_count, end_from);
2104 
2105     // copy 1 element to align 'to' on an 8 byte boundary
2106       __ andcc(end_to, 7, G0);
2107       __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
2108       __ delayed()->nop();
2109       __ dec(count);
2110       __ dec(end_from, 4);
2111       __ dec(end_to,   4);
2112       __ ld(end_from, 0, O4);
2113       __ st(O4, end_to, 0);
2114     __ BIND(L_skip_alignment);
2115 
    // Check if 'end_from' and 'end_to' have the same alignment.
2117       __ andcc(end_from, 7, G0);
2118       __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
      __ delayed()->dec(count, 4); // The cmp at the start guarantees count >= 4
2120 
2121     // copy with shift 4 elements (16 bytes) at a time
2122     //
2123     // Load 2 aligned 8-bytes chunks and use one from previous iteration
2124     // to form 2 aligned 8-bytes chunks to store.
2125     //
2126       __ ldx(end_from, -4, O3);
2127       __ align(OptoLoopAlignment);
2128     __ BIND(L_copy_16_bytes);
2129       __ ldx(end_from, -12, O4);
2130       __ deccc(count, 4);
2131       __ ldx(end_from, -20, O5);
2132       __ dec(end_to, 16);
2133       __ dec(end_from, 16);
2134       __ srlx(O3, 32, O3);
2135       __ sllx(O4, 32, G3);
2136       __ bset(G3, O3);
2137       __ stx(O3, end_to, 8);
2138       __ srlx(O4, 32, O4);
2139       __ sllx(O5, 32, G3);
2140       __ bset(O4, G3);
2141       __ stx(G3, end_to, 0);
2142       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes);
2143       __ delayed()->mov(O5, O3);
2144 
2145       __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes);
2146       __ delayed()->inc(count, 4);
2147 
2148     // copy 4 elements (16 bytes) at a time
2149       __ align(OptoLoopAlignment);
2150     __ BIND(L_aligned_copy);
2151       __ dec(end_from, 16);
2152       __ ldx(end_from, 8, O3);
2153       __ ldx(end_from, 0, O4);
2154       __ dec(end_to, 16);
2155       __ deccc(count, 4);
2156       __ stx(O3, end_to, 8);
2157       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
2158       __ delayed()->stx(O4, end_to, 0);
2159       __ inc(count, 4);
2160 
2161     // copy 1 element (4 bytes) at a time
2162     __ BIND(L_copy_4_bytes);
2163       __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
2164     __ BIND(L_copy_4_bytes_loop);
2165       __ dec(end_from, 4);
2166       __ dec(end_to, 4);
2167       __ ld(end_from, 0, O4);
2168       __ deccc(count);
2169       __ brx(Assembler::greater, false, Assembler::pt, L_copy_4_bytes_loop);
2170       __ delayed()->st(O4, end_to, 0);
2171     __ BIND(L_exit);
2172   }
2173 
2174   //
2175   //  Generate stub for conjoint int copy.  If "aligned" is true, the
2176   //  "from" and "to" addresses are assumed to be heapword aligned.
2177   //
2178   // Arguments for generated stub:
2179   //      from:  O0
2180   //      to:    O1
2181   //      count: O2 treated as signed
2182   //
2183   address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
2184                                      address *entry, const char *name) {
2185     __ align(CodeEntryAlignment);
2186     StubCodeMark mark(this, "StubRoutines", name);
2187     address start = __ pc();
2188 
2189     assert_clean_int(O2, O3);     // Make sure 'count' is clean int.
2190 
2191     if (entry != NULL) {
2192       *entry = __ pc();
2193       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2194       BLOCK_COMMENT("Entry:");
2195     }
2196 
2197     array_overlap_test(nooverlap_target, 2);
2198 
2199     generate_conjoint_int_copy_core(aligned);
2200 
2201     // O3, O4 are used as temp registers
2202     inc_counter_np(SharedRuntime::_jint_array_copy_ctr, O3, O4);
2203     __ retl();
2204     __ delayed()->mov(G0, O0); // return 0
2205     return start;
2206   }
2207 
2208   //
2209   // Helper methods for generate_disjoint_long_copy_core()
2210   //
2211   void copy_64_bytes_loop(Register from, Register to, Register count, int count_dec,
2212                           Label& L_loop, bool use_prefetch, bool use_bis) {
2213     __ align(OptoLoopAlignment);
2214     __ BIND(L_loop);
2215     for (int off = 0; off < 64; off += 16) {
2216       if (use_prefetch && (off & 31) == 0) {
2217         if (ArraycopySrcPrefetchDistance > 0) {
2218           __ prefetch(from, ArraycopySrcPrefetchDistance+off, Assembler::severalReads);
2219         }
2220         if (ArraycopyDstPrefetchDistance > 0) {
2221           __ prefetch(to, ArraycopyDstPrefetchDistance+off, Assembler::severalWritesAndPossiblyReads);
2222         }
2223       }
2224       __ ldx(from,  off+0, O4);
2225       __ ldx(from,  off+8, O5);
2226       if (use_bis) {
2227         __ stxa(O4, to,  off+0);
2228         __ stxa(O5, to,  off+8);
2229       } else {
2230         __ stx(O4, to,  off+0);
2231         __ stx(O5, to,  off+8);
2232       }
2233     }
2234     __ deccc(count, 8);
2235     __ inc(from, 64);
2236     __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
2237     __ delayed()->inc(to, 64);
2238   }
2239 
2240   //
2241   //  Generate core code for disjoint long copy (and oop copy on 64-bit).
2242   //  "aligned" is ignored, because we must make the stronger
2243   //  assumption that both addresses are always 64-bit aligned.
2244   //
2245   // Arguments:
2246   //      from:  O0
2247   //      to:    O1
2248   //      count: O2 treated as signed
2249   //
2250   // count -= 2;
2251   // if ( count >= 0 ) { // >= 2 elements
2252   //   if ( count > 6) { // >= 8 elements
2253   //     count -= 6; // original count - 8
2254   //     do {
2255   //       copy_8_elements;
2256   //       count -= 8;
2257   //     } while ( count >= 0 );
2258   //     count += 6;
2259   //   }
2260   //   if ( count >= 0 ) { // >= 2 elements
2261   //     do {
2262   //       copy_2_elements;
2263   //     } while ( (count=count-2) >= 0 );
2264   //   }
2265   // }
2266   // count += 2;
2267   // if ( count != 0 ) { // 1 element left
2268   //   copy_1_element;
2269   // }
2270   //
2271   void generate_disjoint_long_copy_core(bool aligned) {
2272     Label L_copy_8_bytes, L_copy_16_bytes, L_exit;
2273     const Register from    = O0;  // source array address
2274     const Register to      = O1;  // destination array address
2275     const Register count   = O2;  // elements count
2276     const Register offset0 = O4;  // element offset
2277     const Register offset8 = O5;  // next element offset
2278 
2279     __ deccc(count, 2);
2280     __ mov(G0, offset0);   // offset from start of arrays (0)
2281     __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
2282     __ delayed()->add(offset0, 8, offset8);
2283 
    // Copy in 64-byte chunks
2285 
2286     const Register from64 = O3;  // source address
2287     const Register to64   = G3;  // destination address
2288     __ subcc(count, 6, O3);
2289     __ brx(Assembler::negative, false, Assembler::pt, L_copy_16_bytes );
2290     __ delayed()->mov(to,   to64);
2291     // Now we can use O4(offset0), O5(offset8) as temps
2292     __ mov(O3, count);
2293     // count >= 0 (original count - 8)
2294     __ mov(from, from64);
2295 
2296     disjoint_copy_core(from64, to64, count, 3, 64, &StubGenerator::copy_64_bytes_loop);
2297 
2298       // Restore O4(offset0), O5(offset8)
2299       __ sub(from64, from, offset0);
2300       __ inccc(count, 6); // restore count
2301       __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
2302       __ delayed()->add(offset0, 8, offset8);
2303 
      // Copy in 16-byte chunks
2305       __ align(OptoLoopAlignment);
2306     __ BIND(L_copy_16_bytes);
2307       __ ldx(from, offset0, O3);
2308       __ ldx(from, offset8, G3);
2309       __ deccc(count, 2);
2310       __ stx(O3, to, offset0);
2311       __ inc(offset0, 16);
2312       __ stx(G3, to, offset8);
2313       __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes);
2314       __ delayed()->inc(offset8, 16);
2315 
2316       // Copy last 8 bytes
2317     __ BIND(L_copy_8_bytes);
2318       __ inccc(count, 2);
2319       __ brx(Assembler::zero, true, Assembler::pn, L_exit );
2320       __ delayed()->mov(offset0, offset8); // Set O5 used by other stubs
2321       __ ldx(from, offset0, O3);
2322       __ stx(O3, to, offset0);
2323     __ BIND(L_exit);
2324   }
2325 
2326   //
2327   //  Generate stub for disjoint long copy.
2328   //  "aligned" is ignored, because we must make the stronger
2329   //  assumption that both addresses are always 64-bit aligned.
2330   //
2331   // Arguments for generated stub:
2332   //      from:  O0
2333   //      to:    O1
2334   //      count: O2 treated as signed
2335   //
2336   address generate_disjoint_long_copy(bool aligned, address *entry, const char *name) {
2337     __ align(CodeEntryAlignment);
2338     StubCodeMark mark(this, "StubRoutines", name);
2339     address start = __ pc();
2340 
2341     assert_clean_int(O2, O3);     // Make sure 'count' is clean int.
2342 
2343     if (entry != NULL) {
2344       *entry = __ pc();
2345       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2346       BLOCK_COMMENT("Entry:");
2347     }
2348 
2349     generate_disjoint_long_copy_core(aligned);
2350 
2351     // O3, O4 are used as temp registers
2352     inc_counter_np(SharedRuntime::_jlong_array_copy_ctr, O3, O4);
2353     __ retl();
2354     __ delayed()->mov(G0, O0); // return 0
2355     return start;
2356   }
2357 
2358   //
2359   //  Generate core code for conjoint long copy (and oop copy on 64-bit).
2360   //  "aligned" is ignored, because we must make the stronger
2361   //  assumption that both addresses are always 64-bit aligned.
2362   //
2363   // Arguments:
2364   //      from:  O0
2365   //      to:    O1
2366   //      count: O2 treated as signed
2367   //
2368   void generate_conjoint_long_copy_core(bool aligned) {
2369     // Do reverse copy.
2370     Label L_copy_8_bytes, L_copy_16_bytes, L_exit;
2371     const Register from    = O0;  // source array address
2372     const Register to      = O1;  // destination array address
2373     const Register count   = O2;  // elements count
2374     const Register offset8 = O4;  // element offset
2375     const Register offset0 = O5;  // previous element offset
2376 
2377       __ subcc(count, 1, count);
2378       __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_8_bytes );
2379       __ delayed()->sllx(count, LogBytesPerLong, offset8);
2380       __ sub(offset8, 8, offset0);
2381       __ align(OptoLoopAlignment);
2382     __ BIND(L_copy_16_bytes);
2383       __ ldx(from, offset8, O2);
2384       __ ldx(from, offset0, O3);
2385       __ stx(O2, to, offset8);
2386       __ deccc(offset8, 16);      // use offset8 as counter
2387       __ stx(O3, to, offset0);
2388       __ brx(Assembler::greater, false, Assembler::pt, L_copy_16_bytes);
2389       __ delayed()->dec(offset0, 16);
2390 
2391     __ BIND(L_copy_8_bytes);
2392       __ brx(Assembler::negative, false, Assembler::pn, L_exit );
2393       __ delayed()->nop();
2394       __ ldx(from, 0, O3);
2395       __ stx(O3, to, 0);
2396     __ BIND(L_exit);
2397   }
2398 
2399   //  Generate stub for conjoint long copy.
2400   //  "aligned" is ignored, because we must make the stronger
2401   //  assumption that both addresses are always 64-bit aligned.
2402   //
2403   // Arguments for generated stub:
2404   //      from:  O0
2405   //      to:    O1
2406   //      count: O2 treated as signed
2407   //
2408   address generate_conjoint_long_copy(bool aligned, address nooverlap_target,
2409                                       address *entry, const char *name) {
2410     __ align(CodeEntryAlignment);
2411     StubCodeMark mark(this, "StubRoutines", name);
2412     address start = __ pc();
2413 
2414     assert(aligned, "Should always be aligned");
2415 
2416     assert_clean_int(O2, O3);     // Make sure 'count' is clean int.
2417 
2418     if (entry != NULL) {
2419       *entry = __ pc();
2420       // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2421       BLOCK_COMMENT("Entry:");
2422     }
2423 
2424     array_overlap_test(nooverlap_target, 3);
2425 
2426     generate_conjoint_long_copy_core(aligned);
2427 
2428     // O3, O4 are used as temp registers
2429     inc_counter_np(SharedRuntime::_jlong_array_copy_ctr, O3, O4);
2430     __ retl();
2431     __ delayed()->mov(G0, O0); // return 0
2432     return start;
2433   }
2434 
2435   //  Generate stub for disjoint oop copy.  If "aligned" is true, the
2436   //  "from" and "to" addresses are assumed to be heapword aligned.
2437   //
2438   // Arguments for generated stub:
2439   //      from:  O0
2440   //      to:    O1
2441   //      count: O2 treated as signed
2442   //
2443   address generate_disjoint_oop_copy(bool aligned, address *entry, const char *name,
2444                                      bool dest_uninitialized = false) {
2445 
2446     const Register from  = O0;  // source array address
2447     const Register to    = O1;  // destination array address
2448     const Register count = O2;  // elements count
2449 
2450     __ align(CodeEntryAlignment);
2451     StubCodeMark mark(this, "StubRoutines", name);
2452     address start = __ pc();
2453 
2454     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
2455 
2456     if (entry != NULL) {
2457       *entry = __ pc();
2458       // caller can pass a 64-bit byte count here
2459       BLOCK_COMMENT("Entry:");
2460     }
2461 
2462     // save arguments for barrier generation
2463     __ mov(to, G1);
2464     __ mov(count, G5);
2465     gen_write_ref_array_pre_barrier(G1, G5, dest_uninitialized);
2466   #ifdef _LP64
2467     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
2468     if (UseCompressedOops) {
2469       generate_disjoint_int_copy_core(aligned);
2470     } else {
2471       generate_disjoint_long_copy_core(aligned);
2472     }
2473   #else
2474     generate_disjoint_int_copy_core(aligned);
2475   #endif
2476     // O0 is used as temp register
2477     gen_write_ref_array_post_barrier(G1, G5, O0);
2478 
2479     // O3, O4 are used as temp registers
2480     inc_counter_np(SharedRuntime::_oop_array_copy_ctr, O3, O4);
2481     __ retl();
2482     __ delayed()->mov(G0, O0); // return 0
2483     return start;
2484   }
2485 
2486   //  Generate stub for conjoint oop copy.  If "aligned" is true, the
2487   //  "from" and "to" addresses are assumed to be heapword aligned.
2488   //
2489   // Arguments for generated stub:
2490   //      from:  O0
2491   //      to:    O1
2492   //      count: O2 treated as signed
2493   //
2494   address generate_conjoint_oop_copy(bool aligned, address nooverlap_target,
2495                                      address *entry, const char *name,
2496                                      bool dest_uninitialized = false) {
2497 
2498     const Register from  = O0;  // source array address
2499     const Register to    = O1;  // destination array address
2500     const Register count = O2;  // elements count
2501 
2502     __ align(CodeEntryAlignment);
2503     StubCodeMark mark(this, "StubRoutines", name);
2504     address start = __ pc();
2505 
2506     assert_clean_int(count, O3);     // Make sure 'count' is clean int.
2507 
2508     if (entry != NULL) {
2509       *entry = __ pc();
2510       // caller can pass a 64-bit byte count here
2511       BLOCK_COMMENT("Entry:");
2512     }
2513 
2514     array_overlap_test(nooverlap_target, LogBytesPerHeapOop);
2515 
2516     // save arguments for barrier generation
2517     __ mov(to, G1);
2518     __ mov(count, G5);
2519     gen_write_ref_array_pre_barrier(G1, G5, dest_uninitialized);
2520 
2521   #ifdef _LP64
2522     if (UseCompressedOops) {
2523       generate_conjoint_int_copy_core(aligned);
2524     } else {
2525       generate_conjoint_long_copy_core(aligned);
2526     }
2527   #else
2528     generate_conjoint_int_copy_core(aligned);
2529   #endif
2530 
2531     // O0 is used as temp register
2532     gen_write_ref_array_post_barrier(G1, G5, O0);
2533 
2534     // O3, O4 are used as temp registers
2535     inc_counter_np(SharedRuntime::_oop_array_copy_ctr, O3, O4);
2536     __ retl();
2537     __ delayed()->mov(G0, O0); // return 0
2538     return start;
2539   }
2540 
2541 
2542   // Helper for generating a dynamic type check.
2543   // Smashes only the given temp registers.
2544   void generate_type_check(Register sub_klass,
2545                            Register super_check_offset,
2546                            Register super_klass,
2547                            Register temp,
2548                            Label& L_success) {
2549     assert_different_registers(sub_klass, super_check_offset, super_klass, temp);
2550 
2551     BLOCK_COMMENT("type_check:");
2552 
2553     Label L_miss, L_pop_to_miss;
2554 
2555     assert_clean_int(super_check_offset, temp);
2556 
2557     __ check_klass_subtype_fast_path(sub_klass, super_klass, temp, noreg,
2558                                      &L_success, &L_miss, NULL,
2559                                      super_check_offset);
2560 
2561     BLOCK_COMMENT("type_check_slow_path:");
2562     __ save_frame(0);
2563     __ check_klass_subtype_slow_path(sub_klass->after_save(),
2564                                      super_klass->after_save(),
2565                                      L0, L1, L2, L4,
2566                                      NULL, &L_pop_to_miss);
2567     __ ba(L_success);
2568     __ delayed()->restore();
2569 
2570     __ bind(L_pop_to_miss);
2571     __ restore();
2572 
2573     // Fall through on failure!
2574     __ BIND(L_miss);
2575   }
2576 
2577 
2578   //  Generate stub for checked oop copy.
2579   //
2580   // Arguments for generated stub:
2581   //      from:  O0
2582   //      to:    O1
2583   //      count: O2 treated as signed
2584   //      ckoff: O3 (super_check_offset)
2585   //      ckval: O4 (super_klass)
2586   //      ret:   O0 zero for success; (-1^K) where K is partial transfer count
2587   //
2588   address generate_checkcast_copy(const char *name, address *entry, bool dest_uninitialized = false) {
2589 
2590     const Register O0_from   = O0;      // source array address
2591     const Register O1_to     = O1;      // destination array address
2592     const Register O2_count  = O2;      // elements count
2593     const Register O3_ckoff  = O3;      // super_check_offset
2594     const Register O4_ckval  = O4;      // super_klass
2595 
2596     const Register O5_offset = O5;      // loop var, with stride wordSize
2597     const Register G1_remain = G1;      // loop var, with stride -1
2598     const Register G3_oop    = G3;      // actual oop copied
2599     const Register G4_klass  = G4;      // oop._klass
2600     const Register G5_super  = G5;      // oop._klass._primary_supers[ckval]
2601 
2602     __ align(CodeEntryAlignment);
2603     StubCodeMark mark(this, "StubRoutines", name);
2604     address start = __ pc();
2605 
2606 #ifdef ASSERT
2607     // We sometimes save a frame (see generate_type_check below).
2608     // If this will cause trouble, let's fail now instead of later.
2609     __ save_frame(0);
2610     __ restore();
2611 #endif
2612 
2613     assert_clean_int(O2_count, G1);     // Make sure 'count' is clean int.
2614 
2615 #ifdef ASSERT
2616     // caller guarantees that the arrays really are different
2617     // otherwise, we would have to make conjoint checks
2618     { Label L;
2619       __ mov(O3, G1);           // spill: overlap test smashes O3
2620       __ mov(O4, G4);           // spill: overlap test smashes O4
2621       array_overlap_test(L, LogBytesPerHeapOop);
2622       __ stop("checkcast_copy within a single array");
2623       __ bind(L);
2624       __ mov(G1, O3);
2625       __ mov(G4, O4);
2626     }
2627 #endif //ASSERT
2628 
2629     if (entry != NULL) {
2630       *entry = __ pc();
2631       // caller can pass a 64-bit byte count here (from generic stub)
2632       BLOCK_COMMENT("Entry:");
2633     }
2634     gen_write_ref_array_pre_barrier(O1_to, O2_count, dest_uninitialized);
2635 
2636     Label load_element, store_element, do_card_marks, fail, done;
2637     __ addcc(O2_count, 0, G1_remain);   // initialize loop index, and test it
2638     __ brx(Assembler::notZero, false, Assembler::pt, load_element);
2639     __ delayed()->mov(G0, O5_offset);   // offset from start of arrays
2640 
2641     // Empty array:  Nothing to do.
2642     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, O3, O4);
2643     __ retl();
2644     __ delayed()->set(0, O0);           // return 0 on (trivial) success
2645 
2646     // ======== begin loop ========
2647     // (Loop is rotated; its entry is load_element.)
2648     // Loop variables:
2649     //   (O5 = 0; ; O5 += wordSize) --- offset from src, dest arrays
2650     //   (O2 = len; O2 != 0; O2--) --- number of oops *remaining*
2651     //   G3, G4, G5 --- current oop, oop.klass, oop.klass.super
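    // Roughly, the rotated loop behaves like (illustration only):
    //   for (remain = count, off = 0; ; ) {
    //     oop o = from[off];
    //     if (o != NULL && !is_subtype_of(o->klass(), ckval)) break;  // -> fail
    //     to[off] = o; off += heapOopSize;
    //     if (--remain == 0) goto do_card_marks;                      // success
    //   }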
2652     __ align(OptoLoopAlignment);
2653 
2654     __ BIND(store_element);
2655     __ deccc(G1_remain);                // decrement the count
2656     __ store_heap_oop(G3_oop, O1_to, O5_offset); // store the oop
2657     __ inc(O5_offset, heapOopSize);     // step to next offset
2658     __ brx(Assembler::zero, true, Assembler::pt, do_card_marks);
    __ delayed()->set(0, O0);           // return 0 on success
2660 
2661     // ======== loop entry is here ========
2662     __ BIND(load_element);
2663     __ load_heap_oop(O0_from, O5_offset, G3_oop);  // load the oop
2664     __ br_null_short(G3_oop, Assembler::pt, store_element);
2665 
2666     __ load_klass(G3_oop, G4_klass); // query the object klass
2667 
2668     generate_type_check(G4_klass, O3_ckoff, O4_ckval, G5_super,
2669                         // branch to this on success:
2670                         store_element);
2671     // ======== end loop ========
2672 
2673     // It was a real error; we must depend on the caller to finish the job.
2674     // Register G1 has number of *remaining* oops, O2 number of *total* oops.
2675     // Emit GC store barriers for the oops we have copied (O2 minus G1),
2676     // and report their number to the caller.
2677     __ BIND(fail);
2678     __ subcc(O2_count, G1_remain, O2_count);
2679     __ brx(Assembler::zero, false, Assembler::pt, done);
2680     __ delayed()->not1(O2_count, O0);   // report (-1^K) to caller
2681 
2682     __ BIND(do_card_marks);
2683     gen_write_ref_array_post_barrier(O1_to, O2_count, O3);   // store check on O1[0..O2]
2684 
2685     __ BIND(done);
2686     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, O3, O4);
2687     __ retl();
    __ delayed()->nop();             // return value in O0
2689 
2690     return start;
2691   }
2692 
2693 
2694   //  Generate 'unsafe' array copy stub
2695   //  Though just as safe as the other stubs, it takes an unscaled
2696   //  size_t argument instead of an element count.
2697   //
2698   // Arguments for generated stub:
2699   //      from:  O0
2700   //      to:    O1
2701   //      count: O2 byte count, treated as ssize_t, can be zero
2702   //
2703   // Examines the alignment of the operands and dispatches
2704   // to a long, int, short, or byte copy loop.
2705   //
2706   address generate_unsafe_copy(const char* name,
2707                                address byte_copy_entry,
2708                                address short_copy_entry,
2709                                address int_copy_entry,
2710                                address long_copy_entry) {
2711 
2712     const Register O0_from   = O0;      // source array address
2713     const Register O1_to     = O1;      // destination array address
2714     const Register O2_count  = O2;      // elements count
2715 
2716     const Register G1_bits   = G1;      // test copy of low bits
2717 
2718     __ align(CodeEntryAlignment);
2719     StubCodeMark mark(this, "StubRoutines", name);
2720     address start = __ pc();
2721 
2722     // bump this on entry, not on exit:
2723     inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr, G1, G3);
2724 
2725     __ or3(O0_from, O1_to, G1_bits);
2726     __ or3(O2_count,       G1_bits, G1_bits);
2727 
2728     __ btst(BytesPerLong-1, G1_bits);
2729     __ br(Assembler::zero, true, Assembler::pt,
2730           long_copy_entry, relocInfo::runtime_call_type);
2731     // scale the count on the way out:
2732     __ delayed()->srax(O2_count, LogBytesPerLong, O2_count);
2733 
2734     __ btst(BytesPerInt-1, G1_bits);
2735     __ br(Assembler::zero, true, Assembler::pt,
2736           int_copy_entry, relocInfo::runtime_call_type);
2737     // scale the count on the way out:
2738     __ delayed()->srax(O2_count, LogBytesPerInt, O2_count);
2739 
2740     __ btst(BytesPerShort-1, G1_bits);
2741     __ br(Assembler::zero, true, Assembler::pt,
2742           short_copy_entry, relocInfo::runtime_call_type);
2743     // scale the count on the way out:
2744     __ delayed()->srax(O2_count, LogBytesPerShort, O2_count);
2745 
2746     __ br(Assembler::always, false, Assembler::pt,
2747           byte_copy_entry, relocInfo::runtime_call_type);
2748     __ delayed()->nop();
2749 
2750     return start;
2751   }
2752 
2753 
2754   // Perform range checks on the proposed arraycopy.
2755   // Kills the two temps, but nothing else.
2756   // Also, clean the sign bits of src_pos and dst_pos.
2757   void arraycopy_range_checks(Register src,     // source array oop (O0)
2758                               Register src_pos, // source position (O1)
                              Register dst,     // destination array oop (O2)
2760                               Register dst_pos, // destination position (O3)
2761                               Register length,  // length of copy (O4)
2762                               Register temp1, Register temp2,
2763                               Label& L_failed) {
2764     BLOCK_COMMENT("arraycopy_range_checks:");
2765 
2766     //  if (src_pos + length > arrayOop(src)->length() ) FAIL;
2767 
2768     const Register array_length = temp1;  // scratch
2769     const Register end_pos      = temp2;  // scratch
2770 
2771     // Note:  This next instruction may be in the delay slot of a branch:
2772     __ add(length, src_pos, end_pos);  // src_pos + length
2773     __ lduw(src, arrayOopDesc::length_offset_in_bytes(), array_length);
2774     __ cmp(end_pos, array_length);
2775     __ br(Assembler::greater, false, Assembler::pn, L_failed);
2776 
2777     //  if (dst_pos + length > arrayOop(dst)->length() ) FAIL;
2778     __ delayed()->add(length, dst_pos, end_pos); // dst_pos + length
2779     __ lduw(dst, arrayOopDesc::length_offset_in_bytes(), array_length);
2780     __ cmp(end_pos, array_length);
2781     __ br(Assembler::greater, false, Assembler::pn, L_failed);
2782 
2783     // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
2784     // Move with sign extension can be used since they are positive.
2785     __ delayed()->signx(src_pos, src_pos);
2786     __ signx(dst_pos, dst_pos);
2787 
2788     BLOCK_COMMENT("arraycopy_range_checks done");
2789   }
2790 
2791 
2792   //
2793   //  Generate generic array copy stubs
2794   //
2795   //  Input:
2796   //    O0    -  src oop
2797   //    O1    -  src_pos
2798   //    O2    -  dst oop
2799   //    O3    -  dst_pos
2800   //    O4    -  element count
2801   //
2802   //  Output:
2803   //    O0 ==  0  -  success
2804   //    O0 == -1  -  need to call System.arraycopy
2805   //
2806   address generate_generic_copy(const char *name,
2807                                 address entry_jbyte_arraycopy,
2808                                 address entry_jshort_arraycopy,
2809                                 address entry_jint_arraycopy,
2810                                 address entry_oop_arraycopy,
2811                                 address entry_jlong_arraycopy,
2812                                 address entry_checkcast_arraycopy) {
2813     Label L_failed, L_objArray;
2814 
2815     // Input registers
2816     const Register src      = O0;  // source array oop
2817     const Register src_pos  = O1;  // source position
2818     const Register dst      = O2;  // destination array oop
2819     const Register dst_pos  = O3;  // destination position
2820     const Register length   = O4;  // elements count
2821 
2822     // registers used as temp
2823     const Register G3_src_klass = G3; // source array klass
2824     const Register G4_dst_klass = G4; // destination array klass
2825     const Register G5_lh        = G5; // layout handler
2826     const Register O5_temp      = O5;
2827 
2828     __ align(CodeEntryAlignment);
2829     StubCodeMark mark(this, "StubRoutines", name);
2830     address start = __ pc();
2831 
2832     // bump this on entry, not on exit:
2833     inc_counter_np(SharedRuntime::_generic_array_copy_ctr, G1, G3);
2834 
2835     // In principle, the int arguments could be dirty.
2836     //assert_clean_int(src_pos, G1);
2837     //assert_clean_int(dst_pos, G1);
2838     //assert_clean_int(length, G1);
2839 
2840     //-----------------------------------------------------------------------
2841     // Assembler stubs will be used for this call to arraycopy
2842     // if the following conditions are met:
2843     //
2844     // (1) src and dst must not be null.
2845     // (2) src_pos must not be negative.
2846     // (3) dst_pos must not be negative.
2847     // (4) length  must not be negative.
2848     // (5) src klass and dst klass should be the same and not NULL.
2849     // (6) src and dst should be arrays.
2850     // (7) src_pos + length must not exceed length of src.
2851     // (8) dst_pos + length must not exceed length of dst.
2852     BLOCK_COMMENT("arraycopy initial argument checks");
2853 
2854     //  if (src == NULL) return -1;
2855     __ br_null(src, false, Assembler::pn, L_failed);
2856 
2857     //  if (src_pos < 0) return -1;
2858     __ delayed()->tst(src_pos);
2859     __ br(Assembler::negative, false, Assembler::pn, L_failed);
2860     __ delayed()->nop();
2861 
2862     //  if (dst == NULL) return -1;
2863     __ br_null(dst, false, Assembler::pn, L_failed);
2864 
2865     //  if (dst_pos < 0) return -1;
2866     __ delayed()->tst(dst_pos);
2867     __ br(Assembler::negative, false, Assembler::pn, L_failed);
2868 
2869     //  if (length < 0) return -1;
2870     __ delayed()->tst(length);
2871     __ br(Assembler::negative, false, Assembler::pn, L_failed);
2872 
2873     BLOCK_COMMENT("arraycopy argument klass checks");
2874     //  get src->klass()
2875     if (UseCompressedClassPointers) {
2876       __ delayed()->nop(); // ??? not good
2877       __ load_klass(src, G3_src_klass);
2878     } else {
2879       __ delayed()->ld_ptr(src, oopDesc::klass_offset_in_bytes(), G3_src_klass);
2880     }
2881 
2882 #ifdef ASSERT
2883     //  assert(src->klass() != NULL);
2884     BLOCK_COMMENT("assert klasses not null");
2885     { Label L_a, L_b;
2886       __ br_notnull_short(G3_src_klass, Assembler::pt, L_b); // it is broken if klass is NULL
2887       __ bind(L_a);
2888       __ stop("broken null klass");
2889       __ bind(L_b);
2890       __ load_klass(dst, G4_dst_klass);
2891       __ br_null(G4_dst_klass, false, Assembler::pn, L_a); // this would be broken also
2892       __ delayed()->mov(G0, G4_dst_klass);      // scribble the temp
2893       BLOCK_COMMENT("assert done");
2894     }
2895 #endif
2896 
2897     // Load layout helper
2898     //
2899     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2900     // 32        30    24            16              8     2                 0
2901     //
2902     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2903     //
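    // Illustrative decode of the fields used below (a sketch only, mirroring
    // the shift/mask constants applied by the assembly further down):
    //
    //   juint tag      = ((juint)lh) >> Klass::_lh_array_tag_shift;           // 0x3 / 0x2 / non-array
    //   int   hdr_size = (lh >> Klass::_lh_header_size_shift) & Klass::_lh_header_size_mask;
    //   int   log2_esz = lh & Klass::_lh_log2_element_size_mask;
    //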
2904 
2905     int lh_offset = in_bytes(Klass::layout_helper_offset());
2906 
2907     // Load the 32-bit signed value. Use the br() instruction with it to check icc.
2908     __ lduw(G3_src_klass, lh_offset, G5_lh);
2909 
2910     if (UseCompressedClassPointers) {
2911       __ load_klass(dst, G4_dst_klass);
2912     }
2913     // Handle objArrays completely differently...
2914     juint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2915     __ set(objArray_lh, O5_temp);
2916     __ cmp(G5_lh,       O5_temp);
2917     __ br(Assembler::equal, false, Assembler::pt, L_objArray);
2918     if (UseCompressedClassPointers) {
2919       __ delayed()->nop();
2920     } else {
2921       __ delayed()->ld_ptr(dst, oopDesc::klass_offset_in_bytes(), G4_dst_klass);
2922     }
2923 
2924     //  if (src->klass() != dst->klass()) return -1;
2925     __ cmp_and_brx_short(G3_src_klass, G4_dst_klass, Assembler::notEqual, Assembler::pn, L_failed);
2926 
2927     //  if (!src->is_Array()) return -1;
2928     __ cmp(G5_lh, Klass::_lh_neutral_value); // < 0
2929     __ br(Assembler::greaterEqual, false, Assembler::pn, L_failed);
2930 
2931     // At this point, it is known to be a typeArray (array_tag 0x3).
2932 #ifdef ASSERT
2933     __ delayed()->nop();
2934     { Label L;
2935       jint lh_prim_tag_in_place = (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2936       __ set(lh_prim_tag_in_place, O5_temp);
2937       __ cmp(G5_lh,                O5_temp);
2938       __ br(Assembler::greaterEqual, false, Assembler::pt, L);
2939       __ delayed()->nop();
2940       __ stop("must be a primitive array");
2941       __ bind(L);
2942     }
2943 #else
2944     __ delayed();                               // match next insn to prev branch
2945 #endif
2946 
2947     arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
2948                            O5_temp, G4_dst_klass, L_failed);
2949 
2950     // TypeArrayKlass
2951     //
2952     // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2953     // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2954     //
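    // Worked example (sketch): for a jint[] source with src_pos == 2,
    // log2elemsize == LogBytesPerInt == 2, so
    //   src_addr = src + array_header_in_bytes() + (2 << 2)
    // where both the header size and the element-size shift come from the
    // layout helper loaded above.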
2955 
2956     const Register G4_offset = G4_dst_klass;    // array offset
2957     const Register G3_elsize = G3_src_klass;    // log2 element size
2958 
2959     __ srl(G5_lh, Klass::_lh_header_size_shift, G4_offset);
2960     __ and3(G4_offset, Klass::_lh_header_size_mask, G4_offset); // array_offset
2961     __ add(src, G4_offset, src);       // src array offset
2962     __ add(dst, G4_offset, dst);       // dst array offset
2963     __ and3(G5_lh, Klass::_lh_log2_element_size_mask, G3_elsize); // log2 element size
2964 
2965     // The following registers must be set before the jump to the corresponding stub.
2966     const Register from     = O0;  // source array address
2967     const Register to       = O1;  // destination array address
2968     const Register count    = O2;  // elements count
2969 
2970     // 'from', 'to', 'count' must be set in this order since they alias
2971     // 'src', 'src_pos', 'dst' (O0, O1, O2) and an earlier assignment must not clobber an input still needed by a later one.
2972 
2973     BLOCK_COMMENT("scale indexes to element size");
2974     __ sll_ptr(src_pos, G3_elsize, src_pos);
2975     __ sll_ptr(dst_pos, G3_elsize, dst_pos);
2976     __ add(src, src_pos, from);       // src_addr
2977     __ add(dst, dst_pos, to);         // dst_addr
2978 
2979     BLOCK_COMMENT("choose copy loop based on element size");
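    // (G3_elsize holds log2 of the element size: 0 = byte, 1 = short, 2 = int,
    //  3 = long; each compare below dispatches to the matching copy stub.)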
2980     __ cmp(G3_elsize, 0);
2981     __ br(Assembler::equal, true, Assembler::pt, entry_jbyte_arraycopy);
2982     __ delayed()->signx(length, count); // length
2983 
2984     __ cmp(G3_elsize, LogBytesPerShort);
2985     __ br(Assembler::equal, true, Assembler::pt, entry_jshort_arraycopy);
2986     __ delayed()->signx(length, count); // length
2987 
2988     __ cmp(G3_elsize, LogBytesPerInt);
2989     __ br(Assembler::equal, true, Assembler::pt, entry_jint_arraycopy);
2990     __ delayed()->signx(length, count); // length
2991 #ifdef ASSERT
2992     { Label L;
2993       __ cmp_and_br_short(G3_elsize, LogBytesPerLong, Assembler::equal, Assembler::pt, L);
2994       __ stop("must be long copy, but elsize is wrong");
2995       __ bind(L);
2996     }
2997 #endif
2998     __ br(Assembler::always, false, Assembler::pt, entry_jlong_arraycopy);
2999     __ delayed()->signx(length, count); // length
3000 
3001     // ObjArrayKlass
3002   __ BIND(L_objArray);
3003     // live at this point:  G3_src_klass, G4_dst_klass, src[_pos], dst[_pos], length
3004 
3005     Label L_plain_copy, L_checkcast_copy;
3006     //  test array classes for subtyping
3007     __ cmp(G3_src_klass, G4_dst_klass);         // usual case is exact equality
3008     __ brx(Assembler::notEqual, true, Assembler::pn, L_checkcast_copy);
3009     __ delayed()->lduw(G4_dst_klass, lh_offset, O5_temp); // hoisted from below
3010 
3011     // Identically typed arrays can be copied without element-wise checks.
3012     arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
3013                            O5_temp, G5_lh, L_failed);
3014 
3015     __ add(src, arrayOopDesc::base_offset_in_bytes(T_OBJECT), src); //src offset
3016     __ add(dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT), dst); //dst offset
3017     __ sll_ptr(src_pos, LogBytesPerHeapOop, src_pos);
3018     __ sll_ptr(dst_pos, LogBytesPerHeapOop, dst_pos);
3019     __ add(src, src_pos, from);       // src_addr
3020     __ add(dst, dst_pos, to);         // dst_addr
3021   __ BIND(L_plain_copy);
3022     __ br(Assembler::always, false, Assembler::pt, entry_oop_arraycopy);
3023     __ delayed()->signx(length, count); // length
3024 
3025   __ BIND(L_checkcast_copy);
3026     // live at this point:  G3_src_klass, G4_dst_klass
3027     {
3028       // Before looking at dst.length, make sure dst is also an objArray.
3029       // lduw(G4_dst_klass, lh_offset, O5_temp); // hoisted to delay slot
3030       __ cmp(G5_lh,                    O5_temp);
3031       __ br(Assembler::notEqual, false, Assembler::pn, L_failed);
3032 
3033       // It is safe to examine both src.length and dst.length.
3034       __ delayed();                             // match next insn to prev branch
3035       arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
3036                              O5_temp, G5_lh, L_failed);
3037 
3038       // Marshal the base address arguments now, freeing registers.
3039       __ add(src, arrayOopDesc::base_offset_in_bytes(T_OBJECT), src); //src offset
3040       __ add(dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT), dst); //dst offset
3041       __ sll_ptr(src_pos, LogBytesPerHeapOop, src_pos);
3042       __ sll_ptr(dst_pos, LogBytesPerHeapOop, dst_pos);
3043       __ add(src, src_pos, from);               // src_addr
3044       __ add(dst, dst_pos, to);                 // dst_addr
3045       __ signx(length, count);                  // length (reloaded)
3046 
3047       Register sco_temp = O3;                   // this register is free now
3048       assert_different_registers(from, to, count, sco_temp,
3049                                  G4_dst_klass, G3_src_klass);
3050 
3051       // Generate the type check.
3052       int sco_offset = in_bytes(Klass::super_check_offset_offset());
3053       __ lduw(G4_dst_klass, sco_offset, sco_temp);
3054       generate_type_check(G3_src_klass, sco_temp, G4_dst_klass,
3055                           O5_temp, L_plain_copy);
3056 
3057       // Fetch destination element klass from the ObjArrayKlass header.
3058       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
3059 
3060       // the checkcast_copy loop needs two extra arguments:
3061       __ ld_ptr(G4_dst_klass, ek_offset, O4);   // dest elem klass
3062       // lduw(O4, sco_offset, O3);              // sco of elem klass
3063 
3064       __ br(Assembler::always, false, Assembler::pt, entry_checkcast_arraycopy);
3065       __ delayed()->lduw(O4, sco_offset, O3);
3066     }
3067 
3068   __ BIND(L_failed);
3069     __ retl();
3070     __ delayed()->sub(G0, 1, O0); // return -1
3071     return start;
3072   }
3073 
3074   //
3075   //  Generate stub for heap zeroing.
3076   //  "to" address is aligned to jlong (8 bytes).
3077   //
3078   // Arguments for generated stub:
3079   //      to:    O0
3080   //      count: O1 treated as signed (count of HeapWords)
3081   //             count could be 0
3082   //
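  // Conceptually (a sketch only; the stub actually uses BIS block-initializing
  // stores via bis_zeroing below) the routine behaves like:
  //
  //   void zero_aligned_words(HeapWord* to, intptr_t count) {
  //     memset(to, 0, count * HeapWordSize);   // count may be 0
  //   }
  //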
3083   address generate_zero_aligned_words(const char* name) {
3084     __ align(CodeEntryAlignment);
3085     StubCodeMark mark(this, "StubRoutines", name);
3086     address start = __ pc();
3087 
3088     const Register to    = O0;   // address of the memory to zero
3089     const Register count = O1;   // HeapWords count
3090     const Register temp  = O2;   // scratch
3091 
3092     Label Ldone;
3093     __ sllx(count, LogHeapWordSize, count); // to bytes count
3094     // Use BIS for zeroing
3095     __ bis_zeroing(to, count, temp, Ldone);
3096     __ bind(Ldone);
3097     __ retl();
3098     __ delayed()->nop();
3099     return start;
3100   }
3101 
3102   void generate_arraycopy_stubs() {
3103     address entry;
3104     address entry_jbyte_arraycopy;
3105     address entry_jshort_arraycopy;
3106     address entry_jint_arraycopy;
3107     address entry_oop_arraycopy;
3108     address entry_jlong_arraycopy;
3109     address entry_checkcast_arraycopy;
3110 
3111     //*** jbyte
3112     // Always need aligned and unaligned versions
3113     StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
3114                                                                                   "jbyte_disjoint_arraycopy");
3115     StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
3116                                                                                   &entry_jbyte_arraycopy,
3117                                                                                   "jbyte_arraycopy");
3118     StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
3119                                                                                   "arrayof_jbyte_disjoint_arraycopy");
3120     StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
3121                                                                                   "arrayof_jbyte_arraycopy");
3122 
3123     //*** jshort
3124     // Always need aligned and unaligned versions
3125     StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
3126                                                                                     "jshort_disjoint_arraycopy");
3127     StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
3128                                                                                     &entry_jshort_arraycopy,
3129                                                                                     "jshort_arraycopy");
3130     StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
3131                                                                                     "arrayof_jshort_disjoint_arraycopy");
3132     StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
3133                                                                                     "arrayof_jshort_arraycopy");
3134 
3135     //*** jint
3136     // Aligned versions
3137     StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
3138                                                                                 "arrayof_jint_disjoint_arraycopy");
3139     StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
3140                                                                                 "arrayof_jint_arraycopy");
3141 #ifdef _LP64
3142     // In 64-bit we need both aligned and unaligned versions of jint arraycopy.
3143     // entry_jint_arraycopy always points to the unaligned version (notice that we overwrite it).
3144     StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
3145                                                                                 "jint_disjoint_arraycopy");
3146     StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
3147                                                                                 &entry_jint_arraycopy,
3148                                                                                 "jint_arraycopy");
3149 #else
3150     // In 32-bit, jints are always HeapWordSize aligned, so always use the aligned version
3151     // (in fact, in 32-bit we always have a pre-loop part even in the aligned version,
3152     //  because it uses 64-bit loads/stores, so the aligned flag is actually ignored).
3153     StubRoutines::_jint_disjoint_arraycopy = StubRoutines::_arrayof_jint_disjoint_arraycopy;
3154     StubRoutines::_jint_arraycopy          = StubRoutines::_arrayof_jint_arraycopy;
3155 #endif
3156 
3157 
3158     //*** jlong
3159     // It is always aligned
3160     StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
3161                                                                                   "arrayof_jlong_disjoint_arraycopy");
3162     StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
3163                                                                                   "arrayof_jlong_arraycopy");
3164     StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
3165     StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;
3166 
3167 
3168     //*** oops
3169     // Aligned versions
3170     StubRoutines::_arrayof_oop_disjoint_arraycopy        = generate_disjoint_oop_copy(true, &entry,
3171                                                                                       "arrayof_oop_disjoint_arraycopy");
3172     StubRoutines::_arrayof_oop_arraycopy                 = generate_conjoint_oop_copy(true, entry, &entry_oop_arraycopy,
3173                                                                                       "arrayof_oop_arraycopy");
3174     // Aligned versions without pre-barriers
3175     StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(true, &entry,
3176                                                                                       "arrayof_oop_disjoint_arraycopy_uninit",
3177                                                                                       /*dest_uninitialized*/true);
3178     StubRoutines::_arrayof_oop_arraycopy_uninit          = generate_conjoint_oop_copy(true, entry, NULL,
3179                                                                                       "arrayof_oop_arraycopy_uninit",
3180                                                                                       /*dest_uninitialized*/true);
3181 #ifdef _LP64
3182     if (UseCompressedOops) {
3183       // With compressed oops we need unaligned versions; notice that we overwrite entry_oop_arraycopy.
3184       StubRoutines::_oop_disjoint_arraycopy            = generate_disjoint_oop_copy(false, &entry,
3185                                                                                     "oop_disjoint_arraycopy");
3186       StubRoutines::_oop_arraycopy                     = generate_conjoint_oop_copy(false, entry, &entry_oop_arraycopy,
3187                                                                                     "oop_arraycopy");
3188       // Unaligned versions without pre-barriers
3189       StubRoutines::_oop_disjoint_arraycopy_uninit     = generate_disjoint_oop_copy(false, &entry,
3190                                                                                     "oop_disjoint_arraycopy_uninit",
3191                                                                                     /*dest_uninitialized*/true);
3192       StubRoutines::_oop_arraycopy_uninit              = generate_conjoint_oop_copy(false, entry, NULL,
3193                                                                                     "oop_arraycopy_uninit",
3194                                                                                     /*dest_uninitialized*/true);
3195     } else
3196 #endif
3197     {
3198       // oop arraycopy is always aligned on 32-bit, and on 64-bit without compressed oops
3199       StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
3200       StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
3201       StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
3202       StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;
3203     }
3204 
3205     StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
3206     StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
3207                                                                         /*dest_uninitialized*/true);
3208 
3209     StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
3210                                                               entry_jbyte_arraycopy,
3211                                                               entry_jshort_arraycopy,
3212                                                               entry_jint_arraycopy,
3213                                                               entry_jlong_arraycopy);
3214     StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
3215                                                                entry_jbyte_arraycopy,
3216                                                                entry_jshort_arraycopy,
3217                                                                entry_jint_arraycopy,
3218                                                                entry_oop_arraycopy,
3219                                                                entry_jlong_arraycopy,
3220                                                                entry_checkcast_arraycopy);
3221 
3222     StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
3223     StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
3224     StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
3225     StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
3226     StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
3227     StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
3228 
3229     if (UseBlockZeroing) {
3230       StubRoutines::_zero_aligned_words = generate_zero_aligned_words("zero_aligned_words");
3231     }
3232   }
3233 
3234   address generate_aescrypt_encryptBlock() {
3235     // required since we read the expanded key 'int' array starting at its first element, with no further alignment handling
3236     assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
3237            "the following code assumes that first element of an int array is aligned to 8 bytes");
3238     __ align(CodeEntryAlignment);
3239     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
3240     Label L_load_misaligned_input, L_load_expanded_key, L_doLast128bit, L_storeOutput, L_store_misaligned_output;
3241     address start = __ pc();
3242     Register from = O0; // source byte array
3243     Register to = O1;   // destination byte array
3244     Register key = O2;  // expanded key array
3245     const Register keylen = O4; //reg for storing expanded key array length
3246 
3247     // read expanded key length
3248     __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
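    // (The expanded key int[] holds 4*(rounds+1) words, so keylen reads back as
    //  44, 52 or 60 for 128-, 192- and 256-bit keys; the compares below
    //  dispatch on those values.)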
3249 
3250     // Method to address arbitrary alignment for load instructions:
3251     // Check last 3 bits of 'from' address to see if it is aligned to 8-byte boundary
3252     // If zero/aligned then continue with double FP load instructions
3253     // If not zero/mis-aligned then alignaddr will set GSR.align with number of bytes to skip during faligndata
3254     // alignaddr will also convert arbitrary aligned 'from' address to nearest 8-byte aligned address
3255     // load 3 * 8-byte components (to read 16 bytes input) in 3 different FP regs starting at this aligned address
3256     // faligndata will then extract (based on GSR.align value) the appropriate 8 bytes from the 2 source regs
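    //
    // Loose C-level sketch of the scheme above (orientation only; the real work
    // is done by the VIS alignaddr/faligndata instructions):
    //
    //   if ((from & 7) == 0) {
    //     // two aligned 8-byte FP loads cover the 16-byte block
    //   } else {
    //     // base = from & ~7 (what alignaddr produces), skip = from & 7 (kept in GSR.align);
    //     // load 24 bytes at base into three FP registers, then faligndata
    //     // extracts the 16 payload bytes using GSR.align
    //   }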
3257 
3258     // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
3259     __ andcc(from, 7, G0);
3260     __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input);
3261     __ delayed()->alignaddr(from, G0, from);
3262 
3263     // aligned case: load input into F54-F56
3264     __ ldf(FloatRegisterImpl::D, from, 0, F54);
3265     __ ldf(FloatRegisterImpl::D, from, 8, F56);
3266     __ ba_short(L_load_expanded_key);
3267 
3268     __ BIND(L_load_misaligned_input);
3269     __ ldf(FloatRegisterImpl::D, from, 0, F54);
3270     __ ldf(FloatRegisterImpl::D, from, 8, F56);
3271     __ ldf(FloatRegisterImpl::D, from, 16, F58);
3272     __ faligndata(F54, F56, F54);
3273     __ faligndata(F56, F58, F56);
3274 
3275     __ BIND(L_load_expanded_key);
3276     // Since we load the expanded key buffer starting at its first element, 8-byte alignment is guaranteed
3277     for ( int i = 0;  i <= 38; i += 2 ) {
3278       __ ldf(FloatRegisterImpl::D, key, i*4, as_FloatRegister(i));
3279     }
3280 
3281     // perform cipher transformation
3282     __ fxor(FloatRegisterImpl::D, F0, F54, F54);
3283     __ fxor(FloatRegisterImpl::D, F2, F56, F56);
3284     // rounds 1 through 8
3285     for ( int i = 4;  i <= 28; i += 8 ) {
3286       __ aes_eround01(as_FloatRegister(i), F54, F56, F58);
3287       __ aes_eround23(as_FloatRegister(i+2), F54, F56, F60);
3288       __ aes_eround01(as_FloatRegister(i+4), F58, F60, F54);
3289       __ aes_eround23(as_FloatRegister(i+6), F58, F60, F56);
3290     }
3291     __ aes_eround01(F36, F54, F56, F58); //round 9
3292     __ aes_eround23(F38, F54, F56, F60);
3293 
3294     // 128-bit original key size
3295     __ cmp_and_brx_short(keylen, 44, Assembler::equal, Assembler::pt, L_doLast128bit);
3296 
3297     for ( int i = 40;  i <= 50; i += 2 ) {
3298       __ ldf(FloatRegisterImpl::D, key, i*4, as_FloatRegister(i) );
3299     }
3300     __ aes_eround01(F40, F58, F60, F54); //round 10
3301     __ aes_eround23(F42, F58, F60, F56);
3302     __ aes_eround01(F44, F54, F56, F58); //round 11
3303     __ aes_eround23(F46, F54, F56, F60);
3304 
3305     // 192-bit original key size
3306     __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pt, L_storeOutput);
3307 
3308     __ ldf(FloatRegisterImpl::D, key, 208, F52);
3309     __ aes_eround01(F48, F58, F60, F54); //round 12
3310     __ aes_eround23(F50, F58, F60, F56);
3311     __ ldf(FloatRegisterImpl::D, key, 216, F46);
3312     __ ldf(FloatRegisterImpl::D, key, 224, F48);
3313     __ ldf(FloatRegisterImpl::D, key, 232, F50);
3314     __ aes_eround01(F52, F54, F56, F58); //round 13
3315     __ aes_eround23(F46, F54, F56, F60);
3316     __ ba_short(L_storeOutput);
3317 
3318     __ BIND(L_doLast128bit);
3319     __ ldf(FloatRegisterImpl::D, key, 160, F48);
3320     __ ldf(FloatRegisterImpl::D, key, 168, F50);
3321 
3322     __ BIND(L_storeOutput);
3323     // perform last round of encryption common for all key sizes
3324     __ aes_eround01_l(F48, F58, F60, F54); //last round
3325     __ aes_eround23_l(F50, F58, F60, F56);
3326 
3327     // Method to address arbitrary alignment for store instructions:
3328     // Check last 3 bits of 'dest' address to see if it is aligned to 8-byte boundary
3329     // If zero/aligned then continue with double FP store instructions
3330     // If not zero/mis-aligned then edge8n will generate edge mask in result reg (O3 in below case)
3331     // Example: If dest address is 0x07 and nearest 8-byte aligned address is 0x00 then edge mask will be 00000001
3332     // Compute (8-n) where n is # of bytes skipped by partial store(stpartialf) inst from edge mask, n=7 in this case
3333     // We get the value of n from the andcc that checks 'dest' alignment. n is available in O5 in below case.
3334     // Set GSR.align to (8-n) using alignaddr
3335     // Circular byte shift store values by n places so that the original bytes are at correct position for stpartialf
3336     // Set the arbitrarily aligned 'dest' address to nearest 8-byte aligned address
3337     // Store (partial) the original first (8-n) bytes starting at the original 'dest' address
3338     // Negate the edge mask so that the subsequent stpartialf can store the original (8-n-1)th through 8th bytes at appropriate address
3339     // We need to execute this process for both the 8-byte result values
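    //
    // Loose sketch of the scheme above (orientation only): with n = (to & 7),
    //   - edge8n(to) yields a byte-lane mask selecting the last (8-n) lanes of
    //     the doubleword containing 'to';
    //   - GSR.align is set to (8-n), so faligndata(x, x, x) rotates each 8-byte
    //     result so its bytes fall into those lanes;
    //   - stpartialf with the mask writes the first (8-n) bytes, and a second
    //     stpartialf with the negated mask writes the remaining n bytes into the
    //     following doubleword.
    // This is repeated for both 8-byte halves of the 16-byte output.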
3340 
3341     // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
3342     __ andcc(to, 7, O5);
3343     __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output);
3344     __ delayed()->edge8n(to, G0, O3);
3345 
3346     // aligned case: store output into the destination array
3347     __ stf(FloatRegisterImpl::D, F54, to, 0);
3348     __ retl();
3349     __ delayed()->stf(FloatRegisterImpl::D, F56, to, 8);
3350 
3351     __ BIND(L_store_misaligned_output);
3352     __ add(to, 8, O4);
3353     __ mov(8, O2);
3354     __ sub(O2, O5, O2);
3355     __ alignaddr(O2, G0, O2);
3356     __ faligndata(F54, F54, F54);
3357     __ faligndata(F56, F56, F56);
3358     __ and3(to, -8, to);
3359     __ and3(O4, -8, O4);
3360     __ stpartialf(to, O3, F54, Assembler::ASI_PST8_PRIMARY);
3361     __ stpartialf(O4, O3, F56, Assembler::ASI_PST8_PRIMARY);
3362     __ add(to, 8, to);
3363     __ add(O4, 8, O4);
3364     __ orn(G0, O3, O3);
3365     __ stpartialf(to, O3, F54, Assembler::ASI_PST8_PRIMARY);
3366     __ retl();
3367     __ delayed()->stpartialf(O4, O3, F56, Assembler::ASI_PST8_PRIMARY);
3368 
3369     return start;
3370   }
3371 
3372   address generate_aescrypt_decryptBlock() {
3373     assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
3374            "the following code assumes that first element of an int array is aligned to 8 bytes");
3375     // required since we read original key 'byte' array as well in the decryption stubs
3376     assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0,
3377            "the following code assumes that first element of a byte array is aligned to 8 bytes");
3378     __ align(CodeEntryAlignment);
3379     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
3380     address start = __ pc();
3381     Label L_load_misaligned_input, L_load_original_key, L_expand192bit, L_expand256bit, L_reload_misaligned_input;
3382     Label L_256bit_transform, L_common_transform, L_store_misaligned_output;
3383     Register from = O0; // source byte array
3384     Register to = O1;   // destination byte array
3385     Register key = O2;  // expanded key array
3386     Register original_key = O3;  // original key array only required during decryption
3387     const Register keylen = O4;  // reg for storing expanded key array length
3388 
3389     // read expanded key array length
3390     __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
3391 
3392     // save 'from' since we may need to recheck alignment in case of 256-bit decryption
3393     __ mov(from, G1);
3394 
3395     // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
3396     __ andcc(from, 7, G0);
3397     __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input);
3398     __ delayed()->alignaddr(from, G0, from);
3399 
3400     // aligned case: load input into F52-F54
3401     __ ldf(FloatRegisterImpl::D, from, 0, F52);
3402     __ ldf(FloatRegisterImpl::D, from, 8, F54);
3403     __ ba_short(L_load_original_key);
3404 
3405     __ BIND(L_load_misaligned_input);
3406     __ ldf(FloatRegisterImpl::D, from, 0, F52);
3407     __ ldf(FloatRegisterImpl::D, from, 8, F54);
3408     __ ldf(FloatRegisterImpl::D, from, 16, F56);
3409     __ faligndata(F52, F54, F52);
3410     __ faligndata(F54, F56, F54);
3411 
3412     __ BIND(L_load_original_key);
3413     // load original key from SunJCE expanded decryption key
3414     // Since we load the original key buffer starting at its first element, 8-byte alignment is guaranteed
3415     for ( int i = 0;  i <= 3; i++ ) {
3416       __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
3417     }
3418 
3419     // 256-bit original key size
3420     __ cmp_and_brx_short(keylen, 60, Assembler::equal, Assembler::pn, L_expand256bit);
3421 
3422     // 192-bit original key size
3423     __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_expand192bit);
3424 
3425     // 128-bit original key size
3426     // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
3427     for ( int i = 0;  i <= 36; i += 4 ) {
3428       __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+2), i/4, as_FloatRegister(i+4));
3429       __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+4), as_FloatRegister(i+6));
3430     }
3431 
3432     // perform 128-bit key specific inverse cipher transformation
3433     __ fxor(FloatRegisterImpl::D, F42, F54, F54);
3434     __ fxor(FloatRegisterImpl::D, F40, F52, F52);
3435     __ ba_short(L_common_transform);
3436 
3437     __ BIND(L_expand192bit);
3438 
3439     // start loading rest of the 192-bit key
3440     __ ldf(FloatRegisterImpl::S, original_key, 16, F4);
3441     __ ldf(FloatRegisterImpl::S, original_key, 20, F5);
3442 
3443     // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
3444     for ( int i = 0;  i <= 36; i += 6 ) {
3445       __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+4), i/6, as_FloatRegister(i+6));
3446       __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+6), as_FloatRegister(i+8));
3447       __ aes_kexpand2(as_FloatRegister(i+4), as_FloatRegister(i+8), as_FloatRegister(i+10));
3448     }
3449     __ aes_kexpand1(F42, F46, 7, F48);
3450     __ aes_kexpand2(F44, F48, F50);
3451 
3452     // perform 192-bit key specific inverse cipher transformation
3453     __ fxor(FloatRegisterImpl::D, F50, F54, F54);
3454     __ fxor(FloatRegisterImpl::D, F48, F52, F52);
3455     __ aes_dround23(F46, F52, F54, F58);
3456     __ aes_dround01(F44, F52, F54, F56);
3457     __ aes_dround23(F42, F56, F58, F54);
3458     __ aes_dround01(F40, F56, F58, F52);
3459     __ ba_short(L_common_transform);
3460 
3461     __ BIND(L_expand256bit);
3462 
3463     // load rest of the 256-bit key
3464     for ( int i = 4;  i <= 7; i++ ) {
3465       __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
3466     }
3467 
3468     // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
3469     for ( int i = 0;  i <= 40; i += 8 ) {
3470       __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+6), i/8, as_FloatRegister(i+8));
3471       __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+8), as_FloatRegister(i+10));
3472       __ aes_kexpand0(as_FloatRegister(i+4), as_FloatRegister(i+10), as_FloatRegister(i+12));
3473       __ aes_kexpand2(as_FloatRegister(i+6), as_FloatRegister(i+12), as_FloatRegister(i+14));
3474     }
3475     __ aes_kexpand1(F48, F54, 6, F56);
3476     __ aes_kexpand2(F50, F56, F58);
3477 
3478     for ( int i = 0;  i <= 6; i += 2 ) {
3479       __ fsrc2(FloatRegisterImpl::D, as_FloatRegister(58-i), as_FloatRegister(i));
3480     }
3481 
3482     // reload original 'from' address
3483     __ mov(G1, from);
3484 
3485     // re-check 8-byte alignment
3486     __ andcc(from, 7, G0);
3487     __ br(Assembler::notZero, true, Assembler::pn, L_reload_misaligned_input);
3488     __ delayed()->alignaddr(from, G0, from);
3489 
3490     // aligned case: load input into F52-F54
3491     __ ldf(FloatRegisterImpl::D, from, 0, F52);
3492     __ ldf(FloatRegisterImpl::D, from, 8, F54);
3493     __ ba_short(L_256bit_transform);
3494 
3495     __ BIND(L_reload_misaligned_input);
3496     __ ldf(FloatRegisterImpl::D, from, 0, F52);
3497     __ ldf(FloatRegisterImpl::D, from, 8, F54);
3498     __ ldf(FloatRegisterImpl::D, from, 16, F56);
3499     __ faligndata(F52, F54, F52);
3500     __ faligndata(F54, F56, F54);
3501 
3502     // perform 256-bit key specific inverse cipher transformation
3503     __ BIND(L_256bit_transform);
3504     __ fxor(FloatRegisterImpl::D, F0, F54, F54);
3505     __ fxor(FloatRegisterImpl::D, F2, F52, F52);
3506     __ aes_dround23(F4, F52, F54, F58);
3507     __ aes_dround01(F6, F52, F54, F56);
3508     __ aes_dround23(F50, F56, F58, F54);
3509     __ aes_dround01(F48, F56, F58, F52);
3510     __ aes_dround23(F46, F52, F54, F58);
3511     __ aes_dround01(F44, F52, F54, F56);
3512     __ aes_dround23(F42, F56, F58, F54);
3513     __ aes_dround01(F40, F56, F58, F52);
3514 
3515     for ( int i = 0;  i <= 7; i++ ) {
3516       __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
3517     }
3518 
3519     // perform inverse cipher transformations common for all key sizes
3520     __ BIND(L_common_transform);
3521     for ( int i = 38;  i >= 6; i -= 8 ) {
3522       __ aes_dround23(as_FloatRegister(i), F52, F54, F58);
3523       __ aes_dround01(as_FloatRegister(i-2), F52, F54, F56);
3524       if ( i != 6) {
3525         __ aes_dround23(as_FloatRegister(i-4), F56, F58, F54);
3526         __ aes_dround01(as_FloatRegister(i-6), F56, F58, F52);
3527       } else {
3528         __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F54);
3529         __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F52);
3530       }
3531     }
3532 
3533     // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
3534     __ andcc(to, 7, O5);
3535     __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output);
3536     __ delayed()->edge8n(to, G0, O3);
3537 
3538     // aligned case: store output into the destination array
3539     __ stf(FloatRegisterImpl::D, F52, to, 0);
3540     __ retl();
3541     __ delayed()->stf(FloatRegisterImpl::D, F54, to, 8);
3542 
3543     __ BIND(L_store_misaligned_output);
3544     __ add(to, 8, O4);
3545     __ mov(8, O2);
3546     __ sub(O2, O5, O2);
3547     __ alignaddr(O2, G0, O2);
3548     __ faligndata(F52, F52, F52);
3549     __ faligndata(F54, F54, F54);
3550     __ and3(to, -8, to);
3551     __ and3(O4, -8, O4);
3552     __ stpartialf(to, O3, F52, Assembler::ASI_PST8_PRIMARY);
3553     __ stpartialf(O4, O3, F54, Assembler::ASI_PST8_PRIMARY);
3554     __ add(to, 8, to);
3555     __ add(O4, 8, O4);
3556     __ orn(G0, O3, O3);
3557     __ stpartialf(to, O3, F52, Assembler::ASI_PST8_PRIMARY);
3558     __ retl();
3559     __ delayed()->stpartialf(O4, O3, F54, Assembler::ASI_PST8_PRIMARY);
3560 
3561     return start;
3562   }
3563 
3564   address generate_cipherBlockChaining_encryptAESCrypt() {
3565     assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
3566            "the following code assumes that first element of an int array is aligned to 8 bytes");
3567     assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0,
3568            "the following code assumes that first element of a byte array is aligned to 8 bytes");
3569     __ align(CodeEntryAlignment);
3570     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
3571     Label L_cbcenc128, L_load_misaligned_input_128bit, L_128bit_transform, L_store_misaligned_output_128bit;
3572     Label L_check_loop_end_128bit, L_cbcenc192, L_load_misaligned_input_192bit, L_192bit_transform;
3573     Label L_store_misaligned_output_192bit, L_check_loop_end_192bit, L_cbcenc256, L_load_misaligned_input_256bit;
3574     Label L_256bit_transform, L_store_misaligned_output_256bit, L_check_loop_end_256bit;
3575     address start = __ pc();
3576     Register from = I0; // source byte array
3577     Register to = I1;   // destination byte array
3578     Register key = I2;  // expanded key array
3579     Register rvec = I3; // init vector
3580     const Register len_reg = I4; // cipher length
3581     const Register keylen = I5;  // reg for storing expanded key array length
3582 
3583     __ save_frame(0);
3584     // save cipher len to return in the end
3585     __ mov(len_reg, L0);
3586 
3587     // read expanded key length
3588     __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
3589 
3590     // load initial vector, 8-byte alignment is guaranteed
3591     __ ldf(FloatRegisterImpl::D, rvec, 0, F60);
3592     __ ldf(FloatRegisterImpl::D, rvec, 8, F62);
3593     // load key, 8-byte alignment is guaranteed
3594     __ ldx(key,0,G1);
3595     __ ldx(key,8,G5);
3596 
3597     // start loading expanded key, 8-byte alignment is guaranteed
3598     for ( int i = 0, j = 16;  i <= 38; i += 2, j += 8 ) {
3599       __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i));
3600     }
3601 
3602     // 128-bit original key size
3603     __ cmp_and_brx_short(keylen, 44, Assembler::equal, Assembler::pt, L_cbcenc128);
3604 
3605     for ( int i = 40, j = 176;  i <= 46; i += 2, j += 8 ) {
3606       __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i));
3607     }
3608 
3609     // 192-bit original key size
3610     __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pt, L_cbcenc192);
3611 
3612     for ( int i = 48, j = 208;  i <= 54; i += 2, j += 8 ) {
3613       __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i));
3614     }
3615 
3616     // 256-bit original key size
3617     __ ba_short(L_cbcenc256);
3618 
3619     __ align(OptoLoopAlignment);
3620     __ BIND(L_cbcenc128);
3621     // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
3622     __ andcc(from, 7, G0);
3623     __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_128bit);
3624     __ delayed()->mov(from, L1); // save original 'from' address before alignaddr
3625 
3626     // aligned case: load input into G3 and G4
3627     __ ldx(from,0,G3);
3628     __ ldx(from,8,G4);
3629     __ ba_short(L_128bit_transform);
3630 
3631     __ BIND(L_load_misaligned_input_128bit);
3632     // can clobber F48, F50 and F52 as they are not used in 128 and 192-bit key encryption
3633     __ alignaddr(from, G0, from);
3634     __ ldf(FloatRegisterImpl::D, from, 0, F48);
3635     __ ldf(FloatRegisterImpl::D, from, 8, F50);
3636     __ ldf(FloatRegisterImpl::D, from, 16, F52);
3637     __ faligndata(F48, F50, F48);
3638     __ faligndata(F50, F52, F50);
3639     __ movdtox(F48, G3);
3640     __ movdtox(F50, G4);
3641     __ mov(L1, from);
3642 
3643     __ BIND(L_128bit_transform);
3644     __ xor3(G1,G3,G3);
3645     __ xor3(G5,G4,G4);
3646     __ movxtod(G3,F56);
3647     __ movxtod(G4,F58);
3648     __ fxor(FloatRegisterImpl::D, F60, F56, F60);
3649     __ fxor(FloatRegisterImpl::D, F62, F58, F62);
3650 
3651     // TEN_EROUNDS
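    // (10 rounds for a 128-bit key: each loop iteration below performs two
    //  rounds, with the final pair using the *_l "last round" forms. The 192-
    //  and 256-bit loops further down do 12 and 14 rounds the same way.)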
3652     for ( int i = 0;  i <= 32; i += 8 ) {
3653       __ aes_eround01(as_FloatRegister(i), F60, F62, F56);
3654       __ aes_eround23(as_FloatRegister(i+2), F60, F62, F58);
3655       if (i != 32 ) {
3656         __ aes_eround01(as_FloatRegister(i+4), F56, F58, F60);
3657         __ aes_eround23(as_FloatRegister(i+6), F56, F58, F62);
3658       } else {
3659         __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60);
3660         __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62);
3661       }
3662     }
3663 
3664     // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
3665     __ andcc(to, 7, L1);
3666     __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_128bit);
3667     __ delayed()->edge8n(to, G0, L2);
3668 
3669     // aligned case: store output into the destination array
3670     __ stf(FloatRegisterImpl::D, F60, to, 0);
3671     __ stf(FloatRegisterImpl::D, F62, to, 8);
3672     __ ba_short(L_check_loop_end_128bit);
3673 
3674     __ BIND(L_store_misaligned_output_128bit);
3675     __ add(to, 8, L3);
3676     __ mov(8, L4);
3677     __ sub(L4, L1, L4);
3678     __ alignaddr(L4, G0, L4);
3679     // save cipher text before circular right shift
3680     // as it needs to be stored as iv for next block (see code before next retl)
3681     __ movdtox(F60, L6);
3682     __ movdtox(F62, L7);
3683     __ faligndata(F60, F60, F60);
3684     __ faligndata(F62, F62, F62);
3685     __ mov(to, L5);
3686     __ and3(to, -8, to);
3687     __ and3(L3, -8, L3);
3688     __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
3689     __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
3690     __ add(to, 8, to);
3691     __ add(L3, 8, L3);
3692     __ orn(G0, L2, L2);
3693     __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
3694     __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
3695     __ mov(L5, to);
3696     __ movxtod(L6, F60);
3697     __ movxtod(L7, F62);
3698 
3699     __ BIND(L_check_loop_end_128bit);
3700     __ add(from, 16, from);
3701     __ add(to, 16, to);
3702     __ subcc(len_reg, 16, len_reg);
3703     __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc128);
3704     __ delayed()->nop();
3705     // re-init initial vector for next block, 8-byte alignment is guaranteed
3706     __ stf(FloatRegisterImpl::D, F60, rvec, 0);
3707     __ stf(FloatRegisterImpl::D, F62, rvec, 8);
3708     __ mov(L0, I0);
3709     __ ret();
3710     __ delayed()->restore();
3711 
3712     __ align(OptoLoopAlignment);
3713     __ BIND(L_cbcenc192);
3714     // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
3715     __ andcc(from, 7, G0);
3716     __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_192bit);
3717     __ delayed()->mov(from, L1); // save original 'from' address before alignaddr
3718 
3719     // aligned case: load input into G3 and G4
3720     __ ldx(from,0,G3);
3721     __ ldx(from,8,G4);
3722     __ ba_short(L_192bit_transform);
3723 
3724     __ BIND(L_load_misaligned_input_192bit);
3725     // can clobber F48, F50 and F52 as they are not used in 128 and 192-bit key encryption
3726     __ alignaddr(from, G0, from);
3727     __ ldf(FloatRegisterImpl::D, from, 0, F48);
3728     __ ldf(FloatRegisterImpl::D, from, 8, F50);
3729     __ ldf(FloatRegisterImpl::D, from, 16, F52);
3730     __ faligndata(F48, F50, F48);
3731     __ faligndata(F50, F52, F50);
3732     __ movdtox(F48, G3);
3733     __ movdtox(F50, G4);
3734     __ mov(L1, from);
3735 
3736     __ BIND(L_192bit_transform);
3737     __ xor3(G1,G3,G3);
3738     __ xor3(G5,G4,G4);
3739     __ movxtod(G3,F56);
3740     __ movxtod(G4,F58);
3741     __ fxor(FloatRegisterImpl::D, F60, F56, F60);
3742     __ fxor(FloatRegisterImpl::D, F62, F58, F62);
3743 
3744     // TWELVE_EROUNDS
3745     for ( int i = 0;  i <= 40; i += 8 ) {
3746       __ aes_eround01(as_FloatRegister(i), F60, F62, F56);
3747       __ aes_eround23(as_FloatRegister(i+2), F60, F62, F58);
3748       if (i != 40 ) {
3749         __ aes_eround01(as_FloatRegister(i+4), F56, F58, F60);
3750         __ aes_eround23(as_FloatRegister(i+6), F56, F58, F62);
3751       } else {
3752         __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60);
3753         __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62);
3754       }
3755     }
3756 
3757     // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
3758     __ andcc(to, 7, L1);
3759     __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_192bit);
3760     __ delayed()->edge8n(to, G0, L2);
3761 
3762     // aligned case: store output into the destination array
3763     __ stf(FloatRegisterImpl::D, F60, to, 0);
3764     __ stf(FloatRegisterImpl::D, F62, to, 8);
3765     __ ba_short(L_check_loop_end_192bit);
3766 
3767     __ BIND(L_store_misaligned_output_192bit);
3768     __ add(to, 8, L3);
3769     __ mov(8, L4);
3770     __ sub(L4, L1, L4);
3771     __ alignaddr(L4, G0, L4);
3772     __ movdtox(F60, L6);
3773     __ movdtox(F62, L7);
3774     __ faligndata(F60, F60, F60);
3775     __ faligndata(F62, F62, F62);
3776     __ mov(to, L5);
3777     __ and3(to, -8, to);
3778     __ and3(L3, -8, L3);
3779     __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
3780     __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
3781     __ add(to, 8, to);
3782     __ add(L3, 8, L3);
3783     __ orn(G0, L2, L2);
3784     __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
3785     __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
3786     __ mov(L5, to);
3787     __ movxtod(L6, F60);
3788     __ movxtod(L7, F62);
3789 
3790     __ BIND(L_check_loop_end_192bit);
3791     __ add(from, 16, from);
3792     __ subcc(len_reg, 16, len_reg);
3793     __ add(to, 16, to);
3794     __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc192);
3795     __ delayed()->nop();
3796     // re-init initial vector for next block, 8-byte alignment is guaranteed
3797     __ stf(FloatRegisterImpl::D, F60, rvec, 0);
3798     __ stf(FloatRegisterImpl::D, F62, rvec, 8);
3799     __ mov(L0, I0);
3800     __ ret();
3801     __ delayed()->restore();
3802 
3803     __ align(OptoLoopAlignment);
3804     __ BIND(L_cbcenc256);
3805     // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
3806     __ andcc(from, 7, G0);
3807     __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_256bit);
3808     __ delayed()->mov(from, L1); // save original 'from' address before alignaddr
3809 
3810     // aligned case: load input into G3 and G4
3811     __ ldx(from,0,G3);
3812     __ ldx(from,8,G4);
3813     __ ba_short(L_256bit_transform);
3814 
3815     __ BIND(L_load_misaligned_input_256bit);
3816     // cannot clobber F48, F50 and F52. F56, F58 can be used though
3817     __ alignaddr(from, G0, from);
3818     __ movdtox(F60, L2); // save F60 before overwriting
3819     __ ldf(FloatRegisterImpl::D, from, 0, F56);
3820     __ ldf(FloatRegisterImpl::D, from, 8, F58);
3821     __ ldf(FloatRegisterImpl::D, from, 16, F60);
3822     __ faligndata(F56, F58, F56);
3823     __ faligndata(F58, F60, F58);
3824     __ movdtox(F56, G3);
3825     __ movdtox(F58, G4);
3826     __ mov(L1, from);
3827     __ movxtod(L2, F60);
3828 
3829     __ BIND(L_256bit_transform);
3830     __ xor3(G1,G3,G3);
3831     __ xor3(G5,G4,G4);
3832     __ movxtod(G3,F56);
3833     __ movxtod(G4,F58);
3834     __ fxor(FloatRegisterImpl::D, F60, F56, F60);
3835     __ fxor(FloatRegisterImpl::D, F62, F58, F62);
3836 
3837     // FOURTEEN_EROUNDS
3838     for ( int i = 0;  i <= 48; i += 8 ) {
3839       __ aes_eround01(as_FloatRegister(i), F60, F62, F56);
3840       __ aes_eround23(as_FloatRegister(i+2), F60, F62, F58);
3841       if (i != 48 ) {
3842         __ aes_eround01(as_FloatRegister(i+4), F56, F58, F60);
3843         __ aes_eround23(as_FloatRegister(i+6), F56, F58, F62);
3844       } else {
3845         __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60);
3846         __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62);
3847       }
3848     }
3849 
3850     // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
3851     __ andcc(to, 7, L1);
3852     __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_256bit);
3853     __ delayed()->edge8n(to, G0, L2);
3854 
3855     // aligned case: store output into the destination array
3856     __ stf(FloatRegisterImpl::D, F60, to, 0);
3857     __ stf(FloatRegisterImpl::D, F62, to, 8);
3858     __ ba_short(L_check_loop_end_256bit);
3859 
3860     __ BIND(L_store_misaligned_output_256bit);
3861     __ add(to, 8, L3);
3862     __ mov(8, L4);
3863     __ sub(L4, L1, L4);
3864     __ alignaddr(L4, G0, L4);
3865     __ movdtox(F60, L6);
3866     __ movdtox(F62, L7);
3867     __ faligndata(F60, F60, F60);
3868     __ faligndata(F62, F62, F62);
3869     __ mov(to, L5);
3870     __ and3(to, -8, to);
3871     __ and3(L3, -8, L3);
3872     __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
3873     __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
3874     __ add(to, 8, to);
3875     __ add(L3, 8, L3);
3876     __ orn(G0, L2, L2);
3877     __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
3878     __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
3879     __ mov(L5, to);
3880     __ movxtod(L6, F60);
3881     __ movxtod(L7, F62);
3882 
3883     __ BIND(L_check_loop_end_256bit);
3884     __ add(from, 16, from);
3885     __ subcc(len_reg, 16, len_reg);
3886     __ add(to, 16, to);
3887     __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc256);
3888     __ delayed()->nop();
3889     // re-init initial vector for next block, 8-byte alignment is guaranteed
3890     __ stf(FloatRegisterImpl::D, F60, rvec, 0);
3891     __ stf(FloatRegisterImpl::D, F62, rvec, 8);
3892     __ mov(L0, I0);
3893     __ ret();
3894     __ delayed()->restore();
3895 
3896     return start;
3897   }
3898 
3899   address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
3900     assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
3901            "the following code assumes that first element of an int array is aligned to 8 bytes");
3902     assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0,
3903            "the following code assumes that first element of a byte array is aligned to 8 bytes");
3904     __ align(CodeEntryAlignment);
3905     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
3906     Label L_cbcdec_end, L_expand192bit, L_expand256bit, L_dec_first_block_start;
3907     Label L_dec_first_block128, L_dec_first_block192, L_dec_next2_blocks128, L_dec_next2_blocks192, L_dec_next2_blocks256;
3908     Label L_load_misaligned_input_first_block, L_transform_first_block, L_load_misaligned_next2_blocks128, L_transform_next2_blocks128;
3909     Label L_load_misaligned_next2_blocks192, L_transform_next2_blocks192, L_load_misaligned_next2_blocks256, L_transform_next2_blocks256;
3910     Label L_store_misaligned_output_first_block, L_check_decrypt_end, L_store_misaligned_output_next2_blocks128;
3911     Label L_check_decrypt_loop_end128, L_store_misaligned_output_next2_blocks192, L_check_decrypt_loop_end192;
3912     Label L_store_misaligned_output_next2_blocks256, L_check_decrypt_loop_end256;
3913     address start = __ pc();
3914     Register from = I0; // source byte array
3915     Register to = I1;   // destination byte array
3916     Register key = I2;  // expanded key array
3917     Register rvec = I3; // init vector
3918     const Register len_reg = I4; // cipher length
3919     const Register original_key = I5;  // original key array only required during decryption
3920     const Register keylen = L6;  // reg for storing expanded key array length
3921 
3922     __ save_frame(0); // args are read from the I* registers since we save the frame at the beginning
3923     // save cipher len to return in the end
3924     __ mov(len_reg, L7);
3925 
3926     // load original key from SunJCE expanded decryption key
3927     // Since we load the original key buffer starting at its first element, 8-byte alignment is guaranteed
3928     for ( int i = 0;  i <= 3; i++ ) {
3929       __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
3930     }
3931 
3932     // load initial vector, 8-byte alignment is guaranteed
3933     __ ldx(rvec,0,L0);
3934     __ ldx(rvec,8,L1);
3935 
3936     // read expanded key array length
3937     __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
3938 
3939     // 256-bit original key size
3940     __ cmp_and_brx_short(keylen, 60, Assembler::equal, Assembler::pn, L_expand256bit);
3941 
3942     // 192-bit original key size
3943     __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_expand192bit);
3944 
3945     // 128-bit original key size
3946     // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
3947     for ( int i = 0;  i <= 36; i += 4 ) {
3948       __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+2), i/4, as_FloatRegister(i+4));
3949       __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+4), as_FloatRegister(i+6));
3950     }
3951 
3952     // load expanded key[last-1] and key[last] elements
3953     __ movdtox(F40,L2);
3954     __ movdtox(F42,L3);
3955 
3956     __ and3(len_reg, 16, L4);
3957     __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks128);
3958     __ nop();
3959 
3960     __ ba_short(L_dec_first_block_start);
3961 
3962     __ BIND(L_expand192bit);
3963     // load rest of the 192-bit key
3964     __ ldf(FloatRegisterImpl::S, original_key, 16, F4);
3965     __ ldf(FloatRegisterImpl::S, original_key, 20, F5);
3966 
3967     // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
3968     for ( int i = 0;  i <= 36; i += 6 ) {
3969       __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+4), i/6, as_FloatRegister(i+6));
3970       __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+6), as_FloatRegister(i+8));
3971       __ aes_kexpand2(as_FloatRegister(i+4), as_FloatRegister(i+8), as_FloatRegister(i+10));
3972     }
3973     __ aes_kexpand1(F42, F46, 7, F48);
3974     __ aes_kexpand2(F44, F48, F50);
3975 
3976     // load expanded key[last-1] and key[last] elements
3977     __ movdtox(F48,L2);
3978     __ movdtox(F50,L3);
3979 
3980     __ and3(len_reg, 16, L4);
3981     __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks192);
3982     __ nop();
3983 
3984     __ ba_short(L_dec_first_block_start);
3985 
3986     __ BIND(L_expand256bit);
3987     // load rest of the 256-bit key
3988     for ( int i = 4;  i <= 7; i++ ) {
3989       __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
3990     }
3991 
3992     // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
3993     for ( int i = 0;  i <= 40; i += 8 ) {
3994       __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+6), i/8, as_FloatRegister(i+8));
3995       __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+8), as_FloatRegister(i+10));
3996       __ aes_kexpand0(as_FloatRegister(i+4), as_FloatRegister(i+10), as_FloatRegister(i+12));
3997       __ aes_kexpand2(as_FloatRegister(i+6), as_FloatRegister(i+12), as_FloatRegister(i+14));
3998     }
3999     __ aes_kexpand1(F48, F54, 6, F56);
4000     __ aes_kexpand2(F50, F56, F58);
4001 
4002     // load expanded key[last-1] and key[last] elements
4003     __ movdtox(F56,L2);
4004     __ movdtox(F58,L3);
4005 
4006     __ and3(len_reg, 16, L4);
4007     __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks256);
4008 
4009     __ BIND(L_dec_first_block_start);
4010     // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
4011     __ andcc(from, 7, G0);
4012     __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_first_block);
4013     __ delayed()->mov(from, G1); // save original 'from' address before alignaddr
4014 
4015     // aligned case: load input into L4 and L5
4016     __ ldx(from,0,L4);
4017     __ ldx(from,8,L5);
4018     __ ba_short(L_transform_first_block);
4019 
4020     __ BIND(L_load_misaligned_input_first_block);
4021     __ alignaddr(from, G0, from);
4022     // F58, F60, F62 can be clobbered
4023     __ ldf(FloatRegisterImpl::D, from, 0, F58);
4024     __ ldf(FloatRegisterImpl::D, from, 8, F60);
4025     __ ldf(FloatRegisterImpl::D, from, 16, F62);
4026     __ faligndata(F58, F60, F58);
4027     __ faligndata(F60, F62, F60);
4028     __ movdtox(F58, L4);
4029     __ movdtox(F60, L5);
4030     __ mov(G1, from);
4031 
4032     __ BIND(L_transform_first_block);
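    // Initial AddRoundKey for decryption: XOR the ciphertext (L4:L5) with the
    // last expanded round key, cached in L2:L3 above.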
4033     __ xor3(L2,L4,G1);
4034     __ movxtod(G1,F60);
4035     __ xor3(L3,L5,G1);
4036     __ movxtod(G1,F62);
4037 
4038     // 128-bit original key size
4039     __ cmp_and_brx_short(keylen, 44, Assembler::equal, Assembler::pn, L_dec_first_block128);
4040 
4041     // 192-bit original key size
4042     __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_dec_first_block192);
4043 
4044     __ aes_dround23(F54, F60, F62, F58);
4045     __ aes_dround01(F52, F60, F62, F56);
4046     __ aes_dround23(F50, F56, F58, F62);
4047     __ aes_dround01(F48, F56, F58, F60);
4048 
4049     __ BIND(L_dec_first_block192);
4050     __ aes_dround23(F46, F60, F62, F58);
4051     __ aes_dround01(F44, F60, F62, F56);
4052     __ aes_dround23(F42, F56, F58, F62);
4053     __ aes_dround01(F40, F56, F58, F60);
4054 
4055     __ BIND(L_dec_first_block128);
4056     for ( int i = 38;  i >= 6; i -= 8 ) {
4057       __ aes_dround23(as_FloatRegister(i), F60, F62, F58);
4058       __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56);
4059       if ( i != 6) {
4060         __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62);
4061         __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60);
4062       } else {
4063         __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F62);
4064         __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F60);
4065       }
4066     }
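
    // CBC chaining: XOR the decrypted block with the previous ciphertext block
    // (the initial vector for the first block), kept in L0:L1, then remember the
    // current ciphertext in L0:L1 for the next block.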
4067 
4068     __ movxtod(L0,F56);
4069     __ movxtod(L1,F58);
4070     __ mov(L4,L0);
4071     __ mov(L5,L1);
4072     __ fxor(FloatRegisterImpl::D, F56, F60, F60);
4073     __ fxor(FloatRegisterImpl::D, F58, F62, F62);
4074 
4075     // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
4076     __ andcc(to, 7, G1);
4077     __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_first_block);
4078     __ delayed()->edge8n(to, G0, G2);
4079 
4080     // aligned case: store output into the destination array
4081     __ stf(FloatRegisterImpl::D, F60, to, 0);
4082     __ stf(FloatRegisterImpl::D, F62, to, 8);
4083     __ ba_short(L_check_decrypt_end);
4084 
4085     __ BIND(L_store_misaligned_output_first_block);
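    // Byte-granular store to a misaligned destination: G2 (computed by edge8n in
    // the delay slot above) is the partial-store byte mask, alignaddr/faligndata
    // rotate the data by (8 - misalignment) bytes, and stpartialf with G2 and its
    // complement (via orn) writes the leading and trailing bytes of each doubleword.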
4086     __ add(to, 8, G3);
4087     __ mov(8, G4);
4088     __ sub(G4, G1, G4);
4089     __ alignaddr(G4, G0, G4);
4090     __ faligndata(F60, F60, F60);
4091     __ faligndata(F62, F62, F62);
4092     __ mov(to, G1);
4093     __ and3(to, -8, to);
4094     __ and3(G3, -8, G3);
4095     __ stpartialf(to, G2, F60, Assembler::ASI_PST8_PRIMARY);
4096     __ stpartialf(G3, G2, F62, Assembler::ASI_PST8_PRIMARY);
4097     __ add(to, 8, to);
4098     __ add(G3, 8, G3);
4099     __ orn(G0, G2, G2);
4100     __ stpartialf(to, G2, F60, Assembler::ASI_PST8_PRIMARY);
4101     __ stpartialf(G3, G2, F62, Assembler::ASI_PST8_PRIMARY);
4102     __ mov(G1, to);
4103 
4104     __ BIND(L_check_decrypt_end);
4105     __ add(from, 16, from);
4106     __ add(to, 16, to);
4107     __ subcc(len_reg, 16, len_reg);
4108     __ br(Assembler::equal, false, Assembler::pt, L_cbcdec_end);
4109     __ delayed()->nop();
4110 
4111     // 256-bit original key size
4112     __ cmp_and_brx_short(keylen, 60, Assembler::equal, Assembler::pn, L_dec_next2_blocks256);
4113 
4114     // 192-bit original key size
4115     __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_dec_next2_blocks192);
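
    // Main decryption loops: each iteration decrypts two 16-byte blocks, keeping
    // the two dround chains in disjoint data registers so they can be interleaved.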
4116 
4117     __ align(OptoLoopAlignment);
4118     __ BIND(L_dec_next2_blocks128);
4119     __ nop();
4120 
4121     // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
4122     __ andcc(from, 7, G0);
4123     __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks128);
4124     __ delayed()->mov(from, G1); // save original 'from' address before alignaddr
4125 
4126     // aligned case: load input into G4, G5, L4 and L5
4127     __ ldx(from,0,G4);
4128     __ ldx(from,8,G5);
4129     __ ldx(from,16,L4);
4130     __ ldx(from,24,L5);
4131     __ ba_short(L_transform_next2_blocks128);
4132 
4133     __ BIND(L_load_misaligned_next2_blocks128);
4134     __ alignaddr(from, G0, from);
4135     // F40, F42, F58, F60, F62 can be clobbered
4136     __ ldf(FloatRegisterImpl::D, from, 0, F40);
4137     __ ldf(FloatRegisterImpl::D, from, 8, F42);
4138     __ ldf(FloatRegisterImpl::D, from, 16, F60);
4139     __ ldf(FloatRegisterImpl::D, from, 24, F62);
4140     __ ldf(FloatRegisterImpl::D, from, 32, F58);
4141     __ faligndata(F40, F42, F40);
4142     __ faligndata(F42, F60, F42);
4143     __ faligndata(F60, F62, F60);
4144     __ faligndata(F62, F58, F62);
4145     __ movdtox(F40, G4);
4146     __ movdtox(F42, G5);
4147     __ movdtox(F60, L4);
4148     __ movdtox(F62, L5);
4149     __ mov(G1, from);
4150 
4151     __ BIND(L_transform_next2_blocks128);
4152     // F40:F42 used for first 16-bytes
4153     __ xor3(L2,G4,G1);
4154     __ movxtod(G1,F40);
4155     __ xor3(L3,G5,G1);
4156     __ movxtod(G1,F42);
4157 
4158     // F60:F62 used for next 16-bytes
4159     __ xor3(L2,L4,G1);
4160     __ movxtod(G1,F60);
4161     __ xor3(L3,L5,G1);
4162     __ movxtod(G1,F62);
4163 
4164     for ( int i = 38;  i >= 6; i -= 8 ) {
4165       __ aes_dround23(as_FloatRegister(i), F40, F42, F44);
4166       __ aes_dround01(as_FloatRegister(i-2), F40, F42, F46);
4167       __ aes_dround23(as_FloatRegister(i), F60, F62, F58);
4168       __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56);
4169       if (i != 6 ) {
4170         __ aes_dround23(as_FloatRegister(i-4), F46, F44, F42);
4171         __ aes_dround01(as_FloatRegister(i-6), F46, F44, F40);
4172         __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62);
4173         __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60);
4174       } else {
4175         __ aes_dround23_l(as_FloatRegister(i-4), F46, F44, F42);
4176         __ aes_dround01_l(as_FloatRegister(i-6), F46, F44, F40);
4177         __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F62);
4178         __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F60);
4179       }
4180     }
4181 
4182     __ movxtod(L0,F46);
4183     __ movxtod(L1,F44);
4184     __ fxor(FloatRegisterImpl::D, F46, F40, F40);
4185     __ fxor(FloatRegisterImpl::D, F44, F42, F42);
4186 
4187     __ movxtod(G4,F56);
4188     __ movxtod(G5,F58);
4189     __ mov(L4,L0);
4190     __ mov(L5,L1);
4191     __ fxor(FloatRegisterImpl::D, F56, F60, F60);
4192     __ fxor(FloatRegisterImpl::D, F58, F62, F62);
4193 
    // For a misaligned store of the 32-byte result we can:
    // circular-right-shift all 4 FP registers so that the 'head' and 'tail'
    // parts that must be stored at the misaligned address end up in one FP register;
    // the other 3 FP registers can then be stored with regular stores, and the
    // edge + partial-store mechanism stores the 'head' and 'tail' parts.
4199 
4200     // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
4201     __ andcc(to, 7, G1);
4202     __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks128);
4203     __ delayed()->edge8n(to, G0, G2);
4204 
4205     // aligned case: store output into the destination array
4206     __ stf(FloatRegisterImpl::D, F40, to, 0);
4207     __ stf(FloatRegisterImpl::D, F42, to, 8);
4208     __ stf(FloatRegisterImpl::D, F60, to, 16);
4209     __ stf(FloatRegisterImpl::D, F62, to, 24);
4210     __ ba_short(L_check_decrypt_loop_end128);
4211 
4212     __ BIND(L_store_misaligned_output_next2_blocks128);
4213     __ mov(8, G4);
4214     __ sub(G4, G1, G4);
4215     __ alignaddr(G4, G0, G4);
4216     __ faligndata(F40, F42, F56); // F56 can be clobbered
4217     __ faligndata(F42, F60, F42);
4218     __ faligndata(F60, F62, F60);
4219     __ faligndata(F62, F40, F40);
4220     __ mov(to, G1);
4221     __ and3(to, -8, to);
4222     __ stpartialf(to, G2, F40, Assembler::ASI_PST8_PRIMARY);
4223     __ stf(FloatRegisterImpl::D, F56, to, 8);
4224     __ stf(FloatRegisterImpl::D, F42, to, 16);
4225     __ stf(FloatRegisterImpl::D, F60, to, 24);
4226     __ add(to, 32, to);
4227     __ orn(G0, G2, G2);
4228     __ stpartialf(to, G2, F40, Assembler::ASI_PST8_PRIMARY);
4229     __ mov(G1, to);
4230 
4231     __ BIND(L_check_decrypt_loop_end128);
4232     __ add(from, 32, from);
4233     __ add(to, 32, to);
4234     __ subcc(len_reg, 32, len_reg);
4235     __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks128);
4236     __ delayed()->nop();
4237     __ ba_short(L_cbcdec_end);
4238 
4239     __ align(OptoLoopAlignment);
4240     __ BIND(L_dec_next2_blocks192);
4241     __ nop();
4242 
4243     // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
4244     __ andcc(from, 7, G0);
4245     __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks192);
4246     __ delayed()->mov(from, G1); // save original 'from' address before alignaddr
4247 
4248     // aligned case: load input into G4, G5, L4 and L5
4249     __ ldx(from,0,G4);
4250     __ ldx(from,8,G5);
4251     __ ldx(from,16,L4);
4252     __ ldx(from,24,L5);
4253     __ ba_short(L_transform_next2_blocks192);
4254 
4255     __ BIND(L_load_misaligned_next2_blocks192);
4256     __ alignaddr(from, G0, from);
4257     // F48, F50, F52, F60, F62 can be clobbered
4258     __ ldf(FloatRegisterImpl::D, from, 0, F48);
4259     __ ldf(FloatRegisterImpl::D, from, 8, F50);
4260     __ ldf(FloatRegisterImpl::D, from, 16, F60);
4261     __ ldf(FloatRegisterImpl::D, from, 24, F62);
4262     __ ldf(FloatRegisterImpl::D, from, 32, F52);
4263     __ faligndata(F48, F50, F48);
4264     __ faligndata(F50, F60, F50);
4265     __ faligndata(F60, F62, F60);
4266     __ faligndata(F62, F52, F62);
4267     __ movdtox(F48, G4);
4268     __ movdtox(F50, G5);
4269     __ movdtox(F60, L4);
4270     __ movdtox(F62, L5);
4271     __ mov(G1, from);
4272 
4273     __ BIND(L_transform_next2_blocks192);
4274     // F48:F50 used for first 16-bytes
4275     __ xor3(L2,G4,G1);
4276     __ movxtod(G1,F48);
4277     __ xor3(L3,G5,G1);
4278     __ movxtod(G1,F50);
4279 
4280     // F60:F62 used for next 16-bytes
4281     __ xor3(L2,L4,G1);
4282     __ movxtod(G1,F60);
4283     __ xor3(L3,L5,G1);
4284     __ movxtod(G1,F62);
4285 
4286     for ( int i = 46;  i >= 6; i -= 8 ) {
4287       __ aes_dround23(as_FloatRegister(i), F48, F50, F52);
4288       __ aes_dround01(as_FloatRegister(i-2), F48, F50, F54);
4289       __ aes_dround23(as_FloatRegister(i), F60, F62, F58);
4290       __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56);
4291       if (i != 6 ) {
4292         __ aes_dround23(as_FloatRegister(i-4), F54, F52, F50);
4293         __ aes_dround01(as_FloatRegister(i-6), F54, F52, F48);
4294         __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62);
4295         __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60);
4296       } else {
4297         __ aes_dround23_l(as_FloatRegister(i-4), F54, F52, F50);
4298         __ aes_dround01_l(as_FloatRegister(i-6), F54, F52, F48);
4299         __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F62);
4300         __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F60);
4301       }
4302     }
4303 
4304     __ movxtod(L0,F54);
4305     __ movxtod(L1,F52);
4306     __ fxor(FloatRegisterImpl::D, F54, F48, F48);
4307     __ fxor(FloatRegisterImpl::D, F52, F50, F50);
4308 
4309     __ movxtod(G4,F56);
4310     __ movxtod(G5,F58);
4311     __ mov(L4,L0);
4312     __ mov(L5,L1);
4313     __ fxor(FloatRegisterImpl::D, F56, F60, F60);
4314     __ fxor(FloatRegisterImpl::D, F58, F62, F62);
4315 
4316     // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
4317     __ andcc(to, 7, G1);
4318     __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks192);
4319     __ delayed()->edge8n(to, G0, G2);
4320 
4321     // aligned case: store output into the destination array
4322     __ stf(FloatRegisterImpl::D, F48, to, 0);
4323     __ stf(FloatRegisterImpl::D, F50, to, 8);
4324     __ stf(FloatRegisterImpl::D, F60, to, 16);
4325     __ stf(FloatRegisterImpl::D, F62, to, 24);
4326     __ ba_short(L_check_decrypt_loop_end192);
4327 
4328     __ BIND(L_store_misaligned_output_next2_blocks192);
4329     __ mov(8, G4);
4330     __ sub(G4, G1, G4);
4331     __ alignaddr(G4, G0, G4);
4332     __ faligndata(F48, F50, F56); // F56 can be clobbered
4333     __ faligndata(F50, F60, F50);
4334     __ faligndata(F60, F62, F60);
4335     __ faligndata(F62, F48, F48);
4336     __ mov(to, G1);
4337     __ and3(to, -8, to);
4338     __ stpartialf(to, G2, F48, Assembler::ASI_PST8_PRIMARY);
4339     __ stf(FloatRegisterImpl::D, F56, to, 8);
4340     __ stf(FloatRegisterImpl::D, F50, to, 16);
4341     __ stf(FloatRegisterImpl::D, F60, to, 24);
4342     __ add(to, 32, to);
4343     __ orn(G0, G2, G2);
4344     __ stpartialf(to, G2, F48, Assembler::ASI_PST8_PRIMARY);
4345     __ mov(G1, to);
4346 
4347     __ BIND(L_check_decrypt_loop_end192);
4348     __ add(from, 32, from);
4349     __ add(to, 32, to);
4350     __ subcc(len_reg, 32, len_reg);
4351     __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks192);
4352     __ delayed()->nop();
4353     __ ba_short(L_cbcdec_end);
4354 
4355     __ align(OptoLoopAlignment);
4356     __ BIND(L_dec_next2_blocks256);
4357     __ nop();
4358 
4359     // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
4360     __ andcc(from, 7, G0);
4361     __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks256);
4362     __ delayed()->mov(from, G1); // save original 'from' address before alignaddr
4363 
4364     // aligned case: load input into G4, G5, L4 and L5
4365     __ ldx(from,0,G4);
4366     __ ldx(from,8,G5);
4367     __ ldx(from,16,L4);
4368     __ ldx(from,24,L5);
4369     __ ba_short(L_transform_next2_blocks256);
4370 
4371     __ BIND(L_load_misaligned_next2_blocks256);
4372     __ alignaddr(from, G0, from);
4373     // F0, F2, F4, F60, F62 can be clobbered
4374     __ ldf(FloatRegisterImpl::D, from, 0, F0);
4375     __ ldf(FloatRegisterImpl::D, from, 8, F2);
4376     __ ldf(FloatRegisterImpl::D, from, 16, F60);
4377     __ ldf(FloatRegisterImpl::D, from, 24, F62);
4378     __ ldf(FloatRegisterImpl::D, from, 32, F4);
4379     __ faligndata(F0, F2, F0);
4380     __ faligndata(F2, F60, F2);
4381     __ faligndata(F60, F62, F60);
4382     __ faligndata(F62, F4, F62);
4383     __ movdtox(F0, G4);
4384     __ movdtox(F2, G5);
4385     __ movdtox(F60, L4);
4386     __ movdtox(F62, L5);
4387     __ mov(G1, from);
4388 
4389     __ BIND(L_transform_next2_blocks256);
4390     // F0:F2 used for first 16-bytes
4391     __ xor3(L2,G4,G1);
4392     __ movxtod(G1,F0);
4393     __ xor3(L3,G5,G1);
4394     __ movxtod(G1,F2);
4395 
4396     // F60:F62 used for next 16-bytes
4397     __ xor3(L2,L4,G1);
4398     __ movxtod(G1,F60);
4399     __ xor3(L3,L5,G1);
4400     __ movxtod(G1,F62);
4401 
4402     __ aes_dround23(F54, F0, F2, F4);
4403     __ aes_dround01(F52, F0, F2, F6);
4404     __ aes_dround23(F54, F60, F62, F58);
4405     __ aes_dround01(F52, F60, F62, F56);
4406     __ aes_dround23(F50, F6, F4, F2);
4407     __ aes_dround01(F48, F6, F4, F0);
4408     __ aes_dround23(F50, F56, F58, F62);
4409     __ aes_dround01(F48, F56, F58, F60);
4410     // save F48:F54 in temp registers
4411     __ movdtox(F54,G2);
4412     __ movdtox(F52,G3);
4413     __ movdtox(F50,G6);
4414     __ movdtox(F48,G1);
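    // The final-round keys normally live in F0:F6, which are holding data here;
    // F48:F54 (saved to temp registers above) are reused below to hold those values.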
4415     for ( int i = 46;  i >= 14; i -= 8 ) {
4416       __ aes_dround23(as_FloatRegister(i), F0, F2, F4);
4417       __ aes_dround01(as_FloatRegister(i-2), F0, F2, F6);
4418       __ aes_dround23(as_FloatRegister(i), F60, F62, F58);
4419       __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56);
4420       __ aes_dround23(as_FloatRegister(i-4), F6, F4, F2);
4421       __ aes_dround01(as_FloatRegister(i-6), F6, F4, F0);
4422       __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62);
4423       __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60);
4424     }
    // F0:F6 are holding data, so reload their round-key values (the first
    // 32 bytes of the original key) into F48:F54 for the final rounds
4426     __ ldf(FloatRegisterImpl::D, original_key, 0, F48);
4427     __ ldf(FloatRegisterImpl::D, original_key, 8, F50);
4428     __ ldf(FloatRegisterImpl::D, original_key, 16, F52);
4429     __ ldf(FloatRegisterImpl::D, original_key, 24, F54);
4430     __ aes_dround23(F54, F0, F2, F4);
4431     __ aes_dround01(F52, F0, F2, F6);
4432     __ aes_dround23(F54, F60, F62, F58);
4433     __ aes_dround01(F52, F60, F62, F56);
4434     __ aes_dround23_l(F50, F6, F4, F2);
4435     __ aes_dround01_l(F48, F6, F4, F0);
4436     __ aes_dround23_l(F50, F56, F58, F62);
4437     __ aes_dround01_l(F48, F56, F58, F60);
4438     // re-init F48:F54 with their original values
4439     __ movxtod(G2,F54);
4440     __ movxtod(G3,F52);
4441     __ movxtod(G6,F50);
4442     __ movxtod(G1,F48);
4443 
4444     __ movxtod(L0,F6);
4445     __ movxtod(L1,F4);
4446     __ fxor(FloatRegisterImpl::D, F6, F0, F0);
4447     __ fxor(FloatRegisterImpl::D, F4, F2, F2);
4448 
4449     __ movxtod(G4,F56);
4450     __ movxtod(G5,F58);
4451     __ mov(L4,L0);
4452     __ mov(L5,L1);
4453     __ fxor(FloatRegisterImpl::D, F56, F60, F60);
4454     __ fxor(FloatRegisterImpl::D, F58, F62, F62);
4455 
4456     // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
4457     __ andcc(to, 7, G1);
4458     __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks256);
4459     __ delayed()->edge8n(to, G0, G2);
4460 
4461     // aligned case: store output into the destination array
4462     __ stf(FloatRegisterImpl::D, F0, to, 0);
4463     __ stf(FloatRegisterImpl::D, F2, to, 8);
4464     __ stf(FloatRegisterImpl::D, F60, to, 16);
4465     __ stf(FloatRegisterImpl::D, F62, to, 24);
4466     __ ba_short(L_check_decrypt_loop_end256);
4467 
4468     __ BIND(L_store_misaligned_output_next2_blocks256);
4469     __ mov(8, G4);
4470     __ sub(G4, G1, G4);
4471     __ alignaddr(G4, G0, G4);
4472     __ faligndata(F0, F2, F56); // F56 can be clobbered
4473     __ faligndata(F2, F60, F2);
4474     __ faligndata(F60, F62, F60);
4475     __ faligndata(F62, F0, F0);
4476     __ mov(to, G1);
4477     __ and3(to, -8, to);
4478     __ stpartialf(to, G2, F0, Assembler::ASI_PST8_PRIMARY);
4479     __ stf(FloatRegisterImpl::D, F56, to, 8);
4480     __ stf(FloatRegisterImpl::D, F2, to, 16);
4481     __ stf(FloatRegisterImpl::D, F60, to, 24);
4482     __ add(to, 32, to);
4483     __ orn(G0, G2, G2);
4484     __ stpartialf(to, G2, F0, Assembler::ASI_PST8_PRIMARY);
4485     __ mov(G1, to);
4486 
4487     __ BIND(L_check_decrypt_loop_end256);
4488     __ add(from, 32, from);
4489     __ add(to, 32, to);
4490     __ subcc(len_reg, 32, len_reg);
4491     __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks256);
4492     __ delayed()->nop();
4493 
4494     __ BIND(L_cbcdec_end);
    // store the last ciphertext block back into rvec as the initial vector for
    // the next call; 8-byte alignment is guaranteed
4496     __ stx(L0, rvec, 0);
4497     __ stx(L1, rvec, 8);
4498     __ mov(L7, I0);
4499     __ ret();
4500     __ delayed()->restore();
4501 
4502     return start;
4503   }
4504 
4505   address generate_sha1_implCompress(bool multi_block, const char *name) {
4506     __ align(CodeEntryAlignment);
4507     StubCodeMark mark(this, "StubRoutines", name);
4508     address start = __ pc();
4509 
4510     Label L_sha1_loop, L_sha1_unaligned_input, L_sha1_unaligned_input_loop;
4511     int i;
4512 
4513     Register buf   = O0; // byte[] source+offset
4514     Register state = O1; // int[]  SHA.state
4515     Register ofs   = O2; // int    offset
4516     Register limit = O3; // int    limit
4517 
4518     // load state into F0-F4
4519     for (i = 0; i < 5; i++) {
4520       __ ldf(FloatRegisterImpl::S, state, i*4, as_FloatRegister(i));
4521     }
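
    // The sha1 instruction operates on fixed registers: the 160-bit state in
    // F0-F4 and the 512-bit message block in F8-F22.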
4522 
4523     __ andcc(buf, 7, G0);
4524     __ br(Assembler::notZero, false, Assembler::pn, L_sha1_unaligned_input);
4525     __ delayed()->nop();
4526 
4527     __ BIND(L_sha1_loop);
4528     // load buf into F8-F22
4529     for (i = 0; i < 8; i++) {
4530       __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 8));
4531     }
4532     __ sha1();
4533     if (multi_block) {
4534       __ add(ofs, 64, ofs);
4535       __ add(buf, 64, buf);
4536       __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha1_loop);
4537       __ mov(ofs, O0); // to be returned
4538     }
4539 
4540     // store F0-F4 into state and return
4541     for (i = 0; i < 4; i++) {
4542       __ stf(FloatRegisterImpl::S, as_FloatRegister(i), state, i*4);
4543     }
4544     __ retl();
4545     __ delayed()->stf(FloatRegisterImpl::S, F4, state, 0x10);
4546 
4547     __ BIND(L_sha1_unaligned_input);
4548     __ alignaddr(buf, G0, buf);
4549 
4550     __ BIND(L_sha1_unaligned_input_loop);
    // load buf into F8-F24 (one extra doubleword for faligndata)
4552     for (i = 0; i < 9; i++) {
4553       __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 8));
4554     }
4555     for (i = 0; i < 8; i++) {
4556       __ faligndata(as_FloatRegister(i*2 + 8), as_FloatRegister(i*2 + 10), as_FloatRegister(i*2 + 8));
4557     }
4558     __ sha1();
4559     if (multi_block) {
4560       __ add(ofs, 64, ofs);
4561       __ add(buf, 64, buf);
4562       __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha1_unaligned_input_loop);
4563       __ mov(ofs, O0); // to be returned
4564     }
4565 
4566     // store F0-F4 into state and return
4567     for (i = 0; i < 4; i++) {
4568       __ stf(FloatRegisterImpl::S, as_FloatRegister(i), state, i*4);
4569     }
4570     __ retl();
4571     __ delayed()->stf(FloatRegisterImpl::S, F4, state, 0x10);
4572 
4573     return start;
4574   }
4575 
4576   address generate_sha256_implCompress(bool multi_block, const char *name) {
4577     __ align(CodeEntryAlignment);
4578     StubCodeMark mark(this, "StubRoutines", name);
4579     address start = __ pc();
4580 
4581     Label L_sha256_loop, L_sha256_unaligned_input, L_sha256_unaligned_input_loop;
4582     int i;
4583 
4584     Register buf   = O0; // byte[] source+offset
4585     Register state = O1; // int[]  SHA2.state
4586     Register ofs   = O2; // int    offset
4587     Register limit = O3; // int    limit
4588 
4589     // load state into F0-F7
4590     for (i = 0; i < 8; i++) {
4591       __ ldf(FloatRegisterImpl::S, state, i*4, as_FloatRegister(i));
4592     }
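
    // The sha256 instruction operates on fixed registers: the 256-bit state in
    // F0-F7 and the 512-bit message block in F8-F22.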
4593 
4594     __ andcc(buf, 7, G0);
4595     __ br(Assembler::notZero, false, Assembler::pn, L_sha256_unaligned_input);
4596     __ delayed()->nop();
4597 
4598     __ BIND(L_sha256_loop);
4599     // load buf into F8-F22
4600     for (i = 0; i < 8; i++) {
4601       __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 8));
4602     }
4603     __ sha256();
4604     if (multi_block) {
4605       __ add(ofs, 64, ofs);
4606       __ add(buf, 64, buf);
4607       __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha256_loop);
4608       __ mov(ofs, O0); // to be returned
4609     }
4610 
4611     // store F0-F7 into state and return
4612     for (i = 0; i < 7; i++) {
4613       __ stf(FloatRegisterImpl::S, as_FloatRegister(i), state, i*4);
4614     }
4615     __ retl();
4616     __ delayed()->stf(FloatRegisterImpl::S, F7, state, 0x1c);
4617 
4618     __ BIND(L_sha256_unaligned_input);
4619     __ alignaddr(buf, G0, buf);
4620 
4621     __ BIND(L_sha256_unaligned_input_loop);
    // load buf into F8-F24 (one extra doubleword for faligndata)
4623     for (i = 0; i < 9; i++) {
4624       __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 8));
4625     }
4626     for (i = 0; i < 8; i++) {
4627       __ faligndata(as_FloatRegister(i*2 + 8), as_FloatRegister(i*2 + 10), as_FloatRegister(i*2 + 8));
4628     }
4629     __ sha256();
4630     if (multi_block) {
4631       __ add(ofs, 64, ofs);
4632       __ add(buf, 64, buf);
4633       __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha256_unaligned_input_loop);
4634       __ mov(ofs, O0); // to be returned
4635     }
4636 
4637     // store F0-F7 into state and return
4638     for (i = 0; i < 7; i++) {
4639       __ stf(FloatRegisterImpl::S, as_FloatRegister(i), state, i*4);
4640     }
4641     __ retl();
4642     __ delayed()->stf(FloatRegisterImpl::S, F7, state, 0x1c);
4643 
4644     return start;
4645   }
4646 
4647   address generate_sha512_implCompress(bool multi_block, const char *name) {
4648     __ align(CodeEntryAlignment);
4649     StubCodeMark mark(this, "StubRoutines", name);
4650     address start = __ pc();
4651 
4652     Label L_sha512_loop, L_sha512_unaligned_input, L_sha512_unaligned_input_loop;
4653     int i;
4654 
4655     Register buf   = O0; // byte[] source+offset
4656     Register state = O1; // long[] SHA5.state
4657     Register ofs   = O2; // int    offset
4658     Register limit = O3; // int    limit
4659 
4660     // load state into F0-F14
4661     for (i = 0; i < 8; i++) {
4662       __ ldf(FloatRegisterImpl::D, state, i*8, as_FloatRegister(i*2));
4663     }
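
    // The sha512 instruction operates on fixed registers: the 512-bit state in
    // F0-F14 and the 1024-bit message block in F16-F46.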
4664 
4665     __ andcc(buf, 7, G0);
4666     __ br(Assembler::notZero, false, Assembler::pn, L_sha512_unaligned_input);
4667     __ delayed()->nop();
4668 
4669     __ BIND(L_sha512_loop);
4670     // load buf into F16-F46
4671     for (i = 0; i < 16; i++) {
4672       __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 16));
4673     }
4674     __ sha512();
4675     if (multi_block) {
4676       __ add(ofs, 128, ofs);
4677       __ add(buf, 128, buf);
4678       __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha512_loop);
4679       __ mov(ofs, O0); // to be returned
4680     }
4681 
4682     // store F0-F14 into state and return
4683     for (i = 0; i < 7; i++) {
4684       __ stf(FloatRegisterImpl::D, as_FloatRegister(i*2), state, i*8);
4685     }
4686     __ retl();
4687     __ delayed()->stf(FloatRegisterImpl::D, F14, state, 0x38);
4688 
4689     __ BIND(L_sha512_unaligned_input);
4690     __ alignaddr(buf, G0, buf);
4691 
4692     __ BIND(L_sha512_unaligned_input_loop);
    // load buf into F16-F48 (one extra doubleword for faligndata)
4694     for (i = 0; i < 17; i++) {
4695       __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 16));
4696     }
4697     for (i = 0; i < 16; i++) {
4698       __ faligndata(as_FloatRegister(i*2 + 16), as_FloatRegister(i*2 + 18), as_FloatRegister(i*2 + 16));
4699     }
4700     __ sha512();
4701     if (multi_block) {
4702       __ add(ofs, 128, ofs);
4703       __ add(buf, 128, buf);
4704       __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha512_unaligned_input_loop);
4705       __ mov(ofs, O0); // to be returned
4706     }
4707 
4708     // store F0-F14 into state and return
4709     for (i = 0; i < 7; i++) {
4710       __ stf(FloatRegisterImpl::D, as_FloatRegister(i*2), state, i*8);
4711     }
4712     __ retl();
4713     __ delayed()->stf(FloatRegisterImpl::D, F14, state, 0x38);
4714 
4715     return start;
4716   }
4717 
4718   /* Single and multi-block ghash operations */
4719   address generate_ghash_processBlocks() {
4720       __ align(CodeEntryAlignment);
4721       Label L_ghash_loop, L_aligned, L_main;
4722       StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4723       address start = __ pc();
4724 
4725       Register state = I0;
4726       Register subkeyH = I1;
4727       Register data = I2;
4728       Register len = I3;
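
      // GHASH folds each 16-byte block of 'data' into the 128-bit state:
      //   state = (state ^ block) * subkeyH  in GF(2^128)
      // with the GCM polynomial x^128 + x^7 + x^2 + x + 1.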
4729 
4730       __ save_frame(0);
4731 
4732       __ ldx(state, 0, O0);
4733       __ ldx(state, 8, O1);
4734 
4735       // Loop label for multiblock operations
4736       __ BIND(L_ghash_loop);
4737 
4738       // Check if 'data' is unaligned
4739       __ andcc(data, 7, G1);
4740       __ br(Assembler::zero, false, Assembler::pt, L_aligned);
4741       __ delayed()->nop();
4742 
4743       Register left_shift = L1;
4744       Register right_shift = L2;
4745       Register data_ptr = L3;
4746 
4747       // Get left and right shift values in bits
4748       __ sll(G1, LogBitsPerByte, left_shift);
4749       __ mov(64, right_shift);
4750       __ sub(right_shift, left_shift, right_shift);
4751 
4752       // Align to read 'data'
4753       __ sub(data, G1, data_ptr);
4754 
4755       // Load first 8 bytes of 'data'
4756       __ ldx(data_ptr, 0, O4);
4757       __ sllx(O4, left_shift, O4);
4758       __ ldx(data_ptr, 8, O5);
4759       __ srlx(O5, right_shift, G4);
4760       __ bset(G4, O4);
4761 
4762       // Load second 8 bytes of 'data'
4763       __ sllx(O5, left_shift, O5);
4764       __ ldx(data_ptr, 16, G4);
4765       __ srlx(G4, right_shift, G4);
4766       __ ba(L_main);
4767       __ delayed()->bset(G4, O5);
4768 
4769       // If 'data' is aligned, load normally
4770       __ BIND(L_aligned);
4771       __ ldx(data, 0, O4);
4772       __ ldx(data, 8, O5);
4773 
4774       __ BIND(L_main);
4775       __ ldx(subkeyH, 0, O2);
4776       __ ldx(subkeyH, 8, O3);
4777 
4778       __ xor3(O0, O4, O0);
4779       __ xor3(O1, O5, O1);
4780 
4781       __ xmulxhi(O0, O3, G3);
4782       __ xmulx(O0, O2, O5);
4783       __ xmulxhi(O1, O2, G4);
4784       __ xmulxhi(O1, O3, G5);
4785       __ xmulx(O0, O3, G1);
4786       __ xmulx(O1, O3, G2);
4787       __ xmulx(O1, O2, O3);
4788       __ xmulxhi(O0, O2, O4);
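
      // The xmulx/xmulxhi results above are the four 64x64 carry-less partial
      // products of (state * subkeyH); the sequence below combines them and
      // reduces the 256-bit product modulo the GHASH polynomial, using the
      // constant 0xE1 << 56 derived from x^128 + x^7 + x^2 + x + 1.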
4789 
4790       __ mov(0xE1, O0);
4791       __ sllx(O0, 56, O0);
4792 
4793       __ xor3(O5, G3, O5);
4794       __ xor3(O5, G4, O5);
4795       __ xor3(G5, G1, G1);
4796       __ xor3(G1, O3, G1);
4797       __ srlx(G2, 63, O1);
4798       __ srlx(G1, 63, G3);
4799       __ sllx(G2, 63, O3);
4800       __ sllx(G2, 58, O2);
4801       __ xor3(O3, O2, O2);
4802 
4803       __ sllx(G1, 1, G1);
4804       __ or3(G1, O1, G1);
4805 
4806       __ xor3(G1, O2, G1);
4807 
4808       __ sllx(G2, 1, G2);
4809 
4810       __ xmulxhi(G1, O0, O1);
4811       __ xmulx(G1, O0, O2);
4812       __ xmulxhi(G2, O0, O3);
4813       __ xmulx(G2, O0, G1);
4814 
4815       __ xor3(O4, O1, O4);
4816       __ xor3(O5, O2, O5);
4817       __ xor3(O5, O3, O5);
4818 
4819       __ sllx(O4, 1, O2);
4820       __ srlx(O5, 63, O3);
4821 
4822       __ or3(O2, O3, O0);
4823 
4824       __ sllx(O5, 1, O1);
4825       __ srlx(G1, 63, O2);
4826       __ or3(O1, O2, O1);
4827       __ xor3(O1, G3, O1);
4828 
4829       __ deccc(len);
4830       __ br(Assembler::notZero, true, Assembler::pt, L_ghash_loop);
4831       __ delayed()->add(data, 16, data);
4832 
4833       __ stx(O0, I0, 0);
4834       __ stx(O1, I0, 8);
4835 
4836       __ ret();
4837       __ delayed()->restore();
4838 
4839       return start;
4840   }
4841 
4842   /**
4843    *  Arguments:
4844    *
4845    * Inputs:
4846    *   O0   - int   crc
4847    *   O1   - byte* buf
4848    *   O2   - int   len
4849    *   O3   - int*  table
4850    *
4851    * Output:
4852    *   O0   - int crc result
4853    */
4854   address generate_updateBytesCRC32C() {
4855     assert(UseCRC32CIntrinsics, "need CRC32C instruction");
4856 
4857     __ align(CodeEntryAlignment);
4858     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
4859     address start = __ pc();
4860 
4861     const Register crc   = O0;  // crc
4862     const Register buf   = O1;  // source java byte array address
4863     const Register len   = O2;  // number of bytes
4864     const Register table = O3;  // byteTable
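
    // The CRC loop itself is emitted by MacroAssembler::kernel_crc32c; hardware
    // CRC32C support is guarded by the UseCRC32CIntrinsics assert above.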
4865 
4866     __ kernel_crc32c(crc, buf, len, table);
4867 
4868     __ retl();
4869     __ delayed()->nop();
4870 
4871     return start;
4872   }
4873 
4874 #define ADLER32_NUM_TEMPS 16
4875 
4876   /**
4877    *  Arguments:
4878    *
4879    * Inputs:
4880    *   O0   - int   adler
4881    *   O1   - byte* buff
4882    *   O2   - int   len
4883    *
4884    * Output:
4885    *   O0   - int adler result
4886    */
4887   address generate_updateBytesAdler32() {
4888     __ align(CodeEntryAlignment);
4889     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
4890     address start = __ pc();
4891 
4892     Label L_cleanup_loop, L_cleanup_loop_check;
4893     Label L_main_loop_check, L_main_loop, L_inner_loop, L_inner_loop_check;
4894     Label L_nmax_check_done;
4895 
4896     // Aliases
4897     Register s1     = O0;
4898     Register s2     = O3;
4899     Register buff   = O1;
4900     Register len    = O2;
4901     Register temp[ADLER32_NUM_TEMPS] = {L0, L1, L2, L3, L4, L5, L6, L7, I0, I1, I2, I3, I4, I5, G3, I7};
4902 
4903     // Max number of bytes we can process before having to take the mod
4904     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
4905     unsigned long NMAX = 0x15B0;
4906 
4907     // Zero-out the upper bits of len
4908     __ clruwu(len);
4909 
4910     // Create the mask 0xFFFF
4911     __ set64(0x00FFFF, O4, O5); // O5 is the temp register
4912 
4913     // s1 is initialized to the lower 16 bits of adler
4914     // s2 is initialized to the upper 16 bits of adler
4915     __ srlx(O0, 16, O5); // adler >> 16
4916     __ and3(O0, O4, s1); // s1  = (adler & 0xFFFF)
4917     __ and3(O5, O4, s2); // s2  = ((adler >> 16) & 0xFFFF)
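
    // Adler-32 invariants: s1 = 1 + sum of all bytes (mod 65521) and
    // s2 = sum of the successive s1 values (mod 65521); the final result
    // packs s2 into the upper 16 bits and s1 into the lower 16 bits.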
4918 
    // The pipelined loop needs at least 16 elements per iteration.
    // It checks this itself, but it is more efficient to branch straight to the cleanup loop here.
    // Set up the constant for the cutoff check
4922     __ mov(15, O4);
4923 
4924     // Check if we are above the cutoff, if not go to the cleanup loop immediately
4925     __ cmp_and_br_short(len, O4, Assembler::lessEqualUnsigned, Assembler::pt, L_cleanup_loop_check);
4926 
4927     // Free up some registers for our use
4928     for (int i = 0; i < ADLER32_NUM_TEMPS; i++) {
4929       __ movxtod(temp[i], as_FloatRegister(2*i));
4930     }
4931 
    // Loop maintenance is done at the end of the loop, so branch there first
4933     __ ba_short(L_main_loop_check);
4934 
4935     __ BIND(L_main_loop);
4936 
4937     // Prologue for inner loop
4938     __ ldub(buff, 0, L0);
4939     __ dec(O5);
4940 
4941     for (int i = 1; i < 8; i++) {
4942       __ ldub(buff, i, temp[i]);
4943     }
4944 
4945     __ inc(buff, 8);
4946 
    // The inner loop processes 16 elements at a time; it might never execute if the
    // outer loop has only 16 elements to process
4949     __ ba_short(L_inner_loop_check);
4950 
4951     __ BIND(L_inner_loop);
4952 
4953     for (int i = 0; i < 8; i++) {
4954       __ ldub(buff, (2*i), temp[(8+(2*i)) % ADLER32_NUM_TEMPS]);
4955       __ add(s1, temp[i], s1);
4956       __ ldub(buff, (2*i)+1, temp[(8+(2*i)+1) % ADLER32_NUM_TEMPS]);
4957       __ add(s2, s1, s2);
4958     }
4959 
    // The original temp 0-7 values have been consumed and new loads into temp 0-7 issued;
    // temp 8-15 are ready to be consumed
4962     __ add(s1, I0, s1);
4963     __ dec(O5);
4964     __ add(s2, s1, s2);
4965     __ add(s1, I1, s1);
4966     __ inc(buff, 16);
4967     __ add(s2, s1, s2);
4968 
4969     for (int i = 0; i < 6; i++) {
4970       __ add(s1, temp[10+i], s1);
4971       __ add(s2, s1, s2);
4972     }
4973 
4974     __ BIND(L_inner_loop_check);
4975     __ nop();
4976     __ cmp_and_br_short(O5, 0, Assembler::notEqual, Assembler::pt, L_inner_loop);
4977 
4978     // Epilogue
4979     for (int i = 0; i < 4; i++) {
4980       __ ldub(buff, (2*i), temp[8+(2*i)]);
4981       __ add(s1, temp[i], s1);
4982       __ ldub(buff, (2*i)+1, temp[8+(2*i)+1]);
4983       __ add(s2, s1, s2);
4984     }
4985 
4986     __ add(s1, temp[4], s1);
4987     __ inc(buff, 8);
4988 
4989     for (int i = 0; i < 11; i++) {
4990       __ add(s2, s1, s2);
4991       __ add(s1, temp[5+i], s1);
4992     }
4993 
4994     __ add(s2, s1, s2);
4995 
    // Reduce s1 and s2 mod 65521 (0xFFF1): x mod m is computed as x - (x / m) * m
4997     __ set64(0xFFF1, L0, L1);
4998     __ udivx(s1, L0, L1);
4999     __ udivx(s2, L0, L2);
5000     __ mulx(L0, L1, L1);
5001     __ mulx(L0, L2, L2);
5002     __ sub(s1, L1, s1);
5003     __ sub(s2, L2, s2);
5004 
5005     // Make sure there is something left to process
5006     __ BIND(L_main_loop_check);
5007     __ set64(NMAX, L0, L1);
5008     // k = len < NMAX ? len : NMAX
5009     __ cmp_and_br_short(len, L0, Assembler::greaterEqualUnsigned, Assembler::pt, L_nmax_check_done);
5010     __ andn(len, 0x0F, L0); // only loop a multiple of 16 times
5011     __ BIND(L_nmax_check_done);
5012     __ mov(L0, O5);
5013     __ sub(len, L0, len); // len -= k
5014 
    __ srlx(O5, 4, O5); // number of 16-byte chunks to process
5016     __ cmp_and_br_short(O5, 0, Assembler::notEqual, Assembler::pt, L_main_loop);
5017 
    // Restore the registers we saved, take the mod one last time, combine and return
5020     for (int i = 0; i < ADLER32_NUM_TEMPS; i++) {
5021       __ movdtox(as_FloatRegister(2*i), temp[i]);
5022     }
5023 
5024     // There might be nothing left to process
5025     __ ba_short(L_cleanup_loop_check);
5026 
5027     __ BIND(L_cleanup_loop);
    __ ldub(buff, 0, O4); // load single byte from buffer
5029     __ inc(buff); // buff++
5030     __ add(s1, O4, s1); // s1 += *buff++;
5031     __ dec(len); // len--
5032     __ add(s1, s2, s2); // s2 += s1;
5033     __ BIND(L_cleanup_loop_check);
5034     __ nop();
5035     __ cmp_and_br_short(len, 0, Assembler::notEqual, Assembler::pt, L_cleanup_loop);
5036 
5037     // Take the mod one last time
5038     __ set64(0xFFF1, O1, O2);
5039     __ udivx(s1, O1, O2);
5040     __ udivx(s2, O1, O5);
5041     __ mulx(O1, O2, O2);
5042     __ mulx(O1, O5, O5);
5043     __ sub(s1, O2, s1);
5044     __ sub(s2, O5, s2);
5045 
5046     // Combine lower bits and higher bits
5047     __ sllx(s2, 16, s2); // s2 = s2 << 16
5048     __ or3(s1, s2, s1);  // adler = s2 | s1
5049     // Final return value is in O0
5050     __ retl();
5051     __ delayed()->nop();
5052 
5053     return start;
5054   }
5055 
  /**
5057    *  Arguments:
5058    *
5059    * Inputs:
5060    *   O0   - int   crc
5061    *   O1   - byte* buf
5062    *   O2   - int   len
5063    *   O3   - int*  table
5064    *
5065    * Output:
5066    *   O0   - int crc result
5067    */
5068   address generate_updateBytesCRC32() {
5069     assert(UseCRC32Intrinsics, "need VIS3 instructions");
5070 
5071     __ align(CodeEntryAlignment);
5072     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
5073     address start = __ pc();
5074 
5075     const Register crc   = O0; // crc
5076     const Register buf   = O1; // source java byte array address
5077     const Register len   = O2; // length
5078     const Register table = O3; // crc_table address (reuse register)
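
    // The CRC loop itself is emitted by MacroAssembler::kernel_crc32, using the
    // crc_table passed in O3; VIS3 support is asserted above.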
5079 
5080     __ kernel_crc32(crc, buf, len, table);
5081 
5082     __ retl();
5083     __ delayed()->nop();
5084 
5085     return start;
5086   }
5087 
5088   void generate_initial() {
    // Generates the initial stubs and initializes the entry points
5090 
5091     //------------------------------------------------------------------------------------------------------------------------
5092     // entry points that exist in all platforms
5093     // Note: This is code that could be shared among different platforms - however the benefit seems to be smaller than
5094     //       the disadvantage of having a much more complicated generator structure. See also comment in stubRoutines.hpp.
5095     StubRoutines::_forward_exception_entry                 = generate_forward_exception();
5096 
5097     StubRoutines::_call_stub_entry                         = generate_call_stub(StubRoutines::_call_stub_return_address);
5098     StubRoutines::_catch_exception_entry                   = generate_catch_exception();
5099 
5100     //------------------------------------------------------------------------------------------------------------------------
5101     // entry points that are platform specific
5102     StubRoutines::Sparc::_test_stop_entry                  = generate_test_stop();
5103 
5104     StubRoutines::Sparc::_stop_subroutine_entry            = generate_stop_subroutine();
5105     StubRoutines::Sparc::_flush_callers_register_windows_entry = generate_flush_callers_register_windows();
5106 
5107 #if !defined(COMPILER2) && !defined(_LP64)
5108     StubRoutines::_atomic_xchg_entry         = generate_atomic_xchg();
5109     StubRoutines::_atomic_cmpxchg_entry      = generate_atomic_cmpxchg();
5110     StubRoutines::_atomic_add_entry          = generate_atomic_add();
5111     StubRoutines::_atomic_xchg_ptr_entry     = StubRoutines::_atomic_xchg_entry;
5112     StubRoutines::_atomic_cmpxchg_ptr_entry  = StubRoutines::_atomic_cmpxchg_entry;
5113     StubRoutines::_atomic_cmpxchg_byte_entry = ShouldNotCallThisStub();
5114     StubRoutines::_atomic_cmpxchg_long_entry = generate_atomic_cmpxchg_long();
5115     StubRoutines::_atomic_add_ptr_entry      = StubRoutines::_atomic_add_entry;
5116 #endif  // COMPILER2 !=> _LP64
5117 
5118     // Build this early so it's available for the interpreter.
5119     StubRoutines::_throw_StackOverflowError_entry =
5120             generate_throw_exception("StackOverflowError throw_exception",
5121             CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError));
5122     StubRoutines::_throw_delayed_StackOverflowError_entry =
5123             generate_throw_exception("delayed StackOverflowError throw_exception",
5124             CAST_FROM_FN_PTR(address, SharedRuntime::throw_delayed_StackOverflowError));
5125 
5126     if (UseCRC32Intrinsics) {
      // set the table address before generating the stub that uses it
5128       StubRoutines::_crc_table_adr = (address)StubRoutines::Sparc::_crc_table;
5129       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
5130     }
5131 
5132     if (UseCRC32CIntrinsics) {
      // set the table address before generating the stub that uses it
5134       StubRoutines::_crc32c_table_addr = (address)StubRoutines::Sparc::_crc32c_table;
5135       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
5136     }
5137   }
5138 
5139 
5140   void generate_all() {
5141     // Generates all stubs and initializes the entry points
5142 
5143     // Generate partial_subtype_check first here since its code depends on
5144     // UseZeroBaseCompressedOops which is defined after heap initialization.
5145     StubRoutines::Sparc::_partial_subtype_check                = generate_partial_subtype_check();
5146     // These entry points require SharedInfo::stack0 to be set up in non-core builds
5147     StubRoutines::_throw_AbstractMethodError_entry         = generate_throw_exception("AbstractMethodError throw_exception",          CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError));
5148     StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError));
5149     StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call));
5150 
5151     // support for verify_oop (must happen after universe_init)
5152     StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop_subroutine();
5153 
5154     // arraycopy stubs used by compilers
5155     generate_arraycopy_stubs();
5156 
5157     // Don't initialize the platform math functions since sparc
5158     // doesn't have intrinsics for these operations.
5159 
5160     // Safefetch stubs.
5161     generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
5162                                                        &StubRoutines::_safefetch32_fault_pc,
5163                                                        &StubRoutines::_safefetch32_continuation_pc);
5164     generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
5165                                                        &StubRoutines::_safefetchN_fault_pc,
5166                                                        &StubRoutines::_safefetchN_continuation_pc);
5167 
5168     // generate AES intrinsics code
5169     if (UseAESIntrinsics) {
5170       StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
5171       StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
5172       StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
5173       StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
5174     }
5175     // generate GHASH intrinsics code
5176     if (UseGHASHIntrinsics) {
5177       StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
5178     }
5179 
5180     // generate SHA1/SHA256/SHA512 intrinsics code
5181     if (UseSHA1Intrinsics) {
5182       StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
5183       StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
5184     }
5185     if (UseSHA256Intrinsics) {
5186       StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
5187       StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
5188     }
5189     if (UseSHA512Intrinsics) {
5190       StubRoutines::_sha512_implCompress   = generate_sha512_implCompress(false, "sha512_implCompress");
5191       StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true,  "sha512_implCompressMB");
5192     }
5193     // generate Adler32 intrinsics code
5194     if (UseAdler32Intrinsics) {
5195       StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
5196     }
5197   }
5198 
5199 
5200  public:
5201   StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
5202     // replace the standard masm with a special one:
5203     _masm = new MacroAssembler(code);
5204 
5205     _stub_count = !all ? 0x100 : 0x200;
5206     if (all) {
5207       generate_all();
5208     } else {
5209       generate_initial();
5210     }
5211 
5212     // make sure this stub is available for all local calls
5213     if (_atomic_add_stub.is_unbound()) {
5214       // generate a second time, if necessary
5215       (void) generate_atomic_add();
5216     }
5217   }
5218 
5219 
5220  private:
5221   int _stub_count;
5222   void stub_prolog(StubCodeDesc* cdesc) {
5223     # ifdef ASSERT
5224       // put extra information in the stub code, to make it more readable
5225 #ifdef _LP64
5226 // Write the high part of the address
5227 // [RGV] Check if there is a dependency on the size of this prolog
5228       __ emit_data((intptr_t)cdesc >> 32,    relocInfo::none);
5229 #endif
5230       __ emit_data((intptr_t)cdesc,    relocInfo::none);
5231       __ emit_data(++_stub_count, relocInfo::none);
5232     # endif
5233     align(true);
5234   }
5235 
5236   void align(bool at_header = false) {
5237     // %%%%% move this constant somewhere else
5238     // UltraSPARC cache line size is 8 instructions:
5239     const unsigned int icache_line_size = 32;
5240     const unsigned int icache_half_line_size = 16;
5241 
5242     if (at_header) {
5243       while ((intptr_t)(__ pc()) % icache_line_size != 0) {
5244         __ emit_data(0, relocInfo::none);
5245       }
5246     } else {
5247       while ((intptr_t)(__ pc()) % icache_half_line_size != 0) {
5248         __ nop();
5249       }
5250     }
5251   }
5252 
5253 }; // end class declaration
5254 
5255 void StubGenerator_generate(CodeBuffer* code, bool all) {
5256   StubGenerator g(code, all);
5257 }