1 /*
2 * Copyright (c) 1997, 2019, Oracle and/or its affiliates. All rights reserved.
3 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
4 *
5 * This code is free software; you can redistribute it and/or modify it
6 * under the terms of the GNU General Public License version 2 only, as
7 * published by the Free Software Foundation.
8 *
9 * This code is distributed in the hope that it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
12 * version 2 for more details (a copy is included in the LICENSE file that
13 * accompanied this code).
14 *
15 * You should have received a copy of the GNU General Public License version
16 * 2 along with this work; if not, write to the Free Software Foundation,
17 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
18 *
19 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
20 * or visit www.oracle.com if you need additional information or have any
21 * questions.
22 *
23 */
24
25 #include "precompiled.hpp"
26 #include "asm/macroAssembler.inline.hpp"
27 #include "gc/shared/barrierSet.hpp"
28 #include "gc/shared/barrierSetAssembler.hpp"
29 #include "interpreter/interpreter.hpp"
30 #include "nativeInst_sparc.hpp"
31 #include "oops/instanceOop.hpp"
32 #include "oops/method.hpp"
33 #include "oops/objArrayKlass.hpp"
34 #include "oops/oop.inline.hpp"
35 #include "prims/methodHandles.hpp"
36 #include "runtime/frame.inline.hpp"
37 #include "runtime/handles.inline.hpp"
38 #include "runtime/sharedRuntime.hpp"
39 #include "runtime/stubCodeGenerator.hpp"
40 #include "runtime/stubRoutines.hpp"
41 #include "runtime/thread.inline.hpp"
42 #ifdef COMPILER2
43 #include "opto/runtime.hpp"
44 #endif
45
46 // Declaration and definition of StubGenerator (no .hpp file).
47 // For a more detailed description of the stub routine structure
48 // see the comment in stubRoutines.hpp.
49
50 #define __ _masm->
51
52 #ifdef PRODUCT
53 #define BLOCK_COMMENT(str) /* nothing */
54 #else
55 #define BLOCK_COMMENT(str) __ block_comment(str)
56 #endif
57
58 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":")
59
60 // Note: The register L7 is used as L7_thread_cache, and may not be used
61 // any other way within this module.
62
63 static const Register& Lstub_temp = L2;
64
65 // -------------------------------------------------------------------------------------------------------------------------
66 // Stub Code definitions
67
68 class StubGenerator: public StubCodeGenerator {
69 private:
70
71 #ifdef PRODUCT
72 #define inc_counter_np(a,b,c)
73 #else
74 #define inc_counter_np(counter, t1, t2) \
75 BLOCK_COMMENT("inc_counter " #counter); \
76 __ inc_counter(&counter, t1, t2);
77 #endif
78
79 //----------------------------------------------------------------------------------------------------
80 // Call stubs are used to call Java from C
81
82 address generate_call_stub(address& return_pc) {
83 StubCodeMark mark(this, "StubRoutines", "call_stub");
84 address start = __ pc();
85
86 // Incoming arguments:
87 //
88 // o0 : call wrapper address
89 // o1 : result (address)
90 // o2 : result type
91 // o3 : method
92 // o4 : (interpreter) entry point
93 // o5 : parameters (address)
94 // [sp + 0x5c]: parameter size (in words)
95 // [sp + 0x60]: thread
96 //
97 // +---------------+ <--- sp + 0
98 // | |
99 // . reg save area .
100 // | |
101 // +---------------+ <--- sp + 0x40
102 // | |
103 // . extra 7 slots .
104 // | |
105 // +---------------+ <--- sp + 0x5c
106 // | param. size |
107 // +---------------+ <--- sp + 0x60
108 // | thread |
109 // +---------------+
110 // | |
111
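    // For orientation, these registers line up with the C-side CallStub entry
    // point the VM uses to enter Java (see stubRoutines.hpp). A hedged sketch of
    // that declaration, for illustration only:
    //
    //   typedef void (*CallStub)(address   link,               // o0
    //                            intptr_t* result,             // o1
    //                            BasicType result_type,        // o2
    //                            Method*   method,             // o3
    //                            address   entry_point,        // o4
    //                            intptr_t* parameters,         // o5
    //                            int       size_of_parameters, // [sp + 0x5c]
    //                            Thread*   thread);            // [sp + 0x60]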
112 // note: if the link argument position changes, adjust
113 // the code in frame::entry_frame_call_wrapper()
114
115 const Argument link = Argument(0, false); // used only for GC
116 const Argument result = Argument(1, false);
117 const Argument result_type = Argument(2, false);
118 const Argument method = Argument(3, false);
119 const Argument entry_point = Argument(4, false);
120 const Argument parameters = Argument(5, false);
121 const Argument parameter_size = Argument(6, false);
122 const Argument thread = Argument(7, false);
123
124 // setup thread register
125 __ ld_ptr(thread.as_address(), G2_thread);
126 __ reinit_heapbase();
127
128 #ifdef ASSERT
129 // make sure we have no pending exceptions
130 { const Register t = G3_scratch;
131 Label L;
132 __ ld_ptr(G2_thread, in_bytes(Thread::pending_exception_offset()), t);
133 __ br_null_short(t, Assembler::pt, L);
134 __ stop("StubRoutines::call_stub: entered with pending exception");
135 __ bind(L);
136 }
137 #endif
138
139 // create activation frame & allocate space for parameters
140 { const Register t = G3_scratch;
141 __ ld_ptr(parameter_size.as_address(), t); // get parameter size (in words)
142 __ add(t, frame::memory_parameter_word_sp_offset, t); // add space for save area (in words)
143 __ round_to(t, WordsPerLong); // make sure it is multiple of 2 (in words)
144 __ sll(t, Interpreter::logStackElementSize, t); // compute number of bytes
145 __ neg(t); // negate so it can be used with save
146 __ save(SP, t, SP); // setup new frame
147 }
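    // A hedged C++ sketch of the frame-size computation above (assuming the
    // stack-element size equals the word size on this port; 'align_up' here is
    // only illustrative):
    //
    //   intptr_t words = param_size_in_words + frame::memory_parameter_word_sp_offset;
    //   words = align_up(words, (intptr_t)WordsPerLong);     // keep the size a multiple of 2 words
    //   intptr_t bytes = words << Interpreter::logStackElementSize;
    //   // 'save' takes a negative delta because the SPARC stack grows downward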
148
149 // +---------------+ <--- sp + 0
150 // | |
151 // . reg save area .
152 // | |
153 // +---------------+ <--- sp + 0x40
154 // | |
155 // . extra 7 slots .
156 // | |
157 // +---------------+ <--- sp + 0x5c
158 // | empty slot | (only if parameter size is even)
159 // +---------------+
160 // | |
161 // . parameters .
162 // | |
163 // +---------------+ <--- fp + 0
164 // | |
165 // . reg save area .
166 // | |
167 // +---------------+ <--- fp + 0x40
168 // | |
169 // . extra 7 slots .
170 // | |
171 // +---------------+ <--- fp + 0x5c
172 // | param. size |
173 // +---------------+ <--- fp + 0x60
174 // | thread |
175 // +---------------+
176 // | |
177
178 // pass parameters if any
179 BLOCK_COMMENT("pass parameters if any");
180 { const Register src = parameters.as_in().as_register();
181 const Register dst = Lentry_args;
182 const Register tmp = G3_scratch;
183 const Register cnt = G4_scratch;
184
185 // test if any parameters & setup of Lentry_args
186 Label exit;
187 __ ld_ptr(parameter_size.as_in().as_address(), cnt); // parameter counter
188 __ add( FP, STACK_BIAS, dst );
189 __ cmp_zero_and_br(Assembler::zero, cnt, exit);
190 __ delayed()->sub(dst, BytesPerWord, dst); // setup Lentry_args
191
192 // copy parameters if any
193 Label loop;
194 __ BIND(loop);
195 // Store parameter value
196 __ ld_ptr(src, 0, tmp);
197 __ add(src, BytesPerWord, src);
198 __ st_ptr(tmp, dst, 0);
199 __ deccc(cnt);
200 __ br(Assembler::greater, false, Assembler::pt, loop);
201 __ delayed()->sub(dst, Interpreter::stackElementSize, dst);
202
203 // done
204 __ BIND(exit);
205 }
206
207 // setup parameters, method & call Java function
208 #ifdef ASSERT
209     // layout_activation_impl checks its notion of saved SP against
210     // this register, so if this changes, update it as well.
211 const Register saved_SP = Lscratch;
212 __ mov(SP, saved_SP); // keep track of SP before call
213 #endif
214
215 // setup parameters
216 const Register t = G3_scratch;
217 __ ld_ptr(parameter_size.as_in().as_address(), t); // get parameter size (in words)
218 __ sll(t, Interpreter::logStackElementSize, t); // compute number of bytes
219 __ sub(FP, t, Gargs); // setup parameter pointer
220 __ add( Gargs, STACK_BIAS, Gargs ); // Account for LP64 stack bias
221 __ mov(SP, O5_savedSP);
222
223
224 // do the call
225 //
226     // the following registers must be set up:
227 //
228 // G2_thread
229 // G5_method
230 // Gargs
231 BLOCK_COMMENT("call Java function");
232 __ jmpl(entry_point.as_in().as_register(), G0, O7);
233 __ delayed()->mov(method.as_in().as_register(), G5_method); // setup method
234
235 BLOCK_COMMENT("call_stub_return_address:");
236 return_pc = __ pc();
237
238     // The callee, if it was not interpreted, can return with SP changed, so
239     // we can no longer assert that SP is unchanged across the call.
240
241 // store result depending on type
242 // (everything that is not T_OBJECT, T_LONG, T_FLOAT, or T_DOUBLE
243 // is treated as T_INT)
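    // A hedged sketch of the dispatch below, in C++ terms (illustration only):
    //
    //   switch (result_type) {
    //     case T_OBJECT: *(oop*)result     = (oop)O0;   break;
    //     case T_FLOAT:  *(jfloat*)result  = F0;        break;  // single-precision F0
    //     case T_DOUBLE: *(jdouble*)result = F0;        break;  // double-precision F0
    //     case T_LONG:   *(jlong*)result   = (jlong)O0; break;
    //     default:       *(jint*)result    = (jint)O0;  break;  // everything else as T_INT
    //   }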
244 { const Register addr = result .as_in().as_register();
245 const Register type = result_type.as_in().as_register();
246 Label is_long, is_float, is_double, is_object, exit;
247 __ cmp(type, T_OBJECT); __ br(Assembler::equal, false, Assembler::pn, is_object);
248 __ delayed()->cmp(type, T_FLOAT); __ br(Assembler::equal, false, Assembler::pn, is_float);
249 __ delayed()->cmp(type, T_DOUBLE); __ br(Assembler::equal, false, Assembler::pn, is_double);
250 __ delayed()->cmp(type, T_LONG); __ br(Assembler::equal, false, Assembler::pn, is_long);
251 __ delayed()->nop();
252
253 // store int result
254 __ st(O0, addr, G0);
255
256 __ BIND(exit);
257 __ ret();
258 __ delayed()->restore();
259
260 __ BIND(is_object);
261 __ ba(exit);
262 __ delayed()->st_ptr(O0, addr, G0);
263
264 __ BIND(is_float);
265 __ ba(exit);
266 __ delayed()->stf(FloatRegisterImpl::S, F0, addr, G0);
267
268 __ BIND(is_double);
269 __ ba(exit);
270 __ delayed()->stf(FloatRegisterImpl::D, F0, addr, G0);
271
272 __ BIND(is_long);
273 __ ba(exit);
274 __ delayed()->st_long(O0, addr, G0); // store entire long
275 }
276 return start;
277 }
278
279
280 //----------------------------------------------------------------------------------------------------
281 // Return point for a Java call if there's an exception thrown in Java code.
282 // The exception is caught and transformed into a pending exception stored in
283 // JavaThread that can be tested from within the VM.
284 //
285 // Oexception: exception oop
286
287 address generate_catch_exception() {
288 StubCodeMark mark(this, "StubRoutines", "catch_exception");
289
290 address start = __ pc();
291 // verify that thread corresponds
292 __ verify_thread();
293
294 const Register& temp_reg = Gtemp;
295 Address pending_exception_addr (G2_thread, Thread::pending_exception_offset());
296 Address exception_file_offset_addr(G2_thread, Thread::exception_file_offset ());
297 Address exception_line_offset_addr(G2_thread, Thread::exception_line_offset ());
298
299 // set pending exception
300 __ verify_oop(Oexception);
301 __ st_ptr(Oexception, pending_exception_addr);
302 __ set((intptr_t)__FILE__, temp_reg);
303 __ st_ptr(temp_reg, exception_file_offset_addr);
304 __ set((intptr_t)__LINE__, temp_reg);
305 __ st(temp_reg, exception_line_offset_addr);
306
307 // complete return to VM
308 assert(StubRoutines::_call_stub_return_address != NULL, "must have been generated before");
309
310 AddressLiteral stub_ret(StubRoutines::_call_stub_return_address);
311 __ jump_to(stub_ret, temp_reg);
312 __ delayed()->nop();
313
314 return start;
315 }
316
317
318 //----------------------------------------------------------------------------------------------------
319 // Continuation point for runtime calls returning with a pending exception
320 // The pending exception check happened in the runtime or native call stub
321 // The pending exception in Thread is converted into a Java-level exception
322 //
323 // Contract with Java-level exception handler: O0 = exception
324 // O1 = throwing pc
325
326 address generate_forward_exception() {
327 StubCodeMark mark(this, "StubRoutines", "forward_exception");
328 address start = __ pc();
329
330 // Upon entry, O7 has the return address returning into Java
331 // (interpreted or compiled) code; i.e. the return address
332 // becomes the throwing pc.
333
334 const Register& handler_reg = Gtemp;
335
336 Address exception_addr(G2_thread, Thread::pending_exception_offset());
337
338 #ifdef ASSERT
339 // make sure that this code is only executed if there is a pending exception
340 { Label L;
341 __ ld_ptr(exception_addr, Gtemp);
342 __ br_notnull_short(Gtemp, Assembler::pt, L);
343 __ stop("StubRoutines::forward exception: no pending exception (1)");
344 __ bind(L);
345 }
346 #endif
347
348 // compute exception handler into handler_reg
349 __ get_thread();
350 __ ld_ptr(exception_addr, Oexception);
351 __ verify_oop(Oexception);
352 __ save_frame(0); // compensates for compiler weakness
353 __ add(O7->after_save(), frame::pc_return_offset, Lscratch); // save the issuing PC
354 BLOCK_COMMENT("call exception_handler_for_return_address");
355 __ call_VM_leaf(L7_thread_cache, CAST_FROM_FN_PTR(address, SharedRuntime::exception_handler_for_return_address), G2_thread, Lscratch);
356 __ mov(O0, handler_reg);
357 __ restore(); // compensates for compiler weakness
358
359 __ ld_ptr(exception_addr, Oexception);
360 __ add(O7, frame::pc_return_offset, Oissuing_pc); // save the issuing PC
361
362 #ifdef ASSERT
363 // make sure exception is set
364 { Label L;
365 __ br_notnull_short(Oexception, Assembler::pt, L);
366 __ stop("StubRoutines::forward exception: no pending exception (2)");
367 __ bind(L);
368 }
369 #endif
370 // jump to exception handler
371 __ jmp(handler_reg, 0);
372 // clear pending exception
373 __ delayed()->st_ptr(G0, exception_addr);
374
375 return start;
376 }
377
378 // Safefetch stubs.
379 void generate_safefetch(const char* name, int size, address* entry,
380 address* fault_pc, address* continuation_pc) {
381 // safefetch signatures:
382 // int SafeFetch32(int* adr, int errValue);
383 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
384 //
385 // arguments:
386 // o0 = adr
387 // o1 = errValue
388 //
389 // result:
390 // o0 = *adr or errValue
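    //  A hedged usage sketch from VM code, for illustration only:
    //
    //    int v = SafeFetch32((int*)maybe_unmapped_addr, /*errValue=*/ -1);
    //    // if the load faulted, the stub's continuation point returns errValue instead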
391
392 StubCodeMark mark(this, "StubRoutines", name);
393
394 // Entry point, pc or function descriptor.
395 __ align(CodeEntryAlignment);
396 *entry = __ pc();
397
398 __ mov(O0, G1); // g1 = o0
399 __ mov(O1, O0); // o0 = o1
400     // Load *adr into O0; this load may fault.
401 *fault_pc = __ pc();
402 switch (size) {
403 case 4:
404 // int32_t
405 __ ldsw(G1, 0, O0); // o0 = [g1]
406 break;
407 case 8:
408 // int64_t
409 __ ldx(G1, 0, O0); // o0 = [g1]
410 break;
411 default:
412 ShouldNotReachHere();
413 }
414
415 // return errValue or *adr
416 *continuation_pc = __ pc();
417 // By convention with the trap handler we ensure there is a non-CTI
418 // instruction in the trap shadow.
419 __ nop();
420 __ retl();
421 __ delayed()->nop();
422 }
423
424 //------------------------------------------------------------------------------------------------------------------------
425 // Continuation point for throwing of implicit exceptions that are not handled in
426 // the current activation. Fabricates an exception oop and initiates normal
427 // exception dispatching in this frame. Only callee-saved registers are preserved
428 // (through the normal register window / RegisterMap handling).
429 // If the compiler needs all registers to be preserved between the fault
430 // point and the exception handler then it must assume responsibility for that in
431 // AbstractCompiler::continuation_for_implicit_null_exception or
432 // continuation_for_implicit_division_by_zero_exception. All other implicit
433 // exceptions (e.g., NullPointerException or AbstractMethodError on entry) are
434 // either at call sites or otherwise assume that stack unwinding will be initiated,
435 // so caller saved registers were assumed volatile in the compiler.
436
437 // Note that we generate only this stub into a RuntimeStub, because it needs to be
438 // properly traversed and ignored during GC, so we change the meaning of the "__"
439 // macro within this method.
440 #undef __
441 #define __ masm->
442
443 address generate_throw_exception(const char* name, address runtime_entry,
444 Register arg1 = noreg, Register arg2 = noreg) {
445 #ifdef ASSERT
446 int insts_size = VerifyThread ? 1 * K : 600;
447 #else
448 int insts_size = VerifyThread ? 1 * K : 256;
449 #endif /* ASSERT */
450 int locs_size = 32;
451
452 CodeBuffer code(name, insts_size, locs_size);
453 MacroAssembler* masm = new MacroAssembler(&code);
454
455 __ verify_thread();
456
457 // This is an inlined and slightly modified version of call_VM
458 // which has the ability to fetch the return PC out of thread-local storage
459 __ assert_not_delayed();
460
461 // Note that we always push a frame because on the SPARC
462 // architecture, for all of our implicit exception kinds at call
463 // sites, the implicit exception is taken before the callee frame
464 // is pushed.
465 __ save_frame(0);
466
467 int frame_complete = __ offset();
468
469 // Note that we always have a runtime stub frame on the top of stack by this point
470 Register last_java_sp = SP;
471 // 64-bit last_java_sp is biased!
472 __ set_last_Java_frame(last_java_sp, G0);
473 if (VerifyThread) __ mov(G2_thread, O0); // about to be smashed; pass early
474 __ save_thread(noreg);
475 if (arg1 != noreg) {
476 assert(arg2 != O1, "clobbered");
477 __ mov(arg1, O1);
478 }
479 if (arg2 != noreg) {
480 __ mov(arg2, O2);
481 }
482 // do the call
483 BLOCK_COMMENT("call runtime_entry");
484 __ call(runtime_entry, relocInfo::runtime_call_type);
485 if (!VerifyThread)
486 __ delayed()->mov(G2_thread, O0); // pass thread as first argument
487 else
488 __ delayed()->nop(); // (thread already passed)
489 __ restore_thread(noreg);
490 __ reset_last_Java_frame();
491
492 // check for pending exceptions. use Gtemp as scratch register.
493 #ifdef ASSERT
494 Label L;
495
496 Address exception_addr(G2_thread, Thread::pending_exception_offset());
497 Register scratch_reg = Gtemp;
498 __ ld_ptr(exception_addr, scratch_reg);
499 __ br_notnull_short(scratch_reg, Assembler::pt, L);
500 __ should_not_reach_here();
501 __ bind(L);
502 #endif // ASSERT
503 BLOCK_COMMENT("call forward_exception_entry");
504 __ call(StubRoutines::forward_exception_entry(), relocInfo::runtime_call_type);
505 // we use O7 linkage so that forward_exception_entry has the issuing PC
506 __ delayed()->restore();
507
508 RuntimeStub* stub = RuntimeStub::new_runtime_stub(name, &code, frame_complete, masm->total_frame_size_in_bytes(0), NULL, false);
509 return stub->entry_point();
510 }
511
512 #undef __
513 #define __ _masm->
514
515
516 // Generate a routine that sets all the registers so we
517 // can tell if the stop routine prints them correctly.
518 address generate_test_stop() {
519 StubCodeMark mark(this, "StubRoutines", "test_stop");
520 address start = __ pc();
521
522 int i;
523
524 __ save_frame(0);
525
526 static jfloat zero = 0.0, one = 1.0;
527
528 // put addr in L0, then load through L0 to F0
529 __ set((intptr_t)&zero, L0); __ ldf( FloatRegisterImpl::S, L0, 0, F0);
530 __ set((intptr_t)&one, L0); __ ldf( FloatRegisterImpl::S, L0, 0, F1); // 1.0 to F1
531
532 // use add to put 2..18 in F2..F18
533 for ( i = 2; i <= 18; ++i ) {
534 __ fadd( FloatRegisterImpl::S, F1, as_FloatRegister(i-1), as_FloatRegister(i));
535 }
536
537 // Now put double 2 in F16, double 18 in F18
538 __ ftof( FloatRegisterImpl::S, FloatRegisterImpl::D, F2, F16 );
539 __ ftof( FloatRegisterImpl::S, FloatRegisterImpl::D, F18, F18 );
540
541 // use add to put 20..32 in F20..F32
542 for (i = 20; i < 32; i += 2) {
543 __ fadd( FloatRegisterImpl::D, F16, as_FloatRegister(i-2), as_FloatRegister(i));
544 }
545
546 // put 0..7 in i's, 8..15 in l's, 16..23 in o's, 24..31 in g's
547 for ( i = 0; i < 8; ++i ) {
548 if (i < 6) {
549 __ set( i, as_iRegister(i));
550 __ set(16 + i, as_oRegister(i));
551 __ set(24 + i, as_gRegister(i));
552 }
553 __ set( 8 + i, as_lRegister(i));
554 }
555
556 __ stop("testing stop");
557
558
559 __ ret();
560 __ delayed()->restore();
561
562 return start;
563 }
564
565
566 address generate_stop_subroutine() {
567 StubCodeMark mark(this, "StubRoutines", "stop_subroutine");
568 address start = __ pc();
569
570 __ stop_subroutine();
571
572 return start;
573 }
574
575 address generate_flush_callers_register_windows() {
576 StubCodeMark mark(this, "StubRoutines", "flush_callers_register_windows");
577 address start = __ pc();
578
579 __ flushw();
580 __ retl(false);
581 __ delayed()->add( FP, STACK_BIAS, O0 );
582 // The returned value must be a stack pointer whose register save area
583 // is flushed, and will stay flushed while the caller executes.
584
585 return start;
586 }
587
588 // Support for jint Atomic::xchg(jint exchange_value, volatile jint* dest).
589 //
590 // Arguments:
591 //
592 // exchange_value: O0
593 // dest: O1
594 //
595 // Results:
596 //
597 // O0: the value previously stored in dest
598 //
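  // A hedged C++ sketch of the CAS-based path below (UseCASForSwap), using a
  // hypothetical 32-bit compare-and-set helper for illustration only:
  //
  //   jint old;
  //   do {
  //     old = *dest;                                           // observe the current value
  //   } while (!cas32(dest, /*expected*/ old, /*new*/ exchange_value));
  //   return old;                                              // previous value, reported in O0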
599 address generate_atomic_xchg() {
600 StubCodeMark mark(this, "StubRoutines", "atomic_xchg");
601 address start = __ pc();
602
603 if (UseCASForSwap) {
604 // Use CAS instead of swap, just in case the MP hardware
605 // prefers to work with just one kind of synch. instruction.
606 Label retry;
607 __ BIND(retry);
608 __ mov(O0, O3); // scratch copy of exchange value
609 __ ld(O1, 0, O2); // observe the previous value
610 // try to replace O2 with O3
611 __ cas(O1, O2, O3);
612 __ cmp_and_br_short(O2, O3, Assembler::notEqual, Assembler::pn, retry);
613
614 __ retl(false);
615 __ delayed()->mov(O2, O0); // report previous value to caller
616 } else {
617 __ retl(false);
618 __ delayed()->swap(O1, 0, O0);
619 }
620
621 return start;
622 }
623
624
625 // Support for jint Atomic::cmpxchg(jint exchange_value, volatile jint* dest, jint compare_value)
626 //
627 // Arguments:
628 //
629 // exchange_value: O0
630 // dest: O1
631 // compare_value: O2
632 //
633 // Results:
634 //
635 // O0: the value previously stored in dest
636 //
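  // The single 'cas' below performs, atomically (hedged sketch):
  //
  //   jint old = *dest;                      // dest in O1
  //   if (old == compare_value) *dest = exchange_value;
  //   return old;                            // previous value, reported in O0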
637 address generate_atomic_cmpxchg() {
638 StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg");
639 address start = __ pc();
640
641 // cmpxchg(dest, compare_value, exchange_value)
642 __ cas(O1, O2, O0);
643 __ retl(false);
644 __ delayed()->nop();
645
646 return start;
647 }
648
649 // Support for jlong Atomic::cmpxchg(jlong exchange_value, volatile jlong *dest, jlong compare_value)
650 //
651 // Arguments:
652 //
653 // exchange_value: O1:O0
654 // dest: O2
655 // compare_value: O4:O3
656 //
657 // Results:
658 //
659 // O1:O0: the value previously stored in dest
660 //
661 // Overwrites: G1,G2,G3
662 //
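  // A hedged sketch of the register packing done below, assuming each 64-bit
  // argument arrives as a (high, low) pair of 32-bit halves:
  //
  //   uint64_t xchg = ((uint64_t)(uint32_t)O0 << 32) | (uint32_t)O1;  // exchange_value
  //   uint64_t cmp  = ((uint64_t)(uint32_t)O3 << 32) | (uint32_t)O4;  // compare_value
  //   uint64_t old  = casx(/*dest*/ O2, cmp, xchg);                   // atomic 64-bit CAS
  //   // 'old' is unpacked into O0 (high half) and O1 (low half) on return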
663 address generate_atomic_cmpxchg_long() {
664 StubCodeMark mark(this, "StubRoutines", "atomic_cmpxchg_long");
665 address start = __ pc();
666
667 __ sllx(O0, 32, O0);
668 __ srl(O1, 0, O1);
669     __ or3(O0,O1,O0);   // O0 holds 64-bit exchange_value
670 __ sllx(O3, 32, O3);
671 __ srl(O4, 0, O4);
672     __ or3(O3,O4,O3);   // O3 holds 64-bit compare_value
673 __ casx(O2, O3, O0);
674 __ srl(O0, 0, O1); // unpacked return value in O1:O0
675 __ retl(false);
676 __ delayed()->srlx(O0, 32, O0);
677
678 return start;
679 }
680
681
682 // Support for jint Atomic::add(volatile jint* dest, jint add_value).
683 //
684 // Arguments:
685 //
686 // add_value: O0 (e.g., +1 or -1)
687 // dest: O1
688 //
689 // Results:
690 //
691 // O0: the new value stored in dest
692 //
693 // Overwrites: O3
694 //
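  // A hedged C++ sketch of the retry loop below, using a hypothetical 32-bit
  // compare-and-set helper for illustration only:
  //
  //   jint observed, updated;
  //   do {
  //     observed = *dest;
  //     updated  = observed + add_value;
  //   } while (!cas32(dest, /*expected*/ observed, /*new*/ updated));
  //   return updated;                                          // new value, reported in O0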
695 address generate_atomic_add() {
696 StubCodeMark mark(this, "StubRoutines", "atomic_add");
697 address start = __ pc();
698 __ BIND(_atomic_add_stub);
699
700     Label retry;
701 __ BIND(retry);
702
703 __ lduw(O1, 0, O2);
704 __ add(O0, O2, O3);
705 __ cas(O1, O2, O3);
706 __ cmp_and_br_short(O2, O3, Assembler::notEqual, Assembler::pn, retry);
707 __ retl(false);
708 __ delayed()->add(O0, O2, O0); // note that cas made O2==O3
709
710 return start;
711 }
712 Label _atomic_add_stub; // called from other stubs
713
714
715   // Support for uint StubRoutines::Sparc::partial_subtype_check( Klass sub, Klass super );
716 // Arguments :
717 //
718 // ret : O0, returned
719 // icc/xcc: set as O0 (depending on wordSize)
720 // sub : O1, argument, not changed
721 // super: O2, argument, not changed
722 // raddr: O7, blown by call
723 address generate_partial_subtype_check() {
724 __ align(CodeEntryAlignment);
725 StubCodeMark mark(this, "StubRoutines", "partial_subtype_check");
726 address start = __ pc();
727 Label miss;
728
729 __ save_frame(0);
730 Register Rret = I0;
731 Register Rsub = I1;
732 Register Rsuper = I2;
733
734 Register L0_ary_len = L0;
735 Register L1_ary_ptr = L1;
736 Register L2_super = L2;
737 Register L3_index = L3;
738
739 __ check_klass_subtype_slow_path(Rsub, Rsuper,
740 L0, L1, L2, L3,
741 NULL, &miss);
742
743 // Match falls through here.
744 __ addcc(G0,0,Rret); // set Z flags, Z result
745
746 __ ret(); // Result in Rret is zero; flags set to Z
747 __ delayed()->restore();
748
749 __ BIND(miss);
750 __ addcc(G0,1,Rret); // set NZ flags, NZ result
751
752 __ ret(); // Result in Rret is != 0; flags set to NZ
753 __ delayed()->restore();
754
755 return start;
756 }
757
758
759 // Called from MacroAssembler::verify_oop
760 //
761 address generate_verify_oop_subroutine() {
762 StubCodeMark mark(this, "StubRoutines", "verify_oop_stub");
763
764 address start = __ pc();
765
766 __ verify_oop_subroutine();
767
768 return start;
769 }
770
771
772 //
773   //  Verify that a register contains a clean 32-bit positive value
774   //  (the high 32 bits are 0) so it can be used in 64-bit shifts (sllx, srax).
775   //
776   //  Input:
777   //    Rint  -  32-bit value
778 // Rtmp - scratch
779 //
780 void assert_clean_int(Register Rint, Register Rtmp) {
781 #if defined(ASSERT)
782 __ signx(Rint, Rtmp);
783 __ cmp(Rint, Rtmp);
784 __ breakpoint_trap(Assembler::notEqual, Assembler::xcc);
785 #endif
786 }
787
788 //
789 // Generate overlap test for array copy stubs
790 //
791 // Input:
792 // O0 - array1
793 // O1 - array2
794 // O2 - element count
795 //
796 // Kills temps: O3, O4
797 //
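  // A hedged sketch of the condition tested below: a forward (non-overlapping)
  // copy is safe when, in unsigned arithmetic,
  //
  //   (size_t)(to - from) >= (size_t)(count << log2_elem_size)
  //
  // which also covers 'to' below 'from' because the subtraction wraps around.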
798 void array_overlap_test(address no_overlap_target, int log2_elem_size) {
799 assert(no_overlap_target != NULL, "must be generated");
800 array_overlap_test(no_overlap_target, NULL, log2_elem_size);
801 }
802 void array_overlap_test(Label& L_no_overlap, int log2_elem_size) {
803 array_overlap_test(NULL, &L_no_overlap, log2_elem_size);
804 }
805 void array_overlap_test(address no_overlap_target, Label* NOLp, int log2_elem_size) {
806 const Register from = O0;
807 const Register to = O1;
808 const Register count = O2;
809 const Register to_from = O3; // to - from
810 const Register byte_count = O4; // count << log2_elem_size
811
812 __ subcc(to, from, to_from);
813 __ sll_ptr(count, log2_elem_size, byte_count);
814 if (NOLp == NULL)
815 __ brx(Assembler::lessEqualUnsigned, false, Assembler::pt, no_overlap_target);
816 else
817 __ brx(Assembler::lessEqualUnsigned, false, Assembler::pt, (*NOLp));
818 __ delayed()->cmp(to_from, byte_count);
819 if (NOLp == NULL)
820 __ brx(Assembler::greaterEqualUnsigned, false, Assembler::pt, no_overlap_target);
821 else
822 __ brx(Assembler::greaterEqualUnsigned, false, Assembler::pt, (*NOLp));
823 __ delayed()->nop();
824 }
825
826
827 //
828 // Generate main code for disjoint arraycopy
829 //
830 typedef void (StubGenerator::*CopyLoopFunc)(Register from, Register to, Register count, int count_dec,
831 Label& L_loop, bool use_prefetch, bool use_bis);
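  // (For reference: callers pass a pointer-to-member such as
  //  &StubGenerator::copy_16_bytes_shift_loop, and disjoint_copy_core() invokes
  //  it via (this->*copy_loop_func)(...), as seen below.)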
832
833 void disjoint_copy_core(Register from, Register to, Register count, int log2_elem_size,
834 int iter_size, StubGenerator::CopyLoopFunc copy_loop_func) {
835 Label L_copy;
836
837 assert(log2_elem_size <= 3, "the following code should be changed");
838 int count_dec = 16>>log2_elem_size;
839
840 int prefetch_dist = MAX2(ArraycopySrcPrefetchDistance, ArraycopyDstPrefetchDistance);
841 assert(prefetch_dist < 4096, "invalid value");
842 prefetch_dist = (prefetch_dist + (iter_size-1)) & (-iter_size); // round up to one iteration copy size
843 int prefetch_count = (prefetch_dist >> log2_elem_size); // elements count
844
845 if (UseBlockCopy) {
846 Label L_block_copy, L_block_copy_prefetch, L_skip_block_copy;
847
848 // 64 bytes tail + bytes copied in one loop iteration
849 int tail_size = 64 + iter_size;
850 int block_copy_count = (MAX2(tail_size, (int)BlockCopyLowLimit)) >> log2_elem_size;
851 // Use BIS copy only for big arrays since it requires membar.
852 __ set(block_copy_count, O4);
853 __ cmp_and_br_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_skip_block_copy);
854 // This code is for disjoint source and destination:
855 // to <= from || to >= from+count
856 // but BIS will stomp over 'from' if (to > from-tail_size && to <= from)
857 __ sub(from, to, O4);
858       __ srax(O4, 4, O4); // divide by 16 since the following short branch has only 5 bits for the immediate.
859 __ cmp_and_br_short(O4, (tail_size>>4), Assembler::lessEqualUnsigned, Assembler::pn, L_skip_block_copy);
860
861 __ wrasi(G0, Assembler::ASI_ST_BLKINIT_PRIMARY);
862 // BIS should not be used to copy tail (64 bytes+iter_size)
863 // to avoid zeroing of following values.
864 __ sub(count, (tail_size>>log2_elem_size), count); // count is still positive >= 0
865
866 if (prefetch_count > 0) { // rounded up to one iteration count
867 // Do prefetching only if copy size is bigger
868 // than prefetch distance.
869 __ set(prefetch_count, O4);
870 __ cmp_and_brx_short(count, O4, Assembler::less, Assembler::pt, L_block_copy);
871 __ sub(count, O4, count);
872
873 (this->*copy_loop_func)(from, to, count, count_dec, L_block_copy_prefetch, true, true);
874 __ set(prefetch_count, O4);
875 __ add(count, O4, count);
876
877 } // prefetch_count > 0
878
879 (this->*copy_loop_func)(from, to, count, count_dec, L_block_copy, false, true);
880 __ add(count, (tail_size>>log2_elem_size), count); // restore count
881
882 __ wrasi(G0, Assembler::ASI_PRIMARY_NOFAULT);
883 // BIS needs membar.
884 __ membar(Assembler::StoreLoad);
885 // Copy tail
886 __ ba_short(L_copy);
887
888 __ BIND(L_skip_block_copy);
889 } // UseBlockCopy
890
891 if (prefetch_count > 0) { // rounded up to one iteration count
892 // Do prefetching only if copy size is bigger
893 // than prefetch distance.
894 __ set(prefetch_count, O4);
895 __ cmp_and_brx_short(count, O4, Assembler::lessUnsigned, Assembler::pt, L_copy);
896 __ sub(count, O4, count);
897
898 Label L_copy_prefetch;
899 (this->*copy_loop_func)(from, to, count, count_dec, L_copy_prefetch, true, false);
900 __ set(prefetch_count, O4);
901 __ add(count, O4, count);
902
903 } // prefetch_count > 0
904
905 (this->*copy_loop_func)(from, to, count, count_dec, L_copy, false, false);
906 }
907
908
909
910 //
911 // Helper methods for copy_16_bytes_forward_with_shift()
912 //
913 void copy_16_bytes_shift_loop(Register from, Register to, Register count, int count_dec,
914 Label& L_loop, bool use_prefetch, bool use_bis) {
915
916 const Register left_shift = G1; // left shift bit counter
917 const Register right_shift = G5; // right shift bit counter
918
919 __ align(OptoLoopAlignment);
920 __ BIND(L_loop);
921 if (use_prefetch) {
922 if (ArraycopySrcPrefetchDistance > 0) {
923 __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads);
924 }
925 if (ArraycopyDstPrefetchDistance > 0) {
926 __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads);
927 }
928 }
929 __ ldx(from, 0, O4);
930 __ ldx(from, 8, G4);
931 __ inc(to, 16);
932 __ inc(from, 16);
933 __ deccc(count, count_dec); // Can we do next iteration after this one?
934 __ srlx(O4, right_shift, G3);
935 __ bset(G3, O3);
936 __ sllx(O4, left_shift, O4);
937 __ srlx(G4, right_shift, G3);
938 __ bset(G3, O4);
939 if (use_bis) {
940 __ stxa(O3, to, -16);
941 __ stxa(O4, to, -8);
942 } else {
943 __ stx(O3, to, -16);
944 __ stx(O4, to, -8);
945 }
946 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
947 __ delayed()->sllx(G4, left_shift, O3);
948 }
949
950 // Copy big chunks forward with shift
951 //
952 // Inputs:
953   //   from      - source array address
954   //   to        - destination array address, aligned to 8 bytes
955   //   count     - element count to copy, at least the element count equivalent to 16 bytes
956   //   count_dec - decrement of the element count equivalent to 16 bytes
957 // L_copy_bytes - copy exit label
958 //
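  // A hedged sketch of one merged output word in the shift loops below, where
  // ls = (misaligned bytes) * 8 and rs = 64 - ls (SPARC is big-endian):
  //
  //   out[i] = (chunk[i] << ls) | (chunk[i+1] >> rs);   // chunk[] are aligned 8-byte loads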
959 void copy_16_bytes_forward_with_shift(Register from, Register to,
960 Register count, int log2_elem_size, Label& L_copy_bytes) {
961 Label L_aligned_copy, L_copy_last_bytes;
962 assert(log2_elem_size <= 3, "the following code should be changed");
963 int count_dec = 16>>log2_elem_size;
964
965     // if both arrays have the same alignment mod 8, do an 8-byte aligned copy
966 __ andcc(from, 7, G1); // misaligned bytes
967 __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
968 __ delayed()->nop();
969
970 const Register left_shift = G1; // left shift bit counter
971 const Register right_shift = G5; // right shift bit counter
972
973 __ sll(G1, LogBitsPerByte, left_shift);
974 __ mov(64, right_shift);
975 __ sub(right_shift, left_shift, right_shift);
976
977 //
978     // Load 2 aligned 8-byte chunks and use one from the previous iteration
979     // to form 2 aligned 8-byte chunks to store.
980 //
981 __ dec(count, count_dec); // Pre-decrement 'count'
982 __ andn(from, 7, from); // Align address
983 __ ldx(from, 0, O3);
984 __ inc(from, 8);
985 __ sllx(O3, left_shift, O3);
986
987 disjoint_copy_core(from, to, count, log2_elem_size, 16, &StubGenerator::copy_16_bytes_shift_loop);
988
989 __ inccc(count, count_dec>>1 ); // + 8 bytes
990 __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
991 __ delayed()->inc(count, count_dec>>1); // restore 'count'
992
993 // copy 8 bytes, part of them already loaded in O3
994 __ ldx(from, 0, O4);
995 __ inc(to, 8);
996 __ inc(from, 8);
997 __ srlx(O4, right_shift, G3);
998 __ bset(O3, G3);
999 __ stx(G3, to, -8);
1000
1001 __ BIND(L_copy_last_bytes);
1002 __ srl(right_shift, LogBitsPerByte, right_shift); // misaligned bytes
1003 __ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
1004 __ delayed()->sub(from, right_shift, from); // restore address
1005
1006 __ BIND(L_aligned_copy);
1007 }
1008
1009 // Copy big chunks backward with shift
1010 //
1011 // Inputs:
1012  //   end_from  - source array end address
1013  //   end_to    - destination array end address, aligned to 8 bytes
1014  //   count     - element count to copy, at least the element count equivalent to 16 bytes
1015  //   count_dec - decrement of the element count equivalent to 16 bytes
1016 // L_aligned_copy - aligned copy exit label
1017 // L_copy_bytes - copy exit label
1018 //
1019 void copy_16_bytes_backward_with_shift(Register end_from, Register end_to,
1020 Register count, int count_dec,
1021 Label& L_aligned_copy, Label& L_copy_bytes) {
1022 Label L_loop, L_copy_last_bytes;
1023
1024    // if both arrays have the same alignment mod 8, do an 8-byte aligned copy
1025 __ andcc(end_from, 7, G1); // misaligned bytes
1026 __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
1027 __ delayed()->deccc(count, count_dec); // Pre-decrement 'count'
1028
1029 const Register left_shift = G1; // left shift bit counter
1030 const Register right_shift = G5; // right shift bit counter
1031
1032 __ sll(G1, LogBitsPerByte, left_shift);
1033 __ mov(64, right_shift);
1034 __ sub(right_shift, left_shift, right_shift);
1035
1036 //
1037    // Load 2 aligned 8-byte chunks and use one from the previous iteration
1038    // to form 2 aligned 8-byte chunks to store.
1039 //
1040 __ andn(end_from, 7, end_from); // Align address
1041 __ ldx(end_from, 0, O3);
1042 __ align(OptoLoopAlignment);
1043 __ BIND(L_loop);
1044 __ ldx(end_from, -8, O4);
1045 __ deccc(count, count_dec); // Can we do next iteration after this one?
1046 __ ldx(end_from, -16, G4);
1047 __ dec(end_to, 16);
1048 __ dec(end_from, 16);
1049 __ srlx(O3, right_shift, O3);
1050 __ sllx(O4, left_shift, G3);
1051 __ bset(G3, O3);
1052 __ stx(O3, end_to, 8);
1053 __ srlx(O4, right_shift, O4);
1054 __ sllx(G4, left_shift, G3);
1055 __ bset(G3, O4);
1056 __ stx(O4, end_to, 0);
1057 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
1058 __ delayed()->mov(G4, O3);
1059
1060 __ inccc(count, count_dec>>1 ); // + 8 bytes
1061 __ brx(Assembler::negative, true, Assembler::pn, L_copy_last_bytes);
1062 __ delayed()->inc(count, count_dec>>1); // restore 'count'
1063
1064 // copy 8 bytes, part of them already loaded in O3
1065 __ ldx(end_from, -8, O4);
1066 __ dec(end_to, 8);
1067 __ dec(end_from, 8);
1068 __ srlx(O3, right_shift, O3);
1069 __ sllx(O4, left_shift, G3);
1070 __ bset(O3, G3);
1071 __ stx(G3, end_to, 0);
1072
1073 __ BIND(L_copy_last_bytes);
1074 __ srl(left_shift, LogBitsPerByte, left_shift); // misaligned bytes
1075 __ br(Assembler::always, false, Assembler::pt, L_copy_bytes);
1076 __ delayed()->add(end_from, left_shift, end_from); // restore address
1077 }
1078
1079 address generate_unsafecopy_common_error_exit() {
1080 address start_pc = __ pc();
1081 if (UseBlockCopy) {
1082 __ wrasi(G0, Assembler::ASI_PRIMARY_NOFAULT);
1083 __ membar(Assembler::StoreLoad);
1084 }
1085 __ retl();
1086 __ delayed()->mov(G0, O0); // return 0
1087 return start_pc;
1088 }
1089
1090 //
1091 // Generate stub for disjoint byte copy. If "aligned" is true, the
1092 // "from" and "to" addresses are assumed to be heapword aligned.
1093 //
1094 // Arguments for generated stub:
1095 // from: O0
1096 // to: O1
1097 // count: O2 treated as signed
1098 //
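  // Ignoring the alignment and block-copy tuning, the generated stub is a hedged
  // equivalent of (illustration only):
  //
  //   for (int i = 0; i < count; i++) to[i] = from[i];  // disjoint, so any order is fine
  //   return 0;                                          // the stub reports 0 in O0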
1099 address generate_disjoint_byte_copy(bool aligned, address *entry, const char *name) {
1100 __ align(CodeEntryAlignment);
1101 StubCodeMark mark(this, "StubRoutines", name);
1102 address start = __ pc();
1103
1104 Label L_skip_alignment, L_align;
1105 Label L_copy_byte, L_copy_byte_loop, L_exit;
1106
1107 const Register from = O0; // source array address
1108 const Register to = O1; // destination array address
1109 const Register count = O2; // elements count
1110 const Register offset = O5; // offset from start of arrays
1111 // O3, O4, G3, G4 are used as temp registers
1112
1113 assert_clean_int(count, O3); // Make sure 'count' is clean int.
1114
1115 if (entry != NULL) {
1116 *entry = __ pc();
1117 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1118 BLOCK_COMMENT("Entry:");
1119 }
1120
1121 {
1122 // UnsafeCopyMemory page error: continue at UnsafeCopyMemory common_error_exit
1123 UnsafeCopyMemoryMark ucmm(this, !aligned, false);
1124
1125 // for short arrays, just do single element copy
1126 __ cmp(count, 23); // 16 + 7
1127 __ brx(Assembler::less, false, Assembler::pn, L_copy_byte);
1128 __ delayed()->mov(G0, offset);
1129
1130 if (aligned) {
1131 // 'aligned' == true when it is known statically during compilation
1132 // of this arraycopy call site that both 'from' and 'to' addresses
1133 // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
1134 //
1135      //  Aligned arrays have 4-byte alignment in a 32-bit VM
1136      //  and 8-byte alignment in a 64-bit VM, so this step is only needed in a 32-bit VM.
1137 //
1138 } else {
1139 // copy bytes to align 'to' on 8 byte boundary
1140 __ andcc(to, 7, G1); // misaligned bytes
1141 __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1142 __ delayed()->neg(G1);
1143      __ inc(G1, 8);       // bytes needed to reach the next 8-byte alignment
1144 __ sub(count, G1, count);
1145 __ BIND(L_align);
1146 __ ldub(from, 0, O3);
1147 __ deccc(G1);
1148 __ inc(from);
1149 __ stb(O3, to, 0);
1150 __ br(Assembler::notZero, false, Assembler::pt, L_align);
1151 __ delayed()->inc(to);
1152 __ BIND(L_skip_alignment);
1153 }
1154 if (!aligned) {
1155 // Copy with shift 16 bytes per iteration if arrays do not have
1156 // the same alignment mod 8, otherwise fall through to the next
1157 // code for aligned copy.
1158      // The compare above (count >= 23) guarantees 'count' >= 16 bytes.
1159 // Also jump over aligned copy after the copy with shift completed.
1160
1161 copy_16_bytes_forward_with_shift(from, to, count, 0, L_copy_byte);
1162 }
1163
1164      // Both arrays are 8-byte aligned, copy 16 bytes at a time
1165 __ and3(count, 7, G4); // Save count
1166 __ srl(count, 3, count);
1167 generate_disjoint_long_copy_core(aligned);
1168 __ mov(G4, count); // Restore count
1169
1170      // copy trailing bytes
1171 __ BIND(L_copy_byte);
1172 __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
1173 __ align(OptoLoopAlignment);
1174 __ BIND(L_copy_byte_loop);
1175 __ ldub(from, offset, O3);
1176 __ deccc(count);
1177 __ stb(O3, to, offset);
1178 __ brx(Assembler::notZero, false, Assembler::pt, L_copy_byte_loop);
1179 __ delayed()->inc(offset);
1180 }
1181
1182 __ BIND(L_exit);
1183 // O3, O4 are used as temp registers
1184 inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr, O3, O4);
1185 __ retl();
1186 __ delayed()->mov(G0, O0); // return 0
1187 return start;
1188 }
1189
1190 //
1191 // Generate stub for conjoint byte copy. If "aligned" is true, the
1192 // "from" and "to" addresses are assumed to be heapword aligned.
1193 //
1194 // Arguments for generated stub:
1195 // from: O0
1196 // to: O1
1197 // count: O2 treated as signed
1198 //
1199 address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
1200 address *entry, const char *name) {
1201 // Do reverse copy.
1202
1203 __ align(CodeEntryAlignment);
1204 StubCodeMark mark(this, "StubRoutines", name);
1205 address start = __ pc();
1206
1207 Label L_skip_alignment, L_align, L_aligned_copy;
1208 Label L_copy_byte, L_copy_byte_loop, L_exit;
1209
1210 const Register from = O0; // source array address
1211 const Register to = O1; // destination array address
1212 const Register count = O2; // elements count
1213 const Register end_from = from; // source array end address
1214 const Register end_to = to; // destination array end address
1215
1216 assert_clean_int(count, O3); // Make sure 'count' is clean int.
1217
1218 if (entry != NULL) {
1219 *entry = __ pc();
1220 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1221 BLOCK_COMMENT("Entry:");
1222 }
1223
1224 array_overlap_test(nooverlap_target, 0);
1225
1226 {
1227 // UnsafeCopyMemory page error: continue at UnsafeCopyMemory common_error_exit
1228 UnsafeCopyMemoryMark ucmm(this, !aligned, false);
1229
1230 __ add(to, count, end_to); // offset after last copied element
1231
1232 // for short arrays, just do single element copy
1233 __ cmp(count, 23); // 16 + 7
1234 __ brx(Assembler::less, false, Assembler::pn, L_copy_byte);
1235 __ delayed()->add(from, count, end_from);
1236
1237 {
1238        // Align the end of the arrays since they may not be aligned even
1239        // when the arrays themselves are aligned.
1240
1241 // copy bytes to align 'end_to' on 8 byte boundary
1242 __ andcc(end_to, 7, G1); // misaligned bytes
1243 __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1244 __ delayed()->nop();
1245 __ sub(count, G1, count);
1246 __ BIND(L_align);
1247 __ dec(end_from);
1248 __ dec(end_to);
1249 __ ldub(end_from, 0, O3);
1250 __ deccc(G1);
1251 __ brx(Assembler::notZero, false, Assembler::pt, L_align);
1252 __ delayed()->stb(O3, end_to, 0);
1253 __ BIND(L_skip_alignment);
1254 }
1255 if (aligned) {
1256 // Both arrays are aligned to 8-bytes in 64-bits VM.
1257 // The 'count' is decremented in copy_16_bytes_backward_with_shift()
1258 // in unaligned case.
1259 __ dec(count, 16);
1260 } else {
1261 // Copy with shift 16 bytes per iteration if arrays do not have
1262 // the same alignment mod 8, otherwise jump to the next
1263        // code for aligned copy (subtracting 16 from 'count' before the jump).
1264        // The compare above (count >= 23) guarantees 'count' >= 16 bytes.
1265 // Also jump over aligned copy after the copy with shift completed.
1266
1267 copy_16_bytes_backward_with_shift(end_from, end_to, count, 16,
1268 L_aligned_copy, L_copy_byte);
1269 }
1270 // copy 4 elements (16 bytes) at a time
1271 __ align(OptoLoopAlignment);
1272 __ BIND(L_aligned_copy);
1273 __ dec(end_from, 16);
1274 __ ldx(end_from, 8, O3);
1275 __ ldx(end_from, 0, O4);
1276 __ dec(end_to, 16);
1277 __ deccc(count, 16);
1278 __ stx(O3, end_to, 8);
1279 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
1280 __ delayed()->stx(O4, end_to, 0);
1281 __ inc(count, 16);
1282
1283      // copy 1 element (1 byte) at a time
1284 __ BIND(L_copy_byte);
1285 __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
1286 __ align(OptoLoopAlignment);
1287 __ BIND(L_copy_byte_loop);
1288 __ dec(end_from);
1289 __ dec(end_to);
1290 __ ldub(end_from, 0, O4);
1291 __ deccc(count);
1292 __ brx(Assembler::greater, false, Assembler::pt, L_copy_byte_loop);
1293 __ delayed()->stb(O4, end_to, 0);
1294 }
1295
1296 __ BIND(L_exit);
1297 // O3, O4 are used as temp registers
1298 inc_counter_np(SharedRuntime::_jbyte_array_copy_ctr, O3, O4);
1299 __ retl();
1300 __ delayed()->mov(G0, O0); // return 0
1301 return start;
1302 }
1303
1304 //
1305 // Generate stub for disjoint short copy. If "aligned" is true, the
1306 // "from" and "to" addresses are assumed to be heapword aligned.
1307 //
1308 // Arguments for generated stub:
1309 // from: O0
1310 // to: O1
1311 // count: O2 treated as signed
1312 //
1313 address generate_disjoint_short_copy(bool aligned, address *entry, const char * name) {
1314 __ align(CodeEntryAlignment);
1315 StubCodeMark mark(this, "StubRoutines", name);
1316 address start = __ pc();
1317
1318 Label L_skip_alignment, L_skip_alignment2;
1319 Label L_copy_2_bytes, L_copy_2_bytes_loop, L_exit;
1320
1321 const Register from = O0; // source array address
1322 const Register to = O1; // destination array address
1323 const Register count = O2; // elements count
1324 const Register offset = O5; // offset from start of arrays
1325 // O3, O4, G3, G4 are used as temp registers
1326
1327 assert_clean_int(count, O3); // Make sure 'count' is clean int.
1328
1329 if (entry != NULL) {
1330 *entry = __ pc();
1331 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1332 BLOCK_COMMENT("Entry:");
1333 }
1334
1335 {
1336 // UnsafeCopyMemory page error: continue at UnsafeCopyMemory common_error_exit
1337 UnsafeCopyMemoryMark ucmm(this, !aligned, false);
1338 // for short arrays, just do single element copy
1339 __ cmp(count, 11); // 8 + 3 (22 bytes)
1340 __ brx(Assembler::less, false, Assembler::pn, L_copy_2_bytes);
1341 __ delayed()->mov(G0, offset);
1342
1343 if (aligned) {
1344 // 'aligned' == true when it is known statically during compilation
1345 // of this arraycopy call site that both 'from' and 'to' addresses
1346 // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
1347 //
1348      //  Aligned arrays have 4-byte alignment in a 32-bit VM
1349      //  and 8-byte alignment in a 64-bit VM.
1350 //
1351 } else {
1352      // copy 1 element if necessary to align 'to' on a 4-byte boundary
1353 __ andcc(to, 3, G0);
1354 __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1355 __ delayed()->lduh(from, 0, O3);
1356 __ inc(from, 2);
1357 __ inc(to, 2);
1358 __ dec(count);
1359 __ sth(O3, to, -2);
1360 __ BIND(L_skip_alignment);
1361
1362 // copy 2 elements to align 'to' on an 8 byte boundary
1363 __ andcc(to, 7, G0);
1364 __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment2);
1365 __ delayed()->lduh(from, 0, O3);
1366 __ dec(count, 2);
1367 __ lduh(from, 2, O4);
1368 __ inc(from, 4);
1369 __ inc(to, 4);
1370 __ sth(O3, to, -4);
1371 __ sth(O4, to, -2);
1372 __ BIND(L_skip_alignment2);
1373 }
1374 if (!aligned) {
1375 // Copy with shift 16 bytes per iteration if arrays do not have
1376 // the same alignment mod 8, otherwise fall through to the next
1377 // code for aligned copy.
1378      // The compare above (count >= 11) guarantees 'count' >= 16 bytes.
1379 // Also jump over aligned copy after the copy with shift completed.
1380
1381 copy_16_bytes_forward_with_shift(from, to, count, 1, L_copy_2_bytes);
1382 }
1383
1384      // Both arrays are 8-byte aligned, copy 16 bytes at a time
1385 __ and3(count, 3, G4); // Save
1386 __ srl(count, 2, count);
1387 generate_disjoint_long_copy_core(aligned);
1388 __ mov(G4, count); // restore
1389
1390 // copy 1 element at a time
1391 __ BIND(L_copy_2_bytes);
1392 __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
1393 __ align(OptoLoopAlignment);
1394 __ BIND(L_copy_2_bytes_loop);
1395 __ lduh(from, offset, O3);
1396 __ deccc(count);
1397 __ sth(O3, to, offset);
1398 __ brx(Assembler::notZero, false, Assembler::pt, L_copy_2_bytes_loop);
1399 __ delayed()->inc(offset, 2);
1400 }
1401
1402 __ BIND(L_exit);
1403 // O3, O4 are used as temp registers
1404 inc_counter_np(SharedRuntime::_jshort_array_copy_ctr, O3, O4);
1405 __ retl();
1406 __ delayed()->mov(G0, O0); // return 0
1407 return start;
1408 }
1409
1410 //
1411  // Generate stub for array fill (byte, short, or int elements). If "aligned" is true, the
1412 // "to" address is assumed to be heapword aligned.
1413 //
1414 // Arguments for generated stub:
1415 // to: O0
1416 // value: O1
1417 // count: O2 treated as signed
1418 //
1419 address generate_fill(BasicType t, bool aligned, const char* name) {
1420 __ align(CodeEntryAlignment);
1421 StubCodeMark mark(this, "StubRoutines", name);
1422 address start = __ pc();
1423
1424      const Register to        = O0;   // destination array address (the array being filled)
1425 const Register value = O1; // fill value
1426 const Register count = O2; // elements count
1427 // O3 is used as a temp register
1428
1429 assert_clean_int(count, O3); // Make sure 'count' is clean int.
1430
1431 Label L_exit, L_skip_align1, L_skip_align2, L_fill_byte;
1432 Label L_fill_2_bytes, L_fill_elements, L_fill_32_bytes;
1433
1434 int shift = -1;
1435 switch (t) {
1436 case T_BYTE:
1437 shift = 2;
1438 break;
1439 case T_SHORT:
1440 shift = 1;
1441 break;
1442 case T_INT:
1443 shift = 0;
1444 break;
1445 default: ShouldNotReachHere();
1446 }
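    // Note (hedged): shift is 2 - log2(element size), so 'n << shift' converts a
    // count of 4-byte units into an element count; e.g. '8 << shift' below is
    // always the number of elements that fit in a 32-byte chunk.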
1447
1448 BLOCK_COMMENT("Entry:");
1449
1450 if (t == T_BYTE) {
1451 // Zero extend value
1452 __ and3(value, 0xff, value);
1453 __ sllx(value, 8, O3);
1454 __ or3(value, O3, value);
1455 }
1456 if (t == T_SHORT) {
1457 // Zero extend value
1458 __ sllx(value, 48, value);
1459 __ srlx(value, 48, value);
1460 }
1461 if (t == T_BYTE || t == T_SHORT) {
1462 __ sllx(value, 16, O3);
1463 __ or3(value, O3, value);
1464 }
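    // A hedged sketch of the replication above for T_BYTE (T_SHORT is analogous,
    // starting from 16 significant bits):
    //
    //   value &= 0xff;
    //   value |= value << 8;    // 2 identical bytes
    //   value |= value << 16;   // 4 identical bytes; widened to 8 below, before the 8-byte stores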
1465
1466 __ cmp(count, 2<<shift); // Short arrays (< 8 bytes) fill by element
1467 __ brx(Assembler::lessUnsigned, false, Assembler::pn, L_fill_elements); // use unsigned cmp
1468 __ delayed()->andcc(count, 1, G0);
1469
1470 if (!aligned && (t == T_BYTE || t == T_SHORT)) {
1471      // align the 'to' address on a 4-byte boundary
1472 if (t == T_BYTE) {
1473 // One byte misalignment happens only for byte arrays
1474 __ andcc(to, 1, G0);
1475 __ br(Assembler::zero, false, Assembler::pt, L_skip_align1);
1476 __ delayed()->nop();
1477 __ stb(value, to, 0);
1478 __ inc(to, 1);
1479 __ dec(count, 1);
1480 __ BIND(L_skip_align1);
1481 }
1482 // Two bytes misalignment happens only for byte and short (char) arrays
1483 __ andcc(to, 2, G0);
1484 __ br(Assembler::zero, false, Assembler::pt, L_skip_align2);
1485 __ delayed()->nop();
1486 __ sth(value, to, 0);
1487 __ inc(to, 2);
1488 __ dec(count, 1 << (shift - 1));
1489 __ BIND(L_skip_align2);
1490 }
1491 if (!aligned) {
1492 // align to 8 bytes, we know we are 4 byte aligned to start
1493 __ andcc(to, 7, G0);
1494 __ br(Assembler::zero, false, Assembler::pt, L_fill_32_bytes);
1495 __ delayed()->nop();
1496 __ stw(value, to, 0);
1497 __ inc(to, 4);
1498 __ dec(count, 1 << shift);
1499 __ BIND(L_fill_32_bytes);
1500 }
1501
1502 if (t == T_INT) {
1503 // Zero extend value
1504 __ srl(value, 0, value);
1505 }
1506 if (t == T_BYTE || t == T_SHORT || t == T_INT) {
1507 __ sllx(value, 32, O3);
1508 __ or3(value, O3, value);
1509 }
1510
1511 Label L_check_fill_8_bytes;
1512 // Fill 32-byte chunks
1513 __ subcc(count, 8 << shift, count);
1514 __ brx(Assembler::less, false, Assembler::pt, L_check_fill_8_bytes);
1515 __ delayed()->nop();
1516
1517 Label L_fill_32_bytes_loop, L_fill_4_bytes;
1518 __ align(16);
1519 __ BIND(L_fill_32_bytes_loop);
1520
1521 __ stx(value, to, 0);
1522 __ stx(value, to, 8);
1523 __ stx(value, to, 16);
1524 __ stx(value, to, 24);
1525
1526 __ subcc(count, 8 << shift, count);
1527 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_fill_32_bytes_loop);
1528 __ delayed()->add(to, 32, to);
1529
1530 __ BIND(L_check_fill_8_bytes);
1531 __ addcc(count, 8 << shift, count);
1532 __ brx(Assembler::zero, false, Assembler::pn, L_exit);
1533 __ delayed()->subcc(count, 1 << (shift + 1), count);
1534 __ brx(Assembler::less, false, Assembler::pn, L_fill_4_bytes);
1535 __ delayed()->andcc(count, 1<<shift, G0);
1536
1537 //
1538 // length is too short, just fill 8 bytes at a time
1539 //
1540 Label L_fill_8_bytes_loop;
1541 __ BIND(L_fill_8_bytes_loop);
1542 __ stx(value, to, 0);
1543 __ subcc(count, 1 << (shift + 1), count);
1544 __ brx(Assembler::greaterEqual, false, Assembler::pn, L_fill_8_bytes_loop);
1545 __ delayed()->add(to, 8, to);
1546
1547 // fill trailing 4 bytes
1548 __ andcc(count, 1<<shift, G0); // in delay slot of branches
1549 if (t == T_INT) {
1550 __ BIND(L_fill_elements);
1551 }
1552 __ BIND(L_fill_4_bytes);
1553 __ brx(Assembler::zero, false, Assembler::pt, L_fill_2_bytes);
1554 if (t == T_BYTE || t == T_SHORT) {
1555 __ delayed()->andcc(count, 1<<(shift-1), G0);
1556 } else {
1557 __ delayed()->nop();
1558 }
1559 __ stw(value, to, 0);
1560 if (t == T_BYTE || t == T_SHORT) {
1561 __ inc(to, 4);
1562 // fill trailing 2 bytes
1563 __ andcc(count, 1<<(shift-1), G0); // in delay slot of branches
1564 __ BIND(L_fill_2_bytes);
1565 __ brx(Assembler::zero, false, Assembler::pt, L_fill_byte);
1566 __ delayed()->andcc(count, 1, count);
1567 __ sth(value, to, 0);
1568 if (t == T_BYTE) {
1569 __ inc(to, 2);
1570 // fill trailing byte
1571 __ andcc(count, 1, count); // in delay slot of branches
1572 __ BIND(L_fill_byte);
1573 __ brx(Assembler::zero, false, Assembler::pt, L_exit);
1574 __ delayed()->nop();
1575 __ stb(value, to, 0);
1576 } else {
1577 __ BIND(L_fill_byte);
1578 }
1579 } else {
1580 __ BIND(L_fill_2_bytes);
1581 }
1582 __ BIND(L_exit);
1583 __ retl();
1584 __ delayed()->nop();
1585
1586      // Handle fills of fewer than 8 bytes. Int is handled elsewhere.
1587 if (t == T_BYTE) {
1588 __ BIND(L_fill_elements);
1589 Label L_fill_2, L_fill_4;
1590 // in delay slot __ andcc(count, 1, G0);
1591 __ brx(Assembler::zero, false, Assembler::pt, L_fill_2);
1592 __ delayed()->andcc(count, 2, G0);
1593 __ stb(value, to, 0);
1594 __ inc(to, 1);
1595 __ BIND(L_fill_2);
1596 __ brx(Assembler::zero, false, Assembler::pt, L_fill_4);
1597 __ delayed()->andcc(count, 4, G0);
1598 __ stb(value, to, 0);
1599 __ stb(value, to, 1);
1600 __ inc(to, 2);
1601 __ BIND(L_fill_4);
1602 __ brx(Assembler::zero, false, Assembler::pt, L_exit);
1603 __ delayed()->nop();
1604 __ stb(value, to, 0);
1605 __ stb(value, to, 1);
1606 __ stb(value, to, 2);
1607 __ retl();
1608 __ delayed()->stb(value, to, 3);
1609 }
1610
1611 if (t == T_SHORT) {
1612 Label L_fill_2;
1613 __ BIND(L_fill_elements);
1614 // in delay slot __ andcc(count, 1, G0);
1615 __ brx(Assembler::zero, false, Assembler::pt, L_fill_2);
1616 __ delayed()->andcc(count, 2, G0);
1617 __ sth(value, to, 0);
1618 __ inc(to, 2);
1619 __ BIND(L_fill_2);
1620 __ brx(Assembler::zero, false, Assembler::pt, L_exit);
1621 __ delayed()->nop();
1622 __ sth(value, to, 0);
1623 __ retl();
1624 __ delayed()->sth(value, to, 2);
1625 }
1626 return start;
1627 }
1628
1629 //
1630 // Generate stub for conjoint short copy. If "aligned" is true, the
1631 // "from" and "to" addresses are assumed to be heapword aligned.
1632 //
1633 // Arguments for generated stub:
1634 // from: O0
1635 // to: O1
1636 // count: O2 treated as signed
1637 //
1638 address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
1639 address *entry, const char *name) {
1640 // Do reverse copy.
1641
1642 __ align(CodeEntryAlignment);
1643 StubCodeMark mark(this, "StubRoutines", name);
1644 address start = __ pc();
1645
1646 Label L_skip_alignment, L_skip_alignment2, L_aligned_copy;
1647 Label L_copy_2_bytes, L_copy_2_bytes_loop, L_exit;
1648
1649 const Register from = O0; // source array address
1650 const Register to = O1; // destination array address
1651 const Register count = O2; // elements count
1652 const Register end_from = from; // source array end address
1653 const Register end_to = to; // destination array end address
1654
1655 const Register byte_count = O3; // bytes count to copy
1656
1657 assert_clean_int(count, O3); // Make sure 'count' is clean int.
1658
1659 if (entry != NULL) {
1660 *entry = __ pc();
1661 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1662 BLOCK_COMMENT("Entry:");
1663 }
1664
1665 array_overlap_test(nooverlap_target, 1);
1666
1667 {
1668 // UnsafeCopyMemory page error: continue at UnsafeCopyMemory common_error_exit
1669 UnsafeCopyMemoryMark ucmm(this, !aligned, false);
1670
1671 __ sllx(count, LogBytesPerShort, byte_count);
1672 __ add(to, byte_count, end_to); // offset after last copied element
1673
1674 // for short arrays, just do single element copy
1675 __ cmp(count, 11); // 8 + 3 (22 bytes)
1676 __ brx(Assembler::less, false, Assembler::pn, L_copy_2_bytes);
1677 __ delayed()->add(from, byte_count, end_from);
1678
1679 {
1680        // Align the end of the arrays since they may not be aligned even
1681        // when the arrays themselves are aligned.
1682
1683        // copy 1 element if necessary to align 'end_to' on a 4-byte boundary
1684 __ andcc(end_to, 3, G0);
1685 __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1686 __ delayed()->lduh(end_from, -2, O3);
1687 __ dec(end_from, 2);
1688 __ dec(end_to, 2);
1689 __ dec(count);
1690 __ sth(O3, end_to, 0);
1691 __ BIND(L_skip_alignment);
1692
1693 // copy 2 elements to align 'end_to' on an 8 byte boundary
1694 __ andcc(end_to, 7, G0);
1695 __ br(Assembler::zero, false, Assembler::pn, L_skip_alignment2);
1696 __ delayed()->lduh(end_from, -2, O3);
1697 __ dec(count, 2);
1698 __ lduh(end_from, -4, O4);
1699 __ dec(end_from, 4);
1700 __ dec(end_to, 4);
1701 __ sth(O3, end_to, 2);
1702 __ sth(O4, end_to, 0);
1703 __ BIND(L_skip_alignment2);
1704 }
1705 if (aligned) {
1706         // Both arrays are aligned to 8 bytes in the 64-bit VM.
1707         // The 'count' is decremented in copy_16_bytes_backward_with_shift()
1708         // in the unaligned case.
1709 __ dec(count, 8);
1710 } else {
1711         // Copy 16 bytes per iteration with shift if the arrays do not have
1712         // the same alignment mod 8; otherwise jump to the aligned-copy code
1713         // below (subtracting 8 from 'count' before the jump).
1714         // The compare above (count >= 11) guarantees at least 16 bytes to copy.
1715         // Also jump over the aligned copy after the copy with shift completes.
1716
1717 copy_16_bytes_backward_with_shift(end_from, end_to, count, 8,
1718 L_aligned_copy, L_copy_2_bytes);
1719 }
1720 // copy 4 elements (16 bytes) at a time
1721 __ align(OptoLoopAlignment);
1722 __ BIND(L_aligned_copy);
1723 __ dec(end_from, 16);
1724 __ ldx(end_from, 8, O3);
1725 __ ldx(end_from, 0, O4);
1726 __ dec(end_to, 16);
1727 __ deccc(count, 8);
1728 __ stx(O3, end_to, 8);
1729 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
1730 __ delayed()->stx(O4, end_to, 0);
1731 __ inc(count, 8);
1732
1733 // copy 1 element (2 bytes) at a time
1734 __ BIND(L_copy_2_bytes);
1735 __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
1736 __ BIND(L_copy_2_bytes_loop);
1737 __ dec(end_from, 2);
1738 __ dec(end_to, 2);
1739 __ lduh(end_from, 0, O4);
1740 __ deccc(count);
1741 __ brx(Assembler::greater, false, Assembler::pt, L_copy_2_bytes_loop);
1742 __ delayed()->sth(O4, end_to, 0);
1743 }
1744 __ BIND(L_exit);
1745 // O3, O4 are used as temp registers
1746 inc_counter_np(SharedRuntime::_jshort_array_copy_ctr, O3, O4);
1747 __ retl();
1748 __ delayed()->mov(G0, O0); // return 0
1749 return start;
1750 }
1751
1752 //
1753 // Helper methods for generate_disjoint_int_copy_core()
1754 //
1755 void copy_16_bytes_loop(Register from, Register to, Register count, int count_dec,
1756 Label& L_loop, bool use_prefetch, bool use_bis) {
1757
1758 __ align(OptoLoopAlignment);
1759 __ BIND(L_loop);
1760 if (use_prefetch) {
1761 if (ArraycopySrcPrefetchDistance > 0) {
1762 __ prefetch(from, ArraycopySrcPrefetchDistance, Assembler::severalReads);
1763 }
1764 if (ArraycopyDstPrefetchDistance > 0) {
1765 __ prefetch(to, ArraycopyDstPrefetchDistance, Assembler::severalWritesAndPossiblyReads);
1766 }
1767 }
1768 __ ldx(from, 4, O4);
1769 __ ldx(from, 12, G4);
1770 __ inc(to, 16);
1771 __ inc(from, 16);
1772 __ deccc(count, 4); // Can we do next iteration after this one?
1773
1774 __ srlx(O4, 32, G3);
1775 __ bset(G3, O3);
1776 __ sllx(O4, 32, O4);
1777 __ srlx(G4, 32, G3);
1778 __ bset(G3, O4);
1779 if (use_bis) {
1780 __ stxa(O3, to, -16);
1781 __ stxa(O4, to, -8);
1782 } else {
1783 __ stx(O3, to, -16);
1784 __ stx(O4, to, -8);
1785 }
1786 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
1787 __ delayed()->sllx(G4, 32, O3);
1788
1789 }
1790
1791 //
1792 // Generate core code for disjoint int copy (and oop copy on 32-bit).
1793 // If "aligned" is true, the "from" and "to" addresses are assumed
1794 // to be heapword aligned.
1795 //
1796 // Arguments:
1797 // from: O0
1798 // to: O1
1799 // count: O2 treated as signed
1800 //
1801 void generate_disjoint_int_copy_core(bool aligned) {
1802
1803 Label L_skip_alignment, L_aligned_copy;
1804 Label L_copy_4_bytes, L_copy_4_bytes_loop, L_exit;
1805
1806 const Register from = O0; // source array address
1807 const Register to = O1; // destination array address
1808 const Register count = O2; // elements count
1809 const Register offset = O5; // offset from start of arrays
1810 // O3, O4, G3, G4 are used as temp registers
1811
1812 // 'aligned' == true when it is known statically during compilation
1813 // of this arraycopy call site that both 'from' and 'to' addresses
1814 // are HeapWordSize aligned (see LibraryCallKit::basictype2arraycopy()).
1815 //
1816     // Aligned arrays have 4-byte alignment in the 32-bit VM
1817     // and 8-byte alignment in the 64-bit VM.
1818 //
1819 if (!aligned) {
1820 // The next check could be put under 'ifndef' since the code in
1821       // generate_disjoint_long_copy_core() has its own checks and sets 'offset'.
1822
1823 // for short arrays, just do single element copy
1824 __ cmp(count, 5); // 4 + 1 (20 bytes)
1825 __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_4_bytes);
1826 __ delayed()->mov(G0, offset);
1827
1828 // copy 1 element to align 'to' on an 8 byte boundary
1829 __ andcc(to, 7, G0);
1830 __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1831 __ delayed()->ld(from, 0, O3);
1832 __ inc(from, 4);
1833 __ inc(to, 4);
1834 __ dec(count);
1835 __ st(O3, to, -4);
1836 __ BIND(L_skip_alignment);
1837
1838 // if arrays have same alignment mod 8, do 4 elements copy
1839 __ andcc(from, 7, G0);
1840 __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
1841 __ delayed()->ld(from, 0, O3);
1842
1843 //
1844       // Load 2 aligned 8-byte chunks and use one from the previous iteration
1845       // to form 2 aligned 8-byte chunks to store.
1846       //
1847       // copy_16_bytes_forward_with_shift() is not used here since this
1848       // code is more efficient.
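      //
      // Rough C-style sketch of the shift-and-merge loop generated by
      // copy_16_bytes_loop() (defined above, invoked via disjoint_copy_core()
      // below); added for illustration, assuming big-endian layout,
      // 'from' == 4 mod 8 and 'to' 8-byte aligned:
      //
      //   prev = (uint64_t)load32(from) << 32;      // first, unaligned element
      //   do {
      //     a = load64(from + 4);                   // aligned 8-byte loads
      //     b = load64(from + 12);
      //     store64(to,     prev      | (a >> 32));
      //     store64(to + 8, (a << 32) | (b >> 32));
      //     prev = b << 32;  from += 16;  to += 16;
      //   } while (more than 4 elements remain);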
1849
1850 // copy with shift 4 elements (16 bytes) at a time
1851       __ dec(count, 4); // The cmp at the beginning guarantees count >= 4
1852 __ sllx(O3, 32, O3);
1853
1854 disjoint_copy_core(from, to, count, 2, 16, &StubGenerator::copy_16_bytes_loop);
1855
1856 __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes);
1857 __ delayed()->inc(count, 4); // restore 'count'
1858
1859 __ BIND(L_aligned_copy);
1860 } // !aligned
1861
1862 // copy 4 elements (16 bytes) at a time
1863 __ and3(count, 1, G4); // Save
1864 __ srl(count, 1, count);
1865 generate_disjoint_long_copy_core(aligned);
1866 __ mov(G4, count); // Restore
1867
1868 // copy 1 element at a time
1869 __ BIND(L_copy_4_bytes);
1870 __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
1871 __ BIND(L_copy_4_bytes_loop);
1872 __ ld(from, offset, O3);
1873 __ deccc(count);
1874 __ st(O3, to, offset);
1875 __ brx(Assembler::notZero, false, Assembler::pt, L_copy_4_bytes_loop);
1876 __ delayed()->inc(offset, 4);
1877 __ BIND(L_exit);
1878 }
1879
1880 //
1881 // Generate stub for disjoint int copy. If "aligned" is true, the
1882 // "from" and "to" addresses are assumed to be heapword aligned.
1883 //
1884 // Arguments for generated stub:
1885 // from: O0
1886 // to: O1
1887 // count: O2 treated as signed
1888 //
1889 address generate_disjoint_int_copy(bool aligned, address *entry, const char *name) {
1890 __ align(CodeEntryAlignment);
1891 StubCodeMark mark(this, "StubRoutines", name);
1892 address start = __ pc();
1893
1894 const Register count = O2;
1895 assert_clean_int(count, O3); // Make sure 'count' is clean int.
1896
1897 if (entry != NULL) {
1898 *entry = __ pc();
1899 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
1900 BLOCK_COMMENT("Entry:");
1901 }
1902 {
1903 // UnsafeCopyMemory page error: continue at UnsafeCopyMemory common_error_exit
1904 UnsafeCopyMemoryMark ucmm(this, !aligned, false);
1905 generate_disjoint_int_copy_core(aligned);
1906 }
1907 // O3, O4 are used as temp registers
1908 inc_counter_np(SharedRuntime::_jint_array_copy_ctr, O3, O4);
1909 __ retl();
1910 __ delayed()->mov(G0, O0); // return 0
1911 return start;
1912 }
1913
1914 //
1915 // Generate core code for conjoint int copy (and oop copy on 32-bit).
1916 // If "aligned" is true, the "from" and "to" addresses are assumed
1917 // to be heapword aligned.
1918 //
1919 // Arguments:
1920 // from: O0
1921 // to: O1
1922 // count: O2 treated as signed
1923 //
1924 void generate_conjoint_int_copy_core(bool aligned) {
1925 // Do reverse copy.
1926
1927 Label L_skip_alignment, L_aligned_copy;
1928 Label L_copy_16_bytes, L_copy_4_bytes, L_copy_4_bytes_loop, L_exit;
1929
1930 const Register from = O0; // source array address
1931 const Register to = O1; // destination array address
1932 const Register count = O2; // elements count
1933 const Register end_from = from; // source array end address
1934 const Register end_to = to; // destination array end address
1935 // O3, O4, O5, G3 are used as temp registers
1936
1937 const Register byte_count = O3; // bytes count to copy
1938
1939 __ sllx(count, LogBytesPerInt, byte_count);
1940     __ add(to, byte_count, end_to); // address just past the last copied element
1941
1942 __ cmp(count, 5); // for short arrays, just do single element copy
1943 __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_4_bytes);
1944 __ delayed()->add(from, byte_count, end_from);
1945
1946 // copy 1 element to align 'to' on an 8 byte boundary
1947 __ andcc(end_to, 7, G0);
1948 __ br(Assembler::zero, false, Assembler::pt, L_skip_alignment);
1949 __ delayed()->nop();
1950 __ dec(count);
1951 __ dec(end_from, 4);
1952 __ dec(end_to, 4);
1953 __ ld(end_from, 0, O4);
1954 __ st(O4, end_to, 0);
1955 __ BIND(L_skip_alignment);
1956
1957     // Check if 'end_from' and 'end_to' have the same alignment.
1958 __ andcc(end_from, 7, G0);
1959 __ br(Assembler::zero, false, Assembler::pt, L_aligned_copy);
1960     __ delayed()->dec(count, 4); // The cmp at the start guarantees cnt >= 4
1961
1962 // copy with shift 4 elements (16 bytes) at a time
1963 //
1964     // Load 2 aligned 8-byte chunks and use one from the previous iteration
1965     // to form 2 aligned 8-byte chunks to store.
1966 //
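    // (Illustrative note, added for clarity: this is the mirror image of the
    //  forward shift-and-merge in generate_disjoint_int_copy_core() -- each
    //  stored 8-byte chunk is assembled from halves of two neighbouring
    //  aligned loads, walking from the end of the arrays downwards.)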
1967 __ ldx(end_from, -4, O3);
1968 __ align(OptoLoopAlignment);
1969 __ BIND(L_copy_16_bytes);
1970 __ ldx(end_from, -12, O4);
1971 __ deccc(count, 4);
1972 __ ldx(end_from, -20, O5);
1973 __ dec(end_to, 16);
1974 __ dec(end_from, 16);
1975 __ srlx(O3, 32, O3);
1976 __ sllx(O4, 32, G3);
1977 __ bset(G3, O3);
1978 __ stx(O3, end_to, 8);
1979 __ srlx(O4, 32, O4);
1980 __ sllx(O5, 32, G3);
1981 __ bset(O4, G3);
1982 __ stx(G3, end_to, 0);
1983 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes);
1984 __ delayed()->mov(O5, O3);
1985
1986 __ br(Assembler::always, false, Assembler::pt, L_copy_4_bytes);
1987 __ delayed()->inc(count, 4);
1988
1989 // copy 4 elements (16 bytes) at a time
1990 __ align(OptoLoopAlignment);
1991 __ BIND(L_aligned_copy);
1992 __ dec(end_from, 16);
1993 __ ldx(end_from, 8, O3);
1994 __ ldx(end_from, 0, O4);
1995 __ dec(end_to, 16);
1996 __ deccc(count, 4);
1997 __ stx(O3, end_to, 8);
1998 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_aligned_copy);
1999 __ delayed()->stx(O4, end_to, 0);
2000 __ inc(count, 4);
2001
2002 // copy 1 element (4 bytes) at a time
2003 __ BIND(L_copy_4_bytes);
2004 __ cmp_and_br_short(count, 0, Assembler::equal, Assembler::pt, L_exit);
2005 __ BIND(L_copy_4_bytes_loop);
2006 __ dec(end_from, 4);
2007 __ dec(end_to, 4);
2008 __ ld(end_from, 0, O4);
2009 __ deccc(count);
2010 __ brx(Assembler::greater, false, Assembler::pt, L_copy_4_bytes_loop);
2011 __ delayed()->st(O4, end_to, 0);
2012 __ BIND(L_exit);
2013 }
2014
2015 //
2016 // Generate stub for conjoint int copy. If "aligned" is true, the
2017 // "from" and "to" addresses are assumed to be heapword aligned.
2018 //
2019 // Arguments for generated stub:
2020 // from: O0
2021 // to: O1
2022 // count: O2 treated as signed
2023 //
2024 address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
2025 address *entry, const char *name) {
2026 __ align(CodeEntryAlignment);
2027 StubCodeMark mark(this, "StubRoutines", name);
2028 address start = __ pc();
2029
2030 assert_clean_int(O2, O3); // Make sure 'count' is clean int.
2031
2032 if (entry != NULL) {
2033 *entry = __ pc();
2034 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2035 BLOCK_COMMENT("Entry:");
2036 }
2037
2038 array_overlap_test(nooverlap_target, 2);
2039 {
2040 // UnsafeCopyMemory page error: continue at UnsafeCopyMemory common_error_exit
2041 UnsafeCopyMemoryMark ucmm(this, !aligned, false);
2042 generate_conjoint_int_copy_core(aligned);
2043 }
2044 // O3, O4 are used as temp registers
2045 inc_counter_np(SharedRuntime::_jint_array_copy_ctr, O3, O4);
2046 __ retl();
2047 __ delayed()->mov(G0, O0); // return 0
2048 return start;
2049 }
2050
2051 //
2052 // Helper methods for generate_disjoint_long_copy_core()
2053 //
2054 void copy_64_bytes_loop(Register from, Register to, Register count, int count_dec,
2055 Label& L_loop, bool use_prefetch, bool use_bis) {
2056 __ align(OptoLoopAlignment);
2057 __ BIND(L_loop);
2058 for (int off = 0; off < 64; off += 16) {
2059 if (use_prefetch && (off & 31) == 0) {
2060 if (ArraycopySrcPrefetchDistance > 0) {
2061 __ prefetch(from, ArraycopySrcPrefetchDistance+off, Assembler::severalReads);
2062 }
2063 if (ArraycopyDstPrefetchDistance > 0) {
2064 __ prefetch(to, ArraycopyDstPrefetchDistance+off, Assembler::severalWritesAndPossiblyReads);
2065 }
2066 }
2067 __ ldx(from, off+0, O4);
2068 __ ldx(from, off+8, O5);
2069 if (use_bis) {
2070 __ stxa(O4, to, off+0);
2071 __ stxa(O5, to, off+8);
2072 } else {
2073 __ stx(O4, to, off+0);
2074 __ stx(O5, to, off+8);
2075 }
2076 }
2077 __ deccc(count, 8);
2078 __ inc(from, 64);
2079 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_loop);
2080 __ delayed()->inc(to, 64);
2081 }
2082
2083 //
2084 // Generate core code for disjoint long copy (and oop copy on 64-bit).
2085 // "aligned" is ignored, because we must make the stronger
2086 // assumption that both addresses are always 64-bit aligned.
2087 //
2088 // Arguments:
2089 // from: O0
2090 // to: O1
2091 // count: O2 treated as signed
2092 //
2093 // count -= 2;
2094 // if ( count >= 0 ) { // >= 2 elements
2095   //   if ( count >= 6 ) { // >= 8 elements
2096 // count -= 6; // original count - 8
2097 // do {
2098 // copy_8_elements;
2099 // count -= 8;
2100 // } while ( count >= 0 );
2101 // count += 6;
2102 // }
2103 // if ( count >= 0 ) { // >= 2 elements
2104 // do {
2105 // copy_2_elements;
2106 // } while ( (count=count-2) >= 0 );
2107 // }
2108 // }
2109 // count += 2;
2110 // if ( count != 0 ) { // 1 element left
2111 // copy_1_element;
2112 // }
2113 //
2114 void generate_disjoint_long_copy_core(bool aligned) {
2115 Label L_copy_8_bytes, L_copy_16_bytes, L_exit;
2116 const Register from = O0; // source array address
2117 const Register to = O1; // destination array address
2118 const Register count = O2; // elements count
2119 const Register offset0 = O4; // element offset
2120 const Register offset8 = O5; // next element offset
2121
2122 __ deccc(count, 2);
2123 __ mov(G0, offset0); // offset from start of arrays (0)
2124 __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
2125 __ delayed()->add(offset0, 8, offset8);
2126
2127 // Copy by 64 bytes chunks
2128
2129 const Register from64 = O3; // source address
2130 const Register to64 = G3; // destination address
2131 __ subcc(count, 6, O3);
2132 __ brx(Assembler::negative, false, Assembler::pt, L_copy_16_bytes );
2133 __ delayed()->mov(to, to64);
2134 // Now we can use O4(offset0), O5(offset8) as temps
2135 __ mov(O3, count);
2136 // count >= 0 (original count - 8)
2137 __ mov(from, from64);
2138
2139 disjoint_copy_core(from64, to64, count, 3, 64, &StubGenerator::copy_64_bytes_loop);
2140
2141 // Restore O4(offset0), O5(offset8)
2142 __ sub(from64, from, offset0);
2143 __ inccc(count, 6); // restore count
2144 __ brx(Assembler::negative, false, Assembler::pn, L_copy_8_bytes );
2145 __ delayed()->add(offset0, 8, offset8);
2146
2147 // Copy by 16 bytes chunks
2148 __ align(OptoLoopAlignment);
2149 __ BIND(L_copy_16_bytes);
2150 __ ldx(from, offset0, O3);
2151 __ ldx(from, offset8, G3);
2152 __ deccc(count, 2);
2153 __ stx(O3, to, offset0);
2154 __ inc(offset0, 16);
2155 __ stx(G3, to, offset8);
2156 __ brx(Assembler::greaterEqual, false, Assembler::pt, L_copy_16_bytes);
2157 __ delayed()->inc(offset8, 16);
2158
2159 // Copy last 8 bytes
2160 __ BIND(L_copy_8_bytes);
2161 __ inccc(count, 2);
2162 __ brx(Assembler::zero, true, Assembler::pn, L_exit );
2163 __ delayed()->mov(offset0, offset8); // Set O5 used by other stubs
2164 __ ldx(from, offset0, O3);
2165 __ stx(O3, to, offset0);
2166 __ BIND(L_exit);
2167 }
2168
2169 //
2170 // Generate stub for disjoint long copy.
2171 // "aligned" is ignored, because we must make the stronger
2172 // assumption that both addresses are always 64-bit aligned.
2173 //
2174 // Arguments for generated stub:
2175 // from: O0
2176 // to: O1
2177 // count: O2 treated as signed
2178 //
2179 address generate_disjoint_long_copy(bool aligned, address *entry, const char *name) {
2180 __ align(CodeEntryAlignment);
2181 StubCodeMark mark(this, "StubRoutines", name);
2182 address start = __ pc();
2183
2184 assert_clean_int(O2, O3); // Make sure 'count' is clean int.
2185
2186 if (entry != NULL) {
2187 *entry = __ pc();
2188 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2189 BLOCK_COMMENT("Entry:");
2190 }
2191
2192 {
2193 // UnsafeCopyMemory page error: continue at UnsafeCopyMemory common_error_exit
2194 UnsafeCopyMemoryMark ucmm(this, true, false);
2195 generate_disjoint_long_copy_core(aligned);
2196 }
2197 // O3, O4 are used as temp registers
2198 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr, O3, O4);
2199 __ retl();
2200 __ delayed()->mov(G0, O0); // return 0
2201 return start;
2202 }
2203
2204 //
2205 // Generate core code for conjoint long copy (and oop copy on 64-bit).
2206 // "aligned" is ignored, because we must make the stronger
2207 // assumption that both addresses are always 64-bit aligned.
2208 //
2209 // Arguments:
2210 // from: O0
2211 // to: O1
2212 // count: O2 treated as signed
2213 //
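  // Control-flow sketch, added for symmetry with the disjoint case above
  // (approximate, derived from the code below):
  //
  //   count -= 1;
  //   last = count * 8;                 // byte offset of the last element
  //   if ( count > 0 ) {                // >= 2 elements
  //     do {
  //       copy elements at byte offsets 'last' and 'last - 8';  // highest first
  //       last -= 16;
  //     } while ( last > 0 );
  //   }
  //   if ( last == 0 ) {                // exactly one element (at offset 0) left
  //     copy element at offset 0;
  //   }
  //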
2214 void generate_conjoint_long_copy_core(bool aligned) {
2215 // Do reverse copy.
2216 Label L_copy_8_bytes, L_copy_16_bytes, L_exit;
2217 const Register from = O0; // source array address
2218 const Register to = O1; // destination array address
2219 const Register count = O2; // elements count
2220 const Register offset8 = O4; // element offset
2221 const Register offset0 = O5; // previous element offset
2222
2223 __ subcc(count, 1, count);
2224 __ brx(Assembler::lessEqual, false, Assembler::pn, L_copy_8_bytes );
2225 __ delayed()->sllx(count, LogBytesPerLong, offset8);
2226 __ sub(offset8, 8, offset0);
2227 __ align(OptoLoopAlignment);
2228 __ BIND(L_copy_16_bytes);
2229 __ ldx(from, offset8, O2);
2230 __ ldx(from, offset0, O3);
2231 __ stx(O2, to, offset8);
2232 __ deccc(offset8, 16); // use offset8 as counter
2233 __ stx(O3, to, offset0);
2234 __ brx(Assembler::greater, false, Assembler::pt, L_copy_16_bytes);
2235 __ delayed()->dec(offset0, 16);
2236
2237 __ BIND(L_copy_8_bytes);
2238 __ brx(Assembler::negative, false, Assembler::pn, L_exit );
2239 __ delayed()->nop();
2240 __ ldx(from, 0, O3);
2241 __ stx(O3, to, 0);
2242 __ BIND(L_exit);
2243 }
2244
2245 // Generate stub for conjoint long copy.
2246 // "aligned" is ignored, because we must make the stronger
2247 // assumption that both addresses are always 64-bit aligned.
2248 //
2249 // Arguments for generated stub:
2250 // from: O0
2251 // to: O1
2252 // count: O2 treated as signed
2253 //
2254 address generate_conjoint_long_copy(bool aligned, address nooverlap_target,
2255 address *entry, const char *name) {
2256 __ align(CodeEntryAlignment);
2257 StubCodeMark mark(this, "StubRoutines", name);
2258 address start = __ pc();
2259
2260 assert(aligned, "Should always be aligned");
2261
2262 assert_clean_int(O2, O3); // Make sure 'count' is clean int.
2263
2264 if (entry != NULL) {
2265 *entry = __ pc();
2266 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
2267 BLOCK_COMMENT("Entry:");
2268 }
2269
2270 array_overlap_test(nooverlap_target, 3);
2271 {
2272 // UnsafeCopyMemory page error: continue at UnsafeCopyMemory common_error_exit
2273 UnsafeCopyMemoryMark ucmm(this, true, false);
2274 generate_conjoint_long_copy_core(aligned);
2275 }
2276 // O3, O4 are used as temp registers
2277 inc_counter_np(SharedRuntime::_jlong_array_copy_ctr, O3, O4);
2278 __ retl();
2279 __ delayed()->mov(G0, O0); // return 0
2280 return start;
2281 }
2282
2283 // Generate stub for disjoint oop copy. If "aligned" is true, the
2284 // "from" and "to" addresses are assumed to be heapword aligned.
2285 //
2286 // Arguments for generated stub:
2287 // from: O0
2288 // to: O1
2289 // count: O2 treated as signed
2290 //
2291 address generate_disjoint_oop_copy(bool aligned, address *entry, const char *name,
2292 bool dest_uninitialized = false) {
2293
2294 const Register from = O0; // source array address
2295 const Register to = O1; // destination array address
2296 const Register count = O2; // elements count
2297
2298 __ align(CodeEntryAlignment);
2299 StubCodeMark mark(this, "StubRoutines", name);
2300 address start = __ pc();
2301
2302 assert_clean_int(count, O3); // Make sure 'count' is clean int.
2303
2304 if (entry != NULL) {
2305 *entry = __ pc();
2306 // caller can pass a 64-bit byte count here
2307 BLOCK_COMMENT("Entry:");
2308 }
2309
2310 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
2311 if (dest_uninitialized) {
2312 decorators |= IS_DEST_UNINITIALIZED;
2313 }
2314 if (aligned) {
2315 decorators |= ARRAYCOPY_ALIGNED;
2316 }
2317
2318 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2319 bs->arraycopy_prologue(_masm, decorators, T_OBJECT, from, to, count);
2320
2321 assert_clean_int(count, O3); // Make sure 'count' is clean int.
2322 if (UseCompressedOops) {
2323 generate_disjoint_int_copy_core(aligned);
2324 } else {
2325 generate_disjoint_long_copy_core(aligned);
2326 }
2327
2328 bs->arraycopy_epilogue(_masm, decorators, T_OBJECT, from, to, count);
2329
2330 // O3, O4 are used as temp registers
2331 inc_counter_np(SharedRuntime::_oop_array_copy_ctr, O3, O4);
2332 __ retl();
2333 __ delayed()->mov(G0, O0); // return 0
2334 return start;
2335 }
2336
2337 // Generate stub for conjoint oop copy. If "aligned" is true, the
2338 // "from" and "to" addresses are assumed to be heapword aligned.
2339 //
2340 // Arguments for generated stub:
2341 // from: O0
2342 // to: O1
2343 // count: O2 treated as signed
2344 //
2345 address generate_conjoint_oop_copy(bool aligned, address nooverlap_target,
2346 address *entry, const char *name,
2347 bool dest_uninitialized = false) {
2348
2349 const Register from = O0; // source array address
2350 const Register to = O1; // destination array address
2351 const Register count = O2; // elements count
2352
2353 __ align(CodeEntryAlignment);
2354 StubCodeMark mark(this, "StubRoutines", name);
2355 address start = __ pc();
2356
2357 assert_clean_int(count, O3); // Make sure 'count' is clean int.
2358
2359 if (entry != NULL) {
2360 *entry = __ pc();
2361 // caller can pass a 64-bit byte count here
2362 BLOCK_COMMENT("Entry:");
2363 }
2364
2365 array_overlap_test(nooverlap_target, LogBytesPerHeapOop);
2366
2367 DecoratorSet decorators = IN_HEAP | IS_ARRAY;
2368 if (dest_uninitialized) {
2369 decorators |= IS_DEST_UNINITIALIZED;
2370 }
2371 if (aligned) {
2372 decorators |= ARRAYCOPY_ALIGNED;
2373 }
2374
2375 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2376 bs->arraycopy_prologue(_masm, decorators, T_OBJECT, from, to, count);
2377
2378 if (UseCompressedOops) {
2379 generate_conjoint_int_copy_core(aligned);
2380 } else {
2381 generate_conjoint_long_copy_core(aligned);
2382 }
2383
2384 bs->arraycopy_epilogue(_masm, decorators, T_OBJECT, from, to, count);
2385
2386 // O3, O4 are used as temp registers
2387 inc_counter_np(SharedRuntime::_oop_array_copy_ctr, O3, O4);
2388 __ retl();
2389 __ delayed()->mov(G0, O0); // return 0
2390 return start;
2391 }
2392
2393
2394 // Helper for generating a dynamic type check.
2395 // Smashes only the given temp registers.
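  // (Explanatory note, added for clarity: the fast path below loads the word at
  //  sub_klass + super_check_offset and compares it with super_klass; equality
  //  proves the subtype relation.  If super_check_offset designates the
  //  secondary-supers cache, the fast path is inconclusive and the slow path
  //  scans the secondary supers list under a saved frame.)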
2396 void generate_type_check(Register sub_klass,
2397 Register super_check_offset,
2398 Register super_klass,
2399 Register temp,
2400 Label& L_success) {
2401 assert_different_registers(sub_klass, super_check_offset, super_klass, temp);
2402
2403 BLOCK_COMMENT("type_check:");
2404
2405 Label L_miss, L_pop_to_miss;
2406
2407 assert_clean_int(super_check_offset, temp);
2408
2409 __ check_klass_subtype_fast_path(sub_klass, super_klass, temp, noreg,
2410 &L_success, &L_miss, NULL,
2411 super_check_offset);
2412
2413 BLOCK_COMMENT("type_check_slow_path:");
2414 __ save_frame(0);
2415 __ check_klass_subtype_slow_path(sub_klass->after_save(),
2416 super_klass->after_save(),
2417 L0, L1, L2, L4,
2418 NULL, &L_pop_to_miss);
2419 __ ba(L_success);
2420 __ delayed()->restore();
2421
2422 __ bind(L_pop_to_miss);
2423 __ restore();
2424
2425 // Fall through on failure!
2426 __ BIND(L_miss);
2427 }
2428
2429
2430 // Generate stub for checked oop copy.
2431 //
2432 // Arguments for generated stub:
2433 // from: O0
2434 // to: O1
2435 // count: O2 treated as signed
2436 // ckoff: O3 (super_check_offset)
2437 // ckval: O4 (super_klass)
2438 // ret: O0 zero for success; (-1^K) where K is partial transfer count
2439 //
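  // (Worked example, added for clarity: if the copy fails after K = 3 elements
  //  have been stored, the stub returns -1 ^ 3 == ~3 == -4; the caller can
  //  recover K as ~O0 and handle the untransferred tail.)
  //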
2440 address generate_checkcast_copy(const char *name, address *entry, bool dest_uninitialized = false) {
2441
2442 const Register O0_from = O0; // source array address
2443 const Register O1_to = O1; // destination array address
2444 const Register O2_count = O2; // elements count
2445 const Register O3_ckoff = O3; // super_check_offset
2446 const Register O4_ckval = O4; // super_klass
2447
2448 const Register O5_offset = O5; // loop var, with stride wordSize
2449 const Register G1_remain = G1; // loop var, with stride -1
2450 const Register G3_oop = G3; // actual oop copied
2451 const Register G4_klass = G4; // oop._klass
2452 const Register G5_super = G5; // oop._klass._primary_supers[ckval]
2453
2454 __ align(CodeEntryAlignment);
2455 StubCodeMark mark(this, "StubRoutines", name);
2456 address start = __ pc();
2457
2458 #ifdef ASSERT
2459 // We sometimes save a frame (see generate_type_check below).
2460 // If this will cause trouble, let's fail now instead of later.
2461 __ save_frame(0);
2462 __ restore();
2463 #endif
2464
2465 assert_clean_int(O2_count, G1); // Make sure 'count' is clean int.
2466
2467 #ifdef ASSERT
2468 // caller guarantees that the arrays really are different
2469 // otherwise, we would have to make conjoint checks
2470 { Label L;
2471 __ mov(O3, G1); // spill: overlap test smashes O3
2472 __ mov(O4, G4); // spill: overlap test smashes O4
2473 array_overlap_test(L, LogBytesPerHeapOop);
2474 __ stop("checkcast_copy within a single array");
2475 __ bind(L);
2476 __ mov(G1, O3);
2477 __ mov(G4, O4);
2478 }
2479 #endif //ASSERT
2480
2481 if (entry != NULL) {
2482 *entry = __ pc();
2483 // caller can pass a 64-bit byte count here (from generic stub)
2484 BLOCK_COMMENT("Entry:");
2485 }
2486
2487 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST;
2488 if (dest_uninitialized) {
2489 decorators |= IS_DEST_UNINITIALIZED;
2490 }
2491
2492 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
2493 bs->arraycopy_prologue(_masm, decorators, T_OBJECT, O0_from, O1_to, O2_count);
2494
2495 Label load_element, store_element, do_epilogue, fail, done;
2496 __ addcc(O2_count, 0, G1_remain); // initialize loop index, and test it
2497 __ brx(Assembler::notZero, false, Assembler::pt, load_element);
2498 __ delayed()->mov(G0, O5_offset); // offset from start of arrays
2499
2500 // Empty array: Nothing to do.
2501 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, O3, O4);
2502 __ retl();
2503 __ delayed()->set(0, O0); // return 0 on (trivial) success
2504
2505 // ======== begin loop ========
2506 // (Loop is rotated; its entry is load_element.)
2507 // Loop variables:
2508 // (O5 = 0; ; O5 += wordSize) --- offset from src, dest arrays
2509 // (O2 = len; O2 != 0; O2--) --- number of oops *remaining*
2510 // G3, G4, G5 --- current oop, oop.klass, oop.klass.super
2511 __ align(OptoLoopAlignment);
2512
2513 __ BIND(store_element);
2514 __ deccc(G1_remain); // decrement the count
2515 __ store_heap_oop(G3_oop, O1_to, O5_offset, noreg, AS_RAW); // store the oop
2516 __ inc(O5_offset, heapOopSize); // step to next offset
2517 __ brx(Assembler::zero, true, Assembler::pt, do_epilogue);
2518     __ delayed()->set(0, O0); // return 0 on success
2519
2520 // ======== loop entry is here ========
2521 __ BIND(load_element);
2522 __ load_heap_oop(O0_from, O5_offset, G3_oop, noreg, AS_RAW); // load the oop
2523 __ br_null_short(G3_oop, Assembler::pt, store_element);
2524
2525 __ load_klass(G3_oop, G4_klass); // query the object klass
2526
2527 generate_type_check(G4_klass, O3_ckoff, O4_ckval, G5_super,
2528 // branch to this on success:
2529 store_element);
2530 // ======== end loop ========
2531
2532 // It was a real error; we must depend on the caller to finish the job.
2533 // Register G1 has number of *remaining* oops, O2 number of *total* oops.
2534 // Emit GC store barriers for the oops we have copied (O2 minus G1),
2535 // and report their number to the caller.
2536 __ BIND(fail);
2537 __ subcc(O2_count, G1_remain, O2_count);
2538 __ brx(Assembler::zero, false, Assembler::pt, done);
2539 __ delayed()->not1(O2_count, O0); // report (-1^K) to caller
2540
2541 __ BIND(do_epilogue);
2542 bs->arraycopy_epilogue(_masm, decorators, T_OBJECT, O0_from, O1_to, O2_count);
2543
2544 __ BIND(done);
2545 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr, O3, O4);
2546 __ retl();
2547     __ delayed()->nop(); // return value in O0
2548
2549 return start;
2550 }
2551
2552
2553 // Generate 'unsafe' array copy stub
2554 // Though just as safe as the other stubs, it takes an unscaled
2555 // size_t argument instead of an element count.
2556 //
2557 // Arguments for generated stub:
2558 // from: O0
2559 // to: O1
2560 // count: O2 byte count, treated as ssize_t, can be zero
2561 //
2562 // Examines the alignment of the operands and dispatches
2563 // to a long, int, short, or byte copy loop.
2564 //
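  // (Dispatch sketch, added for clarity; the generated code below is roughly:
  //     bits = from | to | byte_count;
  //     if ((bits & 7) == 0)  goto long_copy,  count = byte_count >> 3;
  //     if ((bits & 3) == 0)  goto int_copy,   count = byte_count >> 2;
  //     if ((bits & 1) == 0)  goto short_copy, count = byte_count >> 1;
  //     goto byte_copy,       count = byte_count;                        )
  //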
2565 address generate_unsafe_copy(const char* name,
2566 address byte_copy_entry,
2567 address short_copy_entry,
2568 address int_copy_entry,
2569 address long_copy_entry) {
2570
2571 const Register O0_from = O0; // source array address
2572 const Register O1_to = O1; // destination array address
2573 const Register O2_count = O2; // elements count
2574
2575 const Register G1_bits = G1; // test copy of low bits
2576
2577 __ align(CodeEntryAlignment);
2578 StubCodeMark mark(this, "StubRoutines", name);
2579 address start = __ pc();
2580
2581 // bump this on entry, not on exit:
2582 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr, G1, G3);
2583
2584 __ or3(O0_from, O1_to, G1_bits);
2585 __ or3(O2_count, G1_bits, G1_bits);
2586
2587 __ btst(BytesPerLong-1, G1_bits);
2588 __ br(Assembler::zero, true, Assembler::pt,
2589 long_copy_entry, relocInfo::runtime_call_type);
2590 // scale the count on the way out:
2591 __ delayed()->srax(O2_count, LogBytesPerLong, O2_count);
2592
2593 __ btst(BytesPerInt-1, G1_bits);
2594 __ br(Assembler::zero, true, Assembler::pt,
2595 int_copy_entry, relocInfo::runtime_call_type);
2596 // scale the count on the way out:
2597 __ delayed()->srax(O2_count, LogBytesPerInt, O2_count);
2598
2599 __ btst(BytesPerShort-1, G1_bits);
2600 __ br(Assembler::zero, true, Assembler::pt,
2601 short_copy_entry, relocInfo::runtime_call_type);
2602 // scale the count on the way out:
2603 __ delayed()->srax(O2_count, LogBytesPerShort, O2_count);
2604
2605 __ br(Assembler::always, false, Assembler::pt,
2606 byte_copy_entry, relocInfo::runtime_call_type);
2607 __ delayed()->nop();
2608
2609 return start;
2610 }
2611
2612
2613 // Perform range checks on the proposed arraycopy.
2614 // Kills the two temps, but nothing else.
2615 // Also, clean the sign bits of src_pos and dst_pos.
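  //
  // (Equivalent checks in Java-like pseudocode, added for clarity:
  //     if (src_pos + length > src.length)  goto L_failed;
  //     if (dst_pos + length > dst.length)  goto L_failed;
  //  followed by sign-extending src_pos and dst_pos to 64 bits.)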
2616 void arraycopy_range_checks(Register src, // source array oop (O0)
2617 Register src_pos, // source position (O1)
2618                               Register dst,     // destination array oop (O2)
2619 Register dst_pos, // destination position (O3)
2620 Register length, // length of copy (O4)
2621 Register temp1, Register temp2,
2622 Label& L_failed) {
2623 BLOCK_COMMENT("arraycopy_range_checks:");
2624
2625 // if (src_pos + length > arrayOop(src)->length() ) FAIL;
2626
2627 const Register array_length = temp1; // scratch
2628 const Register end_pos = temp2; // scratch
2629
2630 // Note: This next instruction may be in the delay slot of a branch:
2631 __ add(length, src_pos, end_pos); // src_pos + length
2632 __ lduw(src, arrayOopDesc::length_offset_in_bytes(), array_length);
2633 __ cmp(end_pos, array_length);
2634 __ br(Assembler::greater, false, Assembler::pn, L_failed);
2635
2636 // if (dst_pos + length > arrayOop(dst)->length() ) FAIL;
2637 __ delayed()->add(length, dst_pos, end_pos); // dst_pos + length
2638 __ lduw(dst, arrayOopDesc::length_offset_in_bytes(), array_length);
2639 __ cmp(end_pos, array_length);
2640 __ br(Assembler::greater, false, Assembler::pn, L_failed);
2641
2642 // Have to clean up high 32-bits of 'src_pos' and 'dst_pos'.
2643     // A move with sign extension can be used since they are positive.
2644 __ delayed()->signx(src_pos, src_pos);
2645 __ signx(dst_pos, dst_pos);
2646
2647 BLOCK_COMMENT("arraycopy_range_checks done");
2648 }
2649
2650
2651 //
2652 // Generate generic array copy stubs
2653 //
2654 // Input:
2655 // O0 - src oop
2656 // O1 - src_pos
2657 // O2 - dst oop
2658 // O3 - dst_pos
2659 // O4 - element count
2660 //
2661 // Output:
2662 // O0 == 0 - success
2663 // O0 == -1 - need to call System.arraycopy
2664 //
2665 address generate_generic_copy(const char *name,
2666 address entry_jbyte_arraycopy,
2667 address entry_jshort_arraycopy,
2668 address entry_jint_arraycopy,
2669 address entry_oop_arraycopy,
2670 address entry_jlong_arraycopy,
2671 address entry_checkcast_arraycopy) {
2672 Label L_failed, L_objArray;
2673
2674 // Input registers
2675 const Register src = O0; // source array oop
2676 const Register src_pos = O1; // source position
2677 const Register dst = O2; // destination array oop
2678 const Register dst_pos = O3; // destination position
2679 const Register length = O4; // elements count
2680
2681 // registers used as temp
2682 const Register G3_src_klass = G3; // source array klass
2683 const Register G4_dst_klass = G4; // destination array klass
2684 const Register G5_lh = G5; // layout handler
2685 const Register O5_temp = O5;
2686
2687 __ align(CodeEntryAlignment);
2688 StubCodeMark mark(this, "StubRoutines", name);
2689 address start = __ pc();
2690
2691 // bump this on entry, not on exit:
2692 inc_counter_np(SharedRuntime::_generic_array_copy_ctr, G1, G3);
2693
2694 // In principle, the int arguments could be dirty.
2695 //assert_clean_int(src_pos, G1);
2696 //assert_clean_int(dst_pos, G1);
2697 //assert_clean_int(length, G1);
2698
2699 //-----------------------------------------------------------------------
2700 // Assembler stubs will be used for this call to arraycopy
2701 // if the following conditions are met:
2702 //
2703 // (1) src and dst must not be null.
2704 // (2) src_pos must not be negative.
2705 // (3) dst_pos must not be negative.
2706 // (4) length must not be negative.
2707 // (5) src klass and dst klass should be the same and not NULL.
2708 // (6) src and dst should be arrays.
2709 // (7) src_pos + length must not exceed length of src.
2710 // (8) dst_pos + length must not exceed length of dst.
2711 BLOCK_COMMENT("arraycopy initial argument checks");
2712
2713 // if (src == NULL) return -1;
2714 __ br_null(src, false, Assembler::pn, L_failed);
2715
2716 // if (src_pos < 0) return -1;
2717 __ delayed()->tst(src_pos);
2718 __ br(Assembler::negative, false, Assembler::pn, L_failed);
2719 __ delayed()->nop();
2720
2721 // if (dst == NULL) return -1;
2722 __ br_null(dst, false, Assembler::pn, L_failed);
2723
2724 // if (dst_pos < 0) return -1;
2725 __ delayed()->tst(dst_pos);
2726 __ br(Assembler::negative, false, Assembler::pn, L_failed);
2727
2728 // if (length < 0) return -1;
2729 __ delayed()->tst(length);
2730 __ br(Assembler::negative, false, Assembler::pn, L_failed);
2731
2732 BLOCK_COMMENT("arraycopy argument klass checks");
2733 // get src->klass()
2734 if (UseCompressedClassPointers) {
2735 __ delayed()->nop(); // ??? not good
2736 __ load_klass(src, G3_src_klass);
2737 } else {
2738 __ delayed()->ld_ptr(src, oopDesc::klass_offset_in_bytes(), G3_src_klass);
2739 }
2740
2741 #ifdef ASSERT
2742 // assert(src->klass() != NULL);
2743 BLOCK_COMMENT("assert klasses not null");
2744 { Label L_a, L_b;
2745 __ br_notnull_short(G3_src_klass, Assembler::pt, L_b); // it is broken if klass is NULL
2746 __ bind(L_a);
2747 __ stop("broken null klass");
2748 __ bind(L_b);
2749 __ load_klass(dst, G4_dst_klass);
2750 __ br_null(G4_dst_klass, false, Assembler::pn, L_a); // this would be broken also
2751 __ delayed()->mov(G0, G4_dst_klass); // scribble the temp
2752 BLOCK_COMMENT("assert done");
2753 }
2754 #endif
2755
2756 // Load layout helper
2757 //
2758 // |array_tag| | header_size | element_type | |log2_element_size|
2759 // 32 30 24 16 8 2 0
2760 //
2761 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2762 //
2763
2764 int lh_offset = in_bytes(Klass::layout_helper_offset());
2765
2766     // Load the 32-bit signed value. Use the br() instruction with it to check icc.
2767 __ lduw(G3_src_klass, lh_offset, G5_lh);
2768
2769 if (UseCompressedClassPointers) {
2770 __ load_klass(dst, G4_dst_klass);
2771 }
2772 // Handle objArrays completely differently...
2773 juint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2774 __ set(objArray_lh, O5_temp);
2775 __ cmp(G5_lh, O5_temp);
2776 __ br(Assembler::equal, false, Assembler::pt, L_objArray);
2777 if (UseCompressedClassPointers) {
2778 __ delayed()->nop();
2779 } else {
2780 __ delayed()->ld_ptr(dst, oopDesc::klass_offset_in_bytes(), G4_dst_klass);
2781 }
2782
2783 // if (src->klass() != dst->klass()) return -1;
2784 __ cmp_and_brx_short(G3_src_klass, G4_dst_klass, Assembler::notEqual, Assembler::pn, L_failed);
2785
2786 // if (!src->is_Array()) return -1;
2787 __ cmp(G5_lh, Klass::_lh_neutral_value); // < 0
2788 __ br(Assembler::greaterEqual, false, Assembler::pn, L_failed);
2789
2790 // At this point, it is known to be a typeArray (array_tag 0x3).
2791 #ifdef ASSERT
2792 __ delayed()->nop();
2793 { Label L;
2794 jint lh_prim_tag_in_place = (Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
2795 __ set(lh_prim_tag_in_place, O5_temp);
2796 __ cmp(G5_lh, O5_temp);
2797 __ br(Assembler::greaterEqual, false, Assembler::pt, L);
2798 __ delayed()->nop();
2799 __ stop("must be a primitive array");
2800 __ bind(L);
2801 }
2802 #else
2803 __ delayed(); // match next insn to prev branch
2804 #endif
2805
2806 arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
2807 O5_temp, G4_dst_klass, L_failed);
2808
2809 // TypeArrayKlass
2810 //
2811 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
2812 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
2813 //
2814
2815 const Register G4_offset = G4_dst_klass; // array offset
2816 const Register G3_elsize = G3_src_klass; // log2 element size
2817
2818 __ srl(G5_lh, Klass::_lh_header_size_shift, G4_offset);
2819 __ and3(G4_offset, Klass::_lh_header_size_mask, G4_offset); // array_offset
2820 __ add(src, G4_offset, src); // src array offset
2821 __ add(dst, G4_offset, dst); // dst array offset
2822 __ and3(G5_lh, Klass::_lh_log2_element_size_mask, G3_elsize); // log2 element size
2823
2824 // next registers should be set before the jump to corresponding stub
2825 const Register from = O0; // source array address
2826 const Register to = O1; // destination array address
2827 const Register count = O2; // elements count
2828
2829 // 'from', 'to', 'count' registers should be set in this order
2830 // since they are the same as 'src', 'src_pos', 'dst'.
2831
2832 BLOCK_COMMENT("scale indexes to element size");
2833 __ sll_ptr(src_pos, G3_elsize, src_pos);
2834 __ sll_ptr(dst_pos, G3_elsize, dst_pos);
2835 __ add(src, src_pos, from); // src_addr
2836 __ add(dst, dst_pos, to); // dst_addr
2837
2838 BLOCK_COMMENT("choose copy loop based on element size");
2839 __ cmp(G3_elsize, 0);
2840 __ br(Assembler::equal, true, Assembler::pt, entry_jbyte_arraycopy);
2841 __ delayed()->signx(length, count); // length
2842
2843 __ cmp(G3_elsize, LogBytesPerShort);
2844 __ br(Assembler::equal, true, Assembler::pt, entry_jshort_arraycopy);
2845 __ delayed()->signx(length, count); // length
2846
2847 __ cmp(G3_elsize, LogBytesPerInt);
2848 __ br(Assembler::equal, true, Assembler::pt, entry_jint_arraycopy);
2849 __ delayed()->signx(length, count); // length
2850 #ifdef ASSERT
2851 { Label L;
2852 __ cmp_and_br_short(G3_elsize, LogBytesPerLong, Assembler::equal, Assembler::pt, L);
2853 __ stop("must be long copy, but elsize is wrong");
2854 __ bind(L);
2855 }
2856 #endif
2857 __ br(Assembler::always, false, Assembler::pt, entry_jlong_arraycopy);
2858 __ delayed()->signx(length, count); // length
2859
2860 // ObjArrayKlass
2861 __ BIND(L_objArray);
2862 // live at this point: G3_src_klass, G4_dst_klass, src[_pos], dst[_pos], length
2863
2864 Label L_plain_copy, L_checkcast_copy;
2865 // test array classes for subtyping
2866 __ cmp(G3_src_klass, G4_dst_klass); // usual case is exact equality
2867 __ brx(Assembler::notEqual, true, Assembler::pn, L_checkcast_copy);
2868 __ delayed()->lduw(G4_dst_klass, lh_offset, O5_temp); // hoisted from below
2869
2870 // Identically typed arrays can be copied without element-wise checks.
2871 arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
2872 O5_temp, G5_lh, L_failed);
2873
2874 __ add(src, arrayOopDesc::base_offset_in_bytes(T_OBJECT), src); //src offset
2875 __ add(dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT), dst); //dst offset
2876 __ sll_ptr(src_pos, LogBytesPerHeapOop, src_pos);
2877 __ sll_ptr(dst_pos, LogBytesPerHeapOop, dst_pos);
2878 __ add(src, src_pos, from); // src_addr
2879 __ add(dst, dst_pos, to); // dst_addr
2880 __ BIND(L_plain_copy);
2881 __ br(Assembler::always, false, Assembler::pt, entry_oop_arraycopy);
2882 __ delayed()->signx(length, count); // length
2883
2884 __ BIND(L_checkcast_copy);
2885 // live at this point: G3_src_klass, G4_dst_klass
2886 {
2887 // Before looking at dst.length, make sure dst is also an objArray.
2888 // lduw(G4_dst_klass, lh_offset, O5_temp); // hoisted to delay slot
2889 __ cmp(G5_lh, O5_temp);
2890 __ br(Assembler::notEqual, false, Assembler::pn, L_failed);
2891
2892 // It is safe to examine both src.length and dst.length.
2893 __ delayed(); // match next insn to prev branch
2894 arraycopy_range_checks(src, src_pos, dst, dst_pos, length,
2895 O5_temp, G5_lh, L_failed);
2896
2897 // Marshal the base address arguments now, freeing registers.
2898 __ add(src, arrayOopDesc::base_offset_in_bytes(T_OBJECT), src); //src offset
2899 __ add(dst, arrayOopDesc::base_offset_in_bytes(T_OBJECT), dst); //dst offset
2900 __ sll_ptr(src_pos, LogBytesPerHeapOop, src_pos);
2901 __ sll_ptr(dst_pos, LogBytesPerHeapOop, dst_pos);
2902 __ add(src, src_pos, from); // src_addr
2903 __ add(dst, dst_pos, to); // dst_addr
2904 __ signx(length, count); // length (reloaded)
2905
2906 Register sco_temp = O3; // this register is free now
2907 assert_different_registers(from, to, count, sco_temp,
2908 G4_dst_klass, G3_src_klass);
2909
2910 // Generate the type check.
2911 int sco_offset = in_bytes(Klass::super_check_offset_offset());
2912 __ lduw(G4_dst_klass, sco_offset, sco_temp);
2913 generate_type_check(G3_src_klass, sco_temp, G4_dst_klass,
2914 O5_temp, L_plain_copy);
2915
2916 // Fetch destination element klass from the ObjArrayKlass header.
2917 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2918
2919 // the checkcast_copy loop needs two extra arguments:
2920 __ ld_ptr(G4_dst_klass, ek_offset, O4); // dest elem klass
2921 // lduw(O4, sco_offset, O3); // sco of elem klass
2922
2923 __ br(Assembler::always, false, Assembler::pt, entry_checkcast_arraycopy);
2924 __ delayed()->lduw(O4, sco_offset, O3);
2925 }
2926
2927 __ BIND(L_failed);
2928 __ retl();
2929 __ delayed()->sub(G0, 1, O0); // return -1
2930 return start;
2931 }
2932
2933 //
2934 // Generate stub for heap zeroing.
2935 // "to" address is aligned to jlong (8 bytes).
2936 //
2937 // Arguments for generated stub:
2938 // to: O0
2939 // count: O1 treated as signed (count of HeapWord)
2940 // count could be 0
2941 //
2942 address generate_zero_aligned_words(const char* name) {
2943 __ align(CodeEntryAlignment);
2944 StubCodeMark mark(this, "StubRoutines", name);
2945 address start = __ pc();
2946
2947     const Register to    = O0;   // address of the heap words to zero
2948 const Register count = O1; // HeapWords count
2949 const Register temp = O2; // scratch
2950
2951 Label Ldone;
2952 __ sllx(count, LogHeapWordSize, count); // to bytes count
2953 // Use BIS for zeroing
2954 __ bis_zeroing(to, count, temp, Ldone);
2955 __ bind(Ldone);
2956 __ retl();
2957 __ delayed()->nop();
2958 return start;
2959 }
2960
2961 void generate_arraycopy_stubs() {
2962 address entry;
2963 address entry_jbyte_arraycopy;
2964 address entry_jshort_arraycopy;
2965 address entry_jint_arraycopy;
2966 address entry_oop_arraycopy;
2967 address entry_jlong_arraycopy;
2968 address entry_checkcast_arraycopy;
2969
2970 address ucm_common_error_exit = generate_unsafecopy_common_error_exit();
2971 UnsafeCopyMemory::set_common_exit_stub_pc(ucm_common_error_exit);
2972
2973 //*** jbyte
2974 // Always need aligned and unaligned versions
2975 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry,
2976 "jbyte_disjoint_arraycopy");
2977 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry,
2978 &entry_jbyte_arraycopy,
2979 "jbyte_arraycopy");
2980 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2981 "arrayof_jbyte_disjoint_arraycopy");
2982 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL,
2983 "arrayof_jbyte_arraycopy");
2984
2985 //*** jshort
2986 // Always need aligned and unaligned versions
2987 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry,
2988 "jshort_disjoint_arraycopy");
2989 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry,
2990 &entry_jshort_arraycopy,
2991 "jshort_arraycopy");
2992 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2993 "arrayof_jshort_disjoint_arraycopy");
2994 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL,
2995 "arrayof_jshort_arraycopy");
2996
2997 //*** jint
2998 // Aligned versions
2999 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
3000 "arrayof_jint_disjoint_arraycopy");
3001 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
3002 "arrayof_jint_arraycopy");
3003 // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
3004 // entry_jint_arraycopy always points to the unaligned version (notice that we overwrite it).
3005 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry,
3006 "jint_disjoint_arraycopy");
3007 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry,
3008 &entry_jint_arraycopy,
3009 "jint_arraycopy");
3010
3011 //*** jlong
3012 // It is always aligned
3013 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
3014 "arrayof_jlong_disjoint_arraycopy");
3015 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
3016 "arrayof_jlong_arraycopy");
3017 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
3018 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy;
3019
3020
3021 //*** oops
3022 // Aligned versions
3023 StubRoutines::_arrayof_oop_disjoint_arraycopy = generate_disjoint_oop_copy(true, &entry,
3024 "arrayof_oop_disjoint_arraycopy");
3025 StubRoutines::_arrayof_oop_arraycopy = generate_conjoint_oop_copy(true, entry, &entry_oop_arraycopy,
3026 "arrayof_oop_arraycopy");
3027 // Aligned versions without pre-barriers
3028 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(true, &entry,
3029 "arrayof_oop_disjoint_arraycopy_uninit",
3030 /*dest_uninitialized*/true);
3031 StubRoutines::_arrayof_oop_arraycopy_uninit = generate_conjoint_oop_copy(true, entry, NULL,
3032 "arrayof_oop_arraycopy_uninit",
3033 /*dest_uninitialized*/true);
3034 if (UseCompressedOops) {
3035       // With compressed oops we need unaligned versions; note that we overwrite entry_oop_arraycopy.
3036 StubRoutines::_oop_disjoint_arraycopy = generate_disjoint_oop_copy(false, &entry,
3037 "oop_disjoint_arraycopy");
3038 StubRoutines::_oop_arraycopy = generate_conjoint_oop_copy(false, entry, &entry_oop_arraycopy,
3039 "oop_arraycopy");
3040 // Unaligned versions without pre-barriers
3041 StubRoutines::_oop_disjoint_arraycopy_uninit = generate_disjoint_oop_copy(false, &entry,
3042 "oop_disjoint_arraycopy_uninit",
3043 /*dest_uninitialized*/true);
3044 StubRoutines::_oop_arraycopy_uninit = generate_conjoint_oop_copy(false, entry, NULL,
3045 "oop_arraycopy_uninit",
3046 /*dest_uninitialized*/true);
3047 } else {
3048 // oop arraycopy is always aligned on 32bit and 64bit without compressed oops
3049 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy;
3050 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy;
3051 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
3052 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit;
3053 }
3054
3055 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
3056 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
3057 /*dest_uninitialized*/true);
3058
3059 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy",
3060 entry_jbyte_arraycopy,
3061 entry_jshort_arraycopy,
3062 entry_jint_arraycopy,
3063 entry_jlong_arraycopy);
3064 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy",
3065 entry_jbyte_arraycopy,
3066 entry_jshort_arraycopy,
3067 entry_jint_arraycopy,
3068 entry_oop_arraycopy,
3069 entry_jlong_arraycopy,
3070 entry_checkcast_arraycopy);
3071
3072 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
3073 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
3074 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
3075 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
3076 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
3077 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
3078
3079 if (UseBlockZeroing) {
3080 StubRoutines::_zero_aligned_words = generate_zero_aligned_words("zero_aligned_words");
3081 }
3082 }
3083
3084 address generate_aescrypt_encryptBlock() {
3085     // required since we read the expanded key 'int' array starting at its first element without alignment considerations
3086 assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
3087 "the following code assumes that first element of an int array is aligned to 8 bytes");
3088 __ align(CodeEntryAlignment);
3089 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
3090 Label L_load_misaligned_input, L_load_expanded_key, L_doLast128bit, L_storeOutput, L_store_misaligned_output;
3091 address start = __ pc();
3092 Register from = O0; // source byte array
3093 Register to = O1; // destination byte array
3094 Register key = O2; // expanded key array
3095 const Register keylen = O4; //reg for storing expanded key array length
3096
3097 // read expanded key length
3098 __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
3099
3100 // Method to address arbitrary alignment for load instructions:
3101     // Check the last 3 bits of the 'from' address to see if it is aligned to an 8-byte boundary.
3102     // If zero (aligned), continue with double FP load instructions.
3103     // If non-zero (misaligned), alignaddr sets GSR.align to the number of bytes to skip during faligndata
3104     // and also rounds the arbitrarily aligned 'from' address down to the nearest 8-byte aligned address.
3105     // Load 3 * 8-byte components (to read the 16 input bytes) into 3 FP regs starting at this aligned address;
3106     // faligndata then extracts (based on the GSR.align value) the appropriate 8 bytes from each pair of source regs.
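    //
    // (Worked example, added for clarity: if 'from' == base + 3 with 'base'
    //  8-byte aligned, alignaddr sets GSR.align = 3 and rounds 'from' down to
    //  'base'; the three 8-byte loads then cover base..base+23, and the two
    //  faligndata calls extract the 16 input bytes that start at base + 3.)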
3107
3108 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
3109 __ andcc(from, 7, G0);
3110 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input);
3111 __ delayed()->alignaddr(from, G0, from);
3112
3113 // aligned case: load input into F54-F56
3114 __ ldf(FloatRegisterImpl::D, from, 0, F54);
3115 __ ldf(FloatRegisterImpl::D, from, 8, F56);
3116 __ ba_short(L_load_expanded_key);
3117
3118 __ BIND(L_load_misaligned_input);
3119 __ ldf(FloatRegisterImpl::D, from, 0, F54);
3120 __ ldf(FloatRegisterImpl::D, from, 8, F56);
3121 __ ldf(FloatRegisterImpl::D, from, 16, F58);
3122 __ faligndata(F54, F56, F54);
3123 __ faligndata(F56, F58, F56);
3124
3125 __ BIND(L_load_expanded_key);
3126     // Since we load the expanded key buffer starting at its first element, 8-byte alignment is guaranteed
3127 for ( int i = 0; i <= 38; i += 2 ) {
3128 __ ldf(FloatRegisterImpl::D, key, i*4, as_FloatRegister(i));
3129 }
3130
3131 // perform cipher transformation
3132 __ fxor(FloatRegisterImpl::D, F0, F54, F54);
3133 __ fxor(FloatRegisterImpl::D, F2, F56, F56);
3134 // rounds 1 through 8
3135 for ( int i = 4; i <= 28; i += 8 ) {
3136 __ aes_eround01(as_FloatRegister(i), F54, F56, F58);
3137 __ aes_eround23(as_FloatRegister(i+2), F54, F56, F60);
3138 __ aes_eround01(as_FloatRegister(i+4), F58, F60, F54);
3139 __ aes_eround23(as_FloatRegister(i+6), F58, F60, F56);
3140 }
3141 __ aes_eround01(F36, F54, F56, F58); //round 9
3142 __ aes_eround23(F38, F54, F56, F60);
3143
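    // (Explanatory note, added for clarity: the expanded key length read above
    //  is in ints, 4 * (rounds + 1), i.e. 44 / 52 / 60 ints for 128- / 192- /
    //  256-bit keys, which is why the code compares against 44 and 52 below.)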
3144 // 128-bit original key size
3145 __ cmp_and_brx_short(keylen, 44, Assembler::equal, Assembler::pt, L_doLast128bit);
3146
3147 for ( int i = 40; i <= 50; i += 2 ) {
3148 __ ldf(FloatRegisterImpl::D, key, i*4, as_FloatRegister(i) );
3149 }
3150 __ aes_eround01(F40, F58, F60, F54); //round 10
3151 __ aes_eround23(F42, F58, F60, F56);
3152 __ aes_eround01(F44, F54, F56, F58); //round 11
3153 __ aes_eround23(F46, F54, F56, F60);
3154
3155 // 192-bit original key size
3156 __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pt, L_storeOutput);
3157
3158 __ ldf(FloatRegisterImpl::D, key, 208, F52);
3159 __ aes_eround01(F48, F58, F60, F54); //round 12
3160 __ aes_eround23(F50, F58, F60, F56);
3161 __ ldf(FloatRegisterImpl::D, key, 216, F46);
3162 __ ldf(FloatRegisterImpl::D, key, 224, F48);
3163 __ ldf(FloatRegisterImpl::D, key, 232, F50);
3164 __ aes_eround01(F52, F54, F56, F58); //round 13
3165 __ aes_eround23(F46, F54, F56, F60);
3166 __ ba_short(L_storeOutput);
3167
3168 __ BIND(L_doLast128bit);
3169 __ ldf(FloatRegisterImpl::D, key, 160, F48);
3170 __ ldf(FloatRegisterImpl::D, key, 168, F50);
3171
3172 __ BIND(L_storeOutput);
3173 // perform last round of encryption common for all key sizes
3174 __ aes_eround01_l(F48, F58, F60, F54); //last round
3175 __ aes_eround23_l(F50, F58, F60, F56);
3176
3177 // Method to address arbitrary alignment for store instructions:
3178     // Check the last 3 bits of the 'dest' address to see if it is aligned to an 8-byte boundary.
3179     // If zero (aligned), continue with double FP store instructions.
3180     // If non-zero (misaligned), edge8n generates an edge mask in the result reg (O3 in the code below).
3181     // Example: if the dest address is 0x07 and the nearest 8-byte aligned address is 0x00, the edge mask is 00000001.
3182     // Compute (8-n), where n is the number of bytes the partial store (stpartialf) skips according to the edge mask; n = 7 in this example.
3183     // We get the value of n from the andcc that checks the 'dest' alignment; n is available in O5 in the code below.
3184     // Set GSR.align to (8-n) using alignaddr.
3185     // Circularly byte-shift the store values by n places so that the original bytes are in the correct positions for stpartialf.
3186     // Round the arbitrarily aligned 'dest' address down to the nearest 8-byte aligned address.
3187     // Partially store the original first (8-n) bytes starting at the original 'dest' address.
3188     // Negate the edge mask so that the subsequent stpartialf stores the original (8-n+1)th through 8th bytes at the appropriate address.
3189     // This process is executed for both of the 8-byte result values.
3190
3191 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
3192 __ andcc(to, 7, O5);
3193 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output);
3194 __ delayed()->edge8n(to, G0, O3);
3195
3196 // aligned case: store output into the destination array
3197 __ stf(FloatRegisterImpl::D, F54, to, 0);
3198 __ retl();
3199 __ delayed()->stf(FloatRegisterImpl::D, F56, to, 8);
3200
3201 __ BIND(L_store_misaligned_output);
3202 __ add(to, 8, O4);
3203 __ mov(8, O2);
3204 __ sub(O2, O5, O2);
3205 __ alignaddr(O2, G0, O2);
3206 __ faligndata(F54, F54, F54);
3207 __ faligndata(F56, F56, F56);
3208 __ and3(to, -8, to);
3209 __ and3(O4, -8, O4);
3210 __ stpartialf(to, O3, F54, Assembler::ASI_PST8_PRIMARY);
3211 __ stpartialf(O4, O3, F56, Assembler::ASI_PST8_PRIMARY);
3212 __ add(to, 8, to);
3213 __ add(O4, 8, O4);
3214 __ orn(G0, O3, O3);
3215 __ stpartialf(to, O3, F54, Assembler::ASI_PST8_PRIMARY);
3216 __ retl();
3217 __ delayed()->stpartialf(O4, O3, F56, Assembler::ASI_PST8_PRIMARY);
3218
3219 return start;
3220 }
3221
3222 address generate_aescrypt_decryptBlock() {
3223 assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
3224 "the following code assumes that first element of an int array is aligned to 8 bytes");
3225 // required since we read original key 'byte' array as well in the decryption stubs
3226 assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0,
3227 "the following code assumes that first element of a byte array is aligned to 8 bytes");
3228 __ align(CodeEntryAlignment);
3229 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
3230 address start = __ pc();
3231 Label L_load_misaligned_input, L_load_original_key, L_expand192bit, L_expand256bit, L_reload_misaligned_input;
3232 Label L_256bit_transform, L_common_transform, L_store_misaligned_output;
3233 Register from = O0; // source byte array
3234 Register to = O1; // destination byte array
3235 Register key = O2; // expanded key array
3236 Register original_key = O3; // original key array only required during decryption
3237 const Register keylen = O4; // reg for storing expanded key array length
3238
3239 // read expanded key array length
3240 __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
3241
3242 // save 'from' since we may need to recheck alignment in case of 256-bit decryption
3243 __ mov(from, G1);
3244
3245 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
3246 __ andcc(from, 7, G0);
3247 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input);
3248 __ delayed()->alignaddr(from, G0, from);
3249
3250 // aligned case: load input into F52-F54
3251 __ ldf(FloatRegisterImpl::D, from, 0, F52);
3252 __ ldf(FloatRegisterImpl::D, from, 8, F54);
3253 __ ba_short(L_load_original_key);
3254
3255 __ BIND(L_load_misaligned_input);
3256 __ ldf(FloatRegisterImpl::D, from, 0, F52);
3257 __ ldf(FloatRegisterImpl::D, from, 8, F54);
3258 __ ldf(FloatRegisterImpl::D, from, 16, F56);
3259 __ faligndata(F52, F54, F52);
3260 __ faligndata(F54, F56, F54);
3261
3262 __ BIND(L_load_original_key);
3263 // load original key from SunJCE expanded decryption key
3264     // Since we load the original key buffer starting at its first element, 8-byte alignment is guaranteed
3265 for ( int i = 0; i <= 3; i++ ) {
3266 __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
3267 }
3268
3269 // 256-bit original key size
3270 __ cmp_and_brx_short(keylen, 60, Assembler::equal, Assembler::pn, L_expand256bit);
3271
3272 // 192-bit original key size
3273 __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_expand192bit);
3274
3275 // 128-bit original key size
3276 // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
3277 for ( int i = 0; i <= 36; i += 4 ) {
3278 __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+2), i/4, as_FloatRegister(i+4));
3279 __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+4), as_FloatRegister(i+6));
3280 }
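    // F0..F42 now hold the full forward-expanded 128-bit key (44 words); the inverse cipher
    // below applies these round keys in reverse order, starting with F40:F42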
3281
3282 // perform 128-bit key specific inverse cipher transformation
3283 __ fxor(FloatRegisterImpl::D, F42, F54, F54);
3284 __ fxor(FloatRegisterImpl::D, F40, F52, F52);
3285 __ ba_short(L_common_transform);
3286
3287 __ BIND(L_expand192bit);
3288
3289 // start loading rest of the 192-bit key
3290 __ ldf(FloatRegisterImpl::S, original_key, 16, F4);
3291 __ ldf(FloatRegisterImpl::S, original_key, 20, F5);
3292
3293 // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
3294 for ( int i = 0; i <= 36; i += 6 ) {
3295 __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+4), i/6, as_FloatRegister(i+6));
3296 __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+6), as_FloatRegister(i+8));
3297 __ aes_kexpand2(as_FloatRegister(i+4), as_FloatRegister(i+8), as_FloatRegister(i+10));
3298 }
3299 __ aes_kexpand1(F42, F46, 7, F48);
3300 __ aes_kexpand2(F44, F48, F50);
3301
3302 // perform 192-bit key specific inverse cipher transformation
3303 __ fxor(FloatRegisterImpl::D, F50, F54, F54);
3304 __ fxor(FloatRegisterImpl::D, F48, F52, F52);
3305 __ aes_dround23(F46, F52, F54, F58);
3306 __ aes_dround01(F44, F52, F54, F56);
3307 __ aes_dround23(F42, F56, F58, F54);
3308 __ aes_dround01(F40, F56, F58, F52);
3309 __ ba_short(L_common_transform);
3310
3311 __ BIND(L_expand256bit);
3312
3313 // load rest of the 256-bit key
3314 for ( int i = 4; i <= 7; i++ ) {
3315 __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
3316 }
3317
3318 // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
3319 for ( int i = 0; i <= 40; i += 8 ) {
3320 __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+6), i/8, as_FloatRegister(i+8));
3321 __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+8), as_FloatRegister(i+10));
3322 __ aes_kexpand0(as_FloatRegister(i+4), as_FloatRegister(i+10), as_FloatRegister(i+12));
3323 __ aes_kexpand2(as_FloatRegister(i+6), as_FloatRegister(i+12), as_FloatRegister(i+14));
3324 }
3325 __ aes_kexpand1(F48, F54, 6, F56);
3326 __ aes_kexpand2(F50, F56, F58);
3327
3328 for ( int i = 0; i <= 6; i += 2 ) {
3329 __ fsrc2(FloatRegisterImpl::D, as_FloatRegister(58-i), as_FloatRegister(i));
3330 }
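    // F0..F6 now hold copies of the last four round-key doublewords (F58..F52), freeing
    // F52..F58 so that the input block can be reloaded below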
3331
3332 // reload original 'from' address
3333 __ mov(G1, from);
3334
3335 // re-check 8-byte alignment
3336 __ andcc(from, 7, G0);
3337 __ br(Assembler::notZero, true, Assembler::pn, L_reload_misaligned_input);
3338 __ delayed()->alignaddr(from, G0, from);
3339
3340 // aligned case: load input into F52-F54
3341 __ ldf(FloatRegisterImpl::D, from, 0, F52);
3342 __ ldf(FloatRegisterImpl::D, from, 8, F54);
3343 __ ba_short(L_256bit_transform);
3344
3345 __ BIND(L_reload_misaligned_input);
3346 __ ldf(FloatRegisterImpl::D, from, 0, F52);
3347 __ ldf(FloatRegisterImpl::D, from, 8, F54);
3348 __ ldf(FloatRegisterImpl::D, from, 16, F56);
3349 __ faligndata(F52, F54, F52);
3350 __ faligndata(F54, F56, F54);
3351
3352 // perform 256-bit key specific inverse cipher transformation
3353 __ BIND(L_256bit_transform);
3354 __ fxor(FloatRegisterImpl::D, F0, F54, F54);
3355 __ fxor(FloatRegisterImpl::D, F2, F52, F52);
3356 __ aes_dround23(F4, F52, F54, F58);
3357 __ aes_dround01(F6, F52, F54, F56);
3358 __ aes_dround23(F50, F56, F58, F54);
3359 __ aes_dround01(F48, F56, F58, F52);
3360 __ aes_dround23(F46, F52, F54, F58);
3361 __ aes_dround01(F44, F52, F54, F56);
3362 __ aes_dround23(F42, F56, F58, F54);
3363 __ aes_dround01(F40, F56, F58, F52);
3364
3365 for ( int i = 0; i <= 7; i++ ) {
3366 __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
3367 }
3368
3369 // perform inverse cipher transformations common for all key sizes
3370 __ BIND(L_common_transform);
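    // Walk the remaining round keys from F38 down to F0, two inverse rounds per iteration;
    // the final iteration uses the *_l (last-round) forms of the dround instructions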
3371 for ( int i = 38; i >= 6; i -= 8 ) {
3372 __ aes_dround23(as_FloatRegister(i), F52, F54, F58);
3373 __ aes_dround01(as_FloatRegister(i-2), F52, F54, F56);
3374 if ( i != 6) {
3375 __ aes_dround23(as_FloatRegister(i-4), F56, F58, F54);
3376 __ aes_dround01(as_FloatRegister(i-6), F56, F58, F52);
3377 } else {
3378 __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F54);
3379 __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F52);
3380 }
3381 }
3382
3383 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
3384 __ andcc(to, 7, O5);
3385 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output);
3386 __ delayed()->edge8n(to, G0, O3);
3387
3388 // aligned case: store output into the destination array
3389 __ stf(FloatRegisterImpl::D, F52, to, 0);
3390 __ retl();
3391 __ delayed()->stf(FloatRegisterImpl::D, F54, to, 8);
3392
3393 __ BIND(L_store_misaligned_output);
3394 __ add(to, 8, O4);
3395 __ mov(8, O2);
3396 __ sub(O2, O5, O2);
3397 __ alignaddr(O2, G0, O2);
3398 __ faligndata(F52, F52, F52);
3399 __ faligndata(F54, F54, F54);
3400 __ and3(to, -8, to);
3401 __ and3(O4, -8, O4);
3402 __ stpartialf(to, O3, F52, Assembler::ASI_PST8_PRIMARY);
3403 __ stpartialf(O4, O3, F54, Assembler::ASI_PST8_PRIMARY);
3404 __ add(to, 8, to);
3405 __ add(O4, 8, O4);
3406 __ orn(G0, O3, O3);
3407 __ stpartialf(to, O3, F52, Assembler::ASI_PST8_PRIMARY);
3408 __ retl();
3409 __ delayed()->stpartialf(O4, O3, F54, Assembler::ASI_PST8_PRIMARY);
3410
3411 return start;
3412 }
3413
3414 address generate_cipherBlockChaining_encryptAESCrypt() {
3415 assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
3416 "the following code assumes that first element of an int array is aligned to 8 bytes");
3417 assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0,
3418 "the following code assumes that first element of a byte array is aligned to 8 bytes");
3419 __ align(CodeEntryAlignment);
3420 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
3421 Label L_cbcenc128, L_load_misaligned_input_128bit, L_128bit_transform, L_store_misaligned_output_128bit;
3422 Label L_check_loop_end_128bit, L_cbcenc192, L_load_misaligned_input_192bit, L_192bit_transform;
3423 Label L_store_misaligned_output_192bit, L_check_loop_end_192bit, L_cbcenc256, L_load_misaligned_input_256bit;
3424 Label L_256bit_transform, L_store_misaligned_output_256bit, L_check_loop_end_256bit;
3425 address start = __ pc();
3426 Register from = I0; // source byte array
3427 Register to = I1; // destination byte array
3428 Register key = I2; // expanded key array
3429 Register rvec = I3; // init vector
3430 const Register len_reg = I4; // cipher length
3431 const Register keylen = I5; // reg for storing expanded key array length
3432
3433 __ save_frame(0);
3434 // save cipher len to return in the end
3435 __ mov(len_reg, L0);
3436
3437 // read expanded key length
3438 __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
3439
3440     // load initial vector, 8-byte alignment is guaranteed
3441 __ ldf(FloatRegisterImpl::D, rvec, 0, F60);
3442 __ ldf(FloatRegisterImpl::D, rvec, 8, F62);
3443     // load key, 8-byte alignment is guaranteed
3444 __ ldx(key,0,G1);
3445 __ ldx(key,8,G5);
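    // G1:G5 hold the round-0 (whitening) key; it is XORed with the plaintext in the integer
    // units below, while the remaining round keys stay in FP registers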
3446
3447     // start loading expanded key, 8-byte alignment is guaranteed
3448 for ( int i = 0, j = 16; i <= 38; i += 2, j += 8 ) {
3449 __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i));
3450 }
3451
3452 // 128-bit original key size
3453 __ cmp_and_brx_short(keylen, 44, Assembler::equal, Assembler::pt, L_cbcenc128);
3454
3455 for ( int i = 40, j = 176; i <= 46; i += 2, j += 8 ) {
3456 __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i));
3457 }
3458
3459 // 192-bit original key size
3460 __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pt, L_cbcenc192);
3461
3462 for ( int i = 48, j = 208; i <= 54; i += 2, j += 8 ) {
3463 __ ldf(FloatRegisterImpl::D, key, j, as_FloatRegister(i));
3464 }
3465
3466 // 256-bit original key size
3467 __ ba_short(L_cbcenc256);
3468
3469 __ align(OptoLoopAlignment);
3470 __ BIND(L_cbcenc128);
3471 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
3472 __ andcc(from, 7, G0);
3473 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_128bit);
3474 __ delayed()->mov(from, L1); // save original 'from' address before alignaddr
3475
3476 // aligned case: load input into G3 and G4
3477 __ ldx(from,0,G3);
3478 __ ldx(from,8,G4);
3479 __ ba_short(L_128bit_transform);
3480
3481 __ BIND(L_load_misaligned_input_128bit);
3482 // can clobber F48, F50 and F52 as they are not used in 128 and 192-bit key encryption
3483 __ alignaddr(from, G0, from);
3484 __ ldf(FloatRegisterImpl::D, from, 0, F48);
3485 __ ldf(FloatRegisterImpl::D, from, 8, F50);
3486 __ ldf(FloatRegisterImpl::D, from, 16, F52);
3487 __ faligndata(F48, F50, F48);
3488 __ faligndata(F50, F52, F50);
3489 __ movdtox(F48, G3);
3490 __ movdtox(F50, G4);
3491 __ mov(L1, from);
3492
3493 __ BIND(L_128bit_transform);
3494 __ xor3(G1,G3,G3);
3495 __ xor3(G5,G4,G4);
3496 __ movxtod(G3,F56);
3497 __ movxtod(G4,F58);
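    // the plaintext (already XORed with the round-0 key) is now combined with the previous
    // ciphertext / initial vector held in F60:F62, completing the CBC chaining step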
3498 __ fxor(FloatRegisterImpl::D, F60, F56, F60);
3499 __ fxor(FloatRegisterImpl::D, F62, F58, F62);
3500
3501 // TEN_EROUNDS
3502 for ( int i = 0; i <= 32; i += 8 ) {
3503 __ aes_eround01(as_FloatRegister(i), F60, F62, F56);
3504 __ aes_eround23(as_FloatRegister(i+2), F60, F62, F58);
3505 if (i != 32 ) {
3506 __ aes_eround01(as_FloatRegister(i+4), F56, F58, F60);
3507 __ aes_eround23(as_FloatRegister(i+6), F56, F58, F62);
3508 } else {
3509 __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60);
3510 __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62);
3511 }
3512 }
3513
3514 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
3515 __ andcc(to, 7, L1);
3516 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_128bit);
3517 __ delayed()->edge8n(to, G0, L2);
3518
3519 // aligned case: store output into the destination array
3520 __ stf(FloatRegisterImpl::D, F60, to, 0);
3521 __ stf(FloatRegisterImpl::D, F62, to, 8);
3522 __ ba_short(L_check_loop_end_128bit);
3523
3524 __ BIND(L_store_misaligned_output_128bit);
3525 __ add(to, 8, L3);
3526 __ mov(8, L4);
3527 __ sub(L4, L1, L4);
3528 __ alignaddr(L4, G0, L4);
3529 // save cipher text before circular right shift
3530 // as it needs to be stored as iv for next block (see code before next retl)
3531 __ movdtox(F60, L6);
3532 __ movdtox(F62, L7);
3533 __ faligndata(F60, F60, F60);
3534 __ faligndata(F62, F62, F62);
3535 __ mov(to, L5);
3536 __ and3(to, -8, to);
3537 __ and3(L3, -8, L3);
3538 __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
3539 __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
3540 __ add(to, 8, to);
3541 __ add(L3, 8, L3);
3542 __ orn(G0, L2, L2);
3543 __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
3544 __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
3545 __ mov(L5, to);
3546 __ movxtod(L6, F60);
3547 __ movxtod(L7, F62);
3548
3549 __ BIND(L_check_loop_end_128bit);
3550 __ add(from, 16, from);
3551 __ add(to, 16, to);
3552 __ subcc(len_reg, 16, len_reg);
3553 __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc128);
3554 __ delayed()->nop();
3555     // re-init the initial vector (IV) for the next block, 8-byte alignment is guaranteed
3556 __ stf(FloatRegisterImpl::D, F60, rvec, 0);
3557 __ stf(FloatRegisterImpl::D, F62, rvec, 8);
3558 __ mov(L0, I0);
3559 __ ret();
3560 __ delayed()->restore();
3561
3562 __ align(OptoLoopAlignment);
3563 __ BIND(L_cbcenc192);
3564 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
3565 __ andcc(from, 7, G0);
3566 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_192bit);
3567 __ delayed()->mov(from, L1); // save original 'from' address before alignaddr
3568
3569 // aligned case: load input into G3 and G4
3570 __ ldx(from,0,G3);
3571 __ ldx(from,8,G4);
3572 __ ba_short(L_192bit_transform);
3573
3574 __ BIND(L_load_misaligned_input_192bit);
3575 // can clobber F48, F50 and F52 as they are not used in 128 and 192-bit key encryption
3576 __ alignaddr(from, G0, from);
3577 __ ldf(FloatRegisterImpl::D, from, 0, F48);
3578 __ ldf(FloatRegisterImpl::D, from, 8, F50);
3579 __ ldf(FloatRegisterImpl::D, from, 16, F52);
3580 __ faligndata(F48, F50, F48);
3581 __ faligndata(F50, F52, F50);
3582 __ movdtox(F48, G3);
3583 __ movdtox(F50, G4);
3584 __ mov(L1, from);
3585
3586 __ BIND(L_192bit_transform);
3587 __ xor3(G1,G3,G3);
3588 __ xor3(G5,G4,G4);
3589 __ movxtod(G3,F56);
3590 __ movxtod(G4,F58);
3591 __ fxor(FloatRegisterImpl::D, F60, F56, F60);
3592 __ fxor(FloatRegisterImpl::D, F62, F58, F62);
3593
3594     // TWELVE_EROUNDS
3595 for ( int i = 0; i <= 40; i += 8 ) {
3596 __ aes_eround01(as_FloatRegister(i), F60, F62, F56);
3597 __ aes_eround23(as_FloatRegister(i+2), F60, F62, F58);
3598 if (i != 40 ) {
3599 __ aes_eround01(as_FloatRegister(i+4), F56, F58, F60);
3600 __ aes_eround23(as_FloatRegister(i+6), F56, F58, F62);
3601 } else {
3602 __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60);
3603 __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62);
3604 }
3605 }
3606
3607 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
3608 __ andcc(to, 7, L1);
3609 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_192bit);
3610 __ delayed()->edge8n(to, G0, L2);
3611
3612 // aligned case: store output into the destination array
3613 __ stf(FloatRegisterImpl::D, F60, to, 0);
3614 __ stf(FloatRegisterImpl::D, F62, to, 8);
3615 __ ba_short(L_check_loop_end_192bit);
3616
3617 __ BIND(L_store_misaligned_output_192bit);
3618 __ add(to, 8, L3);
3619 __ mov(8, L4);
3620 __ sub(L4, L1, L4);
3621 __ alignaddr(L4, G0, L4);
3622 __ movdtox(F60, L6);
3623 __ movdtox(F62, L7);
3624 __ faligndata(F60, F60, F60);
3625 __ faligndata(F62, F62, F62);
3626 __ mov(to, L5);
3627 __ and3(to, -8, to);
3628 __ and3(L3, -8, L3);
3629 __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
3630 __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
3631 __ add(to, 8, to);
3632 __ add(L3, 8, L3);
3633 __ orn(G0, L2, L2);
3634 __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
3635 __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
3636 __ mov(L5, to);
3637 __ movxtod(L6, F60);
3638 __ movxtod(L7, F62);
3639
3640 __ BIND(L_check_loop_end_192bit);
3641 __ add(from, 16, from);
3642 __ subcc(len_reg, 16, len_reg);
3643 __ add(to, 16, to);
3644 __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc192);
3645 __ delayed()->nop();
3646     // re-init the initial vector (IV) for the next block, 8-byte alignment is guaranteed
3647 __ stf(FloatRegisterImpl::D, F60, rvec, 0);
3648 __ stf(FloatRegisterImpl::D, F62, rvec, 8);
3649 __ mov(L0, I0);
3650 __ ret();
3651 __ delayed()->restore();
3652
3653 __ align(OptoLoopAlignment);
3654 __ BIND(L_cbcenc256);
3655 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
3656 __ andcc(from, 7, G0);
3657 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_256bit);
3658 __ delayed()->mov(from, L1); // save original 'from' address before alignaddr
3659
3660 // aligned case: load input into G3 and G4
3661 __ ldx(from,0,G3);
3662 __ ldx(from,8,G4);
3663 __ ba_short(L_256bit_transform);
3664
3665 __ BIND(L_load_misaligned_input_256bit);
3666 // cannot clobber F48, F50 and F52. F56, F58 can be used though
3667 __ alignaddr(from, G0, from);
3668 __ movdtox(F60, L2); // save F60 before overwriting
3669 __ ldf(FloatRegisterImpl::D, from, 0, F56);
3670 __ ldf(FloatRegisterImpl::D, from, 8, F58);
3671 __ ldf(FloatRegisterImpl::D, from, 16, F60);
3672 __ faligndata(F56, F58, F56);
3673 __ faligndata(F58, F60, F58);
3674 __ movdtox(F56, G3);
3675 __ movdtox(F58, G4);
3676 __ mov(L1, from);
3677 __ movxtod(L2, F60);
3678
3679 __ BIND(L_256bit_transform);
3680 __ xor3(G1,G3,G3);
3681 __ xor3(G5,G4,G4);
3682 __ movxtod(G3,F56);
3683 __ movxtod(G4,F58);
3684 __ fxor(FloatRegisterImpl::D, F60, F56, F60);
3685 __ fxor(FloatRegisterImpl::D, F62, F58, F62);
3686
3687 // FOURTEEN_EROUNDS
3688 for ( int i = 0; i <= 48; i += 8 ) {
3689 __ aes_eround01(as_FloatRegister(i), F60, F62, F56);
3690 __ aes_eround23(as_FloatRegister(i+2), F60, F62, F58);
3691 if (i != 48 ) {
3692 __ aes_eround01(as_FloatRegister(i+4), F56, F58, F60);
3693 __ aes_eround23(as_FloatRegister(i+6), F56, F58, F62);
3694 } else {
3695 __ aes_eround01_l(as_FloatRegister(i+4), F56, F58, F60);
3696 __ aes_eround23_l(as_FloatRegister(i+6), F56, F58, F62);
3697 }
3698 }
3699
3700 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
3701 __ andcc(to, 7, L1);
3702 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_256bit);
3703 __ delayed()->edge8n(to, G0, L2);
3704
3705 // aligned case: store output into the destination array
3706 __ stf(FloatRegisterImpl::D, F60, to, 0);
3707 __ stf(FloatRegisterImpl::D, F62, to, 8);
3708 __ ba_short(L_check_loop_end_256bit);
3709
3710 __ BIND(L_store_misaligned_output_256bit);
3711 __ add(to, 8, L3);
3712 __ mov(8, L4);
3713 __ sub(L4, L1, L4);
3714 __ alignaddr(L4, G0, L4);
3715 __ movdtox(F60, L6);
3716 __ movdtox(F62, L7);
3717 __ faligndata(F60, F60, F60);
3718 __ faligndata(F62, F62, F62);
3719 __ mov(to, L5);
3720 __ and3(to, -8, to);
3721 __ and3(L3, -8, L3);
3722 __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
3723 __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
3724 __ add(to, 8, to);
3725 __ add(L3, 8, L3);
3726 __ orn(G0, L2, L2);
3727 __ stpartialf(to, L2, F60, Assembler::ASI_PST8_PRIMARY);
3728 __ stpartialf(L3, L2, F62, Assembler::ASI_PST8_PRIMARY);
3729 __ mov(L5, to);
3730 __ movxtod(L6, F60);
3731 __ movxtod(L7, F62);
3732
3733 __ BIND(L_check_loop_end_256bit);
3734 __ add(from, 16, from);
3735 __ subcc(len_reg, 16, len_reg);
3736 __ add(to, 16, to);
3737 __ br(Assembler::notEqual, false, Assembler::pt, L_cbcenc256);
3738 __ delayed()->nop();
3739     // re-init the initial vector (IV) for the next block, 8-byte alignment is guaranteed
3740 __ stf(FloatRegisterImpl::D, F60, rvec, 0);
3741 __ stf(FloatRegisterImpl::D, F62, rvec, 8);
3742 __ mov(L0, I0);
3743 __ ret();
3744 __ delayed()->restore();
3745
3746 return start;
3747 }
3748
3749 address generate_cipherBlockChaining_decryptAESCrypt_Parallel() {
3750 assert((arrayOopDesc::base_offset_in_bytes(T_INT) & 7) == 0,
3751 "the following code assumes that first element of an int array is aligned to 8 bytes");
3752 assert((arrayOopDesc::base_offset_in_bytes(T_BYTE) & 7) == 0,
3753 "the following code assumes that first element of a byte array is aligned to 8 bytes");
3754 __ align(CodeEntryAlignment);
3755 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
3756 Label L_cbcdec_end, L_expand192bit, L_expand256bit, L_dec_first_block_start;
3757 Label L_dec_first_block128, L_dec_first_block192, L_dec_next2_blocks128, L_dec_next2_blocks192, L_dec_next2_blocks256;
3758 Label L_load_misaligned_input_first_block, L_transform_first_block, L_load_misaligned_next2_blocks128, L_transform_next2_blocks128;
3759 Label L_load_misaligned_next2_blocks192, L_transform_next2_blocks192, L_load_misaligned_next2_blocks256, L_transform_next2_blocks256;
3760 Label L_store_misaligned_output_first_block, L_check_decrypt_end, L_store_misaligned_output_next2_blocks128;
3761 Label L_check_decrypt_loop_end128, L_store_misaligned_output_next2_blocks192, L_check_decrypt_loop_end192;
3762 Label L_store_misaligned_output_next2_blocks256, L_check_decrypt_loop_end256;
3763 address start = __ pc();
3764 Register from = I0; // source byte array
3765 Register to = I1; // destination byte array
3766 Register key = I2; // expanded key array
3767 Register rvec = I3; // init vector
3768 const Register len_reg = I4; // cipher length
3769 const Register original_key = I5; // original key array only required during decryption
3770 const Register keylen = L6; // reg for storing expanded key array length
3771
3772     __ save_frame(0); // args are read from the I* registers since we save the frame at the beginning
3773 // save cipher len to return in the end
3774 __ mov(len_reg, L7);
3775
3776 // load original key from SunJCE expanded decryption key
3777     // Since we load the original key buffer starting at its first element, 8-byte alignment is guaranteed
3778 for ( int i = 0; i <= 3; i++ ) {
3779 __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
3780 }
3781
3782 // load initial vector, 8-byte alignment is guaranteed
3783 __ ldx(rvec,0,L0);
3784 __ ldx(rvec,8,L1);
3785
3786 // read expanded key array length
3787 __ ldsw(Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)), keylen, 0);
3788
3789 // 256-bit original key size
3790 __ cmp_and_brx_short(keylen, 60, Assembler::equal, Assembler::pn, L_expand256bit);
3791
3792 // 192-bit original key size
3793 __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_expand192bit);
3794
3795 // 128-bit original key size
3796 // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
3797 for ( int i = 0; i <= 36; i += 4 ) {
3798 __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+2), i/4, as_FloatRegister(i+4));
3799 __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+4), as_FloatRegister(i+6));
3800 }
3801
3802 // load expanded key[last-1] and key[last] elements
3803 __ movdtox(F40,L2);
3804 __ movdtox(F42,L3);
3805
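    // If the length is a multiple of 32 bytes, go straight to the two-blocks-per-iteration loop;
    // otherwise decrypt a single block first so that the remaining length is a multiple of 32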
3806 __ and3(len_reg, 16, L4);
3807 __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks128);
3808 __ nop();
3809
3810 __ ba_short(L_dec_first_block_start);
3811
3812 __ BIND(L_expand192bit);
3813 // load rest of the 192-bit key
3814 __ ldf(FloatRegisterImpl::S, original_key, 16, F4);
3815 __ ldf(FloatRegisterImpl::S, original_key, 20, F5);
3816
3817 // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
3818 for ( int i = 0; i <= 36; i += 6 ) {
3819 __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+4), i/6, as_FloatRegister(i+6));
3820 __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+6), as_FloatRegister(i+8));
3821 __ aes_kexpand2(as_FloatRegister(i+4), as_FloatRegister(i+8), as_FloatRegister(i+10));
3822 }
3823 __ aes_kexpand1(F42, F46, 7, F48);
3824 __ aes_kexpand2(F44, F48, F50);
3825
3826 // load expanded key[last-1] and key[last] elements
3827 __ movdtox(F48,L2);
3828 __ movdtox(F50,L3);
3829
3830 __ and3(len_reg, 16, L4);
3831 __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks192);
3832 __ nop();
3833
3834 __ ba_short(L_dec_first_block_start);
3835
3836 __ BIND(L_expand256bit);
3837 // load rest of the 256-bit key
3838 for ( int i = 4; i <= 7; i++ ) {
3839 __ ldf(FloatRegisterImpl::S, original_key, i*4, as_FloatRegister(i));
3840 }
3841
3842 // perform key expansion since SunJCE decryption-key expansion is not compatible with SPARC crypto instructions
3843 for ( int i = 0; i <= 40; i += 8 ) {
3844 __ aes_kexpand1(as_FloatRegister(i), as_FloatRegister(i+6), i/8, as_FloatRegister(i+8));
3845 __ aes_kexpand2(as_FloatRegister(i+2), as_FloatRegister(i+8), as_FloatRegister(i+10));
3846 __ aes_kexpand0(as_FloatRegister(i+4), as_FloatRegister(i+10), as_FloatRegister(i+12));
3847 __ aes_kexpand2(as_FloatRegister(i+6), as_FloatRegister(i+12), as_FloatRegister(i+14));
3848 }
3849 __ aes_kexpand1(F48, F54, 6, F56);
3850 __ aes_kexpand2(F50, F56, F58);
3851
3852 // load expanded key[last-1] and key[last] elements
3853 __ movdtox(F56,L2);
3854 __ movdtox(F58,L3);
3855
3856 __ and3(len_reg, 16, L4);
3857 __ br_null_short(L4, Assembler::pt, L_dec_next2_blocks256);
3858
3859 __ BIND(L_dec_first_block_start);
3860 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
3861 __ andcc(from, 7, G0);
3862 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_input_first_block);
3863 __ delayed()->mov(from, G1); // save original 'from' address before alignaddr
3864
3865 // aligned case: load input into L4 and L5
3866 __ ldx(from,0,L4);
3867 __ ldx(from,8,L5);
3868 __ ba_short(L_transform_first_block);
3869
3870 __ BIND(L_load_misaligned_input_first_block);
3871 __ alignaddr(from, G0, from);
3872 // F58, F60, F62 can be clobbered
3873 __ ldf(FloatRegisterImpl::D, from, 0, F58);
3874 __ ldf(FloatRegisterImpl::D, from, 8, F60);
3875 __ ldf(FloatRegisterImpl::D, from, 16, F62);
3876 __ faligndata(F58, F60, F58);
3877 __ faligndata(F60, F62, F60);
3878 __ movdtox(F58, L4);
3879 __ movdtox(F60, L5);
3880 __ mov(G1, from);
3881
3882 __ BIND(L_transform_first_block);
3883 __ xor3(L2,L4,G1);
3884 __ movxtod(G1,F60);
3885 __ xor3(L3,L5,G1);
3886 __ movxtod(G1,F62);
3887
3888 // 128-bit original key size
3889 __ cmp_and_brx_short(keylen, 44, Assembler::equal, Assembler::pn, L_dec_first_block128);
3890
3891 // 192-bit original key size
3892 __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_dec_first_block192);
3893
3894 __ aes_dround23(F54, F60, F62, F58);
3895 __ aes_dround01(F52, F60, F62, F56);
3896 __ aes_dround23(F50, F56, F58, F62);
3897 __ aes_dround01(F48, F56, F58, F60);
3898
3899 __ BIND(L_dec_first_block192);
3900 __ aes_dround23(F46, F60, F62, F58);
3901 __ aes_dround01(F44, F60, F62, F56);
3902 __ aes_dround23(F42, F56, F58, F62);
3903 __ aes_dround01(F40, F56, F58, F60);
3904
3905 __ BIND(L_dec_first_block128);
3906 for ( int i = 38; i >= 6; i -= 8 ) {
3907 __ aes_dround23(as_FloatRegister(i), F60, F62, F58);
3908 __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56);
3909 if ( i != 6) {
3910 __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62);
3911 __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60);
3912 } else {
3913 __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F62);
3914 __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F60);
3915 }
3916 }
3917
3918 __ movxtod(L0,F56);
3919 __ movxtod(L1,F58);
3920 __ mov(L4,L0);
3921 __ mov(L5,L1);
3922 __ fxor(FloatRegisterImpl::D, F56, F60, F60);
3923 __ fxor(FloatRegisterImpl::D, F58, F62, F62);
3924
3925 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
3926 __ andcc(to, 7, G1);
3927 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_first_block);
3928 __ delayed()->edge8n(to, G0, G2);
3929
3930 // aligned case: store output into the destination array
3931 __ stf(FloatRegisterImpl::D, F60, to, 0);
3932 __ stf(FloatRegisterImpl::D, F62, to, 8);
3933 __ ba_short(L_check_decrypt_end);
3934
3935 __ BIND(L_store_misaligned_output_first_block);
3936 __ add(to, 8, G3);
3937 __ mov(8, G4);
3938 __ sub(G4, G1, G4);
3939 __ alignaddr(G4, G0, G4);
3940 __ faligndata(F60, F60, F60);
3941 __ faligndata(F62, F62, F62);
3942 __ mov(to, G1);
3943 __ and3(to, -8, to);
3944 __ and3(G3, -8, G3);
3945 __ stpartialf(to, G2, F60, Assembler::ASI_PST8_PRIMARY);
3946 __ stpartialf(G3, G2, F62, Assembler::ASI_PST8_PRIMARY);
3947 __ add(to, 8, to);
3948 __ add(G3, 8, G3);
3949 __ orn(G0, G2, G2);
3950 __ stpartialf(to, G2, F60, Assembler::ASI_PST8_PRIMARY);
3951 __ stpartialf(G3, G2, F62, Assembler::ASI_PST8_PRIMARY);
3952 __ mov(G1, to);
3953
3954 __ BIND(L_check_decrypt_end);
3955 __ add(from, 16, from);
3956 __ add(to, 16, to);
3957 __ subcc(len_reg, 16, len_reg);
3958 __ br(Assembler::equal, false, Assembler::pt, L_cbcdec_end);
3959 __ delayed()->nop();
3960
3961 // 256-bit original key size
3962 __ cmp_and_brx_short(keylen, 60, Assembler::equal, Assembler::pn, L_dec_next2_blocks256);
3963
3964 // 192-bit original key size
3965 __ cmp_and_brx_short(keylen, 52, Assembler::equal, Assembler::pn, L_dec_next2_blocks192);
3966
3967 __ align(OptoLoopAlignment);
3968 __ BIND(L_dec_next2_blocks128);
3969 __ nop();
3970
3971 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
3972 __ andcc(from, 7, G0);
3973 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks128);
3974 __ delayed()->mov(from, G1); // save original 'from' address before alignaddr
3975
3976 // aligned case: load input into G4, G5, L4 and L5
3977 __ ldx(from,0,G4);
3978 __ ldx(from,8,G5);
3979 __ ldx(from,16,L4);
3980 __ ldx(from,24,L5);
3981 __ ba_short(L_transform_next2_blocks128);
3982
3983 __ BIND(L_load_misaligned_next2_blocks128);
3984 __ alignaddr(from, G0, from);
3985 // F40, F42, F58, F60, F62 can be clobbered
3986 __ ldf(FloatRegisterImpl::D, from, 0, F40);
3987 __ ldf(FloatRegisterImpl::D, from, 8, F42);
3988 __ ldf(FloatRegisterImpl::D, from, 16, F60);
3989 __ ldf(FloatRegisterImpl::D, from, 24, F62);
3990 __ ldf(FloatRegisterImpl::D, from, 32, F58);
3991 __ faligndata(F40, F42, F40);
3992 __ faligndata(F42, F60, F42);
3993 __ faligndata(F60, F62, F60);
3994 __ faligndata(F62, F58, F62);
3995 __ movdtox(F40, G4);
3996 __ movdtox(F42, G5);
3997 __ movdtox(F60, L4);
3998 __ movdtox(F62, L5);
3999 __ mov(G1, from);
4000
4001 __ BIND(L_transform_next2_blocks128);
4002     // F40:F42 used for the first 16 bytes
4003 __ xor3(L2,G4,G1);
4004 __ movxtod(G1,F40);
4005 __ xor3(L3,G5,G1);
4006 __ movxtod(G1,F42);
4007
4008     // F60:F62 used for the next 16 bytes
4009 __ xor3(L2,L4,G1);
4010 __ movxtod(G1,F60);
4011 __ xor3(L3,L5,G1);
4012 __ movxtod(G1,F62);
4013
4014 for ( int i = 38; i >= 6; i -= 8 ) {
4015 __ aes_dround23(as_FloatRegister(i), F40, F42, F44);
4016 __ aes_dround01(as_FloatRegister(i-2), F40, F42, F46);
4017 __ aes_dround23(as_FloatRegister(i), F60, F62, F58);
4018 __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56);
4019 if (i != 6 ) {
4020 __ aes_dround23(as_FloatRegister(i-4), F46, F44, F42);
4021 __ aes_dround01(as_FloatRegister(i-6), F46, F44, F40);
4022 __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62);
4023 __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60);
4024 } else {
4025 __ aes_dround23_l(as_FloatRegister(i-4), F46, F44, F42);
4026 __ aes_dround01_l(as_FloatRegister(i-6), F46, F44, F40);
4027 __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F62);
4028 __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F60);
4029 }
4030 }
4031
4032 __ movxtod(L0,F46);
4033 __ movxtod(L1,F44);
4034 __ fxor(FloatRegisterImpl::D, F46, F40, F40);
4035 __ fxor(FloatRegisterImpl::D, F44, F42, F42);
4036
4037 __ movxtod(G4,F56);
4038 __ movxtod(G5,F58);
4039 __ mov(L4,L0);
4040 __ mov(L5,L1);
4041 __ fxor(FloatRegisterImpl::D, F56, F60, F60);
4042 __ fxor(FloatRegisterImpl::D, F58, F62, F62);
4043
4044     // For a misaligned store of the 32 bytes of result we can do the following:
4045     // Circular right-shift all 4 FP registers so that the 'head' and 'tail'
4046     // parts that need to be stored starting at the misaligned address end up in one FP reg;
4047     // the other 3 FP regs can then be stored using regular stores.
4048     // We then use the edge + partial-store mechanism to store the 'head' and 'tail' parts.
4049
4050 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
4051 __ andcc(to, 7, G1);
4052 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks128);
4053 __ delayed()->edge8n(to, G0, G2);
4054
4055 // aligned case: store output into the destination array
4056 __ stf(FloatRegisterImpl::D, F40, to, 0);
4057 __ stf(FloatRegisterImpl::D, F42, to, 8);
4058 __ stf(FloatRegisterImpl::D, F60, to, 16);
4059 __ stf(FloatRegisterImpl::D, F62, to, 24);
4060 __ ba_short(L_check_decrypt_loop_end128);
4061
4062 __ BIND(L_store_misaligned_output_next2_blocks128);
4063 __ mov(8, G4);
4064 __ sub(G4, G1, G4);
4065 __ alignaddr(G4, G0, G4);
4066 __ faligndata(F40, F42, F56); // F56 can be clobbered
4067 __ faligndata(F42, F60, F42);
4068 __ faligndata(F60, F62, F60);
4069 __ faligndata(F62, F40, F40);
4070 __ mov(to, G1);
4071 __ and3(to, -8, to);
4072 __ stpartialf(to, G2, F40, Assembler::ASI_PST8_PRIMARY);
4073 __ stf(FloatRegisterImpl::D, F56, to, 8);
4074 __ stf(FloatRegisterImpl::D, F42, to, 16);
4075 __ stf(FloatRegisterImpl::D, F60, to, 24);
4076 __ add(to, 32, to);
4077 __ orn(G0, G2, G2);
4078 __ stpartialf(to, G2, F40, Assembler::ASI_PST8_PRIMARY);
4079 __ mov(G1, to);
4080
4081 __ BIND(L_check_decrypt_loop_end128);
4082 __ add(from, 32, from);
4083 __ add(to, 32, to);
4084 __ subcc(len_reg, 32, len_reg);
4085 __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks128);
4086 __ delayed()->nop();
4087 __ ba_short(L_cbcdec_end);
4088
4089 __ align(OptoLoopAlignment);
4090 __ BIND(L_dec_next2_blocks192);
4091 __ nop();
4092
4093 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
4094 __ andcc(from, 7, G0);
4095 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks192);
4096 __ delayed()->mov(from, G1); // save original 'from' address before alignaddr
4097
4098 // aligned case: load input into G4, G5, L4 and L5
4099 __ ldx(from,0,G4);
4100 __ ldx(from,8,G5);
4101 __ ldx(from,16,L4);
4102 __ ldx(from,24,L5);
4103 __ ba_short(L_transform_next2_blocks192);
4104
4105 __ BIND(L_load_misaligned_next2_blocks192);
4106 __ alignaddr(from, G0, from);
4107 // F48, F50, F52, F60, F62 can be clobbered
4108 __ ldf(FloatRegisterImpl::D, from, 0, F48);
4109 __ ldf(FloatRegisterImpl::D, from, 8, F50);
4110 __ ldf(FloatRegisterImpl::D, from, 16, F60);
4111 __ ldf(FloatRegisterImpl::D, from, 24, F62);
4112 __ ldf(FloatRegisterImpl::D, from, 32, F52);
4113 __ faligndata(F48, F50, F48);
4114 __ faligndata(F50, F60, F50);
4115 __ faligndata(F60, F62, F60);
4116 __ faligndata(F62, F52, F62);
4117 __ movdtox(F48, G4);
4118 __ movdtox(F50, G5);
4119 __ movdtox(F60, L4);
4120 __ movdtox(F62, L5);
4121 __ mov(G1, from);
4122
4123 __ BIND(L_transform_next2_blocks192);
4124     // F48:F50 used for the first 16 bytes
4125 __ xor3(L2,G4,G1);
4126 __ movxtod(G1,F48);
4127 __ xor3(L3,G5,G1);
4128 __ movxtod(G1,F50);
4129
4130     // F60:F62 used for the next 16 bytes
4131 __ xor3(L2,L4,G1);
4132 __ movxtod(G1,F60);
4133 __ xor3(L3,L5,G1);
4134 __ movxtod(G1,F62);
4135
4136 for ( int i = 46; i >= 6; i -= 8 ) {
4137 __ aes_dround23(as_FloatRegister(i), F48, F50, F52);
4138 __ aes_dround01(as_FloatRegister(i-2), F48, F50, F54);
4139 __ aes_dround23(as_FloatRegister(i), F60, F62, F58);
4140 __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56);
4141 if (i != 6 ) {
4142 __ aes_dround23(as_FloatRegister(i-4), F54, F52, F50);
4143 __ aes_dround01(as_FloatRegister(i-6), F54, F52, F48);
4144 __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62);
4145 __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60);
4146 } else {
4147 __ aes_dround23_l(as_FloatRegister(i-4), F54, F52, F50);
4148 __ aes_dround01_l(as_FloatRegister(i-6), F54, F52, F48);
4149 __ aes_dround23_l(as_FloatRegister(i-4), F56, F58, F62);
4150 __ aes_dround01_l(as_FloatRegister(i-6), F56, F58, F60);
4151 }
4152 }
4153
4154 __ movxtod(L0,F54);
4155 __ movxtod(L1,F52);
4156 __ fxor(FloatRegisterImpl::D, F54, F48, F48);
4157 __ fxor(FloatRegisterImpl::D, F52, F50, F50);
4158
4159 __ movxtod(G4,F56);
4160 __ movxtod(G5,F58);
4161 __ mov(L4,L0);
4162 __ mov(L5,L1);
4163 __ fxor(FloatRegisterImpl::D, F56, F60, F60);
4164 __ fxor(FloatRegisterImpl::D, F58, F62, F62);
4165
4166 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
4167 __ andcc(to, 7, G1);
4168 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks192);
4169 __ delayed()->edge8n(to, G0, G2);
4170
4171 // aligned case: store output into the destination array
4172 __ stf(FloatRegisterImpl::D, F48, to, 0);
4173 __ stf(FloatRegisterImpl::D, F50, to, 8);
4174 __ stf(FloatRegisterImpl::D, F60, to, 16);
4175 __ stf(FloatRegisterImpl::D, F62, to, 24);
4176 __ ba_short(L_check_decrypt_loop_end192);
4177
4178 __ BIND(L_store_misaligned_output_next2_blocks192);
4179 __ mov(8, G4);
4180 __ sub(G4, G1, G4);
4181 __ alignaddr(G4, G0, G4);
4182 __ faligndata(F48, F50, F56); // F56 can be clobbered
4183 __ faligndata(F50, F60, F50);
4184 __ faligndata(F60, F62, F60);
4185 __ faligndata(F62, F48, F48);
4186 __ mov(to, G1);
4187 __ and3(to, -8, to);
4188 __ stpartialf(to, G2, F48, Assembler::ASI_PST8_PRIMARY);
4189 __ stf(FloatRegisterImpl::D, F56, to, 8);
4190 __ stf(FloatRegisterImpl::D, F50, to, 16);
4191 __ stf(FloatRegisterImpl::D, F60, to, 24);
4192 __ add(to, 32, to);
4193 __ orn(G0, G2, G2);
4194 __ stpartialf(to, G2, F48, Assembler::ASI_PST8_PRIMARY);
4195 __ mov(G1, to);
4196
4197 __ BIND(L_check_decrypt_loop_end192);
4198 __ add(from, 32, from);
4199 __ add(to, 32, to);
4200 __ subcc(len_reg, 32, len_reg);
4201 __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks192);
4202 __ delayed()->nop();
4203 __ ba_short(L_cbcdec_end);
4204
4205 __ align(OptoLoopAlignment);
4206 __ BIND(L_dec_next2_blocks256);
4207 __ nop();
4208
4209 // check for 8-byte alignment since source byte array may have an arbitrary alignment if offset mod 8 is non-zero
4210 __ andcc(from, 7, G0);
4211 __ br(Assembler::notZero, true, Assembler::pn, L_load_misaligned_next2_blocks256);
4212 __ delayed()->mov(from, G1); // save original 'from' address before alignaddr
4213
4214 // aligned case: load input into G4, G5, L4 and L5
4215 __ ldx(from,0,G4);
4216 __ ldx(from,8,G5);
4217 __ ldx(from,16,L4);
4218 __ ldx(from,24,L5);
4219 __ ba_short(L_transform_next2_blocks256);
4220
4221 __ BIND(L_load_misaligned_next2_blocks256);
4222 __ alignaddr(from, G0, from);
4223 // F0, F2, F4, F60, F62 can be clobbered
4224 __ ldf(FloatRegisterImpl::D, from, 0, F0);
4225 __ ldf(FloatRegisterImpl::D, from, 8, F2);
4226 __ ldf(FloatRegisterImpl::D, from, 16, F60);
4227 __ ldf(FloatRegisterImpl::D, from, 24, F62);
4228 __ ldf(FloatRegisterImpl::D, from, 32, F4);
4229 __ faligndata(F0, F2, F0);
4230 __ faligndata(F2, F60, F2);
4231 __ faligndata(F60, F62, F60);
4232 __ faligndata(F62, F4, F62);
4233 __ movdtox(F0, G4);
4234 __ movdtox(F2, G5);
4235 __ movdtox(F60, L4);
4236 __ movdtox(F62, L5);
4237 __ mov(G1, from);
4238
4239 __ BIND(L_transform_next2_blocks256);
4240     // F0:F2 used for the first 16 bytes
4241 __ xor3(L2,G4,G1);
4242 __ movxtod(G1,F0);
4243 __ xor3(L3,G5,G1);
4244 __ movxtod(G1,F2);
4245
4246     // F60:F62 used for the next 16 bytes
4247 __ xor3(L2,L4,G1);
4248 __ movxtod(G1,F60);
4249 __ xor3(L3,L5,G1);
4250 __ movxtod(G1,F62);
4251
4252 __ aes_dround23(F54, F0, F2, F4);
4253 __ aes_dround01(F52, F0, F2, F6);
4254 __ aes_dround23(F54, F60, F62, F58);
4255 __ aes_dround01(F52, F60, F62, F56);
4256 __ aes_dround23(F50, F6, F4, F2);
4257 __ aes_dround01(F48, F6, F4, F0);
4258 __ aes_dround23(F50, F56, F58, F62);
4259 __ aes_dround01(F48, F56, F58, F60);
4260 // save F48:F54 in temp registers
4261 __ movdtox(F54,G2);
4262 __ movdtox(F52,G3);
4263 __ movdtox(F50,L6);
4264 __ movdtox(F48,G1);
4265 for ( int i = 46; i >= 14; i -= 8 ) {
4266 __ aes_dround23(as_FloatRegister(i), F0, F2, F4);
4267 __ aes_dround01(as_FloatRegister(i-2), F0, F2, F6);
4268 __ aes_dround23(as_FloatRegister(i), F60, F62, F58);
4269 __ aes_dround01(as_FloatRegister(i-2), F60, F62, F56);
4270 __ aes_dround23(as_FloatRegister(i-4), F6, F4, F2);
4271 __ aes_dround01(as_FloatRegister(i-6), F6, F4, F0);
4272 __ aes_dround23(as_FloatRegister(i-4), F56, F58, F62);
4273 __ aes_dround01(as_FloatRegister(i-6), F56, F58, F60);
4274 }
4275 // init F48:F54 with F0:F6 values (original key)
4276 __ ldf(FloatRegisterImpl::D, original_key, 0, F48);
4277 __ ldf(FloatRegisterImpl::D, original_key, 8, F50);
4278 __ ldf(FloatRegisterImpl::D, original_key, 16, F52);
4279 __ ldf(FloatRegisterImpl::D, original_key, 24, F54);
4280 __ aes_dround23(F54, F0, F2, F4);
4281 __ aes_dround01(F52, F0, F2, F6);
4282 __ aes_dround23(F54, F60, F62, F58);
4283 __ aes_dround01(F52, F60, F62, F56);
4284 __ aes_dround23_l(F50, F6, F4, F2);
4285 __ aes_dround01_l(F48, F6, F4, F0);
4286 __ aes_dround23_l(F50, F56, F58, F62);
4287 __ aes_dround01_l(F48, F56, F58, F60);
4288 // re-init F48:F54 with their original values
4289 __ movxtod(G2,F54);
4290 __ movxtod(G3,F52);
4291 __ movxtod(L6,F50);
4292 __ movxtod(G1,F48);
4293
4294 __ movxtod(L0,F6);
4295 __ movxtod(L1,F4);
4296 __ fxor(FloatRegisterImpl::D, F6, F0, F0);
4297 __ fxor(FloatRegisterImpl::D, F4, F2, F2);
4298
4299 __ movxtod(G4,F56);
4300 __ movxtod(G5,F58);
4301 __ mov(L4,L0);
4302 __ mov(L5,L1);
4303 __ fxor(FloatRegisterImpl::D, F56, F60, F60);
4304 __ fxor(FloatRegisterImpl::D, F58, F62, F62);
4305
4306 // check for 8-byte alignment since dest byte array may have arbitrary alignment if offset mod 8 is non-zero
4307 __ andcc(to, 7, G1);
4308 __ br(Assembler::notZero, true, Assembler::pn, L_store_misaligned_output_next2_blocks256);
4309 __ delayed()->edge8n(to, G0, G2);
4310
4311 // aligned case: store output into the destination array
4312 __ stf(FloatRegisterImpl::D, F0, to, 0);
4313 __ stf(FloatRegisterImpl::D, F2, to, 8);
4314 __ stf(FloatRegisterImpl::D, F60, to, 16);
4315 __ stf(FloatRegisterImpl::D, F62, to, 24);
4316 __ ba_short(L_check_decrypt_loop_end256);
4317
4318 __ BIND(L_store_misaligned_output_next2_blocks256);
4319 __ mov(8, G4);
4320 __ sub(G4, G1, G4);
4321 __ alignaddr(G4, G0, G4);
4322 __ faligndata(F0, F2, F56); // F56 can be clobbered
4323 __ faligndata(F2, F60, F2);
4324 __ faligndata(F60, F62, F60);
4325 __ faligndata(F62, F0, F0);
4326 __ mov(to, G1);
4327 __ and3(to, -8, to);
4328 __ stpartialf(to, G2, F0, Assembler::ASI_PST8_PRIMARY);
4329 __ stf(FloatRegisterImpl::D, F56, to, 8);
4330 __ stf(FloatRegisterImpl::D, F2, to, 16);
4331 __ stf(FloatRegisterImpl::D, F60, to, 24);
4332 __ add(to, 32, to);
4333 __ orn(G0, G2, G2);
4334 __ stpartialf(to, G2, F0, Assembler::ASI_PST8_PRIMARY);
4335 __ mov(G1, to);
4336
4337 __ BIND(L_check_decrypt_loop_end256);
4338 __ add(from, 32, from);
4339 __ add(to, 32, to);
4340 __ subcc(len_reg, 32, len_reg);
4341 __ br(Assembler::notEqual, false, Assembler::pt, L_dec_next2_blocks256);
4342 __ delayed()->nop();
4343
4344 __ BIND(L_cbcdec_end);
4345     // re-init the initial vector (IV) for the next block, 8-byte alignment is guaranteed
4346 __ stx(L0, rvec, 0);
4347 __ stx(L1, rvec, 8);
4348 __ mov(L7, I0);
4349 __ ret();
4350 __ delayed()->restore();
4351
4352 return start;
4353 }
4354
4355 address generate_sha1_implCompress(bool multi_block, const char *name) {
4356 __ align(CodeEntryAlignment);
4357 StubCodeMark mark(this, "StubRoutines", name);
4358 address start = __ pc();
4359
4360 Label L_sha1_loop, L_sha1_unaligned_input, L_sha1_unaligned_input_loop;
4361 int i;
4362
4363 Register buf = O0; // byte[] source+offset
4364 Register state = O1; // int[] SHA.state
4365 Register ofs = O2; // int offset
4366 Register limit = O3; // int limit
4367
4368 // load state into F0-F4
4369 for (i = 0; i < 5; i++) {
4370 __ ldf(FloatRegisterImpl::S, state, i*4, as_FloatRegister(i));
4371 }
4372
4373 __ andcc(buf, 7, G0);
4374 __ br(Assembler::notZero, false, Assembler::pn, L_sha1_unaligned_input);
4375 __ delayed()->nop();
4376
4377 __ BIND(L_sha1_loop);
4378 // load buf into F8-F22
4379 for (i = 0; i < 8; i++) {
4380 __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 8));
4381 }
4382 __ sha1();
4383 if (multi_block) {
4384 __ add(ofs, 64, ofs);
4385 __ add(buf, 64, buf);
4386 __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha1_loop);
4387 __ mov(ofs, O0); // to be returned
4388 }
4389
4390 // store F0-F4 into state and return
4391 for (i = 0; i < 4; i++) {
4392 __ stf(FloatRegisterImpl::S, as_FloatRegister(i), state, i*4);
4393 }
4394 __ retl();
4395 __ delayed()->stf(FloatRegisterImpl::S, F4, state, 0x10);
4396
4397 __ BIND(L_sha1_unaligned_input);
4398 __ alignaddr(buf, G0, buf);
4399
4400 __ BIND(L_sha1_unaligned_input_loop);
4401 // load buf into F8-F22
4402 for (i = 0; i < 9; i++) {
4403 __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 8));
4404 }
4405 for (i = 0; i < 8; i++) {
4406 __ faligndata(as_FloatRegister(i*2 + 8), as_FloatRegister(i*2 + 10), as_FloatRegister(i*2 + 8));
4407 }
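    // with GSR.align set by the earlier alignaddr, faligndata extracts the 64 contiguous
    // message bytes spanning the nine doublewords loaded above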
4408 __ sha1();
4409 if (multi_block) {
4410 __ add(ofs, 64, ofs);
4411 __ add(buf, 64, buf);
4412 __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha1_unaligned_input_loop);
4413 __ mov(ofs, O0); // to be returned
4414 }
4415
4416 // store F0-F4 into state and return
4417 for (i = 0; i < 4; i++) {
4418 __ stf(FloatRegisterImpl::S, as_FloatRegister(i), state, i*4);
4419 }
4420 __ retl();
4421 __ delayed()->stf(FloatRegisterImpl::S, F4, state, 0x10);
4422
4423 return start;
4424 }
4425
4426 address generate_sha256_implCompress(bool multi_block, const char *name) {
4427 __ align(CodeEntryAlignment);
4428 StubCodeMark mark(this, "StubRoutines", name);
4429 address start = __ pc();
4430
4431 Label L_sha256_loop, L_sha256_unaligned_input, L_sha256_unaligned_input_loop;
4432 int i;
4433
4434 Register buf = O0; // byte[] source+offset
4435 Register state = O1; // int[] SHA2.state
4436 Register ofs = O2; // int offset
4437 Register limit = O3; // int limit
4438
4439 // load state into F0-F7
4440 for (i = 0; i < 8; i++) {
4441 __ ldf(FloatRegisterImpl::S, state, i*4, as_FloatRegister(i));
4442 }
4443
4444 __ andcc(buf, 7, G0);
4445 __ br(Assembler::notZero, false, Assembler::pn, L_sha256_unaligned_input);
4446 __ delayed()->nop();
4447
4448 __ BIND(L_sha256_loop);
4449 // load buf into F8-F22
4450 for (i = 0; i < 8; i++) {
4451 __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 8));
4452 }
4453 __ sha256();
4454 if (multi_block) {
4455 __ add(ofs, 64, ofs);
4456 __ add(buf, 64, buf);
4457 __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha256_loop);
4458 __ mov(ofs, O0); // to be returned
4459 }
4460
4461 // store F0-F7 into state and return
4462 for (i = 0; i < 7; i++) {
4463 __ stf(FloatRegisterImpl::S, as_FloatRegister(i), state, i*4);
4464 }
4465 __ retl();
4466 __ delayed()->stf(FloatRegisterImpl::S, F7, state, 0x1c);
4467
4468 __ BIND(L_sha256_unaligned_input);
4469 __ alignaddr(buf, G0, buf);
4470
4471 __ BIND(L_sha256_unaligned_input_loop);
4472 // load buf into F8-F22
4473 for (i = 0; i < 9; i++) {
4474 __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 8));
4475 }
4476 for (i = 0; i < 8; i++) {
4477 __ faligndata(as_FloatRegister(i*2 + 8), as_FloatRegister(i*2 + 10), as_FloatRegister(i*2 + 8));
4478 }
4479 __ sha256();
4480 if (multi_block) {
4481 __ add(ofs, 64, ofs);
4482 __ add(buf, 64, buf);
4483 __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha256_unaligned_input_loop);
4484 __ mov(ofs, O0); // to be returned
4485 }
4486
4487 // store F0-F7 into state and return
4488 for (i = 0; i < 7; i++) {
4489 __ stf(FloatRegisterImpl::S, as_FloatRegister(i), state, i*4);
4490 }
4491 __ retl();
4492 __ delayed()->stf(FloatRegisterImpl::S, F7, state, 0x1c);
4493
4494 return start;
4495 }
4496
4497 address generate_sha512_implCompress(bool multi_block, const char *name) {
4498 __ align(CodeEntryAlignment);
4499 StubCodeMark mark(this, "StubRoutines", name);
4500 address start = __ pc();
4501
4502 Label L_sha512_loop, L_sha512_unaligned_input, L_sha512_unaligned_input_loop;
4503 int i;
4504
4505 Register buf = O0; // byte[] source+offset
4506 Register state = O1; // long[] SHA5.state
4507 Register ofs = O2; // int offset
4508 Register limit = O3; // int limit
4509
4510 // load state into F0-F14
4511 for (i = 0; i < 8; i++) {
4512 __ ldf(FloatRegisterImpl::D, state, i*8, as_FloatRegister(i*2));
4513 }
4514
4515 __ andcc(buf, 7, G0);
4516 __ br(Assembler::notZero, false, Assembler::pn, L_sha512_unaligned_input);
4517 __ delayed()->nop();
4518
4519 __ BIND(L_sha512_loop);
4520 // load buf into F16-F46
4521 for (i = 0; i < 16; i++) {
4522 __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 16));
4523 }
4524 __ sha512();
4525 if (multi_block) {
4526 __ add(ofs, 128, ofs);
4527 __ add(buf, 128, buf);
4528 __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha512_loop);
4529 __ mov(ofs, O0); // to be returned
4530 }
4531
4532 // store F0-F14 into state and return
4533 for (i = 0; i < 7; i++) {
4534 __ stf(FloatRegisterImpl::D, as_FloatRegister(i*2), state, i*8);
4535 }
4536 __ retl();
4537 __ delayed()->stf(FloatRegisterImpl::D, F14, state, 0x38);
4538
4539 __ BIND(L_sha512_unaligned_input);
4540 __ alignaddr(buf, G0, buf);
4541
4542 __ BIND(L_sha512_unaligned_input_loop);
4543 // load buf into F16-F46
4544 for (i = 0; i < 17; i++) {
4545 __ ldf(FloatRegisterImpl::D, buf, i*8, as_FloatRegister(i*2 + 16));
4546 }
4547 for (i = 0; i < 16; i++) {
4548 __ faligndata(as_FloatRegister(i*2 + 16), as_FloatRegister(i*2 + 18), as_FloatRegister(i*2 + 16));
4549 }
4550 __ sha512();
4551 if (multi_block) {
4552 __ add(ofs, 128, ofs);
4553 __ add(buf, 128, buf);
4554 __ cmp_and_brx_short(ofs, limit, Assembler::lessEqual, Assembler::pt, L_sha512_unaligned_input_loop);
4555 __ mov(ofs, O0); // to be returned
4556 }
4557
4558 // store F0-F14 into state and return
4559 for (i = 0; i < 7; i++) {
4560 __ stf(FloatRegisterImpl::D, as_FloatRegister(i*2), state, i*8);
4561 }
4562 __ retl();
4563 __ delayed()->stf(FloatRegisterImpl::D, F14, state, 0x38);
4564
4565 return start;
4566 }
4567
4568 /* Single and multi-block ghash operations */
4569 address generate_ghash_processBlocks() {
4570 __ align(CodeEntryAlignment);
4571 Label L_ghash_loop, L_aligned, L_main;
4572 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4573 address start = __ pc();
4574
4575 Register state = I0;
4576 Register subkeyH = I1;
4577 Register data = I2;
4578 Register len = I3;
4579
4580 __ save_frame(0);
4581
4582 __ ldx(state, 0, O0);
4583 __ ldx(state, 8, O1);
4584
4585 // Loop label for multiblock operations
4586 __ BIND(L_ghash_loop);
4587
4588 // Check if 'data' is unaligned
4589 __ andcc(data, 7, G1);
4590 __ br(Assembler::zero, false, Assembler::pt, L_aligned);
4591 __ delayed()->nop();
4592
4593 Register left_shift = L1;
4594 Register right_shift = L2;
4595 Register data_ptr = L3;
4596
4597 // Get left and right shift values in bits
4598 __ sll(G1, LogBitsPerByte, left_shift);
4599 __ mov(64, right_shift);
4600 __ sub(right_shift, left_shift, right_shift);
4601
4602 // Align to read 'data'
4603 __ sub(data, G1, data_ptr);
4604
4605 // Load first 8 bytes of 'data'
4606 __ ldx(data_ptr, 0, O4);
4607 __ sllx(O4, left_shift, O4);
4608 __ ldx(data_ptr, 8, O5);
4609 __ srlx(O5, right_shift, G4);
4610 __ bset(G4, O4);
4611
4612 // Load second 8 bytes of 'data'
4613 __ sllx(O5, left_shift, O5);
4614 __ ldx(data_ptr, 16, G4);
4615 __ srlx(G4, right_shift, G4);
4616 __ ba(L_main);
4617 __ delayed()->bset(G4, O5);
4618
4619 // If 'data' is aligned, load normally
4620 __ BIND(L_aligned);
4621 __ ldx(data, 0, O4);
4622 __ ldx(data, 8, O5);
4623
4624 __ BIND(L_main);
4625 __ ldx(subkeyH, 0, O2);
4626 __ ldx(subkeyH, 8, O3);
4627
4628 __ xor3(O0, O4, O0);
4629 __ xor3(O1, O5, O1);
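    // Carry-less 128x128-bit multiply of (state ^ data) by the hash subkey H:
    // xmulx/xmulxhi produce the low/high 64 bits of each 64x64 partial product,
    // which are then combined and reduced modulo the GHASH field polynomial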
4630
4631 __ xmulxhi(O0, O3, G3);
4632 __ xmulx(O0, O2, O5);
4633 __ xmulxhi(O1, O2, G4);
4634 __ xmulxhi(O1, O3, G5);
4635 __ xmulx(O0, O3, G1);
4636 __ xmulx(O1, O3, G2);
4637 __ xmulx(O1, O2, O3);
4638 __ xmulxhi(O0, O2, O4);
4639
4640 __ mov(0xE1, O0);
4641 __ sllx(O0, 56, O0);
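    // 0xE1 << 56 is the standard GHASH reduction constant, encoding the field polynomial
    // x^128 + x^7 + x^2 + x + 1 in bit-reflected form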
4642
4643 __ xor3(O5, G3, O5);
4644 __ xor3(O5, G4, O5);
4645 __ xor3(G5, G1, G1);
4646 __ xor3(G1, O3, G1);
4647 __ srlx(G2, 63, O1);
4648 __ srlx(G1, 63, G3);
4649 __ sllx(G2, 63, O3);
4650 __ sllx(G2, 58, O2);
4651 __ xor3(O3, O2, O2);
4652
4653 __ sllx(G1, 1, G1);
4654 __ or3(G1, O1, G1);
4655
4656 __ xor3(G1, O2, G1);
4657
4658 __ sllx(G2, 1, G2);
4659
4660 __ xmulxhi(G1, O0, O1);
4661 __ xmulx(G1, O0, O2);
4662 __ xmulxhi(G2, O0, O3);
4663 __ xmulx(G2, O0, G1);
4664
4665 __ xor3(O4, O1, O4);
4666 __ xor3(O5, O2, O5);
4667 __ xor3(O5, O3, O5);
4668
4669 __ sllx(O4, 1, O2);
4670 __ srlx(O5, 63, O3);
4671
4672 __ or3(O2, O3, O0);
4673
4674 __ sllx(O5, 1, O1);
4675 __ srlx(G1, 63, O2);
4676 __ or3(O1, O2, O1);
4677 __ xor3(O1, G3, O1);
4678
4679 __ deccc(len);
4680 __ br(Assembler::notZero, true, Assembler::pt, L_ghash_loop);
4681 __ delayed()->add(data, 16, data);
4682
4683 __ stx(O0, I0, 0);
4684 __ stx(O1, I0, 8);
4685
4686 __ ret();
4687 __ delayed()->restore();
4688
4689 return start;
4690 }
4691
4692 /**
4693 * Arguments:
4694 *
4695 * Inputs:
4696 * O0 - int crc
4697 * O1 - byte* buf
4698 * O2 - int len
4699 * O3 - int* table
4700 *
4701 * Output:
4702 * O0 - int crc result
4703 */
4704 address generate_updateBytesCRC32C() {
4705 assert(UseCRC32CIntrinsics, "need CRC32C instruction");
4706
4707 __ align(CodeEntryAlignment);
4708 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
4709 address start = __ pc();
4710
4711 const Register crc = O0; // crc
4712 const Register buf = O1; // source java byte array address
4713 const Register len = O2; // number of bytes
4714 const Register table = O3; // byteTable
4715
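    // kernel_crc32c emits the hardware-assisted CRC-32C loop (Castagnoli
    // polynomial 0x1EDC6F41, bit-reflected). A bytewise reference of the same
    // checksum (sketch only, not the generated code):
    //   for each byte b:  crc = table[(crc ^ b) & 0xFF] ^ (crc >> 8);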
4716 __ kernel_crc32c(crc, buf, len, table);
4717
4718 __ retl();
4719 __ delayed()->nop();
4720
4721 return start;
4722 }
4723
4724 #define ADLER32_NUM_TEMPS 16
4725
4726 /**
4727 * Arguments:
4728 *
4729 * Inputs:
4730 * O0 - int adler
4731 * O1 - byte* buff
4732 * O2 - int len
4733 *
4734 * Output:
4735 * O0 - int adler result
4736 */
4737 address generate_updateBytesAdler32() {
4738 __ align(CodeEntryAlignment);
4739 StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
4740 address start = __ pc();
4741
4742 Label L_cleanup_loop, L_cleanup_loop_check;
4743 Label L_main_loop_check, L_main_loop, L_inner_loop, L_inner_loop_check;
4744 Label L_nmax_check_done;
4745
4746 // Aliases
4747 Register s1 = O0;
4748 Register s2 = O3;
4749 Register buff = O1;
4750 Register len = O2;
4751 Register temp[ADLER32_NUM_TEMPS] = {L0, L1, L2, L3, L4, L5, L6, L7, I0, I1, I2, I3, I4, I5, G3, I7};
4752
4753 // Max number of bytes we can process before having to take the mod
4754 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
4755 unsigned long NMAX = 0x15B0;
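    // For reference, the scalar recurrence this stub pipelines is (sketch only,
    // not the generated code):
    //   for each byte b:  s1 = (s1 + b) % 65521;  s2 = (s2 + s1) % 65521;
    //   adler = (s2 << 16) | s1;
    // The mod is deferred: sums are accumulated for up to NMAX bytes and reduced
    // once per chunk, which is why 0xFFF1 (65521) shows up in the udivx/mulx/sub
    // sequences below.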
4756
4757 // Zero-out the upper bits of len
4758 __ clruwu(len);
4759
4760 // Create the mask 0xFFFF
4761 __ set64(0x00FFFF, O4, O5); // O5 is the temp register
4762
4763 // s1 is initialized to the lower 16 bits of adler
4764 // s2 is initialized to the upper 16 bits of adler
4765 __ srlx(O0, 16, O5); // adler >> 16
4766 __ and3(O0, O4, s1); // s1 = (adler & 0xFFFF)
4767 __ and3(O5, O4, s2); // s2 = ((adler >> 16) & 0xFFFF)
4768
4769 // The pipelined loop needs at least 16 elements for 1 iteration
4770    // The loop checks this itself, but it is more efficient to branch straight to the cleanup loop
4771    // Set up the constant for the cutoff check
4772 __ mov(15, O4);
4773
4774 // Check if we are above the cutoff, if not go to the cleanup loop immediately
4775 __ cmp_and_br_short(len, O4, Assembler::lessEqualUnsigned, Assembler::pt, L_cleanup_loop_check);
4776
4777 // Free up some registers for our use
4778 for (int i = 0; i < ADLER32_NUM_TEMPS; i++) {
4779 __ movxtod(temp[i], as_FloatRegister(2*i));
4780 }
4781
4782    // Loop maintenance is done at the end of the loop, so skip there first
4783 __ ba_short(L_main_loop_check);
4784
4785 __ BIND(L_main_loop);
4786
4787 // Prologue for inner loop
4788 __ ldub(buff, 0, L0);
4789 __ dec(O5);
4790
4791 for (int i = 1; i < 8; i++) {
4792 __ ldub(buff, i, temp[i]);
4793 }
4794
4795 __ inc(buff, 8);
4796
4797    // The inner loop processes 16 elements at a time; it might never execute if there are
4798    // only 16 elements left for the outer loop to process
4799 __ ba_short(L_inner_loop_check);
4800
4801 __ BIND(L_inner_loop);
4802
4803 for (int i = 0; i < 8; i++) {
4804 __ ldub(buff, (2*i), temp[(8+(2*i)) % ADLER32_NUM_TEMPS]);
4805 __ add(s1, temp[i], s1);
4806 __ ldub(buff, (2*i)+1, temp[(8+(2*i)+1) % ADLER32_NUM_TEMPS]);
4807 __ add(s2, s1, s2);
4808 }
4809
4810 // Original temp 0-7 used and new loads to temp 0-7 issued
4811 // temp 8-15 ready to be consumed
4812 __ add(s1, I0, s1);
4813 __ dec(O5);
4814 __ add(s2, s1, s2);
4815 __ add(s1, I1, s1);
4816 __ inc(buff, 16);
4817 __ add(s2, s1, s2);
4818
4819 for (int i = 0; i < 6; i++) {
4820 __ add(s1, temp[10+i], s1);
4821 __ add(s2, s1, s2);
4822 }
4823
4824 __ BIND(L_inner_loop_check);
4825 __ nop();
4826 __ cmp_and_br_short(O5, 0, Assembler::notEqual, Assembler::pt, L_inner_loop);
4827
4828 // Epilogue
4829 for (int i = 0; i < 4; i++) {
4830 __ ldub(buff, (2*i), temp[8+(2*i)]);
4831 __ add(s1, temp[i], s1);
4832 __ ldub(buff, (2*i)+1, temp[8+(2*i)+1]);
4833 __ add(s2, s1, s2);
4834 }
4835
4836 __ add(s1, temp[4], s1);
4837 __ inc(buff, 8);
4838
4839 for (int i = 0; i < 11; i++) {
4840 __ add(s2, s1, s2);
4841 __ add(s1, temp[5+i], s1);
4842 }
4843
4844 __ add(s2, s1, s2);
4845
4846 // Take the mod for s1 and s2
4847 __ set64(0xFFF1, L0, L1);
4848 __ udivx(s1, L0, L1);
4849 __ udivx(s2, L0, L2);
4850 __ mulx(L0, L1, L1);
4851 __ mulx(L0, L2, L2);
4852 __ sub(s1, L1, s1);
4853 __ sub(s2, L2, s2);
4854
4855 // Make sure there is something left to process
4856 __ BIND(L_main_loop_check);
4857 __ set64(NMAX, L0, L1);
4858 // k = len < NMAX ? len : NMAX
4859 __ cmp_and_br_short(len, L0, Assembler::greaterEqualUnsigned, Assembler::pt, L_nmax_check_done);
4860 __ andn(len, 0x0F, L0); // only loop a multiple of 16 times
4861 __ BIND(L_nmax_check_done);
4862 __ mov(L0, O5);
4863 __ sub(len, L0, len); // len -= k
4864
4865    __ srlx(O5, 4, O5);     // number of multiples of 16
4866 __ cmp_and_br_short(O5, 0, Assembler::notEqual, Assembler::pt, L_main_loop);
4867
4868    // Restore the registers we used, take the mod one last time, combine and return
4869    // Restore the temps we spilled to the floating-point registers above
4870 for (int i = 0; i < ADLER32_NUM_TEMPS; i++) {
4871 __ movdtox(as_FloatRegister(2*i), temp[i]);
4872 }
4873
4874 // There might be nothing left to process
4875 __ ba_short(L_cleanup_loop_check);
4876
4877 __ BIND(L_cleanup_loop);
4878    __ ldub(buff, 0, O4);  // load a single byte from the buffer
4879 __ inc(buff); // buff++
4880 __ add(s1, O4, s1); // s1 += *buff++;
4881 __ dec(len); // len--
4882 __ add(s1, s2, s2); // s2 += s1;
4883 __ BIND(L_cleanup_loop_check);
4884 __ nop();
4885 __ cmp_and_br_short(len, 0, Assembler::notEqual, Assembler::pt, L_cleanup_loop);
4886
4887 // Take the mod one last time
4888 __ set64(0xFFF1, O1, O2);
4889 __ udivx(s1, O1, O2);
4890 __ udivx(s2, O1, O5);
4891 __ mulx(O1, O2, O2);
4892 __ mulx(O1, O5, O5);
4893 __ sub(s1, O2, s1);
4894 __ sub(s2, O5, s2);
4895
4896 // Combine lower bits and higher bits
4897 __ sllx(s2, 16, s2); // s2 = s2 << 16
4898 __ or3(s1, s2, s1); // adler = s2 | s1
4899 // Final return value is in O0
4900 __ retl();
4901 __ delayed()->nop();
4902
4903 return start;
4904 }
4905
4906 /**
4907 * Arguments:
4908 *
4909 * Inputs:
4910 * O0 - int crc
4911 * O1 - byte* buf
4912 * O2 - int len
4913 * O3 - int* table
4914 *
4915 * Output:
4916 * O0 - int crc result
4917 */
4918 address generate_updateBytesCRC32() {
4919 assert(UseCRC32Intrinsics, "need VIS3 instructions");
4920
4921 __ align(CodeEntryAlignment);
4922 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
4923 address start = __ pc();
4924
4925 const Register crc = O0; // crc
4926 const Register buf = O1; // source java byte array address
4927 const Register len = O2; // length
4928 const Register table = O3; // crc_table address (reuse register)
4929
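    // kernel_crc32 emits the VIS3-accelerated CRC-32 loop (zlib/IEEE polynomial,
    // bit-reflected 0xEDB88320) over the supplied table. Bytewise reference of
    // the same checksum (sketch only, not the generated code):
    //   for each byte b:  crc = table[(crc ^ b) & 0xFF] ^ (crc >> 8);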
4930 __ kernel_crc32(crc, buf, len, table);
4931
4932 __ retl();
4933 __ delayed()->nop();
4934
4935 return start;
4936 }
4937
4938 /**
4939 * Arguments:
4940 *
4941 * Inputs:
4942 * I0 - int* x-addr
4943 * I1 - int x-len
4944 * I2 - int* y-addr
4945 * I3 - int y-len
4946 * I4 - int* z-addr (output vector)
4947 * I5 - int z-len
4948 */
4949 address generate_multiplyToLen() {
4950 assert(UseMultiplyToLenIntrinsic, "need VIS3 instructions");
4951
4952 __ align(CodeEntryAlignment);
4953 StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
4954 address start = __ pc();
4955
4956 __ save_frame(0);
4957
4958 const Register xptr = I0; // input address
4959 const Register xlen = I1; // ...and length in 32b-words
4960    const Register yptr = I2; // input address
4961    const Register ylen = I3; // ...and length in 32b-words
4962 const Register zptr = I4; // output address
4963 const Register zlen = I5; // ...and length in 32b-words
4964
4965    /* The minimal "limb" representation suggests that odd-length vectors are as
4966     * likely as even-length ones. This in turn suggests that we need to cope
4967 * with odd/even length arrays and data not aligned properly for 64-bit read
4968 * and write operations. We thus use a number of different kernels:
4969 *
4970 * if (is_even(x.len) && is_even(y.len))
4971 * if (is_align64(x) && is_align64(y) && is_align64(z))
4972 * if (x.len == y.len && 16 <= x.len && x.len <= 64)
4973 * memv_mult_mpmul(...)
4974 * else
4975 * memv_mult_64x64(...)
4976 * else
4977 * memv_mult_64x64u(...)
4978 * else
4979 * memv_mult_32x32(...)
4980 *
4981 * Here we assume VIS3 support (for 'umulxhi', 'addxc' and 'addxccc').
4982 * In case CBCOND instructions are supported, we will use 'cxbX'. If the
4983 * MPMUL instruction is supported, we will generate a kernel using 'mpmul'
4984 * (for vectors with proper characteristics).
4985 */
4986 const Register tmp0 = L0;
4987 const Register tmp1 = L1;
4988
4989 Label L_mult_32x32;
4990 Label L_mult_64x64u;
4991 Label L_mult_64x64;
4992 Label L_exit;
4993
4994 if_both_even(xlen, ylen, tmp0, false, L_mult_32x32);
4995 if_all3_aligned(xptr, yptr, zptr, tmp1, 64, false, L_mult_64x64u);
4996
4997 if (UseMPMUL) {
4998 if_eq(xlen, ylen, false, L_mult_64x64);
4999 if_in_rng(xlen, 16, 64, tmp0, tmp1, false, L_mult_64x64);
5000
5001 // 1. Multiply naturally aligned 64b-datums using a generic 'mpmul' kernel,
5002 // operating on equal length vectors of size [16..64].
5003 gen_mult_mpmul(xlen, xptr, yptr, zptr, L_exit);
5004 }
5005
5006 // 2. Multiply naturally aligned 64-bit datums (64x64).
5007 __ bind(L_mult_64x64);
5008 gen_mult_64x64(xptr, xlen, yptr, ylen, zptr, zlen, L_exit);
5009
5010 // 3. Multiply unaligned 64-bit datums (64x64).
5011 __ bind(L_mult_64x64u);
5012 gen_mult_64x64_unaligned(xptr, xlen, yptr, ylen, zptr, zlen, L_exit);
5013
5014 // 4. Multiply naturally aligned 32-bit datums (32x32).
5015 __ bind(L_mult_32x32);
5016 gen_mult_32x32(xptr, xlen, yptr, ylen, zptr, zlen, L_exit);
5017
5018 __ bind(L_exit);
5019 __ ret();
5020 __ delayed()->restore();
5021
5022 return start;
5023 }
5024
5025 // Additional help functions used by multiplyToLen generation.
5026
5027 void if_both_even(Register r1, Register r2, Register tmp, bool iseven, Label &L)
5028 {
5029 __ or3(r1, r2, tmp);
5030 __ andcc(tmp, 0x1, tmp);
5031 __ br_icc_zero(iseven, Assembler::pn, L);
5032 }
5033
5034 void if_all3_aligned(Register r1, Register r2, Register r3,
5035 Register tmp, uint align, bool isalign, Label &L)
5036 {
5037 __ or3(r1, r2, tmp);
5038 __ or3(r3, tmp, tmp);
5039 __ andcc(tmp, (align - 1), tmp);
5040 __ br_icc_zero(isalign, Assembler::pn, L);
5041 }
5042
5043 void if_eq(Register x, Register y, bool iseq, Label &L)
5044 {
5045 Assembler::Condition cf = (iseq ? Assembler::equal : Assembler::notEqual);
5046 __ cmp_and_br_short(x, y, cf, Assembler::pt, L);
5047 }
5048
5049 void if_in_rng(Register x, int lb, int ub, Register t1, Register t2, bool inrng, Label &L)
5050 {
5051 assert(Assembler::is_simm13(lb), "Small ints only!");
5052 assert(Assembler::is_simm13(ub), "Small ints only!");
5053 // Compute (x - lb) * (ub - x) >= 0
5054 // NOTE: With the local use of this routine, we rely on small integers to
5055 // guarantee that we do not overflow in the multiplication.
5056 __ add(G0, ub, t2);
5057 __ sub(x, lb, t1);
5058 __ sub(t2, x, t2);
5059 __ mulx(t1, t2, t1);
5060 Assembler::Condition cf = (inrng ? Assembler::greaterEqual : Assembler::less);
5061 __ cmp_and_br_short(t1, G0, cf, Assembler::pt, L);
5062 }
5063
5064 void ldd_entry(Register base, Register offs, FloatRegister dest)
5065 {
5066 __ ldd(base, offs, dest);
5067 __ inc(offs, 8);
5068 }
5069
5070 void ldx_entry(Register base, Register offs, Register dest)
5071 {
5072 __ ldx(base, offs, dest);
5073 __ inc(offs, 8);
5074 }
5075
5076 void mpmul_entry(int m, Label &next)
5077 {
5078 __ mpmul(m);
5079 __ cbcond(Assembler::equal, Assembler::icc, G0, G0, next);
5080 }
5081
5082 void stx_entry(Label &L, Register r1, Register r2, Register base, Register offs)
5083 {
5084 __ bind(L);
5085 __ stx(r1, base, offs);
5086 __ inc(offs, 8);
5087 __ stx(r2, base, offs);
5088 __ inc(offs, 8);
5089 }
5090
5091 void offs_entry(Label &Lbl0, Label &Lbl1)
5092 {
5093 assert(Lbl0.is_bound(), "must be");
5094 assert(Lbl1.is_bound(), "must be");
5095
5096 int offset = Lbl0.loc_pos() - Lbl1.loc_pos();
5097
5098 __ emit_data(offset);
5099 }
5100
5101 /* Generate the actual multiplication kernels for BigInteger vectors:
5102 *
5103 * 1. gen_mult_mpmul(...)
5104 *
5105 * 2. gen_mult_64x64(...)
5106 *
5107 * 3. gen_mult_64x64_unaligned(...)
5108 *
5109 * 4. gen_mult_32x32(...)
5110 */
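  /* For reference, kernels 2-4 implement the classic schoolbook algorithm over
   * limbs; roughly (C-like sketch only, not the generated code):
   *
   *   c = 0;                                        // first row: z = x * y[last]
   *   for (i = xn - 1, k = zn - 1; i >= 0; i--, k--) {
   *     p = (u128) x[i] * y[yn - 1] + c;  z[k] = lo64(p);  c = hi64(p);
   *   }
   *   z[k] = c;
   *   for (j = yn - 2; j >= 0; j--) {               // remaining rows: accumulate
   *     c = 0;  zn--;
   *     for (i = xn - 1, k = zn - 1; i >= 0; i--, k--) {
   *       p = (u128) x[i] * y[j] + z[k] + c;  z[k] = lo64(p);  c = hi64(p);
   *     }
   *     z[k] = c;
   *   }
   */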
5111 void gen_mult_mpmul(Register len, Register xptr, Register yptr, Register zptr,
5112 Label &L_exit)
5113 {
5114 const Register zero = G0;
5115 const Register gxp = G1; // Need to use global registers across RWs.
5116 const Register gyp = G2;
5117 const Register gzp = G3;
5118 const Register disp = G4;
5119 const Register offs = G5;
5120
5121 __ mov(xptr, gxp);
5122 __ mov(yptr, gyp);
5123 __ mov(zptr, gzp);
5124
5125 /* Compute jump vector entry:
5126 *
5127 * 1. mpmul input size (0..31) x 64b
5128 * 2. vector input size in 32b limbs (even number)
5129 * 3. branch entries in reverse order (31..0), using two
5130 * instructions per entry (2 * 4 bytes).
5131 *
5132 * displacement = byte_offset(bra_offset(len))
5133 * = byte_offset((64 - len)/2)
5134 * = 8 * (64 - len)/2
5135 * = 4 * (64 - len)
5136 */
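    // For example: a full-size multiply (len == 64 32-bit limbs) gives disp == 0
    // and falls through all 32 load entries below (256 bytes of x), while the
    // smallest accepted case (len == 16) gives disp == 4 * 48 == 192, skipping
    // the first 24 two-instruction entries (24 * 8 bytes) and loading 64 bytes.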
5137 Register temp = I5; // Alright to use input regs. in first batch.
5138
5139 __ sub(zero, len, temp);
5140 __ add(temp, 64, temp);
5141 __ sllx(temp, 2, disp); // disp := (64 - len) << 2
5142
5143    // Dispatch relative to the current PC, into the instruction table below.
5144 __ rdpc(temp);
5145 __ add(temp, 16, temp);
5146 __ jmp(temp, disp);
5147 __ delayed()->clr(offs);
5148
5149 ldd_entry(gxp, offs, F22);
5150 ldd_entry(gxp, offs, F20);
5151 ldd_entry(gxp, offs, F18);
5152 ldd_entry(gxp, offs, F16);
5153 ldd_entry(gxp, offs, F14);
5154 ldd_entry(gxp, offs, F12);
5155 ldd_entry(gxp, offs, F10);
5156 ldd_entry(gxp, offs, F8);
5157 ldd_entry(gxp, offs, F6);
5158 ldd_entry(gxp, offs, F4);
5159 ldx_entry(gxp, offs, I5);
5160 ldx_entry(gxp, offs, I4);
5161 ldx_entry(gxp, offs, I3);
5162 ldx_entry(gxp, offs, I2);
5163 ldx_entry(gxp, offs, I1);
5164 ldx_entry(gxp, offs, I0);
5165 ldx_entry(gxp, offs, L7);
5166 ldx_entry(gxp, offs, L6);
5167 ldx_entry(gxp, offs, L5);
5168 ldx_entry(gxp, offs, L4);
5169 ldx_entry(gxp, offs, L3);
5170 ldx_entry(gxp, offs, L2);
5171 ldx_entry(gxp, offs, L1);
5172 ldx_entry(gxp, offs, L0);
5173 ldd_entry(gxp, offs, F2);
5174 ldd_entry(gxp, offs, F0);
5175 ldx_entry(gxp, offs, O5);
5176 ldx_entry(gxp, offs, O4);
5177 ldx_entry(gxp, offs, O3);
5178 ldx_entry(gxp, offs, O2);
5179 ldx_entry(gxp, offs, O1);
5180 ldx_entry(gxp, offs, O0);
5181
5182 __ save(SP, -176, SP);
5183
5184 const Register addr = gxp; // Alright to reuse 'gxp'.
5185
5186    // Dispatch relative to the current PC, into the instruction table below.
5187 __ rdpc(addr);
5188 __ add(addr, 16, addr);
5189 __ jmp(addr, disp);
5190 __ delayed()->clr(offs);
5191
5192 ldd_entry(gyp, offs, F58);
5193 ldd_entry(gyp, offs, F56);
5194 ldd_entry(gyp, offs, F54);
5195 ldd_entry(gyp, offs, F52);
5196 ldd_entry(gyp, offs, F50);
5197 ldd_entry(gyp, offs, F48);
5198 ldd_entry(gyp, offs, F46);
5199 ldd_entry(gyp, offs, F44);
5200 ldd_entry(gyp, offs, F42);
5201 ldd_entry(gyp, offs, F40);
5202 ldd_entry(gyp, offs, F38);
5203 ldd_entry(gyp, offs, F36);
5204 ldd_entry(gyp, offs, F34);
5205 ldd_entry(gyp, offs, F32);
5206 ldd_entry(gyp, offs, F30);
5207 ldd_entry(gyp, offs, F28);
5208 ldd_entry(gyp, offs, F26);
5209 ldd_entry(gyp, offs, F24);
5210 ldx_entry(gyp, offs, O5);
5211 ldx_entry(gyp, offs, O4);
5212 ldx_entry(gyp, offs, O3);
5213 ldx_entry(gyp, offs, O2);
5214 ldx_entry(gyp, offs, O1);
5215 ldx_entry(gyp, offs, O0);
5216 ldx_entry(gyp, offs, L7);
5217 ldx_entry(gyp, offs, L6);
5218 ldx_entry(gyp, offs, L5);
5219 ldx_entry(gyp, offs, L4);
5220 ldx_entry(gyp, offs, L3);
5221 ldx_entry(gyp, offs, L2);
5222 ldx_entry(gyp, offs, L1);
5223 ldx_entry(gyp, offs, L0);
5224
5225 __ save(SP, -176, SP);
5226 __ save(SP, -176, SP);
5227 __ save(SP, -176, SP);
5228 __ save(SP, -176, SP);
5229 __ save(SP, -176, SP);
5230
5231 Label L_mpmul_restore_4, L_mpmul_restore_3, L_mpmul_restore_2;
5232 Label L_mpmul_restore_1, L_mpmul_restore_0;
5233
5234    // Dispatch relative to the current PC, into the instruction table below.
5235 __ rdpc(addr);
5236 __ add(addr, 16, addr);
5237 __ jmp(addr, disp);
5238 __ delayed()->clr(offs);
5239
5240 mpmul_entry(31, L_mpmul_restore_0);
5241 mpmul_entry(30, L_mpmul_restore_0);
5242 mpmul_entry(29, L_mpmul_restore_0);
5243 mpmul_entry(28, L_mpmul_restore_0);
5244 mpmul_entry(27, L_mpmul_restore_1);
5245 mpmul_entry(26, L_mpmul_restore_1);
5246 mpmul_entry(25, L_mpmul_restore_1);
5247 mpmul_entry(24, L_mpmul_restore_1);
5248 mpmul_entry(23, L_mpmul_restore_1);
5249 mpmul_entry(22, L_mpmul_restore_1);
5250 mpmul_entry(21, L_mpmul_restore_1);
5251 mpmul_entry(20, L_mpmul_restore_2);
5252 mpmul_entry(19, L_mpmul_restore_2);
5253 mpmul_entry(18, L_mpmul_restore_2);
5254 mpmul_entry(17, L_mpmul_restore_2);
5255 mpmul_entry(16, L_mpmul_restore_2);
5256 mpmul_entry(15, L_mpmul_restore_2);
5257 mpmul_entry(14, L_mpmul_restore_2);
5258 mpmul_entry(13, L_mpmul_restore_3);
5259 mpmul_entry(12, L_mpmul_restore_3);
5260 mpmul_entry(11, L_mpmul_restore_3);
5261 mpmul_entry(10, L_mpmul_restore_3);
5262 mpmul_entry( 9, L_mpmul_restore_3);
5263 mpmul_entry( 8, L_mpmul_restore_3);
5264 mpmul_entry( 7, L_mpmul_restore_3);
5265 mpmul_entry( 6, L_mpmul_restore_4);
5266 mpmul_entry( 5, L_mpmul_restore_4);
5267 mpmul_entry( 4, L_mpmul_restore_4);
5268 mpmul_entry( 3, L_mpmul_restore_4);
5269 mpmul_entry( 2, L_mpmul_restore_4);
5270 mpmul_entry( 1, L_mpmul_restore_4);
5271 mpmul_entry( 0, L_mpmul_restore_4);
5272
5273 Label L_z31, L_z30, L_z29, L_z28, L_z27, L_z26, L_z25, L_z24;
5274 Label L_z23, L_z22, L_z21, L_z20, L_z19, L_z18, L_z17, L_z16;
5275 Label L_z15, L_z14, L_z13, L_z12, L_z11, L_z10, L_z09, L_z08;
5276 Label L_z07, L_z06, L_z05, L_z04, L_z03, L_z02, L_z01, L_z00;
5277
5278 Label L_zst_base; // Store sequence base address.
5279 __ bind(L_zst_base);
5280
5281 stx_entry(L_z31, L7, L6, gzp, offs);
5282 stx_entry(L_z30, L5, L4, gzp, offs);
5283 stx_entry(L_z29, L3, L2, gzp, offs);
5284 stx_entry(L_z28, L1, L0, gzp, offs);
5285 __ restore();
5286 stx_entry(L_z27, O5, O4, gzp, offs);
5287 stx_entry(L_z26, O3, O2, gzp, offs);
5288 stx_entry(L_z25, O1, O0, gzp, offs);
5289 stx_entry(L_z24, L7, L6, gzp, offs);
5290 stx_entry(L_z23, L5, L4, gzp, offs);
5291 stx_entry(L_z22, L3, L2, gzp, offs);
5292 stx_entry(L_z21, L1, L0, gzp, offs);
5293 __ restore();
5294 stx_entry(L_z20, O5, O4, gzp, offs);
5295 stx_entry(L_z19, O3, O2, gzp, offs);
5296 stx_entry(L_z18, O1, O0, gzp, offs);
5297 stx_entry(L_z17, L7, L6, gzp, offs);
5298 stx_entry(L_z16, L5, L4, gzp, offs);
5299 stx_entry(L_z15, L3, L2, gzp, offs);
5300 stx_entry(L_z14, L1, L0, gzp, offs);
5301 __ restore();
5302 stx_entry(L_z13, O5, O4, gzp, offs);
5303 stx_entry(L_z12, O3, O2, gzp, offs);
5304 stx_entry(L_z11, O1, O0, gzp, offs);
5305 stx_entry(L_z10, L7, L6, gzp, offs);
5306 stx_entry(L_z09, L5, L4, gzp, offs);
5307 stx_entry(L_z08, L3, L2, gzp, offs);
5308 stx_entry(L_z07, L1, L0, gzp, offs);
5309 __ restore();
5310 stx_entry(L_z06, O5, O4, gzp, offs);
5311 stx_entry(L_z05, O3, O2, gzp, offs);
5312 stx_entry(L_z04, O1, O0, gzp, offs);
5313 stx_entry(L_z03, L7, L6, gzp, offs);
5314 stx_entry(L_z02, L5, L4, gzp, offs);
5315 stx_entry(L_z01, L3, L2, gzp, offs);
5316 stx_entry(L_z00, L1, L0, gzp, offs);
5317
5318 __ restore();
5319 __ restore();
5320 // Exit out of 'mpmul' routine, back to multiplyToLen.
5321 __ ba_short(L_exit);
5322
5323 Label L_zst_offs;
5324 __ bind(L_zst_offs);
5325
5326 offs_entry(L_z31, L_zst_base); // index 31: 2048x2048
5327 offs_entry(L_z30, L_zst_base);
5328 offs_entry(L_z29, L_zst_base);
5329 offs_entry(L_z28, L_zst_base);
5330 offs_entry(L_z27, L_zst_base);
5331 offs_entry(L_z26, L_zst_base);
5332 offs_entry(L_z25, L_zst_base);
5333 offs_entry(L_z24, L_zst_base);
5334 offs_entry(L_z23, L_zst_base);
5335 offs_entry(L_z22, L_zst_base);
5336 offs_entry(L_z21, L_zst_base);
5337 offs_entry(L_z20, L_zst_base);
5338 offs_entry(L_z19, L_zst_base);
5339 offs_entry(L_z18, L_zst_base);
5340 offs_entry(L_z17, L_zst_base);
5341 offs_entry(L_z16, L_zst_base);
5342 offs_entry(L_z15, L_zst_base);
5343 offs_entry(L_z14, L_zst_base);
5344 offs_entry(L_z13, L_zst_base);
5345 offs_entry(L_z12, L_zst_base);
5346 offs_entry(L_z11, L_zst_base);
5347 offs_entry(L_z10, L_zst_base);
5348 offs_entry(L_z09, L_zst_base);
5349 offs_entry(L_z08, L_zst_base);
5350 offs_entry(L_z07, L_zst_base);
5351 offs_entry(L_z06, L_zst_base);
5352 offs_entry(L_z05, L_zst_base);
5353 offs_entry(L_z04, L_zst_base);
5354 offs_entry(L_z03, L_zst_base);
5355 offs_entry(L_z02, L_zst_base);
5356 offs_entry(L_z01, L_zst_base);
5357 offs_entry(L_z00, L_zst_base); // index 0: 64x64
5358
5359 __ bind(L_mpmul_restore_4);
5360 __ restore();
5361 __ bind(L_mpmul_restore_3);
5362 __ restore();
5363 __ bind(L_mpmul_restore_2);
5364 __ restore();
5365 __ bind(L_mpmul_restore_1);
5366 __ restore();
5367 __ bind(L_mpmul_restore_0);
5368
5369 // Dispatch via offset vector entry, into z-store sequence.
5370 Label L_zst_rdpc;
5371 __ bind(L_zst_rdpc);
5372
5373 assert(L_zst_base.is_bound(), "must be");
5374 assert(L_zst_offs.is_bound(), "must be");
5375 assert(L_zst_rdpc.is_bound(), "must be");
5376
5377 int dbase = L_zst_rdpc.loc_pos() - L_zst_base.loc_pos();
5378 int doffs = L_zst_rdpc.loc_pos() - L_zst_offs.loc_pos();
5379
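    // disp still holds 4 * (64 - len) from the load dispatch above (8 bytes per
    // load entry); halving it yields the byte index into the 4-byte offset table
    // at L_zst_offs. The fetched word is the distance from L_zst_base to the
    // matching L_zNN label, so the jump below enters the z-store sequence at the
    // correct point.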
5380 temp = gyp; // Alright to reuse 'gyp'.
5381
5382 __ rdpc(addr);
5383 __ sub(addr, doffs, temp);
5384 __ srlx(disp, 1, disp);
5385 __ lduw(temp, disp, offs);
5386 __ sub(addr, dbase, temp);
5387 __ jmp(temp, offs);
5388 __ delayed()->clr(offs);
5389 }
5390
5391 void gen_mult_64x64(Register xp, Register xn,
5392 Register yp, Register yn,
5393 Register zp, Register zn, Label &L_exit)
5394 {
5395 // Assuming that a stack frame has already been created, i.e. local and
5396 // output registers are available for immediate use.
5397
5398 const Register ri = L0; // Outer loop index, xv[i]
5399 const Register rj = L1; // Inner loop index, yv[j]
5400 const Register rk = L2; // Output loop index, zv[k]
5401 const Register rx = L4; // x-vector datum [i]
5402 const Register ry = L5; // y-vector datum [j]
5403 const Register rz = L6; // z-vector datum [k]
5404 const Register rc = L7; // carry over (to z-vector datum [k-1])
5405
5406 const Register lop = O0; // lo-64b product
5407 const Register hip = O1; // hi-64b product
5408
5409 const Register zero = G0;
5410
5411 Label L_loop_i, L_exit_loop_i;
5412 Label L_loop_j;
5413 Label L_loop_i2, L_exit_loop_i2;
5414
5415 __ srlx(xn, 1, xn); // index for u32 to u64 ditto
5416 __ srlx(yn, 1, yn); // index for u32 to u64 ditto
5417 __ srlx(zn, 1, zn); // index for u32 to u64 ditto
5418 __ dec(xn); // Adjust [0..(N/2)-1]
5419 __ dec(yn);
5420 __ dec(zn);
5421 __ clr(rc); // u64 c = 0
5422 __ sllx(xn, 3, ri); // int i = xn (byte offset i = 8*xn)
5423    __ sllx(yn, 3, rj);     // int j = yn (byte offset j = 8*yn)
5424 __ sllx(zn, 3, rk); // int k = zn (byte offset k = 8*zn)
5425 __ ldx(yp, rj, ry); // u64 y = yp[yn]
5426
5427 // for (int i = xn; i >= 0; i--)
5428 __ bind(L_loop_i);
5429
5430 __ cmp_and_br_short(ri, 0, // i >= 0
5431 Assembler::less, Assembler::pn, L_exit_loop_i);
5432 __ ldx(xp, ri, rx); // x = xp[i]
5433 __ mulx(rx, ry, lop); // lo-64b-part of result 64x64
5434 __ umulxhi(rx, ry, hip); // hi-64b-part of result 64x64
5435 __ addcc(rc, lop, lop); // Accumulate lower order bits (producing carry)
5436 __ addxc(hip, zero, rc); // carry over to next datum [k-1]
5437 __ stx(lop, zp, rk); // z[k] = lop
5438 __ dec(rk, 8); // k--
5439 __ dec(ri, 8); // i--
5440 __ ba_short(L_loop_i);
5441
5442 __ bind(L_exit_loop_i);
5443 __ stx(rc, zp, rk); // z[k] = c
5444
5445 // for (int j = yn - 1; j >= 0; j--)
5446 __ sllx(yn, 3, rj); // int j = yn - 1 (byte offset j = 8*yn)
5447 __ dec(rj, 8);
5448
5449 __ bind(L_loop_j);
5450
5451 __ cmp_and_br_short(rj, 0, // j >= 0
5452 Assembler::less, Assembler::pn, L_exit);
5453 __ clr(rc); // u64 c = 0
5454 __ ldx(yp, rj, ry); // u64 y = yp[j]
5455
5456 // for (int i = xn, k = --zn; i >= 0; i--)
5457 __ dec(zn); // --zn
5458 __ sllx(xn, 3, ri); // int i = xn (byte offset i = 8*xn)
5459 __ sllx(zn, 3, rk); // int k = zn (byte offset k = 8*zn)
5460
5461 __ bind(L_loop_i2);
5462
5463 __ cmp_and_br_short(ri, 0, // i >= 0
5464 Assembler::less, Assembler::pn, L_exit_loop_i2);
5465 __ ldx(xp, ri, rx); // x = xp[i]
5466 __ ldx(zp, rk, rz); // z = zp[k], accumulator
5467 __ mulx(rx, ry, lop); // lo-64b-part of result 64x64
5468 __ umulxhi(rx, ry, hip); // hi-64b-part of result 64x64
5469 __ addcc(rz, rc, rz); // Accumulate lower order bits,
5470 __ addxc(hip, zero, rc); // Accumulate higher order bits to carry
5471 __ addcc(rz, lop, rz); // z += lo(p) + c
5472 __ addxc(rc, zero, rc);
5473 __ stx(rz, zp, rk); // zp[k] = z
5474 __ dec(rk, 8); // k--
5475 __ dec(ri, 8); // i--
5476 __ ba_short(L_loop_i2);
5477
5478 __ bind(L_exit_loop_i2);
5479 __ stx(rc, zp, rk); // z[k] = c
5480 __ dec(rj, 8); // j--
5481 __ ba_short(L_loop_j);
5482 }
5483
5484 void gen_mult_64x64_unaligned(Register xp, Register xn,
5485 Register yp, Register yn,
5486 Register zp, Register zn, Label &L_exit)
5487 {
5488 // Assuming that a stack frame has already been created, i.e. local and
5489 // output registers are available for use.
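    // Same schoolbook algorithm as gen_mult_64x64 above, but each 64-bit limb is
    // assembled from two 32-bit lduw loads (and written back with two stw) since
    // the vectors need not be 64-bit aligned, and pointer cursors replace indices.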
5490
5491 const Register xpc = L0; // Outer loop cursor, xp[i]
5492 const Register ypc = L1; // Inner loop cursor, yp[j]
5493 const Register zpc = L2; // Output loop cursor, zp[k]
5494 const Register rx = L4; // x-vector datum [i]
5495 const Register ry = L5; // y-vector datum [j]
5496 const Register rz = L6; // z-vector datum [k]
5497 const Register rc = L7; // carry over (to z-vector datum [k-1])
5498 const Register rt = O2;
5499
5500 const Register lop = O0; // lo-64b product
5501 const Register hip = O1; // hi-64b product
5502
5503 const Register zero = G0;
5504
5505 Label L_loop_i, L_exit_loop_i;
5506 Label L_loop_j;
5507 Label L_loop_i2, L_exit_loop_i2;
5508
5509 __ srlx(xn, 1, xn); // index for u32 to u64 ditto
5510 __ srlx(yn, 1, yn); // index for u32 to u64 ditto
5511 __ srlx(zn, 1, zn); // index for u32 to u64 ditto
5512 __ dec(xn); // Adjust [0..(N/2)-1]
5513 __ dec(yn);
5514 __ dec(zn);
5515 __ clr(rc); // u64 c = 0
5516 __ sllx(xn, 3, xpc); // u32* xpc = &xp[xn] (byte offset 8*xn)
5517 __ add(xp, xpc, xpc);
5518 __ sllx(yn, 3, ypc); // u32* ypc = &yp[yn] (byte offset 8*yn)
5519 __ add(yp, ypc, ypc);
5520 __ sllx(zn, 3, zpc); // u32* zpc = &zp[zn] (byte offset 8*zn)
5521 __ add(zp, zpc, zpc);
5522 __ lduw(ypc, 0, rt); // u64 y = yp[yn]
5523 __ lduw(ypc, 4, ry); // ...
5524 __ sllx(rt, 32, rt);
5525 __ or3(rt, ry, ry);
5526
5527 // for (int i = xn; i >= 0; i--)
5528 __ bind(L_loop_i);
5529
5530 __ cmp_and_brx_short(xpc, xp,// i >= 0
5531 Assembler::lessUnsigned, Assembler::pn, L_exit_loop_i);
5532 __ lduw(xpc, 0, rt); // u64 x = xp[i]
5533 __ lduw(xpc, 4, rx); // ...
5534 __ sllx(rt, 32, rt);
5535 __ or3(rt, rx, rx);
5536 __ mulx(rx, ry, lop); // lo-64b-part of result 64x64
5537 __ umulxhi(rx, ry, hip); // hi-64b-part of result 64x64
5538 __ addcc(rc, lop, lop); // Accumulate lower order bits (producing carry)
5539 __ addxc(hip, zero, rc); // carry over to next datum [k-1]
5540 __ srlx(lop, 32, rt);
5541 __ stw(rt, zpc, 0); // z[k] = lop
5542 __ stw(lop, zpc, 4); // ...
5543 __ dec(zpc, 8); // k-- (zpc--)
5544 __ dec(xpc, 8); // i-- (xpc--)
5545 __ ba_short(L_loop_i);
5546
5547 __ bind(L_exit_loop_i);
5548 __ srlx(rc, 32, rt);
5549 __ stw(rt, zpc, 0); // z[k] = c
5550 __ stw(rc, zpc, 4);
5551
5552 // for (int j = yn - 1; j >= 0; j--)
5553 __ sllx(yn, 3, ypc); // u32* ypc = &yp[yn] (byte offset 8*yn)
5554 __ add(yp, ypc, ypc);
5555 __ dec(ypc, 8); // yn - 1 (ypc--)
5556
5557 __ bind(L_loop_j);
5558
5559 __ cmp_and_brx_short(ypc, yp,// j >= 0
5560 Assembler::lessUnsigned, Assembler::pn, L_exit);
5561 __ clr(rc); // u64 c = 0
5562 __ lduw(ypc, 0, rt); // u64 y = yp[j] (= *ypc)
5563 __ lduw(ypc, 4, ry); // ...
5564 __ sllx(rt, 32, rt);
5565 __ or3(rt, ry, ry);
5566
5567 // for (int i = xn, k = --zn; i >= 0; i--)
5568 __ sllx(xn, 3, xpc); // u32* xpc = &xp[xn] (byte offset 8*xn)
5569 __ add(xp, xpc, xpc);
5570 __ dec(zn); // --zn
5571 __ sllx(zn, 3, zpc); // u32* zpc = &zp[zn] (byte offset 8*zn)
5572 __ add(zp, zpc, zpc);
5573
5574 __ bind(L_loop_i2);
5575
5576 __ cmp_and_brx_short(xpc, xp,// i >= 0
5577 Assembler::lessUnsigned, Assembler::pn, L_exit_loop_i2);
5578 __ lduw(xpc, 0, rt); // u64 x = xp[i] (= *xpc)
5579 __ lduw(xpc, 4, rx); // ...
5580 __ sllx(rt, 32, rt);
5581 __ or3(rt, rx, rx);
5582
5583 __ lduw(zpc, 0, rt); // u64 z = zp[k] (= *zpc)
5584 __ lduw(zpc, 4, rz); // ...
5585 __ sllx(rt, 32, rt);
5586 __ or3(rt, rz, rz);
5587
5588 __ mulx(rx, ry, lop); // lo-64b-part of result 64x64
5589 __ umulxhi(rx, ry, hip); // hi-64b-part of result 64x64
5590 __ addcc(rz, rc, rz); // Accumulate lower order bits...
5591 __ addxc(hip, zero, rc); // Accumulate higher order bits to carry
5592 __ addcc(rz, lop, rz); // ... z += lo(p) + c
5593 __ addxccc(rc, zero, rc);
5594 __ srlx(rz, 32, rt);
5595 __ stw(rt, zpc, 0); // zp[k] = z (*zpc = z)
5596 __ stw(rz, zpc, 4);
5597 __ dec(zpc, 8); // k-- (zpc--)
5598 __ dec(xpc, 8); // i-- (xpc--)
5599 __ ba_short(L_loop_i2);
5600
5601 __ bind(L_exit_loop_i2);
5602 __ srlx(rc, 32, rt);
5603 __ stw(rt, zpc, 0); // z[k] = c
5604 __ stw(rc, zpc, 4);
5605 __ dec(ypc, 8); // j-- (ypc--)
5606 __ ba_short(L_loop_j);
5607 }
5608
5609 void gen_mult_32x32(Register xp, Register xn,
5610 Register yp, Register yn,
5611 Register zp, Register zn, Label &L_exit)
5612 {
5613 // Assuming that a stack frame has already been created, i.e. local and
5614 // output registers are available for use.
5615
5616 const Register ri = L0; // Outer loop index, xv[i]
5617 const Register rj = L1; // Inner loop index, yv[j]
5618 const Register rk = L2; // Output loop index, zv[k]
5619 const Register rx = L4; // x-vector datum [i]
5620 const Register ry = L5; // y-vector datum [j]
5621 const Register rz = L6; // z-vector datum [k]
5622 const Register rc = L7; // carry over (to z-vector datum [k-1])
5623
5624 const Register p64 = O0; // 64b product
5625 const Register z65 = O1; // carry+64b accumulator
5626 const Register c65 = O2; // carry at bit 65
5627 const Register c33 = O2; // carry at bit 33 (after shift)
5628
5629 const Register zero = G0;
5630
5631 Label L_loop_i, L_exit_loop_i;
5632 Label L_loop_j;
5633 Label L_loop_i2, L_exit_loop_i2;
5634
5635 __ dec(xn); // Adjust [0..N-1]
5636 __ dec(yn);
5637 __ dec(zn);
5638 __ clr(rc); // u32 c = 0
5639 __ sllx(xn, 2, ri); // int i = xn (byte offset i = 4*xn)
5640    __ sllx(yn, 2, rj);     // int j = yn (byte offset j = 4*yn)
5641 __ sllx(zn, 2, rk); // int k = zn (byte offset k = 4*zn)
5642 __ lduw(yp, rj, ry); // u32 y = yp[yn]
5643
5644 // for (int i = xn; i >= 0; i--)
5645 __ bind(L_loop_i);
5646
5647 __ cmp_and_br_short(ri, 0, // i >= 0
5648 Assembler::less, Assembler::pn, L_exit_loop_i);
5649 __ lduw(xp, ri, rx); // x = xp[i]
5650 __ mulx(rx, ry, p64); // 64b result of 32x32
5651 __ addcc(rc, p64, z65); // Accumulate to 65 bits (producing carry)
5652 __ addxc(zero, zero, c65); // Materialise carry (in bit 65) into lsb,
5653 __ sllx(c65, 32, c33); // and shift into bit 33
5654 __ srlx(z65, 32, rc); // carry = c33 | hi(z65) >> 32
5655 __ add(c33, rc, rc); // carry over to next datum [k-1]
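    // (c33 is either 0 or 1 << 32 and rc fits in 32 bits here, so the add above
    // cannot carry and is equivalent to an or.)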
5656 __ stw(z65, zp, rk); // z[k] = lo(z65)
5657 __ dec(rk, 4); // k--
5658 __ dec(ri, 4); // i--
5659 __ ba_short(L_loop_i);
5660
5661 __ bind(L_exit_loop_i);
5662 __ stw(rc, zp, rk); // z[k] = c
5663
5664 // for (int j = yn - 1; j >= 0; j--)
5665 __ sllx(yn, 2, rj); // int j = yn - 1 (byte offset j = 4*yn)
5666 __ dec(rj, 4);
5667
5668 __ bind(L_loop_j);
5669
5670 __ cmp_and_br_short(rj, 0, // j >= 0
5671 Assembler::less, Assembler::pn, L_exit);
5672 __ clr(rc); // u32 c = 0
5673 __ lduw(yp, rj, ry); // u32 y = yp[j]
5674
5675 // for (int i = xn, k = --zn; i >= 0; i--)
5676 __ dec(zn); // --zn
5677 __ sllx(xn, 2, ri); // int i = xn (byte offset i = 4*xn)
5678 __ sllx(zn, 2, rk); // int k = zn (byte offset k = 4*zn)
5679
5680 __ bind(L_loop_i2);
5681
5682 __ cmp_and_br_short(ri, 0, // i >= 0
5683 Assembler::less, Assembler::pn, L_exit_loop_i2);
5684 __ lduw(xp, ri, rx); // x = xp[i]
5685 __ lduw(zp, rk, rz); // z = zp[k], accumulator
5686 __ mulx(rx, ry, p64); // 64b result of 32x32
5687 __ add(rz, rc, rz); // Accumulate lower order bits,
5688 __ addcc(rz, p64, z65); // z += lo(p64) + c
5689 __ addxc(zero, zero, c65); // Materialise carry (in bit 65) into lsb,
5690 __ sllx(c65, 32, c33); // and shift into bit 33
5691 __ srlx(z65, 32, rc); // carry = c33 | hi(z65) >> 32
5692 __ add(c33, rc, rc); // carry over to next datum [k-1]
5693 __ stw(z65, zp, rk); // zp[k] = lo(z65)
5694 __ dec(rk, 4); // k--
5695 __ dec(ri, 4); // i--
5696 __ ba_short(L_loop_i2);
5697
5698 __ bind(L_exit_loop_i2);
5699 __ stw(rc, zp, rk); // z[k] = c
5700 __ dec(rj, 4); // j--
5701 __ ba_short(L_loop_j);
5702 }
5703
5704
5705 void generate_initial() {
5706    // Generates the stubs needed during early VM startup and initializes their entry points
5707
5708 //------------------------------------------------------------------------------------------------------------------------
5709 // entry points that exist in all platforms
5710 // Note: This is code that could be shared among different platforms - however the benefit seems to be smaller than
5711 // the disadvantage of having a much more complicated generator structure. See also comment in stubRoutines.hpp.
5712 StubRoutines::_forward_exception_entry = generate_forward_exception();
5713
5714 StubRoutines::_call_stub_entry = generate_call_stub(StubRoutines::_call_stub_return_address);
5715 StubRoutines::_catch_exception_entry = generate_catch_exception();
5716
5717 //------------------------------------------------------------------------------------------------------------------------
5718 // entry points that are platform specific
5719 StubRoutines::Sparc::_test_stop_entry = generate_test_stop();
5720
5721 StubRoutines::Sparc::_stop_subroutine_entry = generate_stop_subroutine();
5722 StubRoutines::Sparc::_flush_callers_register_windows_entry = generate_flush_callers_register_windows();
5723
5724 // Build this early so it's available for the interpreter.
5725 StubRoutines::_throw_StackOverflowError_entry =
5726 generate_throw_exception("StackOverflowError throw_exception",
5727 CAST_FROM_FN_PTR(address, SharedRuntime::throw_StackOverflowError));
5728 StubRoutines::_throw_delayed_StackOverflowError_entry =
5729 generate_throw_exception("delayed StackOverflowError throw_exception",
5730 CAST_FROM_FN_PTR(address, SharedRuntime::throw_delayed_StackOverflowError));
5731
5732 if (UseCRC32Intrinsics) {
5733      // set the table address before generating the stub that uses it
5734 StubRoutines::_crc_table_adr = (address)StubRoutines::Sparc::_crc_table;
5735 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
5736 }
5737
5738 if (UseCRC32CIntrinsics) {
5739      // set the table address before generating the stub that uses it
5740 StubRoutines::_crc32c_table_addr = (address)StubRoutines::Sparc::_crc32c_table;
5741 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
5742 }
5743 }
5744
5745
5746 void generate_all() {
5747    // Generates the remaining stubs and initializes their entry points
5748
5749 // Generate partial_subtype_check first here since its code depends on
5750 // UseZeroBaseCompressedOops which is defined after heap initialization.
5751 StubRoutines::Sparc::_partial_subtype_check = generate_partial_subtype_check();
5752 // These entry points require SharedInfo::stack0 to be set up in non-core builds
5753 StubRoutines::_throw_AbstractMethodError_entry = generate_throw_exception("AbstractMethodError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_AbstractMethodError));
5754 StubRoutines::_throw_IncompatibleClassChangeError_entry= generate_throw_exception("IncompatibleClassChangeError throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_IncompatibleClassChangeError));
5755 StubRoutines::_throw_NullPointerException_at_call_entry= generate_throw_exception("NullPointerException at call throw_exception", CAST_FROM_FN_PTR(address, SharedRuntime::throw_NullPointerException_at_call));
5756
5757 // support for verify_oop (must happen after universe_init)
5758 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop_subroutine();
5759
5760 // arraycopy stubs used by compilers
5761 generate_arraycopy_stubs();
5762
5763 // Don't initialize the platform math functions since sparc
5764 // doesn't have intrinsics for these operations.
5765
5766 // Safefetch stubs.
5767 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
5768 &StubRoutines::_safefetch32_fault_pc,
5769 &StubRoutines::_safefetch32_continuation_pc);
5770 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
5771 &StubRoutines::_safefetchN_fault_pc,
5772 &StubRoutines::_safefetchN_continuation_pc);
5773
5774 // generate AES intrinsics code
5775 if (UseAESIntrinsics) {
5776 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
5777 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
5778 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
5779 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt_Parallel();
5780 }
5781 // generate GHASH intrinsics code
5782 if (UseGHASHIntrinsics) {
5783 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
5784 }
5785
5786 // generate SHA1/SHA256/SHA512 intrinsics code
5787 if (UseSHA1Intrinsics) {
5788 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress");
5789 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
5790 }
5791 if (UseSHA256Intrinsics) {
5792 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
5793 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
5794 }
5795 if (UseSHA512Intrinsics) {
5796 StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress");
5797 StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB");
5798 }
5799 // generate Adler32 intrinsics code
5800 if (UseAdler32Intrinsics) {
5801 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
5802 }
5803
5804 #ifdef COMPILER2
5805 // Intrinsics supported by C2 only:
5806 if (UseMultiplyToLenIntrinsic) {
5807 StubRoutines::_multiplyToLen = generate_multiplyToLen();
5808 }
5809 #endif // COMPILER2
5810 }
5811
5812 public:
5813 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
5814 // replace the standard masm with a special one:
5815 _masm = new MacroAssembler(code);
5816
5817 _stub_count = !all ? 0x100 : 0x200;
5818 if (all) {
5819 generate_all();
5820 } else {
5821 generate_initial();
5822 }
5823
5824 // make sure this stub is available for all local calls
5825 if (_atomic_add_stub.is_unbound()) {
5826 // generate a second time, if necessary
5827 (void) generate_atomic_add();
5828 }
5829 }
5830
5831
5832 private:
5833 int _stub_count;
5834 void stub_prolog(StubCodeDesc* cdesc) {
5835 # ifdef ASSERT
5836 // put extra information in the stub code, to make it more readable
5837 // Write the high part of the address
5838 // [RGV] Check if there is a dependency on the size of this prolog
5839 __ emit_data((intptr_t)cdesc >> 32, relocInfo::none);
5840 __ emit_data((intptr_t)cdesc, relocInfo::none);
5841 __ emit_data(++_stub_count, relocInfo::none);
5842 # endif
5843 align(true);
5844 }
5845
5846 void align(bool at_header = false) {
5847 // %%%%% move this constant somewhere else
5848 // UltraSPARC cache line size is 8 instructions:
5849 const unsigned int icache_line_size = 32;
5850 const unsigned int icache_half_line_size = 16;
5851
5852 if (at_header) {
5853 while ((intptr_t)(__ pc()) % icache_line_size != 0) {
5854 __ emit_data(0, relocInfo::none);
5855 }
5856 } else {
5857 while ((intptr_t)(__ pc()) % icache_half_line_size != 0) {
5858 __ nop();
5859 }
5860 }
5861 }
5862
5863 }; // end class declaration
5864
5865 #define UCM_TABLE_MAX_ENTRIES 8
5866 void StubGenerator_generate(CodeBuffer* code, bool all) {
5867 if (UnsafeCopyMemory::_table == NULL) {
5868 UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
5869 }
5870 StubGenerator g(code, all);
5871 }
--- EOF ---