/*
 * Copyright (c) 2013, Red Hat Inc.
 * Copyright (c) 2003, 2011, Oracle and/or its affiliates.
 * All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "gc_implementation/shenandoah/shenandoahBarrierSet.hpp"
#include "gc_implementation/shenandoah/shenandoahBrooksPointer.hpp"
#include "gc_implementation/shenandoah/shenandoahHeap.hpp"
#include "gc_implementation/shenandoah/shenandoahHeapRegion.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "utilities/top.hpp"

#include "stubRoutines_aarch64.hpp"

#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif

#ifdef BUILTIN_SIM
#include "../../../../../../simulator/simulator.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->
#define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
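// TIMES_OOP yields the scaled-index addressing operand for a heap-oop
// array element: a sign-extended 32-bit index shifted left by 2 when
// compressed oops are in use (4-byte narrow oops) or by 3 otherwise
// (8-byte oops).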

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(int& counter) {
    __ lea(rscratch2, ExternalAddress((address)&counter));
    __ ldrw(rscratch1, Address(rscratch2));
    __ addw(rscratch1, rscratch1, 1);
    __ strw(rscratch1, Address(rscratch2));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address              address
  //    c_rarg1:   result                            address
  //    c_rarg2:   result type                       BasicType
  //    c_rarg3:   method                            Method*
  //    c_rarg4:   (interpreter) entry point         address
  //    c_rarg5:   parameters                        intptr_t*
  //    c_rarg6:   parameter size (in words)         int
  //    c_rarg7:   thread                            Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the c arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-r18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -27 [ argument word 1      ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper   (r0)  ]
  //  -7 [ result         (r1)  ]
  //  -6 [ result type    (r2)  ]
  //  -5 [ method         (r3)  ]
  //  -4 [ entry point    (r4)  ]
  //  -3 [ parameters     (r5)  ]
  //  -2 [ parameter size (r6)  ]
  //  -1 [ thread         (r7)  ]
  //   0 [ saved fp      (r29)  ] <--- fp == saved sp (r31)
  //   1 [ saved lr      (r30)  ]

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off  = -26,

    d15_off            = -26,
    d13_off            = -24,
    d11_off            = -22,
    d9_off             = -20,

    r28_off            = -18,
    r26_off            = -16,
    r24_off            = -14,
    r22_off            = -12,
    r20_off            = -10,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0,
    retaddr_off        =   1,
  };
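
  // Only the even offsets are named: each stp/stpd below stores a
  // register pair, so the odd slots are filled implicitly -- e.g.
  // stpd(v15, v14, d15_save) puts v15 at word -26 and v14 at word -25,
  // matching the frame diagram above.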

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    const Address sp_after_call(rfp, sp_after_call_off * wordSize);

    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off  * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);

    // stub code

    // we need a C prolog to bootstrap the x86 caller into the sim
    __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);

    address aarch64_entry = __ pc();

#ifdef BUILTIN_SIM
    // Save sender's SP for stack traces.
    __ mov(rscratch1, sp);
    __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
#endif
    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5, entry_point);
    __ stp(c_rarg2, c_rarg3, result_type);
    __ stp(c_rarg0, c_rarg1, call_wrapper);

    __ stp(r20, r19, r20_save);
    __ stp(r22, r21, r22_save);
    __ stp(r24, r23, r24_save);
    __ stp(r26, r25, r26_save);
    __ stp(r28, r27, r28_save);

    __ stpd(v9,  v8,  d9_save);
    __ stpd(v11, v10, d11_save);
    __ stpd(v13, v12, d13_save);
    __ stpd(v15, v14, d15_save);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (unsigned)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
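    // round sp down to a 16-byte boundary -- the AArch64 ABI requires
    // the stack pointer to stay 16-byte aligned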
    __ andr(sp, rscratch1, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing methodOop, and current sp
    //      rmethod: Method*
    //      r13: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r13, sp);
    __ blr(c_rarg4);

    // tell the simulator we have returned to the stub

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    if (NotifySimulator) {
      __ notify(Assembler::method_reentry);
    }
    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    __ ldr(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ldr(j_rarg1, result_type);
    __ cmp(j_rarg1, T_OBJECT);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(j_rarg1, T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    // restore callee-save registers
    __ ldpd(v15, v14, d15_save);
    __ ldpd(v13, v12, d13_save);
    __ ldpd(v11, v10, d11_save);
    __ ldpd(v9,  v8,  d9_save);

    __ ldp(r28, r27, r28_save);
    __ ldp(r26, r25, r26_save);
    __ ldp(r24, r23, r24_save);
    __ ldp(r22, r21, r22_save);
    __ ldp(r20, r19, r20_save);

    __ ldp(c_rarg0, c_rarg1, call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3, method);
    __ ldp(c_rarg4, c_rarg5, entry_point);
    __ ldp(c_rarg6, c_rarg7, parameter_size);

#ifndef PRODUCT
    // tell the simulator we are about to end Java execution
    if (NotifySimulator) {
      __ notify(Assembler::method_exit);
    }
#endif
    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT

    __ BIND(is_long);
    __ str(r0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // rsp.
  //
  // r0: exception oop

  // NOTE: this is used as a target from the signal handler so it
  // needs an x86 prolog which returns into the current simulator
  // executing the generated catch_exception code. so the prolog
  // needs to install rax in a sim register and adjust the sim's
  // restart pc to enter the generated code at the start position
  // then return from native to simulated execution.

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread       (rfp, thread_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);
    __ mov(r19, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }

  // Shenandoah write barrier.
  //
  // Input:
  //   r0: OOP to evacuate.  Not null.
  //
  // Output:
  //   r0: Pointer to evacuated OOP.
  //
  // Trash rscratch1, rscratch2.  Preserve everything else.

  address generate_shenandoah_wb(bool c_abi, bool do_cset_test) {
    StubCodeMark mark(this, "StubRoutines", "shenandoah_wb");

    __ align(6);
    address start = __ pc();

    if (do_cset_test) {
      Label work;
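      // in_cset_fast_test is a byte map with one byte per heap region:
      // shift the oop down by the region-size shift to get its region
      // index, load that byte and test bit 0. Only oops whose region
      // is in the collection set need evacuating; everything else is
      // returned unchanged without making the runtime call.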
      __ mov(rscratch2, ShenandoahHeap::in_cset_fast_test_addr());
      __ lsr(rscratch1, r0, ShenandoahHeapRegion::region_size_bytes_shift_jint());
      __ ldrb(rscratch2, Address(rscratch2, rscratch1));
      __ tbnz(rscratch2, 0, work);
      __ ret(lr);
      __ bind(work);
    }

    Register obj = r0;

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    if (!c_abi) {
      __ push_call_clobbered_registers();
    } else {
      __ push_call_clobbered_fp_registers();
    }

    __ mov(lr, CAST_FROM_FN_PTR(address, ShenandoahBarrierSet::write_barrier_JRT));
    __ blrt(lr, 1, 0, MacroAssembler::ret_type_integral);
    if (!c_abi) {
      __ mov(rscratch1, obj);
      __ pop_call_clobbered_registers();
      __ mov(obj, rscratch1);
    } else {
      __ pop_call_clobbered_fp_registers();
    }

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {

    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is NULL it is OK

    // Check if the oop is in the right area of memory
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
    __ andr(c_rarg2, r0, c_rarg3);
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());

    // Compare c_rarg2 and c_rarg3.  We don't use a compare
    // instruction here because the flags register is live.
    __ eor(c_rarg2, c_rarg2, c_rarg3);
    __ cbnz(c_rarg2, error);

    // make sure klass is 'reasonable', which is not zero.
    __ load_klass(r0, r0); // get klass
    __ cbz(r0, error);     // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blrt(rscratch1, 3, 0, 1);

    return start;
  }

  void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }

  // Generate code for an array write pre barrier
  //
  //     addr    - starting address
  //     count   - element count
  //     tmp     - scratch register
  //
  //     Destroy no registers except rscratch1 and rscratch2
  //
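  //     Only the SATB collectors (G1, Shenandoah) emit anything here:
  //     their pre barrier needs the destination's previous contents
  //     before the copy overwrites it.  Card-table collectors need no
  //     pre barrier and fall through to an empty case below.
  //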
  void gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
    case BarrierSet::G1SATBCT:
    case BarrierSet::G1SATBCTLogging:
    case BarrierSet::ShenandoahBarrierSet:
      // Don't generate the call if we statically know that the target is uninitialized
      if (!dest_uninitialized) {
        __ push_call_clobbered_registers();
        if (count == c_rarg0) {
          if (addr == c_rarg1) {
            // exactly backwards!!
            __ mov(rscratch1, c_rarg0);
            __ mov(c_rarg0, c_rarg1);
            __ mov(c_rarg1, rscratch1);
          } else {
            __ mov(c_rarg1, count);
            __ mov(c_rarg0, addr);
          }
        } else {
          __ mov(c_rarg0, addr);
          __ mov(c_rarg1, count);
        }
        __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2);
        __ pop_call_clobbered_registers();
      }
      break;
    case BarrierSet::CardTableModRef:
    case BarrierSet::CardTableExtension:
    case BarrierSet::ModRef:
      break;
    default:
      ShouldNotReachHere();
    }
  }

  //
  // Generate code for an array write post barrier
  //
  //  Input:
  //     start    - register containing starting address of destination array
  //     end      - register containing ending address of destination array
  //     scratch  - scratch register
  //
  //  The input registers are overwritten.
  //  The ending address is inclusive.
  void gen_write_ref_array_post_barrier(Register start, Register end, Register scratch) {
    assert_different_registers(start, end, scratch);
    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
    case BarrierSet::G1SATBCT:
    case BarrierSet::G1SATBCTLogging:
    case BarrierSet::ShenandoahBarrierSet:
      {
        __ push_call_clobbered_registers();
        // must compute element count unless barrier set interface is changed (other platforms supply count)
        assert_different_registers(start, end, scratch);
        __ lea(scratch, Address(end, BytesPerHeapOop));
        __ sub(scratch, scratch, start);              // subtract start to get #bytes
        __ lsr(scratch, scratch, LogBytesPerHeapOop); // convert to element count
        __ mov(c_rarg0, start);
        __ mov(c_rarg1, scratch);
        __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
        __ pop_call_clobbered_registers();
      }
      break;
    case BarrierSet::CardTableModRef:
    case BarrierSet::CardTableExtension:
      {
        CardTableModRefBS* ct = (CardTableModRefBS*)bs;
        assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");

        Label L_loop;

        __ lsr(start, start, CardTableModRefBS::card_shift);
        __ lsr(end, end, CardTableModRefBS::card_shift);
        __ sub(end, end, start); // offset of the last card byte to clear (the loop below is inclusive)

        const Register count = end; // 'end' register contains bytes count now
        __ load_byte_map_base(scratch);
        __ add(start, start, scratch);
        __ BIND(L_loop);
        __ strb(zr, Address(start, count));
        __ subs(count, count, 1);
        __ br(Assembler::GE, L_loop);
      }
      break;
    default:
      ShouldNotReachHere();
    }
  }

  address generate_zero_longs(Register base, Register cnt) {
    Register tmp = rscratch1;
    Register tmp2 = rscratch2;
    int zva_length = VM_Version::zva_length();
    Label initial_table_end, loop_zva;
    Label fini;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "zero_longs");
    address start = __ pc();

    // Base must be 16 byte aligned. If not just return and let caller handle it
    __ tst(base, 0x0f);
    __ br(Assembler::NE, fini);
    // Align base with ZVA length.
    __ neg(tmp, base);
    __ andr(tmp, tmp, zva_length - 1);

    // tmp: the number of bytes to be filled to align the base with ZVA length.
    __ add(base, base, tmp);
    __ sub(cnt, cnt, tmp, Assembler::ASR, 3);
    __ adr(tmp2, initial_table_end);
    __ sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
    __ br(tmp2);
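
    // The table below zeroes 16 bytes per stp, and each stp is 4 bytes
    // of code, so branching back tmp >> 2 bytes from initial_table_end
    // executes exactly the tmp / 16 stores needed to reach the next
    // ZVA boundary (e.g. tmp == 48 skips back 3 instructions).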
    for (int i = -zva_length + 16; i < 0; i += 16)
      __ stp(zr, zr, Address(base, i));
    __ bind(initial_table_end);

    __ sub(cnt, cnt, zva_length >> 3);
    __ bind(loop_zva);
    __ dc(Assembler::ZVA, base);
    __ subs(cnt, cnt, zva_length >> 3);
    __ add(base, base, zva_length);
    __ br(Assembler::GE, loop_zva);
    __ add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
    __ bind(fini);
    __ ret(lr);

    return start;
  }

  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(Label &start, Register s, Register d, Register count,
                           copy_direction direction) {
    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4 : 2) * wordSize;

    int offset;
    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r10, t6 = r11, t7 = r12;
    const Register stride = r13;

    assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1);

    Label again, drain;
    const char *stub_name;
    if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
    else
      stub_name = "backward_copy_longs";
    StubCodeMark mark(this, "StubRoutines", stub_name);
    __ align(CodeEntryAlignment);
    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, 8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }
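
    // A register offset is only needed for backwards copy with a large
    // interval: the immediate form of prfm cannot encode negative
    // offsets below -256, so the distance is materialized in stride.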

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
        __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
      } else {
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        __ stp(t0, t1, Address(d, 2 * unit));
        __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.

      if (direction == copy_forwards) {
        __ sub(s, s, 16);
        __ sub(d, d, 8);
      }

      // Fill 8 registers
      //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 bit block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
      //
      // t0 at offset 0,  t1 at offset 8
      // t2 at offset 16, t3 at offset 24
      // t4 at offset 32, t5 at offset 40
      // t6 at offset 48, t7 at offset 56

      // for backwards copy s was not offset so the register contents
      // are at these offsets into the preceding 64 byte block
      // relative to that original input and so on for each successive
      // preceding 64 byte block when s is updated. this explains the
      // slightly counter-intuitive looking pattern of register usage
      // in the stp instructions for backwards copy.
      //
      // t0 at offset -16, t1 at offset -8
      // t2 at offset -32, t3 at offset -24
      // t4 at offset -48, t5 at offset -40
      // t6 at offset -64, t7 at offset -56

      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));

      __ subs(count, count, 16);
      __ br(Assembler::LO, drain);

      int prefetch = PrefetchCopyIntervalInBytes;
      bool use_stride = false;
      if (direction == copy_backwards) {
        use_stride = prefetch > 256;
        prefetch = -prefetch;
        if (use_stride) __ mov(stride, prefetch);
      }

      __ bind(again);

      if (PrefetchCopyIntervalInBytes > 0)
        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
        // allowing for the offset of -8 the store instructions place
        // registers into the target 64 bit block at the following
        // offsets
        //
        // t0 at offset 0
        // t1 at offset 8,  t2 at offset 16
        // t3 at offset 24, t4 at offset 32
        // t5 at offset 40, t6 at offset 48
        // t7 at offset 56

        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
        // d was not offset when we started so the registers are
        // written into the 64 bit block preceding d with the following
        // offsets
        //
        // t1 at offset -8
        // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
        // t7 at offset -56, t4 at offset -48
        // t6 at offset -64
        //
        // note that this matches the offsets previously noted for the
        // loads

        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
      } else {
        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
      }
      // now we need to copy any remaining part block which may
      // include a 4 word subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ stp(t1, t2, Address(d, 2 * unit));
          __ str(t3, Address(__ pre(d, 4 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ stp(t3, t0, Address(d, 3 * unit));
          __ str(t2, Address(__ pre(d, 4 * unit)));
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ str(t1, Address(__ pre(d, 2 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ str(t0, Address(__ pre(d, 2 * unit)));
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;
    int unit = wordSize * direction;

    Label Lpair, Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.
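    //
    // As it stands, each set bit of the residual count moves one
    // power-of-two chunk: e.g. a byte copy (granularity 1) with
    // count == 13 (0b1101) moves 8 bytes, then 4, skips the 2-byte
    // test, and finishes with 1 byte.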

    __ tbz(count, 3 - exact_log2(granularity), Lword);
    __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
    __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }

  Label copy_f, copy_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //

  void copy_memory(bool is_aligned, Register s, Register d,
                   Register count, Register tmp, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    // <= 96 bytes do inline. Direction doesn't matter because we always
    // load all the data before writing anything
    Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
    const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
    const Register send = r17, dend = r18;

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
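    // Dispatch on size: up to 80 bytes (96 with SIMD, where each
    // ldpq/stpq moves 32 bytes) are copied with straight-line loads
    // and stores; anything larger falls through to copy_big.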
    __ cmp(count, (UseSIMDForMemoryOps ? 96 : 80) / granularity);
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, 16 / granularity);
    __ br(Assembler::LS, copy16);

    __ cmp(count, 64 / granularity);
    __ br(Assembler::HI, copy80);

    __ cmp(count, 32 / granularity);
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 0));
      __ ldpq(v2, v3, Address(send, -32));
      __ stpq(v0, v1, Address(d, 0));
      __ stpq(v2, v3, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(send, -32));
      __ ldp(t6, t7, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(dend, -32));
      __ stp(t6, t7, Address(dend, -16));
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    __ ldp(t0, t1, Address(s, 0));
    __ ldp(t2, t3, Address(send, -16));
    __ stp(t0, t1, Address(d, 0));
    __ stp(t2, t3, Address(dend, -16));
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
      __ ldpq(v4, v5, Address(send, -32));
      __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
      __ stpq(v4, v5, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(s, 32));
      __ ldp(t6, t7, Address(s, 48));
      __ ldp(t8, t9, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(d, 32));
      __ stp(t6, t7, Address(d, 48));
      __ stp(t8, t9, Address(dend, -16));
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, 8 / granularity);
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    __ ldr(t0, Address(s, 0));
    __ ldr(t1, Address(send, -8));
    __ str(t0, Address(d, 0));
    __ str(t1, Address(dend, -8));
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes.  Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean in the 1 byte case we load/store the same
          // byte 3 times.
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }

    __ bind(copy_big);
    if (is_backwards) {
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now we've got the small case out of the way, we can align the
    // source address on a 2-word boundary.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
      __ tbz(s, exact_log2(wordSize), aligned);
      __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
      __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
      __ sub(count, count, wordSize / granularity);
    } else {
      if (is_backwards) {
        __ andr(rscratch2, s, 2 * wordSize - 1);
      } else {
        __ neg(rscratch2, s);
        __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
      }
      // rscratch2 is the byte adjustment needed to align s.
      __ cbz(rscratch2, aligned);
      int shift = exact_log2(granularity);
      if (shift) __ lsr(rscratch2, rscratch2, shift);
      __ sub(count, count, rscratch2);

#if 0
      // ?? This code is only correct for a disjoint copy.  It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, rscratch2);
        __ sub(d, d, rscratch2);
      } else {
        __ add(s, s, rscratch2);
        __ add(d, d, rscratch2);
      }
#else
      copy_memory_small(s, d, rscratch2, rscratch1, step);
#endif
    }

    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes.  Adjust the
    // count and do a bulk copy of words.
    __ lsr(rscratch2, count, exact_log2(wordSize / granularity));
    if (direction == copy_forwards)
      __ bl(copy_f);
    else
      __ bl(copy_b);

    // And the tail.
    copy_memory_small(s, d, count, tmp, step);

    if (granularity >= 8) __ bind(copy8);
    if (granularity >= 4) __ bind(copy4);
    __ bind(finish);
  }


  void clobber_registers() {
#ifdef ASSERT
    __ mov(rscratch1, (uint64_t)0xdeadbeef);
    __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
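    // rscratch1 now holds 0xdeadbeefdeadbeef: the 32-bit pattern
    // replicated into both halves of the register.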
    for (Register r = r3; r <= r18; r++)
      if (r != rscratch1) __ mov(r, rscratch1);
#endif
  }

  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers rscratch1 and rscratch2.
  void verify_oop_array(size_t size, Register a, Register count, Register temp) {
    Label loop, end;
    __ mov(rscratch1, a);
    __ mov(rscratch2, zr);
    __ bind(loop);
    __ cmp(rscratch2, count);
    __ br(Assembler::HS, end);
    if (size == (size_t)wordSize) {
      __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ verify_oop(temp);
    } else {
      __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ decode_heap_oop(temp); // calls verify_oop
    }
    __ add(rscratch2, rscratch2, size);
    __ b(loop);
    __ bind(end);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
                                 const char *name, bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    if (is_oop) {
      __ push(RegSet::of(d, count), sp);
      // no registers are destroyed by this call
      gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
    }
    copy_memory(aligned, s, d, count, rscratch1, size);
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
      gen_write_ref_array_post_barrier(d, count, rscratch1);
    }
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
                                 address *entry, const char *name,
                                 bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;

    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // use fwd copy when (d-s) above_equal (count*size)
    __ sub(rscratch1, d, s);
    __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
    __ br(Assembler::HS, nooverlap_target);
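    // (The unsigned comparison also routes d < s to the forward copy:
    // d - s wraps to a huge unsigned value, and copying forwards is
    // always safe when the destination starts below the source.)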

    if (is_oop) {
      __ push(RegSet::of(d, count), sp);
      // no registers are destroyed by this call
      gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
    }
    copy_memory(aligned, s, d, count, rscratch1, -size);
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
      gen_write_ref_array_post_barrier(d, count, rscratch1);
    }
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
  //
  address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                      address* entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_short_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_short_copy().
  //
  address generate_disjoint_short_copy(bool aligned,
                                       address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
                                       address *entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_int_copy(bool aligned, address *entry,
                                     const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
  }
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
                                     address *entry, const char *name,
                                     bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
  }


  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_long_copy(bool aligned, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_long_copy(bool aligned,
                                      address nooverlap_target, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_oop_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_oop_copy(bool aligned,
                                     address nooverlap_target, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
                                  name, dest_uninitialized);
  }


  // Helper for generating a dynamic type check.
  // Smashes rscratch1.
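  //
  // In outline, the generated check behaves like this sketch (the fast
  // and slow paths are MacroAssembler::check_klass_subtype_fast_path and
  // check_klass_subtype_slow_path; the helper names below are only for
  // illustration):
  //
  //   if (fast_path_says_subtype(sub_klass, super_klass, super_check_offset))
  //     goto L_success;
  //   if (slow_path_scan_says_subtype(sub_klass, super_klass))
  //     goto L_success;
  //   // fall through on failure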
  void generate_type_check(Register sub_klass,
                           Register super_check_offset,
                           Register super_klass,
                           Label& L_success) {
    assert_different_registers(sub_klass, super_check_offset, super_klass);

    BLOCK_COMMENT("type_check:");

    Label L_miss;

    __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL,
                                     super_check_offset);
    __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);

    // Fall through on failure!
    __ BIND(L_miss);
  }

  //
  // Generate checkcasting array copy stub
  //
  // Input:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //   c_rarg3   - size_t ckoff (super_check_offset)
  //   c_rarg4   - oop ckval (super_klass)
  //
  // Output:
  //   r0 ==  0  -  success
  //   r0 == -1^K - failure, where K is partial transfer count
  //
  address generate_checkcast_copy(const char *name, address *entry,
                                  bool dest_uninitialized = false) {

    Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;

    // Input registers (after setup_arg_regs)
    const Register from        = c_rarg0;   // source array address
    const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // elements count
    const Register ckoff       = c_rarg3;   // super_check_offset
    const Register ckval       = c_rarg4;   // super_klass

    // Registers used as temps (r18, r19, r20, r21 are save-on-entry)
    const Register count_save  = r21;       // orig elements count
    const Register start_to    = r20;       // destination array start address
    const Register copied_oop  = r18;       // actual oop copied
    const Register r19_klass   = r19;       // oop._klass

    //---------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the two arrays are subtypes of Object[] but the
    // destination array type is not equal to or a supertype
    // of the source type.  Each element must be separately
    // checked.

    assert_different_registers(from, to, count, ckoff, ckval, start_to,
                               copied_oop, r19_klass, count_save);

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef ASSERT
    // caller guarantees that the arrays really are different
    // otherwise, we would have to make conjoint checks
    { Label L;
      array_overlap_test(L, TIMES_OOP);
      __ stop("checkcast_copy within a single array");
      __ bind(L);
    }
#endif //ASSERT

    // Caller of this entry point must set up the argument registers.
    if (entry != NULL) {
      *entry = __ pc();
      BLOCK_COMMENT("Entry:");
    }

    // Empty array:  Nothing to do.
    __ cbz(count, L_done);

    __ push(RegSet::of(r18, r19, r20, r21), sp);

#ifdef ASSERT
    BLOCK_COMMENT("assert consistent ckoff/ckval");
    // The ckoff and ckval must be mutually consistent,
    // even though caller generates both.
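    // What is being checked, in effect (a sketch):
    //   assert(ckoff == ckval->super_check_offset(), "inconsistent ckoff/ckval");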
    { Label L;
      int sco_offset = in_bytes(Klass::super_check_offset_offset());
      __ ldrw(start_to, Address(ckval, sco_offset));
      __ cmpw(ckoff, start_to);
      __ br(Assembler::EQ, L);
      __ stop("super_check_offset inconsistent");
      __ bind(L);
    }
#endif //ASSERT

    gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);

    // save the original count
    __ mov(count_save, count);

    // Copy from low to high addresses
    __ mov(start_to, to);              // Save destination array start address
    __ b(L_load_element);

    // ======== begin loop ========
    // (Loop is rotated; its entry is L_load_element.)
    // Loop control:
    //   for (; count != 0; count--) {
    //     copied_oop = load_heap_oop(from++);
    //     ... generate_type_check ...;
    //     store_heap_oop(to++, copied_oop);
    //   }
    __ align(OptoLoopAlignment);

    __ BIND(L_store_element);
    __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop);  // store the oop
    __ sub(count, count, 1);
    __ cbz(count, L_do_card_marks);

    // ======== loop entry is here ========
    __ BIND(L_load_element);
    __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop
    __ cbz(copied_oop, L_store_element);

    __ load_klass(r19_klass, copied_oop); // query the object klass
    generate_type_check(r19_klass, ckoff, ckval, L_store_element);
    // ======== end loop ========

    // It was a real error; we must depend on the caller to finish the job.
    // Register count = remaining oops, count_save = total oops.
    // Emit GC store barriers for the oops we have copied and report
    // their number to the caller.

    __ subs(count, count_save, count); // K = partially copied oop count
    __ eon(count, count, zr);          // report (-1^K) to caller
    __ br(Assembler::EQ, L_done_pop);

    __ BIND(L_do_card_marks);
    __ add(to, to, -heapOopSize);      // make an inclusive end pointer
    gen_write_ref_array_post_barrier(start_to, to, rscratch1);

    __ bind(L_done_pop);
    __ pop(RegSet::of(r18, r19, r20, r21), sp);
    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);

    __ bind(L_done);
    __ mov(r0, count);
    __ leave();
    __ ret(lr);

    return start;
  }

  // Perform range checks on the proposed arraycopy.
  // Kills temp, but nothing else.
  // Also, clean the sign bits of src_pos and dst_pos.
  void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
                              Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
                              Register dst_pos, // destination position (c_rarg3)
                              Register length,
                              Register temp,
                              Label& L_failed) {
    BLOCK_COMMENT("arraycopy_range_checks:");

    assert_different_registers(rscratch1, temp);

    // if (src_pos + length > arrayOop(src)->length()) FAIL;
    __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, src_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    // if (dst_pos + length > arrayOop(dst)->length()) FAIL;
    __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, dst_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
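    // A 32-bit register-to-register mov zero-extends, so this is roughly:
    //   src_pos = (uint32_t)src_pos;  dst_pos = (uint32_t)dst_pos;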
    __ movw(src_pos, src_pos);
    __ movw(dst_pos, dst_pos);

    BLOCK_COMMENT("arraycopy_range_checks done");
  }

  // These stubs get called from some dumb test routine.
  // I'll write them properly when they're called from
  // something that's actually doing something.
  static void fake_arraycopy_stub(address src, address dst, int count) {
    assert(count == 0, "huh?");
  }


  //
  // Generate stub for array fill. If "aligned" is true, the
  // "to" address is assumed to be heapword aligned.
  //
  // Arguments for generated stub:
  //   to:    c_rarg0
  //   value: c_rarg1
  //   count: c_rarg2 treated as signed
  //
  address generate_fill(BasicType t, bool aligned, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    BLOCK_COMMENT("Entry:");

    const Register to        = c_rarg0;  // destination array address
    const Register value     = c_rarg1;  // fill value
    const Register count     = c_rarg2;  // elements count

    const Register bz_base   = r10;      // base for block_zero routine
    const Register cnt_words = r11;      // temp register

    __ enter();

    Label L_fill_elements, L_exit1;

    int shift = -1;
    switch (t) {
      case T_BYTE:
        shift = 0;
        __ cmpw(count, 8 >> shift);   // Short arrays (< 8 bytes) fill by element
        __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
        __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
        __ br(Assembler::LO, L_fill_elements);
        break;
      case T_SHORT:
        shift = 1;
        __ cmpw(count, 8 >> shift);   // Short arrays (< 8 bytes) fill by element
        __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
        __ br(Assembler::LO, L_fill_elements);
        break;
      case T_INT:
        shift = 2;
        __ cmpw(count, 8 >> shift);   // Short arrays (< 8 bytes) fill by element
        __ br(Assembler::LO, L_fill_elements);
        break;
      default: ShouldNotReachHere();
    }

    // Align the destination address to an 8-byte boundary.
    Label L_skip_align1, L_skip_align2, L_skip_align4;
    if (!aligned) {
      switch (t) {
        case T_BYTE:
          // One-byte misalignment happens only for byte arrays.
          __ tbz(to, 0, L_skip_align1);
          __ strb(value, Address(__ post(to, 1)));
          __ subw(count, count, 1);
          __ bind(L_skip_align1);
          // Fallthrough
        case T_SHORT:
          // Two-byte misalignment happens only for byte and short (char) arrays.
          __ tbz(to, 1, L_skip_align2);
          __ strh(value, Address(__ post(to, 2)));
          __ subw(count, count, 2 >> shift);
          __ bind(L_skip_align2);
          // Fallthrough
        case T_INT:
          // Align to 8 bytes, we know we are 4 byte aligned to start.
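          // Roughly: if (to & 4) { *(jint*)to = value; to += 4; count -= 4 >> shift; }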
          __ tbz(to, 2, L_skip_align4);
          __ strw(value, Address(__ post(to, 4)));
          __ subw(count, count, 4 >> shift);
          __ bind(L_skip_align4);
          break;
        default: ShouldNotReachHere();
      }
    }

    //
    //  Fill large chunks
    //
    __ lsrw(cnt_words, count, 3 - shift); // number of words
    __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
    __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
    if (UseBlockZeroing) {
      Label non_block_zeroing, rest;
      // count >= BlockZeroingLowLimit && value == 0
      __ subs(rscratch1, cnt_words, BlockZeroingLowLimit >> 3);
      __ ccmp(value, 0 /* comparing value */, 0 /* NZCV */, Assembler::GE);
      __ br(Assembler::NE, non_block_zeroing);
      __ mov(bz_base, to);
      __ block_zero(bz_base, cnt_words, true);
      __ mov(to, bz_base);
      __ b(rest);
      __ bind(non_block_zeroing);
      __ fill_words(to, cnt_words, value);
      __ bind(rest);
    } else {
      __ fill_words(to, cnt_words, value);
    }

    // Remaining count is less than 8 bytes. Fill it by a single store.
    // Note that the total length is no less than 8 bytes.
    if (t == T_BYTE || t == T_SHORT) {
      Label L_exit1;
      __ cbzw(count, L_exit1);
      __ add(to, to, count, Assembler::LSL, shift); // points to the end
      __ str(value, Address(to, -8));    // overwrite some elements
      __ bind(L_exit1);
      __ leave();
      __ ret(lr);
    }

    // Handle fills of less than 8 bytes.
    Label L_fill_2, L_fill_4, L_exit2;
    __ bind(L_fill_elements);
    switch (t) {
      case T_BYTE:
        __ tbz(count, 0, L_fill_2);
        __ strb(value, Address(__ post(to, 1)));
        __ bind(L_fill_2);
        __ tbz(count, 1, L_fill_4);
        __ strh(value, Address(__ post(to, 2)));
        __ bind(L_fill_4);
        __ tbz(count, 2, L_exit2);
        __ strw(value, Address(to));
        break;
      case T_SHORT:
        __ tbz(count, 0, L_fill_4);
        __ strh(value, Address(__ post(to, 2)));
        __ bind(L_fill_4);
        __ tbz(count, 1, L_exit2);
        __ strw(value, Address(to));
        break;
      case T_INT:
        __ cbzw(count, L_exit2);
        __ strw(value, Address(to));
        break;
      default: ShouldNotReachHere();
    }
    __ bind(L_exit2);
    __ leave();
    __ ret(lr);
    return start;
  }

  //
  // Generate 'unsafe' array copy stub
  // Though just as safe as the other stubs, it takes an unscaled
  // size_t argument instead of an element count.
  //
  // Input:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - byte count, treated as ssize_t, can be zero
  //
  // Examines the alignment of the operands and dispatches
  // to a long, int, short, or byte copy loop.
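  //
  // The dispatch on alignment is, in effect (a sketch):
  //
  //   int bits = (int)((uintptr_t)s | (uintptr_t)d | count);
  //   if      ((bits & (BytesPerLong - 1)) == 0)  goto long_copy;
  //   else if ((bits & (BytesPerInt - 1))  == 0)  goto int_copy;
  //   else if ((bits & 1) == 0)                   goto short_copy;
  //   else                                        goto byte_copy;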
2160 // 2161 address generate_unsafe_copy(const char *name, 2162 address byte_copy_entry, 2163 address short_copy_entry, 2164 address int_copy_entry, 2165 address long_copy_entry) { 2166 Label L_long_aligned, L_int_aligned, L_short_aligned; 2167 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 2168 2169 __ align(CodeEntryAlignment); 2170 StubCodeMark mark(this, "StubRoutines", name); 2171 address start = __ pc(); 2172 __ enter(); // required for proper stackwalking of RuntimeStub frame 2173 2174 // bump this on entry, not on exit: 2175 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 2176 2177 __ orr(rscratch1, s, d); 2178 __ orr(rscratch1, rscratch1, count); 2179 2180 __ andr(rscratch1, rscratch1, BytesPerLong-1); 2181 __ cbz(rscratch1, L_long_aligned); 2182 __ andr(rscratch1, rscratch1, BytesPerInt-1); 2183 __ cbz(rscratch1, L_int_aligned); 2184 __ tbz(rscratch1, 0, L_short_aligned); 2185 __ b(RuntimeAddress(byte_copy_entry)); 2186 2187 __ BIND(L_short_aligned); 2188 __ lsr(count, count, LogBytesPerShort); // size => short_count 2189 __ b(RuntimeAddress(short_copy_entry)); 2190 __ BIND(L_int_aligned); 2191 __ lsr(count, count, LogBytesPerInt); // size => int_count 2192 __ b(RuntimeAddress(int_copy_entry)); 2193 __ BIND(L_long_aligned); 2194 __ lsr(count, count, LogBytesPerLong); // size => long_count 2195 __ b(RuntimeAddress(long_copy_entry)); 2196 2197 return start; 2198 } 2199 2200 // 2201 // Generate generic array copy stubs 2202 // 2203 // Input: 2204 // c_rarg0 - src oop 2205 // c_rarg1 - src_pos (32-bits) 2206 // c_rarg2 - dst oop 2207 // c_rarg3 - dst_pos (32-bits) 2208 // c_rarg4 - element count (32-bits) 2209 // 2210 // Output: 2211 // r0 == 0 - success 2212 // r0 == -1^K - failure, where K is partial transfer count 2213 // 2214 address generate_generic_copy(const char *name, 2215 address byte_copy_entry, address short_copy_entry, 2216 address int_copy_entry, address oop_copy_entry, 2217 address long_copy_entry, address checkcast_copy_entry) { 2218 2219 Label L_failed, L_failed_0, L_objArray; 2220 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 2221 2222 // Input registers 2223 const Register src = c_rarg0; // source array oop 2224 const Register src_pos = c_rarg1; // source position 2225 const Register dst = c_rarg2; // destination array oop 2226 const Register dst_pos = c_rarg3; // destination position 2227 const Register length = c_rarg4; 2228 2229 StubCodeMark mark(this, "StubRoutines", name); 2230 2231 __ align(CodeEntryAlignment); 2232 address start = __ pc(); 2233 2234 __ enter(); // required for proper stackwalking of RuntimeStub frame 2235 2236 // bump this on entry, not on exit: 2237 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 2238 2239 //----------------------------------------------------------------------- 2240 // Assembler stub will be used for this call to arraycopy 2241 // if the following conditions are met: 2242 // 2243 // (1) src and dst must not be null. 2244 // (2) src_pos must not be negative. 2245 // (3) dst_pos must not be negative. 2246 // (4) length must not be negative. 2247 // (5) src klass and dst klass should be the same and not NULL. 2248 // (6) src and dst should be arrays. 2249 // (7) src_pos + length must not exceed length of src. 2250 // (8) dst_pos + length must not exceed length of dst. 2251 // 2252 2253 // if (src == NULL) return -1; 2254 __ cbz(src, L_failed); 2255 2256 // if (src_pos < 0) return -1; 2257 __ tbnz(src_pos, 31, L_failed); // i.e. 
sign bit set 2258 2259 // if (dst == NULL) return -1; 2260 __ cbz(dst, L_failed); 2261 2262 // if (dst_pos < 0) return -1; 2263 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2264 2265 // registers used as temp 2266 const Register scratch_length = r16; // elements count to copy 2267 const Register scratch_src_klass = r17; // array klass 2268 const Register lh = r18; // layout helper 2269 2270 // if (length < 0) return -1; 2271 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2272 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2273 2274 __ load_klass(scratch_src_klass, src); 2275 #ifdef ASSERT 2276 // assert(src->klass() != NULL); 2277 { 2278 BLOCK_COMMENT("assert klasses not null {"); 2279 Label L1, L2; 2280 __ cbnz(scratch_src_klass, L2); // it is broken if klass is NULL 2281 __ bind(L1); 2282 __ stop("broken null klass"); 2283 __ bind(L2); 2284 __ load_klass(rscratch1, dst); 2285 __ cbz(rscratch1, L1); // this would be broken also 2286 BLOCK_COMMENT("} assert klasses not null done"); 2287 } 2288 #endif 2289 2290 // Load layout helper (32-bits) 2291 // 2292 // |array_tag| | header_size | element_type | |log2_element_size| 2293 // 32 30 24 16 8 2 0 2294 // 2295 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2296 // 2297 2298 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2299 2300 // Handle objArrays completely differently... 2301 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2302 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2303 __ movw(rscratch1, objArray_lh); 2304 __ eorw(rscratch2, lh, rscratch1); 2305 __ cbzw(rscratch2, L_objArray); 2306 2307 // if (src->klass() != dst->klass()) return -1; 2308 __ load_klass(rscratch2, dst); 2309 __ eor(rscratch2, rscratch2, scratch_src_klass); 2310 __ cbnz(rscratch2, L_failed); 2311 2312 // if (!src->is_Array()) return -1; 2313 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2314 2315 // At this point, it is known to be a typeArray (array_tag 0x3). 2316 #ifdef ASSERT 2317 { 2318 BLOCK_COMMENT("assert primitive array {"); 2319 Label L; 2320 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2321 __ cmpw(lh, rscratch2); 2322 __ br(Assembler::GE, L); 2323 __ stop("must be a primitive array"); 2324 __ bind(L); 2325 BLOCK_COMMENT("} assert primitive array done"); 2326 } 2327 #endif 2328 2329 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2330 rscratch2, L_failed); 2331 2332 // TypeArrayKlass 2333 // 2334 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2335 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2336 // 2337 2338 const Register rscratch1_offset = rscratch1; // array offset 2339 const Register r18_elsize = lh; // element size 2340 2341 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2342 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2343 __ add(src, src, rscratch1_offset); // src array offset 2344 __ add(dst, dst, rscratch1_offset); // dst array offset 2345 BLOCK_COMMENT("choose copy loop based on element size"); 2346 2347 // next registers should be set before the jump to corresponding stub 2348 const Register from = c_rarg0; // source array address 2349 const Register to = c_rarg1; // destination array address 2350 const Register count = c_rarg2; // elements count 2351 2352 // 'from', 'to', 'count' registers should be set in such order 2353 // since they are the same as 'src', 'src_pos', 'dst'. 
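    // Concretely: 'to' (c_rarg1) aliases src_pos and 'count' (c_rarg2)
    // aliases dst, so 'from' must be formed before 'to' overwrites
    // src_pos, and 'to' before 'count' overwrites dst.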
2354 2355 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2356 2357 // The possible values of elsize are 0-3, i.e. exact_log2(element 2358 // size in bytes). We do a simple bitwise binary search. 2359 __ BIND(L_copy_bytes); 2360 __ tbnz(r18_elsize, 1, L_copy_ints); 2361 __ tbnz(r18_elsize, 0, L_copy_shorts); 2362 __ lea(from, Address(src, src_pos));// src_addr 2363 __ lea(to, Address(dst, dst_pos));// dst_addr 2364 __ movw(count, scratch_length); // length 2365 __ b(RuntimeAddress(byte_copy_entry)); 2366 2367 __ BIND(L_copy_shorts); 2368 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2369 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2370 __ movw(count, scratch_length); // length 2371 __ b(RuntimeAddress(short_copy_entry)); 2372 2373 __ BIND(L_copy_ints); 2374 __ tbnz(r18_elsize, 0, L_copy_longs); 2375 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2376 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2377 __ movw(count, scratch_length); // length 2378 __ b(RuntimeAddress(int_copy_entry)); 2379 2380 __ BIND(L_copy_longs); 2381 #ifdef ASSERT 2382 { 2383 BLOCK_COMMENT("assert long copy {"); 2384 Label L; 2385 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize 2386 __ cmpw(r18_elsize, LogBytesPerLong); 2387 __ br(Assembler::EQ, L); 2388 __ stop("must be long copy, but elsize is wrong"); 2389 __ bind(L); 2390 BLOCK_COMMENT("} assert long copy done"); 2391 } 2392 #endif 2393 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2394 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2395 __ movw(count, scratch_length); // length 2396 __ b(RuntimeAddress(long_copy_entry)); 2397 2398 // ObjArrayKlass 2399 __ BIND(L_objArray); 2400 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2401 2402 Label L_plain_copy, L_checkcast_copy; 2403 // test array classes for subtyping 2404 __ load_klass(r18, dst); 2405 __ cmp(scratch_src_klass, r18); // usual case is exact equality 2406 __ br(Assembler::NE, L_checkcast_copy); 2407 2408 // Identically typed arrays can be copied without element-wise checks. 2409 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2410 rscratch2, L_failed); 2411 2412 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2413 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2414 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2415 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2416 __ movw(count, scratch_length); // length 2417 __ BIND(L_plain_copy); 2418 __ b(RuntimeAddress(oop_copy_entry)); 2419 2420 __ BIND(L_checkcast_copy); 2421 // live at this point: scratch_src_klass, scratch_length, r18 (dst_klass) 2422 { 2423 // Before looking at dst.length, make sure dst is also an objArray. 2424 __ ldrw(rscratch1, Address(r18, lh_offset)); 2425 __ movw(rscratch2, objArray_lh); 2426 __ eorw(rscratch1, rscratch1, rscratch2); 2427 __ cbnzw(rscratch1, L_failed); 2428 2429 // It is safe to examine both src.length and dst.length. 2430 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2431 r18, L_failed); 2432 2433 const Register rscratch2_dst_klass = rscratch2; 2434 __ load_klass(rscratch2_dst_klass, dst); // reload 2435 2436 // Marshal the base address arguments now, freeing registers. 
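      // The address arithmetic here is, in effect:
      //   from = src + arrayOopDesc::base_offset_in_bytes(T_OBJECT) + src_pos * heapOopSize;
      //   to   = dst + arrayOopDesc::base_offset_in_bytes(T_OBJECT) + dst_pos * heapOopSize;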
2437 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2438 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2439 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2440 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2441 __ movw(count, length); // length (reloaded) 2442 Register sco_temp = c_rarg3; // this register is free now 2443 assert_different_registers(from, to, count, sco_temp, 2444 rscratch2_dst_klass, scratch_src_klass); 2445 // assert_clean_int(count, sco_temp); 2446 2447 // Generate the type check. 2448 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2449 __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset)); 2450 // assert_clean_int(sco_temp, r18); 2451 generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy); 2452 2453 // Fetch destination element klass from the ObjArrayKlass header. 2454 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2455 __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset)); 2456 __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset)); 2457 2458 // the checkcast_copy loop needs two extra arguments: 2459 assert(c_rarg3 == sco_temp, "#3 already in place"); 2460 // Set up arguments for checkcast_copy_entry. 2461 __ mov(c_rarg4, rscratch2_dst_klass); // dst.klass.element_klass 2462 __ b(RuntimeAddress(checkcast_copy_entry)); 2463 } 2464 2465 __ BIND(L_failed); 2466 __ mov(r0, -1); 2467 __ leave(); // required for proper stackwalking of RuntimeStub frame 2468 __ ret(lr); 2469 2470 return start; 2471 } 2472 2473 void generate_arraycopy_stubs() { 2474 address entry; 2475 address entry_jbyte_arraycopy; 2476 address entry_jshort_arraycopy; 2477 address entry_jint_arraycopy; 2478 address entry_oop_arraycopy; 2479 address entry_jlong_arraycopy; 2480 address entry_checkcast_arraycopy; 2481 2482 generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards); 2483 generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards); 2484 2485 StubRoutines::aarch64::_zero_longs = generate_zero_longs(r10, r11); 2486 2487 //*** jbyte 2488 // Always need aligned and unaligned versions 2489 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2490 "jbyte_disjoint_arraycopy"); 2491 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2492 &entry_jbyte_arraycopy, 2493 "jbyte_arraycopy"); 2494 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2495 "arrayof_jbyte_disjoint_arraycopy"); 2496 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL, 2497 "arrayof_jbyte_arraycopy"); 2498 2499 //*** jshort 2500 // Always need aligned and unaligned versions 2501 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2502 "jshort_disjoint_arraycopy"); 2503 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2504 &entry_jshort_arraycopy, 2505 "jshort_arraycopy"); 2506 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, 2507 "arrayof_jshort_disjoint_arraycopy"); 2508 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL, 2509 "arrayof_jshort_arraycopy"); 2510 2511 //*** jint 2512 // Aligned versions 2513 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2514 "arrayof_jint_disjoint_arraycopy"); 2515 StubRoutines::_arrayof_jint_arraycopy = 
generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2516 "arrayof_jint_arraycopy"); 2517 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 2518 // entry_jint_arraycopy always points to the unaligned version 2519 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2520 "jint_disjoint_arraycopy"); 2521 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2522 &entry_jint_arraycopy, 2523 "jint_arraycopy"); 2524 2525 //*** jlong 2526 // It is always aligned 2527 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2528 "arrayof_jlong_disjoint_arraycopy"); 2529 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2530 "arrayof_jlong_arraycopy"); 2531 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2532 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2533 2534 //*** oops 2535 { 2536 // With compressed oops we need unaligned versions; notice that 2537 // we overwrite entry_oop_arraycopy. 2538 bool aligned = !UseCompressedOops; 2539 2540 StubRoutines::_arrayof_oop_disjoint_arraycopy 2541 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2542 /*dest_uninitialized*/false); 2543 StubRoutines::_arrayof_oop_arraycopy 2544 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2545 /*dest_uninitialized*/false); 2546 // Aligned versions without pre-barriers 2547 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2548 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2549 /*dest_uninitialized*/true); 2550 StubRoutines::_arrayof_oop_arraycopy_uninit 2551 = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit", 2552 /*dest_uninitialized*/true); 2553 } 2554 2555 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2556 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2557 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2558 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2559 2560 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2561 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, 2562 /*dest_uninitialized*/true); 2563 2564 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2565 entry_jbyte_arraycopy, 2566 entry_jshort_arraycopy, 2567 entry_jint_arraycopy, 2568 entry_jlong_arraycopy); 2569 2570 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2571 entry_jbyte_arraycopy, 2572 entry_jshort_arraycopy, 2573 entry_jint_arraycopy, 2574 entry_oop_arraycopy, 2575 entry_jlong_arraycopy, 2576 entry_checkcast_arraycopy); 2577 2578 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 2579 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2580 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2581 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2582 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 2583 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2584 } 2585 
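  // The AES stubs below dispatch on the length of the expanded key
  // schedule, in 32-bit words: 44 words means AES-128 (10 rounds),
  // 52 means AES-192 (12 rounds), and 60 means AES-256 (14 rounds).
  // That is what the cmpw(keylen, 44) / cmpw(keylen, 52) tests decide.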
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //
  address generate_aescrypt_encryptBlock() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");

    Label L_doLast;

    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register keylen      = rscratch1;

    address start = __ pc();
    __ enter();

    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ ld1(v0, __ T16B, from); // get 16 bytes of input

    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);
    __ rev32(v3, __ T16B, v3);
    __ rev32(v4, __ T16B, v4);
    __ aese(v0, v1);
    __ aesmc(v0, v0);
    __ aese(v0, v2);
    __ aesmc(v0, v0);
    __ aese(v0, v3);
    __ aesmc(v0, v0);
    __ aese(v0, v4);
    __ aesmc(v0, v0);

    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);
    __ rev32(v3, __ T16B, v3);
    __ rev32(v4, __ T16B, v4);
    __ aese(v0, v1);
    __ aesmc(v0, v0);
    __ aese(v0, v2);
    __ aesmc(v0, v0);
    __ aese(v0, v3);
    __ aesmc(v0, v0);
    __ aese(v0, v4);
    __ aesmc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    __ cmpw(keylen, 44);
    __ br(Assembler::EQ, L_doLast);

    __ aese(v0, v1);
    __ aesmc(v0, v0);
    __ aese(v0, v2);
    __ aesmc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    __ cmpw(keylen, 52);
    __ br(Assembler::EQ, L_doLast);

    __ aese(v0, v1);
    __ aesmc(v0, v0);
    __ aese(v0, v2);
    __ aesmc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    __ BIND(L_doLast);

    __ aese(v0, v1);
    __ aesmc(v0, v0);
    __ aese(v0, v2);

    __ ld1(v1, __ T16B, key);
    __ rev32(v1, __ T16B, v1);
    __ eor(v0, __ T16B, v0, v1);

    __ st1(v0, __ T16B, to);

    __ mov(r0, 0);

    __ leave();
    __ ret(lr);

    return start;
  }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //
  address generate_aescrypt_decryptBlock() {
    assert(UseAES, "need AES instructions");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
    Label L_doLast;

    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register keylen      = rscratch1;

    address start = __ pc();
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ ld1(v0, __ T16B, from); // get 16 bytes of input

    __ ld1(v5, __ T16B, __ post(key, 16));
    __ rev32(v5, __ T16B, v5);

    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);
    __ rev32(v3, __ T16B, v3);
    __ rev32(v4, __ T16B, v4);
    __ aesd(v0, v1);
    __ aesimc(v0, v0);
    __ aesd(v0, v2);
    __ aesimc(v0, v0);
    __ aesd(v0, v3);
    __ aesimc(v0, v0);
    __ aesd(v0, v4);
    __ aesimc(v0, v0);

    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);
    __ rev32(v3, __ T16B, v3);
    __ rev32(v4, __ T16B, v4);
    __ aesd(v0, v1);
    __ aesimc(v0, v0);
    __ aesd(v0, v2);
    __ aesimc(v0, v0);
    __ aesd(v0, v3);
    __ aesimc(v0, v0);
    __ aesd(v0, v4);
    __ aesimc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    __ cmpw(keylen, 44);
    __ br(Assembler::EQ, L_doLast);

    __ aesd(v0, v1);
    __ aesimc(v0, v0);
    __ aesd(v0, v2);
    __ aesimc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    __ cmpw(keylen, 52);
    __ br(Assembler::EQ, L_doLast);

    __ aesd(v0, v1);
    __ aesimc(v0, v0);
    __ aesd(v0, v2);
    __ aesimc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    __ BIND(L_doLast);

    __ aesd(v0, v1);
    __ aesimc(v0, v0);
    __ aesd(v0, v2);

    __ eor(v0, __ T16B, v0, v5);

    __ st1(v0, __ T16B, to);

    __ mov(r0, 0);

    __ leave();
    __ ret(lr);

    return start;
  }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //   c_rarg3   - r vector byte array address
  //   c_rarg4   - input length
  //
  // Output:
  //   r0        - input length
  //
  address generate_cipherBlockChaining_encryptAESCrypt() {
    assert(UseAES, "need AES instructions");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");

    Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52, _L_finish;

    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
                                           // and left with the results of the last encryption block
    const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
    const Register keylen      = rscratch1;

    address start = __ pc();

    __ enter();

    __ subsw(rscratch2, len_reg, zr);
    __ br(Assembler::LE, _L_finish);

    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ ld1(v0, __ T16B, rvec);

    __ cmpw(keylen, 52);
    __ br(Assembler::CC, L_loadkeys_44);
    __ br(Assembler::EQ, L_loadkeys_52);

    __ ld1(v17, v18, __ T16B, __ post(key, 32));
    __ rev32(v17, __ T16B, v17);
    __ rev32(v18, __ T16B, v18);
    __ BIND(L_loadkeys_52);
    __ ld1(v19, v20, __ T16B, __ post(key, 32));
    __ rev32(v19, __ T16B, v19);
    __ rev32(v20, __ T16B, v20);
    __ BIND(L_loadkeys_44);
    __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
    __ rev32(v21, __ T16B, v21);
    __ rev32(v22, __ T16B, v22);
    __ rev32(v23, __ T16B, v23);
    __ rev32(v24, __ T16B, v24);
    __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
    __ rev32(v25, __ T16B, v25);
    __ rev32(v26, __ T16B, v26);
    __ rev32(v27, __ T16B, v27);
    __ rev32(v28, __ T16B, v28);
    __ ld1(v29, v30, v31, __ T16B, key);
    __ rev32(v29, __ T16B, v29);
    __ rev32(v30, __ T16B, v30);
    __ rev32(v31, __ T16B, v31);

    __ BIND(L_aes_loop);
    __ ld1(v1, __ T16B, __ post(from, 16));
    __ eor(v0, __ T16B, v0, v1);

    __ br(Assembler::CC, L_rounds_44);
    __ br(Assembler::EQ, L_rounds_52);

    __ aese(v0, v17); __ aesmc(v0, v0);
    __ aese(v0, v18); __ aesmc(v0, v0);
    __ BIND(L_rounds_52);
    __ aese(v0, v19); __ aesmc(v0, v0);
    __ aese(v0, v20); __ aesmc(v0, v0);
    __ BIND(L_rounds_44);
    __ aese(v0, v21); __ aesmc(v0, v0);
    __ aese(v0, v22); __ aesmc(v0, v0);
    __ aese(v0, v23); __ aesmc(v0, v0);
    __ aese(v0, v24); __ aesmc(v0, v0);
    __ aese(v0, v25); __ aesmc(v0, v0);
    __ aese(v0, v26); __ aesmc(v0, v0);
    __ aese(v0, v27); __ aesmc(v0, v0);
    __ aese(v0, v28); __ aesmc(v0, v0);
    __ aese(v0, v29); __ aesmc(v0, v0);
    __ aese(v0, v30);
    __ eor(v0, __ T16B, v0, v31);

    __ st1(v0, __ T16B, __ post(to, 16));

    __ subw(len_reg, len_reg, 16);
    __ cbnzw(len_reg, L_aes_loop);

    __ st1(v0, __ T16B, rvec);

    __ BIND(_L_finish);
    __ mov(r0, rscratch2);

    __ leave();
    __ ret(lr);

    return start;
  }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //   c_rarg3   - r vector byte array address
  //   c_rarg4   - input length
  //
  // Output:
  //   r0        - input length
  //
  address generate_cipherBlockChaining_decryptAESCrypt() {
    assert(UseAES, "need AES instructions");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");

    Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52, _L_finish;

    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
                                           // and left with the results of the last encryption block
    const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
    const Register keylen      = rscratch1;

    address start = __ pc();

    __ enter();

    __ subsw(rscratch2, len_reg, zr);
    __ br(Assembler::LE, _L_finish);

    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ ld1(v2, __ T16B, rvec);

    __ ld1(v31, __ T16B, __ post(key, 16));
    __ rev32(v31, __ T16B, v31);

    __ cmpw(keylen, 52);
    __ br(Assembler::CC, L_loadkeys_44);
    __ br(Assembler::EQ, L_loadkeys_52);

    __ ld1(v17, v18, __ T16B, __ post(key, 32));
    __ rev32(v17, __ T16B, v17);
    __ rev32(v18, __ T16B, v18);
    __
BIND(L_loadkeys_52); 2944 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2945 __ rev32(v19, __ T16B, v19); 2946 __ rev32(v20, __ T16B, v20); 2947 __ BIND(L_loadkeys_44); 2948 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2949 __ rev32(v21, __ T16B, v21); 2950 __ rev32(v22, __ T16B, v22); 2951 __ rev32(v23, __ T16B, v23); 2952 __ rev32(v24, __ T16B, v24); 2953 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2954 __ rev32(v25, __ T16B, v25); 2955 __ rev32(v26, __ T16B, v26); 2956 __ rev32(v27, __ T16B, v27); 2957 __ rev32(v28, __ T16B, v28); 2958 __ ld1(v29, v30, __ T16B, key); 2959 __ rev32(v29, __ T16B, v29); 2960 __ rev32(v30, __ T16B, v30); 2961 2962 __ BIND(L_aes_loop); 2963 __ ld1(v0, __ T16B, __ post(from, 16)); 2964 __ orr(v1, __ T16B, v0, v0); 2965 2966 __ br(Assembler::CC, L_rounds_44); 2967 __ br(Assembler::EQ, L_rounds_52); 2968 2969 __ aesd(v0, v17); __ aesimc(v0, v0); 2970 __ aesd(v0, v18); __ aesimc(v0, v0); 2971 __ BIND(L_rounds_52); 2972 __ aesd(v0, v19); __ aesimc(v0, v0); 2973 __ aesd(v0, v20); __ aesimc(v0, v0); 2974 __ BIND(L_rounds_44); 2975 __ aesd(v0, v21); __ aesimc(v0, v0); 2976 __ aesd(v0, v22); __ aesimc(v0, v0); 2977 __ aesd(v0, v23); __ aesimc(v0, v0); 2978 __ aesd(v0, v24); __ aesimc(v0, v0); 2979 __ aesd(v0, v25); __ aesimc(v0, v0); 2980 __ aesd(v0, v26); __ aesimc(v0, v0); 2981 __ aesd(v0, v27); __ aesimc(v0, v0); 2982 __ aesd(v0, v28); __ aesimc(v0, v0); 2983 __ aesd(v0, v29); __ aesimc(v0, v0); 2984 __ aesd(v0, v30); 2985 __ eor(v0, __ T16B, v0, v31); 2986 __ eor(v0, __ T16B, v0, v2); 2987 2988 __ st1(v0, __ T16B, __ post(to, 16)); 2989 __ orr(v2, __ T16B, v1, v1); 2990 2991 __ subw(len_reg, len_reg, 16); 2992 __ cbnzw(len_reg, L_aes_loop); 2993 2994 __ st1(v2, __ T16B, rvec); 2995 2996 __ BIND(_L_finish); 2997 __ mov(r0, rscratch2); 2998 2999 __ leave(); 3000 __ ret(lr); 3001 3002 return start; 3003 } 3004 3005 // Arguments: 3006 // 3007 // Inputs: 3008 // c_rarg0 - byte[] source+offset 3009 // c_rarg1 - int[] SHA.state 3010 // c_rarg2 - int offset 3011 // c_rarg3 - int limit 3012 // 3013 address generate_sha1_implCompress(bool multi_block, const char *name) { 3014 __ align(CodeEntryAlignment); 3015 StubCodeMark mark(this, "StubRoutines", name); 3016 address start = __ pc(); 3017 3018 Register buf = c_rarg0; 3019 Register state = c_rarg1; 3020 Register ofs = c_rarg2; 3021 Register limit = c_rarg3; 3022 3023 Label keys; 3024 Label sha1_loop; 3025 3026 // load the keys into v0..v3 3027 __ adr(rscratch1, keys); 3028 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 3029 // load 5 words state into v6, v7 3030 __ ldrq(v6, Address(state, 0)); 3031 __ ldrs(v7, Address(state, 16)); 3032 3033 3034 __ BIND(sha1_loop); 3035 // load 64 bytes of data into v16..v19 3036 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 3037 __ rev32(v16, __ T16B, v16); 3038 __ rev32(v17, __ T16B, v17); 3039 __ rev32(v18, __ T16B, v18); 3040 __ rev32(v19, __ T16B, v19); 3041 3042 // do the sha1 3043 __ addv(v4, __ T4S, v16, v0); 3044 __ orr(v20, __ T16B, v6, v6); 3045 3046 FloatRegister d0 = v16; 3047 FloatRegister d1 = v17; 3048 FloatRegister d2 = v18; 3049 FloatRegister d3 = v19; 3050 3051 for (int round = 0; round < 20; round++) { 3052 FloatRegister tmp1 = (round & 1) ? v4 : v5; 3053 FloatRegister tmp2 = (round & 1) ? v21 : v22; 3054 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 3055 FloatRegister tmp4 = (round & 1) ? v5 : v4; 3056 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? 
v2 : v3)); 3057 3058 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 3059 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 3060 __ sha1h(tmp2, __ T4S, v20); 3061 if (round < 5) 3062 __ sha1c(v20, __ T4S, tmp3, tmp4); 3063 else if (round < 10 || round >= 15) 3064 __ sha1p(v20, __ T4S, tmp3, tmp4); 3065 else 3066 __ sha1m(v20, __ T4S, tmp3, tmp4); 3067 if (round < 16) __ sha1su1(d0, __ T4S, d3); 3068 3069 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3070 } 3071 3072 __ addv(v7, __ T2S, v7, v21); 3073 __ addv(v6, __ T4S, v6, v20); 3074 3075 if (multi_block) { 3076 __ add(ofs, ofs, 64); 3077 __ cmp(ofs, limit); 3078 __ br(Assembler::LE, sha1_loop); 3079 __ mov(c_rarg0, ofs); // return ofs 3080 } 3081 3082 __ strq(v6, Address(state, 0)); 3083 __ strs(v7, Address(state, 16)); 3084 3085 __ ret(lr); 3086 3087 __ bind(keys); 3088 __ emit_int32(0x5a827999); 3089 __ emit_int32(0x6ed9eba1); 3090 __ emit_int32(0x8f1bbcdc); 3091 __ emit_int32(0xca62c1d6); 3092 3093 return start; 3094 } 3095 3096 3097 // Arguments: 3098 // 3099 // Inputs: 3100 // c_rarg0 - byte[] source+offset 3101 // c_rarg1 - int[] SHA.state 3102 // c_rarg2 - int offset 3103 // c_rarg3 - int limit 3104 // 3105 address generate_sha256_implCompress(bool multi_block, const char *name) { 3106 static const uint32_t round_consts[64] = { 3107 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3108 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3109 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3110 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3111 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3112 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3113 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3114 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3115 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3116 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3117 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3118 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3119 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3120 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3121 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3122 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3123 }; 3124 __ align(CodeEntryAlignment); 3125 StubCodeMark mark(this, "StubRoutines", name); 3126 address start = __ pc(); 3127 3128 Register buf = c_rarg0; 3129 Register state = c_rarg1; 3130 Register ofs = c_rarg2; 3131 Register limit = c_rarg3; 3132 3133 Label sha1_loop; 3134 3135 __ stpd(v8, v9, __ pre(sp, -32)); 3136 __ stpd(v10, v11, Address(sp, 16)); 3137 3138 // dga == v0 3139 // dgb == v1 3140 // dg0 == v2 3141 // dg1 == v3 3142 // dg2 == v4 3143 // t0 == v6 3144 // t1 == v7 3145 3146 // load 16 keys to v16..v31 3147 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3148 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3149 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3150 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3151 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3152 3153 // load 8 words (256 bits) state 3154 __ ldpq(v0, v1, state); 3155 3156 __ BIND(sha1_loop); 3157 // load 64 bytes of data into v8..v11 3158 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? 
__ post(buf, 64) : buf);
    __ rev32(v8, __ T16B, v8);
    __ rev32(v9, __ T16B, v9);
    __ rev32(v10, __ T16B, v10);
    __ rev32(v11, __ T16B, v11);

    __ addv(v6, __ T4S, v8, v16);
    __ orr(v2, __ T16B, v0, v0);
    __ orr(v3, __ T16B, v1, v1);

    FloatRegister d0 = v8;
    FloatRegister d1 = v9;
    FloatRegister d2 = v10;
    FloatRegister d3 = v11;


    for (int round = 0; round < 16; round++) {
      FloatRegister tmp1 = (round & 1) ? v6 : v7;
      FloatRegister tmp2 = (round & 1) ? v7 : v6;
      FloatRegister tmp3 = (round & 1) ? v2 : v4;
      FloatRegister tmp4 = (round & 1) ? v4 : v2;

      if (round < 12) __ sha256su0(d0, __ T4S, d1);
      __ orr(v4, __ T16B, v2, v2);
      if (round < 15)
        __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
      __ sha256h(v2, __ T4S, v3, tmp2);
      __ sha256h2(v3, __ T4S, v4, tmp2);
      if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);

      tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
    }

    __ addv(v0, __ T4S, v0, v2);
    __ addv(v1, __ T4S, v1, v3);

    if (multi_block) {
      __ add(ofs, ofs, 64);
      __ cmp(ofs, limit);
      __ br(Assembler::LE, sha1_loop);
      __ mov(c_rarg0, ofs); // return ofs
    }

    __ ldpd(v10, v11, Address(sp, 16));
    __ ldpd(v8, v9, __ post(sp, 32));

    __ stpq(v0, v1, state);

    __ ret(lr);

    return start;
  }

#ifndef BUILTIN_SIM
  // Safefetch stubs.
  void generate_safefetch(const char* name, int size, address* entry,
                          address* fault_pc, address* continuation_pc) {
    // safefetch signatures:
    //   int      SafeFetch32(int*      adr, int      errValue);
    //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
    //
    // arguments:
    //   c_rarg0 = adr
    //   c_rarg1 = errValue
    //
    // result:
    //   r0 = *adr or errValue

    StubCodeMark mark(this, "StubRoutines", name);

    // Entry point, pc or function descriptor.
    *entry = __ pc();

    // Load *adr into c_rarg1, may fault.
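    // If the load faults, the VM's signal handler redirects execution to
    // *continuation_pc with c_rarg1 still holding errValue, so in effect
    // the stub computes: return fault ? errValue : *adr;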
    *fault_pc = __ pc();
    switch (size) {
      case 4:
        // int32_t
        __ ldrw(c_rarg1, Address(c_rarg0, 0));
        break;
      case 8:
        // int64_t
        __ ldr(c_rarg1, Address(c_rarg0, 0));
        break;
      default:
        ShouldNotReachHere();
    }

    // return errValue or *adr
    *continuation_pc = __ pc();
    __ mov(r0, c_rarg1);
    __ ret(lr);
  }
#endif

  /**
   *  Arguments:
   *
   * Inputs:
   *   c_rarg0   - int crc
   *   c_rarg1   - byte* buf
   *   c_rarg2   - int length
   *
   * Output:
   *   r0   - int crc result
   *
   * Preserves:
   *   r13
   *
   */
  address generate_updateBytesCRC32() {
    assert(UseCRC32Intrinsics, "what are we doing here?");

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");

    address start = __ pc();

    const Register crc    = c_rarg0;  // crc
    const Register buf    = c_rarg1;  // source java byte array address
    const Register len    = c_rarg2;  // length
    const Register table0 = c_rarg3;  // crc_table address
    const Register table1 = c_rarg4;
    const Register table2 = c_rarg5;
    const Register table3 = c_rarg6;
    const Register tmp3   = c_rarg7;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    __ kernel_crc32(crc, buf, len,
                    table0, table1, table2, table3, rscratch1, rscratch2, tmp3);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }

  /**
   *  Arguments:
   *
   *  Input:
   *    c_rarg0   - x address
   *    c_rarg1   - x length
   *    c_rarg2   - y address
   *    c_rarg3   - y length
   *    c_rarg4   - z address
   *    c_rarg5   - z length
   */
  address generate_multiplyToLen() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "multiplyToLen");

    address start = __ pc();
    const Register x     = r0;
    const Register xlen  = r1;
    const Register y     = r2;
    const Register ylen  = r3;
    const Register z     = r4;
    const Register zlen  = r5;

    const Register tmp1  = r10;
    const Register tmp2  = r11;
    const Register tmp3  = r12;
    const Register tmp4  = r13;
    const Register tmp5  = r14;
    const Register tmp6  = r15;
    const Register tmp7  = r16;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }

  // Continuation point for throwing of implicit exceptions that are
  // not handled in the current activation. Fabricates an exception
  // oop and initiates normal exception dispatching in this
  // frame. Since we need to preserve callee-saved values (currently
  // only for C2, but done for C1 as well) we need a callee-saved oop
  // map and therefore have to make these stubs into RuntimeStubs
  // rather than BufferBlobs.  If the compiler needs all registers to
  // be preserved between the fault point and the exception handler
  // then it must assume responsibility for that in
  // AbstractCompiler::continuation_for_implicit_null_exception or
  // continuation_for_implicit_division_by_zero_exception.
All other 3348 // implicit exceptions (e.g., NullPointerException or 3349 // AbstractMethodError on entry) are either at call sites or 3350 // otherwise assume that stack unwinding will be initiated, so 3351 // caller saved registers were assumed volatile in the compiler. 3352 3353 #undef __ 3354 #define __ masm-> 3355 3356 address generate_throw_exception(const char* name, 3357 address runtime_entry, 3358 Register arg1 = noreg, 3359 Register arg2 = noreg) { 3360 // Information about frame layout at time of blocking runtime call. 3361 // Note that we only have to preserve callee-saved registers since 3362 // the compilers are responsible for supplying a continuation point 3363 // if they expect all registers to be preserved. 3364 // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0 3365 enum layout { 3366 rfp_off = 0, 3367 rfp_off2, 3368 return_off, 3369 return_off2, 3370 framesize // inclusive of return address 3371 }; 3372 3373 int insts_size = 512; 3374 int locs_size = 64; 3375 3376 CodeBuffer code(name, insts_size, locs_size); 3377 OopMapSet* oop_maps = new OopMapSet(); 3378 MacroAssembler* masm = new MacroAssembler(&code); 3379 3380 address start = __ pc(); 3381 3382 // This is an inlined and slightly modified version of call_VM 3383 // which has the ability to fetch the return PC out of 3384 // thread-local storage and also sets up last_Java_sp slightly 3385 // differently than the real call_VM 3386 3387 __ enter(); // Save FP and LR before call 3388 3389 assert(is_even(framesize/2), "sp not 16-byte aligned"); 3390 3391 // lr and fp are already in place 3392 __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog 3393 3394 int frame_complete = __ pc() - start; 3395 3396 // Set up last_Java_sp and last_Java_fp 3397 address the_pc = __ pc(); 3398 __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1); 3399 3400 // Call runtime 3401 if (arg1 != noreg) { 3402 assert(arg2 != c_rarg1, "clobbered"); 3403 __ mov(c_rarg1, arg1); 3404 } 3405 if (arg2 != noreg) { 3406 __ mov(c_rarg2, arg2); 3407 } 3408 __ mov(c_rarg0, rthread); 3409 BLOCK_COMMENT("call runtime_entry"); 3410 __ mov(rscratch1, runtime_entry); 3411 __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1); 3412 3413 // Generate oop map 3414 OopMap* map = new OopMap(framesize, 0); 3415 3416 oop_maps->add_gc_map(the_pc - start, map); 3417 3418 __ reset_last_Java_frame(true); 3419 __ maybe_isb(); 3420 3421 __ leave(); 3422 3423 // check for pending exceptions 3424 #ifdef ASSERT 3425 Label L; 3426 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 3427 __ cbnz(rscratch1, L); 3428 __ should_not_reach_here(); 3429 __ bind(L); 3430 #endif // ASSERT 3431 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 3432 3433 3434 // codeBlob framesize is in words (not VMRegImpl::slot_size) 3435 RuntimeStub* stub = 3436 RuntimeStub::new_runtime_stub(name, 3437 &code, 3438 frame_complete, 3439 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 3440 oop_maps, false); 3441 return stub->entry_point(); 3442 } 3443 3444 class MontgomeryMultiplyGenerator : public MacroAssembler { 3445 3446 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 3447 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 3448 3449 RegSet _toSave; 3450 bool _squaring; 3451 3452 public: 3453 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 3454 : MacroAssembler(as->code()), _squaring(squaring) { 3455 3456 // Register allocation 3457 3458 Register reg = c_rarg0; 3459 Pa_base 

  class MontgomeryMultiplyGenerator : public MacroAssembler {

    Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
      Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;

    RegSet _toSave;
    bool _squaring;

  public:
    MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
      : MacroAssembler(as->code()), _squaring(squaring) {

      // Register allocation

      Register reg = c_rarg0;
      Pa_base = reg;      // Argument registers
      if (squaring)
        Pb_base = Pa_base;
      else
        Pb_base = ++reg;
      Pn_base = ++reg;
      Rlen = ++reg;
      inv = ++reg;
      Pm_base = ++reg;

      // Working registers:
      Ra = ++reg;         // The current digit of a, b, n, and m.
      Rb = ++reg;
      Rm = ++reg;
      Rn = ++reg;

      Pa = ++reg;         // Pointers to the current/next digit of a, b, n, and m.
      Pb = ++reg;
      Pm = ++reg;
      Pn = ++reg;

      t0 = ++reg;         // Three registers which form a
      t1 = ++reg;         // triple-precision accumulator.
      t2 = ++reg;

      Ri = ++reg;         // Inner and outer loop indexes.
      Rj = ++reg;

      Rhi_ab = ++reg;     // Product registers: low and high parts
      Rlo_ab = ++reg;     // of a*b and m*n.
      Rhi_mn = ++reg;
      Rlo_mn = ++reg;

      // r19 and up are callee-saved.
      _toSave = RegSet::range(r19, reg) + Pm_base;
    }

  private:
    void save_regs() {
      push(_toSave, sp);
    }

    void restore_regs() {
      pop(_toSave, sp);
    }

    template <typename T>
    void unroll_2(Register count, T block) {
      Label loop, end, odd;
      tbnz(count, 0, odd);
      cbz(count, end);
      align(16);
      bind(loop);
      (this->*block)();
      bind(odd);
      (this->*block)();
      subs(count, count, 2);
      br(Assembler::GT, loop);
      bind(end);
    }

    template <typename T>
    void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
      Label loop, end, odd;
      tbnz(count, 0, odd);
      cbz(count, end);
      align(16);
      bind(loop);
      (this->*block)(d, s, tmp);
      bind(odd);
      (this->*block)(d, s, tmp);
      subs(count, count, 2);
      br(Assembler::GT, loop);
      bind(end);
    }

    void pre1(RegisterOrConstant i) {
      block_comment("pre1");
      // Pa = Pa_base;
      // Pb = Pb_base + i;
      // Pm = Pm_base;
      // Pn = Pn_base + i;
      // Ra = *Pa;
      // Rb = *Pb;
      // Rm = *Pm;
      // Rn = *Pn;
      ldr(Ra, Address(Pa_base));
      ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
      ldr(Rm, Address(Pm_base));
      ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
      lea(Pa, Address(Pa_base));
      lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
      lea(Pm, Address(Pm_base));
      lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));

      // Zero the m*n result.
      mov(Rhi_mn, zr);
      mov(Rlo_mn, zr);
    }

    // The core multiply-accumulate step of a Montgomery
    // multiplication. The idea is to schedule operations as a
    // pipeline so that instructions with long latencies (loads and
    // multiplies) have time to complete before their results are
    // used. This most benefits in-order implementations of the
    // architecture but out-of-order ones also benefit.
    void step() {
      block_comment("step");
      // MACC(Ra, Rb, t0, t1, t2);
      // Ra = *++Pa;
      // Rb = *--Pb;
      umulh(Rhi_ab, Ra, Rb);
      mul(Rlo_ab, Ra, Rb);
      ldr(Ra, pre(Pa, wordSize));
      ldr(Rb, pre(Pb, -wordSize));
      acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
                                       // previous iteration.
      // MACC(Rm, Rn, t0, t1, t2);
      // Rm = *++Pm;
      // Rn = *--Pn;
      umulh(Rhi_mn, Rm, Rn);
      mul(Rlo_mn, Rm, Rn);
      ldr(Rm, pre(Pm, wordSize));
      ldr(Rn, pre(Pn, -wordSize));
      acc(Rhi_ab, Rlo_ab, t0, t1, t2);
    }
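
    // For reference: in the C sketches in this file, MACC(A, B, t0, t1, t2)
    // multiply-accumulates the 128-bit product A*B into the
    // triple-precision accumulator t2:t1:t0, exactly as the
    // umulh/mul/acc sequence above does. A plausible C++ rendering
    // (a sketch only; assumes a compiler with unsigned __int128):
    //
    // static void MACC(unsigned long A, unsigned long B,
    //                  unsigned long &t0, unsigned long &t1,
    //                  unsigned long &t2) {
    //   unsigned __int128 product = (unsigned __int128)A * B;
    //   unsigned long lo = (unsigned long)product;
    //   unsigned long hi = (unsigned long)(product >> 64);
    //   t0 += lo;
    //   unsigned long c = (t0 < lo);    // carry out of t0
    //   t1 += hi;
    //   unsigned long c2 = (t1 < hi);   // carry out of t1 from hi
    //   t1 += c;
    //   c2 += (t1 < c);                 // carry out of t1 from c
    //   t2 += c2;
    // }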

    void post1() {
      block_comment("post1");

      // MACC(Ra, Rb, t0, t1, t2);
      // Ra = *++Pa;
      // Rb = *--Pb;
      umulh(Rhi_ab, Ra, Rb);
      mul(Rlo_ab, Ra, Rb);
      acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n
      acc(Rhi_ab, Rlo_ab, t0, t1, t2);

      // *Pm = Rm = t0 * inv;
      mul(Rm, t0, inv);
      str(Rm, Address(Pm));

      // MACC(Rm, Rn, t0, t1, t2);
      // t0 = t1; t1 = t2; t2 = 0;
      umulh(Rhi_mn, Rm, Rn);

#ifndef PRODUCT
      // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
      {
        mul(Rlo_mn, Rm, Rn);
        add(Rlo_mn, t0, Rlo_mn);
        Label ok;
        cbz(Rlo_mn, ok); {
          stop("broken Montgomery multiply");
        } bind(ok);
      }
#endif
      // We have very carefully set things up so that
      // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
      // the lower half of Rm * Rn because we know the result already:
      // it must be -t0. t0 + (-t0) must generate a carry iff
      // t0 != 0. So, rather than do a mul and an adds we just set
      // the carry flag iff t0 is nonzero.
      //
      // mul(Rlo_mn, Rm, Rn);
      // adds(zr, t0, Rlo_mn);
      subs(zr, t0, 1); // Set carry iff t0 is nonzero
      adcs(t0, t1, Rhi_mn);
      adc(t1, t2, zr);
      mov(t2, zr);
    }

    void pre2(RegisterOrConstant i, RegisterOrConstant len) {
      block_comment("pre2");
      // Pa = Pa_base + i-len;
      // Pb = Pb_base + len;
      // Pm = Pm_base + i-len;
      // Pn = Pn_base + len;

      if (i.is_register()) {
        sub(Rj, i.as_register(), len);
      } else {
        mov(Rj, i.as_constant());
        sub(Rj, Rj, len);
      }
      // Rj == i-len

      lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
      lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
      lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
      lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));

      // Ra = *++Pa;
      // Rb = *--Pb;
      // Rm = *++Pm;
      // Rn = *--Pn;
      ldr(Ra, pre(Pa, wordSize));
      ldr(Rb, pre(Pb, -wordSize));
      ldr(Rm, pre(Pm, wordSize));
      ldr(Rn, pre(Pn, -wordSize));

      mov(Rhi_mn, zr);
      mov(Rlo_mn, zr);
    }

    void post2(RegisterOrConstant i, RegisterOrConstant len) {
      block_comment("post2");
      if (i.is_constant()) {
        mov(Rj, i.as_constant()-len.as_constant());
      } else {
        sub(Rj, i.as_register(), len);
      }

      adds(t0, t0, Rlo_mn); // The pending m*n, low part

      // As soon as we know the least significant digit of our result,
      // store it.
      // Pm_base[i-len] = t0;
      str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));

      // t0 = t1; t1 = t2; t2 = 0;
      adcs(t0, t1, Rhi_mn); // The pending m*n, high part
      adc(t1, t2, zr);
      mov(t2, zr);
    }

    // A carry in t0 after Montgomery multiplication means that we
    // should subtract multiples of n from our result in m. We'll
    // keep doing that until there is no carry.
    void normalize(RegisterOrConstant len) {
      block_comment("normalize");
      // while (t0)
      //   t0 = sub(Pm_base, Pn_base, t0, len);
      Label loop, post, again;
      Register cnt = t1, i = t2; // Re-use registers; we're done with them now
      cbz(t0, post); {
        bind(again); {
          mov(i, zr);
          mov(cnt, len);
          ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
          ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
          subs(zr, zr, zr); // set carry flag, i.e. no borrow
          align(16);
          bind(loop); {
            sbcs(Rm, Rm, Rn);
            str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
            add(i, i, 1);
            ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
            ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
            sub(cnt, cnt, 1);
          } cbnz(cnt, loop);
          sbc(t0, t0, zr);
        } cbnz(t0, again);
      } bind(post);
    }
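
    // For reference, sub in the sketch above is the multi-word
    // subtraction m -= n that returns the incoming carry t0 minus the
    // final borrow. A hypothetical C++ rendering (a sketch only,
    // mirroring the sbcs loop above):
    //
    // static unsigned long sub(unsigned long Pm[], unsigned long Pn[],
    //                          unsigned long t0, int len) {
    //   unsigned long borrow = 0;
    //   for (int i = 0; i < len; i++) {
    //     unsigned long s = Pn[i] + borrow;
    //     unsigned long wrapped = (s < borrow); // Pn[i] + borrow overflowed
    //     borrow = wrapped | (Pm[i] < s);       // borrow out of this digit
    //     Pm[i] = Pm[i] - s;
    //   }
    //   return t0 - borrow; // one fewer pending carry if we borrowed
    // }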

    // Move memory at s to d, reversing words.
    //    Increments d to end of copied memory
    //    Destroys tmp1, tmp2
    //    Preserves len
    //    Leaves s pointing to the address which was in d at start
    void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
      assert(tmp1 < r19 && tmp2 < r19, "register corruption");

      lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
      mov(tmp1, len);
      unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
      sub(s, d, len, ext::uxtw, LogBytesPerWord);
    }
    // where
    void reverse1(Register d, Register s, Register tmp) {
      ldr(tmp, pre(s, -wordSize));
      ror(tmp, tmp, 32);
      str(tmp, post(d, wordSize));
    }
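
    // In C++, approximately (a sketch: the 64-bit words are copied in
    // reverse order, and rotating each word by 32 bits swaps its two
    // 32-bit halves, so the net effect is to reverse the order of the
    // 32-bit digits):
    //
    // static void reverse(unsigned long d[], unsigned long s[], int len) {
    //   for (int i = 0; i < len; i++) {
    //     unsigned long w = s[len - 1 - i];
    //     d[i] = (w << 32) | (w >> 32);
    //   }
    // }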

    void step_squaring() {
      // An extra ACC
      step();
      acc(Rhi_ab, Rlo_ab, t0, t1, t2);
    }

    void last_squaring(RegisterOrConstant i) {
      Label dont;
      // if ((i & 1) == 0) {
      tbnz(i.as_register(), 0, dont); {
        // MACC(Ra, Rb, t0, t1, t2);
        // Ra = *++Pa;
        // Rb = *--Pb;
        umulh(Rhi_ab, Ra, Rb);
        mul(Rlo_ab, Ra, Rb);
        acc(Rhi_ab, Rlo_ab, t0, t1, t2);
      } bind(dont);
    }

    void extra_step_squaring() {
      acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n

      // MACC(Rm, Rn, t0, t1, t2);
      // Rm = *++Pm;
      // Rn = *--Pn;
      umulh(Rhi_mn, Rm, Rn);
      mul(Rlo_mn, Rm, Rn);
      ldr(Rm, pre(Pm, wordSize));
      ldr(Rn, pre(Pn, -wordSize));
    }

    void post1_squaring() {
      acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n

      // *Pm = Rm = t0 * inv;
      mul(Rm, t0, inv);
      str(Rm, Address(Pm));

      // MACC(Rm, Rn, t0, t1, t2);
      // t0 = t1; t1 = t2; t2 = 0;
      umulh(Rhi_mn, Rm, Rn);

#ifndef PRODUCT
      // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
      {
        mul(Rlo_mn, Rm, Rn);
        add(Rlo_mn, t0, Rlo_mn);
        Label ok;
        cbz(Rlo_mn, ok); {
          stop("broken Montgomery multiply");
        } bind(ok);
      }
#endif
      // We have very carefully set things up so that
      // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
      // the lower half of Rm * Rn because we know the result already:
      // it must be -t0. t0 + (-t0) must generate a carry iff
      // t0 != 0. So, rather than do a mul and an adds we just set
      // the carry flag iff t0 is nonzero.
      //
      // mul(Rlo_mn, Rm, Rn);
      // adds(zr, t0, Rlo_mn);
      subs(zr, t0, 1); // Set carry iff t0 is nonzero
      adcs(t0, t1, Rhi_mn);
      adc(t1, t2, zr);
      mov(t2, zr);
    }

    void acc(Register Rhi, Register Rlo,
             Register t0, Register t1, Register t2) {
      adds(t0, t0, Rlo);
      adcs(t1, t1, Rhi);
      adc(t2, t2, zr);
    }
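
    // In other words, approximately: (t2:t1:t0) += (Rhi:Rlo), adding a
    // double-word value into the triple-precision accumulator and
    // propagating carries through t1 into t2.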

  public:
    /**
     * Fast Montgomery multiplication. The derivation of the
     * algorithm is in A Cryptographic Library for the Motorola
     * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
     *
     * Arguments:
     *
     * Inputs for multiplication:
     *   c_rarg0   - int array elements a
     *   c_rarg1   - int array elements b
     *   c_rarg2   - int array elements n (the modulus)
     *   c_rarg3   - int length
     *   c_rarg4   - int inv
     *   c_rarg5   - int array elements m (the result)
     *
     * Inputs for squaring:
     *   c_rarg0   - int array elements a
     *   c_rarg1   - int array elements n (the modulus)
     *   c_rarg2   - int length
     *   c_rarg3   - int inv
     *   c_rarg4   - int array elements m (the result)
     *
     */
    address generate_multiply() {
      Label argh, nothing;
      bind(argh);
      stop("MontgomeryMultiply total_allocation must be <= 8192");

      align(CodeEntryAlignment);
      address entry = pc();

      cbzw(Rlen, nothing);

      enter();

      // Make room.
      cmpw(Rlen, 512);
      br(Assembler::HI, argh);
      sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
      andr(sp, Ra, -2 * wordSize);

      lsrw(Rlen, Rlen, 1); // length in longwords = len/2

      {
        // Copy input args, reversing as we go. We use Ra as a
        // temporary variable.
        reverse(Ra, Pa_base, Rlen, t0, t1);
        if (!_squaring)
          reverse(Ra, Pb_base, Rlen, t0, t1);
        reverse(Ra, Pn_base, Rlen, t0, t1);
      }

      // Push all callee-saved registers and also Pm_base which we'll need
      // at the end.
      save_regs();

#ifndef PRODUCT
      // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
      {
        ldr(Rn, Address(Pn_base, 0));
        mul(Rlo_mn, Rn, inv);
        cmp(Rlo_mn, -1);
        Label ok;
        br(EQ, ok); {
          stop("broken inverse in Montgomery multiply");
        } bind(ok);
      }
#endif

      mov(Pm_base, Ra);

      mov(t0, zr);
      mov(t1, zr);
      mov(t2, zr);

      block_comment("for (int i = 0; i < len; i++) {");
      mov(Ri, zr); {
        Label loop, end;
        cmpw(Ri, Rlen);
        br(Assembler::GE, end);

        bind(loop);
        pre1(Ri);

        block_comment("  for (j = i; j; j--) {"); {
          movw(Rj, Ri);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
        } block_comment("  } // j");

        post1();
        addw(Ri, Ri, 1);
        cmpw(Ri, Rlen);
        br(Assembler::LT, loop);
        bind(end);
        block_comment("} // i");
      }

      block_comment("for (int i = len; i < 2*len; i++) {");
      mov(Ri, Rlen); {
        Label loop, end;
        cmpw(Ri, Rlen, Assembler::LSL, 1);
        br(Assembler::GE, end);

        bind(loop);
        pre2(Ri, Rlen);

        block_comment("  for (j = len*2-i-1; j; j--) {"); {
          lslw(Rj, Rlen, 1);
          subw(Rj, Rj, Ri);
          subw(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
        } block_comment("  } // j");

        post2(Ri, Rlen);
        addw(Ri, Ri, 1);
        cmpw(Ri, Rlen, Assembler::LSL, 1);
        br(Assembler::LT, loop);
        bind(end);
      }
      block_comment("} // i");

      normalize(Rlen);

      mov(Ra, Pm_base);  // Save Pm_base in Ra
      restore_regs();    // Restore caller's Pm_base

      // Copy our result into caller's Pm_base
      reverse(Pm_base, Ra, Rlen, t0, t1);

      leave();
      bind(nothing);
      ret(lr);

      return entry;
    }
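
    // Note: on entry Rlen counts 32-bit int digits (the arguments are
    // int arrays, per the comment above); the lsrw halves it so that
    // the loops work in 64-bit longwords. The C sketch that follows is
    // therefore written in terms of longword digits.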
    // In C, approximately:

    // void
    // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
    //                     unsigned long Pn_base[], unsigned long Pm_base[],
    //                     unsigned long inv, int len) {
    //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
    //   unsigned long *Pa, *Pb, *Pn, *Pm;
    //   unsigned long Ra, Rb, Rn, Rm;

    //   int i;

    //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");

    //   for (i = 0; i < len; i++) {
    //     int j;

    //     Pa = Pa_base;
    //     Pb = Pb_base + i;
    //     Pm = Pm_base;
    //     Pn = Pn_base + i;

    //     Ra = *Pa;
    //     Rb = *Pb;
    //     Rm = *Pm;
    //     Rn = *Pn;

    //     int iters = i;
    //     for (j = 0; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
    //       MACC(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }

    //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
    //     MACC(Ra, Rb, t0, t1, t2);
    //     *Pm = Rm = t0 * inv;
    //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
    //     MACC(Rm, Rn, t0, t1, t2);

    //     assert(t0 == 0, "broken Montgomery multiply");

    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   for (i = len; i < 2*len; i++) {
    //     int j;

    //     Pa = Pa_base + i-len;
    //     Pb = Pb_base + len;
    //     Pm = Pm_base + i-len;
    //     Pn = Pn_base + len;

    //     Ra = *++Pa;
    //     Rb = *--Pb;
    //     Rm = *++Pm;
    //     Rn = *--Pn;

    //     int iters = len*2-i-1;
    //     for (j = i-len+1; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
    //       MACC(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }

    //     Pm_base[i-len] = t0;
    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   while (t0)
    //     t0 = sub(Pm_base, Pn_base, t0, len);
    // }

    /**
     * Fast Montgomery squaring. This uses asymptotically 25% fewer
     * multiplies than Montgomery multiplication so it should be up to
     * 25% faster. However, its loop control is more complex and it
     * may actually run slower on some machines.
     *
     * Arguments:
     *
     * Inputs:
     *   c_rarg0   - int array elements a
     *   c_rarg1   - int array elements n (the modulus)
     *   c_rarg2   - int length
     *   c_rarg3   - int inv
     *   c_rarg4   - int array elements m (the result)
     *
     */
    address generate_square() {
      Label argh;
      bind(argh);
      stop("MontgomeryMultiply total_allocation must be <= 8192");

      align(CodeEntryAlignment);
      address entry = pc();

      enter();

      // Make room.
      cmpw(Rlen, 512);
      br(Assembler::HI, argh);
      sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
      andr(sp, Ra, -2 * wordSize);

      lsrw(Rlen, Rlen, 1); // length in longwords = len/2

      {
        // Copy input args, reversing as we go. We use Ra as a
        // temporary variable.
        reverse(Ra, Pa_base, Rlen, t0, t1);
        reverse(Ra, Pn_base, Rlen, t0, t1);
      }

      // Push all callee-saved registers and also Pm_base which we'll need
      // at the end.
      save_regs();

      mov(Pm_base, Ra);

      mov(t0, zr);
      mov(t1, zr);
      mov(t2, zr);

      block_comment("for (int i = 0; i < len; i++) {");
      mov(Ri, zr); {
        Label loop, end;
        bind(loop);
        cmp(Ri, Rlen);
        br(Assembler::GE, end);

        pre1(Ri);

        block_comment("  for (j = (i+1)/2; j; j--) {"); {
          add(Rj, Ri, 1);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
        } block_comment("  } // j");

        last_squaring(Ri);

        block_comment("  for (j = i/2; j; j--) {"); {
          lsr(Rj, Ri, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
        } block_comment("  } // j");

        post1_squaring();
        add(Ri, Ri, 1);
        cmp(Ri, Rlen);
        br(Assembler::LT, loop);

        bind(end);
        block_comment("} // i");
      }

      block_comment("for (int i = len; i < 2*len; i++) {");
      mov(Ri, Rlen); {
        Label loop, end;
        bind(loop);
        cmp(Ri, Rlen, Assembler::LSL, 1);
        br(Assembler::GE, end);

        pre2(Ri, Rlen);

        block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
          lsl(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          sub(Rj, Rj, 1);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
        } block_comment("  } // j");

        last_squaring(Ri);

        block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
          lsl(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
        } block_comment("  } // j");

        post2(Ri, Rlen);
        add(Ri, Ri, 1);
        cmp(Ri, Rlen, Assembler::LSL, 1);

        br(Assembler::LT, loop);
        bind(end);
        block_comment("} // i");
      }

      normalize(Rlen);

      mov(Ra, Pm_base);  // Save Pm_base in Ra
      restore_regs();    // Restore caller's Pm_base

      // Copy our result into caller's Pm_base
      reverse(Pm_base, Ra, Rlen, t0, t1);

      leave();
      ret(lr);

      return entry;
    }
    // In C, approximately:

    // void
    // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
    //                   unsigned long Pm_base[], unsigned long inv, int len) {
    //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
    //   unsigned long *Pa, *Pb, *Pn, *Pm;
    //   unsigned long Ra, Rb, Rn, Rm;

    //   int i;

    //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");

    //   for (i = 0; i < len; i++) {
    //     int j;

    //     Pa = Pa_base;
    //     Pb = Pa_base + i;
    //     Pm = Pm_base;
    //     Pn = Pn_base + i;

    //     Ra = *Pa;
    //     Rb = *Pb;
    //     Rm = *Pm;
    //     Rn = *Pn;

    //     int iters = (i+1)/2;
    //     for (j = 0; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
    //       MACC2(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     if ((i & 1) == 0) {
    //       assert(Ra == Pa_base[j], "must be");
    //       MACC(Ra, Ra, t0, t1, t2);
    //     }
    //     iters = i/2;
    //     assert(iters == i-j, "must be");
    //     for (; iters--; j++) {
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }

    //     *Pm = Rm = t0 * inv;
    //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
    //     MACC(Rm, Rn, t0, t1, t2);

    //     assert(t0 == 0, "broken Montgomery multiply");

    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   for (i = len; i < 2*len; i++) {
    //     int start = i-len+1;
    //     int end = start + (len - start)/2;
    //     int j;

    //     Pa = Pa_base + i-len;
    //     Pb = Pa_base + len;
    //     Pm = Pm_base + i-len;
    //     Pn = Pn_base + len;

    //     Ra = *++Pa;
    //     Rb = *--Pb;
    //     Rm = *++Pm;
    //     Rn = *--Pn;

    //     int iters = (2*len-i-1)/2;
    //     assert(iters == end-start, "must be");
    //     for (j = start; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
    //       MACC2(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     if ((i & 1) == 0) {
    //       assert(Ra == Pa_base[j], "must be");
    //       MACC(Ra, Ra, t0, t1, t2);
    //     }
    //     iters = (2*len-i)/2;
    //     assert(iters == len-j, "must be");
    //     for (; iters--; j++) {
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     Pm_base[i-len] = t0;
    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   while (t0)
    //     t0 = sub(Pm_base, Pn_base, t0, len);
    // }
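
    // For reference, MACC2 in the sketch above is the squaring variant
    // of MACC: it accumulates 2*A*B (the two symmetric cross terms of
    // the square) into t2:t1:t0, which step_squaring() achieves by
    // calling acc() a second time on the same product. A plausible C++
    // rendering (a sketch only):
    //
    // static void MACC2(unsigned long A, unsigned long B,
    //                   unsigned long &t0, unsigned long &t1,
    //                   unsigned long &t2) {
    //   MACC(A, B, t0, t1, t2);
    //   MACC(A, B, t0, t1, t2);
    // }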

  };

  // Initialization
  void generate_initial() {
    // Generate initial stubs and initialize the entry points

    // Entry points that exist in all platforms. Note: this is code
    // that could be shared among different platforms; however, the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also comment in
    // stubRoutines.hpp.

    StubRoutines::_forward_exception_entry = generate_forward_exception();

    StubRoutines::_call_stub_entry =
      generate_call_stub(StubRoutines::_call_stub_return_address);

    // is referenced by megamorphic call
    StubRoutines::_catch_exception_entry = generate_catch_exception();

    // Build this early so it's available for the interpreter.
    StubRoutines::_throw_StackOverflowError_entry =
      generate_throw_exception("StackOverflowError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_StackOverflowError));
    if (UseCRC32Intrinsics) {
      // Set the table address before generating the stub, which uses it.
      StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
    }
  }

  void generate_all() {
    // support for verify_oop (must happen after universe_init)
    StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
    StubRoutines::_throw_AbstractMethodError_entry =
      generate_throw_exception("AbstractMethodError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_AbstractMethodError));

    StubRoutines::_throw_IncompatibleClassChangeError_entry =
      generate_throw_exception("IncompatibleClassChangeError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_IncompatibleClassChangeError));

    StubRoutines::_throw_NullPointerException_at_call_entry =
      generate_throw_exception("NullPointerException at call throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_NullPointerException_at_call));

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }

    if (UseMontgomeryMultiplyIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }

    if (UseMontgomerySquareIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      // We use generate_multiply() rather than generate_square()
      // because it's faster for the sizes of modulus we care about.
      StubRoutines::_montgomerySquare = g.generate_multiply();
    }

    if (UseShenandoahGC && ShenandoahWriteBarrier) {
      StubRoutines::aarch64::_shenandoah_wb = generate_shenandoah_wb(false, true);
      StubRoutines::_shenandoah_wb_C = generate_shenandoah_wb(true, false);
    }

#ifndef BUILTIN_SIM
    if (UseAESIntrinsics) {
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
    }

    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress");
      StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
    }

    // Safefetch stubs.
    generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
                       &StubRoutines::_safefetch32_fault_pc,
                       &StubRoutines::_safefetch32_continuation_pc);
    generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
                       &StubRoutines::_safefetchN_fault_pc,
                       &StubRoutines::_safefetchN_continuation_pc);
#endif
  }

 public:
  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
    if (all) {
      generate_all();
    } else {
      generate_initial();
    }
  }
}; // end class declaration

void StubGenerator_generate(CodeBuffer* code, bool all) {
  StubGenerator g(code, all);
}