/*
 * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "gc/shenandoah/brooksPointer.hpp"
#include "gc/shenandoah/shenandoahBarrierSet.hpp"
#include "gc/shenandoah/shenandoahHeap.hpp"
#include "gc/shenandoah/shenandoahHeapRegion.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "utilities/align.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif

#ifdef BUILTIN_SIM
#include "../../../../../../simulator/simulator.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->
#define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(int& counter) {
    __ lea(rscratch2, ExternalAddress((address)&counter));
    __ ldrw(rscratch1, Address(rscratch2));
    __ addw(rscratch1, rscratch1, 1);
    __ strw(rscratch1, Address(rscratch2));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    c_rarg6:   parameter size (in words)              int
  //    c_rarg7:   thread                                 Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the c arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -27 [ argument word 1      ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper    (r0) ]
  //  -7 [ result          (r1) ]
  //  -6 [ result type     (r2) ]
  //  -5 [ method          (r3) ]
  //  -4 [ entry point     (r4) ]
  //  -3 [ parameters      (r5) ]
  //  -2 [ parameter size  (r6) ]
  //  -1 [ thread          (r7) ]
  //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  //   1 [ saved lr       (r30) ]

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off  = -26,

    d15_off            = -26,
    d13_off            = -24,
    d11_off            = -22,
    d9_off             = -20,

    r28_off            = -18,
    r26_off            = -16,
    r24_off            = -14,
    r22_off            = -12,
    r20_off            = -10,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0,
    retaddr_off        =   1,
  };
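
  // Illustrative only (not compiled): read at negative word offsets
  // from fp, the save area above corresponds roughly to a C struct
  // like the following (field names here are descriptive, not real):
  //
  //   struct entry_frame {          // word offset from fp
  //     double   v15_v8_save[8];    // -26 .. -19
  //     intptr_t r28_r19_save[10];  // -18 ..  -9
  //     intptr_t call_wrapper;      //  -8
  //     intptr_t result;            //  -7
  //     intptr_t result_type;       //  -6
  //     intptr_t method;            //  -5
  //     intptr_t entry_point;       //  -4
  //     intptr_t parameters;        //  -3
  //     intptr_t parameter_size;    //  -2
  //     intptr_t thread;            //  -1
  //     intptr_t saved_fp;          //   0   <--- fp points here
  //     intptr_t saved_lr;          //   1
  //   };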

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    const Address sp_after_call(rfp, sp_after_call_off * wordSize);

    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off  * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);

    // stub code

    // we need a C prolog to bootstrap the x86 caller into the sim
    __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);

    address aarch64_entry = __ pc();

#ifdef BUILTIN_SIM
    // Save sender's SP for stack traces.
    __ mov(rscratch1, sp);
    __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
#endif
    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5,  entry_point);
    __ stp(c_rarg2, c_rarg3,  result_type);
    __ stp(c_rarg0, c_rarg1,  call_wrapper);

    __ stp(r20, r19,   r20_save);
    __ stp(r22, r21,   r22_save);
    __ stp(r24, r23,   r24_save);
    __ stp(r26, r25,   r26_save);
    __ stp(r28, r27,   r28_save);

    __ stpd(v9,  v8,   d9_save);
    __ stpd(v11, v10,  d11_save);
    __ stpd(v13, v12,  d13_save);
    __ stpd(v15, v14,  d15_save);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (unsigned)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);
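
    // Illustrative only (not compiled): with the parameter pointer in
    // c_rarg5 and the word count in c_rarg6, the loop above is roughly
    //
    //   do {
    //     *--sp = *params++;   // push next parameter word
    //   } while (--count > 0);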

    // call Java entry -- passing methodOop, and current sp
    //      rmethod: Method*
    //      r13: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r13, sp);
    __ blr(c_rarg4);

    // tell the simulator we have returned to the stub

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    if (NotifySimulator) {
      __ notify(Assembler::method_reentry);
    }
    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    __ ldr(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ldr(j_rarg1, result_type);
    __ cmp(j_rarg1, T_OBJECT);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(j_rarg1, T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    // restore callee-save registers
    __ ldpd(v15, v14,   d15_save);
    __ ldpd(v13, v12,   d13_save);
    __ ldpd(v11, v10,   d11_save);
    __ ldpd(v9,  v8,    d9_save);

    __ ldp(r28, r27,   r28_save);
    __ ldp(r26, r25,   r26_save);
    __ ldp(r24, r23,   r24_save);
    __ ldp(r22, r21,   r22_save);
    __ ldp(r20, r19,   r20_save);

    __ ldp(c_rarg0, c_rarg1,  call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3,  method);
    __ ldp(c_rarg4, c_rarg5,  entry_point);
    __ ldp(c_rarg6, c_rarg7,  parameter_size);

#ifndef PRODUCT
    // tell the simulator we are about to end Java execution
    if (NotifySimulator) {
      __ notify(Assembler::method_exit);
    }
#endif
    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT

    __ BIND(is_long);
    __ str(r0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    return start;
  }
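
  // Illustrative only (not compiled): the typed result store in the
  // stub above behaves roughly like
  //
  //   switch (result_type) {
  //   case T_OBJECT: case T_LONG:  *(jlong*)result   = r0;      break;
  //   case T_FLOAT:                *(jfloat*)result  = j_farg0; break;
  //   case T_DOUBLE:               *(jdouble*)result = j_farg0; break;
  //   default:                     *(jint*)result    = (jint)r0; // T_INT et al.
  //   }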

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // rsp.
  //
  // r0: exception oop

  // NOTE: this is used as a target from the signal handler so it
  // needs an x86 prolog which returns into the current simulator
  // executing the generated catch_exception code. so the prolog
  // needs to install rax in a sim register and adjust the sim's
  // restart pc to enter the generated code at the start position
  // then return from native to simulated execution.

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread       (rfp, thread_off        * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }
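
  // Illustrative only (not compiled): the three stores above amount to
  // the usual C++ idiom for posting a pending exception on the thread:
  //
  //   thread->set_pending_exception(exception_oop, __FILE__, __LINE__);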

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);
    __ mov(r19, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }
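
  // Illustrative only (not compiled): end to end, the stub above is
  // roughly equivalent to
  //
  //   address handler = SharedRuntime::exception_handler_for_return_address(thread, lr);
  //   r0 = thread->pending_exception();   // exception oop
  //   thread->clear_pending_exception();
  //   r3 = lr;                            // throwing pc
  //   goto handler;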

  // Shenandoah write barrier.
  //
  // Input:
  //   r0: OOP to evacuate.  Not null.
  //
  // Output:
  //   r0: Pointer to evacuated OOP.
  //
  // Trashes rscratch1 and rscratch2.  Preserves everything else.

  address generate_shenandoah_wb(bool c_abi, bool do_cset_test) {
    StubCodeMark mark(this, "StubRoutines", "shenandoah_wb");

    __ align(6);
    address start = __ pc();

    if (do_cset_test) {
      Label work;
      __ mov(rscratch2, ShenandoahHeap::in_cset_fast_test_addr());
      __ lsr(rscratch1, r0, ShenandoahHeapRegion::region_size_bytes_shift_jint());
      __ ldrb(rscratch2, Address(rscratch2, rscratch1));
      __ tbnz(rscratch2, 0, work);
      __ ret(lr);
      __ bind(work);
    }

    Register obj = r0;

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    if (!c_abi) {
      __ push_call_clobbered_registers();
    } else {
      __ push_call_clobbered_fp_registers();
    }

    __ mov(lr, CAST_FROM_FN_PTR(address, ShenandoahBarrierSet::write_barrier_JRT));
    __ blrt(lr, 1, 0, MacroAssembler::ret_type_integral);
    if (!c_abi) {
      __ mov(rscratch1, obj);
      __ pop_call_clobbered_registers();
      __ mov(obj, rscratch1);
    } else {
      __ pop_call_clobbered_fp_registers();
    }

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }
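
  // Illustrative only (not compiled): the fast-path test above checks
  // one byte of the collection-set map, indexed by heap region number:
  //
  //   jbyte* cset_map = (jbyte*)ShenandoahHeap::in_cset_fast_test_addr();
  //   size_t region   = (uintptr_t)obj >> ShenandoahHeapRegion::region_size_bytes_shift_jint();
  //   if (!(cset_map[region] & 1))
  //     return obj;   // not in the collection set: nothing to do
  //   // else fall through to the slow path, which calls
  //   // ShenandoahBarrierSet::write_barrier_JRT(obj) and returns the
  //   // (possibly new) location of the object in r0.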

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {

    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is NULL it is OK

    // Check if the oop is in the right area of memory
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
    __ andr(c_rarg2, r0, c_rarg3);
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());

    // Compare c_rarg2 and c_rarg3.  We don't use a compare
    // instruction here because the flags register is live.
    __ eor(c_rarg2, c_rarg2, c_rarg3);
    __ cbnz(c_rarg2, error);

    // make sure klass is 'reasonable', i.e. not zero.
    __ load_klass(r0, r0);  // get klass
    __ cbz(r0, error);      // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blrt(rscratch1, 3, 0, 1);

    return start;
  }
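
  // Illustrative only (not compiled): the mask/bits test above is the
  // standard plausibility check
  //
  //   if (((uintptr_t)obj & Universe::verify_oop_mask()) != Universe::verify_oop_bits())
  //     goto error;   // oop lies outside the expected heap range
  //
  // done with eor/cbnz rather than cmp/br so that the condition flags,
  // which may be live at the call site, are left untouched.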

  void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }

  // Generate code for an array write pre barrier
  //
  //     addr       - starting address
  //     count      - element count
  //     tmp        - scratch register
  //     saved_regs - registers to be saved before calling static_write_ref_array_pre
  //
  //     Callers must specify which registers to preserve in saved_regs.
  //     Clobbers:  r0-r18, v0-v7, v16-v31, except saved_regs.
  //
  void gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized, RegSet saved_regs) {
    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
    case BarrierSet::G1SATBCTLogging:
    case BarrierSet::Shenandoah:
      // Don't generate the call if we statically know that the target is uninitialized
      if (!dest_uninitialized) {
        __ push(saved_regs, sp);
        if (count == c_rarg0) {
          if (addr == c_rarg1) {
            // exactly backwards!!
            __ mov(rscratch1, c_rarg0);
            __ mov(c_rarg0, c_rarg1);
            __ mov(c_rarg1, rscratch1);
          } else {
            __ mov(c_rarg1, count);
            __ mov(c_rarg0, addr);
          }
        } else {
          __ mov(c_rarg0, addr);
          __ mov(c_rarg1, count);
        }
        __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2);
        __ pop(saved_regs, sp);
      }
      break;
    case BarrierSet::CardTableForRS:
    case BarrierSet::CardTableExtension:
    case BarrierSet::ModRef:
      break;
    default:
      ShouldNotReachHere();

    }
  }

  //
  // Generate code for an array write post barrier
  //
  //  Input:
  //     start      - register containing starting address of destination array
  //     end        - register containing ending address of destination array
  //     scratch    - scratch register
  //     saved_regs - registers to be saved before calling static_write_ref_array_post
  //
  //  The input registers are overwritten.
  //  The ending address is inclusive.
  //  Callers must specify which registers to preserve in saved_regs.
  //  Clobbers:  r0-r18, v0-v7, v16-v31, except saved_regs.
  void gen_write_ref_array_post_barrier(Register start, Register end, Register scratch, RegSet saved_regs) {
    assert_different_registers(start, end, scratch);
    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
    case BarrierSet::G1SATBCTLogging:
    case BarrierSet::Shenandoah:
      {
        __ push(saved_regs, sp);
        // must compute element count unless barrier set interface is changed (other platforms supply count)
        __ lea(scratch, Address(end, BytesPerHeapOop));
        __ sub(scratch, scratch, start);               // subtract start to get #bytes
        __ lsr(scratch, scratch, LogBytesPerHeapOop);  // convert to element count
        __ mov(c_rarg0, start);
        __ mov(c_rarg1, scratch);
        __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
        __ pop(saved_regs, sp);
      }
      break;
    case BarrierSet::CardTableForRS:
    case BarrierSet::CardTableExtension:
      {
        CardTableModRefBS* ct = (CardTableModRefBS*)bs;
        assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");

        Label L_loop;

        __ lsr(start, start, CardTableModRefBS::card_shift);
        __ lsr(end, end, CardTableModRefBS::card_shift);
        __ sub(end, end, start); // number of bytes to copy

        const Register count = end; // 'end' register contains bytes count now
        __ load_byte_map_base(scratch);
        __ add(start, start, scratch);
        if (UseConcMarkSweepGC) {
          __ membar(__ StoreStore);
        }
        __ BIND(L_loop);
        __ strb(zr, Address(start, count));
        __ subs(count, count, 1);
        __ br(Assembler::GE, L_loop);
      }
      break;
    default:
      ShouldNotReachHere();

    }
  }
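
  // Illustrative only (not compiled): the card-table arm above dirties
  // one byte per 2^card_shift-byte card spanned by [start, end]:
  //
  //   jbyte* base = /* byte_map_base */;
  //   for (uintptr_t card = start >> card_shift; card <= end >> card_shift; card++)
  //     base[card] = 0;   // 0 == dirty card value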

  // The inner part of zero_words().  This is the bulk operation,
  // zeroing words in blocks, possibly using DC ZVA to do it.  The
  // caller is responsible for zeroing the last few words.
  //
  // Inputs:
  // r10: the HeapWord-aligned base address of an array to zero.
  // r11: the count in HeapWords, r11 > 0.
  //
  // Returns r10 and r11, adjusted for the caller to clear.
  // r10: the base address of the tail of words left to clear.
  // r11: the number of words in the tail.
  //      r11 < MacroAssembler::zero_words_block_size.

  address generate_zero_blocks() {
    Label store_pair, loop_store_pair, done;
    Label base_aligned;

    Register base = r10, cnt = r11;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "zero_blocks");
    address start = __ pc();

    if (UseBlockZeroing) {
      int zva_length = VM_Version::zva_length();

      // Ensure ZVA length can be divided by 16. This is required by
      // the subsequent operations.
      assert (zva_length % 16 == 0, "Unexpected ZVA Length");

      __ tbz(base, 3, base_aligned);
      __ str(zr, Address(__ post(base, 8)));
      __ sub(cnt, cnt, 1);
      __ bind(base_aligned);

      // Ensure count >= zva_length * 2 so that it still deserves a zva after
      // alignment.
      Label small;
      int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
      __ subs(rscratch1, cnt, low_limit >> 3);
      __ br(Assembler::LT, small);
      __ zero_dcache_blocks(base, cnt);
      __ bind(small);
    }

    {
      // Number of stp instructions we'll unroll
      const int unroll =
        MacroAssembler::zero_words_block_size / 2;
      // Clear the remaining blocks.
      Label loop;
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::LT, done);
      __ bind(loop);
      for (int i = 0; i < unroll; i++)
        __ stp(zr, zr, __ post(base, 16));
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::GE, loop);
      __ bind(done);
      __ add(cnt, cnt, unroll * 2);
    }

    __ ret(lr);

    return start;
  }
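
  // Illustrative only (not compiled): ignoring the DC ZVA fast path,
  // the unrolled loop above implements
  //
  //   uint64_t* base; size_t cnt;        // r10, r11
  //   while (cnt >= 2 * unroll) {        // one iteration = unroll stp's
  //     for (int i = 0; i < unroll; i++) {
  //       base[0] = base[1] = 0;         // stp zr, zr, [base], #16
  //       base += 2;
  //     }
  //     cnt -= 2 * unroll;
  //   }
  //   // base/cnt now describe the tail for the caller to clear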


  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(Label &start, Register s, Register d, Register count,
                           copy_direction direction) {
    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;

    int offset;
    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r10, t6 = r11, t7 = r12;
    const Register stride = r13;

    assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1);

    Label again, drain;
    const char *stub_name;
    if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
    else
      stub_name = "backward_copy_longs";
    StubCodeMark mark(this, "StubRoutines", stub_name);
    __ align(CodeEntryAlignment);
    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, 8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
        __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
      } else {
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        __ stp(t0, t1, Address(d, 2 * unit));
        __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
      //
      // when backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.

      if (direction == copy_forwards) {
        __ sub(s, s, 16);
        __ sub(d, d, 8);
      }

      // Fill 8 registers
      //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 bit block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
      //
      // t0 at offset 0,  t1 at offset 8
      // t2 at offset 16, t3 at offset 24
      // t4 at offset 32, t5 at offset 40
      // t6 at offset 48, t7 at offset 56

      // for backwards copy s was not offset so the register contents
      // are at these offsets into the preceding 64 byte block
      // relative to that original input and so on for each successive
      // preceding 64 byte block when s is updated. this explains the
      // slightly counter-intuitive looking pattern of register usage
      // in the stp instructions for backwards copy.
      //
      // t0 at offset -16, t1 at offset -8
      // t2 at offset -32, t3 at offset -24
      // t4 at offset -48, t5 at offset -40
      // t6 at offset -64, t7 at offset -56

      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));

      __ subs(count, count, 16);
      __ br(Assembler::LO, drain);

      int prefetch = PrefetchCopyIntervalInBytes;
      bool use_stride = false;
      if (direction == copy_backwards) {
        use_stride = prefetch > 256;
        prefetch = -prefetch;
        if (use_stride) __ mov(stride, prefetch);
      }

      __ bind(again);

      if (PrefetchCopyIntervalInBytes > 0)
        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
        // allowing for the offset of -8 the store instructions place
        // registers into the target 64 bit block at the following
        // offsets
        //
        // t0 at offset 0
        // t1 at offset 8,  t2 at offset 16
        // t3 at offset 24, t4 at offset 32
        // t5 at offset 40, t6 at offset 48
        // t7 at offset 56

        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
        // d was not offset when we started so the registers are
        // written into the 64 bit block preceding d with the following
        // offsets
        //
        // t1 at offset -8
        // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
        // t7 at offset -56, t4 at offset -48
        // t6 at offset -64
        //
        // note that this matches the offsets previously noted for the
        // loads

        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
      } else {
        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
      }
      // now we need to copy any remaining part block which may
      // include a 4 word block subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ stp(t1, t2, Address(d, 2 * unit));
          __ str(t3, Address(__ pre(d, 4 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ stp(t3, t0, Address(d, 3 * unit));
          __ str(t2, Address(__ pre(d, 4 * unit)));
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ str(t1, Address(__ pre(d, 2 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ str(t0, Address(__ pre(d, 2 * unit)));
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;
    int unit = wordSize * direction;

    Label Lpair, Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.

    __ tbz(count, 3 - exact_log2(granularity), Lword);
    __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
    __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }
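
  // Illustrative only (not compiled): for a byte copy (granularity 1,
  // forwards) the bit tests above decompose the <16-byte tail as
  //
  //   if (count & 8) { copy 8 bytes; }
  //   if (count & 4) { copy 4 bytes; }
  //   if (count & 2) { copy 2 bytes; }
  //   if (count & 1) { copy 1 byte;  }
  //
  // so every possible tail length 0..15 is handled with at most four
  // load/store pairs and no loop.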

  Label copy_f, copy_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //

  void copy_memory(bool is_aligned, Register s, Register d,
                   Register count, Register tmp, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    // <= 96 bytes do inline. Direction doesn't matter because we always
    // load all the data before writing anything
    Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
    const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
    const Register send = r17, dend = r18;

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
    __ cmp(count, (UseSIMDForMemoryOps ? 96:80)/granularity);
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, 16/granularity);
    __ br(Assembler::LS, copy16);

    __ cmp(count, 64/granularity);
    __ br(Assembler::HI, copy80);

    __ cmp(count, 32/granularity);
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 0));
      __ ldpq(v2, v3, Address(send, -32));
      __ stpq(v0, v1, Address(d, 0));
      __ stpq(v2, v3, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(send, -32));
      __ ldp(t6, t7, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(dend, -32));
      __ stp(t6, t7, Address(dend, -16));
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    __ ldp(t0, t1, Address(s, 0));
    __ ldp(t2, t3, Address(send, -16));
    __ stp(t0, t1, Address(d, 0));
    __ stp(t2, t3, Address(dend, -16));
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
      __ ldpq(v4, v5, Address(send, -32));
      __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
      __ stpq(v4, v5, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(s, 32));
      __ ldp(t6, t7, Address(s, 48));
      __ ldp(t8, t9, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(d, 32));
      __ stp(t6, t7, Address(d, 48));
      __ stp(t8, t9, Address(dend, -16));
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, 8/granularity);
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    __ ldr(t0, Address(s, 0));
    __ ldr(t1, Address(send, -8));
    __ str(t0, Address(d, 0));
    __ str(t1, Address(dend, -8));
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean in the 1 byte case we load/store the same
          // byte 3 times.
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }
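
    // Illustrative only (not compiled): the small cases above avoid
    // per-size loops by loading a chunk from each end and letting the
    // two stores overlap in the middle, e.g. for 8..16 bytes:
    //
    //   uint64_t lo = *(uint64_t*)s;
    //   uint64_t hi = *(uint64_t*)(s + count*granularity - 8);
    //   *(uint64_t*)d = lo;
    //   *(uint64_t*)(d + count*granularity - 8) = hi;
    //
    // All loads happen before any store, so this is also safe for the
    // conjoint (overlapping) callers.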

    __ bind(copy_big);
    if (is_backwards) {
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now that we've got the small case out of the way, we can align
    // the source address on a 2-word boundary.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
      __ tbz(s, exact_log2(wordSize), aligned);
      __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
      __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
      __ sub(count, count, wordSize/granularity);
    } else {
      if (is_backwards) {
        __ andr(rscratch2, s, 2 * wordSize - 1);
      } else {
        __ neg(rscratch2, s);
        __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
      }
      // rscratch2 is the byte adjustment needed to align s.
      __ cbz(rscratch2, aligned);
      int shift = exact_log2(granularity);
      if (shift)  __ lsr(rscratch2, rscratch2, shift);
      __ sub(count, count, rscratch2);

#if 0
      // ?? This code is only correct for a disjoint copy.  It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, rscratch2);
        __ sub(d, d, rscratch2);
      } else {
        __ add(s, s, rscratch2);
        __ add(d, d, rscratch2);
      }
#else
      copy_memory_small(s, d, rscratch2, rscratch1, step);
#endif
    }

    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes.  Adjust the
    // count and do a bulk copy of words.
    __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
    if (direction == copy_forwards)
      __ bl(copy_f);
    else
      __ bl(copy_b);

    // And the tail.
    copy_memory_small(s, d, count, tmp, step);

    if (granularity >= 8) __ bind(copy8);
    if (granularity >= 4) __ bind(copy4);
    __ bind(finish);
  }


  void clobber_registers() {
#ifdef ASSERT
    __ mov(rscratch1, (uint64_t)0xdeadbeef);
    __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
    for (Register r = r3; r <= r18; r++)
      if (r != rscratch1) __ mov(r, rscratch1);
#endif
  }

  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers rscratch1 and rscratch2.
  void verify_oop_array (size_t size, Register a, Register count, Register temp) {
    Label loop, end;
    __ mov(rscratch1, a);
    __ mov(rscratch2, zr);
    __ bind(loop);
    __ cmp(rscratch2, count);
    __ br(Assembler::HS, end);
    if (size == (size_t)wordSize) {
      __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ verify_oop(temp);
    } else {
      __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ decode_heap_oop(temp); // calls verify_oop
    }
    __ add(rscratch2, rscratch2, size);
    __ b(loop);
    __ bind(end);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
                                 const char *name, bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_reg = RegSet::of(s, d, count);
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    if (is_oop) {
      gen_write_ref_array_pre_barrier(d, count, dest_uninitialized, saved_reg);
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    copy_memory(aligned, s, d, count, rscratch1, size);
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
      gen_write_ref_array_post_barrier(d, count, rscratch1, RegSet());
    }
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
                                 address *entry, const char *name,
                                 bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_regs = RegSet::of(s, d, count);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // use fwd copy when (d-s) above_equal (count*size)
    __ sub(rscratch1, d, s);
    __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
    __ br(Assembler::HS, nooverlap_target);

    if (is_oop) {
      gen_write_ref_array_pre_barrier(d, count, dest_uninitialized, saved_regs);
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    copy_memory(aligned, s, d, count, rscratch1, -size);
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
      gen_write_ref_array_post_barrier(d, count, rscratch1, RegSet());
    }
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }
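
  // Illustrative only (not compiled): the overlap test above exploits
  // unsigned wraparound so that one compare handles both directions:
  //
  //   if ((uintptr_t)(d - s) >= (uintptr_t)count * size)
  //     goto nooverlap_target;   // safe to copy forwards
  //   // else d overlaps the source from above: copy backwards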

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
  //
  address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                      address* entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_short_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_short_copy().
  //
  address generate_disjoint_short_copy(bool aligned,
                                       address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
                                       address *entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
  }
  //
  address generate_disjoint_int_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0 - source array address
  //   c_rarg1 - destination array address
  //   c_rarg2 - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
                                     address *entry, const char *name,
                                     bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
  }


  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0 - source array address
  //   c_rarg1 - destination array address
  //   c_rarg2 - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_long_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_long_copy().
  //
  address generate_disjoint_long_copy(bool aligned, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0 - source array address
  //   c_rarg1 - destination array address
  //   c_rarg2 - element count, treated as size_t, can be zero
  //
  address generate_conjoint_long_copy(bool aligned,
                                      address nooverlap_target, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0 - source array address
  //   c_rarg1 - destination array address
  //   c_rarg2 - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_oop_copy().
  //
  address generate_disjoint_oop_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ?
        sizeof (jint) : sizeof (jlong);
    return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0 - source array address
  //   c_rarg1 - destination array address
  //   c_rarg2 - element count, treated as size_t, can be zero
  //
  address generate_conjoint_oop_copy(bool aligned,
                                     address nooverlap_target, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
                                  name, dest_uninitialized);
  }


  // Helper for generating a dynamic type check.
  // Smashes rscratch1.
  void generate_type_check(Register sub_klass,
                           Register super_check_offset,
                           Register super_klass,
                           Label& L_success) {
    assert_different_registers(sub_klass, super_check_offset, super_klass);

    BLOCK_COMMENT("type_check:");

    Label L_miss;

    __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL,
                                     super_check_offset);
    __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);

    // Fall through on failure!
    __ BIND(L_miss);
  }

  //
  // Generate checkcasting array copy stub
  //
  // Input:
  //   c_rarg0 - source array address
  //   c_rarg1 - destination array address
  //   c_rarg2 - element count, treated as ssize_t, can be zero
  //   c_rarg3 - size_t ckoff (super_check_offset)
  //   c_rarg4 - oop ckval (super_klass)
  //
  // Output:
  //   r0 ==  0  -  success
  //   r0 == -1^K - failure, where K is partial transfer count
  //
  address generate_checkcast_copy(const char *name, address *entry,
                                  bool dest_uninitialized = false) {

    Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;

    // Input registers (after setup_arg_regs)
    const Register from        = c_rarg0;   // source array address
    const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // elements count
    const Register ckoff       = c_rarg3;   // super_check_offset
    const Register ckval       = c_rarg4;   // super_klass

    RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
    RegSet wb_post_saved_regs = RegSet::of(count);

    // Registers used as temps (r18, r19, r20 are save-on-entry)
    const Register count_save  = r21;       // orig elements count
    const Register start_to    = r20;       // destination array start address
    const Register copied_oop  = r18;       // actual oop copied
    const Register r19_klass   = r19;       // oop._klass

    //---------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the two arrays are subtypes of Object[] but the
    // destination array type is not equal to or a supertype
    // of the source type.  Each element must be separately
    // checked.
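    //
    // (Added note) The failure return value encodes the partial
    // transfer count K as -1 ^ K, i.e. the bitwise NOT of K.
    // Illustration only: if 3 of 10 elements are copied before a
    // type check fails, the stub returns ~3 == -4, and the caller
    // recovers K with another bitwise NOT.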

    assert_different_registers(from, to, count, ckoff, ckval, start_to,
                               copied_oop, r19_klass, count_save);

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef ASSERT
    // caller guarantees that the arrays really are different
    // otherwise, we would have to make conjoint checks
    { Label L;
      array_overlap_test(L, TIMES_OOP);
      __ stop("checkcast_copy within a single array");
      __ bind(L);
    }
#endif //ASSERT

    // Caller of this entry point must set up the argument registers.
    if (entry != NULL) {
      *entry = __ pc();
      BLOCK_COMMENT("Entry:");
    }

    // Empty array:  Nothing to do.
    __ cbz(count, L_done);

    __ push(RegSet::of(r18, r19, r20, r21), sp);

#ifdef ASSERT
    BLOCK_COMMENT("assert consistent ckoff/ckval");
    // The ckoff and ckval must be mutually consistent,
    // even though caller generates both.
    { Label L;
      int sco_offset = in_bytes(Klass::super_check_offset_offset());
      __ ldrw(start_to, Address(ckval, sco_offset));
      __ cmpw(ckoff, start_to);
      __ br(Assembler::EQ, L);
      __ stop("super_check_offset inconsistent");
      __ bind(L);
    }
#endif //ASSERT

    gen_write_ref_array_pre_barrier(to, count, dest_uninitialized, wb_pre_saved_regs);

    // save the original count
    __ mov(count_save, count);

    // Copy from low to high addresses
    __ mov(start_to, to);              // Save destination array start address
    __ b(L_load_element);

    // ======== begin loop ========
    // (Loop is rotated; its entry is L_load_element.)
    // Loop control:
    //   for (; count != 0; count--) {
    //     copied_oop = load_heap_oop(from++);
    //     ... generate_type_check ...;
    //     store_heap_oop(to++, copied_oop);
    //   }
    __ align(OptoLoopAlignment);

    __ BIND(L_store_element);
    __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop);  // store the oop
    __ sub(count, count, 1);
    __ cbz(count, L_do_card_marks);

    // ======== loop entry is here ========
    __ BIND(L_load_element);
    __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop
    __ cbz(copied_oop, L_store_element);

    __ load_klass(r19_klass, copied_oop);// query the object klass
    generate_type_check(r19_klass, ckoff, ckval, L_store_element);
    // ======== end loop ========

    // It was a real error; we must depend on the caller to finish the job.
    // Register count = remaining oops, count_save = total oops.
    // Emit GC store barriers for the oops we have copied and report
    // their number to the caller.

    __ subs(count, count_save, count);     // K = partially copied oop count
    __ eon(count, count, zr);              // report (-1^K) to caller
    __ br(Assembler::EQ, L_done_pop);

    __ BIND(L_do_card_marks);
    __ add(to, to, -heapOopSize);          // make an inclusive end pointer
    gen_write_ref_array_post_barrier(start_to, to, rscratch1, wb_post_saved_regs);

    __ bind(L_done_pop);
    __ pop(RegSet::of(r18, r19, r20, r21), sp);
    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);

    __ bind(L_done);
    __ mov(r0, count);
    __ leave();
    __ ret(lr);

    return start;
  }

  // Perform range checks on the proposed arraycopy.
  // Kills temp, but nothing else.
  // Also, clean the sign bits of src_pos and dst_pos.
  void arraycopy_range_checks(Register src,     // source array oop  (c_rarg0)
                              Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
                              Register dst_pos, // destination position (c_rarg3)
                              Register length,
                              Register temp,
                              Label& L_failed) {
    BLOCK_COMMENT("arraycopy_range_checks:");

    assert_different_registers(rscratch1, temp);

    //  if (src_pos + length > arrayOop(src)->length())  FAIL;
    __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, src_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
    __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, dst_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
    __ movw(src_pos, src_pos);
    __ movw(dst_pos, dst_pos);

    BLOCK_COMMENT("arraycopy_range_checks done");
  }

  // These stubs get called from some dumb test routine.
  // I'll write them properly when they're called from
  // something that's actually doing something.
  static void fake_arraycopy_stub(address src, address dst, int count) {
    assert(count == 0, "huh?");
  }


  //
  //  Generate 'unsafe' array copy stub
  //  Though just as safe as the other stubs, it takes an unscaled
  //  size_t argument instead of an element count.
  //
  //  Input:
  //    c_rarg0   - source array address
  //    c_rarg1   - destination array address
  //    c_rarg2   - byte count, treated as ssize_t, can be zero
  //
  //  Examines the alignment of the operands and dispatches
  //  to a long, int, short, or byte copy loop.
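  //
  //  (Added note) s, d and count are OR-ed together below, so a
  //  misaligned low bit in any operand forces the narrower copy;
  //  bits 2..0 of the OR select the long, int, short or byte loop.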
  //
  address generate_unsafe_copy(const char *name,
                               address byte_copy_entry,
                               address short_copy_entry,
                               address int_copy_entry,
                               address long_copy_entry) {
    Label L_long_aligned, L_int_aligned, L_short_aligned;
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // bump this on entry, not on exit:
    inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);

    __ orr(rscratch1, s, d);
    __ orr(rscratch1, rscratch1, count);

    __ andr(rscratch1, rscratch1, BytesPerLong-1);
    __ cbz(rscratch1, L_long_aligned);
    __ andr(rscratch1, rscratch1, BytesPerInt-1);
    __ cbz(rscratch1, L_int_aligned);
    __ tbz(rscratch1, 0, L_short_aligned);
    __ b(RuntimeAddress(byte_copy_entry));

    __ BIND(L_short_aligned);
    __ lsr(count, count, LogBytesPerShort);  // size => short_count
    __ b(RuntimeAddress(short_copy_entry));
    __ BIND(L_int_aligned);
    __ lsr(count, count, LogBytesPerInt);    // size => int_count
    __ b(RuntimeAddress(int_copy_entry));
    __ BIND(L_long_aligned);
    __ lsr(count, count, LogBytesPerLong);   // size => long_count
    __ b(RuntimeAddress(long_copy_entry));

    return start;
  }

  //
  //  Generate generic array copy stubs
  //
  //  Input:
  //    c_rarg0    -  src oop
  //    c_rarg1    -  src_pos (32-bits)
  //    c_rarg2    -  dst oop
  //    c_rarg3    -  dst_pos (32-bits)
  //    c_rarg4    -  element count (32-bits)
  //
  //  Output:
  //    r0 ==  0  -  success
  //    r0 == -1^K - failure, where K is partial transfer count
  //
  address generate_generic_copy(const char *name,
                                address byte_copy_entry, address short_copy_entry,
                                address int_copy_entry, address oop_copy_entry,
                                address long_copy_entry, address checkcast_copy_entry) {

    Label L_failed, L_failed_0, L_objArray;
    Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;

    // Input registers
    const Register src        = c_rarg0;  // source array oop
    const Register src_pos    = c_rarg1;  // source position
    const Register dst        = c_rarg2;  // destination array oop
    const Register dst_pos    = c_rarg3;  // destination position
    const Register length     = c_rarg4;

    StubCodeMark mark(this, "StubRoutines", name);

    __ align(CodeEntryAlignment);
    address start = __ pc();

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // bump this on entry, not on exit:
    inc_counter_np(SharedRuntime::_generic_array_copy_ctr);

    //-----------------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the following conditions are met:
    //
    // (1) src and dst must not be null.
    // (2) src_pos must not be negative.
    // (3) dst_pos must not be negative.
    // (4) length  must not be negative.
    // (5) src klass and dst klass should be the same and not NULL.
    // (6) src and dst should be arrays.
    // (7) src_pos + length must not exceed length of src.
    // (8) dst_pos + length must not exceed length of dst.
    //

    //  if (src == NULL) return -1;
    __ cbz(src, L_failed);

    //  if (src_pos < 0) return -1;
    __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set

    //  if (dst == NULL) return -1;
    __ cbz(dst, L_failed);

    //  if (dst_pos < 0) return -1;
    __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set

    // registers used as temp
    const Register scratch_length    = r16; // elements count to copy
    const Register scratch_src_klass = r17; // array klass
    const Register lh                = r18; // layout helper

    //  if (length < 0) return -1;
    __ movw(scratch_length, length);        // length (elements count, 32-bits value)
    __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set

    __ load_klass(scratch_src_klass, src);
#ifdef ASSERT
    //  assert(src->klass() != NULL);
    {
      BLOCK_COMMENT("assert klasses not null {");
      Label L1, L2;
      __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
      __ bind(L1);
      __ stop("broken null klass");
      __ bind(L2);
      __ load_klass(rscratch1, dst);
      __ cbz(rscratch1, L1);     // this would be broken also
      BLOCK_COMMENT("} assert klasses not null done");
    }
#endif

    // Load layout helper (32-bits)
    //
    //  |array_tag|     | header_size | element_type |     |log2_element_size|
    // 32        30    24            16              8     2                 0
    //
    //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
    //

    const int lh_offset = in_bytes(Klass::layout_helper_offset());

    // Handle objArrays completely differently...
    const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
    __ ldrw(lh, Address(scratch_src_klass, lh_offset));
    __ movw(rscratch1, objArray_lh);
    __ eorw(rscratch2, lh, rscratch1);
    __ cbzw(rscratch2, L_objArray);

    //  if (src->klass() != dst->klass()) return -1;
    __ load_klass(rscratch2, dst);
    __ eor(rscratch2, rscratch2, scratch_src_klass);
    __ cbnz(rscratch2, L_failed);

    //  if (!src->is_Array()) return -1;
    __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)

    // At this point, it is known to be a typeArray (array_tag 0x3).
#ifdef ASSERT
    {
      BLOCK_COMMENT("assert primitive array {");
      Label L;
      __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
      __ cmpw(lh, rscratch2);
      __ br(Assembler::GE, L);
      __ stop("must be a primitive array");
      __ bind(L);
      BLOCK_COMMENT("} assert primitive array done");
    }
#endif

    arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
                           rscratch2, L_failed);

    // TypeArrayKlass
    //
    // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
    // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
    //

    const Register rscratch1_offset = rscratch1;    // array offset
    const Register r18_elsize = lh;                 // element size

    __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
            exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
    __ add(src, src, rscratch1_offset);           // src array offset
    __ add(dst, dst, rscratch1_offset);           // dst array offset
    BLOCK_COMMENT("choose copy loop based on element size");

    // next registers should be set before the jump to corresponding stub
    const Register from     = c_rarg0;  // source array address
    const Register to       = c_rarg1;  // destination array address
    const Register count    = c_rarg2;  // elements count

    // 'from', 'to', 'count' registers should be set in such order
    // since they are the same as 'src', 'src_pos', 'dst'.
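
    // (Added sketch) elsize is log2(element size in bytes), so the
    // tbnz tests below perform a binary search over it:
    //   0b00 -> byte copy,  0b01 -> short copy,
    //   0b10 -> int copy,   0b11 -> long copy.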

    assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");

    // The possible values of elsize are 0-3, i.e. exact_log2(element
    // size in bytes).  We do a simple bitwise binary search.
    __ BIND(L_copy_bytes);
    __ tbnz(r18_elsize, 1, L_copy_ints);
    __ tbnz(r18_elsize, 0, L_copy_shorts);
    __ lea(from, Address(src, src_pos));// src_addr
    __ lea(to,   Address(dst, dst_pos));// dst_addr
    __ movw(count, scratch_length); // length
    __ b(RuntimeAddress(byte_copy_entry));

    __ BIND(L_copy_shorts);
    __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
    __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
    __ movw(count, scratch_length); // length
    __ b(RuntimeAddress(short_copy_entry));

    __ BIND(L_copy_ints);
    __ tbnz(r18_elsize, 0, L_copy_longs);
    __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
    __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
    __ movw(count, scratch_length); // length
    __ b(RuntimeAddress(int_copy_entry));

    __ BIND(L_copy_longs);
#ifdef ASSERT
    {
      BLOCK_COMMENT("assert long copy {");
      Label L;
      __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
      __ cmpw(r18_elsize, LogBytesPerLong);
      __ br(Assembler::EQ, L);
      __ stop("must be long copy, but elsize is wrong");
      __ bind(L);
      BLOCK_COMMENT("} assert long copy done");
    }
#endif
    __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
    __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
    __ movw(count, scratch_length); // length
    __ b(RuntimeAddress(long_copy_entry));

    // ObjArrayKlass
    __ BIND(L_objArray);
    // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]

    Label L_plain_copy, L_checkcast_copy;
    //  test array classes for subtyping
    __ load_klass(r18, dst);
    __ cmp(scratch_src_klass, r18); // usual case is exact equality
    __ br(Assembler::NE, L_checkcast_copy);

    // Identically typed arrays can be copied without element-wise checks.
    arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
                           rscratch2, L_failed);

    __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
    __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
    __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
    __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
    __ movw(count, scratch_length); // length
    __ BIND(L_plain_copy);
    __ b(RuntimeAddress(oop_copy_entry));

    __ BIND(L_checkcast_copy);
    // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
    {
      // Before looking at dst.length, make sure dst is also an objArray.
      __ ldrw(rscratch1, Address(r18, lh_offset));
      __ movw(rscratch2, objArray_lh);
      __ eorw(rscratch1, rscratch1, rscratch2);
      __ cbnzw(rscratch1, L_failed);

      // It is safe to examine both src.length and dst.length.
      arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
                             r18, L_failed);

      const Register rscratch2_dst_klass = rscratch2;
      __ load_klass(rscratch2_dst_klass, dst); // reload

      // Marshal the base address arguments now, freeing registers.
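      // (Added note) The lea/add pairs below compute:
      //   from = src + (src_pos << LogBytesPerHeapOop) + base_offset(T_OBJECT)
      //   to   = dst + (dst_pos << LogBytesPerHeapOop) + base_offset(T_OBJECT)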
      __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
      __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
      __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
      __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
      __ movw(count, length);           // length (reloaded)
      Register sco_temp = c_rarg3;      // this register is free now
      assert_different_registers(from, to, count, sco_temp,
                                 rscratch2_dst_klass, scratch_src_klass);
      // assert_clean_int(count, sco_temp);

      // Generate the type check.
      const int sco_offset = in_bytes(Klass::super_check_offset_offset());
      __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
      // assert_clean_int(sco_temp, r18);
      generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy);

      // Fetch destination element klass from the ObjArrayKlass header.
      int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
      __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset));
      __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));

      // the checkcast_copy loop needs two extra arguments:
      assert(c_rarg3 == sco_temp, "#3 already in place");
      // Set up arguments for checkcast_copy_entry.
      __ mov(c_rarg4, rscratch2_dst_klass);  // dst.klass.element_klass
      __ b(RuntimeAddress(checkcast_copy_entry));
    }

    __ BIND(L_failed);
    __ mov(r0, -1);
    __ leave();   // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }

  //
  // Generate stub for array fill. If "aligned" is true, the
  // "to" address is assumed to be heapword aligned.
  //
  // Arguments for generated stub:
  //   to:    c_rarg0
  //   value: c_rarg1
  //   count: c_rarg2 treated as signed
  //
  address generate_fill(BasicType t, bool aligned, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    BLOCK_COMMENT("Entry:");

    const Register to        = c_rarg0;  // destination array address
    const Register value     = c_rarg1;  // value
    const Register count     = c_rarg2;  // elements count

    const Register bz_base   = r10;      // base for block_zero routine
    const Register cnt_words = r11;      // temp register

    __ enter();

    Label L_fill_elements, L_exit1;

    int shift = -1;
    switch (t) {
      case T_BYTE:
        shift = 0;
        __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
        __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
        __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
        __ br(Assembler::LO, L_fill_elements);
        break;
      case T_SHORT:
        shift = 1;
        __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
        __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
        __ br(Assembler::LO, L_fill_elements);
        break;
      case T_INT:
        shift = 2;
        __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
        __ br(Assembler::LO, L_fill_elements);
        break;
      default: ShouldNotReachHere();
    }

    // Align the destination address on an 8-byte boundary.
    Label L_skip_align1, L_skip_align2, L_skip_align4;
    if (!aligned) {
      switch (t) {
        case T_BYTE:
          // One byte misalignment happens only for byte arrays.
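          // (Added example) If 'to' ends in binary ...101, the strb
          // below leaves ...110, the strh in the next case leaves
          // ...000 (8-byte aligned) and the strw step is skipped;
          // each store also peels the written elements off 'count'.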
          __ tbz(to, 0, L_skip_align1);
          __ strb(value, Address(__ post(to, 1)));
          __ subw(count, count, 1);
          __ bind(L_skip_align1);
          // Fallthrough
        case T_SHORT:
          // Two bytes misalignment happens only for byte and short (char) arrays.
          __ tbz(to, 1, L_skip_align2);
          __ strh(value, Address(__ post(to, 2)));
          __ subw(count, count, 2 >> shift);
          __ bind(L_skip_align2);
          // Fallthrough
        case T_INT:
          // Align to 8 bytes, we know we are 4 byte aligned to start.
          __ tbz(to, 2, L_skip_align4);
          __ strw(value, Address(__ post(to, 4)));
          __ subw(count, count, 4 >> shift);
          __ bind(L_skip_align4);
          break;
        default: ShouldNotReachHere();
      }
    }

    //
    //  Fill large chunks
    //
    __ lsrw(cnt_words, count, 3 - shift); // number of words
    __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
    __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
    if (UseBlockZeroing) {
      Label non_block_zeroing, rest;
      // If the fill value is zero we can use the fast zero_words().
      __ cbnz(value, non_block_zeroing);
      __ mov(bz_base, to);
      __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
      __ zero_words(bz_base, cnt_words);
      __ b(rest);
      __ bind(non_block_zeroing);
      __ fill_words(to, cnt_words, value);
      __ bind(rest);
    } else {
      __ fill_words(to, cnt_words, value);
    }

    // Remaining count is less than 8 bytes. Fill it by a single store.
    // Note that the total length is no less than 8 bytes.
    if (t == T_BYTE || t == T_SHORT) {
      Label L_exit1;
      __ cbzw(count, L_exit1);
      __ add(to, to, count, Assembler::LSL, shift); // points to the end
      __ str(value, Address(to, -8));    // overwrite some elements
      __ bind(L_exit1);
      __ leave();
      __ ret(lr);
    }

    // Handle copies less than 8 bytes.
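    // (Added note) The low bits of 'count' drive the stores below:
    // for a byte fill with count == 7 (0b111), bit 0 stores one
    // byte, bit 1 two bytes and bit 2 four bytes.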
    Label L_fill_2, L_fill_4, L_exit2;
    __ bind(L_fill_elements);
    switch (t) {
      case T_BYTE:
        __ tbz(count, 0, L_fill_2);
        __ strb(value, Address(__ post(to, 1)));
        __ bind(L_fill_2);
        __ tbz(count, 1, L_fill_4);
        __ strh(value, Address(__ post(to, 2)));
        __ bind(L_fill_4);
        __ tbz(count, 2, L_exit2);
        __ strw(value, Address(to));
        break;
      case T_SHORT:
        __ tbz(count, 0, L_fill_4);
        __ strh(value, Address(__ post(to, 2)));
        __ bind(L_fill_4);
        __ tbz(count, 1, L_exit2);
        __ strw(value, Address(to));
        break;
      case T_INT:
        __ cbzw(count, L_exit2);
        __ strw(value, Address(to));
        break;
      default: ShouldNotReachHere();
    }
    __ bind(L_exit2);
    __ leave();
    __ ret(lr);
    return start;
  }

  void generate_arraycopy_stubs() {
    address entry;
    address entry_jbyte_arraycopy;
    address entry_jshort_arraycopy;
    address entry_jint_arraycopy;
    address entry_oop_arraycopy;
    address entry_jlong_arraycopy;
    address entry_checkcast_arraycopy;

    generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
    generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);

    StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();

    //*** jbyte
    // Always need aligned and unaligned versions
    StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
                                                                                  "jbyte_disjoint_arraycopy");
    StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
                                                                                  &entry_jbyte_arraycopy,
                                                                                  "jbyte_arraycopy");
    StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
                                                                                  "arrayof_jbyte_disjoint_arraycopy");
    StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
                                                                                  "arrayof_jbyte_arraycopy");

    //*** jshort
    // Always need aligned and unaligned versions
    StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
                                                                                    "jshort_disjoint_arraycopy");
    StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
                                                                                    &entry_jshort_arraycopy,
                                                                                    "jshort_arraycopy");
    StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
                                                                                    "arrayof_jshort_disjoint_arraycopy");
    StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
                                                                                    "arrayof_jshort_arraycopy");

    //*** jint
    // Aligned versions
    StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
                                                                                "arrayof_jint_disjoint_arraycopy");
    StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
                                                                                "arrayof_jint_arraycopy");
    // In 64-bit we need both aligned and unaligned versions of jint arraycopy.
    // entry_jint_arraycopy always points to the unaligned version
    StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
                                                                                "jint_disjoint_arraycopy");
    StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
                                                                                &entry_jint_arraycopy,
                                                                                "jint_arraycopy");

    //*** jlong
    // It is always aligned
    StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
                                                                                  "arrayof_jlong_disjoint_arraycopy");
    StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
                                                                                  "arrayof_jlong_arraycopy");
    StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
    StubRoutines::_jlong_arraycopy          = StubRoutines::_arrayof_jlong_arraycopy;

    //*** oops
    {
      // With compressed oops we need unaligned versions; notice that
      // we overwrite entry_oop_arraycopy.
      bool aligned = !UseCompressedOops;

      StubRoutines::_arrayof_oop_disjoint_arraycopy
        = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
                                     /*dest_uninitialized*/false);
      StubRoutines::_arrayof_oop_arraycopy
        = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
                                     /*dest_uninitialized*/false);
      // Aligned versions without pre-barriers
      StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
        = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
                                     /*dest_uninitialized*/true);
      StubRoutines::_arrayof_oop_arraycopy_uninit
        = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
                                     /*dest_uninitialized*/true);
    }

    StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
    StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
    StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
    StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;

    StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
    StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
                                                                        /*dest_uninitialized*/true);

    StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy",
                                                           entry_jbyte_arraycopy,
                                                           entry_jshort_arraycopy,
                                                           entry_jint_arraycopy,
                                                           entry_jlong_arraycopy);

    StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy",
                                                             entry_jbyte_arraycopy,
                                                             entry_jshort_arraycopy,
                                                             entry_jint_arraycopy,
                                                             entry_oop_arraycopy,
                                                             entry_jlong_arraycopy,
                                                             entry_checkcast_arraycopy);

    StubRoutines::_jbyte_fill          = generate_fill(T_BYTE, false, "jbyte_fill");
    StubRoutines::_jshort_fill         = generate_fill(T_SHORT, false, "jshort_fill");
    StubRoutines::_jint_fill           = generate_fill(T_INT, false, "jint_fill");
    StubRoutines::_arrayof_jbyte_fill  = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
    StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
    StubRoutines::_arrayof_jint_fill   = generate_fill(T_INT, true, "arrayof_jint_fill");
  }

  void generate_math_stubs() { Unimplemented(); }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0 - source byte array address
  //   c_rarg1 - destination byte array address
  //   c_rarg2 - K (key) in little endian int array
  //
  address generate_aescrypt_encryptBlock() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");

    Label L_doLast;

    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register keylen      = rscratch1;

    address start = __ pc();
    __ enter();

    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ ld1(v0, __ T16B, from); // get 16 bytes of input

    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);
    __ rev32(v3, __ T16B, v3);
    __ rev32(v4, __ T16B, v4);
    __ aese(v0, v1);
    __ aesmc(v0, v0);
    __ aese(v0, v2);
    __ aesmc(v0, v0);
    __ aese(v0, v3);
    __ aesmc(v0, v0);
    __ aese(v0, v4);
    __ aesmc(v0, v0);

    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);
    __ rev32(v3, __ T16B, v3);
    __ rev32(v4, __ T16B, v4);
    __ aese(v0, v1);
    __ aesmc(v0, v0);
    __ aese(v0, v2);
    __ aesmc(v0, v0);
    __ aese(v0, v3);
    __ aesmc(v0, v0);
    __ aese(v0, v4);
    __ aesmc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    __ cmpw(keylen, 44);
    __ br(Assembler::EQ, L_doLast);

    __ aese(v0, v1);
    __ aesmc(v0, v0);
    __ aese(v0, v2);
    __ aesmc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    __ cmpw(keylen, 52);
    __ br(Assembler::EQ, L_doLast);

    __ aese(v0, v1);
    __ aesmc(v0, v0);
    __ aese(v0, v2);
    __ aesmc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    __ BIND(L_doLast);

    __ aese(v0, v1);
    __ aesmc(v0, v0);
    __ aese(v0, v2);

    __ ld1(v1, __ T16B, key);
    __ rev32(v1, __ T16B, v1);
    __ eor(v0, __ T16B, v0, v1);

    __ st1(v0, __ T16B, to);

    __ mov(r0, 0);

    __ leave();
    __ ret(lr);

    return start;
  }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0 - source byte array address
  //   c_rarg1 - destination byte array address
  //   c_rarg2 - K (key) in little endian int array
  //
  address generate_aescrypt_decryptBlock() {
    assert(UseAES, "need AES instructions");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
    Label L_doLast;

    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register keylen      = rscratch1;

    address start = __ pc();
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ ld1(v0, __ T16B, from); // get 16 bytes of input

    __ ld1(v5, __ T16B, __ post(key, 16));
    __ rev32(v5, __ T16B, v5);

    __ ld1(v1, v2, v3, v4,
           __ T16B, __ post(key, 64));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);
    __ rev32(v3, __ T16B, v3);
    __ rev32(v4, __ T16B, v4);
    __ aesd(v0, v1);
    __ aesimc(v0, v0);
    __ aesd(v0, v2);
    __ aesimc(v0, v0);
    __ aesd(v0, v3);
    __ aesimc(v0, v0);
    __ aesd(v0, v4);
    __ aesimc(v0, v0);

    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);
    __ rev32(v3, __ T16B, v3);
    __ rev32(v4, __ T16B, v4);
    __ aesd(v0, v1);
    __ aesimc(v0, v0);
    __ aesd(v0, v2);
    __ aesimc(v0, v0);
    __ aesd(v0, v3);
    __ aesimc(v0, v0);
    __ aesd(v0, v4);
    __ aesimc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    __ cmpw(keylen, 44);
    __ br(Assembler::EQ, L_doLast);

    __ aesd(v0, v1);
    __ aesimc(v0, v0);
    __ aesd(v0, v2);
    __ aesimc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    __ cmpw(keylen, 52);
    __ br(Assembler::EQ, L_doLast);

    __ aesd(v0, v1);
    __ aesimc(v0, v0);
    __ aesd(v0, v2);
    __ aesimc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    __ BIND(L_doLast);

    __ aesd(v0, v1);
    __ aesimc(v0, v0);
    __ aesd(v0, v2);

    __ eor(v0, __ T16B, v0, v5);

    __ st1(v0, __ T16B, to);

    __ mov(r0, 0);

    __ leave();
    __ ret(lr);

    return start;
  }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0 - source byte array address
  //   c_rarg1 - destination byte array address
  //   c_rarg2 - K (key) in little endian int array
  //   c_rarg3 - r vector byte array address
  //   c_rarg4 - input length
  //
  // Output:
  //   r0 - input length
  //
  address generate_cipherBlockChaining_encryptAESCrypt() {
    assert(UseAES, "need AES instructions");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");

    Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;

    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
                                           // and left with the results of the last encryption block
    const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
    const Register keylen      = rscratch1;

    address start = __ pc();

    __ enter();

    __ movw(rscratch2, len_reg);

    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ ld1(v0, __ T16B, rvec);

    __ cmpw(keylen, 52);
    __ br(Assembler::CC, L_loadkeys_44);
    __ br(Assembler::EQ, L_loadkeys_52);

    __ ld1(v17, v18, __ T16B, __ post(key, 32));
    __ rev32(v17, __ T16B, v17);
    __ rev32(v18, __ T16B, v18);
    __ BIND(L_loadkeys_52);
    __ ld1(v19, v20, __ T16B, __ post(key, 32));
    __ rev32(v19, __ T16B, v19);
    __ rev32(v20, __ T16B, v20);
    __ BIND(L_loadkeys_44);
    __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
    __ rev32(v21, __ T16B, v21);
    __ rev32(v22, __ T16B, v22);
    __ rev32(v23, __ T16B, v23);
    __ rev32(v24, __ T16B, v24);
    __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
    __ rev32(v25, __ T16B, v25);
    __ rev32(v26, __ T16B, v26);
    __ rev32(v27, __ T16B, v27);
    __ rev32(v28, __ T16B, v28);
    __ ld1(v29, v30, v31, __ T16B, key);
    __ rev32(v29, __ T16B, v29);
    __ rev32(v30, __ T16B, v30);
    __ rev32(v31, __ T16B, v31);

    __ BIND(L_aes_loop);
    __ ld1(v1, __ T16B, __ post(from, 16));
    __ eor(v0, __ T16B, v0, v1);

    __ br(Assembler::CC, L_rounds_44);
    __ br(Assembler::EQ, L_rounds_52);

    __ aese(v0, v17); __ aesmc(v0, v0);
    __ aese(v0, v18); __ aesmc(v0, v0);
    __ BIND(L_rounds_52);
    __ aese(v0, v19); __ aesmc(v0, v0);
    __ aese(v0, v20); __ aesmc(v0, v0);
    __ BIND(L_rounds_44);
    __ aese(v0, v21); __ aesmc(v0, v0);
    __ aese(v0, v22); __ aesmc(v0, v0);
    __ aese(v0, v23); __ aesmc(v0, v0);
    __ aese(v0, v24); __ aesmc(v0, v0);
    __ aese(v0, v25); __ aesmc(v0, v0);
    __ aese(v0, v26); __ aesmc(v0, v0);
    __ aese(v0, v27); __ aesmc(v0, v0);
    __ aese(v0, v28); __ aesmc(v0, v0);
    __ aese(v0, v29); __ aesmc(v0, v0);
    __ aese(v0, v30);
    __ eor(v0, __ T16B, v0, v31);

    __ st1(v0, __ T16B, __ post(to, 16));

    __ subw(len_reg, len_reg, 16);
    __ cbnzw(len_reg, L_aes_loop);

    __ st1(v0, __ T16B, rvec);

    __ mov(r0, rscratch2);

    __ leave();
    __ ret(lr);

    return start;
  }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0 - source byte array address
  //   c_rarg1 - destination byte array address
  //   c_rarg2 - K (key) in little endian int array
  //   c_rarg3 - r vector byte array address
  //   c_rarg4 - input length
  //
  // Output:
  //   r0 - input length
  //
  address generate_cipherBlockChaining_decryptAESCrypt() {
    assert(UseAES, "need AES instructions");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");

    Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;

    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
                                           // and left with the results of the last encryption block
    const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
    const Register keylen      = rscratch1;

    address start = __ pc();

    __ enter();

    __ movw(rscratch2, len_reg);

    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ ld1(v2, __ T16B, rvec);

    __ ld1(v31, __ T16B, __ post(key, 16));
    __ rev32(v31, __ T16B, v31);

    __ cmpw(keylen, 52);
    __ br(Assembler::CC, L_loadkeys_44);
    __ br(Assembler::EQ, L_loadkeys_52);

    __ ld1(v17, v18, __ T16B, __ post(key, 32));
    __ rev32(v17, __ T16B, v17);
    __ rev32(v18, __ T16B, v18);
    __ BIND(L_loadkeys_52);
    __ ld1(v19, v20, __ T16B, __ post(key, 32));
    __ rev32(v19, __ T16B, v19);
    __ rev32(v20, __ T16B, v20);
    __ BIND(L_loadkeys_44);
    __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
    __ rev32(v21, __ T16B, v21);
    __ rev32(v22, __ T16B, v22);
    __ rev32(v23, __ T16B, v23);
    __ rev32(v24, __ T16B, v24);
    __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
    __ rev32(v25, __ T16B, v25);
    __ rev32(v26, __ T16B, v26);
    __ rev32(v27, __ T16B, v27);
    __ rev32(v28, __ T16B, v28);
    __ ld1(v29, v30, __ T16B, key);
    __ rev32(v29, __ T16B, v29);
    __ rev32(v30, __ T16B, v30);

    __ BIND(L_aes_loop);
    __ ld1(v0, __ T16B, __ post(from, 16));
    __ orr(v1, __ T16B, v0, v0);

    __ br(Assembler::CC, L_rounds_44);
    __ br(Assembler::EQ, L_rounds_52);

    __ aesd(v0, v17); __ aesimc(v0, v0);
    __ aesd(v0, v18); __ aesimc(v0, v0);
    __ BIND(L_rounds_52);
    __ aesd(v0, v19); __ aesimc(v0, v0);
    __ aesd(v0, v20); __ aesimc(v0, v0);
    __ BIND(L_rounds_44);
    __ aesd(v0, v21); __ aesimc(v0, v0);
    __ aesd(v0, v22); __ aesimc(v0, v0);
    __ aesd(v0, v23); __ aesimc(v0, v0);
    __ aesd(v0, v24); __ aesimc(v0, v0);
    __ aesd(v0, v25); __ aesimc(v0, v0);
    __ aesd(v0, v26); __ aesimc(v0, v0);
    __ aesd(v0, v27); __ aesimc(v0, v0);
    __ aesd(v0, v28); __ aesimc(v0, v0);
    __ aesd(v0, v29); __ aesimc(v0, v0);
    __ aesd(v0, v30);
    __ eor(v0, __ T16B, v0, v31);
    __ eor(v0, __ T16B, v0, v2);

    __ st1(v0, __ T16B, __ post(to, 16));
    __ orr(v2, __ T16B, v1, v1);

    __ subw(len_reg, len_reg, 16);
    __ cbnzw(len_reg, L_aes_loop);

    __ st1(v2, __ T16B, rvec);

    __ mov(r0, rscratch2);

    __ leave();
    __ ret(lr);

    return start;
  }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0 - byte[]  source+offset
  //   c_rarg1 - int[]   SHA.state
  //   c_rarg2 - int     offset
  //   c_rarg3 - int     limit
  //
  address generate_sha1_implCompress(bool multi_block, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Register buf   = c_rarg0;
    Register state = c_rarg1;
    Register ofs   = c_rarg2;
    Register limit = c_rarg3;

    Label keys;
    Label sha1_loop;

    // load the keys into v0..v3
    __ adr(rscratch1, keys);
    __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
    // load the 5-word state into v6, v7
    __ ldrq(v6, Address(state, 0));
    __ ldrs(v7, Address(state, 16));


    __ BIND(sha1_loop);
    // load 64 bytes of data into v16..v19
    __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
    __ rev32(v16, __ T16B, v16);
    __ rev32(v17, __ T16B, v17);
    __ rev32(v18, __ T16B, v18);
    __ rev32(v19, __ T16B, v19);

    // do the sha1
    __ addv(v4, __ T4S, v16, v0);
    __ orr(v20, __ T16B, v6, v6);

    FloatRegister d0 = v16;
    FloatRegister d1 = v17;
    FloatRegister d2 = v18;
    FloatRegister d3 = v19;

    for (int round = 0; round < 20; round++) {
      FloatRegister tmp1 = (round & 1) ? v4 : v5;
      FloatRegister tmp2 = (round & 1) ? v21 : v22;
      FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
      FloatRegister tmp4 = (round & 1) ? v5 : v4;
      FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));

      if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
      if (round < 19) __ addv(tmp1, __ T4S, d1, key);
      __ sha1h(tmp2, __ T4S, v20);
      if (round < 5)
        __ sha1c(v20, __ T4S, tmp3, tmp4);
      else if (round < 10 || round >= 15)
        __ sha1p(v20, __ T4S, tmp3, tmp4);
      else
        __ sha1m(v20, __ T4S, tmp3, tmp4);
      if (round < 16) __ sha1su1(d0, __ T4S, d3);

      tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
    }

    __ addv(v7, __ T2S, v7, v21);
    __ addv(v6, __ T4S, v6, v20);

    if (multi_block) {
      __ add(ofs, ofs, 64);
      __ cmp(ofs, limit);
      __ br(Assembler::LE, sha1_loop);
      __ mov(c_rarg0, ofs); // return ofs
    }

    __ strq(v6, Address(state, 0));
    __ strs(v7, Address(state, 16));

    __ ret(lr);

    __ bind(keys);
    __ emit_int32(0x5a827999);
    __ emit_int32(0x6ed9eba1);
    __ emit_int32(0x8f1bbcdc);
    __ emit_int32(0xca62c1d6);

    return start;
  }


  // Arguments:
  //
  // Inputs:
  //   c_rarg0 - byte[]  source+offset
  //   c_rarg1 - int[]   SHA.state
  //   c_rarg2 - int     offset
  //   c_rarg3 - int     limit
  //
  address generate_sha256_implCompress(bool multi_block, const char *name) {
    static const uint32_t round_consts[64] = {
      0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
      0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
      0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
      0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
      0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
      0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
      0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
      0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
      0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
      0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
      0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
      0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
      0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
      0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
      0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
      0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
    };
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Register buf   = c_rarg0;
    Register state = c_rarg1;
    Register ofs   = c_rarg2;
    Register limit = c_rarg3;

    Label sha1_loop;

    __ stpd(v8, v9, __ pre(sp, -32));
    __ stpd(v10, v11, Address(sp, 16));

    // dga == v0
    // dgb == v1
    // dg0 == v2
    // dg1 == v3
    // dg2 == v4
    // t0 == v6
    // t1 == v7

    // load 16 keys to v16..v31
    __ lea(rscratch1, ExternalAddress((address)round_consts));
    __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
    __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
    __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
    __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);

    // load 8 words (256 bits) state
    __ ldpq(v0, v1, state);

    __ BIND(sha1_loop);
    // load 64 bytes of data into v8..v11
    __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
    __ rev32(v8,  __ T16B, v8);
    __ rev32(v9,  __ T16B, v9);
    __ rev32(v10, __ T16B, v10);
    __ rev32(v11, __ T16B, v11);

    __ addv(v6, __ T4S, v8, v16);
    __ orr(v2, __ T16B, v0, v0);
    __ orr(v3, __ T16B, v1, v1);

    FloatRegister d0 = v8;
    FloatRegister d1 = v9;
    FloatRegister d2 = v10;
    FloatRegister d3 = v11;


    for (int round = 0; round < 16; round++) {
      FloatRegister tmp1 = (round & 1) ? v6 : v7;
      FloatRegister tmp2 = (round & 1) ? v7 : v6;
      FloatRegister tmp3 = (round & 1) ? v2 : v4;
      FloatRegister tmp4 = (round & 1) ? v4 : v2;

      if (round < 12) __ sha256su0(d0, __ T4S, d1);
      __ orr(v4, __ T16B, v2, v2);
      if (round < 15)
        __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
      __ sha256h(v2, __ T4S, v3, tmp2);
      __ sha256h2(v3, __ T4S, v4, tmp2);
      if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);

      tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
    }

    __ addv(v0, __ T4S, v0, v2);
    __ addv(v1, __ T4S, v1, v3);

    if (multi_block) {
      __ add(ofs, ofs, 64);
      __ cmp(ofs, limit);
      __ br(Assembler::LE, sha1_loop);
      __ mov(c_rarg0, ofs); // return ofs
    }

    __ ldpd(v10, v11, Address(sp, 16));
    __ ldpd(v8, v9, __ post(sp, 32));

    __ stpq(v0, v1, state);

    __ ret(lr);

    return start;
  }

#ifndef BUILTIN_SIM
  // Safefetch stubs.
  void generate_safefetch(const char* name, int size, address* entry,
                          address* fault_pc, address* continuation_pc) {
    // safefetch signatures:
    //   int      SafeFetch32(int*      adr, int      errValue);
    //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
    //
    // arguments:
    //   c_rarg0 = adr
    //   c_rarg1 = errValue
    //
    // result:
    //   r0 = *adr or errValue

    StubCodeMark mark(this, "StubRoutines", name);

    // Entry point, pc or function descriptor.
    *entry = __ pc();

    // Load *adr into c_rarg1, may fault.
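    // (Added note) If the load faults, the VM's signal handler is
    // expected to resume execution at *continuation_pc; at that
    // point c_rarg1 still holds errValue, which the continuation
    // returns.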
    *fault_pc = __ pc();
    switch (size) {
      case 4:
        // int32_t
        __ ldrw(c_rarg1, Address(c_rarg0, 0));
        break;
      case 8:
        // int64_t
        __ ldr(c_rarg1, Address(c_rarg0, 0));
        break;
      default:
        ShouldNotReachHere();
    }

    // return errValue or *adr
    *continuation_pc = __ pc();
    __ mov(r0, c_rarg1);
    __ ret(lr);
  }
#endif

  /**
   *  Arguments:
   *
   * Inputs:
   *   c_rarg0   - int crc
   *   c_rarg1   - byte* buf
   *   c_rarg2   - int length
   *
   * Output:
   *   r0        - int crc result
   */
  address generate_updateBytesCRC32() {
    assert(UseCRC32Intrinsics, "what are we doing here?");

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");

    address start = __ pc();

    const Register crc    = c_rarg0;  // crc
    const Register buf    = c_rarg1;  // source java byte array address
    const Register len    = c_rarg2;  // length
    const Register table0 = c_rarg3;  // crc_table address
    const Register table1 = c_rarg4;
    const Register table2 = c_rarg5;
    const Register table3 = c_rarg6;
    const Register tmp3   = c_rarg7;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    __ kernel_crc32(crc, buf, len,
                    table0, table1, table2, table3, rscratch1, rscratch2, tmp3);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }

  /**
   *  Arguments:
   *
   * Inputs:
   *   c_rarg0   - int crc
   *   c_rarg1   - byte* buf
   *   c_rarg2   - int length
   *   c_rarg3   - int* table
   *
   * Output:
   *   r0        - int crc result
   */
  address generate_updateBytesCRC32C() {
    assert(UseCRC32CIntrinsics, "what are we doing here?");

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");

    address start = __ pc();

    const Register crc    = c_rarg0;  // crc
    const Register buf    = c_rarg1;  // source java byte array address
    const Register len    = c_rarg2;  // length
    const Register table0 = c_rarg3;  // crc_table address
    const Register table1 = c_rarg4;
    const Register table2 = c_rarg5;
    const Register table3 = c_rarg6;
    const Register tmp3   = c_rarg7;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    __ kernel_crc32c(crc, buf, len,
                     table0, table1, table2, table3, rscratch1, rscratch2, tmp3);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }

  /***
   *  Arguments:
   *
   *  Inputs:
   *   c_rarg0   - int   adler
   *   c_rarg1   - byte* buff
   *   c_rarg2   - int   len
   *
   * Output:
   *   c_rarg0   - int adler result
   */
  address generate_updateBytesAdler32() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
    address start = __ pc();

    Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;

    // Aliases
    Register adler = c_rarg0;
    Register s1    = c_rarg0;
    Register s2    = c_rarg3;
    Register buff  = c_rarg1;
    Register len   = c_rarg2;
    Register nmax  = r4;
    Register base  = r5;
    Register count = r6;
    Register temp0 = rscratch1;
    Register temp1 = rscratch2;
    Register temp2 = r7;
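
    // (Added note) BASE == 65521 and 2^16 mod 65521 == 15, so the
    // reduction sequences further down compute
    //   x = (x & 0xffff) + 15 * (x >> 16)
    // expressed as a shift-left-by-4 minus the original high half
    // (i.e. *15) plus a uxth-extended add.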
3389
3390     // Max number of bytes we can process before having to take the mod
3391     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3392     unsigned long BASE = 0xfff1;
3393     unsigned long NMAX = 0x15B0;
3394
3395     __ mov(base, BASE);
3396     __ mov(nmax, NMAX);
3397
3398     // s1 is initialized to the lower 16 bits of adler
3399     // s2 is initialized to the upper 16 bits of adler
3400     __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
3401     __ uxth(s1, adler);          // s1 = (adler & 0xffff)
3402
3403     // The pipelined loop needs at least 16 bytes per iteration. It checks
3404     // this itself, but it is cheaper to branch straight to the cleanup loop.
3405     __ cmp(len, 16);
3406     __ br(Assembler::HS, L_nmax);
3407     __ cbz(len, L_combine);
3408
3409     __ bind(L_simple_by1_loop);
3410     __ ldrb(temp0, Address(__ post(buff, 1)));
3411     __ add(s1, s1, temp0);
3412     __ add(s2, s2, s1);
3413     __ subs(len, len, 1);
3414     __ br(Assembler::HI, L_simple_by1_loop);
3415
3416     // s1 = s1 % BASE
3417     __ subs(temp0, s1, base);
3418     __ csel(s1, temp0, s1, Assembler::HS);
3419
3420     // s2 = s2 % BASE
3421     __ lsr(temp0, s2, 16);
3422     __ lsl(temp1, temp0, 4);
3423     __ sub(temp1, temp1, temp0);
3424     __ add(s2, temp1, s2, ext::uxth);
3425
3426     __ subs(temp0, s2, base);
3427     __ csel(s2, temp0, s2, Assembler::HS);
3428
3429     __ b(L_combine);
3430
3431     __ bind(L_nmax);
3432     __ subs(len, len, nmax);
3433     __ sub(count, nmax, 16);
3434     __ br(Assembler::LO, L_by16);
3435
3436     __ bind(L_nmax_loop);
3437
3438     __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3439
3440     __ add(s1, s1, temp0, ext::uxtb);
3441     __ ubfx(temp2, temp0, 8, 8);
3442     __ add(s2, s2, s1);
3443     __ add(s1, s1, temp2);
3444     __ ubfx(temp2, temp0, 16, 8);
3445     __ add(s2, s2, s1);
3446     __ add(s1, s1, temp2);
3447     __ ubfx(temp2, temp0, 24, 8);
3448     __ add(s2, s2, s1);
3449     __ add(s1, s1, temp2);
3450     __ ubfx(temp2, temp0, 32, 8);
3451     __ add(s2, s2, s1);
3452     __ add(s1, s1, temp2);
3453     __ ubfx(temp2, temp0, 40, 8);
3454     __ add(s2, s2, s1);
3455     __ add(s1, s1, temp2);
3456     __ ubfx(temp2, temp0, 48, 8);
3457     __ add(s2, s2, s1);
3458     __ add(s1, s1, temp2);
3459     __ add(s2, s2, s1);
3460     __ add(s1, s1, temp0, Assembler::LSR, 56);
3461     __ add(s2, s2, s1);
3462
3463     __ add(s1, s1, temp1, ext::uxtb);
3464     __ ubfx(temp2, temp1, 8, 8);
3465     __ add(s2, s2, s1);
3466     __ add(s1, s1, temp2);
3467     __ ubfx(temp2, temp1, 16, 8);
3468     __ add(s2, s2, s1);
3469     __ add(s1, s1, temp2);
3470     __ ubfx(temp2, temp1, 24, 8);
3471     __ add(s2, s2, s1);
3472     __ add(s1, s1, temp2);
3473     __ ubfx(temp2, temp1, 32, 8);
3474     __ add(s2, s2, s1);
3475     __ add(s1, s1, temp2);
3476     __ ubfx(temp2, temp1, 40, 8);
3477     __ add(s2, s2, s1);
3478     __ add(s1, s1, temp2);
3479     __ ubfx(temp2, temp1, 48, 8);
3480     __ add(s2, s2, s1);
3481     __ add(s1, s1, temp2);
3482     __ add(s2, s2, s1);
3483     __ add(s1, s1, temp1, Assembler::LSR, 56);
3484     __ add(s2, s2, s1);
3485
3486     __ subs(count, count, 16);
3487     __ br(Assembler::HS, L_nmax_loop);
3488
3489     // s1 = s1 % BASE
3490     __ lsr(temp0, s1, 16);
3491     __ lsl(temp1, temp0, 4);
3492     __ sub(temp1, temp1, temp0);
3493     __ add(temp1, temp1, s1, ext::uxth);
3494
3495     __ lsr(temp0, temp1, 16);
3496     __ lsl(s1, temp0, 4);
3497     __ sub(s1, s1, temp0);
3498     __ add(s1, s1, temp1, ext::uxth);
3499
3500     __ subs(temp0, s1, base);
3501     __ csel(s1, temp0, s1, Assembler::HS);
3502
3503     // s2 = s2 % BASE
3504     __ lsr(temp0, s2, 16);
3505     __ lsl(temp1, temp0, 4);
3506     __ sub(temp1, temp1, temp0);
3507     __ add(temp1, temp1, s2, ext::uxth);
3508
3509     __ lsr(temp0, temp1, 16);
3510     __ lsl(s2, temp0, 4);
3511     __ sub(s2, s2, temp0);
3512     __ add(s2, s2, temp1, ext::uxth);
3513
3514     __ subs(temp0, s2, base);
3515     __ csel(s2, temp0, s2, Assembler::HS);
3516
3517     __ subs(len, len, nmax);
3518     __ sub(count, nmax, 16);
3519     __ br(Assembler::HS, L_nmax_loop);
3520
3521     __ bind(L_by16);
3522     __ adds(len, len, count);
3523     __ br(Assembler::LO, L_by1);
3524
3525     __ bind(L_by16_loop);
3526
3527     __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3528
3529     __ add(s1, s1, temp0, ext::uxtb);
3530     __ ubfx(temp2, temp0, 8, 8);
3531     __ add(s2, s2, s1);
3532     __ add(s1, s1, temp2);
3533     __ ubfx(temp2, temp0, 16, 8);
3534     __ add(s2, s2, s1);
3535     __ add(s1, s1, temp2);
3536     __ ubfx(temp2, temp0, 24, 8);
3537     __ add(s2, s2, s1);
3538     __ add(s1, s1, temp2);
3539     __ ubfx(temp2, temp0, 32, 8);
3540     __ add(s2, s2, s1);
3541     __ add(s1, s1, temp2);
3542     __ ubfx(temp2, temp0, 40, 8);
3543     __ add(s2, s2, s1);
3544     __ add(s1, s1, temp2);
3545     __ ubfx(temp2, temp0, 48, 8);
3546     __ add(s2, s2, s1);
3547     __ add(s1, s1, temp2);
3548     __ add(s2, s2, s1);
3549     __ add(s1, s1, temp0, Assembler::LSR, 56);
3550     __ add(s2, s2, s1);
3551
3552     __ add(s1, s1, temp1, ext::uxtb);
3553     __ ubfx(temp2, temp1, 8, 8);
3554     __ add(s2, s2, s1);
3555     __ add(s1, s1, temp2);
3556     __ ubfx(temp2, temp1, 16, 8);
3557     __ add(s2, s2, s1);
3558     __ add(s1, s1, temp2);
3559     __ ubfx(temp2, temp1, 24, 8);
3560     __ add(s2, s2, s1);
3561     __ add(s1, s1, temp2);
3562     __ ubfx(temp2, temp1, 32, 8);
3563     __ add(s2, s2, s1);
3564     __ add(s1, s1, temp2);
3565     __ ubfx(temp2, temp1, 40, 8);
3566     __ add(s2, s2, s1);
3567     __ add(s1, s1, temp2);
3568     __ ubfx(temp2, temp1, 48, 8);
3569     __ add(s2, s2, s1);
3570     __ add(s1, s1, temp2);
3571     __ add(s2, s2, s1);
3572     __ add(s1, s1, temp1, Assembler::LSR, 56);
3573     __ add(s2, s2, s1);
3574
3575     __ subs(len, len, 16);
3576     __ br(Assembler::HS, L_by16_loop);
3577
3578     __ bind(L_by1);
3579     __ adds(len, len, 15);
3580     __ br(Assembler::LO, L_do_mod);
3581
3582     __ bind(L_by1_loop);
3583     __ ldrb(temp0, Address(__ post(buff, 1)));
3584     __ add(s1, temp0, s1);
3585     __ add(s2, s2, s1);
3586     __ subs(len, len, 1);
3587     __ br(Assembler::HS, L_by1_loop);
3588
3589     __ bind(L_do_mod);
3590     // s1 = s1 % BASE
3591     __ lsr(temp0, s1, 16);
3592     __ lsl(temp1, temp0, 4);
3593     __ sub(temp1, temp1, temp0);
3594     __ add(temp1, temp1, s1, ext::uxth);
3595
3596     __ lsr(temp0, temp1, 16);
3597     __ lsl(s1, temp0, 4);
3598     __ sub(s1, s1, temp0);
3599     __ add(s1, s1, temp1, ext::uxth);
3600
3601     __ subs(temp0, s1, base);
3602     __ csel(s1, temp0, s1, Assembler::HS);
3603
3604     // s2 = s2 % BASE
3605     __ lsr(temp0, s2, 16);
3606     __ lsl(temp1, temp0, 4);
3607     __ sub(temp1, temp1, temp0);
3608     __ add(temp1, temp1, s2, ext::uxth);
3609
3610     __ lsr(temp0, temp1, 16);
3611     __ lsl(s2, temp0, 4);
3612     __ sub(s2, s2, temp0);
3613     __ add(s2, s2, temp1, ext::uxth);
3614
3615     __ subs(temp0, s2, base);
3616     __ csel(s2, temp0, s2, Assembler::HS);
3617
3618     // Combine lower bits and higher bits
3619     __ bind(L_combine);
3620     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3621
3622     __ ret(lr);
3623
3624     return start;
3625   }
3626
3627   /**
3628    * Arguments:
3629    *
3630    * Input:
3631    *   c_rarg0   - x address
3632    *   c_rarg1   - x length
3633    *   c_rarg2   - y address
3634    *   c_rarg3   - y length
3635    *   c_rarg4   - z address
3636    *   c_rarg5   - z length
3637    */
3638   address generate_multiplyToLen() {
3639     __ align(CodeEntryAlignment);
3640     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3641
3642     address start = __ pc();
3643     const Register x     = r0;
3644     const Register xlen  = r1;
3645     const Register y     = r2;
3646     const Register ylen  = r3;
3647     const Register z     = r4;
3648     const Register zlen  = r5;
3649
3650     const Register tmp1  = r10;
3651     const Register tmp2  = r11;
3652     const Register tmp3  = r12;
3653     const Register tmp4  = r13;
3654     const Register tmp5  = r14;
3655     const Register tmp6  = r15;
3656     const Register tmp7  = r16;
3657
3658     BLOCK_COMMENT("Entry:");
3659     __ enter(); // required for proper stackwalking of RuntimeStub frame
3660     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3661     __ leave(); // required for proper stackwalking of RuntimeStub frame
3662     __ ret(lr);
3663
3664     return start;
3665   }
3666
3667   address generate_squareToLen() {
3668     // The squareToLen algorithm (for sizes 1..127) described in the Java
3669     // code runs faster than multiply_to_len on some CPUs and slower on
3670     // others, but multiply_to_len shows slightly better results overall.
3671     __ align(CodeEntryAlignment);
3672     StubCodeMark mark(this, "StubRoutines", "squareToLen");
3673     address start = __ pc();
3674
3675     const Register x     = r0;
3676     const Register xlen  = r1;
3677     const Register z     = r2;
3678     const Register zlen  = r3;
3679     const Register y     = r4; // == x
3680     const Register ylen  = r5; // == xlen
3681
3682     const Register tmp1  = r10;
3683     const Register tmp2  = r11;
3684     const Register tmp3  = r12;
3685     const Register tmp4  = r13;
3686     const Register tmp5  = r14;
3687     const Register tmp6  = r15;
3688     const Register tmp7  = r16;
3689
3690     RegSet spilled_regs = RegSet::of(y, ylen);
3691     BLOCK_COMMENT("Entry:");
3692     __ enter();
3693     __ push(spilled_regs, sp);
3694     __ mov(y, x);
3695     __ mov(ylen, xlen);
3696     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3697     __ pop(spilled_regs, sp);
3698     __ leave();
3699     __ ret(lr);
3700     return start;
3701   }
3702
3703   address generate_mulAdd() {
3704     __ align(CodeEntryAlignment);
3705     StubCodeMark mark(this, "StubRoutines", "mulAdd");
3706
3707     address start = __ pc();
3708
3709     const Register out    = r0;
3710     const Register in     = r1;
3711     const Register offset = r2;
3712     const Register len    = r3;
3713     const Register k      = r4;
3714
3715     BLOCK_COMMENT("Entry:");
3716     __ enter();
3717     __ mul_add(out, in, offset, len, k);
3718     __ leave();
3719     __ ret(lr);
3720
3721     return start;
3722   }
3723
3724   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3725                       FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3726                       FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3727     // Karatsuba multiplication performs a 128*128 -> 256-bit
3728     // multiplication in three 128-bit multiplications and a few
3729     // additions.
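    // Writing the inputs in 64-bit halves as A == (A1:A0) and B == (B1:B0),
    // and remembering that in this carry-less (GF(2)[z]) arithmetic "+"
    // means XOR: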
3730     //
3731     // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3732     // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3733     //
3734     // Inputs:
3735     //
3736     // A0 in a.d[0]     (subkey)
3737     // A1 in a.d[1]
3738     // (A1+A0) in a1_xor_a0.d[0]
3739     //
3740     // B0 in b.d[0]     (state)
3741     // B1 in b.d[1]
3742
3743     __ ext(tmp1, __ T16B, b, b, 0x08);
3744     __ pmull2(result_hi, __ T1Q, b, a, __ T2D);      // A1*B1
3745     __ eor(tmp1, __ T16B, tmp1, b);                  // (B1+B0)
3746     __ pmull(result_lo, __ T1Q, b, a, __ T1D);       // A0*B0
3747     __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3748
3749     __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3750     __ eor(tmp3, __ T16B, result_hi, result_lo);     // A1*B1+A0*B0
3751     __ eor(tmp2, __ T16B, tmp2, tmp4);
3752     __ eor(tmp2, __ T16B, tmp2, tmp3);
3753
3754     // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3755     __ ins(result_hi, __ D, tmp2, 0, 1);
3756     __ ins(result_lo, __ D, tmp2, 1, 0);
3757   }
3758
3759   void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3760                     FloatRegister p, FloatRegister z, FloatRegister t1) {
3761     const FloatRegister t0 = result;
3762
3763     // The GCM field polynomial f is z^128 + p(z), where p =
3764     // z^7+z^2+z+1.
3765     //
3766     //    z^128 === -p(z)  (mod (z^128 + p(z)))
3767     //
3768     // so, given that the product we're reducing is
3769     //    a == lo + hi * z^128
3770     // substituting,
3771     //      === lo - hi * p(z)  (mod (z^128 + p(z)))
3772     //
3773     // we reduce by multiplying hi by p(z) and subtracting the result
3774     // from (i.e. XORing it with) lo.  Because p has no nonzero high
3775     // bits we can do this with two 64-bit multiplications, lo*p and
3776     // hi*p.
3777
3778     __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3779     __ ext(t1, __ T16B, t0, z, 8);
3780     __ eor(hi, __ T16B, hi, t1);
3781     __ ext(t1, __ T16B, z, t0, 8);
3782     __ eor(lo, __ T16B, lo, t1);
3783     __ pmull(t0, __ T1Q, hi, p, __ T1D);
3784     __ eor(result, __ T16B, lo, t0);
3785   }
3786
3787   address generate_has_negatives(address &has_negatives_long) {
3788     StubCodeMark mark(this, "StubRoutines", "has_negatives");
3789     const int large_loop_size = 64;
3790     const uint64_t UPPER_BIT_MASK = 0x8080808080808080;
3791     int dcache_line = VM_Version::dcache_line_size();
3792
3793     Register ary1 = r1, len = r2, result = r0;
3794
3795     __ align(CodeEntryAlignment);
3796     address entry = __ pc();
3797
3798     __ enter();
3799
3800     Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
3801           LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
3802
3803     __ cmp(len, 15);
3804     __ br(Assembler::GT, LEN_OVER_15);
3805     // Execution only falls into this code when the pointer may be near the
3806     // end of a memory page, and we have to avoid reading past it into the next page.
3807     __ add(ary1, ary1, len);
3808     __ subs(len, len, 8);
3809     __ br(Assembler::GT, LEN_OVER_8);
3810     __ ldr(rscratch2, Address(ary1, -8));
3811     __ sub(rscratch1, zr, len, __ LSL, 3);  // LSL 3 is to get bits from bytes.
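    // At this point len == original_len - 8 <= 0, so rscratch1 ==
    // (8 - original_len) * 8: the number of bits of the load that came from
    // before the start of the array. A little-endian load puts earlier
    // addresses in lower-order bits, so the shift below discards exactly
    // those out-of-range bytes.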
3812     __ lsrv(rscratch2, rscratch2, rscratch1);
3813     __ tst(rscratch2, UPPER_BIT_MASK);
3814     __ cset(result, Assembler::NE);
3815     __ leave();
3816     __ ret(lr);
3817     __ bind(LEN_OVER_8);
3818     __ ldp(rscratch1, rscratch2, Address(ary1, -16));
3819     __ sub(len, len, 8); // no data dependence, so the sub can execute while the load is in flight
3820     __ tst(rscratch2, UPPER_BIT_MASK);
3821     __ br(Assembler::NE, RET_TRUE_NO_POP);
3822     __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
3823     __ lsrv(rscratch1, rscratch1, rscratch2);
3824     __ tst(rscratch1, UPPER_BIT_MASK);
3825     __ cset(result, Assembler::NE);
3826     __ leave();
3827     __ ret(lr);
3828
3829     Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
3830     const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
3831
3832     has_negatives_long = __ pc(); // 2nd entry point
3833
3834     __ enter();
3835
3836     __ bind(LEN_OVER_15);
3837     __ push(spilled_regs, sp);
3838     __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
3839     __ cbz(rscratch2, ALIGNED);
3840     __ ldp(tmp6, tmp1, Address(ary1));
3841     __ mov(tmp5, 16);
3842     __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address
3843     __ add(ary1, ary1, rscratch1);
3844     __ sub(len, len, rscratch1);
3845     __ orr(tmp6, tmp6, tmp1);
3846     __ tst(tmp6, UPPER_BIT_MASK);
3847     __ br(Assembler::NE, RET_TRUE);
3848
3849     __ bind(ALIGNED);
3850     __ cmp(len, large_loop_size);
3851     __ br(Assembler::LT, CHECK_16);
3852     // Perform one 16-byte load in this pre-loop as an early-return check:
3853     // if an initially aligned large array has negative values in its first
3854     // bytes, LARGE_LOOP would issue 4 loads (in the worst case) instead of
3855     // 1 before noticing, which is slower. Cases with negative bytes further
3856     // ahead are barely affected; in fact they run faster, thanks to the early
3857     // loads and the fewer instructions and branches in LARGE_LOOP.
3858     __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
3859     __ sub(len, len, 16);
3860     __ orr(tmp6, tmp6, tmp1);
3861     __ tst(tmp6, UPPER_BIT_MASK);
3862     __ br(Assembler::NE, RET_TRUE);
3863     __ cmp(len, large_loop_size);
3864     __ br(Assembler::LT, CHECK_16);
3865
3866     if (SoftwarePrefetchHintDistance >= 0
3867         && SoftwarePrefetchHintDistance >= dcache_line) {
3868       // initial prefetch
3869       __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
3870     }
3871     __ bind(LARGE_LOOP);
3872     if (SoftwarePrefetchHintDistance >= 0) {
3873       __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
3874     }
3875     // Issue the load instructions first, since that can save a few CPU/memory
3876     // cycles. Also, instead of 4 "orr(...); tst(...); br(...)" triples (one per
3877     // ldp), it is better to generate 7 orr(...) plus one tst(...) and one
3878     // br(...): fewer instructions and fewer branches. The trade-off is that this
3879     // disables early return, so all 64 bytes are loaded and checked every time.
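    // The eight words loaded below are OR-reduced into tmp2, so a single
    // tst against UPPER_BIT_MASK tests the sign bit of all 64 bytes at once.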
3880     __ ldp(tmp2, tmp3, Address(ary1));
3881     __ ldp(tmp4, tmp5, Address(ary1, 16));
3882     __ ldp(rscratch1, rscratch2, Address(ary1, 32));
3883     __ ldp(tmp6, tmp1, Address(ary1, 48));
3884     __ add(ary1, ary1, large_loop_size);
3885     __ sub(len, len, large_loop_size);
3886     __ orr(tmp2, tmp2, tmp3);
3887     __ orr(tmp4, tmp4, tmp5);
3888     __ orr(rscratch1, rscratch1, rscratch2);
3889     __ orr(tmp6, tmp6, tmp1);
3890     __ orr(tmp2, tmp2, tmp4);
3891     __ orr(rscratch1, rscratch1, tmp6);
3892     __ orr(tmp2, tmp2, rscratch1);
3893     __ tst(tmp2, UPPER_BIT_MASK);
3894     __ br(Assembler::NE, RET_TRUE);
3895     __ cmp(len, large_loop_size);
3896     __ br(Assembler::GE, LARGE_LOOP);
3897
3898     __ bind(CHECK_16); // small 16-byte load pre-loop
3899     __ cmp(len, 16);
3900     __ br(Assembler::LT, POST_LOOP16);
3901
3902     __ bind(LOOP16); // small 16-byte load loop
3903     __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
3904     __ sub(len, len, 16);
3905     __ orr(tmp2, tmp2, tmp3);
3906     __ tst(tmp2, UPPER_BIT_MASK);
3907     __ br(Assembler::NE, RET_TRUE);
3908     __ cmp(len, 16);
3909     __ br(Assembler::GE, LOOP16); // 16-byte load loop end
3910
3911     __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
3912     __ cmp(len, 8);
3913     __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
3914     __ ldr(tmp3, Address(__ post(ary1, 8)));
3915     __ sub(len, len, 8);
3916     __ tst(tmp3, UPPER_BIT_MASK);
3917     __ br(Assembler::NE, RET_TRUE);
3918
3919     __ bind(POST_LOOP16_LOAD_TAIL);
3920     __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
3921     __ ldr(tmp1, Address(ary1));
3922     __ mov(tmp2, 64);
3923     __ sub(tmp4, tmp2, len, __ LSL, 3);
3924     __ lslv(tmp1, tmp1, tmp4);
3925     __ tst(tmp1, UPPER_BIT_MASK);
3926     __ br(Assembler::NE, RET_TRUE);
3927     // Fallthrough
3928
3929     __ bind(RET_FALSE);
3930     __ pop(spilled_regs, sp);
3931     __ leave();
3932     __ mov(result, zr);
3933     __ ret(lr);
3934
3935     __ bind(RET_TRUE);
3936     __ pop(spilled_regs, sp);
3937     __ bind(RET_TRUE_NO_POP);
3938     __ leave();
3939     __ mov(result, 1);
3940     __ ret(lr);
3941
3942     __ bind(DONE);
3943     __ pop(spilled_regs, sp);
3944     __ leave();
3945     __ ret(lr);
3946     return entry;
3947   }
3948   /**
3949    * Arguments:
3950    *
3951    * Input:
3952    *   c_rarg0   - current state address
3953    *   c_rarg1   - H key address
3954    *   c_rarg2   - data address
3955    *   c_rarg3   - number of blocks
3956    *
3957    * Output:
3958    *   Updated state at c_rarg0
3959    */
3960   address generate_ghash_processBlocks() {
3961     // Bafflingly, GCM uses little-endian for the byte order, but
3962     // big-endian for the bit order.  For example, the polynomial 1 is
3963     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
3964     //
3965     // So, we must either reverse the bytes in each word and do
3966     // everything big-endian or reverse the bits in each byte and do
3967     // it little-endian.  On AArch64 it's more idiomatic to reverse
3968     // the bits in each byte (we have an instruction, RBIT, to do
3969     // that) and keep the data in little-endian bit order throughout the
3970     // calculation, bit-reversing the inputs and outputs.
3971
3972     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
3973     __ align(wordSize * 2);
3974     address p = __ pc();
3975     __ emit_int64(0x87);  // The low-order bits of the field
3976                           // polynomial (i.e. p = z^7+z^2+z+1)
3977                           // repeated in the low and high parts of a
3978                           // 128-bit vector
3979     __ emit_int64(0x87);
3980
3981     __ align(CodeEntryAlignment);
3982     address start = __ pc();
3983
3984     Register state   = c_rarg0;
3985     Register subkeyH = c_rarg1;
3986     Register data    = c_rarg2;
3987     Register blocks  = c_rarg3;
3988
3989     FloatRegister vzr = v30;
3990     __ eor(vzr, __ T16B, vzr, vzr); // zero register
3991
3992     __ ldrq(v0, Address(state));
3993     __ ldrq(v1, Address(subkeyH));
3994
3995     __ rev64(v0, __ T16B, v0);  // Bit-reverse words in state and subkeyH
3996     __ rbit(v0, __ T16B, v0);
3997     __ rev64(v1, __ T16B, v1);
3998     __ rbit(v1, __ T16B, v1);
3999
4000     __ ldrq(v26, p);
4001
4002     __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
4003     __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
4004
4005     {
4006       Label L_ghash_loop;
4007       __ bind(L_ghash_loop);
4008
4009       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
4010                                                  // reversing each byte
4011       __ rbit(v2, __ T16B, v2);
4012       __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state
4013
4014       // Multiply state in v2 by subkey in v1
4015       ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
4016                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
4017                      /*temps*/v6, v20, v18, v21);
4018       // Reduce v7:v5 by the field polynomial
4019       ghash_reduce(v0, v5, v7, v26, vzr, v20);
4020
4021       __ sub(blocks, blocks, 1);
4022       __ cbnz(blocks, L_ghash_loop);
4023     }
4024
4025     // The bit-reversed result is at this point in v0
4026     __ rev64(v1, __ T16B, v0);
4027     __ rbit(v1, __ T16B, v1);
4028
4029     __ st1(v1, __ T16B, state);
4030     __ ret(lr);
4031
4032     return start;
4033   }
4034
4035   // Continuation point for throwing of implicit exceptions that are
4036   // not handled in the current activation. Fabricates an exception
4037   // oop and initiates normal exception dispatching in this
4038   // frame. Since we need to preserve callee-saved values (currently
4039   // only for C2, but done for C1 as well) we need a callee-saved oop
4040   // map and therefore have to make these stubs into RuntimeStubs
4041   // rather than BufferBlobs.  If the compiler needs all registers to
4042   // be preserved between the fault point and the exception handler
4043   // then it must assume responsibility for that in
4044   // AbstractCompiler::continuation_for_implicit_null_exception or
4045   // continuation_for_implicit_division_by_zero_exception.  All other
4046   // implicit exceptions (e.g., NullPointerException or
4047   // AbstractMethodError on entry) are either at call sites or
4048   // otherwise assume that stack unwinding will be initiated, so
4049   // caller-saved registers were assumed volatile in the compiler.
4050
4051 #undef __
4052 #define __ masm->
4053
4054   address generate_throw_exception(const char* name,
4055                                    address runtime_entry,
4056                                    Register arg1 = noreg,
4057                                    Register arg2 = noreg) {
4058     // Information about frame layout at time of blocking runtime call.
4059     // Note that we only have to preserve callee-saved registers since
4060     // the compilers are responsible for supplying a continuation point
4061     // if they expect all registers to be preserved.
4062     // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
4063     enum layout {
4064       rfp_off = 0,
4065       rfp_off2,
4066       return_off,
4067       return_off2,
4068       framesize // inclusive of return address
4069     };
4070
4071     int insts_size = 512;
4072     int locs_size  = 64;
4073
4074     CodeBuffer code(name, insts_size, locs_size);
4075     OopMapSet* oop_maps  = new OopMapSet();
4076     MacroAssembler* masm = new MacroAssembler(&code);
4077
4078     address start = __ pc();
4079
4080     // This is an inlined and slightly modified version of call_VM
4081     // which has the ability to fetch the return PC out of
4082     // thread-local storage and also sets up last_Java_sp slightly
4083     // differently from the real call_VM
4084
4085     __ enter(); // Save FP and LR before call
4086
4087     assert(is_even(framesize/2), "sp not 16-byte aligned");
4088
4089     // lr and fp are already in place
4090     __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
4091
4092     int frame_complete = __ pc() - start;
4093
4094     // Set up last_Java_sp and last_Java_fp
4095     address the_pc = __ pc();
4096     __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1);
4097
4098     // Call runtime
4099     if (arg1 != noreg) {
4100       assert(arg2 != c_rarg1, "clobbered");
4101       __ mov(c_rarg1, arg1);
4102     }
4103     if (arg2 != noreg) {
4104       __ mov(c_rarg2, arg2);
4105     }
4106     __ mov(c_rarg0, rthread);
4107     BLOCK_COMMENT("call runtime_entry");
4108     __ mov(rscratch1, runtime_entry);
4109     __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);
4110
4111     // Generate oop map
4112     OopMap* map = new OopMap(framesize, 0);
4113
4114     oop_maps->add_gc_map(the_pc - start, map);
4115
4116     __ reset_last_Java_frame(true);
4117     __ maybe_isb();
4118
4119     __ leave();
4120
4121     // check for pending exceptions
4122 #ifdef ASSERT
4123     Label L;
4124     __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
4125     __ cbnz(rscratch1, L);
4126     __ should_not_reach_here();
4127     __ bind(L);
4128 #endif // ASSERT
4129     __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
4130
4131
4132     // codeBlob framesize is in words (not VMRegImpl::slot_size)
4133     RuntimeStub* stub =
4134       RuntimeStub::new_runtime_stub(name,
4135                                     &code,
4136                                     frame_complete,
4137                                     (framesize >> (LogBytesPerWord - LogBytesPerInt)),
4138                                     oop_maps, false);
4139     return stub->entry_point();
4140   }
4141
4142   class MontgomeryMultiplyGenerator : public MacroAssembler {
4143
4144     Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
4145       Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
4146
4147     RegSet _toSave;
4148     bool _squaring;
4149
4150   public:
4151     MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
4152       : MacroAssembler(as->code()), _squaring(squaring) {
4153
4154       // Register allocation
4155
4156       Register reg = c_rarg0;
4157       Pa_base = reg;       // Argument registers
4158       if (squaring)
4159         Pb_base = Pa_base;
4160       else
4161         Pb_base = ++reg;
4162       Pn_base = ++reg;
4163       Rlen= ++reg;
4164       inv = ++reg;
4165       Pm_base = ++reg;
4166
4167       // Working registers:
4168       Ra = ++reg;   // The current digit of a, b, n, and m.
4169       Rb = ++reg;
4170       Rm = ++reg;
4171       Rn = ++reg;
4172
4173       Pa = ++reg;   // Pointers to the current/next digit of a, b, n, and m.
4174       Pb = ++reg;
4175       Pm = ++reg;
4176       Pn = ++reg;
4177
4178       t0 = ++reg;   // Three registers which form a
4179       t1 = ++reg;   // triple-precision accumulator.
4180       t2 = ++reg;
4181
4182       Ri = ++reg;   // Inner and outer loop indexes.
4183 Rj = ++reg; 4184 4185 Rhi_ab = ++reg; // Product registers: low and high parts 4186 Rlo_ab = ++reg; // of a*b and m*n. 4187 Rhi_mn = ++reg; 4188 Rlo_mn = ++reg; 4189 4190 // r19 and up are callee-saved. 4191 _toSave = RegSet::range(r19, reg) + Pm_base; 4192 } 4193 4194 private: 4195 void save_regs() { 4196 push(_toSave, sp); 4197 } 4198 4199 void restore_regs() { 4200 pop(_toSave, sp); 4201 } 4202 4203 template <typename T> 4204 void unroll_2(Register count, T block) { 4205 Label loop, end, odd; 4206 tbnz(count, 0, odd); 4207 cbz(count, end); 4208 align(16); 4209 bind(loop); 4210 (this->*block)(); 4211 bind(odd); 4212 (this->*block)(); 4213 subs(count, count, 2); 4214 br(Assembler::GT, loop); 4215 bind(end); 4216 } 4217 4218 template <typename T> 4219 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 4220 Label loop, end, odd; 4221 tbnz(count, 0, odd); 4222 cbz(count, end); 4223 align(16); 4224 bind(loop); 4225 (this->*block)(d, s, tmp); 4226 bind(odd); 4227 (this->*block)(d, s, tmp); 4228 subs(count, count, 2); 4229 br(Assembler::GT, loop); 4230 bind(end); 4231 } 4232 4233 void pre1(RegisterOrConstant i) { 4234 block_comment("pre1"); 4235 // Pa = Pa_base; 4236 // Pb = Pb_base + i; 4237 // Pm = Pm_base; 4238 // Pn = Pn_base + i; 4239 // Ra = *Pa; 4240 // Rb = *Pb; 4241 // Rm = *Pm; 4242 // Rn = *Pn; 4243 ldr(Ra, Address(Pa_base)); 4244 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 4245 ldr(Rm, Address(Pm_base)); 4246 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4247 lea(Pa, Address(Pa_base)); 4248 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 4249 lea(Pm, Address(Pm_base)); 4250 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4251 4252 // Zero the m*n result. 4253 mov(Rhi_mn, zr); 4254 mov(Rlo_mn, zr); 4255 } 4256 4257 // The core multiply-accumulate step of a Montgomery 4258 // multiplication. The idea is to schedule operations as a 4259 // pipeline so that instructions with long latencies (loads and 4260 // multiplies) have time to complete before their results are 4261 // used. This most benefits in-order implementations of the 4262 // architecture but out-of-order ones also benefit. 4263 void step() { 4264 block_comment("step"); 4265 // MACC(Ra, Rb, t0, t1, t2); 4266 // Ra = *++Pa; 4267 // Rb = *--Pb; 4268 umulh(Rhi_ab, Ra, Rb); 4269 mul(Rlo_ab, Ra, Rb); 4270 ldr(Ra, pre(Pa, wordSize)); 4271 ldr(Rb, pre(Pb, -wordSize)); 4272 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 4273 // previous iteration. 
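      // Now start this iteration's m*n product; it is accumulated by the
      // next step() (or by post1()/post2()), giving the multiply time to
      // complete before its result is needed.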
4274 // MACC(Rm, Rn, t0, t1, t2); 4275 // Rm = *++Pm; 4276 // Rn = *--Pn; 4277 umulh(Rhi_mn, Rm, Rn); 4278 mul(Rlo_mn, Rm, Rn); 4279 ldr(Rm, pre(Pm, wordSize)); 4280 ldr(Rn, pre(Pn, -wordSize)); 4281 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4282 } 4283 4284 void post1() { 4285 block_comment("post1"); 4286 4287 // MACC(Ra, Rb, t0, t1, t2); 4288 // Ra = *++Pa; 4289 // Rb = *--Pb; 4290 umulh(Rhi_ab, Ra, Rb); 4291 mul(Rlo_ab, Ra, Rb); 4292 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 4293 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4294 4295 // *Pm = Rm = t0 * inv; 4296 mul(Rm, t0, inv); 4297 str(Rm, Address(Pm)); 4298 4299 // MACC(Rm, Rn, t0, t1, t2); 4300 // t0 = t1; t1 = t2; t2 = 0; 4301 umulh(Rhi_mn, Rm, Rn); 4302 4303 #ifndef PRODUCT 4304 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 4305 { 4306 mul(Rlo_mn, Rm, Rn); 4307 add(Rlo_mn, t0, Rlo_mn); 4308 Label ok; 4309 cbz(Rlo_mn, ok); { 4310 stop("broken Montgomery multiply"); 4311 } bind(ok); 4312 } 4313 #endif 4314 // We have very carefully set things up so that 4315 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 4316 // the lower half of Rm * Rn because we know the result already: 4317 // it must be -t0. t0 + (-t0) must generate a carry iff 4318 // t0 != 0. So, rather than do a mul and an adds we just set 4319 // the carry flag iff t0 is nonzero. 4320 // 4321 // mul(Rlo_mn, Rm, Rn); 4322 // adds(zr, t0, Rlo_mn); 4323 subs(zr, t0, 1); // Set carry iff t0 is nonzero 4324 adcs(t0, t1, Rhi_mn); 4325 adc(t1, t2, zr); 4326 mov(t2, zr); 4327 } 4328 4329 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 4330 block_comment("pre2"); 4331 // Pa = Pa_base + i-len; 4332 // Pb = Pb_base + len; 4333 // Pm = Pm_base + i-len; 4334 // Pn = Pn_base + len; 4335 4336 if (i.is_register()) { 4337 sub(Rj, i.as_register(), len); 4338 } else { 4339 mov(Rj, i.as_constant()); 4340 sub(Rj, Rj, len); 4341 } 4342 // Rj == i-len 4343 4344 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 4345 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 4346 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 4347 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 4348 4349 // Ra = *++Pa; 4350 // Rb = *--Pb; 4351 // Rm = *++Pm; 4352 // Rn = *--Pn; 4353 ldr(Ra, pre(Pa, wordSize)); 4354 ldr(Rb, pre(Pb, -wordSize)); 4355 ldr(Rm, pre(Pm, wordSize)); 4356 ldr(Rn, pre(Pn, -wordSize)); 4357 4358 mov(Rhi_mn, zr); 4359 mov(Rlo_mn, zr); 4360 } 4361 4362 void post2(RegisterOrConstant i, RegisterOrConstant len) { 4363 block_comment("post2"); 4364 if (i.is_constant()) { 4365 mov(Rj, i.as_constant()-len.as_constant()); 4366 } else { 4367 sub(Rj, i.as_register(), len); 4368 } 4369 4370 adds(t0, t0, Rlo_mn); // The pending m*n, low part 4371 4372 // As soon as we know the least significant digit of our result, 4373 // store it. 4374 // Pm_base[i-len] = t0; 4375 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 4376 4377 // t0 = t1; t1 = t2; t2 = 0; 4378 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 4379 adc(t1, t2, zr); 4380 mov(t2, zr); 4381 } 4382 4383 // A carry in t0 after Montgomery multiplication means that we 4384 // should subtract multiples of n from our result in m. We'll 4385 // keep doing that until there is no carry. 
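    // (In the C sketch below, sub(Pm_base, Pn_base, t0, len) stands for a
    // multi-word subtraction m -= n that returns the borrow-adjusted carry;
    // the sbcs loop in normalize() implements it inline.)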
4386 void normalize(RegisterOrConstant len) { 4387 block_comment("normalize"); 4388 // while (t0) 4389 // t0 = sub(Pm_base, Pn_base, t0, len); 4390 Label loop, post, again; 4391 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 4392 cbz(t0, post); { 4393 bind(again); { 4394 mov(i, zr); 4395 mov(cnt, len); 4396 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 4397 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4398 subs(zr, zr, zr); // set carry flag, i.e. no borrow 4399 align(16); 4400 bind(loop); { 4401 sbcs(Rm, Rm, Rn); 4402 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 4403 add(i, i, 1); 4404 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 4405 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4406 sub(cnt, cnt, 1); 4407 } cbnz(cnt, loop); 4408 sbc(t0, t0, zr); 4409 } cbnz(t0, again); 4410 } bind(post); 4411 } 4412 4413 // Move memory at s to d, reversing words. 4414 // Increments d to end of copied memory 4415 // Destroys tmp1, tmp2 4416 // Preserves len 4417 // Leaves s pointing to the address which was in d at start 4418 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 4419 assert(tmp1 < r19 && tmp2 < r19, "register corruption"); 4420 4421 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 4422 mov(tmp1, len); 4423 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 4424 sub(s, d, len, ext::uxtw, LogBytesPerWord); 4425 } 4426 // where 4427 void reverse1(Register d, Register s, Register tmp) { 4428 ldr(tmp, pre(s, -wordSize)); 4429 ror(tmp, tmp, 32); 4430 str(tmp, post(d, wordSize)); 4431 } 4432 4433 void step_squaring() { 4434 // An extra ACC 4435 step(); 4436 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4437 } 4438 4439 void last_squaring(RegisterOrConstant i) { 4440 Label dont; 4441 // if ((i & 1) == 0) { 4442 tbnz(i.as_register(), 0, dont); { 4443 // MACC(Ra, Rb, t0, t1, t2); 4444 // Ra = *++Pa; 4445 // Rb = *--Pb; 4446 umulh(Rhi_ab, Ra, Rb); 4447 mul(Rlo_ab, Ra, Rb); 4448 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4449 } bind(dont); 4450 } 4451 4452 void extra_step_squaring() { 4453 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 4454 4455 // MACC(Rm, Rn, t0, t1, t2); 4456 // Rm = *++Pm; 4457 // Rn = *--Pn; 4458 umulh(Rhi_mn, Rm, Rn); 4459 mul(Rlo_mn, Rm, Rn); 4460 ldr(Rm, pre(Pm, wordSize)); 4461 ldr(Rn, pre(Pn, -wordSize)); 4462 } 4463 4464 void post1_squaring() { 4465 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 4466 4467 // *Pm = Rm = t0 * inv; 4468 mul(Rm, t0, inv); 4469 str(Rm, Address(Pm)); 4470 4471 // MACC(Rm, Rn, t0, t1, t2); 4472 // t0 = t1; t1 = t2; t2 = 0; 4473 umulh(Rhi_mn, Rm, Rn); 4474 4475 #ifndef PRODUCT 4476 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 4477 { 4478 mul(Rlo_mn, Rm, Rn); 4479 add(Rlo_mn, t0, Rlo_mn); 4480 Label ok; 4481 cbz(Rlo_mn, ok); { 4482 stop("broken Montgomery multiply"); 4483 } bind(ok); 4484 } 4485 #endif 4486 // We have very carefully set things up so that 4487 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 4488 // the lower half of Rm * Rn because we know the result already: 4489 // it must be -t0. t0 + (-t0) must generate a carry iff 4490 // t0 != 0. So, rather than do a mul and an adds we just set 4491 // the carry flag iff t0 is nonzero. 
4492 // 4493 // mul(Rlo_mn, Rm, Rn); 4494 // adds(zr, t0, Rlo_mn); 4495 subs(zr, t0, 1); // Set carry iff t0 is nonzero 4496 adcs(t0, t1, Rhi_mn); 4497 adc(t1, t2, zr); 4498 mov(t2, zr); 4499 } 4500 4501 void acc(Register Rhi, Register Rlo, 4502 Register t0, Register t1, Register t2) { 4503 adds(t0, t0, Rlo); 4504 adcs(t1, t1, Rhi); 4505 adc(t2, t2, zr); 4506 } 4507 4508 public: 4509 /** 4510 * Fast Montgomery multiplication. The derivation of the 4511 * algorithm is in A Cryptographic Library for the Motorola 4512 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 4513 * 4514 * Arguments: 4515 * 4516 * Inputs for multiplication: 4517 * c_rarg0 - int array elements a 4518 * c_rarg1 - int array elements b 4519 * c_rarg2 - int array elements n (the modulus) 4520 * c_rarg3 - int length 4521 * c_rarg4 - int inv 4522 * c_rarg5 - int array elements m (the result) 4523 * 4524 * Inputs for squaring: 4525 * c_rarg0 - int array elements a 4526 * c_rarg1 - int array elements n (the modulus) 4527 * c_rarg2 - int length 4528 * c_rarg3 - int inv 4529 * c_rarg4 - int array elements m (the result) 4530 * 4531 */ 4532 address generate_multiply() { 4533 Label argh, nothing; 4534 bind(argh); 4535 stop("MontgomeryMultiply total_allocation must be <= 8192"); 4536 4537 align(CodeEntryAlignment); 4538 address entry = pc(); 4539 4540 cbzw(Rlen, nothing); 4541 4542 enter(); 4543 4544 // Make room. 4545 cmpw(Rlen, 512); 4546 br(Assembler::HI, argh); 4547 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 4548 andr(sp, Ra, -2 * wordSize); 4549 4550 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 4551 4552 { 4553 // Copy input args, reversing as we go. We use Ra as a 4554 // temporary variable. 4555 reverse(Ra, Pa_base, Rlen, t0, t1); 4556 if (!_squaring) 4557 reverse(Ra, Pb_base, Rlen, t0, t1); 4558 reverse(Ra, Pn_base, Rlen, t0, t1); 4559 } 4560 4561 // Push all call-saved registers and also Pm_base which we'll need 4562 // at the end. 
4563 save_regs(); 4564 4565 #ifndef PRODUCT 4566 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 4567 { 4568 ldr(Rn, Address(Pn_base, 0)); 4569 mul(Rlo_mn, Rn, inv); 4570 cmp(Rlo_mn, -1); 4571 Label ok; 4572 br(EQ, ok); { 4573 stop("broken inverse in Montgomery multiply"); 4574 } bind(ok); 4575 } 4576 #endif 4577 4578 mov(Pm_base, Ra); 4579 4580 mov(t0, zr); 4581 mov(t1, zr); 4582 mov(t2, zr); 4583 4584 block_comment("for (int i = 0; i < len; i++) {"); 4585 mov(Ri, zr); { 4586 Label loop, end; 4587 cmpw(Ri, Rlen); 4588 br(Assembler::GE, end); 4589 4590 bind(loop); 4591 pre1(Ri); 4592 4593 block_comment(" for (j = i; j; j--) {"); { 4594 movw(Rj, Ri); 4595 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 4596 } block_comment(" } // j"); 4597 4598 post1(); 4599 addw(Ri, Ri, 1); 4600 cmpw(Ri, Rlen); 4601 br(Assembler::LT, loop); 4602 bind(end); 4603 block_comment("} // i"); 4604 } 4605 4606 block_comment("for (int i = len; i < 2*len; i++) {"); 4607 mov(Ri, Rlen); { 4608 Label loop, end; 4609 cmpw(Ri, Rlen, Assembler::LSL, 1); 4610 br(Assembler::GE, end); 4611 4612 bind(loop); 4613 pre2(Ri, Rlen); 4614 4615 block_comment(" for (j = len*2-i-1; j; j--) {"); { 4616 lslw(Rj, Rlen, 1); 4617 subw(Rj, Rj, Ri); 4618 subw(Rj, Rj, 1); 4619 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 4620 } block_comment(" } // j"); 4621 4622 post2(Ri, Rlen); 4623 addw(Ri, Ri, 1); 4624 cmpw(Ri, Rlen, Assembler::LSL, 1); 4625 br(Assembler::LT, loop); 4626 bind(end); 4627 } 4628 block_comment("} // i"); 4629 4630 normalize(Rlen); 4631 4632 mov(Ra, Pm_base); // Save Pm_base in Ra 4633 restore_regs(); // Restore caller's Pm_base 4634 4635 // Copy our result into caller's Pm_base 4636 reverse(Pm_base, Ra, Rlen, t0, t1); 4637 4638 leave(); 4639 bind(nothing); 4640 ret(lr); 4641 4642 return entry; 4643 } 4644 // In C, approximately: 4645 4646 // void 4647 // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[], 4648 // unsigned long Pn_base[], unsigned long Pm_base[], 4649 // unsigned long inv, int len) { 4650 // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 4651 // unsigned long *Pa, *Pb, *Pn, *Pm; 4652 // unsigned long Ra, Rb, Rn, Rm; 4653 4654 // int i; 4655 4656 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 4657 4658 // for (i = 0; i < len; i++) { 4659 // int j; 4660 4661 // Pa = Pa_base; 4662 // Pb = Pb_base + i; 4663 // Pm = Pm_base; 4664 // Pn = Pn_base + i; 4665 4666 // Ra = *Pa; 4667 // Rb = *Pb; 4668 // Rm = *Pm; 4669 // Rn = *Pn; 4670 4671 // int iters = i; 4672 // for (j = 0; iters--; j++) { 4673 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 4674 // MACC(Ra, Rb, t0, t1, t2); 4675 // Ra = *++Pa; 4676 // Rb = *--Pb; 4677 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 4678 // MACC(Rm, Rn, t0, t1, t2); 4679 // Rm = *++Pm; 4680 // Rn = *--Pn; 4681 // } 4682 4683 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 4684 // MACC(Ra, Rb, t0, t1, t2); 4685 // *Pm = Rm = t0 * inv; 4686 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 4687 // MACC(Rm, Rn, t0, t1, t2); 4688 4689 // assert(t0 == 0, "broken Montgomery multiply"); 4690 4691 // t0 = t1; t1 = t2; t2 = 0; 4692 // } 4693 4694 // for (i = len; i < 2*len; i++) { 4695 // int j; 4696 4697 // Pa = Pa_base + i-len; 4698 // Pb = Pb_base + len; 4699 // Pm = Pm_base + i-len; 4700 // Pn = Pn_base + len; 4701 4702 // Ra = *++Pa; 4703 // Rb = *--Pb; 4704 // Rm = *++Pm; 4705 // Rn = *--Pn; 4706 4707 // int iters = len*2-i-1; 4708 // 
for (j = i-len+1; iters--; j++) { 4709 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 4710 // MACC(Ra, Rb, t0, t1, t2); 4711 // Ra = *++Pa; 4712 // Rb = *--Pb; 4713 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 4714 // MACC(Rm, Rn, t0, t1, t2); 4715 // Rm = *++Pm; 4716 // Rn = *--Pn; 4717 // } 4718 4719 // Pm_base[i-len] = t0; 4720 // t0 = t1; t1 = t2; t2 = 0; 4721 // } 4722 4723 // while (t0) 4724 // t0 = sub(Pm_base, Pn_base, t0, len); 4725 // } 4726 4727 /** 4728 * Fast Montgomery squaring. This uses asymptotically 25% fewer 4729 * multiplies than Montgomery multiplication so it should be up to 4730 * 25% faster. However, its loop control is more complex and it 4731 * may actually run slower on some machines. 4732 * 4733 * Arguments: 4734 * 4735 * Inputs: 4736 * c_rarg0 - int array elements a 4737 * c_rarg1 - int array elements n (the modulus) 4738 * c_rarg2 - int length 4739 * c_rarg3 - int inv 4740 * c_rarg4 - int array elements m (the result) 4741 * 4742 */ 4743 address generate_square() { 4744 Label argh; 4745 bind(argh); 4746 stop("MontgomeryMultiply total_allocation must be <= 8192"); 4747 4748 align(CodeEntryAlignment); 4749 address entry = pc(); 4750 4751 enter(); 4752 4753 // Make room. 4754 cmpw(Rlen, 512); 4755 br(Assembler::HI, argh); 4756 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 4757 andr(sp, Ra, -2 * wordSize); 4758 4759 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 4760 4761 { 4762 // Copy input args, reversing as we go. We use Ra as a 4763 // temporary variable. 4764 reverse(Ra, Pa_base, Rlen, t0, t1); 4765 reverse(Ra, Pn_base, Rlen, t0, t1); 4766 } 4767 4768 // Push all call-saved registers and also Pm_base which we'll need 4769 // at the end. 4770 save_regs(); 4771 4772 mov(Pm_base, Ra); 4773 4774 mov(t0, zr); 4775 mov(t1, zr); 4776 mov(t2, zr); 4777 4778 block_comment("for (int i = 0; i < len; i++) {"); 4779 mov(Ri, zr); { 4780 Label loop, end; 4781 bind(loop); 4782 cmp(Ri, Rlen); 4783 br(Assembler::GE, end); 4784 4785 pre1(Ri); 4786 4787 block_comment("for (j = (i+1)/2; j; j--) {"); { 4788 add(Rj, Ri, 1); 4789 lsr(Rj, Rj, 1); 4790 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 4791 } block_comment(" } // j"); 4792 4793 last_squaring(Ri); 4794 4795 block_comment(" for (j = i/2; j; j--) {"); { 4796 lsr(Rj, Ri, 1); 4797 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 4798 } block_comment(" } // j"); 4799 4800 post1_squaring(); 4801 add(Ri, Ri, 1); 4802 cmp(Ri, Rlen); 4803 br(Assembler::LT, loop); 4804 4805 bind(end); 4806 block_comment("} // i"); 4807 } 4808 4809 block_comment("for (int i = len; i < 2*len; i++) {"); 4810 mov(Ri, Rlen); { 4811 Label loop, end; 4812 bind(loop); 4813 cmp(Ri, Rlen, Assembler::LSL, 1); 4814 br(Assembler::GE, end); 4815 4816 pre2(Ri, Rlen); 4817 4818 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 4819 lsl(Rj, Rlen, 1); 4820 sub(Rj, Rj, Ri); 4821 sub(Rj, Rj, 1); 4822 lsr(Rj, Rj, 1); 4823 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 4824 } block_comment(" } // j"); 4825 4826 last_squaring(Ri); 4827 4828 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 4829 lsl(Rj, Rlen, 1); 4830 sub(Rj, Rj, Ri); 4831 lsr(Rj, Rj, 1); 4832 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 4833 } block_comment(" } // j"); 4834 4835 post2(Ri, Rlen); 4836 add(Ri, Ri, 1); 4837 cmp(Ri, Rlen, Assembler::LSL, 1); 4838 4839 br(Assembler::LT, loop); 4840 bind(end); 4841 block_comment("} // i"); 4842 } 4843 4844 normalize(Rlen); 4845 4846 mov(Ra, 
Pm_base);        // Save Pm_base in Ra
4847       restore_regs();  // Restore caller's Pm_base
4848
4849       // Copy our result into caller's Pm_base
4850       reverse(Pm_base, Ra, Rlen, t0, t1);
4851
4852       leave();
4853       ret(lr);
4854
4855       return entry;
4856     }
4857     // In C, approximately:
4858
4859     // void
4860     // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
4861     //                   unsigned long Pm_base[], unsigned long inv, int len) {
4862     //   unsigned long t0 = 0, t1 = 0, t2 = 0;  // Triple-precision accumulator
4863     //   unsigned long *Pa, *Pb, *Pn, *Pm;
4864     //   unsigned long Ra, Rb, Rn, Rm;
4865
4866     //   int i;
4867
4868     //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
4869
4870     //   for (i = 0; i < len; i++) {
4871     //     int j;
4872
4873     //     Pa = Pa_base;
4874     //     Pb = Pa_base + i;
4875     //     Pm = Pm_base;
4876     //     Pn = Pn_base + i;
4877
4878     //     Ra = *Pa;
4879     //     Rb = *Pb;
4880     //     Rm = *Pm;
4881     //     Rn = *Pn;
4882
4883     //     int iters = (i+1)/2;
4884     //     for (j = 0; iters--; j++) {
4885     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4886     //       MACC2(Ra, Rb, t0, t1, t2);
4887     //       Ra = *++Pa;
4888     //       Rb = *--Pb;
4889     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4890     //       MACC(Rm, Rn, t0, t1, t2);
4891     //       Rm = *++Pm;
4892     //       Rn = *--Pn;
4893     //     }
4894     //     if ((i & 1) == 0) {
4895     //       assert(Ra == Pa_base[j], "must be");
4896     //       MACC(Ra, Ra, t0, t1, t2);
4897     //     }
4898     //     iters = i/2;
4899     //     assert(iters == i-j, "must be");
4900     //     for (; iters--; j++) {
4901     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4902     //       MACC(Rm, Rn, t0, t1, t2);
4903     //       Rm = *++Pm;
4904     //       Rn = *--Pn;
4905     //     }
4906
4907     //     *Pm = Rm = t0 * inv;
4908     //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
4909     //     MACC(Rm, Rn, t0, t1, t2);
4910
4911     //     assert(t0 == 0, "broken Montgomery multiply");
4912
4913     //     t0 = t1; t1 = t2; t2 = 0;
4914     //   }
4915
4916     //   for (i = len; i < 2*len; i++) {
4917     //     int start = i-len+1;
4918     //     int end = start + (len - start)/2;
4919     //     int j;
4920
4921     //     Pa = Pa_base + i-len;
4922     //     Pb = Pa_base + len;
4923     //     Pm = Pm_base + i-len;
4924     //     Pn = Pn_base + len;
4925
4926     //     Ra = *++Pa;
4927     //     Rb = *--Pb;
4928     //     Rm = *++Pm;
4929     //     Rn = *--Pn;
4930
4931     //     int iters = (2*len-i-1)/2;
4932     //     assert(iters == end-start, "must be");
4933     //     for (j = start; iters--; j++) {
4934     //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4935     //       MACC2(Ra, Rb, t0, t1, t2);
4936     //       Ra = *++Pa;
4937     //       Rb = *--Pb;
4938     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4939     //       MACC(Rm, Rn, t0, t1, t2);
4940     //       Rm = *++Pm;
4941     //       Rn = *--Pn;
4942     //     }
4943     //     if ((i & 1) == 0) {
4944     //       assert(Ra == Pa_base[j], "must be");
4945     //       MACC(Ra, Ra, t0, t1, t2);
4946     //     }
4947     //     iters = (2*len-i)/2;
4948     //     assert(iters == len-j, "must be");
4949     //     for (; iters--; j++) {
4950     //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4951     //       MACC(Rm, Rn, t0, t1, t2);
4952     //       Rm = *++Pm;
4953     //       Rn = *--Pn;
4954     //     }
4955     //     Pm_base[i-len] = t0;
4956     //     t0 = t1; t1 = t2; t2 = 0;
4957     //   }
4958
4959     //   while (t0)
4960     //     t0 = sub(Pm_base, Pn_base, t0, len);
4961     // }
4962   };
4963
4964
4965   // Initialization
4966   void generate_initial() {
4967     // Generate initial stubs and initialize the entry points
4968
4969     // Entry points that exist on all platforms.  Note: this is code
4970     // that could be shared among different platforms - however the
4971     // benefit seems to be smaller than the disadvantage of having a
4972     // much more complicated generator structure. See also comment in
4973     // stubRoutines.hpp.
4974
4975     StubRoutines::_forward_exception_entry = generate_forward_exception();
4976
4977     StubRoutines::_call_stub_entry =
4978       generate_call_stub(StubRoutines::_call_stub_return_address);
4979
4980     // This is referenced by megamorphic calls
4981     StubRoutines::_catch_exception_entry = generate_catch_exception();
4982
4983     // Build this early so it's available for the interpreter.
4984     StubRoutines::_throw_StackOverflowError_entry =
4985       generate_throw_exception("StackOverflowError throw_exception",
4986                                CAST_FROM_FN_PTR(address,
4987                                                 SharedRuntime::throw_StackOverflowError));
4988     StubRoutines::_throw_delayed_StackOverflowError_entry =
4989       generate_throw_exception("delayed StackOverflowError throw_exception",
4990                                CAST_FROM_FN_PTR(address,
4991                                                 SharedRuntime::throw_delayed_StackOverflowError));
4992     if (UseCRC32Intrinsics) {
4993       // Set the table address before generating the stubs that use it
4994       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
4995       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
4996     }
4997
4998     if (UseCRC32CIntrinsics) {
4999       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
5000     }
5001   }
5002
5003   void generate_all() {
5004     // support for verify_oop (must happen after universe_init)
5005     StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
5006     StubRoutines::_throw_AbstractMethodError_entry =
5007       generate_throw_exception("AbstractMethodError throw_exception",
5008                                CAST_FROM_FN_PTR(address,
5009                                                 SharedRuntime::
5010                                                 throw_AbstractMethodError));
5011
5012     StubRoutines::_throw_IncompatibleClassChangeError_entry =
5013       generate_throw_exception("IncompatibleClassChangeError throw_exception",
5014                                CAST_FROM_FN_PTR(address,
5015                                                 SharedRuntime::
5016                                                 throw_IncompatibleClassChangeError));
5017
5018     StubRoutines::_throw_NullPointerException_at_call_entry =
5019       generate_throw_exception("NullPointerException at call throw_exception",
5020                                CAST_FROM_FN_PTR(address,
5021                                                 SharedRuntime::
5022                                                 throw_NullPointerException_at_call));
5023
5024     // arraycopy stubs used by compilers
5025     generate_arraycopy_stubs();
5026
5027     // has_negatives stub for large arrays.
5028     StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);
5029
5030     if (UseMultiplyToLenIntrinsic) {
5031       StubRoutines::_multiplyToLen = generate_multiplyToLen();
5032     }
5033
5034     if (UseSquareToLenIntrinsic) {
5035       StubRoutines::_squareToLen = generate_squareToLen();
5036     }
5037
5038     if (UseMulAddIntrinsic) {
5039       StubRoutines::_mulAdd = generate_mulAdd();
5040     }
5041
5042     if (UseMontgomeryMultiplyIntrinsic) {
5043       StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
5044       MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
5045       StubRoutines::_montgomeryMultiply = g.generate_multiply();
5046     }
5047
5048     if (UseMontgomerySquareIntrinsic) {
5049       StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
5050       MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
5051       // We use generate_multiply() rather than generate_square()
5052       // because it's faster for the sizes of modulus we care about.
5053 StubRoutines::_montgomerySquare = g.generate_multiply(); 5054 } 5055 5056 if (UseShenandoahGC && (ShenandoahWriteBarrier || ShenandoahStoreValWriteBarrier)) { 5057 StubRoutines::aarch64::_shenandoah_wb = generate_shenandoah_wb(false, true); 5058 StubRoutines::_shenandoah_wb_C = generate_shenandoah_wb(true, !ShenandoahWriteBarrierCsetTestInIR); 5059 } 5060 5061 #ifndef BUILTIN_SIM 5062 // generate GHASH intrinsics code 5063 if (UseGHASHIntrinsics) { 5064 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); 5065 } 5066 5067 if (UseAESIntrinsics) { 5068 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 5069 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 5070 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 5071 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 5072 } 5073 5074 if (UseSHA1Intrinsics) { 5075 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress"); 5076 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB"); 5077 } 5078 if (UseSHA256Intrinsics) { 5079 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress"); 5080 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB"); 5081 } 5082 5083 // generate Adler32 intrinsics code 5084 if (UseAdler32Intrinsics) { 5085 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); 5086 } 5087 5088 // Safefetch stubs. 5089 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry, 5090 &StubRoutines::_safefetch32_fault_pc, 5091 &StubRoutines::_safefetch32_continuation_pc); 5092 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry, 5093 &StubRoutines::_safefetchN_fault_pc, 5094 &StubRoutines::_safefetchN_continuation_pc); 5095 #endif 5096 StubRoutines::aarch64::set_completed(); 5097 } 5098 5099 public: 5100 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) { 5101 if (all) { 5102 generate_all(); 5103 } else { 5104 generate_initial(); 5105 } 5106 } 5107 }; // end class declaration 5108 5109 void StubGenerator_generate(CodeBuffer* code, bool all) { 5110 StubGenerator g(code, all); 5111 }