/*
 * Copyright (c) 2003, 2018, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "gc/shenandoah/brooksPointer.hpp"
#include "gc/shenandoah/shenandoahBarrierSet.hpp"
#include "gc/shenandoah/shenandoahHeap.hpp"
#include "gc/shenandoah/shenandoahHeapRegion.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "utilities/align.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif

#ifdef BUILTIN_SIM
#include "../../../../../../simulator/simulator.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->
#define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
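// n.b. TIMES_OOP scales an array index by the heap oop size: 4 bytes
// when compressed oops are in use, 8 bytes otherwise. The
// sign-extending sxtw form is used, presumably because element counts
// are carried as 32-bit values. (explanatory note)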

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(int& counter) {
    __ lea(rscratch2, ExternalAddress((address)&counter));
    __ ldrw(rscratch1, Address(rscratch2));
    __ addw(rscratch1, rscratch1, 1);
    __ strw(rscratch1, Address(rscratch2));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address              address
  //    c_rarg1:   result                            address
  //    c_rarg2:   result type                       BasicType
  //    c_rarg3:   method                            Method*
  //    c_rarg4:   (interpreter) entry point         address
  //    c_rarg5:   parameters                        intptr_t*
  //    c_rarg6:   parameter size (in words)         int
  //    c_rarg7:   thread                            Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the c arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-r18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -27 [ argument word 1      ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper   (r0)  ]
  //  -7 [ result         (r1)  ]
  //  -6 [ result type    (r2)  ]
  //  -5 [ method         (r3)  ]
  //  -4 [ entry point    (r4)  ]
  //  -3 [ parameters     (r5)  ]
  //  -2 [ parameter size (r6)  ]
  //  -1 [ thread         (r7)  ]
  //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  //   1 [ saved lr       (r30) ]

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off  = -26,

    d15_off            = -26,
    d13_off            = -24,
    d11_off            = -22,
    d9_off             = -20,

    r28_off            = -18,
    r26_off            = -16,
    r24_off            = -14,
    r22_off            = -12,
    r20_off            = -10,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0,
    retaddr_off        =   1,
  };
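  // n.b. only every other save slot has a named offset: the registers
  // are saved and restored in pairs with stp/ldp, so e.g. the pair
  // (r20, r19) stored at r20_off also fills the word above it (r19's
  // slot at -9), and likewise for the d-register pairs.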

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    const Address sp_after_call(rfp, sp_after_call_off * wordSize);

    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off  * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);

    // stub code

    // we need a C prolog to bootstrap the x86 caller into the sim
    __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);

    address aarch64_entry = __ pc();

#ifdef BUILTIN_SIM
    // Save sender's SP for stack traces.
    __ mov(rscratch1, sp);
    __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
#endif
    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5, entry_point);
    __ stp(c_rarg2, c_rarg3, result_type);
    __ stp(c_rarg0, c_rarg1, call_wrapper);

    __ stp(r20, r19, r20_save);
    __ stp(r22, r21, r22_save);
    __ stp(r24, r23, r24_save);
    __ stp(r26, r25, r26_save);
    __ stp(r28, r27, r28_save);

    __ stpd(v9,  v8,  d9_save);
    __ stpd(v11, v10, d11_save);
    __ stpd(v13, v12, d13_save);
    __ stpd(v15, v14, d15_save);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (unsigned)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);
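    // n.b. the loop above is, in outline (sketch only):
    //   while (count-- > 0) push(*params++);
    // so parameter 1 is pushed first and ends up furthest from the
    // final sp, which is the layout the interpreter entry expects.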

    // call Java entry -- passing Method* and current sp
    //      rmethod: Method*
    //      r13: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r13, sp);
    __ blr(c_rarg4);

    // tell the simulator we have returned to the stub

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    if (NotifySimulator) {
      __ notify(Assembler::method_reentry);
    }
    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    __ ldr(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ldr(j_rarg1, result_type);
    __ cmp(j_rarg1, T_OBJECT);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(j_rarg1, T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    // restore callee-save registers
    __ ldpd(v15, v14, d15_save);
    __ ldpd(v13, v12, d13_save);
    __ ldpd(v11, v10, d11_save);
    __ ldpd(v9,  v8,  d9_save);

    __ ldp(r28, r27, r28_save);
    __ ldp(r26, r25, r26_save);
    __ ldp(r24, r23, r24_save);
    __ ldp(r22, r21, r22_save);
    __ ldp(r20, r19, r20_save);

    __ ldp(c_rarg0, c_rarg1, call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3,  method);
    __ ldp(c_rarg4, c_rarg5, entry_point);
    __ ldp(c_rarg6, c_rarg7, parameter_size);

#ifndef PRODUCT
    // tell the simulator we are about to end Java execution
    if (NotifySimulator) {
      __ notify(Assembler::method_exit);
    }
#endif
    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT

    __ BIND(is_long);
    __ str(r0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    return start;
  }
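  // n.b. C++ callers (JavaCalls::call_helper) reach the stub through a
  // function pointer shaped roughly as follows (sketch only -- see the
  // CallStub typedef in stubRoutines.hpp for the authoritative form):
  //
  //   void (*CallStub)(address   call_wrapper_address,
  //                    intptr_t* result,
  //                    BasicType result_type,
  //                    Method*   method,
  //                    address   entry_point,
  //                    intptr_t* parameters,
  //                    int       size_of_parameters,
  //                    Thread*   thread);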

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // rsp.
  //
  // r0: exception oop

  // NOTE: this is used as a target from the signal handler so it
  // needs an x86 prolog which returns into the current simulator
  // executing the generated catch_exception code. so the prolog
  // needs to install rax in a sim register and adjust the sim's
  // restart pc to enter the generated code at the start position
  // then return from native to simulated execution.

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread       (rfp, thread_off        * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM, and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);
    __ mov(r19, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }

  // Shenandoah write barrier.
  //
  // Input:
  //   r0: OOP to evacuate.  Not null.
  //
  // Output:
  //   r0: Pointer to evacuated OOP.
  //
  // Trash rscratch1, rscratch2.  Preserve everything else.

  address generate_shenandoah_wb(bool c_abi, bool do_cset_test) {
    StubCodeMark mark(this, "StubRoutines", "shenandoah_wb");

    __ align(6);
    address start = __ pc();

    if (do_cset_test) {
      Label work;
      __ mov(rscratch2, ShenandoahHeap::in_cset_fast_test_addr());
      __ lsr(rscratch1, r0, ShenandoahHeapRegion::region_size_bytes_shift_jint());
      __ ldrb(rscratch2, Address(rscratch2, rscratch1));
      __ tbnz(rscratch2, 0, work);
      __ ret(lr);
      __ bind(work);
    }

    Register obj = r0;

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    if (!c_abi) {
      __ push_call_clobbered_registers();
    } else {
      __ push_call_clobbered_fp_registers();
    }

    __ mov(lr, CAST_FROM_FN_PTR(address, ShenandoahBarrierSet::write_barrier_JRT));
    __ blrt(lr, 1, 0, MacroAssembler::ret_type_integral);
    if (!c_abi) {
      __ mov(rscratch1, obj);
      __ pop_call_clobbered_registers();
      __ mov(obj, rscratch1);
    } else {
      __ pop_call_clobbered_fp_registers();
    }

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }
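  // n.b. the stub's fast path above is, in C outline (sketch only):
  //   if (!in_cset_fast_test[(uintptr_t)obj >> region_size_shift])
  //     return obj;                  // not in the collection set
  //   return write_barrier_JRT(obj); // slow path: evacuate via runtime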

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {

    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is NULL it is OK

    // Check if the oop is in the right area of memory
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
    __ andr(c_rarg2, r0, c_rarg3);
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());

    // Compare c_rarg2 and c_rarg3.  We don't use a compare
    // instruction here because the flags register is live.
    __ eor(c_rarg2, c_rarg2, c_rarg3);
    __ cbnz(c_rarg2, error);

    // make sure klass is 'reasonable', which is not zero.
    __ load_klass(r0, r0); // get klass
    __ cbz(r0, error);     // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blrt(rscratch1, 3, 0, 1);

    return start;
  }

  void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }

  // The inner part of zero_words().  This is the bulk operation,
  // zeroing words in blocks, possibly using DC ZVA to do it.  The
  // caller is responsible for zeroing the last few words.
  //
  // Inputs:
  // r10: the HeapWord-aligned base address of an array to zero.
  // r11: the count in HeapWords, r11 > 0.
  //
  // Returns r10 and r11, adjusted for the caller to clear.
  // r10: the base address of the tail of words left to clear.
  // r11: the number of words in the tail.
  //      r11 < MacroAssembler::zero_words_block_size.

  address generate_zero_blocks() {
    Label store_pair, loop_store_pair, done;
    Label base_aligned;

    Register base = r10, cnt = r11;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "zero_blocks");
    address start = __ pc();

    if (UseBlockZeroing) {
      int zva_length = VM_Version::zva_length();

      // Ensure ZVA length can be divided by 16. This is required by
      // the subsequent operations.
      assert (zva_length % 16 == 0, "Unexpected ZVA Length");

      __ tbz(base, 3, base_aligned);
      __ str(zr, Address(__ post(base, 8)));
      __ sub(cnt, cnt, 1);
      __ bind(base_aligned);

      // Ensure count >= zva_length * 2 so that it still deserves a zva after
      // alignment.
      Label small;
      int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
      __ subs(rscratch1, cnt, low_limit >> 3);
      __ br(Assembler::LT, small);
      __ zero_dcache_blocks(base, cnt);
      __ bind(small);
    }

    {
      // Number of stp instructions we'll unroll
      const int unroll =
        MacroAssembler::zero_words_block_size / 2;
      // Clear the remaining blocks.
      Label loop;
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::LT, done);
      __ bind(loop);
      for (int i = 0; i < unroll; i++)
        __ stp(zr, zr, __ post(base, 16));
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::GE, loop);
      __ bind(done);
      __ add(cnt, cnt, unroll * 2);
    }

    __ ret(lr);

    return start;
  }
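  // n.b. a caller (see MacroAssembler::zero_words) uses this stub
  // roughly as follows (sketch only):
  //   zero_blocks(base, cnt);         // bulk part, updates r10/r11
  //   while (cnt-- > 0) *base++ = 0;  // remaining tail, caller's job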

  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(Label &start, Register s, Register d, Register count,
                           copy_direction direction) {
    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;

    int offset;
    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r10, t6 = r11, t7 = r12;
    const Register stride = r13;

    assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1);

    Label again, drain;
    const char *stub_name;
    if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
    else
      stub_name = "backward_copy_longs";
    StubCodeMark mark(this, "StubRoutines", stub_name);
    __ align(CodeEntryAlignment);
    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, 8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }
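    // n.b. for backwards copies the prefetch offset is negative; the
    // immediate form of prfm cannot encode a large negative offset, so
    // for intervals beyond 256 bytes we keep the (negative) interval
    // in a register instead. (explanatory note)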

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
        __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
      } else {
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        __ stp(t0, t1, Address(d, 2 * unit));
        __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.

      if (direction == copy_forwards) {
        __ sub(s, s, 16);
        __ sub(d, d, 8);
      }

      // Fill 8 registers
      //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 bit block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
      //
      // t0 at offset 0,  t1 at offset 8
      // t2 at offset 16, t3 at offset 24
      // t4 at offset 32, t5 at offset 40
      // t6 at offset 48, t7 at offset 56

      // for backwards copy s was not offset so the register contents
      // are at these offsets into the preceding 64 byte block
      // relative to that original input and so on for each successive
      // preceding 64 byte block when s is updated. this explains the
      // slightly counter-intuitive looking pattern of register usage
      // in the stp instructions for backwards copy.
      //
      // t0 at offset -16, t1 at offset -8
      // t2 at offset -32, t3 at offset -24
      // t4 at offset -48, t5 at offset -40
      // t6 at offset -64, t7 at offset -56

      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));

      __ subs(count, count, 16);
      __ br(Assembler::LO, drain);

      int prefetch = PrefetchCopyIntervalInBytes;
      bool use_stride = false;
      if (direction == copy_backwards) {
        use_stride = prefetch > 256;
        prefetch = -prefetch;
        if (use_stride) __ mov(stride, prefetch);
      }

      __ bind(again);

      if (PrefetchCopyIntervalInBytes > 0)
        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
        // allowing for the offset of -8 the store instructions place
        // registers into the target 64 bit block at the following
        // offsets
        //
        // t0 at offset 0
        // t1 at offset 8,  t2 at offset 16
        // t3 at offset 24, t4 at offset 32
        // t5 at offset 40, t6 at offset 48
        // t7 at offset 56

        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
        // d was not offset when we started so the registers are
        // written into the 64 bit block preceding d with the following
        // offsets
        //
        // t1 at offset -8
        // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
        // t7 at offset -56, t4 at offset -48
        // t6 at offset -64
        //
        // note that this matches the offsets previously noted for the
        // loads

        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
      } else {
        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
      }
      // now we need to copy any remaining part block which may
      // include a 4 word block subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ stp(t1, t2, Address(d, 2 * unit));
          __ str(t3, Address(__ pre(d, 4 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ stp(t3, t0, Address(d, 3 * unit));
          __ str(t2, Address(__ pre(d, 4 * unit)));
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ str(t1, Address(__ pre(d, 2 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ str(t0, Address(__ pre(d, 2 * unit)));
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;
    int unit = wordSize * direction;

    Label Lpair, Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.
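    // In outline the code below tests successive bits of count and
    // copies correspondingly smaller units; e.g. for byte granularity
    // (sketch only):
    //   if (count & 8) copy 8 bytes;
    //   if (count & 4) copy 4 bytes;
    //   if (count & 2) copy 2 bytes;
    //   if (count & 1) copy 1 byte;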

    __ tbz(count, 3 - exact_log2(granularity), Lword);
    __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
    __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }

  Label copy_f, copy_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //

  void copy_memory(bool is_aligned, Register s, Register d,
                   Register count, Register tmp, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    // <= 96 bytes do inline. Direction doesn't matter because we always
    // load all the data before writing anything
    Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
    const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
    const Register send = r17, dend = r18;

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
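    // n.b. all the small cases below load every byte of the copy
    // before storing any of it, using overlapping head/tail accesses
    // (e.g. a 33..64 byte copy loads the first 32 and the last 32
    // bytes, which may overlap, before storing either), so they are
    // correct even for a conjoint copy in either direction.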
    __ cmp(count, (UseSIMDForMemoryOps ? 96:80)/granularity);
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, 16/granularity);
    __ br(Assembler::LS, copy16);

    __ cmp(count, 64/granularity);
    __ br(Assembler::HI, copy80);

    __ cmp(count, 32/granularity);
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 0));
      __ ldpq(v2, v3, Address(send, -32));
      __ stpq(v0, v1, Address(d, 0));
      __ stpq(v2, v3, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(send, -32));
      __ ldp(t6, t7, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(dend, -32));
      __ stp(t6, t7, Address(dend, -16));
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    __ ldp(t0, t1, Address(s, 0));
    __ ldp(t2, t3, Address(send, -16));
    __ stp(t0, t1, Address(d, 0));
    __ stp(t2, t3, Address(dend, -16));
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
      __ ldpq(v4, v5, Address(send, -32));
      __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
      __ stpq(v4, v5, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(s, 32));
      __ ldp(t6, t7, Address(s, 48));
      __ ldp(t8, t9, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(d, 32));
      __ stp(t6, t7, Address(d, 48));
      __ stp(t8, t9, Address(dend, -16));
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, 8/granularity);
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    __ ldr(t0, Address(s, 0));
    __ ldr(t1, Address(send, -8));
    __ str(t0, Address(d, 0));
    __ str(t1, Address(dend, -8));
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean in the 1 byte case we load/store the same
          // byte 3 times.
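          // e.g. count == 1 copies s[0] three times over;
          //      count == 2 copies s[0], then s[1] twice (as the last
          //      and as the middle byte); count == 3 copies s[0], s[2]
          //      and the middle byte s[1].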
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }

    __ bind(copy_big);
    if (is_backwards) {
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now we've got the small case out of the way we can align the
    // source address on a 2-word boundary.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
      __ tbz(s, exact_log2(wordSize), aligned);
      __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
      __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
      __ sub(count, count, wordSize/granularity);
    } else {
      if (is_backwards) {
        __ andr(rscratch2, s, 2 * wordSize - 1);
      } else {
        __ neg(rscratch2, s);
        __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
      }
      // rscratch2 is the byte adjustment needed to align s.
      __ cbz(rscratch2, aligned);
      int shift = exact_log2(granularity);
      if (shift)  __ lsr(rscratch2, rscratch2, shift);
      __ sub(count, count, rscratch2);

#if 0
      // ?? This code is only correct for a disjoint copy.  It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, rscratch2);
        __ sub(d, d, rscratch2);
      } else {
        __ add(s, s, rscratch2);
        __ add(d, d, rscratch2);
      }
#else
      copy_memory_small(s, d, rscratch2, rscratch1, step);
#endif
    }

    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes.  Adjust the
    // count and do a bulk copy of words.
    __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
    if (direction == copy_forwards)
      __ bl(copy_f);
    else
      __ bl(copy_b);

    // And the tail.
    copy_memory_small(s, d, count, tmp, step);

    if (granularity >= 8) __ bind(copy8);
    if (granularity >= 4) __ bind(copy4);
    __ bind(finish);
  }


  void clobber_registers() {
#ifdef ASSERT
    __ mov(rscratch1, (uint64_t)0xdeadbeef);
    __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
    for (Register r = r3; r <= r18; r++)
      if (r != rscratch1) __ mov(r, rscratch1);
#endif
  }

  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers rscratch1 and rscratch2.
  void verify_oop_array (size_t size, Register a, Register count, Register temp) {
    Label loop, end;
    __ mov(rscratch1, a);
    __ mov(rscratch2, zr);
    __ bind(loop);
    __ cmp(rscratch2, count);
    __ br(Assembler::HS, end);
    if (size == (size_t)wordSize) {
      __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ verify_oop(temp);
    } else {
      // load the narrow oop into temp (the original code loaded into
      // r16, which only worked because all callers pass temp == r16)
      __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ decode_heap_oop(temp); // calls verify_oop
    }
    __ add(rscratch2, rscratch2, size);
    __ b(loop);
    __ bind(end);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
                                 const char *name, bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_reg = RegSet::of(s, d, count);
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    DecoratorSet decorators = ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= AS_DEST_NOT_INITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_reg);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    copy_memory(aligned, s, d, count, rscratch1, size);

    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
    }

    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());

    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
                                 address *entry, const char *name,
                                 bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_regs = RegSet::of(s, d, count);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // use fwd copy when (d-s) above_equal (count*size)
    __ sub(rscratch1, d, s);
    __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
    __ br(Assembler::HS, nooverlap_target);

    DecoratorSet decorators = 0;
    if (dest_uninitialized) {
      decorators |= AS_DEST_NOT_INITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_regs);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    copy_memory(aligned, s, d, count, rscratch1, -size);
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
    }
    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }
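  // n.b. the conjoint stubs dispatch on a single unsigned compare:
  //   if ((uintptr_t)(d - s) >= (uintptr_t)count << log2(size))
  //     use the forward (disjoint) copy;
  // this covers both d < s (the difference wraps to a huge unsigned
  // value) and a destination at least count*size bytes past the
  // source; only a genuinely overlapping d > s falls through to the
  // backward copy. (explanatory note)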

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
  //
  address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                      address* entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_short_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_short_copy().
  //
  address generate_disjoint_short_copy(bool aligned,
                                       address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
                                       address *entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_int_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
                                     address *entry, const char *name,
                                     bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
  }


  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_long_copy(bool aligned, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_long_copy(bool aligned,
                                      address nooverlap_target, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
  }
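  // n.b. the oop copy stubs below reuse the same machinery with the
  // element size of a heap oop: 4 bytes (a narrowOop) when compressed
  // oops are in use, 8 bytes otherwise.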
1719     return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
1720   }
1721
1722   // Arguments:
1723   //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
1724   //             ignored
1725   //   name    - stub name string
1726   //
1727   // Inputs:
1728   //   c_rarg0 - source array address
1729   //   c_rarg1 - destination array address
1730   //   c_rarg2 - element count, treated as size_t, can be zero
1731   //
1732   address generate_conjoint_oop_copy(bool aligned,
1733                                      address nooverlap_target, address *entry,
1734                                      const char *name, bool dest_uninitialized) {
1735     const bool is_oop = true;
1736     const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
1737     return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
1738                                   name, dest_uninitialized);
1739   }
1740
1741
1742   // Helper for generating a dynamic type check.
1743   // Smashes rscratch1.
1744   void generate_type_check(Register sub_klass,
1745                            Register super_check_offset,
1746                            Register super_klass,
1747                            Label& L_success) {
1748     assert_different_registers(sub_klass, super_check_offset, super_klass);
1749
1750     BLOCK_COMMENT("type_check:");
1751
1752     Label L_miss;
1753
1754     __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL,
1755                                      super_check_offset);
1756     __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1757
1758     // Fall through on failure!
1759     __ BIND(L_miss);
1760   }
1761
1762   //
1763   // Generate checkcasting array copy stub
1764   //
1765   // Input:
1766   //   c_rarg0 - source array address
1767   //   c_rarg1 - destination array address
1768   //   c_rarg2 - element count, treated as ssize_t, can be zero
1769   //   c_rarg3 - size_t ckoff (super_check_offset)
1770   //   c_rarg4 - oop ckval (super_klass)
1771   //
1772   // Output:
1773   //   r0 ==  0  -  success
1774   //   r0 == -1^K - failure, where K is partial transfer count
1775   //
1776   address generate_checkcast_copy(const char *name, address *entry,
1777                                   bool dest_uninitialized = false) {
1778
1779     Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1780
1781     // Input registers (after setup_arg_regs)
1782     const Register from        = c_rarg0;   // source array address
1783     const Register to          = c_rarg1;   // destination array address
1784     const Register count       = c_rarg2;   // elements count
1785     const Register ckoff       = c_rarg3;   // super_check_offset
1786     const Register ckval       = c_rarg4;   // super_klass
1787
1788     RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
1789     RegSet wb_post_saved_regs = RegSet::of(count);
1790
1791     // Registers used as temps (r18, r19, r20 are save-on-entry)
1792     const Register count_save  = r21;       // orig elements count
1793     const Register start_to    = r20;       // destination array start address
1794     const Register copied_oop  = r18;       // actual oop copied
1795     const Register r19_klass   = r19;       // oop._klass
1796
1797     //---------------------------------------------------------------
1798     // Assembler stub will be used for this call to arraycopy
1799     // if the two arrays are subtypes of Object[] but the
1800     // destination array type is not equal to or a supertype
1801     // of the source type.  Each element must be separately
1802     // checked.
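    // For example (illustration only), at the Java level the situation
    // looks roughly like
    //
    //   Object[]  src = ...;   // may hold arbitrary objects
    //   Integer[] dst = ...;   // dst element type not a supertype of src's
    //   System.arraycopy(src, 0, dst, 0, n);
    //
    // so each oop loaded from src must pass a subtype check against dst's
    // element klass (ckval) before it may be stored.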
1803 1804 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1805 copied_oop, r19_klass, count_save); 1806 1807 __ align(CodeEntryAlignment); 1808 StubCodeMark mark(this, "StubRoutines", name); 1809 address start = __ pc(); 1810 1811 __ enter(); // required for proper stackwalking of RuntimeStub frame 1812 1813 #ifdef ASSERT 1814 // caller guarantees that the arrays really are different 1815 // otherwise, we would have to make conjoint checks 1816 { Label L; 1817 array_overlap_test(L, TIMES_OOP); 1818 __ stop("checkcast_copy within a single array"); 1819 __ bind(L); 1820 } 1821 #endif //ASSERT 1822 1823 // Caller of this entry point must set up the argument registers. 1824 if (entry != NULL) { 1825 *entry = __ pc(); 1826 BLOCK_COMMENT("Entry:"); 1827 } 1828 1829 // Empty array: Nothing to do. 1830 __ cbz(count, L_done); 1831 1832 __ push(RegSet::of(r18, r19, r20, r21), sp); 1833 1834 #ifdef ASSERT 1835 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1836 // The ckoff and ckval must be mutually consistent, 1837 // even though caller generates both. 1838 { Label L; 1839 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1840 __ ldrw(start_to, Address(ckval, sco_offset)); 1841 __ cmpw(ckoff, start_to); 1842 __ br(Assembler::EQ, L); 1843 __ stop("super_check_offset inconsistent"); 1844 __ bind(L); 1845 } 1846 #endif //ASSERT 1847 1848 DecoratorSet decorators = ARRAYCOPY_CHECKCAST; 1849 bool is_oop = true; 1850 if (dest_uninitialized) { 1851 decorators |= AS_DEST_NOT_INITIALIZED; 1852 } 1853 1854 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1855 bs->arraycopy_prologue(_masm, decorators, is_oop, to, count, wb_pre_saved_regs); 1856 1857 // save the original count 1858 __ mov(count_save, count); 1859 1860 // Copy from low to high addresses 1861 __ mov(start_to, to); // Save destination array start address 1862 __ b(L_load_element); 1863 1864 // ======== begin loop ======== 1865 // (Loop is rotated; its entry is L_load_element.) 1866 // Loop control: 1867 // for (; count != 0; count--) { 1868 // copied_oop = load_heap_oop(from++); 1869 // ... generate_type_check ...; 1870 // store_heap_oop(to++, copied_oop); 1871 // } 1872 __ align(OptoLoopAlignment); 1873 1874 __ BIND(L_store_element); 1875 __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop); // store the oop 1876 __ sub(count, count, 1); 1877 __ cbz(count, L_do_card_marks); 1878 1879 // ======== loop entry is here ======== 1880 __ BIND(L_load_element); 1881 __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop 1882 __ cbz(copied_oop, L_store_element); 1883 1884 __ load_klass(r19_klass, copied_oop);// query the object klass 1885 generate_type_check(r19_klass, ckoff, ckval, L_store_element); 1886 // ======== end loop ======== 1887 1888 // It was a real error; we must depend on the caller to finish the job. 1889 // Register count = remaining oops, count_orig = total oops. 1890 // Emit GC store barriers for the oops we have copied and report 1891 // their number to the caller. 
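    // A sketch of the return encoding computed below: on entry here,
    // 'count' holds the number of oops NOT copied, so K = count_save -
    // count oops were transferred, and the caller receives r0 = -1^K
    // (i.e. ~K = -K - 1).  For example, if 3 of 10 elements were copied
    // before the type check failed, r0 = ~3 = -4 and the caller can
    // recover K as ~r0.  A result of 0 means every element was copied.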
1892
1893     __ subs(count, count_save, count);     // K = partially copied oop count
1894     __ eon(count, count, zr);              // report (-1^K) to caller
1895     __ br(Assembler::EQ, L_done_pop);
1896
1897     __ BIND(L_do_card_marks);
1898     __ add(to, to, -heapOopSize);          // make an inclusive end pointer
1899     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, to, rscratch1, wb_post_saved_regs);
1900
1901     __ bind(L_done_pop);
1902     __ pop(RegSet::of(r18, r19, r20, r21), sp);
1903     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1904
1905     __ bind(L_done);
1906     __ mov(r0, count);
1907     __ leave();
1908     __ ret(lr);
1909
1910     return start;
1911   }
1912
1913   // Perform range checks on the proposed arraycopy.
1914   // Kills temp, but nothing else.
1915   // Also, clean the sign bits of src_pos and dst_pos.
1916   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1917                               Register src_pos, // source position (c_rarg1)
1918                               Register dst,     // destination array oop (c_rarg2)
1919                               Register dst_pos, // destination position (c_rarg3)
1920                               Register length,
1921                               Register temp,
1922                               Label& L_failed) {
1923     BLOCK_COMMENT("arraycopy_range_checks:");
1924
1925     assert_different_registers(rscratch1, temp);
1926
1927     // if (src_pos + length > arrayOop(src)->length()) FAIL;
1928     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1929     __ addw(temp, length, src_pos);
1930     __ cmpw(temp, rscratch1);
1931     __ br(Assembler::HI, L_failed);
1932
1933     // if (dst_pos + length > arrayOop(dst)->length()) FAIL;
1934     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1935     __ addw(temp, length, dst_pos);
1936     __ cmpw(temp, rscratch1);
1937     __ br(Assembler::HI, L_failed);
1938
1939     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1940     __ movw(src_pos, src_pos);
1941     __ movw(dst_pos, dst_pos);
1942
1943     BLOCK_COMMENT("arraycopy_range_checks done");
1944   }
1945
1946   // These stubs get called from some dumb test routine.
1947   // I'll write them properly when they're called from
1948   // something that's actually doing something.
1949   static void fake_arraycopy_stub(address src, address dst, int count) {
1950     assert(count == 0, "huh?");
1951   }
1952
1953
1954   //
1955   // Generate 'unsafe' array copy stub
1956   // Though just as safe as the other stubs, it takes an unscaled
1957   // size_t argument instead of an element count.
1958   //
1959   // Input:
1960   //   c_rarg0 - source array address
1961   //   c_rarg1 - destination array address
1962   //   c_rarg2 - byte count, treated as ssize_t, can be zero
1963   //
1964   // Examines the alignment of the operands and dispatches
1965   // to a long, int, short, or byte copy loop.
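  // A sketch of the dispatch decision made below, on the OR of the two
  // addresses and the byte count (so one test covers all three values):
  //
  //   (s | d | count) & 7 == 0  --> long copy   (count >>= 3)
  //   (s | d | count) & 3 == 0  --> int copy    (count >>= 2)
  //   (s | d | count) & 1 == 0  --> short copy  (count >>= 1)
  //   otherwise                 --> byte copy
  //
  // For example, a 24-byte copy with both addresses 8-byte aligned takes
  // the long loop with 24 >> 3 == 3 elements.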
1966 // 1967 address generate_unsafe_copy(const char *name, 1968 address byte_copy_entry, 1969 address short_copy_entry, 1970 address int_copy_entry, 1971 address long_copy_entry) { 1972 Label L_long_aligned, L_int_aligned, L_short_aligned; 1973 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1974 1975 __ align(CodeEntryAlignment); 1976 StubCodeMark mark(this, "StubRoutines", name); 1977 address start = __ pc(); 1978 __ enter(); // required for proper stackwalking of RuntimeStub frame 1979 1980 // bump this on entry, not on exit: 1981 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 1982 1983 __ orr(rscratch1, s, d); 1984 __ orr(rscratch1, rscratch1, count); 1985 1986 __ andr(rscratch1, rscratch1, BytesPerLong-1); 1987 __ cbz(rscratch1, L_long_aligned); 1988 __ andr(rscratch1, rscratch1, BytesPerInt-1); 1989 __ cbz(rscratch1, L_int_aligned); 1990 __ tbz(rscratch1, 0, L_short_aligned); 1991 __ b(RuntimeAddress(byte_copy_entry)); 1992 1993 __ BIND(L_short_aligned); 1994 __ lsr(count, count, LogBytesPerShort); // size => short_count 1995 __ b(RuntimeAddress(short_copy_entry)); 1996 __ BIND(L_int_aligned); 1997 __ lsr(count, count, LogBytesPerInt); // size => int_count 1998 __ b(RuntimeAddress(int_copy_entry)); 1999 __ BIND(L_long_aligned); 2000 __ lsr(count, count, LogBytesPerLong); // size => long_count 2001 __ b(RuntimeAddress(long_copy_entry)); 2002 2003 return start; 2004 } 2005 2006 // 2007 // Generate generic array copy stubs 2008 // 2009 // Input: 2010 // c_rarg0 - src oop 2011 // c_rarg1 - src_pos (32-bits) 2012 // c_rarg2 - dst oop 2013 // c_rarg3 - dst_pos (32-bits) 2014 // c_rarg4 - element count (32-bits) 2015 // 2016 // Output: 2017 // r0 == 0 - success 2018 // r0 == -1^K - failure, where K is partial transfer count 2019 // 2020 address generate_generic_copy(const char *name, 2021 address byte_copy_entry, address short_copy_entry, 2022 address int_copy_entry, address oop_copy_entry, 2023 address long_copy_entry, address checkcast_copy_entry) { 2024 2025 Label L_failed, L_failed_0, L_objArray; 2026 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 2027 2028 // Input registers 2029 const Register src = c_rarg0; // source array oop 2030 const Register src_pos = c_rarg1; // source position 2031 const Register dst = c_rarg2; // destination array oop 2032 const Register dst_pos = c_rarg3; // destination position 2033 const Register length = c_rarg4; 2034 2035 StubCodeMark mark(this, "StubRoutines", name); 2036 2037 __ align(CodeEntryAlignment); 2038 address start = __ pc(); 2039 2040 __ enter(); // required for proper stackwalking of RuntimeStub frame 2041 2042 // bump this on entry, not on exit: 2043 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 2044 2045 //----------------------------------------------------------------------- 2046 // Assembler stub will be used for this call to arraycopy 2047 // if the following conditions are met: 2048 // 2049 // (1) src and dst must not be null. 2050 // (2) src_pos must not be negative. 2051 // (3) dst_pos must not be negative. 2052 // (4) length must not be negative. 2053 // (5) src klass and dst klass should be the same and not NULL. 2054 // (6) src and dst should be arrays. 2055 // (7) src_pos + length must not exceed length of src. 2056 // (8) dst_pos + length must not exceed length of dst. 2057 // 2058 2059 // if (src == NULL) return -1; 2060 __ cbz(src, L_failed); 2061 2062 // if (src_pos < 0) return -1; 2063 __ tbnz(src_pos, 31, L_failed); // i.e. 
sign bit set 2064 2065 // if (dst == NULL) return -1; 2066 __ cbz(dst, L_failed); 2067 2068 // if (dst_pos < 0) return -1; 2069 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2070 2071 // registers used as temp 2072 const Register scratch_length = r16; // elements count to copy 2073 const Register scratch_src_klass = r17; // array klass 2074 const Register lh = r18; // layout helper 2075 2076 // if (length < 0) return -1; 2077 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2078 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2079 2080 __ load_klass(scratch_src_klass, src); 2081 #ifdef ASSERT 2082 // assert(src->klass() != NULL); 2083 { 2084 BLOCK_COMMENT("assert klasses not null {"); 2085 Label L1, L2; 2086 __ cbnz(scratch_src_klass, L2); // it is broken if klass is NULL 2087 __ bind(L1); 2088 __ stop("broken null klass"); 2089 __ bind(L2); 2090 __ load_klass(rscratch1, dst); 2091 __ cbz(rscratch1, L1); // this would be broken also 2092 BLOCK_COMMENT("} assert klasses not null done"); 2093 } 2094 #endif 2095 2096 // Load layout helper (32-bits) 2097 // 2098 // |array_tag| | header_size | element_type | |log2_element_size| 2099 // 32 30 24 16 8 2 0 2100 // 2101 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2102 // 2103 2104 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2105 2106 // Handle objArrays completely differently... 2107 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2108 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2109 __ movw(rscratch1, objArray_lh); 2110 __ eorw(rscratch2, lh, rscratch1); 2111 __ cbzw(rscratch2, L_objArray); 2112 2113 // if (src->klass() != dst->klass()) return -1; 2114 __ load_klass(rscratch2, dst); 2115 __ eor(rscratch2, rscratch2, scratch_src_klass); 2116 __ cbnz(rscratch2, L_failed); 2117 2118 // if (!src->is_Array()) return -1; 2119 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2120 2121 // At this point, it is known to be a typeArray (array_tag 0x3). 2122 #ifdef ASSERT 2123 { 2124 BLOCK_COMMENT("assert primitive array {"); 2125 Label L; 2126 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2127 __ cmpw(lh, rscratch2); 2128 __ br(Assembler::GE, L); 2129 __ stop("must be a primitive array"); 2130 __ bind(L); 2131 BLOCK_COMMENT("} assert primitive array done"); 2132 } 2133 #endif 2134 2135 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2136 rscratch2, L_failed); 2137 2138 // TypeArrayKlass 2139 // 2140 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2141 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2142 // 2143 2144 const Register rscratch1_offset = rscratch1; // array offset 2145 const Register r18_elsize = lh; // element size 2146 2147 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2148 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2149 __ add(src, src, rscratch1_offset); // src array offset 2150 __ add(dst, dst, rscratch1_offset); // dst array offset 2151 BLOCK_COMMENT("choose copy loop based on element size"); 2152 2153 // next registers should be set before the jump to corresponding stub 2154 const Register from = c_rarg0; // source array address 2155 const Register to = c_rarg1; // destination array address 2156 const Register count = c_rarg2; // elements count 2157 2158 // 'from', 'to', 'count' registers should be set in such order 2159 // since they are the same as 'src', 'src_pos', 'dst'. 
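    // (Note: from/to/count are c_rarg0/c_rarg1/c_rarg2, i.e. the same
    // physical registers as src/src_pos/dst, so the order used below reads
    // each incoming value before the register holding it is overwritten.)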
2160 2161 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2162 2163 // The possible values of elsize are 0-3, i.e. exact_log2(element 2164 // size in bytes). We do a simple bitwise binary search. 2165 __ BIND(L_copy_bytes); 2166 __ tbnz(r18_elsize, 1, L_copy_ints); 2167 __ tbnz(r18_elsize, 0, L_copy_shorts); 2168 __ lea(from, Address(src, src_pos));// src_addr 2169 __ lea(to, Address(dst, dst_pos));// dst_addr 2170 __ movw(count, scratch_length); // length 2171 __ b(RuntimeAddress(byte_copy_entry)); 2172 2173 __ BIND(L_copy_shorts); 2174 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2175 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2176 __ movw(count, scratch_length); // length 2177 __ b(RuntimeAddress(short_copy_entry)); 2178 2179 __ BIND(L_copy_ints); 2180 __ tbnz(r18_elsize, 0, L_copy_longs); 2181 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2182 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2183 __ movw(count, scratch_length); // length 2184 __ b(RuntimeAddress(int_copy_entry)); 2185 2186 __ BIND(L_copy_longs); 2187 #ifdef ASSERT 2188 { 2189 BLOCK_COMMENT("assert long copy {"); 2190 Label L; 2191 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize 2192 __ cmpw(r18_elsize, LogBytesPerLong); 2193 __ br(Assembler::EQ, L); 2194 __ stop("must be long copy, but elsize is wrong"); 2195 __ bind(L); 2196 BLOCK_COMMENT("} assert long copy done"); 2197 } 2198 #endif 2199 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2200 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2201 __ movw(count, scratch_length); // length 2202 __ b(RuntimeAddress(long_copy_entry)); 2203 2204 // ObjArrayKlass 2205 __ BIND(L_objArray); 2206 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2207 2208 Label L_plain_copy, L_checkcast_copy; 2209 // test array classes for subtyping 2210 __ load_klass(r18, dst); 2211 __ cmp(scratch_src_klass, r18); // usual case is exact equality 2212 __ br(Assembler::NE, L_checkcast_copy); 2213 2214 // Identically typed arrays can be copied without element-wise checks. 2215 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2216 rscratch2, L_failed); 2217 2218 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2219 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2220 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2221 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2222 __ movw(count, scratch_length); // length 2223 __ BIND(L_plain_copy); 2224 __ b(RuntimeAddress(oop_copy_entry)); 2225 2226 __ BIND(L_checkcast_copy); 2227 // live at this point: scratch_src_klass, scratch_length, r18 (dst_klass) 2228 { 2229 // Before looking at dst.length, make sure dst is also an objArray. 2230 __ ldrw(rscratch1, Address(r18, lh_offset)); 2231 __ movw(rscratch2, objArray_lh); 2232 __ eorw(rscratch1, rscratch1, rscratch2); 2233 __ cbnzw(rscratch1, L_failed); 2234 2235 // It is safe to examine both src.length and dst.length. 2236 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2237 r18, L_failed); 2238 2239 const Register rscratch2_dst_klass = rscratch2; 2240 __ load_klass(rscratch2_dst_klass, dst); // reload 2241 2242 // Marshal the base address arguments now, freeing registers. 
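    // In effect (a sketch of the address arithmetic below):
    //   from = src + (src_pos << LogBytesPerHeapOop) + base_offset(T_OBJECT)
    //   to   = dst + (dst_pos << LogBytesPerHeapOop) + base_offset(T_OBJECT)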
2243     __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2244     __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2245     __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2246     __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2247     __ movw(count, length);                // length (reloaded)
2248     Register sco_temp = c_rarg3;           // this register is free now
2249     assert_different_registers(from, to, count, sco_temp,
2250                                rscratch2_dst_klass, scratch_src_klass);
2251     // assert_clean_int(count, sco_temp);
2252
2253     // Generate the type check.
2254     const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2255     __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2256     // assert_clean_int(sco_temp, r18);
2257     generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy);
2258
2259     // Fetch destination element klass from the ObjArrayKlass header.
2260     int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2261     __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset));
2262     __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2263
2264     // the checkcast_copy loop needs two extra arguments:
2265     assert(c_rarg3 == sco_temp, "#3 already in place");
2266     // Set up arguments for checkcast_copy_entry.
2267     __ mov(c_rarg4, rscratch2_dst_klass);  // dst.klass.element_klass
2268     __ b(RuntimeAddress(checkcast_copy_entry));
2269   }
2270
2271   __ BIND(L_failed);
2272     __ mov(r0, -1);
2273     __ leave();   // required for proper stackwalking of RuntimeStub frame
2274     __ ret(lr);
2275
2276     return start;
2277   }
2278
2279   //
2280   // Generate stub for array fill. If "aligned" is true, the
2281   // "to" address is assumed to be heapword aligned.
2282   //
2283   // Arguments for generated stub:
2284   //   to:    c_rarg0
2285   //   value: c_rarg1
2286   //   count: c_rarg2 treated as signed
2287   //
2288   address generate_fill(BasicType t, bool aligned, const char *name) {
2289     __ align(CodeEntryAlignment);
2290     StubCodeMark mark(this, "StubRoutines", name);
2291     address start = __ pc();
2292
2293     BLOCK_COMMENT("Entry:");
2294
2295     const Register to        = c_rarg0;  // destination array address
2296     const Register value     = c_rarg1;  // value
2297     const Register count     = c_rarg2;  // elements count
2298
2299     const Register bz_base   = r10;      // base for block_zero routine
2300     const Register cnt_words = r11;      // temp register
2301
2302     __ enter();
2303
2304     Label L_fill_elements, L_exit1;
2305
2306     int shift = -1;
2307     switch (t) {
2308       case T_BYTE:
2309         shift = 0;
2310         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2311         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2312         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2313         __ br(Assembler::LO, L_fill_elements);
2314         break;
2315       case T_SHORT:
2316         shift = 1;
2317         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2318         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2319         __ br(Assembler::LO, L_fill_elements);
2320         break;
2321       case T_INT:
2322         shift = 2;
2323         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2324         __ br(Assembler::LO, L_fill_elements);
2325         break;
2326       default: ShouldNotReachHere();
2327     }
2328
2329     // Align the destination address to an 8-byte boundary.
2330     Label L_skip_align1, L_skip_align2, L_skip_align4;
2331     if (!aligned) {
2332       switch (t) {
2333         case T_BYTE:
2334           // One byte misalignment happens only for byte arrays.
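          // Sketch of the peeling that follows: test successive low bits of
          // 'to' and emit a 1-, 2-, then 4-byte store as needed until the
          // address is 8-byte aligned; e.g. an address ending in binary
          // ...101 takes the 1-byte and then the 2-byte peel.  Each peel
          // uses a post-incremented store and reduces 'count' by the
          // number of elements just written.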
2335 __ tbz(to, 0, L_skip_align1); 2336 __ strb(value, Address(__ post(to, 1))); 2337 __ subw(count, count, 1); 2338 __ bind(L_skip_align1); 2339 // Fallthrough 2340 case T_SHORT: 2341 // Two bytes misalignment happens only for byte and short (char) arrays. 2342 __ tbz(to, 1, L_skip_align2); 2343 __ strh(value, Address(__ post(to, 2))); 2344 __ subw(count, count, 2 >> shift); 2345 __ bind(L_skip_align2); 2346 // Fallthrough 2347 case T_INT: 2348 // Align to 8 bytes, we know we are 4 byte aligned to start. 2349 __ tbz(to, 2, L_skip_align4); 2350 __ strw(value, Address(__ post(to, 4))); 2351 __ subw(count, count, 4 >> shift); 2352 __ bind(L_skip_align4); 2353 break; 2354 default: ShouldNotReachHere(); 2355 } 2356 } 2357 2358 // 2359 // Fill large chunks 2360 // 2361 __ lsrw(cnt_words, count, 3 - shift); // number of words 2362 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2363 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2364 if (UseBlockZeroing) { 2365 Label non_block_zeroing, rest; 2366 // If the fill value is zero we can use the fast zero_words(). 2367 __ cbnz(value, non_block_zeroing); 2368 __ mov(bz_base, to); 2369 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2370 __ zero_words(bz_base, cnt_words); 2371 __ b(rest); 2372 __ bind(non_block_zeroing); 2373 __ fill_words(to, cnt_words, value); 2374 __ bind(rest); 2375 } else { 2376 __ fill_words(to, cnt_words, value); 2377 } 2378 2379 // Remaining count is less than 8 bytes. Fill it by a single store. 2380 // Note that the total length is no less than 8 bytes. 2381 if (t == T_BYTE || t == T_SHORT) { 2382 Label L_exit1; 2383 __ cbzw(count, L_exit1); 2384 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2385 __ str(value, Address(to, -8)); // overwrite some elements 2386 __ bind(L_exit1); 2387 __ leave(); 2388 __ ret(lr); 2389 } 2390 2391 // Handle copies less than 8 bytes. 
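    // Sketch of the element-wise tail fill below: fewer than 8 bytes remain,
    // so the low bits of 'count' say exactly which stores are needed; e.g.
    // for T_BYTE a count of 5 (binary 101) takes the 1-byte and the 4-byte
    // stores and skips the 2-byte one.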
2392 Label L_fill_2, L_fill_4, L_exit2; 2393 __ bind(L_fill_elements); 2394 switch (t) { 2395 case T_BYTE: 2396 __ tbz(count, 0, L_fill_2); 2397 __ strb(value, Address(__ post(to, 1))); 2398 __ bind(L_fill_2); 2399 __ tbz(count, 1, L_fill_4); 2400 __ strh(value, Address(__ post(to, 2))); 2401 __ bind(L_fill_4); 2402 __ tbz(count, 2, L_exit2); 2403 __ strw(value, Address(to)); 2404 break; 2405 case T_SHORT: 2406 __ tbz(count, 0, L_fill_4); 2407 __ strh(value, Address(__ post(to, 2))); 2408 __ bind(L_fill_4); 2409 __ tbz(count, 1, L_exit2); 2410 __ strw(value, Address(to)); 2411 break; 2412 case T_INT: 2413 __ cbzw(count, L_exit2); 2414 __ strw(value, Address(to)); 2415 break; 2416 default: ShouldNotReachHere(); 2417 } 2418 __ bind(L_exit2); 2419 __ leave(); 2420 __ ret(lr); 2421 return start; 2422 } 2423 2424 void generate_arraycopy_stubs() { 2425 address entry; 2426 address entry_jbyte_arraycopy; 2427 address entry_jshort_arraycopy; 2428 address entry_jint_arraycopy; 2429 address entry_oop_arraycopy; 2430 address entry_jlong_arraycopy; 2431 address entry_checkcast_arraycopy; 2432 2433 generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards); 2434 generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards); 2435 2436 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2437 2438 //*** jbyte 2439 // Always need aligned and unaligned versions 2440 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2441 "jbyte_disjoint_arraycopy"); 2442 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2443 &entry_jbyte_arraycopy, 2444 "jbyte_arraycopy"); 2445 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2446 "arrayof_jbyte_disjoint_arraycopy"); 2447 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL, 2448 "arrayof_jbyte_arraycopy"); 2449 2450 //*** jshort 2451 // Always need aligned and unaligned versions 2452 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2453 "jshort_disjoint_arraycopy"); 2454 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2455 &entry_jshort_arraycopy, 2456 "jshort_arraycopy"); 2457 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, 2458 "arrayof_jshort_disjoint_arraycopy"); 2459 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL, 2460 "arrayof_jshort_arraycopy"); 2461 2462 //*** jint 2463 // Aligned versions 2464 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2465 "arrayof_jint_disjoint_arraycopy"); 2466 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2467 "arrayof_jint_arraycopy"); 2468 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 
2469 // entry_jint_arraycopy always points to the unaligned version 2470 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2471 "jint_disjoint_arraycopy"); 2472 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2473 &entry_jint_arraycopy, 2474 "jint_arraycopy"); 2475 2476 //*** jlong 2477 // It is always aligned 2478 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2479 "arrayof_jlong_disjoint_arraycopy"); 2480 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2481 "arrayof_jlong_arraycopy"); 2482 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2483 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2484 2485 //*** oops 2486 { 2487 // With compressed oops we need unaligned versions; notice that 2488 // we overwrite entry_oop_arraycopy. 2489 bool aligned = !UseCompressedOops; 2490 2491 StubRoutines::_arrayof_oop_disjoint_arraycopy 2492 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2493 /*dest_uninitialized*/false); 2494 StubRoutines::_arrayof_oop_arraycopy 2495 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2496 /*dest_uninitialized*/false); 2497 // Aligned versions without pre-barriers 2498 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2499 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2500 /*dest_uninitialized*/true); 2501 StubRoutines::_arrayof_oop_arraycopy_uninit 2502 = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit", 2503 /*dest_uninitialized*/true); 2504 } 2505 2506 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2507 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2508 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2509 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2510 2511 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2512 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, 2513 /*dest_uninitialized*/true); 2514 2515 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2516 entry_jbyte_arraycopy, 2517 entry_jshort_arraycopy, 2518 entry_jint_arraycopy, 2519 entry_jlong_arraycopy); 2520 2521 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2522 entry_jbyte_arraycopy, 2523 entry_jshort_arraycopy, 2524 entry_jint_arraycopy, 2525 entry_oop_arraycopy, 2526 entry_jlong_arraycopy, 2527 entry_checkcast_arraycopy); 2528 2529 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 2530 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2531 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2532 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2533 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 2534 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2535 } 2536 2537 void generate_math_stubs() { Unimplemented(); } 2538 2539 // Arguments: 2540 // 2541 // Inputs: 2542 // c_rarg0 - source byte array address 2543 // c_rarg1 - destination 
byte array address
2544   //   c_rarg2 - K (key) in little endian int array
2545   //
2546   address generate_aescrypt_encryptBlock() {
2547     __ align(CodeEntryAlignment);
2548     StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2549
2550     Label L_doLast;
2551
2552     const Register from   = c_rarg0;  // source array address
2553     const Register to     = c_rarg1;  // destination array address
2554     const Register key    = c_rarg2;  // key array address
2555     const Register keylen = rscratch1;
2556
2557     address start = __ pc();
2558     __ enter();
2559
2560     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2561
2562     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2563
2564     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2565     __ rev32(v1, __ T16B, v1);
2566     __ rev32(v2, __ T16B, v2);
2567     __ rev32(v3, __ T16B, v3);
2568     __ rev32(v4, __ T16B, v4);
2569     __ aese(v0, v1);
2570     __ aesmc(v0, v0);
2571     __ aese(v0, v2);
2572     __ aesmc(v0, v0);
2573     __ aese(v0, v3);
2574     __ aesmc(v0, v0);
2575     __ aese(v0, v4);
2576     __ aesmc(v0, v0);
2577
2578     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2579     __ rev32(v1, __ T16B, v1);
2580     __ rev32(v2, __ T16B, v2);
2581     __ rev32(v3, __ T16B, v3);
2582     __ rev32(v4, __ T16B, v4);
2583     __ aese(v0, v1);
2584     __ aesmc(v0, v0);
2585     __ aese(v0, v2);
2586     __ aesmc(v0, v0);
2587     __ aese(v0, v3);
2588     __ aesmc(v0, v0);
2589     __ aese(v0, v4);
2590     __ aesmc(v0, v0);
2591
2592     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2593     __ rev32(v1, __ T16B, v1);
2594     __ rev32(v2, __ T16B, v2);
2595
2596     __ cmpw(keylen, 44);
2597     __ br(Assembler::EQ, L_doLast);
2598
2599     __ aese(v0, v1);
2600     __ aesmc(v0, v0);
2601     __ aese(v0, v2);
2602     __ aesmc(v0, v0);
2603
2604     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2605     __ rev32(v1, __ T16B, v1);
2606     __ rev32(v2, __ T16B, v2);
2607
2608     __ cmpw(keylen, 52);
2609     __ br(Assembler::EQ, L_doLast);
2610
2611     __ aese(v0, v1);
2612     __ aesmc(v0, v0);
2613     __ aese(v0, v2);
2614     __ aesmc(v0, v0);
2615
2616     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2617     __ rev32(v1, __ T16B, v1);
2618     __ rev32(v2, __ T16B, v2);
2619
2620     __ BIND(L_doLast);
2621
2622     __ aese(v0, v1);
2623     __ aesmc(v0, v0);
2624     __ aese(v0, v2);
2625
2626     __ ld1(v1, __ T16B, key);
2627     __ rev32(v1, __ T16B, v1);
2628     __ eor(v0, __ T16B, v0, v1);
2629
2630     __ st1(v0, __ T16B, to);
2631
2632     __ mov(r0, 0);
2633
2634     __ leave();
2635     __ ret(lr);
2636
2637     return start;
2638   }
2639
2640   // Arguments:
2641   //
2642   // Inputs:
2643   //   c_rarg0 - source byte array address
2644   //   c_rarg1 - destination byte array address
2645   //   c_rarg2 - K (key) in little endian int array
2646   //
2647   address generate_aescrypt_decryptBlock() {
2648     assert(UseAES, "need AES instructions");
2649     __ align(CodeEntryAlignment);
2650     StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2651     Label L_doLast;
2652
2653     const Register from   = c_rarg0;  // source array address
2654     const Register to     = c_rarg1;  // destination array address
2655     const Register key    = c_rarg2;  // key array address
2656     const Register keylen = rscratch1;
2657
2658     address start = __ pc();
2659     __ enter(); // required for proper stackwalking of RuntimeStub frame
2660
2661     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2662
2663     __ ld1(v0, __ T16B, from); // get 16 bytes of input
2664
2665     __ ld1(v5, __ T16B, __ post(key, 16));
2666     __ rev32(v5, __ T16B, v5);
2667
2668     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2669     __ rev32(v1, __ T16B, v1);
2670     __ rev32(v2, __ T16B, v2);
2671     __ rev32(v3, __ T16B, v3);
2672     __ rev32(v4, __ T16B, v4);
2673     __ aesd(v0, v1);
2674     __ aesimc(v0, v0);
2675     __ aesd(v0, v2);
2676     __ aesimc(v0, v0);
2677     __ aesd(v0, v3);
2678     __ aesimc(v0, v0);
2679     __ aesd(v0, v4);
2680     __ aesimc(v0, v0);
2681
2682     __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2683     __ rev32(v1, __ T16B, v1);
2684     __ rev32(v2, __ T16B, v2);
2685     __ rev32(v3, __ T16B, v3);
2686     __ rev32(v4, __ T16B, v4);
2687     __ aesd(v0, v1);
2688     __ aesimc(v0, v0);
2689     __ aesd(v0, v2);
2690     __ aesimc(v0, v0);
2691     __ aesd(v0, v3);
2692     __ aesimc(v0, v0);
2693     __ aesd(v0, v4);
2694     __ aesimc(v0, v0);
2695
2696     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2697     __ rev32(v1, __ T16B, v1);
2698     __ rev32(v2, __ T16B, v2);
2699
2700     __ cmpw(keylen, 44);
2701     __ br(Assembler::EQ, L_doLast);
2702
2703     __ aesd(v0, v1);
2704     __ aesimc(v0, v0);
2705     __ aesd(v0, v2);
2706     __ aesimc(v0, v0);
2707
2708     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2709     __ rev32(v1, __ T16B, v1);
2710     __ rev32(v2, __ T16B, v2);
2711
2712     __ cmpw(keylen, 52);
2713     __ br(Assembler::EQ, L_doLast);
2714
2715     __ aesd(v0, v1);
2716     __ aesimc(v0, v0);
2717     __ aesd(v0, v2);
2718     __ aesimc(v0, v0);
2719
2720     __ ld1(v1, v2, __ T16B, __ post(key, 32));
2721     __ rev32(v1, __ T16B, v1);
2722     __ rev32(v2, __ T16B, v2);
2723
2724     __ BIND(L_doLast);
2725
2726     __ aesd(v0, v1);
2727     __ aesimc(v0, v0);
2728     __ aesd(v0, v2);
2729
2730     __ eor(v0, __ T16B, v0, v5);
2731
2732     __ st1(v0, __ T16B, to);
2733
2734     __ mov(r0, 0);
2735
2736     __ leave();
2737     __ ret(lr);
2738
2739     return start;
2740   }
2741
2742   // Arguments:
2743   //
2744   // Inputs:
2745   //   c_rarg0 - source byte array address
2746   //   c_rarg1 - destination byte array address
2747   //   c_rarg2 - K (key) in little endian int array
2748   //   c_rarg3 - r vector byte array address
2749   //   c_rarg4 - input length
2750   //
2751   // Output:
2752   //   r0      - input length
2753   //
2754   address generate_cipherBlockChaining_encryptAESCrypt() {
2755     assert(UseAES, "need AES instructions");
2756     __ align(CodeEntryAlignment);
2757     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2758
2759     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2760
2761     const Register from    = c_rarg0;  // source array address
2762     const Register to      = c_rarg1;  // destination array address
2763     const Register key     = c_rarg2;  // key array address
2764     const Register rvec    = c_rarg3;  // r byte array initialized from initvector array address
2765                                        // and left with the results of the last encryption block
2766     const Register len_reg = c_rarg4;  // src len (must be multiple of blocksize 16)
2767     const Register keylen  = rscratch1;
2768
2769     address start = __ pc();
2770
2771     __ enter();
2772
2773     __ movw(rscratch2, len_reg);
2774
2775     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2776
2777     __ ld1(v0, __ T16B, rvec);
2778
2779     __ cmpw(keylen, 52);
2780     __ br(Assembler::CC, L_loadkeys_44);
2781     __ br(Assembler::EQ, L_loadkeys_52);
2782
2783     __ ld1(v17, v18, __ T16B, __ post(key, 32));
2784     __ rev32(v17, __ T16B, v17);
2785     __ rev32(v18, __ T16B, v18);
2786     __ BIND(L_loadkeys_52);
2787     __ ld1(v19, v20, __ T16B, __ post(key, 32));
2788     __ rev32(v19, __ T16B, v19);
2789     __ rev32(v20, __ T16B, v20);
2790     __ BIND(L_loadkeys_44);
2791     __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2792     __ rev32(v21, __ T16B, v21);
2793     __ rev32(v22, __ T16B, v22);
2794     __ rev32(v23, __ T16B, v23);
2795     __ rev32(v24, __ T16B, v24);
2796     __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2797     __ rev32(v25, __ T16B, v25);
2798     __ rev32(v26, __ T16B, v26);
2799     __ rev32(v27, __ T16B, v27);
2800     __ rev32(v28, __ T16B, v28);
2801     __ ld1(v29, v30, v31, __ T16B, key);
2802     __ rev32(v29, __ T16B, v29);
2803     __ rev32(v30, __ T16B, v30);
2804     __ rev32(v31, __ T16B, v31);
2805
2806     __ BIND(L_aes_loop);
2807     __ ld1(v1, __ T16B, __ post(from, 16));
2808     __ eor(v0, __ T16B, v0, v1);
2809
2810     __ br(Assembler::CC, L_rounds_44);
2811     __ br(Assembler::EQ, L_rounds_52);
2812
2813     __ aese(v0, v17); __ aesmc(v0, v0);
2814     __ aese(v0, v18); __ aesmc(v0, v0);
2815     __ BIND(L_rounds_52);
2816     __ aese(v0, v19); __ aesmc(v0, v0);
2817     __ aese(v0, v20); __ aesmc(v0, v0);
2818     __ BIND(L_rounds_44);
2819     __ aese(v0, v21); __ aesmc(v0, v0);
2820     __ aese(v0, v22); __ aesmc(v0, v0);
2821     __ aese(v0, v23); __ aesmc(v0, v0);
2822     __ aese(v0, v24); __ aesmc(v0, v0);
2823     __ aese(v0, v25); __ aesmc(v0, v0);
2824     __ aese(v0, v26); __ aesmc(v0, v0);
2825     __ aese(v0, v27); __ aesmc(v0, v0);
2826     __ aese(v0, v28); __ aesmc(v0, v0);
2827     __ aese(v0, v29); __ aesmc(v0, v0);
2828     __ aese(v0, v30);
2829     __ eor(v0, __ T16B, v0, v31);
2830
2831     __ st1(v0, __ T16B, __ post(to, 16));
2832
2833     __ subw(len_reg, len_reg, 16);
2834     __ cbnzw(len_reg, L_aes_loop);
2835
2836     __ st1(v0, __ T16B, rvec);
2837
2838     __ mov(r0, rscratch2);
2839
2840     __ leave();
2841     __ ret(lr);
2842
2843     return start;
2844   }
2845
2846   // Arguments:
2847   //
2848   // Inputs:
2849   //   c_rarg0 - source byte array address
2850   //   c_rarg1 - destination byte array address
2851   //   c_rarg2 - K (key) in little endian int array
2852   //   c_rarg3 - r vector byte array address
2853   //   c_rarg4 - input length
2854   //
2855   // Output:
2856   //   r0      - input length
2857   //
2858   address generate_cipherBlockChaining_decryptAESCrypt() {
2859     assert(UseAES, "need AES instructions");
2860     __ align(CodeEntryAlignment);
2861     StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2862
2863     Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2864
2865     const Register from    = c_rarg0;  // source array address
2866     const Register to      = c_rarg1;  // destination array address
2867     const Register key     = c_rarg2;  // key array address
2868     const Register rvec    = c_rarg3;  // r byte array initialized from initvector array address
2869                                        // and left with the results of the last encryption block
2870     const Register len_reg = c_rarg4;  // src len (must be multiple of blocksize 16)
2871     const Register keylen  = rscratch1;
2872
2873     address start = __ pc();
2874
2875     __ enter();
2876
2877     __ movw(rscratch2, len_reg);
2878
2879     __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
2880
2881     __ ld1(v2, __ T16B, rvec);
2882
2883     __ ld1(v31, __ T16B, __ post(key, 16));
2884     __ rev32(v31, __ T16B, v31);
2885
2886     __ cmpw(keylen, 52);
2887     __ br(Assembler::CC, L_loadkeys_44);
2888     __ br(Assembler::EQ, L_loadkeys_52);
2889
2890     __ ld1(v17, v18, __ T16B, __ post(key, 32));
2891     __ rev32(v17, __ T16B, v17);
2892     __ rev32(v18, __ T16B, v18);
2893     __ BIND(L_loadkeys_52);
2894     __ ld1(v19, v20, __ T16B, __ post(key, 32));
2895     __ rev32(v19, __ T16B, v19);
2896     __ rev32(v20, __ T16B, v20);
2897     __ BIND(L_loadkeys_44);
2898     __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2899     __ rev32(v21, __ T16B, v21);
2900
__ rev32(v22, __ T16B, v22); 2901 __ rev32(v23, __ T16B, v23); 2902 __ rev32(v24, __ T16B, v24); 2903 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2904 __ rev32(v25, __ T16B, v25); 2905 __ rev32(v26, __ T16B, v26); 2906 __ rev32(v27, __ T16B, v27); 2907 __ rev32(v28, __ T16B, v28); 2908 __ ld1(v29, v30, __ T16B, key); 2909 __ rev32(v29, __ T16B, v29); 2910 __ rev32(v30, __ T16B, v30); 2911 2912 __ BIND(L_aes_loop); 2913 __ ld1(v0, __ T16B, __ post(from, 16)); 2914 __ orr(v1, __ T16B, v0, v0); 2915 2916 __ br(Assembler::CC, L_rounds_44); 2917 __ br(Assembler::EQ, L_rounds_52); 2918 2919 __ aesd(v0, v17); __ aesimc(v0, v0); 2920 __ aesd(v0, v18); __ aesimc(v0, v0); 2921 __ BIND(L_rounds_52); 2922 __ aesd(v0, v19); __ aesimc(v0, v0); 2923 __ aesd(v0, v20); __ aesimc(v0, v0); 2924 __ BIND(L_rounds_44); 2925 __ aesd(v0, v21); __ aesimc(v0, v0); 2926 __ aesd(v0, v22); __ aesimc(v0, v0); 2927 __ aesd(v0, v23); __ aesimc(v0, v0); 2928 __ aesd(v0, v24); __ aesimc(v0, v0); 2929 __ aesd(v0, v25); __ aesimc(v0, v0); 2930 __ aesd(v0, v26); __ aesimc(v0, v0); 2931 __ aesd(v0, v27); __ aesimc(v0, v0); 2932 __ aesd(v0, v28); __ aesimc(v0, v0); 2933 __ aesd(v0, v29); __ aesimc(v0, v0); 2934 __ aesd(v0, v30); 2935 __ eor(v0, __ T16B, v0, v31); 2936 __ eor(v0, __ T16B, v0, v2); 2937 2938 __ st1(v0, __ T16B, __ post(to, 16)); 2939 __ orr(v2, __ T16B, v1, v1); 2940 2941 __ subw(len_reg, len_reg, 16); 2942 __ cbnzw(len_reg, L_aes_loop); 2943 2944 __ st1(v2, __ T16B, rvec); 2945 2946 __ mov(r0, rscratch2); 2947 2948 __ leave(); 2949 __ ret(lr); 2950 2951 return start; 2952 } 2953 2954 // Arguments: 2955 // 2956 // Inputs: 2957 // c_rarg0 - byte[] source+offset 2958 // c_rarg1 - int[] SHA.state 2959 // c_rarg2 - int offset 2960 // c_rarg3 - int limit 2961 // 2962 address generate_sha1_implCompress(bool multi_block, const char *name) { 2963 __ align(CodeEntryAlignment); 2964 StubCodeMark mark(this, "StubRoutines", name); 2965 address start = __ pc(); 2966 2967 Register buf = c_rarg0; 2968 Register state = c_rarg1; 2969 Register ofs = c_rarg2; 2970 Register limit = c_rarg3; 2971 2972 Label keys; 2973 Label sha1_loop; 2974 2975 // load the keys into v0..v3 2976 __ adr(rscratch1, keys); 2977 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 2978 // load 5 words state into v6, v7 2979 __ ldrq(v6, Address(state, 0)); 2980 __ ldrs(v7, Address(state, 16)); 2981 2982 2983 __ BIND(sha1_loop); 2984 // load 64 bytes of data into v16..v19 2985 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 2986 __ rev32(v16, __ T16B, v16); 2987 __ rev32(v17, __ T16B, v17); 2988 __ rev32(v18, __ T16B, v18); 2989 __ rev32(v19, __ T16B, v19); 2990 2991 // do the sha1 2992 __ addv(v4, __ T4S, v16, v0); 2993 __ orr(v20, __ T16B, v6, v6); 2994 2995 FloatRegister d0 = v16; 2996 FloatRegister d1 = v17; 2997 FloatRegister d2 = v18; 2998 FloatRegister d3 = v19; 2999 3000 for (int round = 0; round < 20; round++) { 3001 FloatRegister tmp1 = (round & 1) ? v4 : v5; 3002 FloatRegister tmp2 = (round & 1) ? v21 : v22; 3003 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 3004 FloatRegister tmp4 = (round & 1) ? v5 : v4; 3005 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? 
v2 : v3)); 3006 3007 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 3008 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 3009 __ sha1h(tmp2, __ T4S, v20); 3010 if (round < 5) 3011 __ sha1c(v20, __ T4S, tmp3, tmp4); 3012 else if (round < 10 || round >= 15) 3013 __ sha1p(v20, __ T4S, tmp3, tmp4); 3014 else 3015 __ sha1m(v20, __ T4S, tmp3, tmp4); 3016 if (round < 16) __ sha1su1(d0, __ T4S, d3); 3017 3018 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3019 } 3020 3021 __ addv(v7, __ T2S, v7, v21); 3022 __ addv(v6, __ T4S, v6, v20); 3023 3024 if (multi_block) { 3025 __ add(ofs, ofs, 64); 3026 __ cmp(ofs, limit); 3027 __ br(Assembler::LE, sha1_loop); 3028 __ mov(c_rarg0, ofs); // return ofs 3029 } 3030 3031 __ strq(v6, Address(state, 0)); 3032 __ strs(v7, Address(state, 16)); 3033 3034 __ ret(lr); 3035 3036 __ bind(keys); 3037 __ emit_int32(0x5a827999); 3038 __ emit_int32(0x6ed9eba1); 3039 __ emit_int32(0x8f1bbcdc); 3040 __ emit_int32(0xca62c1d6); 3041 3042 return start; 3043 } 3044 3045 3046 // Arguments: 3047 // 3048 // Inputs: 3049 // c_rarg0 - byte[] source+offset 3050 // c_rarg1 - int[] SHA.state 3051 // c_rarg2 - int offset 3052 // c_rarg3 - int limit 3053 // 3054 address generate_sha256_implCompress(bool multi_block, const char *name) { 3055 static const uint32_t round_consts[64] = { 3056 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3057 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3058 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3059 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3060 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3061 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3062 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3063 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3064 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3065 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3066 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3067 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3068 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3069 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3070 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3071 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3072 }; 3073 __ align(CodeEntryAlignment); 3074 StubCodeMark mark(this, "StubRoutines", name); 3075 address start = __ pc(); 3076 3077 Register buf = c_rarg0; 3078 Register state = c_rarg1; 3079 Register ofs = c_rarg2; 3080 Register limit = c_rarg3; 3081 3082 Label sha1_loop; 3083 3084 __ stpd(v8, v9, __ pre(sp, -32)); 3085 __ stpd(v10, v11, Address(sp, 16)); 3086 3087 // dga == v0 3088 // dgb == v1 3089 // dg0 == v2 3090 // dg1 == v3 3091 // dg2 == v4 3092 // t0 == v6 3093 // t1 == v7 3094 3095 // load 16 keys to v16..v31 3096 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3097 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3098 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3099 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3100 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3101 3102 // load 8 words (256 bits) state 3103 __ ldpq(v0, v1, state); 3104 3105 __ BIND(sha1_loop); 3106 // load 64 bytes of data into v8..v11 3107 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? 
__ post(buf, 64) : buf);
3108     __ rev32(v8, __ T16B, v8);
3109     __ rev32(v9, __ T16B, v9);
3110     __ rev32(v10, __ T16B, v10);
3111     __ rev32(v11, __ T16B, v11);
3112
3113     __ addv(v6, __ T4S, v8, v16);
3114     __ orr(v2, __ T16B, v0, v0);
3115     __ orr(v3, __ T16B, v1, v1);
3116
3117     FloatRegister d0 = v8;
3118     FloatRegister d1 = v9;
3119     FloatRegister d2 = v10;
3120     FloatRegister d3 = v11;
3121
3122
3123     for (int round = 0; round < 16; round++) {
3124       FloatRegister tmp1 = (round & 1) ? v6 : v7;
3125       FloatRegister tmp2 = (round & 1) ? v7 : v6;
3126       FloatRegister tmp3 = (round & 1) ? v2 : v4;
3127       FloatRegister tmp4 = (round & 1) ? v4 : v2;
3128
3129       if (round < 12) __ sha256su0(d0, __ T4S, d1);
3130       __ orr(v4, __ T16B, v2, v2);
3131       if (round < 15)
3132         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3133       __ sha256h(v2, __ T4S, v3, tmp2);
3134       __ sha256h2(v3, __ T4S, v4, tmp2);
3135       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3136
3137       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3138     }
3139
3140     __ addv(v0, __ T4S, v0, v2);
3141     __ addv(v1, __ T4S, v1, v3);
3142
3143     if (multi_block) {
3144       __ add(ofs, ofs, 64);
3145       __ cmp(ofs, limit);
3146       __ br(Assembler::LE, sha1_loop);
3147       __ mov(c_rarg0, ofs); // return ofs
3148     }
3149
3150     __ ldpd(v10, v11, Address(sp, 16));
3151     __ ldpd(v8, v9, __ post(sp, 32));
3152
3153     __ stpq(v0, v1, state);
3154
3155     __ ret(lr);
3156
3157     return start;
3158   }
3159
3160 #ifndef BUILTIN_SIM
3161   // Safefetch stubs.
3162   void generate_safefetch(const char* name, int size, address* entry,
3163                           address* fault_pc, address* continuation_pc) {
3164     // safefetch signatures:
3165     //   int      SafeFetch32(int*      adr, int      errValue);
3166     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3167     //
3168     // arguments:
3169     //   c_rarg0 = adr
3170     //   c_rarg1 = errValue
3171     //
3172     // result:
3173     //   r0 = *adr or errValue
3174
3175     StubCodeMark mark(this, "StubRoutines", name);
3176
3177     // Entry point, pc or function descriptor.
3178     *entry = __ pc();
3179
3180     // Load *adr into c_rarg1, may fault.
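    // (How the out-parameters are used, roughly: if the load below faults,
    // the VM's signal handler recognizes the faulting pc as *fault_pc and
    // resumes execution at *continuation_pc, so the errValue already in
    // c_rarg1 is returned instead of the loaded value.)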
3181     *fault_pc = __ pc();
3182     switch (size) {
3183       case 4:
3184         // int32_t
3185         __ ldrw(c_rarg1, Address(c_rarg0, 0));
3186         break;
3187       case 8:
3188         // int64_t
3189         __ ldr(c_rarg1, Address(c_rarg0, 0));
3190         break;
3191       default:
3192         ShouldNotReachHere();
3193     }
3194
3195     // return errValue or *adr
3196     *continuation_pc = __ pc();
3197     __ mov(r0, c_rarg1);
3198     __ ret(lr);
3199   }
3200 #endif
3201
3202   /**
3203    * Arguments:
3204    *
3205    * Inputs:
3206    *   c_rarg0   - int crc
3207    *   c_rarg1   - byte* buf
3208    *   c_rarg2   - int length
3209    *
3210    * Output:
3211    *   r0        - int crc result
3212    */
3213   address generate_updateBytesCRC32() {
3214     assert(UseCRC32Intrinsics, "what are we doing here?");
3215
3216     __ align(CodeEntryAlignment);
3217     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3218
3219     address start = __ pc();
3220
3221     const Register crc    = c_rarg0;  // crc
3222     const Register buf    = c_rarg1;  // source java byte array address
3223     const Register len    = c_rarg2;  // length
3224     const Register table0 = c_rarg3;  // crc_table address
3225     const Register table1 = c_rarg4;
3226     const Register table2 = c_rarg5;
3227     const Register table3 = c_rarg6;
3228     const Register tmp3   = c_rarg7;
3229
3230     BLOCK_COMMENT("Entry:");
3231     __ enter(); // required for proper stackwalking of RuntimeStub frame
3232
3233     __ kernel_crc32(crc, buf, len,
3234                     table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3235
3236     __ leave(); // required for proper stackwalking of RuntimeStub frame
3237     __ ret(lr);
3238
3239     return start;
3240   }
3241
3242   /**
3243    * Arguments:
3244    *
3245    * Inputs:
3246    *   c_rarg0   - int crc
3247    *   c_rarg1   - byte* buf
3248    *   c_rarg2   - int length
3249    *   c_rarg3   - int* table
3250    *
3251    * Output:
3252    *   r0        - int crc result
3253    */
3254   address generate_updateBytesCRC32C() {
3255     assert(UseCRC32CIntrinsics, "what are we doing here?");
3256
3257     __ align(CodeEntryAlignment);
3258     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3259
3260     address start = __ pc();
3261
3262     const Register crc    = c_rarg0;  // crc
3263     const Register buf    = c_rarg1;  // source java byte array address
3264     const Register len    = c_rarg2;  // length
3265     const Register table0 = c_rarg3;  // crc_table address
3266     const Register table1 = c_rarg4;
3267     const Register table2 = c_rarg5;
3268     const Register table3 = c_rarg6;
3269     const Register tmp3   = c_rarg7;
3270
3271     BLOCK_COMMENT("Entry:");
3272     __ enter(); // required for proper stackwalking of RuntimeStub frame
3273
3274     __ kernel_crc32c(crc, buf, len,
3275                      table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3276
3277     __ leave(); // required for proper stackwalking of RuntimeStub frame
3278     __ ret(lr);
3279
3280     return start;
3281   }
3282
3283   /**
3284    * Arguments:
3285    *
3286    * Inputs:
3287    *   c_rarg0   - int   adler
3288    *   c_rarg1   - byte* buff
3289    *   c_rarg2   - int   len
3290    *
3291    * Output:
3292    *   c_rarg0   - int adler result
3293    */
3294   address generate_updateBytesAdler32() {
3295     __ align(CodeEntryAlignment);
3296     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3297     address start = __ pc();
3298
3299     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3300
3301     // Aliases
3302     Register adler = c_rarg0;
3303     Register s1    = c_rarg0;
3304     Register s2    = c_rarg3;
3305     Register buff  = c_rarg1;
3306     Register len   = c_rarg2;
3307     Register nmax  = r4;
3308     Register base  = r5;
3309     Register count = r6;
3310     Register temp0 = rscratch1;
3311     Register temp1 = rscratch2;
3312     Register temp2 = r7;
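    // A sketch of the checksum being computed (standard Adler-32):
    //   s1    = 1 + (sum of all bytes)             mod BASE
    //   s2    = (sum of the successive s1 values)  mod BASE
    //   adler = (s2 << 16) | s1
    // The expensive mod is deferred until NMAX bytes have been absorbed,
    // which is safe because the running sums cannot overflow before then.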
3313 3314 // Max number of bytes we can process before having to take the mod 3315 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 3316 unsigned long BASE = 0xfff1; 3317 unsigned long NMAX = 0x15B0; 3318 3319 __ mov(base, BASE); 3320 __ mov(nmax, NMAX); 3321 3322 // s1 is initialized to the lower 16 bits of adler 3323 // s2 is initialized to the upper 16 bits of adler 3324 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 3325 __ uxth(s1, adler); // s1 = (adler & 0xffff) 3326 3327 // The pipelined loop needs at least 16 elements for 1 iteration 3328 // It does check this, but it is more effective to skip to the cleanup loop 3329 __ cmp(len, 16); 3330 __ br(Assembler::HS, L_nmax); 3331 __ cbz(len, L_combine); 3332 3333 __ bind(L_simple_by1_loop); 3334 __ ldrb(temp0, Address(__ post(buff, 1))); 3335 __ add(s1, s1, temp0); 3336 __ add(s2, s2, s1); 3337 __ subs(len, len, 1); 3338 __ br(Assembler::HI, L_simple_by1_loop); 3339 3340 // s1 = s1 % BASE 3341 __ subs(temp0, s1, base); 3342 __ csel(s1, temp0, s1, Assembler::HS); 3343 3344 // s2 = s2 % BASE 3345 __ lsr(temp0, s2, 16); 3346 __ lsl(temp1, temp0, 4); 3347 __ sub(temp1, temp1, temp0); 3348 __ add(s2, temp1, s2, ext::uxth); 3349 3350 __ subs(temp0, s2, base); 3351 __ csel(s2, temp0, s2, Assembler::HS); 3352 3353 __ b(L_combine); 3354 3355 __ bind(L_nmax); 3356 __ subs(len, len, nmax); 3357 __ sub(count, nmax, 16); 3358 __ br(Assembler::LO, L_by16); 3359 3360 __ bind(L_nmax_loop); 3361 3362 __ ldp(temp0, temp1, Address(__ post(buff, 16))); 3363 3364 __ add(s1, s1, temp0, ext::uxtb); 3365 __ ubfx(temp2, temp0, 8, 8); 3366 __ add(s2, s2, s1); 3367 __ add(s1, s1, temp2); 3368 __ ubfx(temp2, temp0, 16, 8); 3369 __ add(s2, s2, s1); 3370 __ add(s1, s1, temp2); 3371 __ ubfx(temp2, temp0, 24, 8); 3372 __ add(s2, s2, s1); 3373 __ add(s1, s1, temp2); 3374 __ ubfx(temp2, temp0, 32, 8); 3375 __ add(s2, s2, s1); 3376 __ add(s1, s1, temp2); 3377 __ ubfx(temp2, temp0, 40, 8); 3378 __ add(s2, s2, s1); 3379 __ add(s1, s1, temp2); 3380 __ ubfx(temp2, temp0, 48, 8); 3381 __ add(s2, s2, s1); 3382 __ add(s1, s1, temp2); 3383 __ add(s2, s2, s1); 3384 __ add(s1, s1, temp0, Assembler::LSR, 56); 3385 __ add(s2, s2, s1); 3386 3387 __ add(s1, s1, temp1, ext::uxtb); 3388 __ ubfx(temp2, temp1, 8, 8); 3389 __ add(s2, s2, s1); 3390 __ add(s1, s1, temp2); 3391 __ ubfx(temp2, temp1, 16, 8); 3392 __ add(s2, s2, s1); 3393 __ add(s1, s1, temp2); 3394 __ ubfx(temp2, temp1, 24, 8); 3395 __ add(s2, s2, s1); 3396 __ add(s1, s1, temp2); 3397 __ ubfx(temp2, temp1, 32, 8); 3398 __ add(s2, s2, s1); 3399 __ add(s1, s1, temp2); 3400 __ ubfx(temp2, temp1, 40, 8); 3401 __ add(s2, s2, s1); 3402 __ add(s1, s1, temp2); 3403 __ ubfx(temp2, temp1, 48, 8); 3404 __ add(s2, s2, s1); 3405 __ add(s1, s1, temp2); 3406 __ add(s2, s2, s1); 3407 __ add(s1, s1, temp1, Assembler::LSR, 56); 3408 __ add(s2, s2, s1); 3409 3410 __ subs(count, count, 16); 3411 __ br(Assembler::HS, L_nmax_loop); 3412 3413 // s1 = s1 % BASE 3414 __ lsr(temp0, s1, 16); 3415 __ lsl(temp1, temp0, 4); 3416 __ sub(temp1, temp1, temp0); 3417 __ add(temp1, temp1, s1, ext::uxth); 3418 3419 __ lsr(temp0, temp1, 16); 3420 __ lsl(s1, temp0, 4); 3421 __ sub(s1, s1, temp0); 3422 __ add(s1, s1, temp1, ext:: uxth); 3423 3424 __ subs(temp0, s1, base); 3425 __ csel(s1, temp0, s1, Assembler::HS); 3426 3427 // s2 = s2 % BASE 3428 __ lsr(temp0, s2, 16); 3429 __ lsl(temp1, temp0, 4); 3430 __ sub(temp1, temp1, temp0); 3431 __ add(temp1, temp1, s2, ext::uxth); 3432 3433 __ lsr(temp0, temp1, 16); 
3434 __ lsl(s2, temp0, 4);
3435 __ sub(s2, s2, temp0);
3436 __ add(s2, s2, temp1, ext::uxth);
3437
3438 __ subs(temp0, s2, base);
3439 __ csel(s2, temp0, s2, Assembler::HS);
3440
3441 __ subs(len, len, nmax);
3442 __ sub(count, nmax, 16);
3443 __ br(Assembler::HS, L_nmax_loop);
3444
3445 __ bind(L_by16);
3446 __ adds(len, len, count);
3447 __ br(Assembler::LO, L_by1);
3448
3449 __ bind(L_by16_loop);
3450
3451 __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3452
3453 __ add(s1, s1, temp0, ext::uxtb);
3454 __ ubfx(temp2, temp0, 8, 8);
3455 __ add(s2, s2, s1);
3456 __ add(s1, s1, temp2);
3457 __ ubfx(temp2, temp0, 16, 8);
3458 __ add(s2, s2, s1);
3459 __ add(s1, s1, temp2);
3460 __ ubfx(temp2, temp0, 24, 8);
3461 __ add(s2, s2, s1);
3462 __ add(s1, s1, temp2);
3463 __ ubfx(temp2, temp0, 32, 8);
3464 __ add(s2, s2, s1);
3465 __ add(s1, s1, temp2);
3466 __ ubfx(temp2, temp0, 40, 8);
3467 __ add(s2, s2, s1);
3468 __ add(s1, s1, temp2);
3469 __ ubfx(temp2, temp0, 48, 8);
3470 __ add(s2, s2, s1);
3471 __ add(s1, s1, temp2);
3472 __ add(s2, s2, s1);
3473 __ add(s1, s1, temp0, Assembler::LSR, 56);
3474 __ add(s2, s2, s1);
3475
3476 __ add(s1, s1, temp1, ext::uxtb);
3477 __ ubfx(temp2, temp1, 8, 8);
3478 __ add(s2, s2, s1);
3479 __ add(s1, s1, temp2);
3480 __ ubfx(temp2, temp1, 16, 8);
3481 __ add(s2, s2, s1);
3482 __ add(s1, s1, temp2);
3483 __ ubfx(temp2, temp1, 24, 8);
3484 __ add(s2, s2, s1);
3485 __ add(s1, s1, temp2);
3486 __ ubfx(temp2, temp1, 32, 8);
3487 __ add(s2, s2, s1);
3488 __ add(s1, s1, temp2);
3489 __ ubfx(temp2, temp1, 40, 8);
3490 __ add(s2, s2, s1);
3491 __ add(s1, s1, temp2);
3492 __ ubfx(temp2, temp1, 48, 8);
3493 __ add(s2, s2, s1);
3494 __ add(s1, s1, temp2);
3495 __ add(s2, s2, s1);
3496 __ add(s1, s1, temp1, Assembler::LSR, 56);
3497 __ add(s2, s2, s1);
3498
3499 __ subs(len, len, 16);
3500 __ br(Assembler::HS, L_by16_loop);
3501
3502 __ bind(L_by1);
3503 __ adds(len, len, 15);
3504 __ br(Assembler::LO, L_do_mod);
3505
3506 __ bind(L_by1_loop);
3507 __ ldrb(temp0, Address(__ post(buff, 1)));
3508 __ add(s1, temp0, s1);
3509 __ add(s2, s2, s1);
3510 __ subs(len, len, 1);
3511 __ br(Assembler::HS, L_by1_loop);
3512
3513 __ bind(L_do_mod);
3514 // s1 = s1 % BASE
3515 __ lsr(temp0, s1, 16);
3516 __ lsl(temp1, temp0, 4);
3517 __ sub(temp1, temp1, temp0);
3518 __ add(temp1, temp1, s1, ext::uxth);
3519
3520 __ lsr(temp0, temp1, 16);
3521 __ lsl(s1, temp0, 4);
3522 __ sub(s1, s1, temp0);
3523 __ add(s1, s1, temp1, ext::uxth);
3524
3525 __ subs(temp0, s1, base);
3526 __ csel(s1, temp0, s1, Assembler::HS);
3527
3528 // s2 = s2 % BASE
3529 __ lsr(temp0, s2, 16);
3530 __ lsl(temp1, temp0, 4);
3531 __ sub(temp1, temp1, temp0);
3532 __ add(temp1, temp1, s2, ext::uxth);
3533
3534 __ lsr(temp0, temp1, 16);
3535 __ lsl(s2, temp0, 4);
3536 __ sub(s2, s2, temp0);
3537 __ add(s2, s2, temp1, ext::uxth);
3538
3539 __ subs(temp0, s2, base);
3540 __ csel(s2, temp0, s2, Assembler::HS);
3541
3542 // Combine lower bits and higher bits
3543 __ bind(L_combine);
3544 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3545
3546 __ ret(lr);
3547
3548 return start;
3549 }
3550
3551 /**
3552 * Arguments:
3553 *
3554 * Input:
3555 * c_rarg0 - x address
3556 * c_rarg1 - x length
3557 * c_rarg2 - y address
3558 * c_rarg3 - y length
3559 * c_rarg4 - z address
3560 * c_rarg5 - z length
3561 */
3562 address generate_multiplyToLen() {
3563 __ align(CodeEntryAlignment);
3564 StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3565
3566 address start = __ pc();
3567 const Register x = r0;
3568 const Register xlen = r1;
3569 const Register y = r2;
3570 const Register ylen = r3;
3571 const Register z = r4;
3572 const Register zlen = r5;
3573
3574 const Register tmp1 = r10;
3575 const Register tmp2 = r11;
3576 const Register tmp3 = r12;
3577 const Register tmp4 = r13;
3578 const Register tmp5 = r14;
3579 const Register tmp6 = r15;
3580 const Register tmp7 = r16;
3581
3582 BLOCK_COMMENT("Entry:");
3583 __ enter(); // required for proper stackwalking of RuntimeStub frame
3584 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3585 __ leave(); // required for proper stackwalking of RuntimeStub frame
3586 __ ret(lr);
3587
3588 return start;
3589 }
3590
3591 address generate_squareToLen() {
3592 // The squareToLen algorithm (for sizes 1..127, described in Java code) is
3593 // faster than multiply_to_len on some CPUs and slower on others, but
3594 // multiply_to_len shows slightly better results overall
3595 __ align(CodeEntryAlignment);
3596 StubCodeMark mark(this, "StubRoutines", "squareToLen");
3597 address start = __ pc();
3598
3599 const Register x = r0;
3600 const Register xlen = r1;
3601 const Register z = r2;
3602 const Register zlen = r3;
3603 const Register y = r4; // == x
3604 const Register ylen = r5; // == xlen
3605
3606 const Register tmp1 = r10;
3607 const Register tmp2 = r11;
3608 const Register tmp3 = r12;
3609 const Register tmp4 = r13;
3610 const Register tmp5 = r14;
3611 const Register tmp6 = r15;
3612 const Register tmp7 = r16;
3613
3614 RegSet spilled_regs = RegSet::of(y, ylen);
3615 BLOCK_COMMENT("Entry:");
3616 __ enter();
3617 __ push(spilled_regs, sp);
3618 __ mov(y, x);
3619 __ mov(ylen, xlen);
3620 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3621 __ pop(spilled_regs, sp);
3622 __ leave();
3623 __ ret(lr);
3624 return start;
3625 }
3626
3627 address generate_mulAdd() {
3628 __ align(CodeEntryAlignment);
3629 StubCodeMark mark(this, "StubRoutines", "mulAdd");
3630
3631 address start = __ pc();
3632
3633 const Register out = r0;
3634 const Register in = r1;
3635 const Register offset = r2;
3636 const Register len = r3;
3637 const Register k = r4;
3638
3639 BLOCK_COMMENT("Entry:");
3640 __ enter();
3641 __ mul_add(out, in, offset, len, k);
3642 __ leave();
3643 __ ret(lr);
3644
3645 return start;
3646 }
3647
3648 void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3649 FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3650 FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3651 // Karatsuba multiplication performs a 128*128 -> 256-bit
3652 // multiplication in three 128-bit multiplications and a few
3653 // additions.
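// (Note: the arithmetic here is carry-less, so "+" below is XOR; expanding
// (A0+A1)(B0+B1) = A1*B1 + A1*B0 + A0*B1 + A0*B0 and XORing away C = A1*B1
// and D = A0*B0 leaves exactly the two middle partial products, which is
// why the three multiplications in the identity below suffice.)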
3654 //
3655 // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3656 // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3657 //
3658 // Inputs:
3659 //
3660 // A0 in a.d[0] (subkey)
3661 // A1 in a.d[1]
3662 // (A1+A0) in a1_xor_a0.d[0]
3663 //
3664 // B0 in b.d[0] (state)
3665 // B1 in b.d[1]
3666
3667 __ ext(tmp1, __ T16B, b, b, 0x08);
3668 __ pmull2(result_hi, __ T1Q, b, a, __ T2D); // A1*B1
3669 __ eor(tmp1, __ T16B, tmp1, b); // (B1+B0)
3670 __ pmull(result_lo, __ T1Q, b, a, __ T1D); // A0*B0
3671 __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3672
3673 __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3674 __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3675 __ eor(tmp2, __ T16B, tmp2, tmp4);
3676 __ eor(tmp2, __ T16B, tmp2, tmp3);
3677
3678 // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3679 __ ins(result_hi, __ D, tmp2, 0, 1);
3680 __ ins(result_lo, __ D, tmp2, 1, 0);
3681 }
3682
3683 void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3684 FloatRegister p, FloatRegister z, FloatRegister t1) {
3685 const FloatRegister t0 = result;
3686
3687 // The GCM field polynomial f is z^128 + p(z), where p =
3688 // z^7+z^2+z+1.
3689 //
3690 // z^128 === -p(z) (mod (z^128 + p(z)))
3691 //
3692 // so, given that the product we're reducing is
3693 // a == lo + hi * z^128
3694 // substituting,
3695 // === lo - hi * p(z) (mod (z^128 + p(z)))
3696 //
3697 // we reduce by multiplying hi by p(z) and subtracting the result
3698 // from (i.e. XORing it with) lo. Because p has no nonzero high
3699 // bits we can do this with two 64-bit multiplications, lo*p and
3700 // hi*p.
3701
3702 __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3703 __ ext(t1, __ T16B, t0, z, 8);
3704 __ eor(hi, __ T16B, hi, t1);
3705 __ ext(t1, __ T16B, z, t0, 8);
3706 __ eor(lo, __ T16B, lo, t1);
3707 __ pmull(t0, __ T1Q, hi, p, __ T1D);
3708 __ eor(result, __ T16B, lo, t0);
3709 }
3710
3711 address generate_has_negatives(address &has_negatives_long) {
3712 StubCodeMark mark(this, "StubRoutines", "has_negatives");
3713 const int large_loop_size = 64;
3714 const uint64_t UPPER_BIT_MASK=0x8080808080808080;
3715 int dcache_line = VM_Version::dcache_line_size();
3716
3717 Register ary1 = r1, len = r2, result = r0;
3718
3719 __ align(CodeEntryAlignment);
3720 address entry = __ pc();
3721
3722 __ enter();
3723
3724 Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
3725 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
3726
3727 __ cmp(len, 15);
3728 __ br(Assembler::GT, LEN_OVER_15);
3729 // The only case when execution falls into this code is when the pointer is
3730 // near the end of a memory page and we have to avoid reading the next page
3731 __ add(ary1, ary1, len);
3732 __ subs(len, len, 8);
3733 __ br(Assembler::GT, LEN_OVER_8);
3734 __ ldr(rscratch2, Address(ary1, -8));
3735 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes.
3736 __ lsrv(rscratch2, rscratch2, rscratch1);
3737 __ tst(rscratch2, UPPER_BIT_MASK);
3738 __ cset(result, Assembler::NE);
3739 __ leave();
3740 __ ret(lr);
3741 __ bind(LEN_OVER_8);
3742 __ ldp(rscratch1, rscratch2, Address(ary1, -16));
3743 __ sub(len, len, 8); // no data dependency, so the sub can be executed while loading
3744 __ tst(rscratch2, UPPER_BIT_MASK);
3745 __ br(Assembler::NE, RET_TRUE_NO_POP);
3746 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
3747 __ lsrv(rscratch1, rscratch1, rscratch2);
3748 __ tst(rscratch1, UPPER_BIT_MASK);
3749 __ cset(result, Assembler::NE);
3750 __ leave();
3751 __ ret(lr);
3752
3753 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
3754 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
3755
3756 has_negatives_long = __ pc(); // 2nd entry point
3757
3758 __ enter();
3759
3760 __ bind(LEN_OVER_15);
3761 __ push(spilled_regs, sp);
3762 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
3763 __ cbz(rscratch2, ALIGNED);
3764 __ ldp(tmp6, tmp1, Address(ary1));
3765 __ mov(tmp5, 16);
3766 __ sub(rscratch1, tmp5, rscratch2); // number of bytes until the aligned address
3767 __ add(ary1, ary1, rscratch1);
3768 __ sub(len, len, rscratch1);
3769 __ orr(tmp6, tmp6, tmp1);
3770 __ tst(tmp6, UPPER_BIT_MASK);
3771 __ br(Assembler::NE, RET_TRUE);
3772
3773 __ bind(ALIGNED);
3774 __ cmp(len, large_loop_size);
3775 __ br(Assembler::LT, CHECK_16);
3776 // Perform a 16-byte load as an early-return check in the pre-loop: if an
3777 // initially aligned large array has negative values in its starting bytes,
3778 // LARGE_LOOP would otherwise do 4 reads instead of 1 in the worst case,
3779 // which is slower. Cases with negative bytes further ahead won't be
3780 // affected much; in fact they'll be faster due to the early loads and the
3781 // fewer instructions and branches in LARGE_LOOP.
3782 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
3783 __ sub(len, len, 16);
3784 __ orr(tmp6, tmp6, tmp1);
3785 __ tst(tmp6, UPPER_BIT_MASK);
3786 __ br(Assembler::NE, RET_TRUE);
3787 __ cmp(len, large_loop_size);
3788 __ br(Assembler::LT, CHECK_16);
3789
3790 if (SoftwarePrefetchHintDistance >= 0
3791 && SoftwarePrefetchHintDistance >= dcache_line) {
3792 // initial prefetch
3793 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
3794 }
3795 __ bind(LARGE_LOOP);
3796 if (SoftwarePrefetchHintDistance >= 0) {
3797 __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
3798 }
3799 // Issue the load instructions first, since that can save a few CPU/MEM
3800 // cycles. Also, instead of 4 triples of "orr(...); andr(...); cbnz(...)"
3801 // (one per ldp) it is better to generate 7 * orr(...) + 1 andr(...) +
3802 // 1 cbnz(...), which needs fewer instructions and branches; the trade-off
3803 // is that early return is disabled, so all 64 bytes are loaded and checked every time.
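// In C, approximately (a sketch of the 64-byte step below; w[] is an
// illustrative name for the eight dwords loaded from ary1):
//   uint64_t m = w[0] | w[1] | w[2] | w[3] | w[4] | w[5] | w[6] | w[7];
//   ary1 += 64; len -= 64;
//   if (m & 0x8080808080808080) return true;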
3804 __ ldp(tmp2, tmp3, Address(ary1)); 3805 __ ldp(tmp4, tmp5, Address(ary1, 16)); 3806 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 3807 __ ldp(tmp6, tmp1, Address(ary1, 48)); 3808 __ add(ary1, ary1, large_loop_size); 3809 __ sub(len, len, large_loop_size); 3810 __ orr(tmp2, tmp2, tmp3); 3811 __ orr(tmp4, tmp4, tmp5); 3812 __ orr(rscratch1, rscratch1, rscratch2); 3813 __ orr(tmp6, tmp6, tmp1); 3814 __ orr(tmp2, tmp2, tmp4); 3815 __ orr(rscratch1, rscratch1, tmp6); 3816 __ orr(tmp2, tmp2, rscratch1); 3817 __ tst(tmp2, UPPER_BIT_MASK); 3818 __ br(Assembler::NE, RET_TRUE); 3819 __ cmp(len, large_loop_size); 3820 __ br(Assembler::GE, LARGE_LOOP); 3821 3822 __ bind(CHECK_16); // small 16-byte load pre-loop 3823 __ cmp(len, 16); 3824 __ br(Assembler::LT, POST_LOOP16); 3825 3826 __ bind(LOOP16); // small 16-byte load loop 3827 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 3828 __ sub(len, len, 16); 3829 __ orr(tmp2, tmp2, tmp3); 3830 __ tst(tmp2, UPPER_BIT_MASK); 3831 __ br(Assembler::NE, RET_TRUE); 3832 __ cmp(len, 16); 3833 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 3834 3835 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 3836 __ cmp(len, 8); 3837 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 3838 __ ldr(tmp3, Address(__ post(ary1, 8))); 3839 __ sub(len, len, 8); 3840 __ tst(tmp3, UPPER_BIT_MASK); 3841 __ br(Assembler::NE, RET_TRUE); 3842 3843 __ bind(POST_LOOP16_LOAD_TAIL); 3844 __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0 3845 __ ldr(tmp1, Address(ary1)); 3846 __ mov(tmp2, 64); 3847 __ sub(tmp4, tmp2, len, __ LSL, 3); 3848 __ lslv(tmp1, tmp1, tmp4); 3849 __ tst(tmp1, UPPER_BIT_MASK); 3850 __ br(Assembler::NE, RET_TRUE); 3851 // Fallthrough 3852 3853 __ bind(RET_FALSE); 3854 __ pop(spilled_regs, sp); 3855 __ leave(); 3856 __ mov(result, zr); 3857 __ ret(lr); 3858 3859 __ bind(RET_TRUE); 3860 __ pop(spilled_regs, sp); 3861 __ bind(RET_TRUE_NO_POP); 3862 __ leave(); 3863 __ mov(result, 1); 3864 __ ret(lr); 3865 3866 __ bind(DONE); 3867 __ pop(spilled_regs, sp); 3868 __ leave(); 3869 __ ret(lr); 3870 return entry; 3871 } 3872 3873 void generate_large_array_equals_loop_nonsimd(int loopThreshold, 3874 bool usePrefetch, Label &NOT_EQUAL) { 3875 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 3876 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 3877 tmp7 = r12, tmp8 = r13; 3878 Label LOOP; 3879 3880 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3881 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3882 __ bind(LOOP); 3883 if (usePrefetch) { 3884 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 3885 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 3886 } 3887 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 3888 __ eor(tmp1, tmp1, tmp2); 3889 __ eor(tmp3, tmp3, tmp4); 3890 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 3891 __ orr(tmp1, tmp1, tmp3); 3892 __ cbnz(tmp1, NOT_EQUAL); 3893 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3894 __ eor(tmp5, tmp5, tmp6); 3895 __ eor(tmp7, tmp7, tmp8); 3896 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3897 __ orr(tmp5, tmp5, tmp7); 3898 __ cbnz(tmp5, NOT_EQUAL); 3899 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 3900 __ eor(tmp1, tmp1, tmp2); 3901 __ eor(tmp3, tmp3, tmp4); 3902 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 3903 __ orr(tmp1, tmp1, tmp3); 3904 __ cbnz(tmp1, NOT_EQUAL); 3905 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3906 __ eor(tmp5, tmp5, tmp6); 3907 __ 
sub(cnt1, cnt1, 8 * wordSize);
3908 __ eor(tmp7, tmp7, tmp8);
3909 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3910 __ cmp(cnt1, loopThreshold);
3911 __ orr(tmp5, tmp5, tmp7);
3912 __ cbnz(tmp5, NOT_EQUAL);
3913 __ br(__ GE, LOOP);
3914 // post-loop
3915 __ eor(tmp1, tmp1, tmp2);
3916 __ eor(tmp3, tmp3, tmp4);
3917 __ orr(tmp1, tmp1, tmp3);
3918 __ sub(cnt1, cnt1, 2 * wordSize);
3919 __ cbnz(tmp1, NOT_EQUAL);
3920 }
3921
3922 void generate_large_array_equals_loop_simd(int loopThreshold,
3923 bool usePrefetch, Label &NOT_EQUAL) {
3924 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3925 tmp2 = rscratch2;
3926 Label LOOP;
3927
3928 __ bind(LOOP);
3929 if (usePrefetch) {
3930 __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3931 __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3932 }
3933 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
3934 __ sub(cnt1, cnt1, 8 * wordSize);
3935 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
3936 __ cmp(cnt1, loopThreshold);
3937 __ eor(v0, __ T16B, v0, v4);
3938 __ eor(v1, __ T16B, v1, v5);
3939 __ eor(v2, __ T16B, v2, v6);
3940 __ eor(v3, __ T16B, v3, v7);
3941 __ orr(v0, __ T16B, v0, v1);
3942 __ orr(v1, __ T16B, v2, v3);
3943 __ orr(v0, __ T16B, v0, v1);
3944 __ umov(tmp1, v0, __ D, 0);
3945 __ umov(tmp2, v0, __ D, 1);
3946 __ orr(tmp1, tmp1, tmp2);
3947 __ cbnz(tmp1, NOT_EQUAL);
3948 __ br(__ GE, LOOP);
3949 }
3950
3951 // a1 = r1 - array1 address
3952 // a2 = r2 - array2 address
3953 // result = r0 - return value. Already contains "false"
3954 // cnt1 = r10 - number of elements left to check, reduced by wordSize
3955 // r3-r5 are reserved temporary registers
3956 address generate_large_array_equals() {
3957 StubCodeMark mark(this, "StubRoutines", "large_array_equals");
3958 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3959 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3960 tmp7 = r12, tmp8 = r13;
3961 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
3962 SMALL_LOOP, POST_LOOP;
3963 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
3964 // threshold chosen so that at least 32 prefetched bytes are actually used
3965 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
3966 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
3967 RegSet spilled_regs = RegSet::range(tmp6, tmp8);
3968 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
3969 tmp5, tmp6, tmp7, tmp8);
3970
3971 __ align(CodeEntryAlignment);
3972 address entry = __ pc();
3973 __ enter();
3974 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub
3975 // also advance pointers to use post-increment instead of pre-increment
3976 __ add(a1, a1, wordSize);
3977 __ add(a2, a2, wordSize);
3978 if (AvoidUnalignedAccesses) {
3979 // Both implementations (SIMD/non-SIMD) use relatively large load
3980 // instructions (ld1/ldp), which carry a huge penalty (up to 2x execution
3981 // time) on some CPUs when the address is not at least 16-byte aligned.
3982 // Arrays are currently 8-byte aligned, so if needed we do an additional
3983 // 8-byte load for the first address to make it 16-byte aligned.
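// In C, approximately (a sketch of the fix-up below; only a1's alignment is
// tested, and a2 may remain unaligned):
//   if ((uintptr_t)a1 & 8) {
//     uint64_t x = *(uint64_t*)a1, y = *(uint64_t*)a2;
//     a1 += 8; a2 += 8; cnt1 -= 8;
//     if (x != y) goto NOT_EQUAL_NO_POP;
//   }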
3984 Label ALIGNED16;
3985 __ tbz(a1, 3, ALIGNED16);
3986 __ ldr(tmp1, Address(__ post(a1, wordSize)));
3987 __ ldr(tmp2, Address(__ post(a2, wordSize)));
3988 __ sub(cnt1, cnt1, wordSize);
3989 __ eor(tmp1, tmp1, tmp2);
3990 __ cbnz(tmp1, NOT_EQUAL_NO_POP);
3991 __ bind(ALIGNED16);
3992 }
3993 if (UseSIMDForArrayEquals) {
3994 if (SoftwarePrefetchHintDistance >= 0) {
3995 __ cmp(cnt1, prefetchLoopThreshold);
3996 __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3997 generate_large_array_equals_loop_simd(prefetchLoopThreshold,
3998 /* prfm = */ true, NOT_EQUAL);
3999 __ cmp(cnt1, nonPrefetchLoopThreshold);
4000 __ br(__ LT, TAIL);
4001 }
4002 __ bind(NO_PREFETCH_LARGE_LOOP);
4003 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
4004 /* prfm = */ false, NOT_EQUAL);
4005 } else {
4006 __ push(spilled_regs, sp);
4007 if (SoftwarePrefetchHintDistance >= 0) {
4008 __ cmp(cnt1, prefetchLoopThreshold);
4009 __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
4010 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
4011 /* prfm = */ true, NOT_EQUAL);
4012 __ cmp(cnt1, nonPrefetchLoopThreshold);
4013 __ br(__ LT, TAIL);
4014 }
4015 __ bind(NO_PREFETCH_LARGE_LOOP);
4016 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
4017 /* prfm = */ false, NOT_EQUAL);
4018 }
4019 __ bind(TAIL);
4020 __ cbz(cnt1, EQUAL);
4021 __ subs(cnt1, cnt1, wordSize);
4022 __ br(__ LE, POST_LOOP);
4023 __ bind(SMALL_LOOP);
4024 __ ldr(tmp1, Address(__ post(a1, wordSize)));
4025 __ ldr(tmp2, Address(__ post(a2, wordSize)));
4026 __ subs(cnt1, cnt1, wordSize);
4027 __ eor(tmp1, tmp1, tmp2);
4028 __ cbnz(tmp1, NOT_EQUAL);
4029 __ br(__ GT, SMALL_LOOP);
4030 __ bind(POST_LOOP);
4031 __ ldr(tmp1, Address(a1, cnt1));
4032 __ ldr(tmp2, Address(a2, cnt1));
4033 __ eor(tmp1, tmp1, tmp2);
4034 __ cbnz(tmp1, NOT_EQUAL);
4035 __ bind(EQUAL);
4036 __ mov(result, true);
4037 __ bind(NOT_EQUAL);
4038 if (!UseSIMDForArrayEquals) {
4039 __ pop(spilled_regs, sp);
4040 }
4041 __ bind(NOT_EQUAL_NO_POP);
4042 __ leave();
4043 __ ret(lr);
4044 return entry;
4045 }
4046
4047
4048 /**
4049 * Arguments:
4050 *
4051 * Input:
4052 * c_rarg0 - current state address
4053 * c_rarg1 - H key address
4054 * c_rarg2 - data address
4055 * c_rarg3 - number of blocks
4056 *
4057 * Output:
4058 * Updated state at c_rarg0
4059 */
4060 address generate_ghash_processBlocks() {
4061 // Bafflingly, GCM uses little-endian for the byte order, but
4062 // big-endian for the bit order. For example, the polynomial 1 is
4063 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
4064 //
4065 // So, we must either reverse the bytes in each word and do
4066 // everything big-endian or reverse the bits in each byte and do
4067 // it little-endian. On AArch64 it's more idiomatic to reverse
4068 // the bits in each byte (we have an instruction, RBIT, to do
4069 // that) and keep the data in little-endian bit order throughout the
4070 // calculation, bit-reversing the inputs and outputs.
4071
4072 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4073 __ align(wordSize * 2);
4074 address p = __ pc();
4075 __ emit_int64(0x87); // The low-order bits of the field
4076 // polynomial (i.e.
p = z^7+z^2+z+1) 4077 // repeated in the low and high parts of a 4078 // 128-bit vector 4079 __ emit_int64(0x87); 4080 4081 __ align(CodeEntryAlignment); 4082 address start = __ pc(); 4083 4084 Register state = c_rarg0; 4085 Register subkeyH = c_rarg1; 4086 Register data = c_rarg2; 4087 Register blocks = c_rarg3; 4088 4089 FloatRegister vzr = v30; 4090 __ eor(vzr, __ T16B, vzr, vzr); // zero register 4091 4092 __ ldrq(v0, Address(state)); 4093 __ ldrq(v1, Address(subkeyH)); 4094 4095 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 4096 __ rbit(v0, __ T16B, v0); 4097 __ rev64(v1, __ T16B, v1); 4098 __ rbit(v1, __ T16B, v1); 4099 4100 __ ldrq(v26, p); 4101 4102 __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 4103 __ eor(v16, __ T16B, v16, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 4104 4105 { 4106 Label L_ghash_loop; 4107 __ bind(L_ghash_loop); 4108 4109 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 4110 // reversing each byte 4111 __ rbit(v2, __ T16B, v2); 4112 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 4113 4114 // Multiply state in v2 by subkey in v1 4115 ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 4116 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16, 4117 /*temps*/v6, v20, v18, v21); 4118 // Reduce v7:v5 by the field polynomial 4119 ghash_reduce(v0, v5, v7, v26, vzr, v20); 4120 4121 __ sub(blocks, blocks, 1); 4122 __ cbnz(blocks, L_ghash_loop); 4123 } 4124 4125 // The bit-reversed result is at this point in v0 4126 __ rev64(v1, __ T16B, v0); 4127 __ rbit(v1, __ T16B, v1); 4128 4129 __ st1(v1, __ T16B, state); 4130 __ ret(lr); 4131 4132 return start; 4133 } 4134 4135 // Continuation point for throwing of implicit exceptions that are 4136 // not handled in the current activation. Fabricates an exception 4137 // oop and initiates normal exception dispatching in this 4138 // frame. Since we need to preserve callee-saved values (currently 4139 // only for C2, but done for C1 as well) we need a callee-saved oop 4140 // map and therefore have to make these stubs into RuntimeStubs 4141 // rather than BufferBlobs. If the compiler needs all registers to 4142 // be preserved between the fault point and the exception handler 4143 // then it must assume responsibility for that in 4144 // AbstractCompiler::continuation_for_implicit_null_exception or 4145 // continuation_for_implicit_division_by_zero_exception. All other 4146 // implicit exceptions (e.g., NullPointerException or 4147 // AbstractMethodError on entry) are either at call sites or 4148 // otherwise assume that stack unwinding will be initiated, so 4149 // caller saved registers were assumed volatile in the compiler. 4150 4151 #undef __ 4152 #define __ masm-> 4153 4154 address generate_throw_exception(const char* name, 4155 address runtime_entry, 4156 Register arg1 = noreg, 4157 Register arg2 = noreg) { 4158 // Information about frame layout at time of blocking runtime call. 4159 // Note that we only have to preserve callee-saved registers since 4160 // the compilers are responsible for supplying a continuation point 4161 // if they expect all registers to be preserved. 4162 // n.b. 
aarch64 asserts that frame::arg_reg_save_area_bytes == 0
4163 enum layout {
4164 rfp_off = 0,
4165 rfp_off2,
4166 return_off,
4167 return_off2,
4168 framesize // inclusive of return address
4169 };
4170
4171 int insts_size = 512;
4172 int locs_size = 64;
4173
4174 CodeBuffer code(name, insts_size, locs_size);
4175 OopMapSet* oop_maps = new OopMapSet();
4176 MacroAssembler* masm = new MacroAssembler(&code);
4177
4178 address start = __ pc();
4179
4180 // This is an inlined and slightly modified version of call_VM
4181 // which has the ability to fetch the return PC out of
4182 // thread-local storage and also sets up last_Java_sp slightly
4183 // differently than the real call_VM
4184
4185 __ enter(); // Save FP and LR before call
4186
4187 assert(is_even(framesize/2), "sp not 16-byte aligned");
4188
4189 // lr and fp are already in place
4190 __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
4191
4192 int frame_complete = __ pc() - start;
4193
4194 // Set up last_Java_sp and last_Java_fp
4195 address the_pc = __ pc();
4196 __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1);
4197
4198 // Call runtime
4199 if (arg1 != noreg) {
4200 assert(arg2 != c_rarg1, "clobbered");
4201 __ mov(c_rarg1, arg1);
4202 }
4203 if (arg2 != noreg) {
4204 __ mov(c_rarg2, arg2);
4205 }
4206 __ mov(c_rarg0, rthread);
4207 BLOCK_COMMENT("call runtime_entry");
4208 __ mov(rscratch1, runtime_entry);
4209 __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);
4210
4211 // Generate oop map
4212 OopMap* map = new OopMap(framesize, 0);
4213
4214 oop_maps->add_gc_map(the_pc - start, map);
4215
4216 __ reset_last_Java_frame(true);
4217 __ maybe_isb();
4218
4219 __ leave();
4220
4221 // check for pending exceptions
4222 #ifdef ASSERT
4223 Label L;
4224 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
4225 __ cbnz(rscratch1, L);
4226 __ should_not_reach_here();
4227 __ bind(L);
4228 #endif // ASSERT
4229 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
4230
4231
4232 // codeBlob framesize is in words (not VMRegImpl::slot_size)
4233 RuntimeStub* stub =
4234 RuntimeStub::new_runtime_stub(name,
4235 &code,
4236 frame_complete,
4237 (framesize >> (LogBytesPerWord - LogBytesPerInt)),
4238 oop_maps, false);
4239 return stub->entry_point();
4240 }
4241
4242 class MontgomeryMultiplyGenerator : public MacroAssembler {
4243
4244 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
4245 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
4246
4247 RegSet _toSave;
4248 bool _squaring;
4249
4250 public:
4251 MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
4252 : MacroAssembler(as->code()), _squaring(squaring) {
4253
4254 // Register allocation
4255
4256 Register reg = c_rarg0;
4257 Pa_base = reg; // Argument registers
4258 if (squaring)
4259 Pb_base = Pa_base;
4260 else
4261 Pb_base = ++reg;
4262 Pn_base = ++reg;
4263 Rlen = ++reg;
4264 inv = ++reg;
4265 Pm_base = ++reg;
4266
4267 // Working registers:
4268 Ra = ++reg; // The current digit of a, b, n, and m.
4269 Rb = ++reg;
4270 Rm = ++reg;
4271 Rn = ++reg;
4272
4273 Pa = ++reg; // Pointers to the current/next digit of a, b, n, and m.
4274 Pb = ++reg;
4275 Pm = ++reg;
4276 Pn = ++reg;
4277
4278 t0 = ++reg; // Three registers which form a
4279 t1 = ++reg; // triple-precision accumulator.
4280 t2 = ++reg;
4281
4282 Ri = ++reg; // Inner and outer loop indexes.
4283 Rj = ++reg; 4284 4285 Rhi_ab = ++reg; // Product registers: low and high parts 4286 Rlo_ab = ++reg; // of a*b and m*n. 4287 Rhi_mn = ++reg; 4288 Rlo_mn = ++reg; 4289 4290 // r19 and up are callee-saved. 4291 _toSave = RegSet::range(r19, reg) + Pm_base; 4292 } 4293 4294 private: 4295 void save_regs() { 4296 push(_toSave, sp); 4297 } 4298 4299 void restore_regs() { 4300 pop(_toSave, sp); 4301 } 4302 4303 template <typename T> 4304 void unroll_2(Register count, T block) { 4305 Label loop, end, odd; 4306 tbnz(count, 0, odd); 4307 cbz(count, end); 4308 align(16); 4309 bind(loop); 4310 (this->*block)(); 4311 bind(odd); 4312 (this->*block)(); 4313 subs(count, count, 2); 4314 br(Assembler::GT, loop); 4315 bind(end); 4316 } 4317 4318 template <typename T> 4319 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 4320 Label loop, end, odd; 4321 tbnz(count, 0, odd); 4322 cbz(count, end); 4323 align(16); 4324 bind(loop); 4325 (this->*block)(d, s, tmp); 4326 bind(odd); 4327 (this->*block)(d, s, tmp); 4328 subs(count, count, 2); 4329 br(Assembler::GT, loop); 4330 bind(end); 4331 } 4332 4333 void pre1(RegisterOrConstant i) { 4334 block_comment("pre1"); 4335 // Pa = Pa_base; 4336 // Pb = Pb_base + i; 4337 // Pm = Pm_base; 4338 // Pn = Pn_base + i; 4339 // Ra = *Pa; 4340 // Rb = *Pb; 4341 // Rm = *Pm; 4342 // Rn = *Pn; 4343 ldr(Ra, Address(Pa_base)); 4344 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 4345 ldr(Rm, Address(Pm_base)); 4346 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4347 lea(Pa, Address(Pa_base)); 4348 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 4349 lea(Pm, Address(Pm_base)); 4350 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4351 4352 // Zero the m*n result. 4353 mov(Rhi_mn, zr); 4354 mov(Rlo_mn, zr); 4355 } 4356 4357 // The core multiply-accumulate step of a Montgomery 4358 // multiplication. The idea is to schedule operations as a 4359 // pipeline so that instructions with long latencies (loads and 4360 // multiplies) have time to complete before their results are 4361 // used. This most benefits in-order implementations of the 4362 // architecture but out-of-order ones also benefit. 4363 void step() { 4364 block_comment("step"); 4365 // MACC(Ra, Rb, t0, t1, t2); 4366 // Ra = *++Pa; 4367 // Rb = *--Pb; 4368 umulh(Rhi_ab, Ra, Rb); 4369 mul(Rlo_ab, Ra, Rb); 4370 ldr(Ra, pre(Pa, wordSize)); 4371 ldr(Rb, pre(Pb, -wordSize)); 4372 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 4373 // previous iteration. 
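// (Note: in these pseudocode comments MACC(a, b, t0, t1, t2) denotes a
// multiply-accumulate into the triple-precision accumulator t2:t1:t0,
// i.e. t2:t1:t0 += (128-bit product) a * b; here it is realized as
// umulh/mul plus acc().)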
4374 // MACC(Rm, Rn, t0, t1, t2); 4375 // Rm = *++Pm; 4376 // Rn = *--Pn; 4377 umulh(Rhi_mn, Rm, Rn); 4378 mul(Rlo_mn, Rm, Rn); 4379 ldr(Rm, pre(Pm, wordSize)); 4380 ldr(Rn, pre(Pn, -wordSize)); 4381 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4382 } 4383 4384 void post1() { 4385 block_comment("post1"); 4386 4387 // MACC(Ra, Rb, t0, t1, t2); 4388 // Ra = *++Pa; 4389 // Rb = *--Pb; 4390 umulh(Rhi_ab, Ra, Rb); 4391 mul(Rlo_ab, Ra, Rb); 4392 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 4393 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4394 4395 // *Pm = Rm = t0 * inv; 4396 mul(Rm, t0, inv); 4397 str(Rm, Address(Pm)); 4398 4399 // MACC(Rm, Rn, t0, t1, t2); 4400 // t0 = t1; t1 = t2; t2 = 0; 4401 umulh(Rhi_mn, Rm, Rn); 4402 4403 #ifndef PRODUCT 4404 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 4405 { 4406 mul(Rlo_mn, Rm, Rn); 4407 add(Rlo_mn, t0, Rlo_mn); 4408 Label ok; 4409 cbz(Rlo_mn, ok); { 4410 stop("broken Montgomery multiply"); 4411 } bind(ok); 4412 } 4413 #endif 4414 // We have very carefully set things up so that 4415 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 4416 // the lower half of Rm * Rn because we know the result already: 4417 // it must be -t0. t0 + (-t0) must generate a carry iff 4418 // t0 != 0. So, rather than do a mul and an adds we just set 4419 // the carry flag iff t0 is nonzero. 4420 // 4421 // mul(Rlo_mn, Rm, Rn); 4422 // adds(zr, t0, Rlo_mn); 4423 subs(zr, t0, 1); // Set carry iff t0 is nonzero 4424 adcs(t0, t1, Rhi_mn); 4425 adc(t1, t2, zr); 4426 mov(t2, zr); 4427 } 4428 4429 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 4430 block_comment("pre2"); 4431 // Pa = Pa_base + i-len; 4432 // Pb = Pb_base + len; 4433 // Pm = Pm_base + i-len; 4434 // Pn = Pn_base + len; 4435 4436 if (i.is_register()) { 4437 sub(Rj, i.as_register(), len); 4438 } else { 4439 mov(Rj, i.as_constant()); 4440 sub(Rj, Rj, len); 4441 } 4442 // Rj == i-len 4443 4444 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 4445 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 4446 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 4447 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 4448 4449 // Ra = *++Pa; 4450 // Rb = *--Pb; 4451 // Rm = *++Pm; 4452 // Rn = *--Pn; 4453 ldr(Ra, pre(Pa, wordSize)); 4454 ldr(Rb, pre(Pb, -wordSize)); 4455 ldr(Rm, pre(Pm, wordSize)); 4456 ldr(Rn, pre(Pn, -wordSize)); 4457 4458 mov(Rhi_mn, zr); 4459 mov(Rlo_mn, zr); 4460 } 4461 4462 void post2(RegisterOrConstant i, RegisterOrConstant len) { 4463 block_comment("post2"); 4464 if (i.is_constant()) { 4465 mov(Rj, i.as_constant()-len.as_constant()); 4466 } else { 4467 sub(Rj, i.as_register(), len); 4468 } 4469 4470 adds(t0, t0, Rlo_mn); // The pending m*n, low part 4471 4472 // As soon as we know the least significant digit of our result, 4473 // store it. 4474 // Pm_base[i-len] = t0; 4475 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 4476 4477 // t0 = t1; t1 = t2; t2 = 0; 4478 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 4479 adc(t1, t2, zr); 4480 mov(t2, zr); 4481 } 4482 4483 // A carry in t0 after Montgomery multiplication means that we 4484 // should subtract multiples of n from our result in m. We'll 4485 // keep doing that until there is no carry. 
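// In C, approximately, one pass of that correction (a sketch; m and n stand
// for the word arrays at Pm_base and Pn_base):
//   unsigned long borrow = 0;
//   for (int i = 0; i < len; i++) {
//     unsigned __int128 d = (unsigned __int128)m[i] - n[i] - borrow;
//     m[i] = (unsigned long)d;
//     borrow = (unsigned long)(d >> 64) & 1;  // 1 iff the subtract borrowed
//   }
//   t0 -= borrow;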
4486 void normalize(RegisterOrConstant len) { 4487 block_comment("normalize"); 4488 // while (t0) 4489 // t0 = sub(Pm_base, Pn_base, t0, len); 4490 Label loop, post, again; 4491 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 4492 cbz(t0, post); { 4493 bind(again); { 4494 mov(i, zr); 4495 mov(cnt, len); 4496 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 4497 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4498 subs(zr, zr, zr); // set carry flag, i.e. no borrow 4499 align(16); 4500 bind(loop); { 4501 sbcs(Rm, Rm, Rn); 4502 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 4503 add(i, i, 1); 4504 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 4505 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4506 sub(cnt, cnt, 1); 4507 } cbnz(cnt, loop); 4508 sbc(t0, t0, zr); 4509 } cbnz(t0, again); 4510 } bind(post); 4511 } 4512 4513 // Move memory at s to d, reversing words. 4514 // Increments d to end of copied memory 4515 // Destroys tmp1, tmp2 4516 // Preserves len 4517 // Leaves s pointing to the address which was in d at start 4518 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 4519 assert(tmp1 < r19 && tmp2 < r19, "register corruption"); 4520 4521 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 4522 mov(tmp1, len); 4523 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 4524 sub(s, d, len, ext::uxtw, LogBytesPerWord); 4525 } 4526 // where 4527 void reverse1(Register d, Register s, Register tmp) { 4528 ldr(tmp, pre(s, -wordSize)); 4529 ror(tmp, tmp, 32); 4530 str(tmp, post(d, wordSize)); 4531 } 4532 4533 void step_squaring() { 4534 // An extra ACC 4535 step(); 4536 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4537 } 4538 4539 void last_squaring(RegisterOrConstant i) { 4540 Label dont; 4541 // if ((i & 1) == 0) { 4542 tbnz(i.as_register(), 0, dont); { 4543 // MACC(Ra, Rb, t0, t1, t2); 4544 // Ra = *++Pa; 4545 // Rb = *--Pb; 4546 umulh(Rhi_ab, Ra, Rb); 4547 mul(Rlo_ab, Ra, Rb); 4548 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4549 } bind(dont); 4550 } 4551 4552 void extra_step_squaring() { 4553 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 4554 4555 // MACC(Rm, Rn, t0, t1, t2); 4556 // Rm = *++Pm; 4557 // Rn = *--Pn; 4558 umulh(Rhi_mn, Rm, Rn); 4559 mul(Rlo_mn, Rm, Rn); 4560 ldr(Rm, pre(Pm, wordSize)); 4561 ldr(Rn, pre(Pn, -wordSize)); 4562 } 4563 4564 void post1_squaring() { 4565 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 4566 4567 // *Pm = Rm = t0 * inv; 4568 mul(Rm, t0, inv); 4569 str(Rm, Address(Pm)); 4570 4571 // MACC(Rm, Rn, t0, t1, t2); 4572 // t0 = t1; t1 = t2; t2 = 0; 4573 umulh(Rhi_mn, Rm, Rn); 4574 4575 #ifndef PRODUCT 4576 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 4577 { 4578 mul(Rlo_mn, Rm, Rn); 4579 add(Rlo_mn, t0, Rlo_mn); 4580 Label ok; 4581 cbz(Rlo_mn, ok); { 4582 stop("broken Montgomery multiply"); 4583 } bind(ok); 4584 } 4585 #endif 4586 // We have very carefully set things up so that 4587 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 4588 // the lower half of Rm * Rn because we know the result already: 4589 // it must be -t0. t0 + (-t0) must generate a carry iff 4590 // t0 != 0. So, rather than do a mul and an adds we just set 4591 // the carry flag iff t0 is nonzero. 
4592 // 4593 // mul(Rlo_mn, Rm, Rn); 4594 // adds(zr, t0, Rlo_mn); 4595 subs(zr, t0, 1); // Set carry iff t0 is nonzero 4596 adcs(t0, t1, Rhi_mn); 4597 adc(t1, t2, zr); 4598 mov(t2, zr); 4599 } 4600 4601 void acc(Register Rhi, Register Rlo, 4602 Register t0, Register t1, Register t2) { 4603 adds(t0, t0, Rlo); 4604 adcs(t1, t1, Rhi); 4605 adc(t2, t2, zr); 4606 } 4607 4608 public: 4609 /** 4610 * Fast Montgomery multiplication. The derivation of the 4611 * algorithm is in A Cryptographic Library for the Motorola 4612 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 4613 * 4614 * Arguments: 4615 * 4616 * Inputs for multiplication: 4617 * c_rarg0 - int array elements a 4618 * c_rarg1 - int array elements b 4619 * c_rarg2 - int array elements n (the modulus) 4620 * c_rarg3 - int length 4621 * c_rarg4 - int inv 4622 * c_rarg5 - int array elements m (the result) 4623 * 4624 * Inputs for squaring: 4625 * c_rarg0 - int array elements a 4626 * c_rarg1 - int array elements n (the modulus) 4627 * c_rarg2 - int length 4628 * c_rarg3 - int inv 4629 * c_rarg4 - int array elements m (the result) 4630 * 4631 */ 4632 address generate_multiply() { 4633 Label argh, nothing; 4634 bind(argh); 4635 stop("MontgomeryMultiply total_allocation must be <= 8192"); 4636 4637 align(CodeEntryAlignment); 4638 address entry = pc(); 4639 4640 cbzw(Rlen, nothing); 4641 4642 enter(); 4643 4644 // Make room. 4645 cmpw(Rlen, 512); 4646 br(Assembler::HI, argh); 4647 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 4648 andr(sp, Ra, -2 * wordSize); 4649 4650 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 4651 4652 { 4653 // Copy input args, reversing as we go. We use Ra as a 4654 // temporary variable. 4655 reverse(Ra, Pa_base, Rlen, t0, t1); 4656 if (!_squaring) 4657 reverse(Ra, Pb_base, Rlen, t0, t1); 4658 reverse(Ra, Pn_base, Rlen, t0, t1); 4659 } 4660 4661 // Push all call-saved registers and also Pm_base which we'll need 4662 // at the end. 
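// (Note: just below, Pm_base is repointed at scratch space on the stack
// where the result digits accumulate; the caller's Pm_base is recovered by
// restore_regs() and the result is copied back by the final reverse().)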
4663 save_regs(); 4664 4665 #ifndef PRODUCT 4666 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 4667 { 4668 ldr(Rn, Address(Pn_base, 0)); 4669 mul(Rlo_mn, Rn, inv); 4670 cmp(Rlo_mn, -1); 4671 Label ok; 4672 br(EQ, ok); { 4673 stop("broken inverse in Montgomery multiply"); 4674 } bind(ok); 4675 } 4676 #endif 4677 4678 mov(Pm_base, Ra); 4679 4680 mov(t0, zr); 4681 mov(t1, zr); 4682 mov(t2, zr); 4683 4684 block_comment("for (int i = 0; i < len; i++) {"); 4685 mov(Ri, zr); { 4686 Label loop, end; 4687 cmpw(Ri, Rlen); 4688 br(Assembler::GE, end); 4689 4690 bind(loop); 4691 pre1(Ri); 4692 4693 block_comment(" for (j = i; j; j--) {"); { 4694 movw(Rj, Ri); 4695 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 4696 } block_comment(" } // j"); 4697 4698 post1(); 4699 addw(Ri, Ri, 1); 4700 cmpw(Ri, Rlen); 4701 br(Assembler::LT, loop); 4702 bind(end); 4703 block_comment("} // i"); 4704 } 4705 4706 block_comment("for (int i = len; i < 2*len; i++) {"); 4707 mov(Ri, Rlen); { 4708 Label loop, end; 4709 cmpw(Ri, Rlen, Assembler::LSL, 1); 4710 br(Assembler::GE, end); 4711 4712 bind(loop); 4713 pre2(Ri, Rlen); 4714 4715 block_comment(" for (j = len*2-i-1; j; j--) {"); { 4716 lslw(Rj, Rlen, 1); 4717 subw(Rj, Rj, Ri); 4718 subw(Rj, Rj, 1); 4719 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 4720 } block_comment(" } // j"); 4721 4722 post2(Ri, Rlen); 4723 addw(Ri, Ri, 1); 4724 cmpw(Ri, Rlen, Assembler::LSL, 1); 4725 br(Assembler::LT, loop); 4726 bind(end); 4727 } 4728 block_comment("} // i"); 4729 4730 normalize(Rlen); 4731 4732 mov(Ra, Pm_base); // Save Pm_base in Ra 4733 restore_regs(); // Restore caller's Pm_base 4734 4735 // Copy our result into caller's Pm_base 4736 reverse(Pm_base, Ra, Rlen, t0, t1); 4737 4738 leave(); 4739 bind(nothing); 4740 ret(lr); 4741 4742 return entry; 4743 } 4744 // In C, approximately: 4745 4746 // void 4747 // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[], 4748 // unsigned long Pn_base[], unsigned long Pm_base[], 4749 // unsigned long inv, int len) { 4750 // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 4751 // unsigned long *Pa, *Pb, *Pn, *Pm; 4752 // unsigned long Ra, Rb, Rn, Rm; 4753 4754 // int i; 4755 4756 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 4757 4758 // for (i = 0; i < len; i++) { 4759 // int j; 4760 4761 // Pa = Pa_base; 4762 // Pb = Pb_base + i; 4763 // Pm = Pm_base; 4764 // Pn = Pn_base + i; 4765 4766 // Ra = *Pa; 4767 // Rb = *Pb; 4768 // Rm = *Pm; 4769 // Rn = *Pn; 4770 4771 // int iters = i; 4772 // for (j = 0; iters--; j++) { 4773 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 4774 // MACC(Ra, Rb, t0, t1, t2); 4775 // Ra = *++Pa; 4776 // Rb = *--Pb; 4777 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 4778 // MACC(Rm, Rn, t0, t1, t2); 4779 // Rm = *++Pm; 4780 // Rn = *--Pn; 4781 // } 4782 4783 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 4784 // MACC(Ra, Rb, t0, t1, t2); 4785 // *Pm = Rm = t0 * inv; 4786 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 4787 // MACC(Rm, Rn, t0, t1, t2); 4788 4789 // assert(t0 == 0, "broken Montgomery multiply"); 4790 4791 // t0 = t1; t1 = t2; t2 = 0; 4792 // } 4793 4794 // for (i = len; i < 2*len; i++) { 4795 // int j; 4796 4797 // Pa = Pa_base + i-len; 4798 // Pb = Pb_base + len; 4799 // Pm = Pm_base + i-len; 4800 // Pn = Pn_base + len; 4801 4802 // Ra = *++Pa; 4803 // Rb = *--Pb; 4804 // Rm = *++Pm; 4805 // Rn = *--Pn; 4806 4807 // int iters = len*2-i-1; 4808 // 
for (j = i-len+1; iters--; j++) { 4809 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 4810 // MACC(Ra, Rb, t0, t1, t2); 4811 // Ra = *++Pa; 4812 // Rb = *--Pb; 4813 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 4814 // MACC(Rm, Rn, t0, t1, t2); 4815 // Rm = *++Pm; 4816 // Rn = *--Pn; 4817 // } 4818 4819 // Pm_base[i-len] = t0; 4820 // t0 = t1; t1 = t2; t2 = 0; 4821 // } 4822 4823 // while (t0) 4824 // t0 = sub(Pm_base, Pn_base, t0, len); 4825 // } 4826 4827 /** 4828 * Fast Montgomery squaring. This uses asymptotically 25% fewer 4829 * multiplies than Montgomery multiplication so it should be up to 4830 * 25% faster. However, its loop control is more complex and it 4831 * may actually run slower on some machines. 4832 * 4833 * Arguments: 4834 * 4835 * Inputs: 4836 * c_rarg0 - int array elements a 4837 * c_rarg1 - int array elements n (the modulus) 4838 * c_rarg2 - int length 4839 * c_rarg3 - int inv 4840 * c_rarg4 - int array elements m (the result) 4841 * 4842 */ 4843 address generate_square() { 4844 Label argh; 4845 bind(argh); 4846 stop("MontgomeryMultiply total_allocation must be <= 8192"); 4847 4848 align(CodeEntryAlignment); 4849 address entry = pc(); 4850 4851 enter(); 4852 4853 // Make room. 4854 cmpw(Rlen, 512); 4855 br(Assembler::HI, argh); 4856 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 4857 andr(sp, Ra, -2 * wordSize); 4858 4859 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 4860 4861 { 4862 // Copy input args, reversing as we go. We use Ra as a 4863 // temporary variable. 4864 reverse(Ra, Pa_base, Rlen, t0, t1); 4865 reverse(Ra, Pn_base, Rlen, t0, t1); 4866 } 4867 4868 // Push all call-saved registers and also Pm_base which we'll need 4869 // at the end. 4870 save_regs(); 4871 4872 mov(Pm_base, Ra); 4873 4874 mov(t0, zr); 4875 mov(t1, zr); 4876 mov(t2, zr); 4877 4878 block_comment("for (int i = 0; i < len; i++) {"); 4879 mov(Ri, zr); { 4880 Label loop, end; 4881 bind(loop); 4882 cmp(Ri, Rlen); 4883 br(Assembler::GE, end); 4884 4885 pre1(Ri); 4886 4887 block_comment("for (j = (i+1)/2; j; j--) {"); { 4888 add(Rj, Ri, 1); 4889 lsr(Rj, Rj, 1); 4890 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 4891 } block_comment(" } // j"); 4892 4893 last_squaring(Ri); 4894 4895 block_comment(" for (j = i/2; j; j--) {"); { 4896 lsr(Rj, Ri, 1); 4897 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 4898 } block_comment(" } // j"); 4899 4900 post1_squaring(); 4901 add(Ri, Ri, 1); 4902 cmp(Ri, Rlen); 4903 br(Assembler::LT, loop); 4904 4905 bind(end); 4906 block_comment("} // i"); 4907 } 4908 4909 block_comment("for (int i = len; i < 2*len; i++) {"); 4910 mov(Ri, Rlen); { 4911 Label loop, end; 4912 bind(loop); 4913 cmp(Ri, Rlen, Assembler::LSL, 1); 4914 br(Assembler::GE, end); 4915 4916 pre2(Ri, Rlen); 4917 4918 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 4919 lsl(Rj, Rlen, 1); 4920 sub(Rj, Rj, Ri); 4921 sub(Rj, Rj, 1); 4922 lsr(Rj, Rj, 1); 4923 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 4924 } block_comment(" } // j"); 4925 4926 last_squaring(Ri); 4927 4928 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 4929 lsl(Rj, Rlen, 1); 4930 sub(Rj, Rj, Ri); 4931 lsr(Rj, Rj, 1); 4932 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 4933 } block_comment(" } // j"); 4934 4935 post2(Ri, Rlen); 4936 add(Ri, Ri, 1); 4937 cmp(Ri, Rlen, Assembler::LSL, 1); 4938 4939 br(Assembler::LT, loop); 4940 bind(end); 4941 block_comment("} // i"); 4942 } 4943 4944 normalize(Rlen); 4945 4946 mov(Ra, 
Pm_base); // Save Pm_base in Ra
4947 restore_regs(); // Restore caller's Pm_base
4948
4949 // Copy our result into caller's Pm_base
4950 reverse(Pm_base, Ra, Rlen, t0, t1);
4951
4952 leave();
4953 ret(lr);
4954
4955 return entry;
4956 }
4957 // In C, approximately:
4958
4959 // void
4960 // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
4961 // unsigned long Pm_base[], unsigned long inv, int len) {
4962 // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4963 // unsigned long *Pa, *Pb, *Pn, *Pm;
4964 // unsigned long Ra, Rb, Rn, Rm;
4965
4966 // int i;
4967
4968 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
4969
4970 // for (i = 0; i < len; i++) {
4971 // int j;
4972
4973 // Pa = Pa_base;
4974 // Pb = Pa_base + i;
4975 // Pm = Pm_base;
4976 // Pn = Pn_base + i;
4977
4978 // Ra = *Pa;
4979 // Rb = *Pb;
4980 // Rm = *Pm;
4981 // Rn = *Pn;
4982
4983 // int iters = (i+1)/2;
4984 // for (j = 0; iters--; j++) {
4985 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4986 // MACC2(Ra, Rb, t0, t1, t2);
4987 // Ra = *++Pa;
4988 // Rb = *--Pb;
4989 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4990 // MACC(Rm, Rn, t0, t1, t2);
4991 // Rm = *++Pm;
4992 // Rn = *--Pn;
4993 // }
4994 // if ((i & 1) == 0) {
4995 // assert(Ra == Pa_base[j], "must be");
4996 // MACC(Ra, Ra, t0, t1, t2);
4997 // }
4998 // iters = i/2;
4999 // assert(iters == i-j, "must be");
5000 // for (; iters--; j++) {
5001 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5002 // MACC(Rm, Rn, t0, t1, t2);
5003 // Rm = *++Pm;
5004 // Rn = *--Pn;
5005 // }
5006
5007 // *Pm = Rm = t0 * inv;
5008 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5009 // MACC(Rm, Rn, t0, t1, t2);
5010
5011 // assert(t0 == 0, "broken Montgomery multiply");
5012
5013 // t0 = t1; t1 = t2; t2 = 0;
5014 // }
5015
5016 // for (i = len; i < 2*len; i++) {
5017 // int start = i-len+1;
5018 // int end = start + (len - start)/2;
5019 // int j;
5020
5021 // Pa = Pa_base + i-len;
5022 // Pb = Pa_base + len;
5023 // Pm = Pm_base + i-len;
5024 // Pn = Pn_base + len;
5025
5026 // Ra = *++Pa;
5027 // Rb = *--Pb;
5028 // Rm = *++Pm;
5029 // Rn = *--Pn;
5030
5031 // int iters = (2*len-i-1)/2;
5032 // assert(iters == end-start, "must be");
5033 // for (j = start; iters--; j++) {
5034 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5035 // MACC2(Ra, Rb, t0, t1, t2);
5036 // Ra = *++Pa;
5037 // Rb = *--Pb;
5038 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5039 // MACC(Rm, Rn, t0, t1, t2);
5040 // Rm = *++Pm;
5041 // Rn = *--Pn;
5042 // }
5043 // if ((i & 1) == 0) {
5044 // assert(Ra == Pa_base[j], "must be");
5045 // MACC(Ra, Ra, t0, t1, t2);
5046 // }
5047 // iters = (2*len-i)/2;
5048 // assert(iters == len-j, "must be");
5049 // for (; iters--; j++) {
5050 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5051 // MACC(Rm, Rn, t0, t1, t2);
5052 // Rm = *++Pm;
5053 // Rn = *--Pn;
5054 // }
5055 // Pm_base[i-len] = t0;
5056 // t0 = t1; t1 = t2; t2 = 0;
5057 // }
5058
5059 // while (t0)
5060 // t0 = sub(Pm_base, Pn_base, t0, len);
5061 // }
5062 };
5063
5064
5065 // Initialization
5066 void generate_initial() {
5067 // Generates initial stubs and initializes the entry points
5068
5069 // entry points that exist on all platforms. Note: this is code
5070 // that could be shared among different platforms - however the
5071 // benefit seems to be smaller than the disadvantage of having a
5072 // much more complicated generator structure. See also comment in
5073 // stubRoutines.hpp.
5074
5075 StubRoutines::_forward_exception_entry = generate_forward_exception();
5076
5077 StubRoutines::_call_stub_entry =
5078 generate_call_stub(StubRoutines::_call_stub_return_address);
5079
5080 // is referenced by megamorphic call
5081 StubRoutines::_catch_exception_entry = generate_catch_exception();
5082
5083 // Build this early so it's available for the interpreter.
5084 StubRoutines::_throw_StackOverflowError_entry =
5085 generate_throw_exception("StackOverflowError throw_exception",
5086 CAST_FROM_FN_PTR(address,
5087 SharedRuntime::throw_StackOverflowError));
5088 StubRoutines::_throw_delayed_StackOverflowError_entry =
5089 generate_throw_exception("delayed StackOverflowError throw_exception",
5090 CAST_FROM_FN_PTR(address,
5091 SharedRuntime::throw_delayed_StackOverflowError));
5092 if (UseCRC32Intrinsics) {
5093 // set the table address before generating the stubs that use it
5094 StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
5095 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
5096 }
5097
5098 if (UseCRC32CIntrinsics) {
5099 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
5100 }
5101 }
5102
5103 void generate_all() {
5104 // support for verify_oop (must happen after universe_init)
5105 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
5106 StubRoutines::_throw_AbstractMethodError_entry =
5107 generate_throw_exception("AbstractMethodError throw_exception",
5108 CAST_FROM_FN_PTR(address,
5109 SharedRuntime::
5110 throw_AbstractMethodError));
5111
5112 StubRoutines::_throw_IncompatibleClassChangeError_entry =
5113 generate_throw_exception("IncompatibleClassChangeError throw_exception",
5114 CAST_FROM_FN_PTR(address,
5115 SharedRuntime::
5116 throw_IncompatibleClassChangeError));
5117
5118 StubRoutines::_throw_NullPointerException_at_call_entry =
5119 generate_throw_exception("NullPointerException at call throw_exception",
5120 CAST_FROM_FN_PTR(address,
5121 SharedRuntime::
5122 throw_NullPointerException_at_call));
5123
5124 // arraycopy stubs used by compilers
5125 generate_arraycopy_stubs();
5126
5127 // has negatives stub for large arrays.
5128 StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);
5129
5130 // array equals stub for large arrays.
5131 if (!UseSimpleArrayEquals) {
5132 StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
5133 }
5134
5135 if (UseMultiplyToLenIntrinsic) {
5136 StubRoutines::_multiplyToLen = generate_multiplyToLen();
5137 }
5138
5139 if (UseSquareToLenIntrinsic) {
5140 StubRoutines::_squareToLen = generate_squareToLen();
5141 }
5142
5143 if (UseMulAddIntrinsic) {
5144 StubRoutines::_mulAdd = generate_mulAdd();
5145 }
5146
5147 if (UseMontgomeryMultiplyIntrinsic) {
5148 StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
5149 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
5150 StubRoutines::_montgomeryMultiply = g.generate_multiply();
5151 }
5152
5153 if (UseMontgomerySquareIntrinsic) {
5154 StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
5155 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
5156 // We use generate_multiply() rather than generate_square()
5157 // because it's faster for the sizes of modulus we care about.
5158 StubRoutines::_montgomerySquare = g.generate_multiply(); 5159 } 5160 5161 if (UseShenandoahGC && (ShenandoahWriteBarrier || ShenandoahStoreValWriteBarrier)) { 5162 StubRoutines::aarch64::_shenandoah_wb = generate_shenandoah_wb(false, true); 5163 StubRoutines::_shenandoah_wb_C = generate_shenandoah_wb(true, !ShenandoahWriteBarrierCsetTestInIR); 5164 } 5165 5166 #ifndef BUILTIN_SIM 5167 // generate GHASH intrinsics code 5168 if (UseGHASHIntrinsics) { 5169 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); 5170 } 5171 5172 if (UseAESIntrinsics) { 5173 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 5174 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 5175 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 5176 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 5177 } 5178 5179 if (UseSHA1Intrinsics) { 5180 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress"); 5181 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB"); 5182 } 5183 if (UseSHA256Intrinsics) { 5184 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress"); 5185 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB"); 5186 } 5187 5188 // generate Adler32 intrinsics code 5189 if (UseAdler32Intrinsics) { 5190 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); 5191 } 5192 5193 // Safefetch stubs. 5194 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry, 5195 &StubRoutines::_safefetch32_fault_pc, 5196 &StubRoutines::_safefetch32_continuation_pc); 5197 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry, 5198 &StubRoutines::_safefetchN_fault_pc, 5199 &StubRoutines::_safefetchN_continuation_pc); 5200 #endif 5201 StubRoutines::aarch64::set_completed(); 5202 } 5203 5204 public: 5205 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) { 5206 if (all) { 5207 generate_all(); 5208 } else { 5209 generate_initial(); 5210 } 5211 } 5212 }; // end class declaration 5213 5214 void StubGenerator_generate(CodeBuffer* code, bool all) { 5215 StubGenerator g(code, all); 5216 }