/*
 * Copyright (c) 2003, 2018, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "utilities/align.hpp"
#include "utilities/macros.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_SHENANDOAHGC
#include "gc/shenandoah/brooksPointer.hpp"
#include "gc/shenandoah/shenandoahHeap.hpp"
#include "gc/shenandoah/shenandoahHeapRegion.hpp"
#include "gc/shenandoah/shenandoahRuntime.hpp"
#endif

#ifdef BUILTIN_SIM
#include "../../../../../../simulator/simulator.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->
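// TIMES_OOP (below) builds the scaled-index addressing mode used for
// heap-oop arrays: a sign-extended word index scaled by the oop size,
// which is 4 bytes with compressed oops and 8 bytes otherwise.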
#define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(int& counter) {
    __ lea(rscratch2, ExternalAddress((address)&counter));
    __ ldrw(rscratch1, Address(rscratch2));
    __ addw(rscratch1, rscratch1, 1);
    __ strw(rscratch1, Address(rscratch2));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    c_rarg6:   parameter size (in words)              int
  //    c_rarg7:   thread                                 Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the c arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -27 [ argument word 1      ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper    (r0) ]
  //  -7 [ result          (r1) ]
  //  -6 [ result type     (r2) ]
  //  -5 [ method          (r3) ]
  //  -4 [ entry point     (r4) ]
  //  -3 [ parameters      (r5) ]
  //  -2 [ parameter size  (r6) ]
  //  -1 [ thread          (r7) ]
  //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  //   1 [ saved lr       (r30) ]

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off = -26,

    d15_off            = -26,
    d13_off            = -24,
    d11_off            = -22,
    d9_off             = -20,

    r28_off            = -18,
    r26_off            = -16,
    r24_off            = -14,
    r22_off            = -12,
    r20_off            = -10,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0,
    retaddr_off        =   1,
  };

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    const Address sp_after_call(rfp, sp_after_call_off * wordSize);

    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);

    // stub code

    // we need a C prolog to bootstrap the x86 caller into the sim
    __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);

    address aarch64_entry = __ pc();

#ifdef BUILTIN_SIM
    // Save sender's SP for stack traces.
    __ mov(rscratch1, sp);
    __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
#endif
    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);

    // save register parameters and Java scratch/global registers
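    // note: the stores below use stp/stpd pairs, which is why only the
    // even offsets appear as named slots in call_stub_layout above:
    // each paired store fills the named slot and the word above it.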
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5,  entry_point);
    __ stp(c_rarg2, c_rarg3,  result_type);
    __ stp(c_rarg0, c_rarg1,  call_wrapper);

    __ stp(r20, r19,   r20_save);
    __ stp(r22, r21,   r22_save);
    __ stp(r24, r23,   r24_save);
    __ stp(r26, r25,   r26_save);
    __ stp(r28, r27,   r28_save);

    __ stpd(v9,  v8,   d9_save);
    __ stpd(v11, v10,  d11_save);
    __ stpd(v13, v12,  d13_save);
    __ stpd(v15, v14,  d15_save);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (unsigned)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing Method*, and current sp
    //      rmethod: Method*
    //      r13: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r13, sp);
    __ blr(c_rarg4);

    // tell the simulator we have returned to the stub

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    if (NotifySimulator) {
      __ notify(Assembler::method_reentry);
    }
    // save current address for use by exception handling code

    return_address = __ pc();
    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    __ ldr(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ldr(j_rarg1, result_type);
    __ cmp(j_rarg1, T_OBJECT);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(j_rarg1, T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    // restore callee-save registers
    __ ldpd(v15, v14,   d15_save);
    __ ldpd(v13, v12,   d13_save);
    __ ldpd(v11, v10,   d11_save);
    __ ldpd(v9,  v8,    d9_save);

    __ ldp(r28, r27,   r28_save);
    __ ldp(r26, r25,   r26_save);
    __ ldp(r24, r23,   r24_save);
    __ ldp(r22, r21,   r22_save);
    __ ldp(r20, r19,   r20_save);

    __ ldp(c_rarg0, c_rarg1,  call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3,  method);
    __ ldp(c_rarg4, c_rarg5,  entry_point);
    __ ldp(c_rarg6, c_rarg7,  parameter_size);

#ifndef PRODUCT
    // tell the simulator we are about to end Java execution
    if (NotifySimulator) {
      __ notify(Assembler::method_exit);
    }
#endif
    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT

    __ BIND(is_long);
    __ str(r0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // rsp.
  //
  // r0: exception oop

  // NOTE: this is used as a target from the signal handler so it
  // needs an x86 prolog which returns into the current simulator
  // executing the generated catch_exception code. so the prolog
  // needs to install rax in a sim register and adjust the sim's
  // restart pc to enter the generated code at the start position
  // then return from native to simulated execution.
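  // In outline, the generated code behaves like this rough C sketch
  // (field names abbreviated; the real code uses the offsets below):
  //
  //   thread->_pending_exception = exception_oop;   // arrives in r0
  //   thread->_exception_file    = __FILE__;
  //   thread->_exception_line    = __LINE__;
  //   goto *StubRoutines::_call_stub_return_address;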
  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread        (rfp, thread_off         * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);
    __ mov(r19, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }

#if INCLUDE_SHENANDOAHGC
  // Shenandoah write barrier.
  //
  // Input:
  //   r0: OOP to evacuate.  Not null.
  //
  // Output:
  //   r0: Pointer to evacuated OOP.
  //
  // Trash rscratch1, rscratch2.  Preserve everything else.

  address generate_shenandoah_wb(bool c_abi, bool do_cset_test) {
    StubCodeMark mark(this, "StubRoutines", "shenandoah_wb");

    __ align(6);
    address start = __ pc();

    if (do_cset_test) {
      Label work;
      __ mov(rscratch2, ShenandoahHeap::in_cset_fast_test_addr());
      __ lsr(rscratch1, r0, ShenandoahHeapRegion::region_size_bytes_shift_jint());
      __ ldrb(rscratch2, Address(rscratch2, rscratch1));
      __ tbnz(rscratch2, 0, work);
      __ ret(lr);
      __ bind(work);
    }

    Register obj = r0;

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    if (!c_abi) {
      __ push_call_clobbered_registers();
    } else {
      __ push_call_clobbered_fp_registers();
    }

    __ mov(lr, CAST_FROM_FN_PTR(address, ShenandoahRuntime::write_barrier_JRT));
    __ blrt(lr, 1, 0, MacroAssembler::ret_type_integral);
    if (!c_abi) {
      __ mov(rscratch1, obj);
      __ pop_call_clobbered_registers();
      __ mov(obj, rscratch1);
    } else {
      __ pop_call_clobbered_fp_registers();
    }

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }
#endif

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {

    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is NULL it is OK

    // Check if the oop is in the right area of memory
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
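    // i.e. check that (obj & verify_oop_mask()) == verify_oop_bits()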
    __ andr(c_rarg2, r0, c_rarg3);
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());

    // Compare c_rarg2 and c_rarg3.  We don't use a compare
    // instruction here because the flags register is live.
    __ eor(c_rarg2, c_rarg2, c_rarg3);
    __ cbnz(c_rarg2, error);

    // make sure klass is 'reasonable', which is not zero.
    __ load_klass(r0, r0); // get klass
    __ cbz(r0, error);     // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blrt(rscratch1, 3, 0, 1);

    return start;
  }

  void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }

  // The inner part of zero_words().  This is the bulk operation,
  // zeroing words in blocks, possibly using DC ZVA to do it.  The
  // caller is responsible for zeroing the last few words.
  //
  // Inputs:
  // r10: the HeapWord-aligned base address of an array to zero.
  // r11: the count in HeapWords, r11 > 0.
  //
  // Returns r10 and r11, adjusted for the caller to clear.
  // r10: the base address of the tail of words left to clear.
  // r11: the number of words in the tail.
  //      r11 < MacroAssembler::zero_words_block_size.

  address generate_zero_blocks() {
    Label store_pair, loop_store_pair, done;
    Label base_aligned;

    Register base = r10, cnt = r11;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "zero_blocks");
    address start = __ pc();

    if (UseBlockZeroing) {
      int zva_length = VM_Version::zva_length();

      // Ensure ZVA length can be divided by 16. This is required by
      // the subsequent operations.
      assert (zva_length % 16 == 0, "Unexpected ZVA Length");

      __ tbz(base, 3, base_aligned);
      __ str(zr, Address(__ post(base, 8)));
      __ sub(cnt, cnt, 1);
      __ bind(base_aligned);

      // Ensure count >= zva_length * 2 so that it still deserves a zva after
      // alignment.
      Label small;
      int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
      __ subs(rscratch1, cnt, low_limit >> 3);
      __ br(Assembler::LT, small);
      __ zero_dcache_blocks(base, cnt);
      __ bind(small);
    }

    {
      // Number of stp instructions we'll unroll
      const int unroll =
        MacroAssembler::zero_words_block_size / 2;
      // Clear the remaining blocks.
      Label loop;
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::LT, done);
      __ bind(loop);
      for (int i = 0; i < unroll; i++)
        __ stp(zr, zr, __ post(base, 16));
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::GE, loop);
      __ bind(done);
      __ add(cnt, cnt, unroll * 2);
    }

    __ ret(lr);

    return start;
  }


  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;
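  // The block-copy helper below is software pipelined: the loads for
  // one 64-byte block are issued in the same iteration as the stores
  // for the previous block.  A rough C-level sketch of the non-SIMD
  // forward case (register names illustrative only):
  //
  //   load t0..t7 from s; s += 64;          // prime the pipeline
  //   count -= 16;
  //   while (count >= 0) {                  // i.e. >= 8 words remain
  //     store t0..t7 to d; d += 64;
  //     load  t0..t7 from s; s += 64;
  //     count -= 8;
  //   }
  //   store t0..t7 to d;                    // drain the pipeline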
  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(Label &start, Register s, Register d, Register count,
                           copy_direction direction) {
    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;

    int offset;
    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r10, t6 = r11, t7 = r12;
    const Register stride = r13;

    assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1);

    Label again, drain;
    const char *stub_name;
    if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
    else
      stub_name = "backward_copy_longs";
    StubCodeMark mark(this, "StubRoutines", stub_name);
    __ align(CodeEntryAlignment);
    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, 8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }

    __ bind(again);
    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
        __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
      } else {
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        __ stp(t0, t1, Address(d, 2 * unit));
        __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.
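      // Concretely, for a forwards copy (unit == wordSize == 8 bytes):
      // after the d -= 8 adjustment below, str t0, [d, #8] writes the
      // first destination word, stp t1, t2, [d, #16] the next pair,
      // and the final str t7, [d, #64]! writes the last word of the
      // 64-byte block while advancing d by 64 for the next iteration.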
      if (direction == copy_forwards) {
        __ sub(s, s, 16);
        __ sub(d, d, 8);
      }

      // Fill 8 registers
      //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 bit block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
      //
      // t0 at offset 0,  t1 at offset 8
      // t2 at offset 16, t3 at offset 24
      // t4 at offset 32, t5 at offset 40
      // t6 at offset 48, t7 at offset 56

      // for backwards copy s was not offset so the register contents
      // are at these offsets into the preceding 64 byte block
      // relative to that original input and so on for each successive
      // preceding 64 byte block when s is updated. this explains the
      // slightly counter-intuitive looking pattern of register usage
      // in the stp instructions for backwards copy.
      //
      // t0 at offset -16, t1 at offset -8
      // t2 at offset -32, t3 at offset -24
      // t4 at offset -48, t5 at offset -40
      // t6 at offset -64, t7 at offset -56

      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));

      __ subs(count, count, 16);
      __ br(Assembler::LO, drain);

      int prefetch = PrefetchCopyIntervalInBytes;
      bool use_stride = false;
      if (direction == copy_backwards) {
        use_stride = prefetch > 256;
        prefetch = -prefetch;
        if (use_stride) __ mov(stride, prefetch);
      }

      __ bind(again);

      if (PrefetchCopyIntervalInBytes > 0)
        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
        // allowing for the offset of -8 the store instructions place
        // registers into the target 64 bit block at the following
        // offsets
        //
        // t0 at offset 0
        // t1 at offset 8,  t2 at offset 16
        // t3 at offset 24, t4 at offset 32
        // t5 at offset 40, t6 at offset 48
        // t7 at offset 56

        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
        // d was not offset when we started so the registers are
        // written into the 64 bit block preceding d with the following
        // offsets
        //
        // t1 at offset -8
        // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
        // t7 at offset -56, t4 at offset -48
        // t6 at offset -64
        //
        // note that this matches the offsets previously noted for the
        // loads

        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
      } else {
        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
      }
      // now we need to copy any remaining part block which may
      // include a 4 word block subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ stp(t1, t2, Address(d, 2 * unit));
          __ str(t3, Address(__ pre(d, 4 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ stp(t3, t0, Address(d, 3 * unit));
          __ str(t2, Address(__ pre(d, 4 * unit)));
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ str(t1, Address(__ pre(d, 2 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ str(t0, Address(__ pre(d, 2 * unit)));
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;
    int unit = wordSize * direction;

    Label Lpair, Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.
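    // Roughly, for granularity == 1 the code below is equivalent to:
    //
    //   if (count & 8) { copy 8 bytes; }
    //   if (count & 4) { copy 4 bytes; }
    //   if (count & 2) { copy 2 bytes; }
    //   if (count & 1) { copy 1 byte;  }
    //
    // Each tbz tests the corresponding bit of count directly, so the
    // high bits of count need no masking beforehand.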
    __ tbz(count, 3 - exact_log2(granularity), Lword);
    __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
    __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }

  Label copy_f, copy_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //

  void copy_memory(bool is_aligned, Register s, Register d,
                   Register count, Register tmp, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    // <= 96 bytes do inline. Direction doesn't matter because we always
    // load all the data before writing anything
    Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
    const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
    const Register send = r17, dend = r18;

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
    __ cmp(count, (UseSIMDForMemoryOps ? 96:80)/granularity);
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, 16/granularity);
    __ br(Assembler::LS, copy16);

    __ cmp(count, 64/granularity);
    __ br(Assembler::HI, copy80);

    __ cmp(count, 32/granularity);
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 0));
      __ ldpq(v2, v3, Address(send, -32));
      __ stpq(v0, v1, Address(d, 0));
      __ stpq(v2, v3, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(send, -32));
      __ ldp(t6, t7, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(dend, -32));
      __ stp(t6, t7, Address(dend, -16));
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    __ ldp(t0, t1, Address(s, 0));
    __ ldp(t2, t3, Address(send, -16));
    __ stp(t0, t1, Address(d, 0));
    __ stp(t2, t3, Address(dend, -16));
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
      __ ldpq(v4, v5, Address(send, -32));
      __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
      __ stpq(v4, v5, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(s, 32));
      __ ldp(t6, t7, Address(s, 48));
      __ ldp(t8, t9, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(d, 32));
      __ stp(t6, t7, Address(d, 48));
      __ stp(t8, t9, Address(dend, -16));
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, 8/granularity);
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    __ ldr(t0, Address(s, 0));
    __ ldr(t1, Address(send, -8));
    __ str(t0, Address(d, 0));
    __ str(t1, Address(dend, -8));
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean in the 1 byte case we load/store the same
          // byte 3 times.
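          // e.g. count == 3 copies bytes 0, 2 and (3 >> 1) == 1, i.e.
          // all three; count == 2 copies bytes 0, 1 and 1; count == 1
          // copies byte 0 three times.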
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }

    __ bind(copy_big);
    if (is_backwards) {
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now we've got the small case out of the way we can align the
    // source address on a 2-word boundary.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
      __ tbz(s, exact_log2(wordSize), aligned);
      __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
      __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
      __ sub(count, count, wordSize/granularity);
    } else {
      if (is_backwards) {
        __ andr(rscratch2, s, 2 * wordSize - 1);
      } else {
        __ neg(rscratch2, s);
        __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
      }
      // rscratch2 is the byte adjustment needed to align s.
      __ cbz(rscratch2, aligned);
      int shift = exact_log2(granularity);
      if (shift)  __ lsr(rscratch2, rscratch2, shift);
      __ sub(count, count, rscratch2);

#if 0
      // ?? This code is only correct for a disjoint copy.  It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, rscratch2);
        __ sub(d, d, rscratch2);
      } else {
        __ add(s, s, rscratch2);
        __ add(d, d, rscratch2);
      }
#else
      copy_memory_small(s, d, rscratch2, rscratch1, step);
#endif
    }

    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes.  Adjust the
    // count and do a bulk copy of words.
    __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
    if (direction == copy_forwards)
      __ bl(copy_f);
    else
      __ bl(copy_b);

    // And the tail.
    copy_memory_small(s, d, count, tmp, step);

    if (granularity >= 8) __ bind(copy8);
    if (granularity >= 4) __ bind(copy4);
    __ bind(finish);
  }


  void clobber_registers() {
#ifdef ASSERT
    __ mov(rscratch1, (uint64_t)0xdeadbeef);
    __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
    for (Register r = r3; r <= r18; r++)
      if (r != rscratch1) __ mov(r, rscratch1);
#endif
  }
  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers rscratch1 and rscratch2.
  void verify_oop_array (size_t size, Register a, Register count, Register temp) {
    Label loop, end;
    __ mov(rscratch1, a);
    __ mov(rscratch2, zr);
    __ bind(loop);
    __ cmp(rscratch2, count);
    __ br(Assembler::HS, end);
    if (size == (size_t)wordSize) {
      __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ verify_oop(temp);
    } else {
      __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ decode_heap_oop(temp); // calls verify_oop
    }
    __ add(rscratch2, rscratch2, size);
    __ b(loop);
    __ bind(end);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
                                 const char *name, bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_reg = RegSet::of(s, d, count);
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    DecoratorSet decorators = IN_HEAP | IN_HEAP_ARRAY | ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= AS_DEST_NOT_INITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_reg);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    copy_memory(aligned, s, d, count, rscratch1, size);

    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
    }

    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());

    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }
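  // The conjoint stub below branches to the no-overlap entry with a
  // single unsigned compare: if (d - s), taken as an unsigned value,
  // is at least count * size then either the regions are disjoint or
  // d precedes s (the subtraction wraps around), and in both cases a
  // forward copy is safe.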
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
                                 address *entry, const char *name,
                                 bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_regs = RegSet::of(s, d, count);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // use fwd copy when (d-s) above_equal (count*size)
    __ sub(rscratch1, d, s);
    __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
    __ br(Assembler::HS, nooverlap_target);

    DecoratorSet decorators = IN_HEAP | IN_HEAP_ARRAY;
    if (dest_uninitialized) {
      decorators |= AS_DEST_NOT_INITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_regs);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    copy_memory(aligned, s, d, count, rscratch1, -size);
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
    }
    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
  //
  address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                      address* entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_short_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_short_copy().
  //
  address generate_disjoint_short_copy(bool aligned,
                                       address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
                                       address *entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
  }
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_int_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
                                     address *entry, const char *name,
                                     bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
  }


  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_long_copy(bool aligned, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_long_copy(bool aligned,
                                      address nooverlap_target, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_oop_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0 - source array address
  //   c_rarg1 - destination array address
  //   c_rarg2 - element count, treated as size_t, can be zero
  //
  address generate_conjoint_oop_copy(bool aligned,
                                     address nooverlap_target, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
                                  name, dest_uninitialized);
  }

  // Helper for generating a dynamic type check.
  // Smashes rscratch1.
  void generate_type_check(Register sub_klass,
                           Register super_check_offset,
                           Register super_klass,
                           Label& L_success) {
    assert_different_registers(sub_klass, super_check_offset, super_klass);

    BLOCK_COMMENT("type_check:");

    Label L_miss;

    __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL,
                                     super_check_offset);
    __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);

    // Fall through on failure!
    __ BIND(L_miss);
  }

  //
  // Generate checkcasting array copy stub
  //
  // Input:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //   c_rarg3   - size_t ckoff (super_check_offset)
  //   c_rarg4   - oop ckval (super_klass)
  //
  // Output:
  //   r0 ==  0  -  success
  //   r0 == -1^K - failure, where K is partial transfer count
  //
  address generate_checkcast_copy(const char *name, address *entry,
                                  bool dest_uninitialized = false) {

    Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;

    // Input registers (after setup_arg_regs)
    const Register from  = c_rarg0;   // source array address
    const Register to    = c_rarg1;   // destination array address
    const Register count = c_rarg2;   // elements count
    const Register ckoff = c_rarg3;   // super_check_offset
    const Register ckval = c_rarg4;   // super_klass

    RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
    RegSet wb_post_saved_regs = RegSet::of(count);

    // Registers used as temps (r18, r19, r20 are save-on-entry)
    const Register count_save = r21;  // orig elements count
    const Register start_to   = r20;  // destination array start address
    const Register copied_oop = r18;  // actual oop copied
    const Register r19_klass  = r19;  // oop._klass

    //---------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the two arrays are subtypes of Object[] but the
    // destination array type is not equal to or a supertype
    // of the source type.  Each element must be separately
    // checked.
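    //
    // A rough C sketch of the contract this stub implements (illustrative
    // only; the helper names here are hypothetical, not HotSpot APIs):
    //
    //   int checkcast_copy(oop* from, oop* to, size_t count,
    //                      int ckoff, Klass* ckval) {
    //     size_t copied = 0;
    //     for (; copied < count; copied++) {
    //       oop o = from[copied];
    //       if (o != NULL && !is_subtype(klass_of(o), ckoff, ckval))
    //         return ~(int)copied;      // -1^K, K = elements copied
    //       to[copied] = o;
    //     }
    //     return 0;                     // success
    //   }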
1808 1809 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1810 copied_oop, r19_klass, count_save); 1811 1812 __ align(CodeEntryAlignment); 1813 StubCodeMark mark(this, "StubRoutines", name); 1814 address start = __ pc(); 1815 1816 __ enter(); // required for proper stackwalking of RuntimeStub frame 1817 1818 #ifdef ASSERT 1819 // caller guarantees that the arrays really are different 1820 // otherwise, we would have to make conjoint checks 1821 { Label L; 1822 array_overlap_test(L, TIMES_OOP); 1823 __ stop("checkcast_copy within a single array"); 1824 __ bind(L); 1825 } 1826 #endif //ASSERT 1827 1828 // Caller of this entry point must set up the argument registers. 1829 if (entry != NULL) { 1830 *entry = __ pc(); 1831 BLOCK_COMMENT("Entry:"); 1832 } 1833 1834 // Empty array: Nothing to do. 1835 __ cbz(count, L_done); 1836 1837 __ push(RegSet::of(r18, r19, r20, r21), sp); 1838 1839 #ifdef ASSERT 1840 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1841 // The ckoff and ckval must be mutually consistent, 1842 // even though caller generates both. 1843 { Label L; 1844 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1845 __ ldrw(start_to, Address(ckval, sco_offset)); 1846 __ cmpw(ckoff, start_to); 1847 __ br(Assembler::EQ, L); 1848 __ stop("super_check_offset inconsistent"); 1849 __ bind(L); 1850 } 1851 #endif //ASSERT 1852 1853 DecoratorSet decorators = IN_HEAP | IN_HEAP_ARRAY | ARRAYCOPY_CHECKCAST; 1854 bool is_oop = true; 1855 if (dest_uninitialized) { 1856 decorators |= AS_DEST_NOT_INITIALIZED; 1857 } 1858 1859 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1860 bs->arraycopy_prologue(_masm, decorators, is_oop, to, count, wb_pre_saved_regs); 1861 1862 // save the original count 1863 __ mov(count_save, count); 1864 1865 // Copy from low to high addresses 1866 __ mov(start_to, to); // Save destination array start address 1867 __ b(L_load_element); 1868 1869 // ======== begin loop ======== 1870 // (Loop is rotated; its entry is L_load_element.) 1871 // Loop control: 1872 // for (; count != 0; count--) { 1873 // copied_oop = load_heap_oop(from++); 1874 // ... generate_type_check ...; 1875 // store_heap_oop(to++, copied_oop); 1876 // } 1877 __ align(OptoLoopAlignment); 1878 1879 __ BIND(L_store_element); 1880 __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW); // store the oop 1881 __ sub(count, count, 1); 1882 __ cbz(count, L_do_card_marks); 1883 1884 // ======== loop entry is here ======== 1885 __ BIND(L_load_element); 1886 __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop 1887 __ cbz(copied_oop, L_store_element); 1888 1889 __ load_klass(r19_klass, copied_oop);// query the object klass 1890 generate_type_check(r19_klass, ckoff, ckval, L_store_element); 1891 // ======== end loop ======== 1892 1893 // It was a real error; we must depend on the caller to finish the job. 1894 // Register count = remaining oops, count_orig = total oops. 1895 // Emit GC store barriers for the oops we have copied and report 1896 // their number to the caller. 
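    // Worked example of the encoding computed below: with count_save == 10
    // and 3 elements still uncopied, K = 10 - 3 = 7, and eon(count, count, zr)
    // produces ~7 == -8 == -1^7; the caller recovers K as ~r0.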
    __ subs(count, count_save, count);     // K = partially copied oop count
    __ eon(count, count, zr);              // report (-1^K) to caller
    __ br(Assembler::EQ, L_done_pop);

    __ BIND(L_do_card_marks);
    __ add(to, to, -heapOopSize);          // make an inclusive end pointer
    bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, to, rscratch1, wb_post_saved_regs);

    __ bind(L_done_pop);
    __ pop(RegSet::of(r18, r19, r20, r21), sp);
    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);

    __ bind(L_done);
    __ mov(r0, count);
    __ leave();
    __ ret(lr);

    return start;
  }

  // Perform range checks on the proposed arraycopy.
  // Kills temp, but nothing else.
  // Also, clean the sign bits of src_pos and dst_pos.
  void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
                              Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
                              Register dst_pos, // destination position (c_rarg3)
                              Register length,
                              Register temp,
                              Label& L_failed) {
    BLOCK_COMMENT("arraycopy_range_checks:");

    assert_different_registers(rscratch1, temp);

    // if (src_pos + length > arrayOop(src)->length()) FAIL;
    __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, src_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    // if (dst_pos + length > arrayOop(dst)->length()) FAIL;
    __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, dst_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
    __ movw(src_pos, src_pos);
    __ movw(dst_pos, dst_pos);

    BLOCK_COMMENT("arraycopy_range_checks done");
  }

  // These stubs get called from some dumb test routine.
  // I'll write them properly when they're called from
  // something that's actually doing something.
  static void fake_arraycopy_stub(address src, address dst, int count) {
    assert(count == 0, "huh?");
  }

  //
  // Generate 'unsafe' array copy stub
  // Though just as safe as the other stubs, it takes an unscaled
  // size_t argument instead of an element count.
  //
  // Input:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - byte count, treated as ssize_t, can be zero
  //
  // Examines the alignment of the operands and dispatches
  // to a long, int, short, or byte copy loop.
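  //
  // In plain C, the dispatch below amounts to (illustrative only):
  //
  //   uintptr_t bits = (uintptr_t)s | (uintptr_t)d | (uintptr_t)count;
  //   if      ((bits & 7) == 0) goto long_copy;   // all 8-byte aligned
  //   else if ((bits & 3) == 0) goto int_copy;    // all 4-byte aligned
  //   else if ((bits & 1) == 0) goto short_copy;  // all 2-byte aligned
  //   else                      goto byte_copy;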
1971 // 1972 address generate_unsafe_copy(const char *name, 1973 address byte_copy_entry, 1974 address short_copy_entry, 1975 address int_copy_entry, 1976 address long_copy_entry) { 1977 Label L_long_aligned, L_int_aligned, L_short_aligned; 1978 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1979 1980 __ align(CodeEntryAlignment); 1981 StubCodeMark mark(this, "StubRoutines", name); 1982 address start = __ pc(); 1983 __ enter(); // required for proper stackwalking of RuntimeStub frame 1984 1985 // bump this on entry, not on exit: 1986 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 1987 1988 __ orr(rscratch1, s, d); 1989 __ orr(rscratch1, rscratch1, count); 1990 1991 __ andr(rscratch1, rscratch1, BytesPerLong-1); 1992 __ cbz(rscratch1, L_long_aligned); 1993 __ andr(rscratch1, rscratch1, BytesPerInt-1); 1994 __ cbz(rscratch1, L_int_aligned); 1995 __ tbz(rscratch1, 0, L_short_aligned); 1996 __ b(RuntimeAddress(byte_copy_entry)); 1997 1998 __ BIND(L_short_aligned); 1999 __ lsr(count, count, LogBytesPerShort); // size => short_count 2000 __ b(RuntimeAddress(short_copy_entry)); 2001 __ BIND(L_int_aligned); 2002 __ lsr(count, count, LogBytesPerInt); // size => int_count 2003 __ b(RuntimeAddress(int_copy_entry)); 2004 __ BIND(L_long_aligned); 2005 __ lsr(count, count, LogBytesPerLong); // size => long_count 2006 __ b(RuntimeAddress(long_copy_entry)); 2007 2008 return start; 2009 } 2010 2011 // 2012 // Generate generic array copy stubs 2013 // 2014 // Input: 2015 // c_rarg0 - src oop 2016 // c_rarg1 - src_pos (32-bits) 2017 // c_rarg2 - dst oop 2018 // c_rarg3 - dst_pos (32-bits) 2019 // c_rarg4 - element count (32-bits) 2020 // 2021 // Output: 2022 // r0 == 0 - success 2023 // r0 == -1^K - failure, where K is partial transfer count 2024 // 2025 address generate_generic_copy(const char *name, 2026 address byte_copy_entry, address short_copy_entry, 2027 address int_copy_entry, address oop_copy_entry, 2028 address long_copy_entry, address checkcast_copy_entry) { 2029 2030 Label L_failed, L_failed_0, L_objArray; 2031 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 2032 2033 // Input registers 2034 const Register src = c_rarg0; // source array oop 2035 const Register src_pos = c_rarg1; // source position 2036 const Register dst = c_rarg2; // destination array oop 2037 const Register dst_pos = c_rarg3; // destination position 2038 const Register length = c_rarg4; 2039 2040 StubCodeMark mark(this, "StubRoutines", name); 2041 2042 __ align(CodeEntryAlignment); 2043 address start = __ pc(); 2044 2045 __ enter(); // required for proper stackwalking of RuntimeStub frame 2046 2047 // bump this on entry, not on exit: 2048 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 2049 2050 //----------------------------------------------------------------------- 2051 // Assembler stub will be used for this call to arraycopy 2052 // if the following conditions are met: 2053 // 2054 // (1) src and dst must not be null. 2055 // (2) src_pos must not be negative. 2056 // (3) dst_pos must not be negative. 2057 // (4) length must not be negative. 2058 // (5) src klass and dst klass should be the same and not NULL. 2059 // (6) src and dst should be arrays. 2060 // (7) src_pos + length must not exceed length of src. 2061 // (8) dst_pos + length must not exceed length of dst. 2062 // 2063 2064 // if (src == NULL) return -1; 2065 __ cbz(src, L_failed); 2066 2067 // if (src_pos < 0) return -1; 2068 __ tbnz(src_pos, 31, L_failed); // i.e. 
sign bit set 2069 2070 // if (dst == NULL) return -1; 2071 __ cbz(dst, L_failed); 2072 2073 // if (dst_pos < 0) return -1; 2074 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2075 2076 // registers used as temp 2077 const Register scratch_length = r16; // elements count to copy 2078 const Register scratch_src_klass = r17; // array klass 2079 const Register lh = r18; // layout helper 2080 2081 // if (length < 0) return -1; 2082 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2083 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2084 2085 __ load_klass(scratch_src_klass, src); 2086 #ifdef ASSERT 2087 // assert(src->klass() != NULL); 2088 { 2089 BLOCK_COMMENT("assert klasses not null {"); 2090 Label L1, L2; 2091 __ cbnz(scratch_src_klass, L2); // it is broken if klass is NULL 2092 __ bind(L1); 2093 __ stop("broken null klass"); 2094 __ bind(L2); 2095 __ load_klass(rscratch1, dst); 2096 __ cbz(rscratch1, L1); // this would be broken also 2097 BLOCK_COMMENT("} assert klasses not null done"); 2098 } 2099 #endif 2100 2101 // Load layout helper (32-bits) 2102 // 2103 // |array_tag| | header_size | element_type | |log2_element_size| 2104 // 32 30 24 16 8 2 0 2105 // 2106 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2107 // 2108 2109 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2110 2111 // Handle objArrays completely differently... 2112 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2113 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2114 __ movw(rscratch1, objArray_lh); 2115 __ eorw(rscratch2, lh, rscratch1); 2116 __ cbzw(rscratch2, L_objArray); 2117 2118 // if (src->klass() != dst->klass()) return -1; 2119 __ load_klass(rscratch2, dst); 2120 __ eor(rscratch2, rscratch2, scratch_src_klass); 2121 __ cbnz(rscratch2, L_failed); 2122 2123 // if (!src->is_Array()) return -1; 2124 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2125 2126 // At this point, it is known to be a typeArray (array_tag 0x3). 2127 #ifdef ASSERT 2128 { 2129 BLOCK_COMMENT("assert primitive array {"); 2130 Label L; 2131 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2132 __ cmpw(lh, rscratch2); 2133 __ br(Assembler::GE, L); 2134 __ stop("must be a primitive array"); 2135 __ bind(L); 2136 BLOCK_COMMENT("} assert primitive array done"); 2137 } 2138 #endif 2139 2140 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2141 rscratch2, L_failed); 2142 2143 // TypeArrayKlass 2144 // 2145 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2146 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2147 // 2148 2149 const Register rscratch1_offset = rscratch1; // array offset 2150 const Register r18_elsize = lh; // element size 2151 2152 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2153 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2154 __ add(src, src, rscratch1_offset); // src array offset 2155 __ add(dst, dst, rscratch1_offset); // dst array offset 2156 BLOCK_COMMENT("choose copy loop based on element size"); 2157 2158 // next registers should be set before the jump to corresponding stub 2159 const Register from = c_rarg0; // source array address 2160 const Register to = c_rarg1; // destination array address 2161 const Register count = c_rarg2; // elements count 2162 2163 // 'from', 'to', 'count' registers should be set in such order 2164 // since they are the same as 'src', 'src_pos', 'dst'. 
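    // Concretely: 'from' aliases 'src' (c_rarg0), 'to' aliases 'src_pos'
    // (c_rarg1), and 'count' aliases 'dst' (c_rarg2), so each lea/movw in
    // the sequences below must consume its inputs before the aliased
    // register is overwritten; computing from, then to, then count
    // respects that.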
2165 2166 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2167 2168 // The possible values of elsize are 0-3, i.e. exact_log2(element 2169 // size in bytes). We do a simple bitwise binary search. 2170 __ BIND(L_copy_bytes); 2171 __ tbnz(r18_elsize, 1, L_copy_ints); 2172 __ tbnz(r18_elsize, 0, L_copy_shorts); 2173 __ lea(from, Address(src, src_pos));// src_addr 2174 __ lea(to, Address(dst, dst_pos));// dst_addr 2175 __ movw(count, scratch_length); // length 2176 __ b(RuntimeAddress(byte_copy_entry)); 2177 2178 __ BIND(L_copy_shorts); 2179 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2180 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2181 __ movw(count, scratch_length); // length 2182 __ b(RuntimeAddress(short_copy_entry)); 2183 2184 __ BIND(L_copy_ints); 2185 __ tbnz(r18_elsize, 0, L_copy_longs); 2186 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2187 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2188 __ movw(count, scratch_length); // length 2189 __ b(RuntimeAddress(int_copy_entry)); 2190 2191 __ BIND(L_copy_longs); 2192 #ifdef ASSERT 2193 { 2194 BLOCK_COMMENT("assert long copy {"); 2195 Label L; 2196 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize 2197 __ cmpw(r18_elsize, LogBytesPerLong); 2198 __ br(Assembler::EQ, L); 2199 __ stop("must be long copy, but elsize is wrong"); 2200 __ bind(L); 2201 BLOCK_COMMENT("} assert long copy done"); 2202 } 2203 #endif 2204 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2205 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2206 __ movw(count, scratch_length); // length 2207 __ b(RuntimeAddress(long_copy_entry)); 2208 2209 // ObjArrayKlass 2210 __ BIND(L_objArray); 2211 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2212 2213 Label L_plain_copy, L_checkcast_copy; 2214 // test array classes for subtyping 2215 __ load_klass(r18, dst); 2216 __ cmp(scratch_src_klass, r18); // usual case is exact equality 2217 __ br(Assembler::NE, L_checkcast_copy); 2218 2219 // Identically typed arrays can be copied without element-wise checks. 2220 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2221 rscratch2, L_failed); 2222 2223 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2224 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2225 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2226 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2227 __ movw(count, scratch_length); // length 2228 __ BIND(L_plain_copy); 2229 __ b(RuntimeAddress(oop_copy_entry)); 2230 2231 __ BIND(L_checkcast_copy); 2232 // live at this point: scratch_src_klass, scratch_length, r18 (dst_klass) 2233 { 2234 // Before looking at dst.length, make sure dst is also an objArray. 2235 __ ldrw(rscratch1, Address(r18, lh_offset)); 2236 __ movw(rscratch2, objArray_lh); 2237 __ eorw(rscratch1, rscratch1, rscratch2); 2238 __ cbnzw(rscratch1, L_failed); 2239 2240 // It is safe to examine both src.length and dst.length. 2241 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2242 r18, L_failed); 2243 2244 const Register rscratch2_dst_klass = rscratch2; 2245 __ load_klass(rscratch2_dst_klass, dst); // reload 2246 2247 // Marshal the base address arguments now, freeing registers. 
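      // (After the adds below, 'from'/'to' point at the first element
      // actually transferred: base_offset_in_bytes(T_OBJECT) skips the
      // object-array header, and src_pos/dst_pos are scaled by the
      // heap-oop size.)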
      __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
      __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
      __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
      __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
      __ movw(count, length);           // length (reloaded)
      Register sco_temp = c_rarg3;      // this register is free now
      assert_different_registers(from, to, count, sco_temp,
                                 rscratch2_dst_klass, scratch_src_klass);
      // assert_clean_int(count, sco_temp);

      // Generate the type check.
      const int sco_offset = in_bytes(Klass::super_check_offset_offset());
      __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
      // assert_clean_int(sco_temp, r18);
      generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy);

      // Fetch destination element klass from the ObjArrayKlass header.
      int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
      __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset));
      __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));

      // the checkcast_copy loop needs two extra arguments:
      assert(c_rarg3 == sco_temp, "#3 already in place");
      // Set up arguments for checkcast_copy_entry.
      __ mov(c_rarg4, rscratch2_dst_klass);  // dst.klass.element_klass
      __ b(RuntimeAddress(checkcast_copy_entry));
    }

    __ BIND(L_failed);
    __ mov(r0, -1);
    __ leave();   // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }

  //
  // Generate stub for array fill. If "aligned" is true, the
  // "to" address is assumed to be heapword aligned.
  //
  // Arguments for generated stub:
  //   to:    c_rarg0
  //   value: c_rarg1
  //   count: c_rarg2 treated as signed
  //
  address generate_fill(BasicType t, bool aligned, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    BLOCK_COMMENT("Entry:");

    const Register to        = c_rarg0;  // destination array address
    const Register value     = c_rarg1;  // value
    const Register count     = c_rarg2;  // elements count

    const Register bz_base   = r10;      // base for block_zero routine
    const Register cnt_words = r11;      // temp register

    __ enter();

    Label L_fill_elements, L_exit1;

    int shift = -1;
    switch (t) {
      case T_BYTE:
        shift = 0;
        __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
        __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
        __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
        __ br(Assembler::LO, L_fill_elements);
        break;
      case T_SHORT:
        shift = 1;
        __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
        __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
        __ br(Assembler::LO, L_fill_elements);
        break;
      case T_INT:
        shift = 2;
        __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
        __ br(Assembler::LO, L_fill_elements);
        break;
      default: ShouldNotReachHere();
    }

    // Align destination address at an 8-byte boundary.
    Label L_skip_align1, L_skip_align2, L_skip_align4;
    if (!aligned) {
      switch (t) {
        case T_BYTE:
          // One byte misalignment happens only for byte arrays.
2340 __ tbz(to, 0, L_skip_align1); 2341 __ strb(value, Address(__ post(to, 1))); 2342 __ subw(count, count, 1); 2343 __ bind(L_skip_align1); 2344 // Fallthrough 2345 case T_SHORT: 2346 // Two bytes misalignment happens only for byte and short (char) arrays. 2347 __ tbz(to, 1, L_skip_align2); 2348 __ strh(value, Address(__ post(to, 2))); 2349 __ subw(count, count, 2 >> shift); 2350 __ bind(L_skip_align2); 2351 // Fallthrough 2352 case T_INT: 2353 // Align to 8 bytes, we know we are 4 byte aligned to start. 2354 __ tbz(to, 2, L_skip_align4); 2355 __ strw(value, Address(__ post(to, 4))); 2356 __ subw(count, count, 4 >> shift); 2357 __ bind(L_skip_align4); 2358 break; 2359 default: ShouldNotReachHere(); 2360 } 2361 } 2362 2363 // 2364 // Fill large chunks 2365 // 2366 __ lsrw(cnt_words, count, 3 - shift); // number of words 2367 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2368 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2369 if (UseBlockZeroing) { 2370 Label non_block_zeroing, rest; 2371 // If the fill value is zero we can use the fast zero_words(). 2372 __ cbnz(value, non_block_zeroing); 2373 __ mov(bz_base, to); 2374 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2375 __ zero_words(bz_base, cnt_words); 2376 __ b(rest); 2377 __ bind(non_block_zeroing); 2378 __ fill_words(to, cnt_words, value); 2379 __ bind(rest); 2380 } else { 2381 __ fill_words(to, cnt_words, value); 2382 } 2383 2384 // Remaining count is less than 8 bytes. Fill it by a single store. 2385 // Note that the total length is no less than 8 bytes. 2386 if (t == T_BYTE || t == T_SHORT) { 2387 Label L_exit1; 2388 __ cbzw(count, L_exit1); 2389 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2390 __ str(value, Address(to, -8)); // overwrite some elements 2391 __ bind(L_exit1); 2392 __ leave(); 2393 __ ret(lr); 2394 } 2395 2396 // Handle copies less than 8 bytes. 
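    // Equivalent C for the byte-element tail fill below (a sketch; 'value'
    // already holds the fill byte replicated into its low 32 bits):
    //
    //   if (count & 1) { *(u1*)to = value; to += 1; }
    //   if (count & 2) { *(u2*)to = value; to += 2; }
    //   if (count & 4) { *(u4*)to = value; }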
2397 Label L_fill_2, L_fill_4, L_exit2; 2398 __ bind(L_fill_elements); 2399 switch (t) { 2400 case T_BYTE: 2401 __ tbz(count, 0, L_fill_2); 2402 __ strb(value, Address(__ post(to, 1))); 2403 __ bind(L_fill_2); 2404 __ tbz(count, 1, L_fill_4); 2405 __ strh(value, Address(__ post(to, 2))); 2406 __ bind(L_fill_4); 2407 __ tbz(count, 2, L_exit2); 2408 __ strw(value, Address(to)); 2409 break; 2410 case T_SHORT: 2411 __ tbz(count, 0, L_fill_4); 2412 __ strh(value, Address(__ post(to, 2))); 2413 __ bind(L_fill_4); 2414 __ tbz(count, 1, L_exit2); 2415 __ strw(value, Address(to)); 2416 break; 2417 case T_INT: 2418 __ cbzw(count, L_exit2); 2419 __ strw(value, Address(to)); 2420 break; 2421 default: ShouldNotReachHere(); 2422 } 2423 __ bind(L_exit2); 2424 __ leave(); 2425 __ ret(lr); 2426 return start; 2427 } 2428 2429 void generate_arraycopy_stubs() { 2430 address entry; 2431 address entry_jbyte_arraycopy; 2432 address entry_jshort_arraycopy; 2433 address entry_jint_arraycopy; 2434 address entry_oop_arraycopy; 2435 address entry_jlong_arraycopy; 2436 address entry_checkcast_arraycopy; 2437 2438 generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards); 2439 generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards); 2440 2441 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2442 2443 //*** jbyte 2444 // Always need aligned and unaligned versions 2445 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2446 "jbyte_disjoint_arraycopy"); 2447 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2448 &entry_jbyte_arraycopy, 2449 "jbyte_arraycopy"); 2450 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2451 "arrayof_jbyte_disjoint_arraycopy"); 2452 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL, 2453 "arrayof_jbyte_arraycopy"); 2454 2455 //*** jshort 2456 // Always need aligned and unaligned versions 2457 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2458 "jshort_disjoint_arraycopy"); 2459 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2460 &entry_jshort_arraycopy, 2461 "jshort_arraycopy"); 2462 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, 2463 "arrayof_jshort_disjoint_arraycopy"); 2464 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL, 2465 "arrayof_jshort_arraycopy"); 2466 2467 //*** jint 2468 // Aligned versions 2469 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2470 "arrayof_jint_disjoint_arraycopy"); 2471 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2472 "arrayof_jint_arraycopy"); 2473 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 
2474 // entry_jint_arraycopy always points to the unaligned version 2475 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2476 "jint_disjoint_arraycopy"); 2477 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2478 &entry_jint_arraycopy, 2479 "jint_arraycopy"); 2480 2481 //*** jlong 2482 // It is always aligned 2483 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2484 "arrayof_jlong_disjoint_arraycopy"); 2485 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2486 "arrayof_jlong_arraycopy"); 2487 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2488 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2489 2490 //*** oops 2491 { 2492 // With compressed oops we need unaligned versions; notice that 2493 // we overwrite entry_oop_arraycopy. 2494 bool aligned = !UseCompressedOops; 2495 2496 StubRoutines::_arrayof_oop_disjoint_arraycopy 2497 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2498 /*dest_uninitialized*/false); 2499 StubRoutines::_arrayof_oop_arraycopy 2500 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2501 /*dest_uninitialized*/false); 2502 // Aligned versions without pre-barriers 2503 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2504 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2505 /*dest_uninitialized*/true); 2506 StubRoutines::_arrayof_oop_arraycopy_uninit 2507 = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit", 2508 /*dest_uninitialized*/true); 2509 } 2510 2511 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2512 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2513 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2514 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2515 2516 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2517 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, 2518 /*dest_uninitialized*/true); 2519 2520 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2521 entry_jbyte_arraycopy, 2522 entry_jshort_arraycopy, 2523 entry_jint_arraycopy, 2524 entry_jlong_arraycopy); 2525 2526 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2527 entry_jbyte_arraycopy, 2528 entry_jshort_arraycopy, 2529 entry_jint_arraycopy, 2530 entry_oop_arraycopy, 2531 entry_jlong_arraycopy, 2532 entry_checkcast_arraycopy); 2533 2534 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 2535 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2536 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2537 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2538 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 2539 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2540 } 2541 2542 void generate_math_stubs() { Unimplemented(); } 2543 2544 // Arguments: 2545 // 2546 // Inputs: 2547 // c_rarg0 - source byte array address 2548 // c_rarg1 - destination 
byte array address
  //   c_rarg2   - K (key) in little endian int array
  //
  address generate_aescrypt_encryptBlock() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");

    Label L_doLast;

    const Register from   = c_rarg0;  // source array address
    const Register to     = c_rarg1;  // destination array address
    const Register key    = c_rarg2;  // key array address
    const Register keylen = rscratch1;

    address start = __ pc();
    __ enter();

    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ ld1(v0, __ T16B, from); // get 16 bytes of input

    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);
    __ rev32(v3, __ T16B, v3);
    __ rev32(v4, __ T16B, v4);
    __ aese(v0, v1);
    __ aesmc(v0, v0);
    __ aese(v0, v2);
    __ aesmc(v0, v0);
    __ aese(v0, v3);
    __ aesmc(v0, v0);
    __ aese(v0, v4);
    __ aesmc(v0, v0);

    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);
    __ rev32(v3, __ T16B, v3);
    __ rev32(v4, __ T16B, v4);
    __ aese(v0, v1);
    __ aesmc(v0, v0);
    __ aese(v0, v2);
    __ aesmc(v0, v0);
    __ aese(v0, v3);
    __ aesmc(v0, v0);
    __ aese(v0, v4);
    __ aesmc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    __ cmpw(keylen, 44);
    __ br(Assembler::EQ, L_doLast);

    __ aese(v0, v1);
    __ aesmc(v0, v0);
    __ aese(v0, v2);
    __ aesmc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    __ cmpw(keylen, 52);
    __ br(Assembler::EQ, L_doLast);

    __ aese(v0, v1);
    __ aesmc(v0, v0);
    __ aese(v0, v2);
    __ aesmc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    __ BIND(L_doLast);

    __ aese(v0, v1);
    __ aesmc(v0, v0);
    __ aese(v0, v2);

    __ ld1(v1, __ T16B, key);
    __ rev32(v1, __ T16B, v1);
    __ eor(v0, __ T16B, v0, v1);

    __ st1(v0, __ T16B, to);

    __ mov(r0, 0);

    __ leave();
    __ ret(lr);

    return start;
  }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //
  address generate_aescrypt_decryptBlock() {
    assert(UseAES, "need AES instruction support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
    Label L_doLast;

    const Register from   = c_rarg0;  // source array address
    const Register to     = c_rarg1;  // destination array address
    const Register key    = c_rarg2;  // key array address
    const Register keylen = rscratch1;

    address start = __ pc();
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ ld1(v0, __ T16B, from); // get 16 bytes of input

    __ ld1(v5, __ T16B, __ post(key, 16));
    __ rev32(v5, __ T16B, v5);

    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);
    __ rev32(v3, __ T16B, v3);
    __ rev32(v4, __ T16B, v4);
    __ aesd(v0, v1);
    __ aesimc(v0, v0);
    __ aesd(v0, v2);
    __ aesimc(v0, v0);
    __ aesd(v0, v3);
    __ aesimc(v0, v0);
    __ aesd(v0, v4);
    __ aesimc(v0, v0);

    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);
    __ rev32(v3, __ T16B, v3);
    __ rev32(v4, __ T16B, v4);
    __ aesd(v0, v1);
    __ aesimc(v0, v0);
    __ aesd(v0, v2);
    __ aesimc(v0, v0);
    __ aesd(v0, v3);
    __ aesimc(v0, v0);
    __ aesd(v0, v4);
    __ aesimc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    __ cmpw(keylen, 44);
    __ br(Assembler::EQ, L_doLast);

    __ aesd(v0, v1);
    __ aesimc(v0, v0);
    __ aesd(v0, v2);
    __ aesimc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    __ cmpw(keylen, 52);
    __ br(Assembler::EQ, L_doLast);

    __ aesd(v0, v1);
    __ aesimc(v0, v0);
    __ aesd(v0, v2);
    __ aesimc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    __ BIND(L_doLast);

    __ aesd(v0, v1);
    __ aesimc(v0, v0);
    __ aesd(v0, v2);

    __ eor(v0, __ T16B, v0, v5);

    __ st1(v0, __ T16B, to);

    __ mov(r0, 0);

    __ leave();
    __ ret(lr);

    return start;
  }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //   c_rarg3   - r vector byte array address
  //   c_rarg4   - input length
  //
  // Output:
  //   x0        - input length
  //
  address generate_cipherBlockChaining_encryptAESCrypt() {
    assert(UseAES, "need AES instruction support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");

    Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;

    const Register from    = c_rarg0;  // source array address
    const Register to      = c_rarg1;  // destination array address
    const Register key     = c_rarg2;  // key array address
    const Register rvec    = c_rarg3;  // r byte array initialized from initvector array address
                                       // and left with the results of the last encryption block
    const Register len_reg = c_rarg4;  // src len (must be multiple of blocksize 16)
    const Register keylen  = rscratch1;

    address start = __ pc();

    __ enter();

    __ movw(rscratch2, len_reg);

    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ ld1(v0, __ T16B, rvec);

    __ cmpw(keylen, 52);
    __ br(Assembler::CC, L_loadkeys_44);
    __ br(Assembler::EQ, L_loadkeys_52);

    __ ld1(v17, v18, __ T16B, __ post(key, 32));
    __ rev32(v17, __ T16B, v17);
    __ rev32(v18, __ T16B, v18);
    __ BIND(L_loadkeys_52);
    __ ld1(v19, v20, __ T16B, __ post(key, 32));
    __ rev32(v19, __ T16B, v19);
    __ rev32(v20, __ T16B, v20);
    __ BIND(L_loadkeys_44);
    __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
    __ rev32(v21, __ T16B, v21);
    __ rev32(v22, __ T16B, v22);
    __ rev32(v23, __ T16B, v23);
    __ rev32(v24, __ T16B, v24);
    __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
    __ rev32(v25, __ T16B, v25);
    __ rev32(v26, __ T16B, v26);
    __ rev32(v27, __ T16B, v27);
    __ rev32(v28, __ T16B, v28);
    __ ld1(v29, v30, v31, __ T16B, key);
    __ rev32(v29, __ T16B, v29);
    __ rev32(v30, __ T16B, v30);
    __ rev32(v31, __ T16B, v31);

    __ BIND(L_aes_loop);
    __ ld1(v1, __ T16B, __ post(from, 16));
    __ eor(v0, __ T16B, v0, v1);

    __ br(Assembler::CC, L_rounds_44);
    __ br(Assembler::EQ, L_rounds_52);

    __ aese(v0, v17); __ aesmc(v0, v0);
    __ aese(v0, v18); __ aesmc(v0, v0);
    __ BIND(L_rounds_52);
    __ aese(v0, v19); __ aesmc(v0, v0);
    __ aese(v0, v20); __ aesmc(v0, v0);
    __ BIND(L_rounds_44);
    __ aese(v0, v21); __ aesmc(v0, v0);
    __ aese(v0, v22); __ aesmc(v0, v0);
    __ aese(v0, v23); __ aesmc(v0, v0);
    __ aese(v0, v24); __ aesmc(v0, v0);
    __ aese(v0, v25); __ aesmc(v0, v0);
    __ aese(v0, v26); __ aesmc(v0, v0);
    __ aese(v0, v27); __ aesmc(v0, v0);
    __ aese(v0, v28); __ aesmc(v0, v0);
    __ aese(v0, v29); __ aesmc(v0, v0);
    __ aese(v0, v30);
    __ eor(v0, __ T16B, v0, v31);

    __ st1(v0, __ T16B, __ post(to, 16));

    __ subw(len_reg, len_reg, 16);
    __ cbnzw(len_reg, L_aes_loop);

    __ st1(v0, __ T16B, rvec);

    __ mov(r0, rscratch2);

    __ leave();
    __ ret(lr);

    return start;
  }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //   c_rarg3   - r vector byte array address
  //   c_rarg4   - input length
  //
  // Output:
  //   r0        - input length
  //
  address generate_cipherBlockChaining_decryptAESCrypt() {
    assert(UseAES, "need AES instruction support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");

    Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;

    const Register from    = c_rarg0;  // source array address
    const Register to      = c_rarg1;  // destination array address
    const Register key     = c_rarg2;  // key array address
    const Register rvec    = c_rarg3;  // r byte array initialized from initvector array address
                                       // and left with the results of the last encryption block
    const Register len_reg = c_rarg4;  // src len (must be multiple of blocksize 16)
    const Register keylen  = rscratch1;

    address start = __ pc();

    __ enter();

    __ movw(rscratch2, len_reg);

    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ ld1(v2, __ T16B, rvec);

    __ ld1(v31, __ T16B, __ post(key, 16));
    __ rev32(v31, __ T16B, v31);

    __ cmpw(keylen, 52);
    __ br(Assembler::CC, L_loadkeys_44);
    __ br(Assembler::EQ, L_loadkeys_52);

    __ ld1(v17, v18, __ T16B, __ post(key, 32));
    __ rev32(v17, __ T16B, v17);
    __ rev32(v18, __ T16B, v18);
    __ BIND(L_loadkeys_52);
    __ ld1(v19, v20, __ T16B, __ post(key, 32));
    __ rev32(v19, __ T16B, v19);
    __ rev32(v20, __ T16B, v20);
    __ BIND(L_loadkeys_44);
    __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
    __ rev32(v21, __ T16B, v21);
__ rev32(v22, __ T16B, v22); 2906 __ rev32(v23, __ T16B, v23); 2907 __ rev32(v24, __ T16B, v24); 2908 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2909 __ rev32(v25, __ T16B, v25); 2910 __ rev32(v26, __ T16B, v26); 2911 __ rev32(v27, __ T16B, v27); 2912 __ rev32(v28, __ T16B, v28); 2913 __ ld1(v29, v30, __ T16B, key); 2914 __ rev32(v29, __ T16B, v29); 2915 __ rev32(v30, __ T16B, v30); 2916 2917 __ BIND(L_aes_loop); 2918 __ ld1(v0, __ T16B, __ post(from, 16)); 2919 __ orr(v1, __ T16B, v0, v0); 2920 2921 __ br(Assembler::CC, L_rounds_44); 2922 __ br(Assembler::EQ, L_rounds_52); 2923 2924 __ aesd(v0, v17); __ aesimc(v0, v0); 2925 __ aesd(v0, v18); __ aesimc(v0, v0); 2926 __ BIND(L_rounds_52); 2927 __ aesd(v0, v19); __ aesimc(v0, v0); 2928 __ aesd(v0, v20); __ aesimc(v0, v0); 2929 __ BIND(L_rounds_44); 2930 __ aesd(v0, v21); __ aesimc(v0, v0); 2931 __ aesd(v0, v22); __ aesimc(v0, v0); 2932 __ aesd(v0, v23); __ aesimc(v0, v0); 2933 __ aesd(v0, v24); __ aesimc(v0, v0); 2934 __ aesd(v0, v25); __ aesimc(v0, v0); 2935 __ aesd(v0, v26); __ aesimc(v0, v0); 2936 __ aesd(v0, v27); __ aesimc(v0, v0); 2937 __ aesd(v0, v28); __ aesimc(v0, v0); 2938 __ aesd(v0, v29); __ aesimc(v0, v0); 2939 __ aesd(v0, v30); 2940 __ eor(v0, __ T16B, v0, v31); 2941 __ eor(v0, __ T16B, v0, v2); 2942 2943 __ st1(v0, __ T16B, __ post(to, 16)); 2944 __ orr(v2, __ T16B, v1, v1); 2945 2946 __ subw(len_reg, len_reg, 16); 2947 __ cbnzw(len_reg, L_aes_loop); 2948 2949 __ st1(v2, __ T16B, rvec); 2950 2951 __ mov(r0, rscratch2); 2952 2953 __ leave(); 2954 __ ret(lr); 2955 2956 return start; 2957 } 2958 2959 // Arguments: 2960 // 2961 // Inputs: 2962 // c_rarg0 - byte[] source+offset 2963 // c_rarg1 - int[] SHA.state 2964 // c_rarg2 - int offset 2965 // c_rarg3 - int limit 2966 // 2967 address generate_sha1_implCompress(bool multi_block, const char *name) { 2968 __ align(CodeEntryAlignment); 2969 StubCodeMark mark(this, "StubRoutines", name); 2970 address start = __ pc(); 2971 2972 Register buf = c_rarg0; 2973 Register state = c_rarg1; 2974 Register ofs = c_rarg2; 2975 Register limit = c_rarg3; 2976 2977 Label keys; 2978 Label sha1_loop; 2979 2980 // load the keys into v0..v3 2981 __ adr(rscratch1, keys); 2982 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 2983 // load 5 words state into v6, v7 2984 __ ldrq(v6, Address(state, 0)); 2985 __ ldrs(v7, Address(state, 16)); 2986 2987 2988 __ BIND(sha1_loop); 2989 // load 64 bytes of data into v16..v19 2990 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 2991 __ rev32(v16, __ T16B, v16); 2992 __ rev32(v17, __ T16B, v17); 2993 __ rev32(v18, __ T16B, v18); 2994 __ rev32(v19, __ T16B, v19); 2995 2996 // do the sha1 2997 __ addv(v4, __ T4S, v16, v0); 2998 __ orr(v20, __ T16B, v6, v6); 2999 3000 FloatRegister d0 = v16; 3001 FloatRegister d1 = v17; 3002 FloatRegister d2 = v18; 3003 FloatRegister d3 = v19; 3004 3005 for (int round = 0; round < 20; round++) { 3006 FloatRegister tmp1 = (round & 1) ? v4 : v5; 3007 FloatRegister tmp2 = (round & 1) ? v21 : v22; 3008 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 3009 FloatRegister tmp4 = (round & 1) ? v5 : v4; 3010 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? 
v2 : v3)); 3011 3012 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 3013 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 3014 __ sha1h(tmp2, __ T4S, v20); 3015 if (round < 5) 3016 __ sha1c(v20, __ T4S, tmp3, tmp4); 3017 else if (round < 10 || round >= 15) 3018 __ sha1p(v20, __ T4S, tmp3, tmp4); 3019 else 3020 __ sha1m(v20, __ T4S, tmp3, tmp4); 3021 if (round < 16) __ sha1su1(d0, __ T4S, d3); 3022 3023 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3024 } 3025 3026 __ addv(v7, __ T2S, v7, v21); 3027 __ addv(v6, __ T4S, v6, v20); 3028 3029 if (multi_block) { 3030 __ add(ofs, ofs, 64); 3031 __ cmp(ofs, limit); 3032 __ br(Assembler::LE, sha1_loop); 3033 __ mov(c_rarg0, ofs); // return ofs 3034 } 3035 3036 __ strq(v6, Address(state, 0)); 3037 __ strs(v7, Address(state, 16)); 3038 3039 __ ret(lr); 3040 3041 __ bind(keys); 3042 __ emit_int32(0x5a827999); 3043 __ emit_int32(0x6ed9eba1); 3044 __ emit_int32(0x8f1bbcdc); 3045 __ emit_int32(0xca62c1d6); 3046 3047 return start; 3048 } 3049 3050 3051 // Arguments: 3052 // 3053 // Inputs: 3054 // c_rarg0 - byte[] source+offset 3055 // c_rarg1 - int[] SHA.state 3056 // c_rarg2 - int offset 3057 // c_rarg3 - int limit 3058 // 3059 address generate_sha256_implCompress(bool multi_block, const char *name) { 3060 static const uint32_t round_consts[64] = { 3061 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3062 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3063 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3064 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3065 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3066 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3067 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3068 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3069 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3070 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3071 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3072 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3073 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3074 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3075 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3076 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3077 }; 3078 __ align(CodeEntryAlignment); 3079 StubCodeMark mark(this, "StubRoutines", name); 3080 address start = __ pc(); 3081 3082 Register buf = c_rarg0; 3083 Register state = c_rarg1; 3084 Register ofs = c_rarg2; 3085 Register limit = c_rarg3; 3086 3087 Label sha1_loop; 3088 3089 __ stpd(v8, v9, __ pre(sp, -32)); 3090 __ stpd(v10, v11, Address(sp, 16)); 3091 3092 // dga == v0 3093 // dgb == v1 3094 // dg0 == v2 3095 // dg1 == v3 3096 // dg2 == v4 3097 // t0 == v6 3098 // t1 == v7 3099 3100 // load 16 keys to v16..v31 3101 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3102 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3103 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3104 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3105 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3106 3107 // load 8 words (256 bits) state 3108 __ ldpq(v0, v1, state); 3109 3110 __ BIND(sha1_loop); 3111 // load 64 bytes of data into v8..v11 3112 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? 
__ post(buf, 64) : buf);
    __ rev32(v8,  __ T16B, v8);
    __ rev32(v9,  __ T16B, v9);
    __ rev32(v10, __ T16B, v10);
    __ rev32(v11, __ T16B, v11);

    __ addv(v6, __ T4S, v8, v16);
    __ orr(v2, __ T16B, v0, v0);
    __ orr(v3, __ T16B, v1, v1);

    FloatRegister d0 = v8;
    FloatRegister d1 = v9;
    FloatRegister d2 = v10;
    FloatRegister d3 = v11;

    for (int round = 0; round < 16; round++) {
      FloatRegister tmp1 = (round & 1) ? v6 : v7;
      FloatRegister tmp2 = (round & 1) ? v7 : v6;
      FloatRegister tmp3 = (round & 1) ? v2 : v4;
      FloatRegister tmp4 = (round & 1) ? v4 : v2;

      if (round < 12) __ sha256su0(d0, __ T4S, d1);
      __ orr(v4, __ T16B, v2, v2);
      if (round < 15)
        __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
      __ sha256h(v2, __ T4S, v3, tmp2);
      __ sha256h2(v3, __ T4S, v4, tmp2);
      if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);

      tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
    }

    __ addv(v0, __ T4S, v0, v2);
    __ addv(v1, __ T4S, v1, v3);

    if (multi_block) {
      __ add(ofs, ofs, 64);
      __ cmp(ofs, limit);
      __ br(Assembler::LE, sha1_loop);
      __ mov(c_rarg0, ofs); // return ofs
    }

    __ ldpd(v10, v11, Address(sp, 16));
    __ ldpd(v8, v9, __ post(sp, 32));

    __ stpq(v0, v1, state);

    __ ret(lr);

    return start;
  }

#ifndef BUILTIN_SIM
  // Safefetch stubs.
  void generate_safefetch(const char* name, int size, address* entry,
                          address* fault_pc, address* continuation_pc) {
    // safefetch signatures:
    //   int      SafeFetch32(int*      adr, int      errValue);
    //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
    //
    // arguments:
    //   c_rarg0 = adr
    //   c_rarg1 = errValue
    //
    // result:
    //   r0 = *adr or errValue

    StubCodeMark mark(this, "StubRoutines", name);

    // Entry point, pc or function descriptor.
    *entry = __ pc();

    // Load *adr into c_rarg1, may fault.
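    // If the load below does fault, the signal handler matches the faulting
    // PC against *fault_pc and resumes execution at *continuation_pc, where
    // the errValue still held in c_rarg1 is returned instead.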
    *fault_pc = __ pc();
    switch (size) {
      case 4:
        // int32_t
        __ ldrw(c_rarg1, Address(c_rarg0, 0));
        break;
      case 8:
        // int64_t
        __ ldr(c_rarg1, Address(c_rarg0, 0));
        break;
      default:
        ShouldNotReachHere();
    }

    // return errValue or *adr
    *continuation_pc = __ pc();
    __ mov(r0, c_rarg1);
    __ ret(lr);
  }
#endif

  /**
   * Arguments:
   *
   * Inputs:
   *   c_rarg0   - int crc
   *   c_rarg1   - byte* buf
   *   c_rarg2   - int length
   *
   * Output:
   *   r0        - int crc result
   */
  address generate_updateBytesCRC32() {
    assert(UseCRC32Intrinsics, "what are we doing here?");

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");

    address start = __ pc();

    const Register crc    = c_rarg0;  // crc
    const Register buf    = c_rarg1;  // source java byte array address
    const Register len    = c_rarg2;  // length
    const Register table0 = c_rarg3;  // crc_table address
    const Register table1 = c_rarg4;
    const Register table2 = c_rarg5;
    const Register table3 = c_rarg6;
    const Register tmp3   = c_rarg7;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    __ kernel_crc32(crc, buf, len,
                    table0, table1, table2, table3, rscratch1, rscratch2, tmp3);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }

  /**
   * Arguments:
   *
   * Inputs:
   *   c_rarg0   - int crc
   *   c_rarg1   - byte* buf
   *   c_rarg2   - int length
   *   c_rarg3   - int* table
   *
   * Output:
   *   r0        - int crc result
   */
  address generate_updateBytesCRC32C() {
    assert(UseCRC32CIntrinsics, "what are we doing here?");

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");

    address start = __ pc();

    const Register crc    = c_rarg0;  // crc
    const Register buf    = c_rarg1;  // source java byte array address
    const Register len    = c_rarg2;  // length
    const Register table0 = c_rarg3;  // crc_table address
    const Register table1 = c_rarg4;
    const Register table2 = c_rarg5;
    const Register table3 = c_rarg6;
    const Register tmp3   = c_rarg7;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    __ kernel_crc32c(crc, buf, len,
                     table0, table1, table2, table3, rscratch1, rscratch2, tmp3);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }

  /**
   * Arguments:
   *
   * Inputs:
   *   c_rarg0   - int   adler
   *   c_rarg1   - byte* buff
   *   c_rarg2   - int   len
   *
   * Output:
   *   c_rarg0   - int adler result
   */
  address generate_updateBytesAdler32() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
    address start = __ pc();

    Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;

    // Aliases
    Register adler = c_rarg0;
    Register s1    = c_rarg0;
    Register s2    = c_rarg3;
    Register buff  = c_rarg1;
    Register len   = c_rarg2;
    Register nmax  = r4;
    Register base  = r5;
    Register count = r6;
    Register temp0 = rscratch1;
    Register temp1 = rscratch2;
    Register temp2 = r7;
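    // Adler-32 (RFC 1950) maintains s1 = 1 + sum of bytes (mod 65521) and
    // s2 = running sum of the s1 values (mod 65521); the result packs them
    // as (s2 << 16) | s1.  Tiny worked example for input {0x01, 0x02},
    // starting from adler == 1: after byte 1, s1 = 2, s2 = 2; after byte 2,
    // s1 = 4, s2 = 6; result = 0x00060004.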
3318 3319 // Max number of bytes we can process before having to take the mod 3320 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 3321 unsigned long BASE = 0xfff1; 3322 unsigned long NMAX = 0x15B0; 3323 3324 __ mov(base, BASE); 3325 __ mov(nmax, NMAX); 3326 3327 // s1 is initialized to the lower 16 bits of adler 3328 // s2 is initialized to the upper 16 bits of adler 3329 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 3330 __ uxth(s1, adler); // s1 = (adler & 0xffff) 3331 3332 // The pipelined loop needs at least 16 elements for 1 iteration 3333 // It does check this, but it is more effective to skip to the cleanup loop 3334 __ cmp(len, 16); 3335 __ br(Assembler::HS, L_nmax); 3336 __ cbz(len, L_combine); 3337 3338 __ bind(L_simple_by1_loop); 3339 __ ldrb(temp0, Address(__ post(buff, 1))); 3340 __ add(s1, s1, temp0); 3341 __ add(s2, s2, s1); 3342 __ subs(len, len, 1); 3343 __ br(Assembler::HI, L_simple_by1_loop); 3344 3345 // s1 = s1 % BASE 3346 __ subs(temp0, s1, base); 3347 __ csel(s1, temp0, s1, Assembler::HS); 3348 3349 // s2 = s2 % BASE 3350 __ lsr(temp0, s2, 16); 3351 __ lsl(temp1, temp0, 4); 3352 __ sub(temp1, temp1, temp0); 3353 __ add(s2, temp1, s2, ext::uxth); 3354 3355 __ subs(temp0, s2, base); 3356 __ csel(s2, temp0, s2, Assembler::HS); 3357 3358 __ b(L_combine); 3359 3360 __ bind(L_nmax); 3361 __ subs(len, len, nmax); 3362 __ sub(count, nmax, 16); 3363 __ br(Assembler::LO, L_by16); 3364 3365 __ bind(L_nmax_loop); 3366 3367 __ ldp(temp0, temp1, Address(__ post(buff, 16))); 3368 3369 __ add(s1, s1, temp0, ext::uxtb); 3370 __ ubfx(temp2, temp0, 8, 8); 3371 __ add(s2, s2, s1); 3372 __ add(s1, s1, temp2); 3373 __ ubfx(temp2, temp0, 16, 8); 3374 __ add(s2, s2, s1); 3375 __ add(s1, s1, temp2); 3376 __ ubfx(temp2, temp0, 24, 8); 3377 __ add(s2, s2, s1); 3378 __ add(s1, s1, temp2); 3379 __ ubfx(temp2, temp0, 32, 8); 3380 __ add(s2, s2, s1); 3381 __ add(s1, s1, temp2); 3382 __ ubfx(temp2, temp0, 40, 8); 3383 __ add(s2, s2, s1); 3384 __ add(s1, s1, temp2); 3385 __ ubfx(temp2, temp0, 48, 8); 3386 __ add(s2, s2, s1); 3387 __ add(s1, s1, temp2); 3388 __ add(s2, s2, s1); 3389 __ add(s1, s1, temp0, Assembler::LSR, 56); 3390 __ add(s2, s2, s1); 3391 3392 __ add(s1, s1, temp1, ext::uxtb); 3393 __ ubfx(temp2, temp1, 8, 8); 3394 __ add(s2, s2, s1); 3395 __ add(s1, s1, temp2); 3396 __ ubfx(temp2, temp1, 16, 8); 3397 __ add(s2, s2, s1); 3398 __ add(s1, s1, temp2); 3399 __ ubfx(temp2, temp1, 24, 8); 3400 __ add(s2, s2, s1); 3401 __ add(s1, s1, temp2); 3402 __ ubfx(temp2, temp1, 32, 8); 3403 __ add(s2, s2, s1); 3404 __ add(s1, s1, temp2); 3405 __ ubfx(temp2, temp1, 40, 8); 3406 __ add(s2, s2, s1); 3407 __ add(s1, s1, temp2); 3408 __ ubfx(temp2, temp1, 48, 8); 3409 __ add(s2, s2, s1); 3410 __ add(s1, s1, temp2); 3411 __ add(s2, s2, s1); 3412 __ add(s1, s1, temp1, Assembler::LSR, 56); 3413 __ add(s2, s2, s1); 3414 3415 __ subs(count, count, 16); 3416 __ br(Assembler::HS, L_nmax_loop); 3417 3418 // s1 = s1 % BASE 3419 __ lsr(temp0, s1, 16); 3420 __ lsl(temp1, temp0, 4); 3421 __ sub(temp1, temp1, temp0); 3422 __ add(temp1, temp1, s1, ext::uxth); 3423 3424 __ lsr(temp0, temp1, 16); 3425 __ lsl(s1, temp0, 4); 3426 __ sub(s1, s1, temp0); 3427 __ add(s1, s1, temp1, ext:: uxth); 3428 3429 __ subs(temp0, s1, base); 3430 __ csel(s1, temp0, s1, Assembler::HS); 3431 3432 // s2 = s2 % BASE 3433 __ lsr(temp0, s2, 16); 3434 __ lsl(temp1, temp0, 4); 3435 __ sub(temp1, temp1, temp0); 3436 __ add(temp1, temp1, s2, ext::uxth); 3437 3438 __ lsr(temp0, temp1, 16); 
3439 __ lsl(s2, temp0, 4);
3440 __ sub(s2, s2, temp0);
3441 __ add(s2, s2, temp1, ext::uxth);
3442
3443 __ subs(temp0, s2, base);
3444 __ csel(s2, temp0, s2, Assembler::HS);
3445
3446 __ subs(len, len, nmax);
3447 __ sub(count, nmax, 16);
3448 __ br(Assembler::HS, L_nmax_loop);
3449
3450 __ bind(L_by16);
3451 __ adds(len, len, count);
3452 __ br(Assembler::LO, L_by1);
3453
3454 __ bind(L_by16_loop);
3455
3456 __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3457
3458 __ add(s1, s1, temp0, ext::uxtb);
3459 __ ubfx(temp2, temp0, 8, 8);
3460 __ add(s2, s2, s1);
3461 __ add(s1, s1, temp2);
3462 __ ubfx(temp2, temp0, 16, 8);
3463 __ add(s2, s2, s1);
3464 __ add(s1, s1, temp2);
3465 __ ubfx(temp2, temp0, 24, 8);
3466 __ add(s2, s2, s1);
3467 __ add(s1, s1, temp2);
3468 __ ubfx(temp2, temp0, 32, 8);
3469 __ add(s2, s2, s1);
3470 __ add(s1, s1, temp2);
3471 __ ubfx(temp2, temp0, 40, 8);
3472 __ add(s2, s2, s1);
3473 __ add(s1, s1, temp2);
3474 __ ubfx(temp2, temp0, 48, 8);
3475 __ add(s2, s2, s1);
3476 __ add(s1, s1, temp2);
3477 __ add(s2, s2, s1);
3478 __ add(s1, s1, temp0, Assembler::LSR, 56);
3479 __ add(s2, s2, s1);
3480
3481 __ add(s1, s1, temp1, ext::uxtb);
3482 __ ubfx(temp2, temp1, 8, 8);
3483 __ add(s2, s2, s1);
3484 __ add(s1, s1, temp2);
3485 __ ubfx(temp2, temp1, 16, 8);
3486 __ add(s2, s2, s1);
3487 __ add(s1, s1, temp2);
3488 __ ubfx(temp2, temp1, 24, 8);
3489 __ add(s2, s2, s1);
3490 __ add(s1, s1, temp2);
3491 __ ubfx(temp2, temp1, 32, 8);
3492 __ add(s2, s2, s1);
3493 __ add(s1, s1, temp2);
3494 __ ubfx(temp2, temp1, 40, 8);
3495 __ add(s2, s2, s1);
3496 __ add(s1, s1, temp2);
3497 __ ubfx(temp2, temp1, 48, 8);
3498 __ add(s2, s2, s1);
3499 __ add(s1, s1, temp2);
3500 __ add(s2, s2, s1);
3501 __ add(s1, s1, temp1, Assembler::LSR, 56);
3502 __ add(s2, s2, s1);
3503
3504 __ subs(len, len, 16);
3505 __ br(Assembler::HS, L_by16_loop);
3506
3507 __ bind(L_by1);
3508 __ adds(len, len, 15);
3509 __ br(Assembler::LO, L_do_mod);
3510
3511 __ bind(L_by1_loop);
3512 __ ldrb(temp0, Address(__ post(buff, 1)));
3513 __ add(s1, temp0, s1);
3514 __ add(s2, s2, s1);
3515 __ subs(len, len, 1);
3516 __ br(Assembler::HS, L_by1_loop);
3517
3518 __ bind(L_do_mod);
3519 // s1 = s1 % BASE
3520 __ lsr(temp0, s1, 16);
3521 __ lsl(temp1, temp0, 4);
3522 __ sub(temp1, temp1, temp0);
3523 __ add(temp1, temp1, s1, ext::uxth);
3524
3525 __ lsr(temp0, temp1, 16);
3526 __ lsl(s1, temp0, 4);
3527 __ sub(s1, s1, temp0);
3528 __ add(s1, s1, temp1, ext::uxth);
3529
3530 __ subs(temp0, s1, base);
3531 __ csel(s1, temp0, s1, Assembler::HS);
3532
3533 // s2 = s2 % BASE
3534 __ lsr(temp0, s2, 16);
3535 __ lsl(temp1, temp0, 4);
3536 __ sub(temp1, temp1, temp0);
3537 __ add(temp1, temp1, s2, ext::uxth);
3538
3539 __ lsr(temp0, temp1, 16);
3540 __ lsl(s2, temp0, 4);
3541 __ sub(s2, s2, temp0);
3542 __ add(s2, s2, temp1, ext::uxth);
3543
3544 __ subs(temp0, s2, base);
3545 __ csel(s2, temp0, s2, Assembler::HS);
3546
3547 // Combine lower bits and higher bits
3548 __ bind(L_combine);
3549 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3550
3551 __ ret(lr);
3552
3553 return start;
3554 }
3555
3556 /**
3557 * Arguments:
3558 *
3559 * Input:
3560 * c_rarg0 - x address
3561 * c_rarg1 - x length
3562 * c_rarg2 - y address
3563 * c_rarg3 - y length
3564 * c_rarg4 - z address
3565 * c_rarg5 - z length
3566 */
3567 address generate_multiplyToLen() {
3568 __ align(CodeEntryAlignment);
3569 StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3570
3571 address start = __ pc();
3572 const Register x = r0;
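// (On AArch64, c_rarg0..c_rarg5 are r0..r5, so the aliases here simply name
// the incoming arguments in order.)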
3573 const Register xlen = r1;
3574 const Register y = r2;
3575 const Register ylen = r3;
3576 const Register z = r4;
3577 const Register zlen = r5;
3578
3579 const Register tmp1 = r10;
3580 const Register tmp2 = r11;
3581 const Register tmp3 = r12;
3582 const Register tmp4 = r13;
3583 const Register tmp5 = r14;
3584 const Register tmp6 = r15;
3585 const Register tmp7 = r16;
3586
3587 BLOCK_COMMENT("Entry:");
3588 __ enter(); // required for proper stackwalking of RuntimeStub frame
3589 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3590 __ leave(); // required for proper stackwalking of RuntimeStub frame
3591 __ ret(lr);
3592
3593 return start;
3594 }
3595
3596 address generate_squareToLen() {
3597 // The squareToLen algorithm (for sizes 1..127) described in the Java code
3598 // is faster than multiply_to_len on some CPUs and slower on others;
3599 // overall, multiply_to_len gives slightly better results.
3600 __ align(CodeEntryAlignment);
3601 StubCodeMark mark(this, "StubRoutines", "squareToLen");
3602 address start = __ pc();
3603
3604 const Register x = r0;
3605 const Register xlen = r1;
3606 const Register z = r2;
3607 const Register zlen = r3;
3608 const Register y = r4; // == x
3609 const Register ylen = r5; // == xlen
3610
3611 const Register tmp1 = r10;
3612 const Register tmp2 = r11;
3613 const Register tmp3 = r12;
3614 const Register tmp4 = r13;
3615 const Register tmp5 = r14;
3616 const Register tmp6 = r15;
3617 const Register tmp7 = r16;
3618
3619 RegSet spilled_regs = RegSet::of(y, ylen);
3620 BLOCK_COMMENT("Entry:");
3621 __ enter();
3622 __ push(spilled_regs, sp);
3623 __ mov(y, x);
3624 __ mov(ylen, xlen);
3625 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3626 __ pop(spilled_regs, sp);
3627 __ leave();
3628 __ ret(lr);
3629 return start;
3630 }
3631
3632 address generate_mulAdd() {
3633 __ align(CodeEntryAlignment);
3634 StubCodeMark mark(this, "StubRoutines", "mulAdd");
3635
3636 address start = __ pc();
3637
3638 const Register out = r0;
3639 const Register in = r1;
3640 const Register offset = r2;
3641 const Register len = r3;
3642 const Register k = r4;
3643
3644 BLOCK_COMMENT("Entry:");
3645 __ enter();
3646 __ mul_add(out, in, offset, len, k);
3647 __ leave();
3648 __ ret(lr);
3649
3650 return start;
3651 }
3652
3653 void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3654 FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3655 FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3656 // Karatsuba multiplication performs a 128*128 -> 256-bit
3657 // multiplication in three 128-bit multiplications and a few
3658 // additions.
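// With the 128-bit operands split into 64-bit halves, A = A1:A0 and
// B = B1:B0, and with '+' denoting carry-less addition (XOR):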
3659 //
3660 // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3661 // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3662 //
3663 // Inputs:
3664 //
3665 // A0 in a.d[0] (subkey)
3666 // A1 in a.d[1]
3667 // (A1+A0) in a1_xor_a0.d[0]
3668 //
3669 // B0 in b.d[0] (state)
3670 // B1 in b.d[1]
3671
3672 __ ext(tmp1, __ T16B, b, b, 0x08);
3673 __ pmull2(result_hi, __ T1Q, b, a, __ T2D); // A1*B1
3674 __ eor(tmp1, __ T16B, tmp1, b); // (B1+B0)
3675 __ pmull(result_lo, __ T1Q, b, a, __ T1D); // A0*B0
3676 __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3677
3678 __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3679 __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3680 __ eor(tmp2, __ T16B, tmp2, tmp4);
3681 __ eor(tmp2, __ T16B, tmp2, tmp3);
3682
3683 // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3684 __ ins(result_hi, __ D, tmp2, 0, 1);
3685 __ ins(result_lo, __ D, tmp2, 1, 0);
3686 }
3687
3688 void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3689 FloatRegister p, FloatRegister z, FloatRegister t1) {
3690 const FloatRegister t0 = result;
3691
3692 // The GCM field polynomial f is z^128 + p(z), where p =
3693 // z^7+z^2+z+1.
3694 //
3695 // z^128 === -p(z) (mod (z^128 + p(z)))
3696 //
3697 // so, given that the product we're reducing is
3698 // a == lo + hi * z^128
3699 // substituting,
3700 // === lo - hi * p(z) (mod (z^128 + p(z)))
3701 //
3702 // we reduce by multiplying hi by p(z) and subtracting the result
3703 // from (i.e. XORing it with) lo. Because p has no nonzero high
3704 // bits we can do this with two 64-bit multiplications, lo*p and
3705 // hi*p.
3706
3707 __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3708 __ ext(t1, __ T16B, t0, z, 8);
3709 __ eor(hi, __ T16B, hi, t1);
3710 __ ext(t1, __ T16B, z, t0, 8);
3711 __ eor(lo, __ T16B, lo, t1);
3712 __ pmull(t0, __ T1Q, hi, p, __ T1D);
3713 __ eor(result, __ T16B, lo, t0);
3714 }
3715
3716 address generate_has_negatives(address &has_negatives_long) {
3717 StubCodeMark mark(this, "StubRoutines", "has_negatives");
3718 const int large_loop_size = 64;
3719 const uint64_t UPPER_BIT_MASK = 0x8080808080808080;
3720 int dcache_line = VM_Version::dcache_line_size();
3721
3722 Register ary1 = r1, len = r2, result = r0;
3723
3724 __ align(CodeEntryAlignment);
3725 address entry = __ pc();
3726
3727 __ enter();
3728
3729 Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
3730 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
3731
3732 __ cmp(len, 15);
3733 __ br(Assembler::GT, LEN_OVER_15);
3734 // The only case in which execution falls through to this code is when the
3735 // pointer is near the end of a memory page and we must avoid reading past it
3736 __ add(ary1, ary1, len);
3737 __ subs(len, len, 8);
3738 __ br(Assembler::GT, LEN_OVER_8);
3739 __ ldr(rscratch2, Address(ary1, -8));
3740 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes.
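// Here len = original_len - 8 <= 0, so rscratch1 = (8 - original_len) * 8.
// The variable shift below discards the low-order bytes of rscratch2 that
// were loaded from before the array start (loads are little-endian), so
// only the valid tail bytes are tested against UPPER_BIT_MASK.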
3741 __ lsrv(rscratch2, rscratch2, rscratch1);
3742 __ tst(rscratch2, UPPER_BIT_MASK);
3743 __ cset(result, Assembler::NE);
3744 __ leave();
3745 __ ret(lr);
3746 __ bind(LEN_OVER_8);
3747 __ ldp(rscratch1, rscratch2, Address(ary1, -16));
3748 __ sub(len, len, 8); // no data dependency, so the sub can execute while the load is in flight
3749 __ tst(rscratch2, UPPER_BIT_MASK);
3750 __ br(Assembler::NE, RET_TRUE_NO_POP);
3751 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
3752 __ lsrv(rscratch1, rscratch1, rscratch2);
3753 __ tst(rscratch1, UPPER_BIT_MASK);
3754 __ cset(result, Assembler::NE);
3755 __ leave();
3756 __ ret(lr);
3757
3758 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
3759 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
3760
3761 has_negatives_long = __ pc(); // 2nd entry point
3762
3763 __ enter();
3764
3765 __ bind(LEN_OVER_15);
3766 __ push(spilled_regs, sp);
3767 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
3768 __ cbz(rscratch2, ALIGNED);
3769 __ ldp(tmp6, tmp1, Address(ary1));
3770 __ mov(tmp5, 16);
3771 __ sub(rscratch1, tmp5, rscratch2); // number of bytes until the aligned address
3772 __ add(ary1, ary1, rscratch1);
3773 __ sub(len, len, rscratch1);
3774 __ orr(tmp6, tmp6, tmp1);
3775 __ tst(tmp6, UPPER_BIT_MASK);
3776 __ br(Assembler::NE, RET_TRUE);
3777
3778 __ bind(ALIGNED);
3779 __ cmp(len, large_loop_size);
3780 __ br(Assembler::LT, CHECK_16);
3781 // Perform a 16-byte load with early return here in the pre-loop: if an
3782 // initially aligned large array has negative values in its leading bytes,
3783 // LARGE_LOOP would do up to 4 reads instead of 1 before noticing, which is
3784 // slower. Cases with negative bytes further ahead are barely affected; in
3785 // fact they get faster, thanks to the early loads, fewer instructions and
3786 // fewer branches in LARGE_LOOP.
3787 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
3788 __ sub(len, len, 16);
3789 __ orr(tmp6, tmp6, tmp1);
3790 __ tst(tmp6, UPPER_BIT_MASK);
3791 __ br(Assembler::NE, RET_TRUE);
3792 __ cmp(len, large_loop_size);
3793 __ br(Assembler::LT, CHECK_16);
3794
3795 if (SoftwarePrefetchHintDistance >= 0
3796 && SoftwarePrefetchHintDistance >= dcache_line) {
3797 // initial prefetch
3798 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
3799 }
3800 __ bind(LARGE_LOOP);
3801 if (SoftwarePrefetchHintDistance >= 0) {
3802 __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
3803 }
3804 // Issue the load instructions first, since that can save a few CPU/memory
3805 // cycles. Also, instead of an "orr; tst; branch" triple after each ldp, we
3806 // generate 7 orr(...) followed by a single tst(...) and branch, which means
3807 // fewer instructions and fewer branches per 64-byte block; the cost is that
3808 // early return is disabled, so all 64 bytes are loaded and checked every time.
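// In C, one iteration of LARGE_LOOP is approximately (a sketch; prefetch
// omitted and ary1 treated as a uint64_t pointer):
//
//   uint64_t w0 = ary1[0], w1 = ary1[1], w2 = ary1[2], w3 = ary1[3];
//   uint64_t w4 = ary1[4], w5 = ary1[5], w6 = ary1[6], w7 = ary1[7];
//   ary1 += 8; len -= 64;
//   uint64_t m = (w0 | w1) | (w2 | w3) | (w4 | w5) | (w6 | w7);
//   if (m & UPPER_BIT_MASK) return true;  // some byte has its sign bit set
//   if (len >= 64) goto LARGE_LOOP;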
3809 __ ldp(tmp2, tmp3, Address(ary1)); 3810 __ ldp(tmp4, tmp5, Address(ary1, 16)); 3811 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 3812 __ ldp(tmp6, tmp1, Address(ary1, 48)); 3813 __ add(ary1, ary1, large_loop_size); 3814 __ sub(len, len, large_loop_size); 3815 __ orr(tmp2, tmp2, tmp3); 3816 __ orr(tmp4, tmp4, tmp5); 3817 __ orr(rscratch1, rscratch1, rscratch2); 3818 __ orr(tmp6, tmp6, tmp1); 3819 __ orr(tmp2, tmp2, tmp4); 3820 __ orr(rscratch1, rscratch1, tmp6); 3821 __ orr(tmp2, tmp2, rscratch1); 3822 __ tst(tmp2, UPPER_BIT_MASK); 3823 __ br(Assembler::NE, RET_TRUE); 3824 __ cmp(len, large_loop_size); 3825 __ br(Assembler::GE, LARGE_LOOP); 3826 3827 __ bind(CHECK_16); // small 16-byte load pre-loop 3828 __ cmp(len, 16); 3829 __ br(Assembler::LT, POST_LOOP16); 3830 3831 __ bind(LOOP16); // small 16-byte load loop 3832 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 3833 __ sub(len, len, 16); 3834 __ orr(tmp2, tmp2, tmp3); 3835 __ tst(tmp2, UPPER_BIT_MASK); 3836 __ br(Assembler::NE, RET_TRUE); 3837 __ cmp(len, 16); 3838 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 3839 3840 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 3841 __ cmp(len, 8); 3842 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 3843 __ ldr(tmp3, Address(__ post(ary1, 8))); 3844 __ sub(len, len, 8); 3845 __ tst(tmp3, UPPER_BIT_MASK); 3846 __ br(Assembler::NE, RET_TRUE); 3847 3848 __ bind(POST_LOOP16_LOAD_TAIL); 3849 __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0 3850 __ ldr(tmp1, Address(ary1)); 3851 __ mov(tmp2, 64); 3852 __ sub(tmp4, tmp2, len, __ LSL, 3); 3853 __ lslv(tmp1, tmp1, tmp4); 3854 __ tst(tmp1, UPPER_BIT_MASK); 3855 __ br(Assembler::NE, RET_TRUE); 3856 // Fallthrough 3857 3858 __ bind(RET_FALSE); 3859 __ pop(spilled_regs, sp); 3860 __ leave(); 3861 __ mov(result, zr); 3862 __ ret(lr); 3863 3864 __ bind(RET_TRUE); 3865 __ pop(spilled_regs, sp); 3866 __ bind(RET_TRUE_NO_POP); 3867 __ leave(); 3868 __ mov(result, 1); 3869 __ ret(lr); 3870 3871 __ bind(DONE); 3872 __ pop(spilled_regs, sp); 3873 __ leave(); 3874 __ ret(lr); 3875 return entry; 3876 } 3877 3878 void generate_large_array_equals_loop_nonsimd(int loopThreshold, 3879 bool usePrefetch, Label &NOT_EQUAL) { 3880 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 3881 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 3882 tmp7 = r12, tmp8 = r13; 3883 Label LOOP; 3884 3885 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3886 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3887 __ bind(LOOP); 3888 if (usePrefetch) { 3889 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 3890 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 3891 } 3892 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 3893 __ eor(tmp1, tmp1, tmp2); 3894 __ eor(tmp3, tmp3, tmp4); 3895 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 3896 __ orr(tmp1, tmp1, tmp3); 3897 __ cbnz(tmp1, NOT_EQUAL); 3898 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3899 __ eor(tmp5, tmp5, tmp6); 3900 __ eor(tmp7, tmp7, tmp8); 3901 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3902 __ orr(tmp5, tmp5, tmp7); 3903 __ cbnz(tmp5, NOT_EQUAL); 3904 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 3905 __ eor(tmp1, tmp1, tmp2); 3906 __ eor(tmp3, tmp3, tmp4); 3907 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 3908 __ orr(tmp1, tmp1, tmp3); 3909 __ cbnz(tmp1, NOT_EQUAL); 3910 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3911 __ eor(tmp5, tmp5, tmp6); 3912 __ 
sub(cnt1, cnt1, 8 * wordSize);
3913 __ eor(tmp7, tmp7, tmp8);
3914 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3915 // tmp6 is not used. MacroAssembler::subs is used here (rather than
3916 // cmp) because subs allows an unlimited range of immediate operands.
3917 __ subs(tmp6, cnt1, loopThreshold);
3918 __ orr(tmp5, tmp5, tmp7);
3919 __ cbnz(tmp5, NOT_EQUAL);
3920 __ br(__ GE, LOOP);
3921 // post-loop
3922 __ eor(tmp1, tmp1, tmp2);
3923 __ eor(tmp3, tmp3, tmp4);
3924 __ orr(tmp1, tmp1, tmp3);
3925 __ sub(cnt1, cnt1, 2 * wordSize);
3926 __ cbnz(tmp1, NOT_EQUAL);
3927 }
3928
3929 void generate_large_array_equals_loop_simd(int loopThreshold,
3930 bool usePrefetch, Label &NOT_EQUAL) {
3931 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3932 tmp2 = rscratch2;
3933 Label LOOP;
3934
3935 __ bind(LOOP);
3936 if (usePrefetch) {
3937 __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3938 __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3939 }
3940 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
3941 __ sub(cnt1, cnt1, 8 * wordSize);
3942 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
3943 __ subs(tmp1, cnt1, loopThreshold);
3944 __ eor(v0, __ T16B, v0, v4);
3945 __ eor(v1, __ T16B, v1, v5);
3946 __ eor(v2, __ T16B, v2, v6);
3947 __ eor(v3, __ T16B, v3, v7);
3948 __ orr(v0, __ T16B, v0, v1);
3949 __ orr(v1, __ T16B, v2, v3);
3950 __ orr(v0, __ T16B, v0, v1);
3951 __ umov(tmp1, v0, __ D, 0);
3952 __ umov(tmp2, v0, __ D, 1);
3953 __ orr(tmp1, tmp1, tmp2);
3954 __ cbnz(tmp1, NOT_EQUAL);
3955 __ br(__ GE, LOOP);
3956 }
3957
3958 // a1 = r1 - array1 address
3959 // a2 = r2 - array2 address
3960 // result = r0 - return value. Already contains "false"
3961 // cnt1 = r10 - number of elements left to check, reduced by wordSize
3962 // r3-r5 are reserved temporary registers
3963 address generate_large_array_equals() {
3964 StubCodeMark mark(this, "StubRoutines", "large_array_equals");
3965 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3966 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3967 tmp7 = r12, tmp8 = r13;
3968 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
3969 SMALL_LOOP, POST_LOOP;
3970 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
3971 // calculate whether at least 32 prefetched bytes will be used
3972 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
3973 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
3974 RegSet spilled_regs = RegSet::range(tmp6, tmp8);
3975 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
3976 tmp5, tmp6, tmp7, tmp8);
3977
3978 __ align(CodeEntryAlignment);
3979 address entry = __ pc();
3980 __ enter();
3981 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub
3982 // also advance pointers to use post-increment instead of pre-increment
3983 __ add(a1, a1, wordSize);
3984 __ add(a2, a2, wordSize);
3985 if (AvoidUnalignedAccesses) {
3986 // Both implementations (SIMD/non-SIMD) use relatively wide load
3987 // instructions (ld1/ldp), which incur a large penalty (up to 2x execution
3988 // time) on some CPUs when the address is not at least 16-byte aligned.
3989 // Arrays are currently 8-byte aligned, so if necessary we do one extra
3990 // 8-byte load to make at least the first array's address 16-byte aligned.
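// In C, the alignment pre-step below is approximately (a sketch; a1 and a2
// are treated as uint64_t pointers, cnt1 as a byte count):
//
//   if (((uintptr_t)a1 & 8) != 0) {      // 8- but not 16-byte aligned
//     if (*a1++ != *a2++) goto NOT_EQUAL_NO_POP;
//     cnt1 -= wordSize;
//   }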
3991 Label ALIGNED16;
3992 __ tbz(a1, 3, ALIGNED16);
3993 __ ldr(tmp1, Address(__ post(a1, wordSize)));
3994 __ ldr(tmp2, Address(__ post(a2, wordSize)));
3995 __ sub(cnt1, cnt1, wordSize);
3996 __ eor(tmp1, tmp1, tmp2);
3997 __ cbnz(tmp1, NOT_EQUAL_NO_POP);
3998 __ bind(ALIGNED16);
3999 }
4000 if (UseSIMDForArrayEquals) {
4001 if (SoftwarePrefetchHintDistance >= 0) {
4002 __ subs(tmp1, cnt1, prefetchLoopThreshold);
4003 __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
4004 generate_large_array_equals_loop_simd(prefetchLoopThreshold,
4005 /* prfm = */ true, NOT_EQUAL);
4006 __ cmp(cnt1, nonPrefetchLoopThreshold);
4007 __ br(__ LT, TAIL);
4008 }
4009 __ bind(NO_PREFETCH_LARGE_LOOP);
4010 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
4011 /* prfm = */ false, NOT_EQUAL);
4012 } else {
4013 __ push(spilled_regs, sp);
4014 if (SoftwarePrefetchHintDistance >= 0) {
4015 __ subs(tmp1, cnt1, prefetchLoopThreshold);
4016 __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
4017 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
4018 /* prfm = */ true, NOT_EQUAL);
4019 __ cmp(cnt1, nonPrefetchLoopThreshold);
4020 __ br(__ LT, TAIL);
4021 }
4022 __ bind(NO_PREFETCH_LARGE_LOOP);
4023 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
4024 /* prfm = */ false, NOT_EQUAL);
4025 }
4026 __ bind(TAIL);
4027 __ cbz(cnt1, EQUAL);
4028 __ subs(cnt1, cnt1, wordSize);
4029 __ br(__ LE, POST_LOOP);
4030 __ bind(SMALL_LOOP);
4031 __ ldr(tmp1, Address(__ post(a1, wordSize)));
4032 __ ldr(tmp2, Address(__ post(a2, wordSize)));
4033 __ subs(cnt1, cnt1, wordSize);
4034 __ eor(tmp1, tmp1, tmp2);
4035 __ cbnz(tmp1, NOT_EQUAL);
4036 __ br(__ GT, SMALL_LOOP);
4037 __ bind(POST_LOOP);
4038 __ ldr(tmp1, Address(a1, cnt1));
4039 __ ldr(tmp2, Address(a2, cnt1));
4040 __ eor(tmp1, tmp1, tmp2);
4041 __ cbnz(tmp1, NOT_EQUAL);
4042 __ bind(EQUAL);
4043 __ mov(result, true);
4044 __ bind(NOT_EQUAL);
4045 if (!UseSIMDForArrayEquals) {
4046 __ pop(spilled_regs, sp);
4047 }
4048 __ bind(NOT_EQUAL_NO_POP);
4049 __ leave();
4050 __ ret(lr);
4051 return entry;
4052 }
4053
4054
4055 /**
4056 * Arguments:
4057 *
4058 * Input:
4059 * c_rarg0 - current state address
4060 * c_rarg1 - H key address
4061 * c_rarg2 - data address
4062 * c_rarg3 - number of blocks
4063 *
4064 * Output:
4065 * Updated state at c_rarg0
4066 */
4067 address generate_ghash_processBlocks() {
4068 // Bafflingly, GCM uses little-endian for the byte order, but
4069 // big-endian for the bit order. For example, the polynomial 1 is
4070 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
4071 //
4072 // So, we must either reverse the bytes in each word and do
4073 // everything big-endian or reverse the bits in each byte and do
4074 // it little-endian. On AArch64 it's more idiomatic to reverse
4075 // the bits in each byte (we have an instruction, RBIT, to do
4076 // that) and keep the data in little-endian bit order throughout the
4077 // calculation, bit-reversing the inputs and outputs.
4078
4079 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4080 __ align(wordSize * 2);
4081 address p = __ pc();
4082 __ emit_int64(0x87); // The low-order bits of the field
4083 // polynomial (i.e.
p = z^7+z^2+z+1) 4084 // repeated in the low and high parts of a 4085 // 128-bit vector 4086 __ emit_int64(0x87); 4087 4088 __ align(CodeEntryAlignment); 4089 address start = __ pc(); 4090 4091 Register state = c_rarg0; 4092 Register subkeyH = c_rarg1; 4093 Register data = c_rarg2; 4094 Register blocks = c_rarg3; 4095 4096 FloatRegister vzr = v30; 4097 __ eor(vzr, __ T16B, vzr, vzr); // zero register 4098 4099 __ ldrq(v0, Address(state)); 4100 __ ldrq(v1, Address(subkeyH)); 4101 4102 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 4103 __ rbit(v0, __ T16B, v0); 4104 __ rev64(v1, __ T16B, v1); 4105 __ rbit(v1, __ T16B, v1); 4106 4107 __ ldrq(v26, p); 4108 4109 __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 4110 __ eor(v16, __ T16B, v16, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 4111 4112 { 4113 Label L_ghash_loop; 4114 __ bind(L_ghash_loop); 4115 4116 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 4117 // reversing each byte 4118 __ rbit(v2, __ T16B, v2); 4119 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 4120 4121 // Multiply state in v2 by subkey in v1 4122 ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 4123 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16, 4124 /*temps*/v6, v20, v18, v21); 4125 // Reduce v7:v5 by the field polynomial 4126 ghash_reduce(v0, v5, v7, v26, vzr, v20); 4127 4128 __ sub(blocks, blocks, 1); 4129 __ cbnz(blocks, L_ghash_loop); 4130 } 4131 4132 // The bit-reversed result is at this point in v0 4133 __ rev64(v1, __ T16B, v0); 4134 __ rbit(v1, __ T16B, v1); 4135 4136 __ st1(v1, __ T16B, state); 4137 __ ret(lr); 4138 4139 return start; 4140 } 4141 4142 // Continuation point for throwing of implicit exceptions that are 4143 // not handled in the current activation. Fabricates an exception 4144 // oop and initiates normal exception dispatching in this 4145 // frame. Since we need to preserve callee-saved values (currently 4146 // only for C2, but done for C1 as well) we need a callee-saved oop 4147 // map and therefore have to make these stubs into RuntimeStubs 4148 // rather than BufferBlobs. If the compiler needs all registers to 4149 // be preserved between the fault point and the exception handler 4150 // then it must assume responsibility for that in 4151 // AbstractCompiler::continuation_for_implicit_null_exception or 4152 // continuation_for_implicit_division_by_zero_exception. All other 4153 // implicit exceptions (e.g., NullPointerException or 4154 // AbstractMethodError on entry) are either at call sites or 4155 // otherwise assume that stack unwinding will be initiated, so 4156 // caller saved registers were assumed volatile in the compiler. 4157 4158 #undef __ 4159 #define __ masm-> 4160 4161 address generate_throw_exception(const char* name, 4162 address runtime_entry, 4163 Register arg1 = noreg, 4164 Register arg2 = noreg) { 4165 // Information about frame layout at time of blocking runtime call. 4166 // Note that we only have to preserve callee-saved registers since 4167 // the compilers are responsible for supplying a continuation point 4168 // if they expect all registers to be preserved. 4169 // n.b. 
aarch64 asserts that frame::arg_reg_save_area_bytes == 0
4170 enum layout {
4171 rfp_off = 0,
4172 rfp_off2,
4173 return_off,
4174 return_off2,
4175 framesize // inclusive of return address
4176 };
4177
4178 int insts_size = 512;
4179 int locs_size = 64;
4180
4181 CodeBuffer code(name, insts_size, locs_size);
4182 OopMapSet* oop_maps = new OopMapSet();
4183 MacroAssembler* masm = new MacroAssembler(&code);
4184
4185 address start = __ pc();
4186
4187 // This is an inlined and slightly modified version of call_VM
4188 // which has the ability to fetch the return PC out of
4189 // thread-local storage and also sets up last_Java_sp slightly
4190 // differently than the real call_VM
4191
4192 __ enter(); // Save FP and LR before call
4193
4194 assert(is_even(framesize/2), "sp not 16-byte aligned");
4195
4196 // lr and fp are already in place
4197 __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
4198
4199 int frame_complete = __ pc() - start;
4200
4201 // Set up last_Java_sp and last_Java_fp
4202 address the_pc = __ pc();
4203 __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1);
4204
4205 // Call runtime
4206 if (arg1 != noreg) {
4207 assert(arg2 != c_rarg1, "clobbered");
4208 __ mov(c_rarg1, arg1);
4209 }
4210 if (arg2 != noreg) {
4211 __ mov(c_rarg2, arg2);
4212 }
4213 __ mov(c_rarg0, rthread);
4214 BLOCK_COMMENT("call runtime_entry");
4215 __ mov(rscratch1, runtime_entry);
4216 __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);
4217
4218 // Generate oop map
4219 OopMap* map = new OopMap(framesize, 0);
4220
4221 oop_maps->add_gc_map(the_pc - start, map);
4222
4223 __ reset_last_Java_frame(true);
4224 __ maybe_isb();
4225
4226 __ leave();
4227
4228 // check for pending exceptions
4229 #ifdef ASSERT
4230 Label L;
4231 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
4232 __ cbnz(rscratch1, L);
4233 __ should_not_reach_here();
4234 __ bind(L);
4235 #endif // ASSERT
4236 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
4237
4238
4239 // codeBlob framesize is in words (not VMRegImpl::slot_size)
4240 RuntimeStub* stub =
4241 RuntimeStub::new_runtime_stub(name,
4242 &code,
4243 frame_complete,
4244 (framesize >> (LogBytesPerWord - LogBytesPerInt)),
4245 oop_maps, false);
4246 return stub->entry_point();
4247 }
4248
4249 class MontgomeryMultiplyGenerator : public MacroAssembler {
4250
4251 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
4252 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
4253
4254 RegSet _toSave;
4255 bool _squaring;
4256
4257 public:
4258 MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
4259 : MacroAssembler(as->code()), _squaring(squaring) {
4260
4261 // Register allocation
4262
4263 Register reg = c_rarg0;
4264 Pa_base = reg; // Argument registers
4265 if (squaring)
4266 Pb_base = Pa_base;
4267 else
4268 Pb_base = ++reg;
4269 Pn_base = ++reg;
4270 Rlen = ++reg;
4271 inv = ++reg;
4272 Pm_base = ++reg;
4273
4274 // Working registers:
4275 Ra = ++reg; // The current digit of a, b, n, and m.
4276 Rb = ++reg;
4277 Rm = ++reg;
4278 Rn = ++reg;
4279
4280 Pa = ++reg; // Pointers to the current/next digit of a, b, n, and m.
4281 Pb = ++reg;
4282 Pm = ++reg;
4283 Pn = ++reg;
4284
4285 t0 = ++reg; // Three registers which form a
4286 t1 = ++reg; // triple-precision accumulator.
4287 t2 = ++reg;
4288
4289 Ri = ++reg; // Inner and outer loop indexes.
4290 Rj = ++reg; 4291 4292 Rhi_ab = ++reg; // Product registers: low and high parts 4293 Rlo_ab = ++reg; // of a*b and m*n. 4294 Rhi_mn = ++reg; 4295 Rlo_mn = ++reg; 4296 4297 // r19 and up are callee-saved. 4298 _toSave = RegSet::range(r19, reg) + Pm_base; 4299 } 4300 4301 private: 4302 void save_regs() { 4303 push(_toSave, sp); 4304 } 4305 4306 void restore_regs() { 4307 pop(_toSave, sp); 4308 } 4309 4310 template <typename T> 4311 void unroll_2(Register count, T block) { 4312 Label loop, end, odd; 4313 tbnz(count, 0, odd); 4314 cbz(count, end); 4315 align(16); 4316 bind(loop); 4317 (this->*block)(); 4318 bind(odd); 4319 (this->*block)(); 4320 subs(count, count, 2); 4321 br(Assembler::GT, loop); 4322 bind(end); 4323 } 4324 4325 template <typename T> 4326 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 4327 Label loop, end, odd; 4328 tbnz(count, 0, odd); 4329 cbz(count, end); 4330 align(16); 4331 bind(loop); 4332 (this->*block)(d, s, tmp); 4333 bind(odd); 4334 (this->*block)(d, s, tmp); 4335 subs(count, count, 2); 4336 br(Assembler::GT, loop); 4337 bind(end); 4338 } 4339 4340 void pre1(RegisterOrConstant i) { 4341 block_comment("pre1"); 4342 // Pa = Pa_base; 4343 // Pb = Pb_base + i; 4344 // Pm = Pm_base; 4345 // Pn = Pn_base + i; 4346 // Ra = *Pa; 4347 // Rb = *Pb; 4348 // Rm = *Pm; 4349 // Rn = *Pn; 4350 ldr(Ra, Address(Pa_base)); 4351 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 4352 ldr(Rm, Address(Pm_base)); 4353 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4354 lea(Pa, Address(Pa_base)); 4355 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 4356 lea(Pm, Address(Pm_base)); 4357 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4358 4359 // Zero the m*n result. 4360 mov(Rhi_mn, zr); 4361 mov(Rlo_mn, zr); 4362 } 4363 4364 // The core multiply-accumulate step of a Montgomery 4365 // multiplication. The idea is to schedule operations as a 4366 // pipeline so that instructions with long latencies (loads and 4367 // multiplies) have time to complete before their results are 4368 // used. This most benefits in-order implementations of the 4369 // architecture but out-of-order ones also benefit. 4370 void step() { 4371 block_comment("step"); 4372 // MACC(Ra, Rb, t0, t1, t2); 4373 // Ra = *++Pa; 4374 // Rb = *--Pb; 4375 umulh(Rhi_ab, Ra, Rb); 4376 mul(Rlo_ab, Ra, Rb); 4377 ldr(Ra, pre(Pa, wordSize)); 4378 ldr(Rb, pre(Pb, -wordSize)); 4379 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 4380 // previous iteration. 
4381 // MACC(Rm, Rn, t0, t1, t2); 4382 // Rm = *++Pm; 4383 // Rn = *--Pn; 4384 umulh(Rhi_mn, Rm, Rn); 4385 mul(Rlo_mn, Rm, Rn); 4386 ldr(Rm, pre(Pm, wordSize)); 4387 ldr(Rn, pre(Pn, -wordSize)); 4388 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4389 } 4390 4391 void post1() { 4392 block_comment("post1"); 4393 4394 // MACC(Ra, Rb, t0, t1, t2); 4395 // Ra = *++Pa; 4396 // Rb = *--Pb; 4397 umulh(Rhi_ab, Ra, Rb); 4398 mul(Rlo_ab, Ra, Rb); 4399 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 4400 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4401 4402 // *Pm = Rm = t0 * inv; 4403 mul(Rm, t0, inv); 4404 str(Rm, Address(Pm)); 4405 4406 // MACC(Rm, Rn, t0, t1, t2); 4407 // t0 = t1; t1 = t2; t2 = 0; 4408 umulh(Rhi_mn, Rm, Rn); 4409 4410 #ifndef PRODUCT 4411 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 4412 { 4413 mul(Rlo_mn, Rm, Rn); 4414 add(Rlo_mn, t0, Rlo_mn); 4415 Label ok; 4416 cbz(Rlo_mn, ok); { 4417 stop("broken Montgomery multiply"); 4418 } bind(ok); 4419 } 4420 #endif 4421 // We have very carefully set things up so that 4422 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 4423 // the lower half of Rm * Rn because we know the result already: 4424 // it must be -t0. t0 + (-t0) must generate a carry iff 4425 // t0 != 0. So, rather than do a mul and an adds we just set 4426 // the carry flag iff t0 is nonzero. 4427 // 4428 // mul(Rlo_mn, Rm, Rn); 4429 // adds(zr, t0, Rlo_mn); 4430 subs(zr, t0, 1); // Set carry iff t0 is nonzero 4431 adcs(t0, t1, Rhi_mn); 4432 adc(t1, t2, zr); 4433 mov(t2, zr); 4434 } 4435 4436 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 4437 block_comment("pre2"); 4438 // Pa = Pa_base + i-len; 4439 // Pb = Pb_base + len; 4440 // Pm = Pm_base + i-len; 4441 // Pn = Pn_base + len; 4442 4443 if (i.is_register()) { 4444 sub(Rj, i.as_register(), len); 4445 } else { 4446 mov(Rj, i.as_constant()); 4447 sub(Rj, Rj, len); 4448 } 4449 // Rj == i-len 4450 4451 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 4452 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 4453 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 4454 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 4455 4456 // Ra = *++Pa; 4457 // Rb = *--Pb; 4458 // Rm = *++Pm; 4459 // Rn = *--Pn; 4460 ldr(Ra, pre(Pa, wordSize)); 4461 ldr(Rb, pre(Pb, -wordSize)); 4462 ldr(Rm, pre(Pm, wordSize)); 4463 ldr(Rn, pre(Pn, -wordSize)); 4464 4465 mov(Rhi_mn, zr); 4466 mov(Rlo_mn, zr); 4467 } 4468 4469 void post2(RegisterOrConstant i, RegisterOrConstant len) { 4470 block_comment("post2"); 4471 if (i.is_constant()) { 4472 mov(Rj, i.as_constant()-len.as_constant()); 4473 } else { 4474 sub(Rj, i.as_register(), len); 4475 } 4476 4477 adds(t0, t0, Rlo_mn); // The pending m*n, low part 4478 4479 // As soon as we know the least significant digit of our result, 4480 // store it. 4481 // Pm_base[i-len] = t0; 4482 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 4483 4484 // t0 = t1; t1 = t2; t2 = 0; 4485 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 4486 adc(t1, t2, zr); 4487 mov(t2, zr); 4488 } 4489 4490 // A carry in t0 after Montgomery multiplication means that we 4491 // should subtract multiples of n from our result in m. We'll 4492 // keep doing that until there is no carry. 
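// In C, approximately (a sketch that expands the sub() helper referenced
// below; "borrow" is the borrow out of the multi-precision m -= n):
//
//   while (t0) {
//     unsigned long borrow = 0;
//     for (int i = 0; i < len; i++) {
//       unsigned long m = Pm_base[i], n = Pn_base[i];
//       Pm_base[i] = m - n - borrow;
//       borrow = (m < n) || (m == n && borrow);
//     }
//     t0 -= borrow;
//   }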
4493 void normalize(RegisterOrConstant len) { 4494 block_comment("normalize"); 4495 // while (t0) 4496 // t0 = sub(Pm_base, Pn_base, t0, len); 4497 Label loop, post, again; 4498 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 4499 cbz(t0, post); { 4500 bind(again); { 4501 mov(i, zr); 4502 mov(cnt, len); 4503 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 4504 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4505 subs(zr, zr, zr); // set carry flag, i.e. no borrow 4506 align(16); 4507 bind(loop); { 4508 sbcs(Rm, Rm, Rn); 4509 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 4510 add(i, i, 1); 4511 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 4512 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4513 sub(cnt, cnt, 1); 4514 } cbnz(cnt, loop); 4515 sbc(t0, t0, zr); 4516 } cbnz(t0, again); 4517 } bind(post); 4518 } 4519 4520 // Move memory at s to d, reversing words. 4521 // Increments d to end of copied memory 4522 // Destroys tmp1, tmp2 4523 // Preserves len 4524 // Leaves s pointing to the address which was in d at start 4525 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 4526 assert(tmp1 < r19 && tmp2 < r19, "register corruption"); 4527 4528 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 4529 mov(tmp1, len); 4530 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 4531 sub(s, d, len, ext::uxtw, LogBytesPerWord); 4532 } 4533 // where 4534 void reverse1(Register d, Register s, Register tmp) { 4535 ldr(tmp, pre(s, -wordSize)); 4536 ror(tmp, tmp, 32); 4537 str(tmp, post(d, wordSize)); 4538 } 4539 4540 void step_squaring() { 4541 // An extra ACC 4542 step(); 4543 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4544 } 4545 4546 void last_squaring(RegisterOrConstant i) { 4547 Label dont; 4548 // if ((i & 1) == 0) { 4549 tbnz(i.as_register(), 0, dont); { 4550 // MACC(Ra, Rb, t0, t1, t2); 4551 // Ra = *++Pa; 4552 // Rb = *--Pb; 4553 umulh(Rhi_ab, Ra, Rb); 4554 mul(Rlo_ab, Ra, Rb); 4555 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4556 } bind(dont); 4557 } 4558 4559 void extra_step_squaring() { 4560 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 4561 4562 // MACC(Rm, Rn, t0, t1, t2); 4563 // Rm = *++Pm; 4564 // Rn = *--Pn; 4565 umulh(Rhi_mn, Rm, Rn); 4566 mul(Rlo_mn, Rm, Rn); 4567 ldr(Rm, pre(Pm, wordSize)); 4568 ldr(Rn, pre(Pn, -wordSize)); 4569 } 4570 4571 void post1_squaring() { 4572 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 4573 4574 // *Pm = Rm = t0 * inv; 4575 mul(Rm, t0, inv); 4576 str(Rm, Address(Pm)); 4577 4578 // MACC(Rm, Rn, t0, t1, t2); 4579 // t0 = t1; t1 = t2; t2 = 0; 4580 umulh(Rhi_mn, Rm, Rn); 4581 4582 #ifndef PRODUCT 4583 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 4584 { 4585 mul(Rlo_mn, Rm, Rn); 4586 add(Rlo_mn, t0, Rlo_mn); 4587 Label ok; 4588 cbz(Rlo_mn, ok); { 4589 stop("broken Montgomery multiply"); 4590 } bind(ok); 4591 } 4592 #endif 4593 // We have very carefully set things up so that 4594 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 4595 // the lower half of Rm * Rn because we know the result already: 4596 // it must be -t0. t0 + (-t0) must generate a carry iff 4597 // t0 != 0. So, rather than do a mul and an adds we just set 4598 // the carry flag iff t0 is nonzero. 
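// (subs zr, t0, #1 sets the carry flag precisely when the subtraction does
// not borrow, i.e. when t0 >= 1 unsigned, which is exactly "t0 != 0".)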
4599 // 4600 // mul(Rlo_mn, Rm, Rn); 4601 // adds(zr, t0, Rlo_mn); 4602 subs(zr, t0, 1); // Set carry iff t0 is nonzero 4603 adcs(t0, t1, Rhi_mn); 4604 adc(t1, t2, zr); 4605 mov(t2, zr); 4606 } 4607 4608 void acc(Register Rhi, Register Rlo, 4609 Register t0, Register t1, Register t2) { 4610 adds(t0, t0, Rlo); 4611 adcs(t1, t1, Rhi); 4612 adc(t2, t2, zr); 4613 } 4614 4615 public: 4616 /** 4617 * Fast Montgomery multiplication. The derivation of the 4618 * algorithm is in A Cryptographic Library for the Motorola 4619 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 4620 * 4621 * Arguments: 4622 * 4623 * Inputs for multiplication: 4624 * c_rarg0 - int array elements a 4625 * c_rarg1 - int array elements b 4626 * c_rarg2 - int array elements n (the modulus) 4627 * c_rarg3 - int length 4628 * c_rarg4 - int inv 4629 * c_rarg5 - int array elements m (the result) 4630 * 4631 * Inputs for squaring: 4632 * c_rarg0 - int array elements a 4633 * c_rarg1 - int array elements n (the modulus) 4634 * c_rarg2 - int length 4635 * c_rarg3 - int inv 4636 * c_rarg4 - int array elements m (the result) 4637 * 4638 */ 4639 address generate_multiply() { 4640 Label argh, nothing; 4641 bind(argh); 4642 stop("MontgomeryMultiply total_allocation must be <= 8192"); 4643 4644 align(CodeEntryAlignment); 4645 address entry = pc(); 4646 4647 cbzw(Rlen, nothing); 4648 4649 enter(); 4650 4651 // Make room. 4652 cmpw(Rlen, 512); 4653 br(Assembler::HI, argh); 4654 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 4655 andr(sp, Ra, -2 * wordSize); 4656 4657 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 4658 4659 { 4660 // Copy input args, reversing as we go. We use Ra as a 4661 // temporary variable. 4662 reverse(Ra, Pa_base, Rlen, t0, t1); 4663 if (!_squaring) 4664 reverse(Ra, Pb_base, Rlen, t0, t1); 4665 reverse(Ra, Pn_base, Rlen, t0, t1); 4666 } 4667 4668 // Push all call-saved registers and also Pm_base which we'll need 4669 // at the end. 
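// (Below, Pm_base is repointed at a scratch area carved out of the stack;
// the caller's value, saved here, is restored by restore_regs() so the
// final reverse() can copy the result back into the caller's buffer.)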
4670 save_regs(); 4671 4672 #ifndef PRODUCT 4673 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 4674 { 4675 ldr(Rn, Address(Pn_base, 0)); 4676 mul(Rlo_mn, Rn, inv); 4677 cmp(Rlo_mn, -1); 4678 Label ok; 4679 br(EQ, ok); { 4680 stop("broken inverse in Montgomery multiply"); 4681 } bind(ok); 4682 } 4683 #endif 4684 4685 mov(Pm_base, Ra); 4686 4687 mov(t0, zr); 4688 mov(t1, zr); 4689 mov(t2, zr); 4690 4691 block_comment("for (int i = 0; i < len; i++) {"); 4692 mov(Ri, zr); { 4693 Label loop, end; 4694 cmpw(Ri, Rlen); 4695 br(Assembler::GE, end); 4696 4697 bind(loop); 4698 pre1(Ri); 4699 4700 block_comment(" for (j = i; j; j--) {"); { 4701 movw(Rj, Ri); 4702 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 4703 } block_comment(" } // j"); 4704 4705 post1(); 4706 addw(Ri, Ri, 1); 4707 cmpw(Ri, Rlen); 4708 br(Assembler::LT, loop); 4709 bind(end); 4710 block_comment("} // i"); 4711 } 4712 4713 block_comment("for (int i = len; i < 2*len; i++) {"); 4714 mov(Ri, Rlen); { 4715 Label loop, end; 4716 cmpw(Ri, Rlen, Assembler::LSL, 1); 4717 br(Assembler::GE, end); 4718 4719 bind(loop); 4720 pre2(Ri, Rlen); 4721 4722 block_comment(" for (j = len*2-i-1; j; j--) {"); { 4723 lslw(Rj, Rlen, 1); 4724 subw(Rj, Rj, Ri); 4725 subw(Rj, Rj, 1); 4726 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 4727 } block_comment(" } // j"); 4728 4729 post2(Ri, Rlen); 4730 addw(Ri, Ri, 1); 4731 cmpw(Ri, Rlen, Assembler::LSL, 1); 4732 br(Assembler::LT, loop); 4733 bind(end); 4734 } 4735 block_comment("} // i"); 4736 4737 normalize(Rlen); 4738 4739 mov(Ra, Pm_base); // Save Pm_base in Ra 4740 restore_regs(); // Restore caller's Pm_base 4741 4742 // Copy our result into caller's Pm_base 4743 reverse(Pm_base, Ra, Rlen, t0, t1); 4744 4745 leave(); 4746 bind(nothing); 4747 ret(lr); 4748 4749 return entry; 4750 } 4751 // In C, approximately: 4752 4753 // void 4754 // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[], 4755 // unsigned long Pn_base[], unsigned long Pm_base[], 4756 // unsigned long inv, int len) { 4757 // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 4758 // unsigned long *Pa, *Pb, *Pn, *Pm; 4759 // unsigned long Ra, Rb, Rn, Rm; 4760 4761 // int i; 4762 4763 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 4764 4765 // for (i = 0; i < len; i++) { 4766 // int j; 4767 4768 // Pa = Pa_base; 4769 // Pb = Pb_base + i; 4770 // Pm = Pm_base; 4771 // Pn = Pn_base + i; 4772 4773 // Ra = *Pa; 4774 // Rb = *Pb; 4775 // Rm = *Pm; 4776 // Rn = *Pn; 4777 4778 // int iters = i; 4779 // for (j = 0; iters--; j++) { 4780 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 4781 // MACC(Ra, Rb, t0, t1, t2); 4782 // Ra = *++Pa; 4783 // Rb = *--Pb; 4784 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 4785 // MACC(Rm, Rn, t0, t1, t2); 4786 // Rm = *++Pm; 4787 // Rn = *--Pn; 4788 // } 4789 4790 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 4791 // MACC(Ra, Rb, t0, t1, t2); 4792 // *Pm = Rm = t0 * inv; 4793 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 4794 // MACC(Rm, Rn, t0, t1, t2); 4795 4796 // assert(t0 == 0, "broken Montgomery multiply"); 4797 4798 // t0 = t1; t1 = t2; t2 = 0; 4799 // } 4800 4801 // for (i = len; i < 2*len; i++) { 4802 // int j; 4803 4804 // Pa = Pa_base + i-len; 4805 // Pb = Pb_base + len; 4806 // Pm = Pm_base + i-len; 4807 // Pn = Pn_base + len; 4808 4809 // Ra = *++Pa; 4810 // Rb = *--Pb; 4811 // Rm = *++Pm; 4812 // Rn = *--Pn; 4813 4814 // int iters = len*2-i-1; 4815 // 
for (j = i-len+1; iters--; j++) { 4816 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 4817 // MACC(Ra, Rb, t0, t1, t2); 4818 // Ra = *++Pa; 4819 // Rb = *--Pb; 4820 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 4821 // MACC(Rm, Rn, t0, t1, t2); 4822 // Rm = *++Pm; 4823 // Rn = *--Pn; 4824 // } 4825 4826 // Pm_base[i-len] = t0; 4827 // t0 = t1; t1 = t2; t2 = 0; 4828 // } 4829 4830 // while (t0) 4831 // t0 = sub(Pm_base, Pn_base, t0, len); 4832 // } 4833 4834 /** 4835 * Fast Montgomery squaring. This uses asymptotically 25% fewer 4836 * multiplies than Montgomery multiplication so it should be up to 4837 * 25% faster. However, its loop control is more complex and it 4838 * may actually run slower on some machines. 4839 * 4840 * Arguments: 4841 * 4842 * Inputs: 4843 * c_rarg0 - int array elements a 4844 * c_rarg1 - int array elements n (the modulus) 4845 * c_rarg2 - int length 4846 * c_rarg3 - int inv 4847 * c_rarg4 - int array elements m (the result) 4848 * 4849 */ 4850 address generate_square() { 4851 Label argh; 4852 bind(argh); 4853 stop("MontgomeryMultiply total_allocation must be <= 8192"); 4854 4855 align(CodeEntryAlignment); 4856 address entry = pc(); 4857 4858 enter(); 4859 4860 // Make room. 4861 cmpw(Rlen, 512); 4862 br(Assembler::HI, argh); 4863 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 4864 andr(sp, Ra, -2 * wordSize); 4865 4866 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 4867 4868 { 4869 // Copy input args, reversing as we go. We use Ra as a 4870 // temporary variable. 4871 reverse(Ra, Pa_base, Rlen, t0, t1); 4872 reverse(Ra, Pn_base, Rlen, t0, t1); 4873 } 4874 4875 // Push all call-saved registers and also Pm_base which we'll need 4876 // at the end. 4877 save_regs(); 4878 4879 mov(Pm_base, Ra); 4880 4881 mov(t0, zr); 4882 mov(t1, zr); 4883 mov(t2, zr); 4884 4885 block_comment("for (int i = 0; i < len; i++) {"); 4886 mov(Ri, zr); { 4887 Label loop, end; 4888 bind(loop); 4889 cmp(Ri, Rlen); 4890 br(Assembler::GE, end); 4891 4892 pre1(Ri); 4893 4894 block_comment("for (j = (i+1)/2; j; j--) {"); { 4895 add(Rj, Ri, 1); 4896 lsr(Rj, Rj, 1); 4897 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 4898 } block_comment(" } // j"); 4899 4900 last_squaring(Ri); 4901 4902 block_comment(" for (j = i/2; j; j--) {"); { 4903 lsr(Rj, Ri, 1); 4904 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 4905 } block_comment(" } // j"); 4906 4907 post1_squaring(); 4908 add(Ri, Ri, 1); 4909 cmp(Ri, Rlen); 4910 br(Assembler::LT, loop); 4911 4912 bind(end); 4913 block_comment("} // i"); 4914 } 4915 4916 block_comment("for (int i = len; i < 2*len; i++) {"); 4917 mov(Ri, Rlen); { 4918 Label loop, end; 4919 bind(loop); 4920 cmp(Ri, Rlen, Assembler::LSL, 1); 4921 br(Assembler::GE, end); 4922 4923 pre2(Ri, Rlen); 4924 4925 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 4926 lsl(Rj, Rlen, 1); 4927 sub(Rj, Rj, Ri); 4928 sub(Rj, Rj, 1); 4929 lsr(Rj, Rj, 1); 4930 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 4931 } block_comment(" } // j"); 4932 4933 last_squaring(Ri); 4934 4935 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 4936 lsl(Rj, Rlen, 1); 4937 sub(Rj, Rj, Ri); 4938 lsr(Rj, Rj, 1); 4939 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 4940 } block_comment(" } // j"); 4941 4942 post2(Ri, Rlen); 4943 add(Ri, Ri, 1); 4944 cmp(Ri, Rlen, Assembler::LSL, 1); 4945 4946 br(Assembler::LT, loop); 4947 bind(end); 4948 block_comment("} // i"); 4949 } 4950 4951 normalize(Rlen); 4952 4953 mov(Ra, 
Pm_base); // Save Pm_base in Ra
4954 restore_regs(); // Restore caller's Pm_base
4955
4956 // Copy our result into caller's Pm_base
4957 reverse(Pm_base, Ra, Rlen, t0, t1);
4958
4959 leave();
4960 ret(lr);
4961
4962 return entry;
4963 }
4964 // In C, approximately:
4965
4966 // void
4967 // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
4968 // unsigned long Pm_base[], unsigned long inv, int len) {
4969 // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4970 // unsigned long *Pa, *Pb, *Pn, *Pm;
4971 // unsigned long Ra, Rb, Rn, Rm;
4972
4973 // int i;
4974
4975 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
4976
4977 // for (i = 0; i < len; i++) {
4978 // int j;
4979
4980 // Pa = Pa_base;
4981 // Pb = Pa_base + i;
4982 // Pm = Pm_base;
4983 // Pn = Pn_base + i;
4984
4985 // Ra = *Pa;
4986 // Rb = *Pb;
4987 // Rm = *Pm;
4988 // Rn = *Pn;
4989
4990 // int iters = (i+1)/2;
4991 // for (j = 0; iters--; j++) {
4992 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4993 // MACC2(Ra, Rb, t0, t1, t2);
4994 // Ra = *++Pa;
4995 // Rb = *--Pb;
4996 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4997 // MACC(Rm, Rn, t0, t1, t2);
4998 // Rm = *++Pm;
4999 // Rn = *--Pn;
5000 // }
5001 // if ((i & 1) == 0) {
5002 // assert(Ra == Pa_base[j], "must be");
5003 // MACC(Ra, Ra, t0, t1, t2);
5004 // }
5005 // iters = i/2;
5006 // assert(iters == i-j, "must be");
5007 // for (; iters--; j++) {
5008 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5009 // MACC(Rm, Rn, t0, t1, t2);
5010 // Rm = *++Pm;
5011 // Rn = *--Pn;
5012 // }
5013
5014 // *Pm = Rm = t0 * inv;
5015 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5016 // MACC(Rm, Rn, t0, t1, t2);
5017
5018 // assert(t0 == 0, "broken Montgomery multiply");
5019
5020 // t0 = t1; t1 = t2; t2 = 0;
5021 // }
5022
5023 // for (i = len; i < 2*len; i++) {
5024 // int start = i-len+1;
5025 // int end = start + (len - start)/2;
5026 // int j;
5027
5028 // Pa = Pa_base + i-len;
5029 // Pb = Pa_base + len;
5030 // Pm = Pm_base + i-len;
5031 // Pn = Pn_base + len;
5032
5033 // Ra = *++Pa;
5034 // Rb = *--Pb;
5035 // Rm = *++Pm;
5036 // Rn = *--Pn;
5037
5038 // int iters = (2*len-i-1)/2;
5039 // assert(iters == end-start, "must be");
5040 // for (j = start; iters--; j++) {
5041 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5042 // MACC2(Ra, Rb, t0, t1, t2);
5043 // Ra = *++Pa;
5044 // Rb = *--Pb;
5045 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5046 // MACC(Rm, Rn, t0, t1, t2);
5047 // Rm = *++Pm;
5048 // Rn = *--Pn;
5049 // }
5050 // if ((i & 1) == 0) {
5051 // assert(Ra == Pa_base[j], "must be");
5052 // MACC(Ra, Ra, t0, t1, t2);
5053 // }
5054 // iters = (2*len-i)/2;
5055 // assert(iters == len-j, "must be");
5056 // for (; iters--; j++) {
5057 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5058 // MACC(Rm, Rn, t0, t1, t2);
5059 // Rm = *++Pm;
5060 // Rn = *--Pn;
5061 // }
5062 // Pm_base[i-len] = t0;
5063 // t0 = t1; t1 = t2; t2 = 0;
5064 // }
5065
5066 // while (t0)
5067 // t0 = sub(Pm_base, Pn_base, t0, len);
5068 // }
5069 };
5070
5071
5072 // Initialization
5073 void generate_initial() {
5074 // Generates the initial stubs and initializes the entry points
5075
5076 // Entry points that exist on all platforms. Note: this is code
5077 // that could be shared among different platforms - however the
5078 // benefit seems to be smaller than the disadvantage of having a
5079 // much more complicated
generator structure. See also the comment in
5080 // stubRoutines.hpp.
5081
5082 StubRoutines::_forward_exception_entry = generate_forward_exception();
5083
5084 StubRoutines::_call_stub_entry =
5085 generate_call_stub(StubRoutines::_call_stub_return_address);
5086
5087 // is referenced by megamorphic call
5088 StubRoutines::_catch_exception_entry = generate_catch_exception();
5089
5090 // Build this early so it's available for the interpreter.
5091 StubRoutines::_throw_StackOverflowError_entry =
5092 generate_throw_exception("StackOverflowError throw_exception",
5093 CAST_FROM_FN_PTR(address,
5094 SharedRuntime::throw_StackOverflowError));
5095 StubRoutines::_throw_delayed_StackOverflowError_entry =
5096 generate_throw_exception("delayed StackOverflowError throw_exception",
5097 CAST_FROM_FN_PTR(address,
5098 SharedRuntime::throw_delayed_StackOverflowError));
5099 if (UseCRC32Intrinsics) {
5100 // set the table address before generating the stubs that use it
5101 StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
5102 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
5103 }
5104
5105 if (UseCRC32CIntrinsics) {
5106 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
5107 }
5108 }
5109
5110 void generate_all() {
5111 // support for verify_oop (must happen after universe_init)
5112 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
5113 StubRoutines::_throw_AbstractMethodError_entry =
5114 generate_throw_exception("AbstractMethodError throw_exception",
5115 CAST_FROM_FN_PTR(address,
5116 SharedRuntime::
5117 throw_AbstractMethodError));
5118
5119 StubRoutines::_throw_IncompatibleClassChangeError_entry =
5120 generate_throw_exception("IncompatibleClassChangeError throw_exception",
5121 CAST_FROM_FN_PTR(address,
5122 SharedRuntime::
5123 throw_IncompatibleClassChangeError));
5124
5125 StubRoutines::_throw_NullPointerException_at_call_entry =
5126 generate_throw_exception("NullPointerException at call throw_exception",
5127 CAST_FROM_FN_PTR(address,
5128 SharedRuntime::
5129 throw_NullPointerException_at_call));
5130
5131 // arraycopy stubs used by compilers
5132 generate_arraycopy_stubs();
5133
5134 // has-negatives stub for large arrays
5135 StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);
5136
5137 // array-equals stub for large arrays
5138 if (!UseSimpleArrayEquals) {
5139 StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
5140 }
5141
5142 if (UseMultiplyToLenIntrinsic) {
5143 StubRoutines::_multiplyToLen = generate_multiplyToLen();
5144 }
5145
5146 if (UseSquareToLenIntrinsic) {
5147 StubRoutines::_squareToLen = generate_squareToLen();
5148 }
5149
5150 if (UseMulAddIntrinsic) {
5151 StubRoutines::_mulAdd = generate_mulAdd();
5152 }
5153
5154 if (UseMontgomeryMultiplyIntrinsic) {
5155 StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
5156 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
5157 StubRoutines::_montgomeryMultiply = g.generate_multiply();
5158 }
5159
5160 if (UseMontgomerySquareIntrinsic) {
5161 StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
5162 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
5163 // We use generate_multiply() rather than generate_square()
5164 // because it's faster for the sizes of modulus we care about.
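// (The squaring-specific code path is generate_square() above; it is left
// in place but not selected here.)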
5165 StubRoutines::_montgomerySquare = g.generate_multiply(); 5166 } 5167 5168 #if INCLUDE_SHENANDOAHGC 5169 if (UseShenandoahGC && (ShenandoahWriteBarrier || ShenandoahStoreValEnqueueBarrier)) { 5170 StubRoutines::aarch64::_shenandoah_wb = generate_shenandoah_wb(false, true); 5171 StubRoutines::_shenandoah_wb_C = generate_shenandoah_wb(true, !ShenandoahWriteBarrierCsetTestInIR); 5172 } 5173 #endif 5174 5175 #ifndef BUILTIN_SIM 5176 // generate GHASH intrinsics code 5177 if (UseGHASHIntrinsics) { 5178 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); 5179 } 5180 5181 if (UseAESIntrinsics) { 5182 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 5183 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 5184 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 5185 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 5186 } 5187 5188 if (UseSHA1Intrinsics) { 5189 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress"); 5190 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB"); 5191 } 5192 if (UseSHA256Intrinsics) { 5193 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress"); 5194 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB"); 5195 } 5196 5197 // generate Adler32 intrinsics code 5198 if (UseAdler32Intrinsics) { 5199 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); 5200 } 5201 5202 // Safefetch stubs. 5203 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry, 5204 &StubRoutines::_safefetch32_fault_pc, 5205 &StubRoutines::_safefetch32_continuation_pc); 5206 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry, 5207 &StubRoutines::_safefetchN_fault_pc, 5208 &StubRoutines::_safefetchN_continuation_pc); 5209 #endif 5210 StubRoutines::aarch64::set_completed(); 5211 } 5212 5213 public: 5214 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) { 5215 if (all) { 5216 generate_all(); 5217 } else { 5218 generate_initial(); 5219 } 5220 } 5221 }; // end class declaration 5222 5223 void StubGenerator_generate(CodeBuffer* code, bool all) { 5224 StubGenerator g(code, all); 5225 }