1 /* 2 * Copyright (c) 2013, Red Hat Inc. 3 * Copyright (c) 2003, 2011, Oracle and/or its affiliates. 4 * All rights reserved. 5 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 6 * 7 * This code is free software; you can redistribute it and/or modify it 8 * under the terms of the GNU General Public License version 2 only, as 9 * published by the Free Software Foundation. 10 * 11 * This code is distributed in the hope that it will be useful, but WITHOUT 12 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 13 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 14 * version 2 for more details (a copy is included in the LICENSE file that 15 * accompanied this code). 16 * 17 * You should have received a copy of the GNU General Public License version 18 * 2 along with this work; if not, write to the Free Software Foundation, 19 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 20 * 21 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 22 * or visit www.oracle.com if you need additional information or have any 23 * questions. 24 * 25 */ 26 27 #include "precompiled.hpp" 28 #include "asm/macroAssembler.hpp" 29 #include "asm/macroAssembler.inline.hpp" 30 #include "interpreter/interpreter.hpp" 31 #include "nativeInst_aarch64.hpp" 32 #include "oops/instanceOop.hpp" 33 #include "oops/method.hpp" 34 #include "oops/objArrayKlass.hpp" 35 #include "oops/oop.inline.hpp" 36 #include "prims/methodHandles.hpp" 37 #include "runtime/frame.inline.hpp" 38 #include "runtime/handles.inline.hpp" 39 #include "runtime/sharedRuntime.hpp" 40 #include "runtime/stubCodeGenerator.hpp" 41 #include "runtime/stubRoutines.hpp" 42 #include "runtime/thread.inline.hpp" 43 #include "utilities/top.hpp" 44 #ifdef COMPILER2 45 #include "opto/runtime.hpp" 46 #endif 47 48 #ifdef BUILTIN_SIM 49 #include "../../../../../../simulator/simulator.hpp" 50 #endif 51 52 // Declaration and definition of StubGenerator (no .hpp file). 53 // For a more detailed description of the stub routine structure 54 // see the comment in stubRoutines.hpp 55 56 #undef __ 57 #define __ _masm-> 58 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8)) 59 60 #ifdef PRODUCT 61 #define BLOCK_COMMENT(str) /* nothing */ 62 #else 63 #define BLOCK_COMMENT(str) __ block_comment(str) 64 #endif 65 66 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 67 68 // Stub Code definitions 69 70 class StubGenerator: public StubCodeGenerator { 71 private: 72 73 #ifdef PRODUCT 74 #define inc_counter_np(counter) ((void)0) 75 #else 76 void inc_counter_np_(int& counter) { 77 __ lea(rscratch2, ExternalAddress((address)&counter)); 78 __ ldrw(rscratch1, Address(rscratch2)); 79 __ addw(rscratch1, rscratch1, 1); 80 __ strw(rscratch1, Address(rscratch2)); 81 } 82 #define inc_counter_np(counter) \ 83 BLOCK_COMMENT("inc_counter " #counter); \ 84 inc_counter_np_(counter); 85 #endif 86 87 // Call stubs are used to call Java from C 88 // 89 // Arguments: 90 // c_rarg0: call wrapper address address 91 // c_rarg1: result address 92 // c_rarg2: result type BasicType 93 // c_rarg3: method Method* 94 // c_rarg4: (interpreter) entry point address 95 // c_rarg5: parameters intptr_t* 96 // c_rarg6: parameter size (in words) int 97 // c_rarg7: thread Thread* 98 // 99 // There is no return from the stub itself as any Java result 100 // is written to result 101 // 102 // we save r30 (lr) as the return PC at the base of the frame and 103 // link r29 (fp) below it as the frame pointer installing sp (r31) 104 // into fp. 105 // 106 // we save r0-r7, which accounts for all the c arguments. 107 // 108 // TODO: strictly do we need to save them all? they are treated as 109 // volatile by C so could we omit saving the ones we are going to 110 // place in global registers (thread? method?) or those we only use 111 // during setup of the Java call? 112 // 113 // we don't need to save r8 which C uses as an indirect result location 114 // return register. 115 // 116 // we don't need to save r9-r15 which both C and Java treat as 117 // volatile 118 // 119 // we don't need to save r16-18 because Java does not use them 120 // 121 // we save r19-r28 which Java uses as scratch registers and C 122 // expects to be callee-save 123 // 124 // we don't save any FP registers since only v8-v15 are callee-save 125 // (strictly only the f and d components) and Java uses them as 126 // callee-save. v0-v7 are arg registers and C treats v16-v31 as 127 // volatile (as does Java?) 128 // 129 // so the stub frame looks like this when we enter Java code 130 // 131 // [ return_from_Java ] <--- sp 132 // [ argument word n ] 133 // ... 134 // -27 [ argument word 1 ] 135 // -26 [ saved d15 ] <--- sp_after_call 136 // -25 [ saved d14 ] 137 // -24 [ saved d13 ] 138 // -23 [ saved d12 ] 139 // -22 [ saved d11 ] 140 // -21 [ saved d10 ] 141 // -20 [ saved d9 ] 142 // -19 [ saved d8 ] 143 // -18 [ saved r28 ] 144 // -17 [ saved r27 ] 145 // -16 [ saved r26 ] 146 // -15 [ saved r25 ] 147 // -14 [ saved r24 ] 148 // -13 [ saved r23 ] 149 // -12 [ saved r22 ] 150 // -11 [ saved r21 ] 151 // -10 [ saved r20 ] 152 // -9 [ saved r19 ] 153 // -8 [ call wrapper (r0) ] 154 // -7 [ result (r1) ] 155 // -6 [ result type (r2) ] 156 // -5 [ method (r3) ] 157 // -4 [ entry point (r4) ] 158 // -3 [ parameters (r5) ] 159 // -2 [ parameter size (r6) ] 160 // -1 [ thread (r7) ] 161 // 0 [ saved fp (r29) ] <--- fp == saved sp (r31) 162 // 1 [ saved lr (r30) ] 163 164 // Call stub stack layout word offsets from fp 165 enum call_stub_layout { 166 sp_after_call_off = -26, 167 168 d15_off = -26, 169 d14_off = -25, 170 d13_off = -24, 171 d12_off = -23, 172 d11_off = -22, 173 d10_off = -21, 174 d9_off = -20, 175 d8_off = -19, 176 177 r28_off = -18, 178 r27_off = -17, 179 r26_off = -16, 180 r25_off = -15, 181 r24_off = -14, 182 r23_off = -13, 183 r22_off = -12, 184 r21_off = -11, 185 r20_off = -10, 186 r19_off = -9, 187 call_wrapper_off = -8, 188 result_off = -7, 189 result_type_off = -6, 190 method_off = -5, 191 entry_point_off = -4, 192 parameters_off = -3, 193 parameter_size_off = -2, 194 thread_off = -1, 195 fp_f = 0, 196 retaddr_off = 1, 197 }; 198 199 address generate_call_stub(address& return_address) { 200 assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 && 201 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off, 202 "adjust this code"); 203 204 StubCodeMark mark(this, "StubRoutines", "call_stub"); 205 address start = __ pc(); 206 207 const Address sp_after_call(rfp, sp_after_call_off * wordSize); 208 209 const Address call_wrapper (rfp, call_wrapper_off * wordSize); 210 const Address result (rfp, result_off * wordSize); 211 const Address result_type (rfp, result_type_off * wordSize); 212 const Address method (rfp, method_off * wordSize); 213 const Address entry_point (rfp, entry_point_off * wordSize); 214 const Address parameters (rfp, parameters_off * wordSize); 215 const Address parameter_size(rfp, parameter_size_off * wordSize); 216 217 const Address thread (rfp, thread_off * wordSize); 218 219 const Address d15_save (rfp, d15_off * wordSize); 220 const Address d14_save (rfp, d14_off * wordSize); 221 const Address d13_save (rfp, d13_off * wordSize); 222 const Address d12_save (rfp, d12_off * wordSize); 223 const Address d11_save (rfp, d11_off * wordSize); 224 const Address d10_save (rfp, d10_off * wordSize); 225 const Address d9_save (rfp, d9_off * wordSize); 226 const Address d8_save (rfp, d8_off * wordSize); 227 228 const Address r28_save (rfp, r28_off * wordSize); 229 const Address r27_save (rfp, r27_off * wordSize); 230 const Address r26_save (rfp, r26_off * wordSize); 231 const Address r25_save (rfp, r25_off * wordSize); 232 const Address r24_save (rfp, r24_off * wordSize); 233 const Address r23_save (rfp, r23_off * wordSize); 234 const Address r22_save (rfp, r22_off * wordSize); 235 const Address r21_save (rfp, r21_off * wordSize); 236 const Address r20_save (rfp, r20_off * wordSize); 237 const Address r19_save (rfp, r19_off * wordSize); 238 239 // stub code 240 241 // we need a C prolog to bootstrap the x86 caller into the sim 242 __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void); 243 244 address aarch64_entry = __ pc(); 245 246 #ifdef BUILTIN_SIM 247 // Save sender's SP for stack traces. 248 __ mov(rscratch1, sp); 249 __ str(rscratch1, Address(__ pre(sp, -2 * wordSize))); 250 #endif 251 // set up frame and move sp to end of save area 252 __ enter(); 253 __ sub(sp, rfp, -sp_after_call_off * wordSize); 254 255 // save register parameters and Java scratch/global registers 256 // n.b. we save thread even though it gets installed in 257 // rthread because we want to sanity check rthread later 258 __ str(c_rarg7, thread); 259 __ strw(c_rarg6, parameter_size); 260 __ str(c_rarg5, parameters); 261 __ str(c_rarg4, entry_point); 262 __ str(c_rarg3, method); 263 __ str(c_rarg2, result_type); 264 __ str(c_rarg1, result); 265 __ str(c_rarg0, call_wrapper); 266 __ str(r19, r19_save); 267 __ str(r20, r20_save); 268 __ str(r21, r21_save); 269 __ str(r22, r22_save); 270 __ str(r23, r23_save); 271 __ str(r24, r24_save); 272 __ str(r25, r25_save); 273 __ str(r26, r26_save); 274 __ str(r27, r27_save); 275 __ str(r28, r28_save); 276 277 __ strd(v8, d8_save); 278 __ strd(v9, d9_save); 279 __ strd(v10, d10_save); 280 __ strd(v11, d11_save); 281 __ strd(v12, d12_save); 282 __ strd(v13, d13_save); 283 __ strd(v14, d14_save); 284 __ strd(v15, d15_save); 285 286 // install Java thread in global register now we have saved 287 // whatever value it held 288 __ mov(rthread, c_rarg7); 289 // And method 290 __ mov(rmethod, c_rarg3); 291 292 // set up the heapbase register 293 __ reinit_heapbase(); 294 295 #ifdef ASSERT 296 // make sure we have no pending exceptions 297 { 298 Label L; 299 __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset()))); 300 __ cmp(rscratch1, (unsigned)NULL_WORD); 301 __ br(Assembler::EQ, L); 302 __ stop("StubRoutines::call_stub: entered with pending exception"); 303 __ BIND(L); 304 } 305 #endif 306 // pass parameters if any 307 __ mov(esp, sp); 308 __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way 309 __ andr(sp, rscratch1, -2 * wordSize); 310 311 BLOCK_COMMENT("pass parameters if any"); 312 Label parameters_done; 313 // parameter count is still in c_rarg6 314 // and parameter pointer identifying param 1 is in c_rarg5 315 __ cbzw(c_rarg6, parameters_done); 316 317 address loop = __ pc(); 318 __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize))); 319 __ subsw(c_rarg6, c_rarg6, 1); 320 __ push(rscratch1); 321 __ br(Assembler::GT, loop); 322 323 __ BIND(parameters_done); 324 325 // call Java entry -- passing methdoOop, and current sp 326 // rmethod: Method* 327 // r13: sender sp 328 BLOCK_COMMENT("call Java function"); 329 __ mov(r13, sp); 330 __ blr(c_rarg4); 331 332 // tell the simulator we have returned to the stub 333 334 // we do this here because the notify will already have been done 335 // if we get to the next instruction via an exception 336 // 337 // n.b. adding this instruction here affects the calculation of 338 // whether or not a routine returns to the call stub (used when 339 // doing stack walks) since the normal test is to check the return 340 // pc against the address saved below. so we may need to allow for 341 // this extra instruction in the check. 342 343 if (NotifySimulator) { 344 __ notify(Assembler::method_reentry); 345 } 346 // save current address for use by exception handling code 347 348 return_address = __ pc(); 349 350 // store result depending on type (everything that is not 351 // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT) 352 // n.b. this assumes Java returns an integral result in r0 353 // and a floating result in j_farg0 354 __ ldr(j_rarg2, result); 355 Label is_long, is_float, is_double, exit; 356 __ ldr(j_rarg1, result_type); 357 __ cmp(j_rarg1, T_OBJECT); 358 __ br(Assembler::EQ, is_long); 359 __ cmp(j_rarg1, T_LONG); 360 __ br(Assembler::EQ, is_long); 361 __ cmp(j_rarg1, T_FLOAT); 362 __ br(Assembler::EQ, is_float); 363 __ cmp(j_rarg1, T_DOUBLE); 364 __ br(Assembler::EQ, is_double); 365 366 // handle T_INT case 367 __ strw(r0, Address(j_rarg2)); 368 369 __ BIND(exit); 370 371 // pop parameters 372 __ sub(esp, rfp, -sp_after_call_off * wordSize); 373 374 #ifdef ASSERT 375 // verify that threads correspond 376 { 377 Label L, S; 378 __ ldr(rscratch1, thread); 379 __ cmp(rthread, rscratch1); 380 __ br(Assembler::NE, S); 381 __ get_thread(rscratch1); 382 __ cmp(rthread, rscratch1); 383 __ br(Assembler::EQ, L); 384 __ BIND(S); 385 __ stop("StubRoutines::call_stub: threads must correspond"); 386 __ BIND(L); 387 } 388 #endif 389 390 // restore callee-save registers 391 __ ldrd(v15, d15_save); 392 __ ldrd(v14, d14_save); 393 __ ldrd(v13, d13_save); 394 __ ldrd(v12, d12_save); 395 __ ldrd(v11, d11_save); 396 __ ldrd(v10, d10_save); 397 __ ldrd(v9, d9_save); 398 __ ldrd(v8, d8_save); 399 400 __ ldr(r28, r28_save); 401 __ ldr(r27, r27_save); 402 __ ldr(r26, r26_save); 403 __ ldr(r25, r25_save); 404 __ ldr(r24, r24_save); 405 __ ldr(r23, r23_save); 406 __ ldr(r22, r22_save); 407 __ ldr(r21, r21_save); 408 __ ldr(r20, r20_save); 409 __ ldr(r19, r19_save); 410 __ ldr(c_rarg0, call_wrapper); 411 __ ldr(c_rarg1, result); 412 __ ldrw(c_rarg2, result_type); 413 __ ldr(c_rarg3, method); 414 __ ldr(c_rarg4, entry_point); 415 __ ldr(c_rarg5, parameters); 416 __ ldr(c_rarg6, parameter_size); 417 __ ldr(c_rarg7, thread); 418 419 #ifndef PRODUCT 420 // tell the simulator we are about to end Java execution 421 if (NotifySimulator) { 422 __ notify(Assembler::method_exit); 423 } 424 #endif 425 // leave frame and return to caller 426 __ leave(); 427 __ ret(lr); 428 429 // handle return types different from T_INT 430 431 __ BIND(is_long); 432 __ str(r0, Address(j_rarg2, 0)); 433 __ br(Assembler::AL, exit); 434 435 __ BIND(is_float); 436 __ strs(j_farg0, Address(j_rarg2, 0)); 437 __ br(Assembler::AL, exit); 438 439 __ BIND(is_double); 440 __ strd(j_farg0, Address(j_rarg2, 0)); 441 __ br(Assembler::AL, exit); 442 443 return start; 444 } 445 446 // Return point for a Java call if there's an exception thrown in 447 // Java code. The exception is caught and transformed into a 448 // pending exception stored in JavaThread that can be tested from 449 // within the VM. 450 // 451 // Note: Usually the parameters are removed by the callee. In case 452 // of an exception crossing an activation frame boundary, that is 453 // not the case if the callee is compiled code => need to setup the 454 // rsp. 455 // 456 // r0: exception oop 457 458 // NOTE: this is used as a target from the signal handler so it 459 // needs an x86 prolog which returns into the current simulator 460 // executing the generated catch_exception code. so the prolog 461 // needs to install rax in a sim register and adjust the sim's 462 // restart pc to enter the generated code at the start position 463 // then return from native to simulated execution. 464 465 address generate_catch_exception() { 466 StubCodeMark mark(this, "StubRoutines", "catch_exception"); 467 address start = __ pc(); 468 469 // same as in generate_call_stub(): 470 const Address sp_after_call(rfp, sp_after_call_off * wordSize); 471 const Address thread (rfp, thread_off * wordSize); 472 473 #ifdef ASSERT 474 // verify that threads correspond 475 { 476 Label L, S; 477 __ ldr(rscratch1, thread); 478 __ cmp(rthread, rscratch1); 479 __ br(Assembler::NE, S); 480 __ get_thread(rscratch1); 481 __ cmp(rthread, rscratch1); 482 __ br(Assembler::EQ, L); 483 __ bind(S); 484 __ stop("StubRoutines::catch_exception: threads must correspond"); 485 __ bind(L); 486 } 487 #endif 488 489 // set pending exception 490 __ verify_oop(r0); 491 492 __ str(r0, Address(rthread, Thread::pending_exception_offset())); 493 __ mov(rscratch1, (address)__FILE__); 494 __ str(rscratch1, Address(rthread, Thread::exception_file_offset())); 495 __ movw(rscratch1, (int)__LINE__); 496 __ strw(rscratch1, Address(rthread, Thread::exception_line_offset())); 497 498 // complete return to VM 499 assert(StubRoutines::_call_stub_return_address != NULL, 500 "_call_stub_return_address must have been generated before"); 501 __ b(StubRoutines::_call_stub_return_address); 502 503 return start; 504 } 505 506 // Continuation point for runtime calls returning with a pending 507 // exception. The pending exception check happened in the runtime 508 // or native call stub. The pending exception in Thread is 509 // converted into a Java-level exception. 510 // 511 // Contract with Java-level exception handlers: 512 // r0: exception 513 // r3: throwing pc 514 // 515 // NOTE: At entry of this stub, exception-pc must be in LR !! 516 517 // NOTE: this is always used as a jump target within generated code 518 // so it just needs to be generated code wiht no x86 prolog 519 520 address generate_forward_exception() { 521 StubCodeMark mark(this, "StubRoutines", "forward exception"); 522 address start = __ pc(); 523 524 // Upon entry, LR points to the return address returning into 525 // Java (interpreted or compiled) code; i.e., the return address 526 // becomes the throwing pc. 527 // 528 // Arguments pushed before the runtime call are still on the stack 529 // but the exception handler will reset the stack pointer -> 530 // ignore them. A potential result in registers can be ignored as 531 // well. 532 533 #ifdef ASSERT 534 // make sure this code is only executed if there is a pending exception 535 { 536 Label L; 537 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 538 __ cbnz(rscratch1, L); 539 __ stop("StubRoutines::forward exception: no pending exception (1)"); 540 __ bind(L); 541 } 542 #endif 543 544 // compute exception handler into r19 545 546 // call the VM to find the handler address associated with the 547 // caller address. pass thread in r0 and caller pc (ret address) 548 // in r1. n.b. the caller pc is in lr, unlike x86 where it is on 549 // the stack. 550 __ mov(c_rarg1, lr); 551 // lr will be trashed by the VM call so we move it to R19 552 // (callee-saved) because we also need to pass it to the handler 553 // returned by this call. 554 __ mov(r19, lr); 555 BLOCK_COMMENT("call exception_handler_for_return_address"); 556 __ call_VM_leaf(CAST_FROM_FN_PTR(address, 557 SharedRuntime::exception_handler_for_return_address), 558 rthread, c_rarg1); 559 // we should not really care that lr is no longer the callee 560 // address. we saved the value the handler needs in r19 so we can 561 // just copy it to r3. however, the C2 handler will push its own 562 // frame and then calls into the VM and the VM code asserts that 563 // the PC for the frame above the handler belongs to a compiled 564 // Java method. So, we restore lr here to satisfy that assert. 565 __ mov(lr, r19); 566 // setup r0 & r3 & clear pending exception 567 __ mov(r3, r19); 568 __ mov(r19, r0); 569 __ ldr(r0, Address(rthread, Thread::pending_exception_offset())); 570 __ str(zr, Address(rthread, Thread::pending_exception_offset())); 571 572 #ifdef ASSERT 573 // make sure exception is set 574 { 575 Label L; 576 __ cbnz(r0, L); 577 __ stop("StubRoutines::forward exception: no pending exception (2)"); 578 __ bind(L); 579 } 580 #endif 581 582 // continue at exception handler 583 // r0: exception 584 // r3: throwing pc 585 // r19: exception handler 586 __ verify_oop(r0); 587 __ br(r19); 588 589 return start; 590 } 591 592 // Non-destructive plausibility checks for oops 593 // 594 // Arguments: 595 // r0: oop to verify 596 // rscratch1: error message 597 // 598 // Stack after saving c_rarg3: 599 // [tos + 0]: saved c_rarg3 600 // [tos + 1]: saved c_rarg2 601 // [tos + 2]: saved lr 602 // [tos + 3]: saved rscratch2 603 // [tos + 4]: saved r0 604 // [tos + 5]: saved rscratch1 605 address generate_verify_oop() { 606 607 StubCodeMark mark(this, "StubRoutines", "verify_oop"); 608 address start = __ pc(); 609 610 Label exit, error; 611 612 // save c_rarg2 and c_rarg3 613 __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16))); 614 615 // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 616 __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 617 __ ldr(c_rarg3, Address(c_rarg2)); 618 __ add(c_rarg3, c_rarg3, 1); 619 __ str(c_rarg3, Address(c_rarg2)); 620 621 // object is in r0 622 // make sure object is 'reasonable' 623 __ cbz(r0, exit); // if obj is NULL it is OK 624 625 // Check if the oop is in the right area of memory 626 __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask()); 627 __ andr(c_rarg2, r0, c_rarg3); 628 __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits()); 629 630 // Compare c_rarg2 and c_rarg3. We don't use a compare 631 // instruction here because the flags register is live. 632 __ eor(c_rarg2, c_rarg2, c_rarg3); 633 __ cbnz(c_rarg2, error); 634 635 // make sure klass is 'reasonable', which is not zero. 636 __ load_klass(r0, r0); // get klass 637 __ cbz(r0, error); // if klass is NULL it is broken 638 639 // return if everything seems ok 640 __ bind(exit); 641 642 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 643 __ ret(lr); 644 645 // handle errors 646 __ bind(error); 647 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 648 649 __ push(RegSet::range(r0, r29), sp); 650 // debug(char* msg, int64_t pc, int64_t regs[]) 651 __ mov(c_rarg0, rscratch1); // pass address of error message 652 __ mov(c_rarg1, lr); // pass return address 653 __ mov(c_rarg2, sp); // pass address of regs on stack 654 #ifndef PRODUCT 655 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area"); 656 #endif 657 BLOCK_COMMENT("call MacroAssembler::debug"); 658 __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); 659 __ blrt(rscratch1, 3, 0, 1); 660 661 return start; 662 } 663 664 void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); } 665 666 // Generate code for an array write pre barrier 667 // 668 // addr - starting address 669 // count - element count 670 // tmp - scratch register 671 // 672 // Destroy no registers! 673 // 674 void gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) { 675 BarrierSet* bs = Universe::heap()->barrier_set(); 676 switch (bs->kind()) { 677 case BarrierSet::G1SATBCT: 678 case BarrierSet::G1SATBCTLogging: 679 // With G1, don't generate the call if we statically know that the target in uninitialized 680 if (!dest_uninitialized) { 681 __ push(RegSet::range(r0, r29), sp); // integer registers except lr & sp 682 if (count == c_rarg0) { 683 if (addr == c_rarg1) { 684 // exactly backwards!! 685 __ stp(c_rarg0, c_rarg1, __ pre(sp, -2 * wordSize)); 686 __ ldp(c_rarg1, c_rarg0, __ post(sp, -2 * wordSize)); 687 } else { 688 __ mov(c_rarg1, count); 689 __ mov(c_rarg0, addr); 690 } 691 } else { 692 __ mov(c_rarg0, addr); 693 __ mov(c_rarg1, count); 694 } 695 __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2); 696 __ pop(RegSet::range(r0, r29), sp); // integer registers except lr & sp } 697 break; 698 case BarrierSet::CardTableModRef: 699 case BarrierSet::CardTableExtension: 700 case BarrierSet::ModRef: 701 break; 702 default: 703 ShouldNotReachHere(); 704 705 } 706 } 707 } 708 709 // 710 // Generate code for an array write post barrier 711 // 712 // Input: 713 // start - register containing starting address of destination array 714 // end - register containing ending address of destination array 715 // scratch - scratch register 716 // 717 // The input registers are overwritten. 718 // The ending address is inclusive. 719 void gen_write_ref_array_post_barrier(Register start, Register end, Register scratch) { 720 assert_different_registers(start, end, scratch); 721 BarrierSet* bs = Universe::heap()->barrier_set(); 722 switch (bs->kind()) { 723 case BarrierSet::G1SATBCT: 724 case BarrierSet::G1SATBCTLogging: 725 726 { 727 __ push(RegSet::range(r0, r29), sp); // integer registers except lr & sp 728 // must compute element count unless barrier set interface is changed (other platforms supply count) 729 assert_different_registers(start, end, scratch); 730 __ lea(scratch, Address(end, BytesPerHeapOop)); 731 __ sub(scratch, scratch, start); // subtract start to get #bytes 732 __ lsr(scratch, scratch, LogBytesPerHeapOop); // convert to element count 733 __ mov(c_rarg0, start); 734 __ mov(c_rarg1, scratch); 735 __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2); 736 __ pop(RegSet::range(r0, r29), sp); // integer registers except lr & sp } 737 } 738 break; 739 case BarrierSet::CardTableModRef: 740 case BarrierSet::CardTableExtension: 741 { 742 CardTableModRefBS* ct = (CardTableModRefBS*)bs; 743 assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code"); 744 745 Label L_loop; 746 747 __ lsr(start, start, CardTableModRefBS::card_shift); 748 __ lsr(end, end, CardTableModRefBS::card_shift); 749 __ sub(end, end, start); // number of bytes to copy 750 751 const Register count = end; // 'end' register contains bytes count now 752 __ mov(scratch, (address)ct->byte_map_base); 753 __ add(start, start, scratch); 754 __ BIND(L_loop); 755 __ strb(zr, Address(start, count)); 756 __ subs(count, count, 1); 757 __ br(Assembler::HS, L_loop); 758 } 759 break; 760 default: 761 ShouldNotReachHere(); 762 763 } 764 } 765 766 typedef enum { 767 copy_forwards = 1, 768 copy_backwards = -1 769 } copy_direction; 770 771 // Bulk copy of blocks of 8 words. 772 // 773 // count is a count of words. 774 // 775 // Precondition: count >= 2 776 // 777 // Postconditions: 778 // 779 // The least significant bit of count contains the remaining count 780 // of words to copy. The rest of count is trash. 781 // 782 // s and d are adjusted to point to the remaining words to copy 783 // 784 void generate_copy_longs(Label &start, Register s, Register d, Register count, 785 copy_direction direction) { 786 int unit = wordSize * direction; 787 788 int offset; 789 const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6, 790 t4 = r7, t5 = r10, t6 = r11, t7 = r12; 791 792 assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7); 793 assert_different_registers(s, d, count, rscratch1); 794 795 Label again, large, small; 796 __ align(6); 797 __ bind(start); 798 __ cmp(count, 8); 799 __ br(Assembler::LO, small); 800 if (direction == copy_forwards) { 801 __ sub(s, s, 2 * wordSize); 802 __ sub(d, d, 2 * wordSize); 803 } 804 __ subs(count, count, 16); 805 __ br(Assembler::GE, large); 806 807 // 8 <= count < 16 words. Copy 8. 808 __ ldp(t0, t1, Address(s, 2 * unit)); 809 __ ldp(t2, t3, Address(s, 4 * unit)); 810 __ ldp(t4, t5, Address(s, 6 * unit)); 811 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 812 813 __ stp(t0, t1, Address(d, 2 * unit)); 814 __ stp(t2, t3, Address(d, 4 * unit)); 815 __ stp(t4, t5, Address(d, 6 * unit)); 816 __ stp(t6, t7, Address(__ pre(d, 8 * unit))); 817 818 if (direction == copy_forwards) { 819 __ add(s, s, 2 * wordSize); 820 __ add(d, d, 2 * wordSize); 821 } 822 823 { 824 Label L1, L2; 825 __ bind(small); 826 __ tbz(count, exact_log2(4), L1); 827 __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards))); 828 __ ldp(t2, t3, Address(__ adjust(s, 2 * unit, direction == copy_backwards))); 829 __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards))); 830 __ stp(t2, t3, Address(__ adjust(d, 2 * unit, direction == copy_backwards))); 831 __ bind(L1); 832 833 __ tbz(count, 1, L2); 834 __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards))); 835 __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards))); 836 __ bind(L2); 837 } 838 839 __ ret(lr); 840 841 __ align(6); 842 __ bind(large); 843 844 // Fill 8 registers 845 __ ldp(t0, t1, Address(s, 2 * unit)); 846 __ ldp(t2, t3, Address(s, 4 * unit)); 847 __ ldp(t4, t5, Address(s, 6 * unit)); 848 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 849 850 __ bind(again); 851 852 if (direction == copy_forwards && PrefetchCopyIntervalInBytes > 0) 853 __ prfm(Address(s, PrefetchCopyIntervalInBytes), PLDL1KEEP); 854 855 __ stp(t0, t1, Address(d, 2 * unit)); 856 __ ldp(t0, t1, Address(s, 2 * unit)); 857 __ stp(t2, t3, Address(d, 4 * unit)); 858 __ ldp(t2, t3, Address(s, 4 * unit)); 859 __ stp(t4, t5, Address(d, 6 * unit)); 860 __ ldp(t4, t5, Address(s, 6 * unit)); 861 __ stp(t6, t7, Address(__ pre(d, 8 * unit))); 862 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 863 864 __ subs(count, count, 8); 865 __ br(Assembler::HS, again); 866 867 // Drain 868 __ stp(t0, t1, Address(d, 2 * unit)); 869 __ stp(t2, t3, Address(d, 4 * unit)); 870 __ stp(t4, t5, Address(d, 6 * unit)); 871 __ stp(t6, t7, Address(__ pre(d, 8 * unit))); 872 873 if (direction == copy_forwards) { 874 __ add(s, s, 2 * wordSize); 875 __ add(d, d, 2 * wordSize); 876 } 877 878 { 879 Label L1, L2; 880 __ tbz(count, exact_log2(4), L1); 881 __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards))); 882 __ ldp(t2, t3, Address(__ adjust(s, 2 * unit, direction == copy_backwards))); 883 __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards))); 884 __ stp(t2, t3, Address(__ adjust(d, 2 * unit, direction == copy_backwards))); 885 __ bind(L1); 886 887 __ tbz(count, 1, L2); 888 __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards))); 889 __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards))); 890 __ bind(L2); 891 } 892 893 __ ret(lr); 894 } 895 896 // Small copy: less than 16 bytes. 897 // 898 // NB: Ignores all of the bits of count which represent more than 15 899 // bytes, so a caller doesn't have to mask them. 900 901 void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) { 902 bool is_backwards = step < 0; 903 size_t granularity = uabs(step); 904 int direction = is_backwards ? -1 : 1; 905 int unit = wordSize * direction; 906 907 Label Lpair, Lword, Lint, Lshort, Lbyte; 908 909 assert(granularity 910 && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small"); 911 912 const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6; 913 914 // ??? I don't know if this bit-test-and-branch is the right thing 915 // to do. It does a lot of jumping, resulting in several 916 // mispredicted branches. It might make more sense to do this 917 // with something like Duff's device with a single computed branch. 918 919 __ tbz(count, 3 - exact_log2(granularity), Lword); 920 __ ldr(tmp, Address(__ adjust(s, unit, is_backwards))); 921 __ str(tmp, Address(__ adjust(d, unit, is_backwards))); 922 __ bind(Lword); 923 924 if (granularity <= sizeof (jint)) { 925 __ tbz(count, 2 - exact_log2(granularity), Lint); 926 __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards))); 927 __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards))); 928 __ bind(Lint); 929 } 930 931 if (granularity <= sizeof (jshort)) { 932 __ tbz(count, 1 - exact_log2(granularity), Lshort); 933 __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards))); 934 __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards))); 935 __ bind(Lshort); 936 } 937 938 if (granularity <= sizeof (jbyte)) { 939 __ tbz(count, 0, Lbyte); 940 __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards))); 941 __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards))); 942 __ bind(Lbyte); 943 } 944 } 945 946 Label copy_f, copy_b; 947 948 // All-singing all-dancing memory copy. 949 // 950 // Copy count units of memory from s to d. The size of a unit is 951 // step, which can be positive or negative depending on the direction 952 // of copy. If is_aligned is false, we align the source address. 953 // 954 955 void copy_memory(bool is_aligned, Register s, Register d, 956 Register count, Register tmp, int step) { 957 copy_direction direction = step < 0 ? copy_backwards : copy_forwards; 958 bool is_backwards = step < 0; 959 int granularity = uabs(step); 960 const Register t0 = r3, t1 = r4; 961 962 if (is_backwards) { 963 __ lea(s, Address(s, count, Address::uxtw(exact_log2(-step)))); 964 __ lea(d, Address(d, count, Address::uxtw(exact_log2(-step)))); 965 } 966 967 Label done, tail; 968 969 __ cmp(count, 16/granularity); 970 __ br(Assembler::LO, tail); 971 972 // Now we've got the small case out of the way we can align the 973 // source address on a 2-word boundary. 974 975 Label aligned; 976 977 if (is_aligned) { 978 // We may have to adjust by 1 word to get s 2-word-aligned. 979 __ tbz(s, exact_log2(wordSize), aligned); 980 __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards))); 981 __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards))); 982 __ sub(count, count, wordSize/granularity); 983 } else { 984 if (is_backwards) { 985 __ andr(rscratch2, s, 2 * wordSize - 1); 986 } else { 987 __ neg(rscratch2, s); 988 __ andr(rscratch2, rscratch2, 2 * wordSize - 1); 989 } 990 // rscratch2 is the byte adjustment needed to align s. 991 __ cbz(rscratch2, aligned); 992 __ lsr(rscratch2, rscratch2, exact_log2(granularity)); 993 __ sub(count, count, rscratch2); 994 995 #if 0 996 // ?? This code is only correct for a disjoint copy. It may or 997 // may not make sense to use it in that case. 998 999 // Copy the first pair; s and d may not be aligned. 1000 __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0)); 1001 __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0)); 1002 1003 // Align s and d, adjust count 1004 if (is_backwards) { 1005 __ sub(s, s, rscratch2); 1006 __ sub(d, d, rscratch2); 1007 } else { 1008 __ add(s, s, rscratch2); 1009 __ add(d, d, rscratch2); 1010 } 1011 #else 1012 copy_memory_small(s, d, rscratch2, rscratch1, step); 1013 #endif 1014 } 1015 1016 __ cmp(count, 16/granularity); 1017 __ br(Assembler::LT, tail); 1018 __ bind(aligned); 1019 1020 // s is now 2-word-aligned. 1021 1022 // We have a count of units and some trailing bytes. Adjust the 1023 // count and do a bulk copy of words. 1024 __ lsr(rscratch2, count, exact_log2(wordSize/granularity)); 1025 if (direction == copy_forwards) 1026 __ bl(copy_f); 1027 else 1028 __ bl(copy_b); 1029 1030 // And the tail. 1031 1032 __ bind(tail); 1033 copy_memory_small(s, d, count, tmp, step); 1034 } 1035 1036 1037 void clobber_registers() { 1038 #ifdef ASSERT 1039 __ mov(rscratch1, (uint64_t)0xdeadbeef); 1040 __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32); 1041 for (Register r = r3; r <= r18; r++) 1042 if (r != rscratch1) __ mov(r, rscratch1); 1043 #endif 1044 } 1045 1046 // Scan over array at a for count oops, verifying each one. 1047 // Preserves a and count, clobbers rscratch1 and rscratch2. 1048 void verify_oop_array (size_t size, Register a, Register count, Register temp) { 1049 Label loop, end; 1050 __ mov(rscratch1, a); 1051 __ mov(rscratch2, zr); 1052 __ bind(loop); 1053 __ cmp(rscratch2, count); 1054 __ br(Assembler::HS, end); 1055 if (size == (size_t)wordSize) { 1056 __ ldr(temp, Address(a, rscratch2, Address::uxtw(exact_log2(size)))); 1057 __ verify_oop(temp); 1058 } else { 1059 __ ldrw(r16, Address(a, rscratch2, Address::uxtw(exact_log2(size)))); 1060 __ decode_heap_oop(temp); // calls verify_oop 1061 } 1062 __ add(rscratch2, rscratch2, size); 1063 __ b(loop); 1064 __ bind(end); 1065 } 1066 1067 // Arguments: 1068 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1069 // ignored 1070 // is_oop - true => oop array, so generate store check code 1071 // name - stub name string 1072 // 1073 // Inputs: 1074 // c_rarg0 - source array address 1075 // c_rarg1 - destination array address 1076 // c_rarg2 - element count, treated as ssize_t, can be zero 1077 // 1078 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1079 // the hardware handle it. The two dwords within qwords that span 1080 // cache line boundaries will still be loaded and stored atomicly. 1081 // 1082 // Side Effects: 1083 // disjoint_int_copy_entry is set to the no-overlap entry point 1084 // used by generate_conjoint_int_oop_copy(). 1085 // 1086 address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry, 1087 const char *name, bool dest_uninitialized = false) { 1088 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1089 __ align(CodeEntryAlignment); 1090 StubCodeMark mark(this, "StubRoutines", name); 1091 address start = __ pc(); 1092 if (entry != NULL) { 1093 *entry = __ pc(); 1094 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1095 BLOCK_COMMENT("Entry:"); 1096 } 1097 __ enter(); 1098 if (is_oop) { 1099 __ push(RegSet::of(d, count), sp); 1100 // no registers are destroyed by this call 1101 gen_write_ref_array_pre_barrier(d, count, dest_uninitialized); 1102 } 1103 copy_memory(aligned, s, d, count, rscratch1, size); 1104 if (is_oop) { 1105 __ pop(RegSet::of(d, count), sp); 1106 if (VerifyOops) 1107 verify_oop_array(size, d, count, r16); 1108 __ sub(count, count, 1); // make an inclusive end pointer 1109 __ lea(count, Address(d, count, Address::uxtw(exact_log2(size)))); 1110 gen_write_ref_array_post_barrier(d, count, rscratch1); 1111 } 1112 __ leave(); 1113 __ ret(lr); 1114 #ifdef BUILTIN_SIM 1115 { 1116 AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck); 1117 sim->notifyCompile(const_cast<char*>(name), start); 1118 } 1119 #endif 1120 return start; 1121 } 1122 1123 // Arguments: 1124 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1125 // ignored 1126 // is_oop - true => oop array, so generate store check code 1127 // name - stub name string 1128 // 1129 // Inputs: 1130 // c_rarg0 - source array address 1131 // c_rarg1 - destination array address 1132 // c_rarg2 - element count, treated as ssize_t, can be zero 1133 // 1134 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1135 // the hardware handle it. The two dwords within qwords that span 1136 // cache line boundaries will still be loaded and stored atomicly. 1137 // 1138 address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target, 1139 address *entry, const char *name, 1140 bool dest_uninitialized = false) { 1141 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1142 1143 StubCodeMark mark(this, "StubRoutines", name); 1144 address start = __ pc(); 1145 1146 __ cmp(d, s); 1147 __ br(Assembler::LS, nooverlap_target); 1148 1149 __ enter(); 1150 if (is_oop) { 1151 __ push(RegSet::of(d, count), sp); 1152 // no registers are destroyed by this call 1153 gen_write_ref_array_pre_barrier(d, count, dest_uninitialized); 1154 } 1155 copy_memory(aligned, s, d, count, rscratch1, -size); 1156 if (is_oop) { 1157 __ pop(RegSet::of(d, count), sp); 1158 if (VerifyOops) 1159 verify_oop_array(size, d, count, r16); 1160 __ sub(count, count, 1); // make an inclusive end pointer 1161 __ lea(count, Address(d, count, Address::uxtw(exact_log2(size)))); 1162 gen_write_ref_array_post_barrier(d, count, rscratch1); 1163 } 1164 __ leave(); 1165 __ ret(lr); 1166 #ifdef BUILTIN_SIM 1167 { 1168 AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck); 1169 sim->notifyCompile(const_cast<char*>(name), start); 1170 } 1171 #endif 1172 return start; 1173 } 1174 1175 // Arguments: 1176 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1177 // ignored 1178 // name - stub name string 1179 // 1180 // Inputs: 1181 // c_rarg0 - source array address 1182 // c_rarg1 - destination array address 1183 // c_rarg2 - element count, treated as ssize_t, can be zero 1184 // 1185 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1186 // we let the hardware handle it. The one to eight bytes within words, 1187 // dwords or qwords that span cache line boundaries will still be loaded 1188 // and stored atomically. 1189 // 1190 // Side Effects: 1191 // disjoint_byte_copy_entry is set to the no-overlap entry point // 1192 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1193 // we let the hardware handle it. The one to eight bytes within words, 1194 // dwords or qwords that span cache line boundaries will still be loaded 1195 // and stored atomically. 1196 // 1197 // Side Effects: 1198 // disjoint_byte_copy_entry is set to the no-overlap entry point 1199 // used by generate_conjoint_byte_copy(). 1200 // 1201 address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) { 1202 const bool not_oop = false; 1203 return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name); 1204 } 1205 1206 // Arguments: 1207 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1208 // ignored 1209 // name - stub name string 1210 // 1211 // Inputs: 1212 // c_rarg0 - source array address 1213 // c_rarg1 - destination array address 1214 // c_rarg2 - element count, treated as ssize_t, can be zero 1215 // 1216 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1217 // we let the hardware handle it. The one to eight bytes within words, 1218 // dwords or qwords that span cache line boundaries will still be loaded 1219 // and stored atomically. 1220 // 1221 address generate_conjoint_byte_copy(bool aligned, address nooverlap_target, 1222 address* entry, const char *name) { 1223 const bool not_oop = false; 1224 return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name); 1225 } 1226 1227 // Arguments: 1228 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1229 // ignored 1230 // name - stub name string 1231 // 1232 // Inputs: 1233 // c_rarg0 - source array address 1234 // c_rarg1 - destination array address 1235 // c_rarg2 - element count, treated as ssize_t, can be zero 1236 // 1237 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1238 // let the hardware handle it. The two or four words within dwords 1239 // or qwords that span cache line boundaries will still be loaded 1240 // and stored atomically. 1241 // 1242 // Side Effects: 1243 // disjoint_short_copy_entry is set to the no-overlap entry point 1244 // used by generate_conjoint_short_copy(). 1245 // 1246 address generate_disjoint_short_copy(bool aligned, 1247 address* entry, const char *name) { 1248 const bool not_oop = false; 1249 return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name); 1250 } 1251 1252 // Arguments: 1253 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1254 // ignored 1255 // name - stub name string 1256 // 1257 // Inputs: 1258 // c_rarg0 - source array address 1259 // c_rarg1 - destination array address 1260 // c_rarg2 - element count, treated as ssize_t, can be zero 1261 // 1262 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1263 // let the hardware handle it. The two or four words within dwords 1264 // or qwords that span cache line boundaries will still be loaded 1265 // and stored atomically. 1266 // 1267 address generate_conjoint_short_copy(bool aligned, address nooverlap_target, 1268 address *entry, const char *name) { 1269 const bool not_oop = false; 1270 return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name); 1271 1272 } 1273 // Arguments: 1274 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1275 // ignored 1276 // name - stub name string 1277 // 1278 // Inputs: 1279 // c_rarg0 - source array address 1280 // c_rarg1 - destination array address 1281 // c_rarg2 - element count, treated as ssize_t, can be zero 1282 // 1283 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1284 // the hardware handle it. The two dwords within qwords that span 1285 // cache line boundaries will still be loaded and stored atomicly. 1286 // 1287 // Side Effects: 1288 // disjoint_int_copy_entry is set to the no-overlap entry point 1289 // used by generate_conjoint_int_oop_copy(). 1290 // 1291 address generate_disjoint_int_copy(bool aligned, address *entry, 1292 const char *name, bool dest_uninitialized = false) { 1293 const bool not_oop = false; 1294 return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name); 1295 } 1296 1297 // Arguments: 1298 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1299 // ignored 1300 // name - stub name string 1301 // 1302 // Inputs: 1303 // c_rarg0 - source array address 1304 // c_rarg1 - destination array address 1305 // c_rarg2 - element count, treated as ssize_t, can be zero 1306 // 1307 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1308 // the hardware handle it. The two dwords within qwords that span 1309 // cache line boundaries will still be loaded and stored atomicly. 1310 // 1311 address generate_conjoint_int_copy(bool aligned, address nooverlap_target, 1312 address *entry, const char *name, 1313 bool dest_uninitialized = false) { 1314 const bool not_oop = false; 1315 return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name); 1316 } 1317 1318 1319 // Arguments: 1320 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1321 // ignored 1322 // name - stub name string 1323 // 1324 // Inputs: 1325 // c_rarg0 - source array address 1326 // c_rarg1 - destination array address 1327 // c_rarg2 - element count, treated as size_t, can be zero 1328 // 1329 // Side Effects: 1330 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1331 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1332 // 1333 address generate_disjoint_long_copy(bool aligned, address *entry, 1334 const char *name, bool dest_uninitialized = false) { 1335 const bool not_oop = false; 1336 return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name); 1337 } 1338 1339 // Arguments: 1340 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1341 // ignored 1342 // name - stub name string 1343 // 1344 // Inputs: 1345 // c_rarg0 - source array address 1346 // c_rarg1 - destination array address 1347 // c_rarg2 - element count, treated as size_t, can be zero 1348 // 1349 address generate_conjoint_long_copy(bool aligned, 1350 address nooverlap_target, address *entry, 1351 const char *name, bool dest_uninitialized = false) { 1352 const bool not_oop = false; 1353 return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name); 1354 } 1355 1356 // Arguments: 1357 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1358 // ignored 1359 // name - stub name string 1360 // 1361 // Inputs: 1362 // c_rarg0 - source array address 1363 // c_rarg1 - destination array address 1364 // c_rarg2 - element count, treated as size_t, can be zero 1365 // 1366 // Side Effects: 1367 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1368 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1369 // 1370 address generate_disjoint_oop_copy(bool aligned, address *entry, 1371 const char *name, bool dest_uninitialized = false) { 1372 const bool is_oop = true; 1373 const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1374 return generate_disjoint_copy(size, aligned, is_oop, entry, name); 1375 } 1376 1377 // Arguments: 1378 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1379 // ignored 1380 // name - stub name string 1381 // 1382 // Inputs: 1383 // c_rarg0 - source array address 1384 // c_rarg1 - destination array address 1385 // c_rarg2 - element count, treated as size_t, can be zero 1386 // 1387 address generate_conjoint_oop_copy(bool aligned, 1388 address nooverlap_target, address *entry, 1389 const char *name, bool dest_uninitialized = false) { 1390 const bool is_oop = true; 1391 const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1392 return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry, name); 1393 } 1394 1395 1396 // Helper for generating a dynamic type check. 1397 // Smashes rscratch1. 1398 void generate_type_check(Register sub_klass, 1399 Register super_check_offset, 1400 Register super_klass, 1401 Label& L_success) { 1402 assert_different_registers(sub_klass, super_check_offset, super_klass); 1403 1404 BLOCK_COMMENT("type_check:"); 1405 1406 Label L_miss; 1407 1408 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL, 1409 super_check_offset); 1410 __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL); 1411 1412 // Fall through on failure! 1413 __ BIND(L_miss); 1414 } 1415 1416 // 1417 // Generate checkcasting array copy stub 1418 // 1419 // Input: 1420 // c_rarg0 - source array address 1421 // c_rarg1 - destination array address 1422 // c_rarg2 - element count, treated as ssize_t, can be zero 1423 // c_rarg3 - size_t ckoff (super_check_offset) 1424 // c_rarg4 - oop ckval (super_klass) 1425 // 1426 // Output: 1427 // r0 == 0 - success 1428 // r0 == -1^K - failure, where K is partial transfer count 1429 // 1430 address generate_checkcast_copy(const char *name, address *entry, 1431 bool dest_uninitialized = false) { 1432 1433 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop; 1434 1435 // Input registers (after setup_arg_regs) 1436 const Register from = c_rarg0; // source array address 1437 const Register to = c_rarg1; // destination array address 1438 const Register count = c_rarg2; // elementscount 1439 const Register ckoff = c_rarg3; // super_check_offset 1440 const Register ckval = c_rarg4; // super_klass 1441 1442 // Registers used as temps (r18, r19, r20 are save-on-entry) 1443 const Register count_save = r21; // orig elementscount 1444 const Register start_to = r20; // destination array start address 1445 const Register copied_oop = r18; // actual oop copied 1446 const Register r19_klass = r19; // oop._klass 1447 1448 //--------------------------------------------------------------- 1449 // Assembler stub will be used for this call to arraycopy 1450 // if the two arrays are subtypes of Object[] but the 1451 // destination array type is not equal to or a supertype 1452 // of the source type. Each element must be separately 1453 // checked. 1454 1455 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1456 copied_oop, r19_klass, count_save); 1457 1458 __ align(CodeEntryAlignment); 1459 StubCodeMark mark(this, "StubRoutines", name); 1460 address start = __ pc(); 1461 1462 __ enter(); // required for proper stackwalking of RuntimeStub frame 1463 1464 #ifdef ASSERT 1465 // caller guarantees that the arrays really are different 1466 // otherwise, we would have to make conjoint checks 1467 { Label L; 1468 array_overlap_test(L, TIMES_OOP); 1469 __ stop("checkcast_copy within a single array"); 1470 __ bind(L); 1471 } 1472 #endif //ASSERT 1473 1474 // Caller of this entry point must set up the argument registers. 1475 if (entry != NULL) { 1476 *entry = __ pc(); 1477 BLOCK_COMMENT("Entry:"); 1478 } 1479 1480 // Empty array: Nothing to do. 1481 __ cbz(count, L_done); 1482 1483 __ push(RegSet::of(r18, r19, r20, r21), sp); 1484 1485 #ifdef ASSERT 1486 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1487 // The ckoff and ckval must be mutually consistent, 1488 // even though caller generates both. 1489 { Label L; 1490 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1491 __ ldrw(start_to, Address(ckval, sco_offset)); 1492 __ cmpw(ckoff, start_to); 1493 __ br(Assembler::EQ, L); 1494 __ stop("super_check_offset inconsistent"); 1495 __ bind(L); 1496 } 1497 #endif //ASSERT 1498 1499 // save the original count 1500 __ mov(count_save, count); 1501 1502 // Copy from low to high addresses 1503 __ mov(start_to, to); // Save destination array start address 1504 __ b(L_load_element); 1505 1506 // ======== begin loop ======== 1507 // (Loop is rotated; its entry is L_load_element.) 1508 // Loop control: 1509 // for (; count != 0; count--) { 1510 // copied_oop = load_heap_oop(from++); 1511 // ... generate_type_check ...; 1512 // store_heap_oop(to++, copied_oop); 1513 // } 1514 __ align(OptoLoopAlignment); 1515 1516 __ BIND(L_store_element); 1517 __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop); // store the oop 1518 __ sub(count, count, 1); 1519 __ cbz(count, L_do_card_marks); 1520 1521 // ======== loop entry is here ======== 1522 __ BIND(L_load_element); 1523 __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop 1524 __ cbz(copied_oop, L_store_element); 1525 1526 __ load_klass(r19_klass, copied_oop);// query the object klass 1527 generate_type_check(r19_klass, ckoff, ckval, L_store_element); 1528 // ======== end loop ======== 1529 1530 // It was a real error; we must depend on the caller to finish the job. 1531 // Register count = remaining oops, count_orig = total oops. 1532 // Emit GC store barriers for the oops we have copied and report 1533 // their number to the caller. 1534 1535 __ subs(count, count_save, count); // K = partially copied oop count 1536 __ eon(count, count, zr); // report (-1^K) to caller 1537 __ br(Assembler::EQ, L_done_pop); 1538 1539 __ BIND(L_do_card_marks); 1540 __ add(to, to, -heapOopSize); // make an inclusive end pointer 1541 gen_write_ref_array_post_barrier(start_to, to, rscratch1); 1542 1543 __ bind(L_done_pop); 1544 __ pop(RegSet::of(r18, r19, r20, r21), sp); 1545 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 1546 1547 __ bind(L_done); 1548 __ mov(r0, count); 1549 __ leave(); 1550 __ ret(lr); 1551 1552 return start; 1553 } 1554 1555 // Perform range checks on the proposed arraycopy. 1556 // Kills temp, but nothing else. 1557 // Also, clean the sign bits of src_pos and dst_pos. 1558 void arraycopy_range_checks(Register src, // source array oop (c_rarg0) 1559 Register src_pos, // source position (c_rarg1) 1560 Register dst, // destination array oo (c_rarg2) 1561 Register dst_pos, // destination position (c_rarg3) 1562 Register length, 1563 Register temp, 1564 Label& L_failed) { Unimplemented(); } 1565 1566 // These stubs get called from some dumb test routine. 1567 // I'll write them properly when they're called from 1568 // something that's actually doing something. 1569 static void fake_arraycopy_stub(address src, address dst, int count) { 1570 assert(count == 0, "huh?"); 1571 } 1572 1573 1574 void generate_arraycopy_stubs() { 1575 address entry; 1576 address entry_jbyte_arraycopy; 1577 address entry_jshort_arraycopy; 1578 address entry_jint_arraycopy; 1579 address entry_oop_arraycopy; 1580 address entry_jlong_arraycopy; 1581 address entry_checkcast_arraycopy; 1582 1583 generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards); 1584 generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards); 1585 1586 //*** jbyte 1587 // Always need aligned and unaligned versions 1588 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 1589 "jbyte_disjoint_arraycopy"); 1590 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 1591 &entry_jbyte_arraycopy, 1592 "jbyte_arraycopy"); 1593 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 1594 "arrayof_jbyte_disjoint_arraycopy"); 1595 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL, 1596 "arrayof_jbyte_arraycopy"); 1597 1598 //*** jshort 1599 // Always need aligned and unaligned versions 1600 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 1601 "jshort_disjoint_arraycopy"); 1602 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 1603 &entry_jshort_arraycopy, 1604 "jshort_arraycopy"); 1605 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, 1606 "arrayof_jshort_disjoint_arraycopy"); 1607 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL, 1608 "arrayof_jshort_arraycopy"); 1609 1610 //*** jint 1611 // Aligned versions 1612 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 1613 "arrayof_jint_disjoint_arraycopy"); 1614 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 1615 "arrayof_jint_arraycopy"); 1616 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 1617 // entry_jint_arraycopy always points to the unaligned version 1618 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 1619 "jint_disjoint_arraycopy"); 1620 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 1621 &entry_jint_arraycopy, 1622 "jint_arraycopy"); 1623 1624 //*** jlong 1625 // It is always aligned 1626 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 1627 "arrayof_jlong_disjoint_arraycopy"); 1628 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 1629 "arrayof_jlong_arraycopy"); 1630 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 1631 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 1632 1633 //*** oops 1634 { 1635 // With compressed oops we need unaligned versions; notice that 1636 // we overwrite entry_oop_arraycopy. 1637 bool aligned = !UseCompressedOops; 1638 1639 StubRoutines::_arrayof_oop_disjoint_arraycopy 1640 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy"); 1641 StubRoutines::_arrayof_oop_arraycopy 1642 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy"); 1643 // Aligned versions without pre-barriers 1644 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 1645 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 1646 /*dest_uninitialized*/true); 1647 StubRoutines::_arrayof_oop_arraycopy_uninit 1648 = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit", 1649 /*dest_uninitialized*/true); 1650 } 1651 1652 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 1653 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 1654 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 1655 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 1656 1657 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 1658 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, 1659 /*dest_uninitialized*/true); 1660 } 1661 1662 // Arguments: 1663 // 1664 // Inputs: 1665 // c_rarg0 - source byte array address 1666 // c_rarg1 - destination byte array address 1667 // c_rarg2 - K (key) in little endian int array 1668 // 1669 address generate_aescrypt_encryptBlock() { 1670 __ align(CodeEntryAlignment); 1671 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 1672 1673 Label L_doLast; 1674 1675 const Register from = c_rarg0; // source array address 1676 const Register to = c_rarg1; // destination array address 1677 const Register key = c_rarg2; // key array address 1678 const Register keylen = rscratch1; 1679 1680 address start = __ pc(); 1681 __ enter(); 1682 1683 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 1684 1685 __ ld1(v0, __ T16B, from); // get 16 bytes of input 1686 1687 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 1688 __ rev32(v1, __ T16B, v1); 1689 __ rev32(v2, __ T16B, v2); 1690 __ rev32(v3, __ T16B, v3); 1691 __ rev32(v4, __ T16B, v4); 1692 __ aese(v0, v1); 1693 __ aesmc(v0, v0); 1694 __ aese(v0, v2); 1695 __ aesmc(v0, v0); 1696 __ aese(v0, v3); 1697 __ aesmc(v0, v0); 1698 __ aese(v0, v4); 1699 __ aesmc(v0, v0); 1700 1701 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 1702 __ rev32(v1, __ T16B, v1); 1703 __ rev32(v2, __ T16B, v2); 1704 __ rev32(v3, __ T16B, v3); 1705 __ rev32(v4, __ T16B, v4); 1706 __ aese(v0, v1); 1707 __ aesmc(v0, v0); 1708 __ aese(v0, v2); 1709 __ aesmc(v0, v0); 1710 __ aese(v0, v3); 1711 __ aesmc(v0, v0); 1712 __ aese(v0, v4); 1713 __ aesmc(v0, v0); 1714 1715 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 1716 __ rev32(v1, __ T16B, v1); 1717 __ rev32(v2, __ T16B, v2); 1718 1719 __ cmpw(keylen, 44); 1720 __ br(Assembler::EQ, L_doLast); 1721 1722 __ aese(v0, v1); 1723 __ aesmc(v0, v0); 1724 __ aese(v0, v2); 1725 __ aesmc(v0, v0); 1726 1727 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 1728 __ rev32(v1, __ T16B, v1); 1729 __ rev32(v2, __ T16B, v2); 1730 1731 __ cmpw(keylen, 52); 1732 __ br(Assembler::EQ, L_doLast); 1733 1734 __ aese(v0, v1); 1735 __ aesmc(v0, v0); 1736 __ aese(v0, v2); 1737 __ aesmc(v0, v0); 1738 1739 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 1740 __ rev32(v1, __ T16B, v1); 1741 __ rev32(v2, __ T16B, v2); 1742 1743 __ BIND(L_doLast); 1744 1745 __ aese(v0, v1); 1746 __ aesmc(v0, v0); 1747 __ aese(v0, v2); 1748 1749 __ ld1(v1, __ T16B, key); 1750 __ rev32(v1, __ T16B, v1); 1751 __ eor(v0, __ T16B, v0, v1); 1752 1753 __ st1(v0, __ T16B, to); 1754 1755 __ mov(r0, 0); 1756 1757 __ leave(); 1758 __ ret(lr); 1759 1760 return start; 1761 } 1762 1763 // Arguments: 1764 // 1765 // Inputs: 1766 // c_rarg0 - source byte array address 1767 // c_rarg1 - destination byte array address 1768 // c_rarg2 - K (key) in little endian int array 1769 // 1770 address generate_aescrypt_decryptBlock() { 1771 assert(UseAES, "need AES instructions and misaligned SSE support"); 1772 __ align(CodeEntryAlignment); 1773 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 1774 Label L_doLast; 1775 1776 const Register from = c_rarg0; // source array address 1777 const Register to = c_rarg1; // destination array address 1778 const Register key = c_rarg2; // key array address 1779 const Register keylen = rscratch1; 1780 1781 address start = __ pc(); 1782 __ enter(); // required for proper stackwalking of RuntimeStub frame 1783 1784 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 1785 1786 __ ld1(v0, __ T16B, from); // get 16 bytes of input 1787 1788 __ ld1(v5, __ T16B, __ post(key, 16)); 1789 __ rev32(v5, __ T16B, v5); 1790 1791 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 1792 __ rev32(v1, __ T16B, v1); 1793 __ rev32(v2, __ T16B, v2); 1794 __ rev32(v3, __ T16B, v3); 1795 __ rev32(v4, __ T16B, v4); 1796 __ aesd(v0, v1); 1797 __ aesimc(v0, v0); 1798 __ aesd(v0, v2); 1799 __ aesimc(v0, v0); 1800 __ aesd(v0, v3); 1801 __ aesimc(v0, v0); 1802 __ aesd(v0, v4); 1803 __ aesimc(v0, v0); 1804 1805 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 1806 __ rev32(v1, __ T16B, v1); 1807 __ rev32(v2, __ T16B, v2); 1808 __ rev32(v3, __ T16B, v3); 1809 __ rev32(v4, __ T16B, v4); 1810 __ aesd(v0, v1); 1811 __ aesimc(v0, v0); 1812 __ aesd(v0, v2); 1813 __ aesimc(v0, v0); 1814 __ aesd(v0, v3); 1815 __ aesimc(v0, v0); 1816 __ aesd(v0, v4); 1817 __ aesimc(v0, v0); 1818 1819 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 1820 __ rev32(v1, __ T16B, v1); 1821 __ rev32(v2, __ T16B, v2); 1822 1823 __ cmpw(keylen, 44); 1824 __ br(Assembler::EQ, L_doLast); 1825 1826 __ aesd(v0, v1); 1827 __ aesimc(v0, v0); 1828 __ aesd(v0, v2); 1829 __ aesimc(v0, v0); 1830 1831 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 1832 __ rev32(v1, __ T16B, v1); 1833 __ rev32(v2, __ T16B, v2); 1834 1835 __ cmpw(keylen, 52); 1836 __ br(Assembler::EQ, L_doLast); 1837 1838 __ aesd(v0, v1); 1839 __ aesimc(v0, v0); 1840 __ aesd(v0, v2); 1841 __ aesimc(v0, v0); 1842 1843 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 1844 __ rev32(v1, __ T16B, v1); 1845 __ rev32(v2, __ T16B, v2); 1846 1847 __ BIND(L_doLast); 1848 1849 __ aesd(v0, v1); 1850 __ aesimc(v0, v0); 1851 __ aesd(v0, v2); 1852 1853 __ eor(v0, __ T16B, v0, v5); 1854 1855 __ st1(v0, __ T16B, to); 1856 1857 __ mov(r0, 0); 1858 1859 __ leave(); 1860 __ ret(lr); 1861 1862 return start; 1863 } 1864 1865 // Arguments: 1866 // 1867 // Inputs: 1868 // c_rarg0 - source byte array address 1869 // c_rarg1 - destination byte array address 1870 // c_rarg2 - K (key) in little endian int array 1871 // c_rarg3 - r vector byte array address 1872 // c_rarg4 - input length 1873 // 1874 // Output: 1875 // x0 - input length 1876 // 1877 address generate_cipherBlockChaining_encryptAESCrypt() { 1878 assert(UseAES, "need AES instructions and misaligned SSE support"); 1879 __ align(CodeEntryAlignment); 1880 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 1881 1882 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 1883 1884 const Register from = c_rarg0; // source array address 1885 const Register to = c_rarg1; // destination array address 1886 const Register key = c_rarg2; // key array address 1887 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 1888 // and left with the results of the last encryption block 1889 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 1890 const Register keylen = rscratch1; 1891 1892 address start = __ pc(); 1893 __ enter(); 1894 1895 __ mov(rscratch2, len_reg); 1896 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 1897 1898 __ ld1(v0, __ T16B, rvec); 1899 1900 __ cmpw(keylen, 52); 1901 __ br(Assembler::CC, L_loadkeys_44); 1902 __ br(Assembler::EQ, L_loadkeys_52); 1903 1904 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 1905 __ rev32(v17, __ T16B, v17); 1906 __ rev32(v18, __ T16B, v18); 1907 __ BIND(L_loadkeys_52); 1908 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 1909 __ rev32(v19, __ T16B, v19); 1910 __ rev32(v20, __ T16B, v20); 1911 __ BIND(L_loadkeys_44); 1912 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 1913 __ rev32(v21, __ T16B, v21); 1914 __ rev32(v22, __ T16B, v22); 1915 __ rev32(v23, __ T16B, v23); 1916 __ rev32(v24, __ T16B, v24); 1917 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 1918 __ rev32(v25, __ T16B, v25); 1919 __ rev32(v26, __ T16B, v26); 1920 __ rev32(v27, __ T16B, v27); 1921 __ rev32(v28, __ T16B, v28); 1922 __ ld1(v29, v30, v31, __ T16B, key); 1923 __ rev32(v29, __ T16B, v29); 1924 __ rev32(v30, __ T16B, v30); 1925 __ rev32(v31, __ T16B, v31); 1926 1927 __ BIND(L_aes_loop); 1928 __ ld1(v1, __ T16B, __ post(from, 16)); 1929 __ eor(v0, __ T16B, v0, v1); 1930 1931 __ br(Assembler::CC, L_rounds_44); 1932 __ br(Assembler::EQ, L_rounds_52); 1933 1934 __ aese(v0, v17); __ aesmc(v0, v0); 1935 __ aese(v0, v18); __ aesmc(v0, v0); 1936 __ BIND(L_rounds_52); 1937 __ aese(v0, v19); __ aesmc(v0, v0); 1938 __ aese(v0, v20); __ aesmc(v0, v0); 1939 __ BIND(L_rounds_44); 1940 __ aese(v0, v21); __ aesmc(v0, v0); 1941 __ aese(v0, v22); __ aesmc(v0, v0); 1942 __ aese(v0, v23); __ aesmc(v0, v0); 1943 __ aese(v0, v24); __ aesmc(v0, v0); 1944 __ aese(v0, v25); __ aesmc(v0, v0); 1945 __ aese(v0, v26); __ aesmc(v0, v0); 1946 __ aese(v0, v27); __ aesmc(v0, v0); 1947 __ aese(v0, v28); __ aesmc(v0, v0); 1948 __ aese(v0, v29); __ aesmc(v0, v0); 1949 __ aese(v0, v30); 1950 __ eor(v0, __ T16B, v0, v31); 1951 1952 __ st1(v0, __ T16B, __ post(to, 16)); 1953 __ sub(len_reg, len_reg, 16); 1954 __ cbnz(len_reg, L_aes_loop); 1955 1956 __ st1(v0, __ T16B, rvec); 1957 1958 __ mov(r0, rscratch2); 1959 1960 __ leave(); 1961 __ ret(lr); 1962 1963 return start; 1964 } 1965 1966 // Arguments: 1967 // 1968 // Inputs: 1969 // c_rarg0 - source byte array address 1970 // c_rarg1 - destination byte array address 1971 // c_rarg2 - K (key) in little endian int array 1972 // c_rarg3 - r vector byte array address 1973 // c_rarg4 - input length 1974 // 1975 // Output: 1976 // rax - input length 1977 // 1978 address generate_cipherBlockChaining_decryptAESCrypt() { 1979 assert(UseAES, "need AES instructions and misaligned SSE support"); 1980 __ align(CodeEntryAlignment); 1981 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 1982 1983 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 1984 1985 const Register from = c_rarg0; // source array address 1986 const Register to = c_rarg1; // destination array address 1987 const Register key = c_rarg2; // key array address 1988 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 1989 // and left with the results of the last encryption block 1990 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 1991 const Register keylen = rscratch1; 1992 1993 address start = __ pc(); 1994 __ enter(); 1995 1996 __ mov(rscratch2, len_reg); 1997 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 1998 1999 __ ld1(v2, __ T16B, rvec); 2000 2001 __ ld1(v31, __ T16B, __ post(key, 16)); 2002 __ rev32(v31, __ T16B, v31); 2003 2004 __ cmpw(keylen, 52); 2005 __ br(Assembler::CC, L_loadkeys_44); 2006 __ br(Assembler::EQ, L_loadkeys_52); 2007 2008 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2009 __ rev32(v17, __ T16B, v17); 2010 __ rev32(v18, __ T16B, v18); 2011 __ BIND(L_loadkeys_52); 2012 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2013 __ rev32(v19, __ T16B, v19); 2014 __ rev32(v20, __ T16B, v20); 2015 __ BIND(L_loadkeys_44); 2016 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2017 __ rev32(v21, __ T16B, v21); 2018 __ rev32(v22, __ T16B, v22); 2019 __ rev32(v23, __ T16B, v23); 2020 __ rev32(v24, __ T16B, v24); 2021 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2022 __ rev32(v25, __ T16B, v25); 2023 __ rev32(v26, __ T16B, v26); 2024 __ rev32(v27, __ T16B, v27); 2025 __ rev32(v28, __ T16B, v28); 2026 __ ld1(v29, v30, __ T16B, key); 2027 __ rev32(v29, __ T16B, v29); 2028 __ rev32(v30, __ T16B, v30); 2029 2030 __ BIND(L_aes_loop); 2031 __ ld1(v0, __ T16B, __ post(from, 16)); 2032 __ orr(v1, __ T16B, v0, v0); 2033 2034 __ br(Assembler::CC, L_rounds_44); 2035 __ br(Assembler::EQ, L_rounds_52); 2036 2037 __ aesd(v0, v17); __ aesimc(v0, v0); 2038 __ aesd(v0, v17); __ aesimc(v0, v0); 2039 __ BIND(L_rounds_52); 2040 __ aesd(v0, v19); __ aesimc(v0, v0); 2041 __ aesd(v0, v20); __ aesimc(v0, v0); 2042 __ BIND(L_rounds_44); 2043 __ aesd(v0, v21); __ aesimc(v0, v0); 2044 __ aesd(v0, v22); __ aesimc(v0, v0); 2045 __ aesd(v0, v23); __ aesimc(v0, v0); 2046 __ aesd(v0, v24); __ aesimc(v0, v0); 2047 __ aesd(v0, v25); __ aesimc(v0, v0); 2048 __ aesd(v0, v26); __ aesimc(v0, v0); 2049 __ aesd(v0, v27); __ aesimc(v0, v0); 2050 __ aesd(v0, v28); __ aesimc(v0, v0); 2051 __ aesd(v0, v29); __ aesimc(v0, v0); 2052 __ aesd(v0, v30); 2053 __ eor(v0, __ T16B, v0, v31); 2054 __ eor(v0, __ T16B, v0, v2); 2055 2056 __ st1(v0, __ T16B, __ post(to, 16)); 2057 __ orr(v2, __ T16B, v1, v1); 2058 2059 __ sub(len_reg, len_reg, 16); 2060 __ cbnz(len_reg, L_aes_loop); 2061 2062 __ st1(v2, __ T16B, rvec); 2063 2064 __ mov(r0, rscratch2); 2065 2066 __ leave(); 2067 __ ret(lr); 2068 2069 return start; 2070 } 2071 2072 // Arguments: 2073 // 2074 // Inputs: 2075 // c_rarg0 - byte[] source+offset 2076 // c_rarg1 - int[] SHA.state 2077 // c_rarg2 - int offset 2078 // c_rarg3 - int limit 2079 // 2080 address generate_sha1_implCompress(bool multi_block, const char *name) { 2081 __ align(CodeEntryAlignment); 2082 StubCodeMark mark(this, "StubRoutines", name); 2083 address start = __ pc(); 2084 2085 Register buf = c_rarg0; 2086 Register state = c_rarg1; 2087 Register ofs = c_rarg2; 2088 Register limit = c_rarg3; 2089 2090 Label keys; 2091 Label sha1_loop; 2092 2093 // load the keys into v0..v3 2094 __ adr(rscratch1, keys); 2095 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 2096 // load 5 words state into v6, v7 2097 __ ldrq(v6, Address(state, 0)); 2098 __ ldrs(v7, Address(state, 16)); 2099 2100 2101 __ BIND(sha1_loop); 2102 // load 64 bytes of data into v16..v19 2103 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 2104 __ rev32(v16, __ T16B, v16); 2105 __ rev32(v17, __ T16B, v17); 2106 __ rev32(v18, __ T16B, v18); 2107 __ rev32(v19, __ T16B, v19); 2108 2109 // do the sha1 2110 __ addv(v4, __ T4S, v16, v0); 2111 __ orr(v20, __ T16B, v6, v6); 2112 2113 FloatRegister d0 = v16; 2114 FloatRegister d1 = v17; 2115 FloatRegister d2 = v18; 2116 FloatRegister d3 = v19; 2117 2118 for (int round = 0; round < 20; round++) { 2119 FloatRegister tmp1 = (round & 1) ? v4 : v5; 2120 FloatRegister tmp2 = (round & 1) ? v21 : v22; 2121 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 2122 FloatRegister tmp4 = (round & 1) ? v5 : v4; 2123 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3)); 2124 2125 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 2126 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 2127 __ sha1h(tmp2, __ T4S, v20); 2128 if (round < 5) 2129 __ sha1c(v20, __ T4S, tmp3, tmp4); 2130 else if (round < 10 || round >= 15) 2131 __ sha1p(v20, __ T4S, tmp3, tmp4); 2132 else 2133 __ sha1m(v20, __ T4S, tmp3, tmp4); 2134 if (round < 16) __ sha1su1(d0, __ T4S, d3); 2135 2136 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 2137 } 2138 2139 __ addv(v7, __ T2S, v7, v21); 2140 __ addv(v6, __ T4S, v6, v20); 2141 2142 if (multi_block) { 2143 __ add(ofs, ofs, 64); 2144 __ cmp(ofs, limit); 2145 __ br(Assembler::LE, sha1_loop); 2146 __ mov(c_rarg0, ofs); // return ofs 2147 } 2148 2149 __ strq(v6, Address(state, 0)); 2150 __ strs(v7, Address(state, 16)); 2151 2152 __ ret(lr); 2153 2154 __ bind(keys); 2155 __ emit_int32(0x5a827999); 2156 __ emit_int32(0x6ed9eba1); 2157 __ emit_int32(0x8f1bbcdc); 2158 __ emit_int32(0xca62c1d6); 2159 2160 return start; 2161 } 2162 2163 2164 // Arguments: 2165 // 2166 // Inputs: 2167 // c_rarg0 - byte[] source+offset 2168 // c_rarg1 - int[] SHA.state 2169 // c_rarg2 - int offset 2170 // c_rarg3 - int limit 2171 // 2172 address generate_sha256_implCompress(bool multi_block, const char *name) { 2173 static const uint32_t round_consts[64] = { 2174 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 2175 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 2176 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 2177 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 2178 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 2179 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 2180 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 2181 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 2182 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 2183 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 2184 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 2185 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 2186 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 2187 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 2188 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 2189 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 2190 }; 2191 __ align(CodeEntryAlignment); 2192 StubCodeMark mark(this, "StubRoutines", name); 2193 address start = __ pc(); 2194 2195 Register buf = c_rarg0; 2196 Register state = c_rarg1; 2197 Register ofs = c_rarg2; 2198 Register limit = c_rarg3; 2199 2200 Label sha1_loop; 2201 2202 __ stpd(v8, v9, __ pre(sp, -32)); 2203 __ stpd(v10, v11, Address(sp, 16)); 2204 2205 // dga == v0 2206 // dgb == v1 2207 // dg0 == v2 2208 // dg1 == v3 2209 // dg2 == v4 2210 // t0 == v6 2211 // t1 == v7 2212 2213 // load 16 keys to v16..v31 2214 __ lea(rscratch1, ExternalAddress((address)round_consts)); 2215 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 2216 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 2217 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 2218 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 2219 2220 // load 8 words (256 bits) state 2221 __ ldpq(v0, v1, state); 2222 2223 __ BIND(sha1_loop); 2224 // load 64 bytes of data into v8..v11 2225 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf); 2226 __ rev32(v8, __ T16B, v8); 2227 __ rev32(v9, __ T16B, v9); 2228 __ rev32(v10, __ T16B, v10); 2229 __ rev32(v11, __ T16B, v11); 2230 2231 __ addv(v6, __ T4S, v8, v16); 2232 __ orr(v2, __ T16B, v0, v0); 2233 __ orr(v3, __ T16B, v1, v1); 2234 2235 FloatRegister d0 = v8; 2236 FloatRegister d1 = v9; 2237 FloatRegister d2 = v10; 2238 FloatRegister d3 = v11; 2239 2240 2241 for (int round = 0; round < 16; round++) { 2242 FloatRegister tmp1 = (round & 1) ? v6 : v7; 2243 FloatRegister tmp2 = (round & 1) ? v7 : v6; 2244 FloatRegister tmp3 = (round & 1) ? v2 : v4; 2245 FloatRegister tmp4 = (round & 1) ? v4 : v2; 2246 2247 if (round < 12) __ sha256su0(d0, __ T4S, d1); 2248 __ orr(v4, __ T16B, v2, v2); 2249 if (round < 15) 2250 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 2251 __ sha256h(v2, __ T4S, v3, tmp2); 2252 __ sha256h2(v3, __ T4S, v4, tmp2); 2253 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 2254 2255 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 2256 } 2257 2258 __ addv(v0, __ T4S, v0, v2); 2259 __ addv(v1, __ T4S, v1, v3); 2260 2261 if (multi_block) { 2262 __ add(ofs, ofs, 64); 2263 __ cmp(ofs, limit); 2264 __ br(Assembler::LE, sha1_loop); 2265 __ mov(c_rarg0, ofs); // return ofs 2266 } 2267 2268 __ ldpd(v10, v11, Address(sp, 16)); 2269 __ ldpd(v8, v9, __ post(sp, 32)); 2270 2271 __ stpq(v0, v1, state); 2272 2273 __ ret(lr); 2274 2275 return start; 2276 } 2277 2278 #ifndef BUILTIN_SIM 2279 // Safefetch stubs. 2280 void generate_safefetch(const char* name, int size, address* entry, 2281 address* fault_pc, address* continuation_pc) { 2282 // safefetch signatures: 2283 // int SafeFetch32(int* adr, int errValue); 2284 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue); 2285 // 2286 // arguments: 2287 // c_rarg0 = adr 2288 // c_rarg1 = errValue 2289 // 2290 // result: 2291 // PPC_RET = *adr or errValue 2292 2293 StubCodeMark mark(this, "StubRoutines", name); 2294 2295 // Entry point, pc or function descriptor. 2296 *entry = __ pc(); 2297 2298 // Load *adr into c_rarg1, may fault. 2299 *fault_pc = __ pc(); 2300 switch (size) { 2301 case 4: 2302 // int32_t 2303 __ ldrw(c_rarg1, Address(c_rarg0, 0)); 2304 break; 2305 case 8: 2306 // int64_t 2307 __ ldr(c_rarg1, Address(c_rarg0, 0)); 2308 break; 2309 default: 2310 ShouldNotReachHere(); 2311 } 2312 2313 // return errValue or *adr 2314 *continuation_pc = __ pc(); 2315 __ mov(r0, c_rarg1); 2316 __ ret(lr); 2317 } 2318 #endif 2319 2320 /** 2321 * Arguments: 2322 * 2323 * Inputs: 2324 * c_rarg0 - int crc 2325 * c_rarg1 - byte* buf 2326 * c_rarg2 - int length 2327 * 2328 * Output: 2329 * r0 - int crc result 2330 * 2331 * Preserves: 2332 * r13 2333 * 2334 */ 2335 address generate_updateBytesCRC32() { 2336 assert(UseCRC32Intrinsics, "what are we doing here?"); 2337 2338 __ align(CodeEntryAlignment); 2339 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32"); 2340 2341 address start = __ pc(); 2342 2343 const Register crc = c_rarg0; // crc 2344 const Register buf = c_rarg1; // source java byte array address 2345 const Register len = c_rarg2; // length 2346 const Register table0 = c_rarg3; // crc_table address 2347 const Register table1 = c_rarg4; 2348 const Register table2 = c_rarg5; 2349 const Register table3 = c_rarg6; 2350 const Register tmp3 = c_rarg7; 2351 2352 BLOCK_COMMENT("Entry:"); 2353 __ enter(); // required for proper stackwalking of RuntimeStub frame 2354 2355 __ kernel_crc32(crc, buf, len, 2356 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 2357 2358 __ leave(); // required for proper stackwalking of RuntimeStub frame 2359 __ ret(lr); 2360 2361 return start; 2362 } 2363 2364 /** 2365 * Arguments: 2366 * 2367 * Input: 2368 * c_rarg0 - x address 2369 * c_rarg1 - x length 2370 * c_rarg2 - y address 2371 * c_rarg3 - y lenth 2372 * c_rarg4 - z address 2373 * c_rarg5 - z length 2374 */ 2375 address generate_multiplyToLen() { 2376 __ align(CodeEntryAlignment); 2377 StubCodeMark mark(this, "StubRoutines", "multiplyToLen"); 2378 2379 address start = __ pc(); 2380 const Register x = r0; 2381 const Register xlen = r1; 2382 const Register y = r2; 2383 const Register ylen = r3; 2384 const Register z = r4; 2385 const Register zlen = r5; 2386 2387 const Register tmp1 = r10; 2388 const Register tmp2 = r11; 2389 const Register tmp3 = r12; 2390 const Register tmp4 = r13; 2391 const Register tmp5 = r14; 2392 const Register tmp6 = r15; 2393 const Register tmp7 = r16; 2394 2395 BLOCK_COMMENT("Entry:"); 2396 __ enter(); // required for proper stackwalking of RuntimeStub frame 2397 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 2398 __ leave(); // required for proper stackwalking of RuntimeStub frame 2399 __ ret(lr); 2400 2401 return start; 2402 } 2403 2404 // Continuation point for throwing of implicit exceptions that are 2405 // not handled in the current activation. Fabricates an exception 2406 // oop and initiates normal exception dispatching in this 2407 // frame. Since we need to preserve callee-saved values (currently 2408 // only for C2, but done for C1 as well) we need a callee-saved oop 2409 // map and therefore have to make these stubs into RuntimeStubs 2410 // rather than BufferBlobs. If the compiler needs all registers to 2411 // be preserved between the fault point and the exception handler 2412 // then it must assume responsibility for that in 2413 // AbstractCompiler::continuation_for_implicit_null_exception or 2414 // continuation_for_implicit_division_by_zero_exception. All other 2415 // implicit exceptions (e.g., NullPointerException or 2416 // AbstractMethodError on entry) are either at call sites or 2417 // otherwise assume that stack unwinding will be initiated, so 2418 // caller saved registers were assumed volatile in the compiler. 2419 2420 #undef __ 2421 #define __ masm-> 2422 2423 address generate_throw_exception(const char* name, 2424 address runtime_entry, 2425 Register arg1 = noreg, 2426 Register arg2 = noreg) { 2427 // Information about frame layout at time of blocking runtime call. 2428 // Note that we only have to preserve callee-saved registers since 2429 // the compilers are responsible for supplying a continuation point 2430 // if they expect all registers to be preserved. 2431 // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0 2432 enum layout { 2433 rfp_off = 0, 2434 rfp_off2, 2435 return_off, 2436 return_off2, 2437 framesize // inclusive of return address 2438 }; 2439 2440 int insts_size = 512; 2441 int locs_size = 64; 2442 2443 CodeBuffer code(name, insts_size, locs_size); 2444 OopMapSet* oop_maps = new OopMapSet(); 2445 MacroAssembler* masm = new MacroAssembler(&code); 2446 2447 address start = __ pc(); 2448 2449 // This is an inlined and slightly modified version of call_VM 2450 // which has the ability to fetch the return PC out of 2451 // thread-local storage and also sets up last_Java_sp slightly 2452 // differently than the real call_VM 2453 2454 __ enter(); // Save FP and LR before call 2455 2456 assert(is_even(framesize/2), "sp not 16-byte aligned"); 2457 2458 // lr and fp are already in place 2459 __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog 2460 2461 int frame_complete = __ pc() - start; 2462 2463 // Set up last_Java_sp and last_Java_fp 2464 address the_pc = __ pc(); 2465 __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1); 2466 2467 // Call runtime 2468 if (arg1 != noreg) { 2469 assert(arg2 != c_rarg1, "clobbered"); 2470 __ mov(c_rarg1, arg1); 2471 } 2472 if (arg2 != noreg) { 2473 __ mov(c_rarg2, arg2); 2474 } 2475 __ mov(c_rarg0, rthread); 2476 BLOCK_COMMENT("call runtime_entry"); 2477 __ mov(rscratch1, runtime_entry); 2478 __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1); 2479 2480 // Generate oop map 2481 OopMap* map = new OopMap(framesize, 0); 2482 2483 oop_maps->add_gc_map(the_pc - start, map); 2484 2485 __ reset_last_Java_frame(true, true); 2486 __ maybe_isb(); 2487 2488 __ leave(); 2489 2490 // check for pending exceptions 2491 #ifdef ASSERT 2492 Label L; 2493 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 2494 __ cbnz(rscratch1, L); 2495 __ should_not_reach_here(); 2496 __ bind(L); 2497 #endif // ASSERT 2498 __ b(RuntimeAddress(StubRoutines::forward_exception_entry())); 2499 2500 2501 // codeBlob framesize is in words (not VMRegImpl::slot_size) 2502 RuntimeStub* stub = 2503 RuntimeStub::new_runtime_stub(name, 2504 &code, 2505 frame_complete, 2506 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 2507 oop_maps, false); 2508 return stub->entry_point(); 2509 } 2510 2511 // Initialization 2512 void generate_initial() { 2513 // Generate initial stubs and initializes the entry points 2514 2515 // entry points that exist in all platforms Note: This is code 2516 // that could be shared among different platforms - however the 2517 // benefit seems to be smaller than the disadvantage of having a 2518 // much more complicated generator structure. See also comment in 2519 // stubRoutines.hpp. 2520 2521 StubRoutines::_forward_exception_entry = generate_forward_exception(); 2522 2523 StubRoutines::_call_stub_entry = 2524 generate_call_stub(StubRoutines::_call_stub_return_address); 2525 2526 // is referenced by megamorphic call 2527 StubRoutines::_catch_exception_entry = generate_catch_exception(); 2528 2529 // Build this early so it's available for the interpreter. 2530 StubRoutines::_throw_StackOverflowError_entry = 2531 generate_throw_exception("StackOverflowError throw_exception", 2532 CAST_FROM_FN_PTR(address, 2533 SharedRuntime:: 2534 throw_StackOverflowError)); 2535 if (UseCRC32Intrinsics) { 2536 // set table address before stub generation which use it 2537 StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table; 2538 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32(); 2539 } 2540 } 2541 2542 void generate_all() { 2543 // support for verify_oop (must happen after universe_init) 2544 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); 2545 StubRoutines::_throw_AbstractMethodError_entry = 2546 generate_throw_exception("AbstractMethodError throw_exception", 2547 CAST_FROM_FN_PTR(address, 2548 SharedRuntime:: 2549 throw_AbstractMethodError)); 2550 2551 StubRoutines::_throw_IncompatibleClassChangeError_entry = 2552 generate_throw_exception("IncompatibleClassChangeError throw_exception", 2553 CAST_FROM_FN_PTR(address, 2554 SharedRuntime:: 2555 throw_IncompatibleClassChangeError)); 2556 2557 StubRoutines::_throw_NullPointerException_at_call_entry = 2558 generate_throw_exception("NullPointerException at call throw_exception", 2559 CAST_FROM_FN_PTR(address, 2560 SharedRuntime:: 2561 throw_NullPointerException_at_call)); 2562 2563 // arraycopy stubs used by compilers 2564 generate_arraycopy_stubs(); 2565 2566 if (UseMultiplyToLenIntrinsic) { 2567 StubRoutines::_multiplyToLen = generate_multiplyToLen(); 2568 } 2569 2570 #ifndef BUILTIN_SIM 2571 if (UseAESIntrinsics) { 2572 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 2573 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 2574 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 2575 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 2576 } 2577 2578 if (UseSHA1Intrinsics) { 2579 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress"); 2580 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB"); 2581 } 2582 if (UseSHA256Intrinsics) { 2583 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress"); 2584 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB"); 2585 } 2586 2587 // Safefetch stubs. 2588 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry, 2589 &StubRoutines::_safefetch32_fault_pc, 2590 &StubRoutines::_safefetch32_continuation_pc); 2591 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry, 2592 &StubRoutines::_safefetchN_fault_pc, 2593 &StubRoutines::_safefetchN_continuation_pc); 2594 #endif 2595 } 2596 2597 public: 2598 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) { 2599 if (all) { 2600 generate_all(); 2601 } else { 2602 generate_initial(); 2603 } 2604 } 2605 }; // end class declaration 2606 2607 void StubGenerator_generate(CodeBuffer* code, bool all) { 2608 StubGenerator g(code, all); 2609 }