/*
 * Copyright (c) 2013, Red Hat Inc.
 * Copyright (c) 2003, 2011, Oracle and/or its affiliates.
 * All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "utilities/top.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif

#ifdef BUILTIN_SIM
#include "../../../../../../simulator/simulator.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp.

#undef __
#define __ _masm->
#define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(int& counter) {
    __ lea(rscratch2, ExternalAddress((address)&counter));
    __ ldrw(rscratch1, Address(rscratch2));
    __ addw(rscratch1, rscratch1, 1);
    __ strw(rscratch1, Address(rscratch2));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    c_rarg6:   parameter size (in words)              int
  //    c_rarg7:   thread                                 Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer, installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the C arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8, which C uses as an indirect result
  // location return register.
  //
  // we don't need to save r9-r15, which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-r18 because Java does not use them
  //
  // we save r19-r28, which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we don't save any FP registers since only v8-v15 are callee-save
  // (strictly only the f and d components) and Java uses them as
  // callee-save.  v0-v7 are arg registers and C treats v16-v31 as
  // volatile (as does Java?)
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -27 [ argument word 1      ]
  // -26 [ saved d15            ] <--- sp_after_call
  // -25 [ saved d14            ]
  // -24 [ saved d13            ]
  // -23 [ saved d12            ]
  // -22 [ saved d11            ]
  // -21 [ saved d10            ]
  // -20 [ saved d9             ]
  // -19 [ saved d8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper   (r0)  ]
  //  -7 [ result         (r1)  ]
  //  -6 [ result type    (r2)  ]
  //  -5 [ method         (r3)  ]
  //  -4 [ entry point    (r4)  ]
  //  -3 [ parameters     (r5)  ]
  //  -2 [ parameter size (r6)  ]
  //  -1 [ thread         (r7)  ]
  //   0 [ saved fp      (r29)  ] <--- fp == saved sp (r31)
  //   1 [ saved lr      (r30)  ]

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off  = -26,

    d15_off            = -26,
    d14_off            = -25,
    d13_off            = -24,
    d12_off            = -23,
    d11_off            = -22,
    d10_off            = -21,
    d9_off             = -20,
    d8_off             = -19,

    r28_off            = -18,
    r27_off            = -17,
    r26_off            = -16,
    r25_off            = -15,
    r24_off            = -14,
    r23_off            = -13,
    r22_off            = -12,
    r21_off            = -11,
    r20_off            = -10,
    r19_off            =  -9,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameters_off     =  -3,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0,
    retaddr_off        =   1,
  };
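
  // To make the layout concrete: a hedged sketch, in the pseudocode
  // style used elsewhere in this file, of how the C side is assumed to
  // reach this frame (illustrative names, not the actual VM sources):
  //
  //   // the generated entry matches c_rarg0..c_rarg7 above
  //   typedef void (*CallStub)(address call_wrapper, address result,
  //                            BasicType result_type, Method* method,
  //                            address entry_point, intptr_t* parameters,
  //                            int parameter_size, Thread* thread);
  //
  //   // after the stub's enter(), incoming argument i lives at
  //   //   rfp + (call_wrapper_off + i) * wordSize
  //   // which is exactly the call_wrapper .. thread slots (-8 .. -1)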
  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    const Address sp_after_call(rfp, sp_after_call_off * wordSize);

    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameters    (rfp, parameters_off     * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d14_save      (rfp, d14_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d12_save      (rfp, d12_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d10_save      (rfp, d10_off * wordSize);
    const Address d9_save       (rfp, d9_off  * wordSize);
    const Address d8_save       (rfp, d8_off  * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r27_save      (rfp, r27_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r25_save      (rfp, r25_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r23_save      (rfp, r23_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r21_save      (rfp, r21_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);
    const Address r19_save      (rfp, r19_off * wordSize);

    // stub code

    // we need a C prolog to bootstrap the x86 caller into the sim
    __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);

    address aarch64_entry = __ pc();

#ifdef BUILTIN_SIM
    // Save sender's SP for stack traces.
    __ mov(rscratch1, sp);
    __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
#endif
    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ str(c_rarg5,  parameters);
    __ str(c_rarg4,  entry_point);
    __ str(c_rarg3,  method);
    __ str(c_rarg2,  result_type);
    __ str(c_rarg1,  result);
    __ str(c_rarg0,  call_wrapper);
    __ str(r19, r19_save);
    __ str(r20, r20_save);
    __ str(r21, r21_save);
    __ str(r22, r22_save);
    __ str(r23, r23_save);
    __ str(r24, r24_save);
    __ str(r25, r25_save);
    __ str(r26, r26_save);
    __ str(r27, r27_save);
    __ str(r28, r28_save);

    __ strd(v8,  d8_save);
    __ strd(v9,  d9_save);
    __ strd(v10, d10_save);
    __ strd(v11, d11_save);
    __ strd(v12, d12_save);
    __ strd(v13, d13_save);
    __ strd(v14, d14_save);
    __ strd(v15, d15_save);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (unsigned)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing the Method* and the current sp
    //      rmethod: Method*
    //      r13: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r13, sp);
    __ blr(c_rarg4);

    // tell the simulator we have returned to the stub

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    if (NotifySimulator) {
      __ notify(Assembler::method_reentry);
    }
    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    __ ldr(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ldr(j_rarg1, result_type);
    __ cmp(j_rarg1, T_OBJECT);  // a T_OBJECT result is stored as a full word, like T_LONG
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(j_rarg1, T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    // restore callee-save registers
    __ ldrd(v15, d15_save);
    __ ldrd(v14, d14_save);
    __ ldrd(v13, d13_save);
    __ ldrd(v12, d12_save);
    __ ldrd(v11, d11_save);
    __ ldrd(v10, d10_save);
    __ ldrd(v9,  d9_save);
    __ ldrd(v8,  d8_save);

    __ ldr(r28, r28_save);
    __ ldr(r27, r27_save);
    __ ldr(r26, r26_save);
    __ ldr(r25, r25_save);
    __ ldr(r24, r24_save);
    __ ldr(r23, r23_save);
    __ ldr(r22, r22_save);
    __ ldr(r21, r21_save);
    __ ldr(r20, r20_save);
    __ ldr(r19, r19_save);

    __ ldr(c_rarg0,  call_wrapper);
    __ ldr(c_rarg1,  result);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3,  method);
    __ ldr(c_rarg4,  entry_point);
    __ ldr(c_rarg5,  parameters);
    __ ldr(c_rarg6,  parameter_size);
    __ ldr(c_rarg7,  thread);

#ifndef PRODUCT
    // tell the simulator we are about to end Java execution
    if (NotifySimulator) {
      __ notify(Assembler::method_exit);
    }
#endif
    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT

    __ BIND(is_long);
    __ str(r0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee.  In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to set up the
  // rsp.
  //
  // r0: exception oop

  // NOTE: this is used as a target from the signal handler so it
  // needs an x86 prolog which returns into the current simulator
  // executing the generated catch_exception code.  so the prolog
  // needs to install rax in a sim register and adjust the sim's
  // restart pc to enter the generated code at the start position
  // then return from native to simulated execution.
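
  // A hedged sketch of the assumed contract around the call stub above
  // and the stub below (pseudocode, not the actual VM sources): the C
  // caller invokes the call stub and then tests the thread, e.g.
  //
  //   call_stub(wrapper, &result, type, method, entry, parms, nparms, thread);
  //   if (thread->has_pending_exception()) {
  //     // unwind or rethrow at the Java level
  //   }
  //
  // generate_catch_exception is what stores that pending exception
  // when Java code throws past the entry frame.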

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread       (rfp, thread_off        * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address.  pass thread in r0 and caller pc (ret address)
    // in r1.  n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // we should not really care that lr is no longer the callee
    // address.
    // we saved the value the handler needs in r19 so we can
    // just copy it to r3.  however, the C2 handler will push its own
    // frame and then call into the VM, and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method.  So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);
    __ mov(r19, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {

    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is NULL it is OK

    // Check if the oop is in the right area of memory
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
    __ andr(c_rarg2, r0, c_rarg3);
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());

    // Compare c_rarg2 and c_rarg3.  We don't use a compare
    // instruction here because the flags register is live.
    __ eor(c_rarg2, c_rarg2, c_rarg3);
    __ cbnz(c_rarg2, error);

    // make sure klass is 'reasonable', i.e. not zero.
    __ load_klass(r0, r0); // get klass
    __ cbz(r0, error);     // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blrt(rscratch1, 3, 0, 1);

    return start;
  }

  void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }

  // Generate code for an array write pre barrier
  //
  //     addr    - starting address
  //     count   - element count
  //
  //     Destroy no registers!
  //
  void gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
    case BarrierSet::G1SATBCT:
    case BarrierSet::G1SATBCTLogging:
      // With G1, don't generate the call if we statically know that the target is uninitialized
      if (!dest_uninitialized) {
        __ push(RegSet::range(r0, r29), sp);         // integer registers except lr & sp
        if (count == c_rarg0) {
          if (addr == c_rarg1) {
            // exactly backwards!!
            __ stp(c_rarg0, c_rarg1, __ pre(sp, -2 * wordSize));
            __ ldp(c_rarg1, c_rarg0, __ post(sp, -2 * wordSize));
          } else {
            __ mov(c_rarg1, count);
            __ mov(c_rarg0, addr);
          }
        } else {
          __ mov(c_rarg0, addr);
          __ mov(c_rarg1, count);
        }
        __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2);
        __ pop(RegSet::range(r0, r29), sp);          // integer registers except lr & sp
      }
      break;
    case BarrierSet::CardTableModRef:
    case BarrierSet::CardTableExtension:
    case BarrierSet::ModRef:
      break;
    default:
      ShouldNotReachHere();

    }
  }
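
  // For the CardTableModRef case handled in the post barrier below, the
  // generated loop is the usual card-dirtying walk; a rough C sketch
  // (field and constant names assumed, not the exact HotSpot
  // declarations):
  //
  //   jbyte* base = ct->byte_map_base;
  //   for (uintptr_t i = start >> card_shift; i <= end >> card_shift; i++)
  //     base[i] = 0;   // 0 == dirty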
  //
  // Generate code for an array write post barrier
  //
  //  Input:
  //     start    - register containing starting address of destination array
  //     end      - register containing ending address of destination array
  //     scratch  - scratch register
  //
  //  The input registers are overwritten.
  //  The ending address is inclusive.
  void gen_write_ref_array_post_barrier(Register start, Register end, Register scratch) {
    assert_different_registers(start, end, scratch);
    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
    case BarrierSet::G1SATBCT:
    case BarrierSet::G1SATBCTLogging:

      {
        __ push(RegSet::range(r0, r29), sp);         // integer registers except lr & sp
        // must compute element count unless barrier set interface is changed (other platforms supply count)
        __ lea(scratch, Address(end, BytesPerHeapOop));
        __ sub(scratch, scratch, start);              // subtract start to get #bytes
        __ lsr(scratch, scratch, LogBytesPerHeapOop); // convert to element count
        __ mov(c_rarg0, start);
        __ mov(c_rarg1, scratch);
        __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
        __ pop(RegSet::range(r0, r29), sp);          // integer registers except lr & sp
      }
      break;
    case BarrierSet::CardTableModRef:
    case BarrierSet::CardTableExtension:
      {
        CardTableModRefBS* ct = (CardTableModRefBS*)bs;
        assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");

        Label L_loop;

        __ lsr(start, start, CardTableModRefBS::card_shift);
        __ lsr(end, end, CardTableModRefBS::card_shift);
        __ sub(end, end, start); // number of bytes to copy

        const Register count = end; // 'end' register contains bytes count now
        __ mov(scratch, (address)ct->byte_map_base);
        __ add(start, start, scratch);
        __ membar(__ StoreStore|__ LoadStore);
        __ BIND(L_loop);
        __ strb(zr, Address(start, count));
        __ subs(count, count, 1);
        __ br(Assembler::HS, L_loop);
      }
      break;
    default:
      ShouldNotReachHere();

    }
  }

  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 2
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(Label &start, Register s, Register d, Register count,
                           copy_direction direction) {
    int unit = wordSize * direction;

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r10, t6 = r11, t7 = r12;

    assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1);

    Label again, large, small;
    __ align(6);
    __ bind(start);
    __ cmp(count, 8);
    __ br(Assembler::LO, small);
    if (direction == copy_forwards) {
      __ sub(s, s, 2 * wordSize);
      __ sub(d, d, 2 * wordSize);
    }
    __ subs(count, count, 16);
    __ br(Assembler::GE, large);

    // 8 <= count < 16 words.  Copy 8.
    __ ldp(t0, t1, Address(s, 2 * unit));
    __ ldp(t2, t3, Address(s, 4 * unit));
    __ ldp(t4, t5, Address(s, 6 * unit));
    __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));

    __ stp(t0, t1, Address(d, 2 * unit));
    __ stp(t2, t3, Address(d, 4 * unit));
    __ stp(t4, t5, Address(d, 6 * unit));
    __ stp(t6, t7, Address(__ pre(d, 8 * unit)));

    if (direction == copy_forwards) {
      __ add(s, s, 2 * wordSize);
      __ add(d, d, 2 * wordSize);
    }

    {
      Label L1, L2;
      __ bind(small);
      __ tbz(count, exact_log2(4), L1);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ ldp(t2, t3, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ stp(t2, t3, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L1);

      __ tbz(count, 1, L2);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L2);
    }

    __ ret(lr);

    __ align(6);
    __ bind(large);

    // Fill 8 registers
    __ ldp(t0, t1, Address(s, 2 * unit));
    __ ldp(t2, t3, Address(s, 4 * unit));
    __ ldp(t4, t5, Address(s, 6 * unit));
    __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));

    __ bind(again);

    if (direction == copy_forwards && PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, PrefetchCopyIntervalInBytes), PLDL1KEEP);

    __ stp(t0, t1, Address(d, 2 * unit));
    __ ldp(t0, t1, Address(s, 2 * unit));
    __ stp(t2, t3, Address(d, 4 * unit));
    __ ldp(t2, t3, Address(s, 4 * unit));
    __ stp(t4, t5, Address(d, 6 * unit));
    __ ldp(t4, t5, Address(s, 6 * unit));
    __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
    __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ stp(t0, t1, Address(d, 2 * unit));
    __ stp(t2, t3, Address(d, 4 * unit));
    __ stp(t4, t5, Address(d, 6 * unit));
    __ stp(t6, t7, Address(__ pre(d, 8 * unit)));

    if (direction == copy_forwards) {
      __ add(s, s, 2 * wordSize);
      __ add(d, d, 2 * wordSize);
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ ldp(t2, t3, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ stp(t2, t3, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L1);

      __ tbz(count, 1, L2);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L2);
    }

    __ ret(lr);
  }
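
  // The 'large' loop above is a software-pipelined ping-pong: each
  // iteration stores the eight words loaded by the previous one, so
  // loads and stores overlap.  A rough C equivalent (illustrative
  // only; count is pre-biased by the subs at the loop head):
  //
  //   load t0..t7 from s; s += 8 words;
  //   while ((count -= 8) >= 0) {       // the subs/br HS above
  //     store t0..t7 to d; d += 8 words;
  //     load  t0..t7 from s; s += 8 words;
  //   }
  //   store t0..t7 to d;                // drain the final batch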

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;
    int unit = wordSize * direction;

    Label Lpair, Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.

    __ tbz(count, 3 - exact_log2(granularity), Lword);
    __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
    __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }

  Label copy_f, copy_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the
  // direction of copy.  If is_aligned is false, we align the source
  // address.
  //

  void copy_memory(bool is_aligned, Register s, Register d,
                   Register count, Register tmp, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    if (is_backwards) {
      __ lea(s, Address(s, count, Address::uxtw(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::uxtw(exact_log2(-step))));
    }

    Label done, tail;

    __ cmp(count, 16/granularity);
    __ br(Assembler::LO, tail);

    // Now we've got the small case out of the way, we can align the
    // source address on a 2-word boundary.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
      __ tbz(s, exact_log2(wordSize), aligned);
      __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
      __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
      __ sub(count, count, wordSize/granularity);
    } else {
      if (is_backwards) {
        __ andr(rscratch2, s, 2 * wordSize - 1);
      } else {
        __ neg(rscratch2, s);
        __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
      }
      // rscratch2 is the byte adjustment needed to align s.
      __ cbz(rscratch2, aligned);
      __ lsr(rscratch2, rscratch2, exact_log2(granularity));
      __ sub(count, count, rscratch2);

#if 0
      // ?? This code is only correct for a disjoint copy.  It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, rscratch2);
        __ sub(d, d, rscratch2);
      } else {
        __ add(s, s, rscratch2);
        __ add(d, d, rscratch2);
      }
#else
      copy_memory_small(s, d, rscratch2, rscratch1, step);
#endif
    }

    __ cmp(count, 16/granularity);
    __ br(Assembler::LT, tail);
    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes.  Adjust the
    // count and do a bulk copy of words.
    __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
    if (direction == copy_forwards)
      __ bl(copy_f);
    else
      __ bl(copy_b);

    // And the tail.

    __ bind(tail);
    copy_memory_small(s, d, count, tmp, step);
  }


  void clobber_registers() {
#ifdef ASSERT
    __ mov(rscratch1, (uint64_t)0xdeadbeef);
    __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
    for (Register r = r3; r <= r18; r++)
      if (r != rscratch1) __ mov(r, rscratch1);
#endif
  }

  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers rscratch1 and rscratch2.
  void verify_oop_array(size_t size, Register a, Register count, Register temp) {
    Label loop, end;
    __ mov(rscratch1, a);
    __ mov(rscratch2, zr);
    __ bind(loop);
    __ cmp(rscratch2, count);
    __ br(Assembler::HS, end);
    if (size == (size_t)wordSize) {
      __ ldr(temp, Address(a, rscratch2, Address::uxtw(exact_log2(size))));
      __ verify_oop(temp);
    } else {
      __ ldrw(temp, Address(a, rscratch2, Address::uxtw(exact_log2(size))));
      __ decode_heap_oop(temp); // calls verify_oop
    }
    __ add(rscratch2, rscratch2, size);
    __ b(loop);
    __ bind(end);
  }
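
  // The array-copy stubs that follow all share one shape; a hedged
  // outline in pseudocode (it mirrors the generated code, it is not a
  // separate implementation):
  //
  //   enter();
  //   if (is_oop) gen_write_ref_array_pre_barrier(d, count);   // G1 only
  //   copy_memory(aligned, s, d, count, tmp, +/- size);
  //   if (is_oop) gen_write_ref_array_post_barrier(d, d + count*size - 1);
  //   leave(); ret(lr);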
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
                                 const char *name, bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }
    __ enter();
    if (is_oop) {
      __ push(RegSet::of(d, count), sp);
      // no registers are destroyed by this call
      gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
    }
    copy_memory(aligned, s, d, count, rscratch1, size);
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, Address::uxtw(exact_log2(size))));
      gen_write_ref_array_post_barrier(d, count, rscratch1);
    }
    __ leave();
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
                                 address *entry, const char *name,
                                 bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;

    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    __ cmp(d, s);
    __ br(Assembler::LS, nooverlap_target);

    __ enter();
    if (is_oop) {
      __ push(RegSet::of(d, count), sp);
      // no registers are destroyed by this call
      gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
    }
    copy_memory(aligned, s, d, count, rscratch1, -size);
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, Address::uxtw(exact_log2(size))));
      gen_write_ref_array_post_barrier(d, count, rscratch1);
    }
    __ leave();
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
  //
  address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                      address* entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_short_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_short_copy().
  //
  address generate_disjoint_short_copy(bool aligned,
                                       address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
                                       address *entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_int_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.
  // The two dwords within qwords that span cache line boundaries will
  // still be loaded and stored atomically.
  //
  address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
                                     address *entry, const char *name,
                                     bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
  }


  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_long_copy(bool aligned, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_long_copy(bool aligned,
                                      address nooverlap_target, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_oop_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized = false) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_disjoint_copy(size, aligned, is_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_oop_copy(bool aligned,
                                     address nooverlap_target, address *entry,
                                     const char *name, bool dest_uninitialized = false) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry, name);
  }

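  // generate_type_check below leans on the MacroAssembler subtype
  // helpers.  A hedged sketch of the underlying algorithm (simplified;
  // check_klass_subtype_fast_path/slow_path are the real thing):
  //
  //   if (sub_klass == super_klass) return success;
  //   if (*(sub_klass + super_check_offset) == super_klass) return success;
  //   // slow path: linear scan of sub_klass's secondary supers
  //   if (secondary_supers(sub_klass) contains super_klass) return success;
  //   return failure;
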
  // Helper for generating a dynamic type check.
  // Smashes rscratch1.
  void generate_type_check(Register sub_klass,
                           Register super_check_offset,
                           Register super_klass,
                           Label& L_success) {
    assert_different_registers(sub_klass, super_check_offset, super_klass);

    BLOCK_COMMENT("type_check:");

    Label L_miss;

    __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL,
                                     super_check_offset);
    __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);

    // Fall through on failure!
    __ BIND(L_miss);
  }

  //
  // Generate checkcasting array copy stub
  //
  //  Input:
  //    c_rarg0   - source array address
  //    c_rarg1   - destination array address
  //    c_rarg2   - element count, treated as ssize_t, can be zero
  //    c_rarg3   - size_t ckoff (super_check_offset)
  //    c_rarg4   - oop ckval (super_klass)
  //
  //  Output:
  //    r0 ==  0  -  success
  //    r0 == -1^K - failure, where K is partial transfer count
  //
  address generate_checkcast_copy(const char *name, address *entry,
                                  bool dest_uninitialized = false) {

    Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;

    // Input registers (after setup_arg_regs)
    const Register from        = c_rarg0;   // source array address
    const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // element count
    const Register ckoff       = c_rarg3;   // super_check_offset
    const Register ckval       = c_rarg4;   // super_klass

    // Registers used as temps (r18, r19, r20 are save-on-entry)
    const Register count_save  = r21;       // orig element count
    const Register start_to    = r20;       // destination array start address
    const Register copied_oop  = r18;       // actual oop copied
    const Register r19_klass   = r19;       // oop._klass

    //---------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the two arrays are subtypes of Object[] but the
    // destination array type is not equal to or a supertype
    // of the source type.  Each element must be separately
    // checked.

    assert_different_registers(from, to, count, ckoff, ckval, start_to,
                               copied_oop, r19_klass, count_save);

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef ASSERT
    // caller guarantees that the arrays really are different
    // otherwise, we would have to make conjoint checks
    { Label L;
      array_overlap_test(L, TIMES_OOP);
      __ stop("checkcast_copy within a single array");
      __ bind(L);
    }
#endif //ASSERT

    // Caller of this entry point must set up the argument registers.
    if (entry != NULL) {
      *entry = __ pc();
      BLOCK_COMMENT("Entry:");
    }

    // Empty array:  Nothing to do.
    __ cbz(count, L_done);

    __ push(RegSet::of(r18, r19, r20, r21), sp);

#ifdef ASSERT
    BLOCK_COMMENT("assert consistent ckoff/ckval");
    // The ckoff and ckval must be mutually consistent,
    // even though caller generates both.
    { Label L;
      int sco_offset = in_bytes(Klass::super_check_offset_offset());
      __ ldrw(start_to, Address(ckval, sco_offset));
      __ cmpw(ckoff, start_to);
      __ br(Assembler::EQ, L);
      __ stop("super_check_offset inconsistent");
      __ bind(L);
    }
#endif //ASSERT

    // save the original count
    __ mov(count_save, count);

    // Copy from low to high addresses
    __ mov(start_to, to);              // Save destination array start address
    __ b(L_load_element);

    // ======== begin loop ========
    // (Loop is rotated; its entry is L_load_element.)
    // Loop control:
    //   for (; count != 0; count--) {
    //     copied_oop = load_heap_oop(from++);
    //     ... generate_type_check ...;
    //     store_heap_oop(to++, copied_oop);
    //   }
    __ align(OptoLoopAlignment);

    __ BIND(L_store_element);
    __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop); // store the oop
    __ sub(count, count, 1);
    __ cbz(count, L_do_card_marks);

    // ======== loop entry is here ========
    __ BIND(L_load_element);
    __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop
    __ cbz(copied_oop, L_store_element);

    __ load_klass(r19_klass, copied_oop); // query the object klass
    generate_type_check(r19_klass, ckoff, ckval, L_store_element);
    // ======== end loop ========

    // It was a real error; we must depend on the caller to finish the job.
    // Register count = remaining oops, count_orig = total oops.
    // Emit GC store barriers for the oops we have copied and report
    // their number to the caller.

    __ subs(count, count_save, count);  // K = partially copied oop count
    __ eon(count, count, zr);           // report (-1^K) to caller
    __ br(Assembler::EQ, L_done_pop);

    __ BIND(L_do_card_marks);
    __ add(to, to, -heapOopSize);       // make an inclusive end pointer
    gen_write_ref_array_post_barrier(start_to, to, rscratch1);

    __ bind(L_done_pop);
    __ pop(RegSet::of(r18, r19, r20, r21), sp);
    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);

    __ bind(L_done);
    __ mov(r0, count);
    __ leave();
    __ ret(lr);

    return start;
  }

  // Perform range checks on the proposed arraycopy.
  // Kills temp, but nothing else.
  // Also, clean the sign bits of src_pos and dst_pos.
  void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
                              Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
                              Register dst_pos, // destination position (c_rarg3)
                              Register length,
                              Register temp,
                              Label& L_failed) { Unimplemented(); }
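
  // Restating the checkcast stub's result encoding as a hedged sketch:
  // with K oops copied before a type-check failure,
  //
  //   r0 = -1 ^ K;   // i.e. ~K; r0 == 0 means every element was copied
  //
  // so a caller can recover the partial transfer count as ~r0.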
  // These stubs get called from some dumb test routine.
  // I'll write them properly when they're called from
  // something that's actually doing something.
  static void fake_arraycopy_stub(address src, address dst, int count) {
    assert(count == 0, "huh?");
  }


  void generate_arraycopy_stubs() {
    address entry;
    address entry_jbyte_arraycopy;
    address entry_jshort_arraycopy;
    address entry_jint_arraycopy;
    address entry_oop_arraycopy;
    address entry_jlong_arraycopy;
    address entry_checkcast_arraycopy;

    generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
    generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);

    //*** jbyte
    // Always need aligned and unaligned versions
    StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
                                                                                  "jbyte_disjoint_arraycopy");
    StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
                                                                                  &entry_jbyte_arraycopy,
                                                                                  "jbyte_arraycopy");
    StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
                                                                                  "arrayof_jbyte_disjoint_arraycopy");
    StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
                                                                                  "arrayof_jbyte_arraycopy");

    //*** jshort
    // Always need aligned and unaligned versions
    StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
                                                                                    "jshort_disjoint_arraycopy");
    StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
                                                                                    &entry_jshort_arraycopy,
                                                                                    "jshort_arraycopy");
    StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
                                                                                    "arrayof_jshort_disjoint_arraycopy");
    StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
                                                                                    "arrayof_jshort_arraycopy");

    //*** jint
    // Aligned versions
    StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
                                                                                "arrayof_jint_disjoint_arraycopy");
    StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
                                                                                "arrayof_jint_arraycopy");
    // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
    // entry_jint_arraycopy always points to the unaligned version
    StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry,
                                                                        "jint_disjoint_arraycopy");
    StubRoutines::_jint_arraycopy          = generate_conjoint_int_copy(false, entry,
                                                                        &entry_jint_arraycopy,
                                                                        "jint_arraycopy");

    //*** jlong
    // It is always aligned
    StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
                                                                                  "arrayof_jlong_disjoint_arraycopy");
    StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
                                                                                  "arrayof_jlong_arraycopy");
    StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
    StubRoutines::_jlong_arraycopy          = StubRoutines::_arrayof_jlong_arraycopy;

    //*** oops
    {
      // With compressed oops we need unaligned versions; notice that
      // we overwrite entry_oop_arraycopy.
    //*** oops
    {
      // With compressed oops we need unaligned versions; notice that
      // we overwrite entry_oop_arraycopy.
      bool aligned = !UseCompressedOops;

      StubRoutines::_arrayof_oop_disjoint_arraycopy
        = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy");
      StubRoutines::_arrayof_oop_arraycopy
        = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy");
      // Aligned versions without pre-barriers
      StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
        = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
                                     /*dest_uninitialized*/true);
      StubRoutines::_arrayof_oop_arraycopy_uninit
        = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
                                     /*dest_uninitialized*/true);
    }

    StubRoutines::_oop_disjoint_arraycopy        = StubRoutines::_arrayof_oop_disjoint_arraycopy;
    StubRoutines::_oop_arraycopy                 = StubRoutines::_arrayof_oop_arraycopy;
    StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
    StubRoutines::_oop_arraycopy_uninit          = StubRoutines::_arrayof_oop_arraycopy_uninit;

    StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
    StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
                                                                        /*dest_uninitialized*/true);
  }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //
  address generate_aescrypt_encryptBlock() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");

    Label L_doLast;

    const Register from   = c_rarg0;  // source array address
    const Register to     = c_rarg1;  // destination array address
    const Register key    = c_rarg2;  // key array address
    const Register keylen = rscratch1;

    address start = __ pc();
    __ enter();

    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));
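
    // Note (standard AES key-schedule sizes; the mapping is not spelled
    // out in the code): keylen is the length of the expanded key in ints,
    // which encodes the key size and hence the round count:
    //
    //   AES-128: 4 * (10 + 1) == 44 ints -> 10 rounds
    //   AES-192: 4 * (12 + 1) == 52 ints -> 12 rounds
    //   AES-256: 4 * (14 + 1) == 60 ints -> 14 rounds
    //
    // The cmpw(keylen, 44) / cmpw(keylen, 52) tests below skip the extra
    // round pairs for the shorter keys.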
    __ ld1(v0, __ T16B, from); // get 16 bytes of input

    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);
    __ rev32(v3, __ T16B, v3);
    __ rev32(v4, __ T16B, v4);
    __ aese(v0, v1);
    __ aesmc(v0, v0);
    __ aese(v0, v2);
    __ aesmc(v0, v0);
    __ aese(v0, v3);
    __ aesmc(v0, v0);
    __ aese(v0, v4);
    __ aesmc(v0, v0);

    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);
    __ rev32(v3, __ T16B, v3);
    __ rev32(v4, __ T16B, v4);
    __ aese(v0, v1);
    __ aesmc(v0, v0);
    __ aese(v0, v2);
    __ aesmc(v0, v0);
    __ aese(v0, v3);
    __ aesmc(v0, v0);
    __ aese(v0, v4);
    __ aesmc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    __ cmpw(keylen, 44);
    __ br(Assembler::EQ, L_doLast);

    __ aese(v0, v1);
    __ aesmc(v0, v0);
    __ aese(v0, v2);
    __ aesmc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    __ cmpw(keylen, 52);
    __ br(Assembler::EQ, L_doLast);

    __ aese(v0, v1);
    __ aesmc(v0, v0);
    __ aese(v0, v2);
    __ aesmc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    __ BIND(L_doLast);

    __ aese(v0, v1);
    __ aesmc(v0, v0);
    __ aese(v0, v2);

    __ ld1(v1, __ T16B, key);
    __ rev32(v1, __ T16B, v1);
    __ eor(v0, __ T16B, v0, v1);

    __ st1(v0, __ T16B, to);

    __ mov(r0, 0);

    __ leave();
    __ ret(lr);

    return start;
  }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //
  address generate_aescrypt_decryptBlock() {
    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
    Label L_doLast;

    const Register from   = c_rarg0;  // source array address
    const Register to     = c_rarg1;  // destination array address
    const Register key    = c_rarg2;  // key array address
    const Register keylen = rscratch1;

    address start = __ pc();
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ ld1(v0, __ T16B, from); // get 16 bytes of input

    __ ld1(v5, __ T16B, __ post(key, 16));
    __ rev32(v5, __ T16B, v5);

    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);
    __ rev32(v3, __ T16B, v3);
    __ rev32(v4, __ T16B, v4);
    __ aesd(v0, v1);
    __ aesimc(v0, v0);
    __ aesd(v0, v2);
    __ aesimc(v0, v0);
    __ aesd(v0, v3);
    __ aesimc(v0, v0);
    __ aesd(v0, v4);
    __ aesimc(v0, v0);

    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);
    __ rev32(v3, __ T16B, v3);
    __ rev32(v4, __ T16B, v4);
    __ aesd(v0, v1);
    __ aesimc(v0, v0);
    __ aesd(v0, v2);
    __ aesimc(v0, v0);
    __ aesd(v0, v3);
    __ aesimc(v0, v0);
    __ aesd(v0, v4);
    __ aesimc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    __ cmpw(keylen, 44);
    __ br(Assembler::EQ, L_doLast);

    __ aesd(v0, v1);
    __ aesimc(v0, v0);
    __ aesd(v0, v2);
    __ aesimc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    __ cmpw(keylen, 52);
    __ br(Assembler::EQ, L_doLast);

    __ aesd(v0, v1);
    __ aesimc(v0, v0);
    __ aesd(v0, v2);
    __ aesimc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    __ BIND(L_doLast);

    __ aesd(v0, v1);
    __ aesimc(v0, v0);
    __ aesd(v0, v2);

    __ eor(v0, __ T16B, v0, v5);

    __ st1(v0, __ T16B, to);

    __ mov(r0, 0);

    __ leave();
    __ ret(lr);

    return start;
  }
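
  // Note (a sketch of the decryption structure; the key layout is assumed
  // to be the one the Java AESCrypt expansion produces): the stub walks
  // the expanded decryption key sequentially with aesd/aesimc rounds, but
  // the first 16 key bytes are saved up front in v5 and only applied at
  // the very end with the eor, i.e. roughly
  //
  //   plaintext = InvRounds(ciphertext) ^ first_round_key   // in v5
  //
  // mirroring the final AddRoundKey of the equivalent inverse cipher.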
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //   c_rarg3   - r vector byte array address
  //   c_rarg4   - input length
  //
  // Output:
  //   r0        - input length
  //
  address generate_cipherBlockChaining_encryptAESCrypt() {
    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");

    Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;

    const Register from    = c_rarg0;  // source array address
    const Register to      = c_rarg1;  // destination array address
    const Register key     = c_rarg2;  // key array address
    const Register rvec    = c_rarg3;  // r byte array initialized from initvector array address
                                       // and left with the results of the last encryption block
    const Register len_reg = c_rarg4;  // src len (must be multiple of blocksize 16)
    const Register keylen  = rscratch1;

    address start = __ pc();
    __ enter();

    __ mov(rscratch2, len_reg);
    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ ld1(v0, __ T16B, rvec);

    __ cmpw(keylen, 52);
    __ br(Assembler::CC, L_loadkeys_44);
    __ br(Assembler::EQ, L_loadkeys_52);

    __ ld1(v17, v18, __ T16B, __ post(key, 32));
    __ rev32(v17, __ T16B, v17);
    __ rev32(v18, __ T16B, v18);
    __ BIND(L_loadkeys_52);
    __ ld1(v19, v20, __ T16B, __ post(key, 32));
    __ rev32(v19, __ T16B, v19);
    __ rev32(v20, __ T16B, v20);
    __ BIND(L_loadkeys_44);
    __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
    __ rev32(v21, __ T16B, v21);
    __ rev32(v22, __ T16B, v22);
    __ rev32(v23, __ T16B, v23);
    __ rev32(v24, __ T16B, v24);
    __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
    __ rev32(v25, __ T16B, v25);
    __ rev32(v26, __ T16B, v26);
    __ rev32(v27, __ T16B, v27);
    __ rev32(v28, __ T16B, v28);
    __ ld1(v29, v30, v31, __ T16B, key);
    __ rev32(v29, __ T16B, v29);
    __ rev32(v30, __ T16B, v30);
    __ rev32(v31, __ T16B, v31);

    __ BIND(L_aes_loop);
    __ ld1(v1, __ T16B, __ post(from, 16));
    __ eor(v0, __ T16B, v0, v1);

    __ br(Assembler::CC, L_rounds_44);
    __ br(Assembler::EQ, L_rounds_52);

    __ aese(v0, v17); __ aesmc(v0, v0);
    __ aese(v0, v18); __ aesmc(v0, v0);
    __ BIND(L_rounds_52);
    __ aese(v0, v19); __ aesmc(v0, v0);
    __ aese(v0, v20); __ aesmc(v0, v0);
    __ BIND(L_rounds_44);
    __ aese(v0, v21); __ aesmc(v0, v0);
    __ aese(v0, v22); __ aesmc(v0, v0);
    __ aese(v0, v23); __ aesmc(v0, v0);
    __ aese(v0, v24); __ aesmc(v0, v0);
    __ aese(v0, v25); __ aesmc(v0, v0);
    __ aese(v0, v26); __ aesmc(v0, v0);
    __ aese(v0, v27); __ aesmc(v0, v0);
    __ aese(v0, v28); __ aesmc(v0, v0);
    __ aese(v0, v29); __ aesmc(v0, v0);
    __ aese(v0, v30);
    __ eor(v0, __ T16B, v0, v31);

    __ st1(v0, __ T16B, __ post(to, 16));
    __ sub(len_reg, len_reg, 16);
    __ cbnz(len_reg, L_aes_loop);

    __ st1(v0, __ T16B, rvec);

    __ mov(r0, rscratch2);

    __ leave();
    __ ret(lr);

    return start;
  }
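
  // A sketch of what the loop above computes (standard CBC encryption;
  // the notation is ours, not from the code): each block's input depends
  // on the previous block's output, so the loop is inherently serial:
  //
  //   C[0] = E_K(P[0] ^ IV)
  //   C[i] = E_K(P[i] ^ C[i-1])    for i > 0
  //
  // v0 carries the running chaining value across iterations and is
  // written back to rvec at the end, so a subsequent call can continue
  // the chain where this one left off.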
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //   c_rarg3   - r vector byte array address
  //   c_rarg4   - input length
  //
  // Output:
  //   r0        - input length
  //
  address generate_cipherBlockChaining_decryptAESCrypt() {
    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");

    Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;

    const Register from    = c_rarg0;  // source array address
    const Register to      = c_rarg1;  // destination array address
    const Register key     = c_rarg2;  // key array address
    const Register rvec    = c_rarg3;  // r byte array initialized from initvector array address
                                       // and left with the last ciphertext block processed
    const Register len_reg = c_rarg4;  // src len (must be multiple of blocksize 16)
    const Register keylen  = rscratch1;

    address start = __ pc();
    __ enter();

    __ mov(rscratch2, len_reg);
    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ ld1(v2, __ T16B, rvec);

    __ ld1(v31, __ T16B, __ post(key, 16));
    __ rev32(v31, __ T16B, v31);

    __ cmpw(keylen, 52);
    __ br(Assembler::CC, L_loadkeys_44);
    __ br(Assembler::EQ, L_loadkeys_52);

    __ ld1(v17, v18, __ T16B, __ post(key, 32));
    __ rev32(v17, __ T16B, v17);
    __ rev32(v18, __ T16B, v18);
    __ BIND(L_loadkeys_52);
    __ ld1(v19, v20, __ T16B, __ post(key, 32));
    __ rev32(v19, __ T16B, v19);
    __ rev32(v20, __ T16B, v20);
    __ BIND(L_loadkeys_44);
    __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
    __ rev32(v21, __ T16B, v21);
    __ rev32(v22, __ T16B, v22);
    __ rev32(v23, __ T16B, v23);
    __ rev32(v24, __ T16B, v24);
    __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
    __ rev32(v25, __ T16B, v25);
    __ rev32(v26, __ T16B, v26);
    __ rev32(v27, __ T16B, v27);
    __ rev32(v28, __ T16B, v28);
    __ ld1(v29, v30, __ T16B, key);
    __ rev32(v29, __ T16B, v29);
    __ rev32(v30, __ T16B, v30);

    __ BIND(L_aes_loop);
    __ ld1(v0, __ T16B, __ post(from, 16));
    __ orr(v1, __ T16B, v0, v0);

    __ br(Assembler::CC, L_rounds_44);
    __ br(Assembler::EQ, L_rounds_52);

    __ aesd(v0, v17); __ aesimc(v0, v0);
    __ aesd(v0, v18); __ aesimc(v0, v0);
    __ BIND(L_rounds_52);
    __ aesd(v0, v19); __ aesimc(v0, v0);
    __ aesd(v0, v20); __ aesimc(v0, v0);
    __ BIND(L_rounds_44);
    __ aesd(v0, v21); __ aesimc(v0, v0);
    __ aesd(v0, v22); __ aesimc(v0, v0);
    __ aesd(v0, v23); __ aesimc(v0, v0);
    __ aesd(v0, v24); __ aesimc(v0, v0);
    __ aesd(v0, v25); __ aesimc(v0, v0);
    __ aesd(v0, v26); __ aesimc(v0, v0);
    __ aesd(v0, v27); __ aesimc(v0, v0);
    __ aesd(v0, v28); __ aesimc(v0, v0);
    __ aesd(v0, v29); __ aesimc(v0, v0);
    __ aesd(v0, v30);
    __ eor(v0, __ T16B, v0, v31);
    __ eor(v0, __ T16B, v0, v2);

    __ st1(v0, __ T16B, __ post(to, 16));
    __ orr(v2, __ T16B, v1, v1);

    __ sub(len_reg, len_reg, 16);
    __ cbnz(len_reg, L_aes_loop);

    __ st1(v2, __ T16B, rvec);

    __ mov(r0, rscratch2);

    __ leave();
    __ ret(lr);

    return start;
  }
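
  // A sketch of the chaining above (standard CBC decryption; the notation
  // is ours, not from the code):
  //
  //   P[i] = D_K(C[i]) ^ C[i-1]    with C[-1] = IV, kept in v2
  //
  // Each ciphertext block is copied aside before decryption (orr v1, v0,
  // v0) so it can serve as the chaining value for the next block (orr v2,
  // v1, v1 after the store).  Unlike CBC encryption, the decryptions of
  // different blocks are data-independent, so an implementation could in
  // principle process several blocks in parallel; this one stays serial.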
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - byte[]  source+offset
  //   c_rarg1   - int[]   SHA.state
  //   c_rarg2   - int     offset
  //   c_rarg3   - int     limit
  //
  address generate_sha1_implCompress(bool multi_block, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Register buf   = c_rarg0;
    Register state = c_rarg1;
    Register ofs   = c_rarg2;
    Register limit = c_rarg3;

    Label keys;
    Label sha1_loop;

    // load the keys into v0..v3
    __ adr(rscratch1, keys);
    __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
    // load 5 words state into v6, v7
    __ ldrq(v6, Address(state, 0));
    __ ldrs(v7, Address(state, 16));

    __ BIND(sha1_loop);
    // load 64 bytes of data into v16..v19
    __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
    __ rev32(v16, __ T16B, v16);
    __ rev32(v17, __ T16B, v17);
    __ rev32(v18, __ T16B, v18);
    __ rev32(v19, __ T16B, v19);

    // do the sha1
    __ addv(v4, __ T4S, v16, v0);
    __ orr(v20, __ T16B, v6, v6);

    FloatRegister d0 = v16;
    FloatRegister d1 = v17;
    FloatRegister d2 = v18;
    FloatRegister d3 = v19;

    for (int round = 0; round < 20; round++) {
      FloatRegister tmp1 = (round & 1) ? v4 : v5;
      FloatRegister tmp2 = (round & 1) ? v21 : v22;
      FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
      FloatRegister tmp4 = (round & 1) ? v5 : v4;
      FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));

      if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
      if (round < 19) __ addv(tmp1, __ T4S, d1, key);
      __ sha1h(tmp2, __ T4S, v20);
      if (round < 5)
        __ sha1c(v20, __ T4S, tmp3, tmp4);
      else if (round < 10 || round >= 15)
        __ sha1p(v20, __ T4S, tmp3, tmp4);
      else
        __ sha1m(v20, __ T4S, tmp3, tmp4);
      if (round < 16) __ sha1su1(d0, __ T4S, d3);

      tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
    }

    __ addv(v7, __ T2S, v7, v21);
    __ addv(v6, __ T4S, v6, v20);

    if (multi_block) {
      __ add(ofs, ofs, 64);
      __ cmp(ofs, limit);
      __ br(Assembler::LE, sha1_loop);
      __ mov(c_rarg0, ofs); // return ofs
    }

    __ strq(v6, Address(state, 0));
    __ strs(v7, Address(state, 16));

    __ ret(lr);

    __ bind(keys);
    __ emit_int32(0x5a827999);
    __ emit_int32(0x6ed9eba1);
    __ emit_int32(0x8f1bbcdc);
    __ emit_int32(0xca62c1d6);

    return start;
  }
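
  // Note (the round mapping below follows the SHA-1 spec; it is our
  // reading of the loop, not a comment from the code): each of the 20
  // loop iterations retires four of SHA-1's 80 rounds, and the
  // sha1c/sha1p/sha1m selection mirrors the per-round function f:
  //
  //   rounds  0..19   Ch     -> sha1c   (K = 0x5a827999)
  //   rounds 20..39   Parity -> sha1p   (K = 0x6ed9eba1)
  //   rounds 40..59   Maj    -> sha1m   (K = 0x8f1bbcdc)
  //   rounds 60..79   Parity -> sha1p   (K = 0xca62c1d6)
  //
  // which is why the words emitted at 'keys' are exactly the four
  // standard K constants, replicated across lanes by the ld4r above.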
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - byte[]  source+offset
  //   c_rarg1   - int[]   SHA.state
  //   c_rarg2   - int     offset
  //   c_rarg3   - int     limit
  //
  address generate_sha256_implCompress(bool multi_block, const char *name) {
    static const uint32_t round_consts[64] = {
      0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
      0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
      0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
      0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
      0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
      0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
      0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
      0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
      0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
      0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
      0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
      0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
      0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
      0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
      0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
      0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
    };
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Register buf   = c_rarg0;
    Register state = c_rarg1;
    Register ofs   = c_rarg2;
    Register limit = c_rarg3;

    Label sha1_loop;

    __ stpd(v8, v9, __ pre(sp, -32));
    __ stpd(v10, v11, Address(sp, 16));

    // dga == v0
    // dgb == v1
    // dg0 == v2
    // dg1 == v3
    // dg2 == v4
    // t0 == v6
    // t1 == v7

    // load 16 keys to v16..v31
    __ lea(rscratch1, ExternalAddress((address)round_consts));
    __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
    __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
    __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
    __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);

    // load 8 words (256 bits) state
    __ ldpq(v0, v1, state);

    __ BIND(sha1_loop);
    // load 64 bytes of data into v8..v11
    __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
    __ rev32(v8, __ T16B, v8);
    __ rev32(v9, __ T16B, v9);
    __ rev32(v10, __ T16B, v10);
    __ rev32(v11, __ T16B, v11);

    __ addv(v6, __ T4S, v8, v16);
    __ orr(v2, __ T16B, v0, v0);
    __ orr(v3, __ T16B, v1, v1);

    FloatRegister d0 = v8;
    FloatRegister d1 = v9;
    FloatRegister d2 = v10;
    FloatRegister d3 = v11;

    for (int round = 0; round < 16; round++) {
      FloatRegister tmp1 = (round & 1) ? v6 : v7;
      FloatRegister tmp2 = (round & 1) ? v7 : v6;
      FloatRegister tmp3 = (round & 1) ? v2 : v4;
      FloatRegister tmp4 = (round & 1) ? v4 : v2;

      if (round < 12) __ sha256su0(d0, __ T4S, d1);
      __ orr(v4, __ T16B, v2, v2);
      if (round < 15)
        __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
      __ sha256h(v2, __ T4S, v3, tmp2);
      __ sha256h2(v3, __ T4S, v4, tmp2);
      if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);

      tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
    }

    __ addv(v0, __ T4S, v0, v2);
    __ addv(v1, __ T4S, v1, v3);

    if (multi_block) {
      __ add(ofs, ofs, 64);
      __ cmp(ofs, limit);
      __ br(Assembler::LE, sha1_loop);
      __ mov(c_rarg0, ofs); // return ofs
    }

    __ ldpd(v10, v11, Address(sp, 16));
    __ ldpd(v8, v9, __ post(sp, 32));

    __ stpq(v0, v1, state);

    __ ret(lr);

    return start;
  }

#ifndef BUILTIN_SIM
  // Safefetch stubs.
  void generate_safefetch(const char* name, int size, address* entry,
                          address* fault_pc, address* continuation_pc) {
    // safefetch signatures:
    //   int      SafeFetch32(int*      adr, int      errValue);
    //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
    //
    // arguments:
    //   c_rarg0 = adr
    //   c_rarg1 = errValue
    //
    // result:
    //   r0 = *adr or errValue

    StubCodeMark mark(this, "StubRoutines", name);

    // Entry point, pc or function descriptor.
    *entry = __ pc();

    // Load *adr into c_rarg1, may fault.
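    // Note (a sketch of the VM side; the handler logic shown is assumed,
    // not emitted here): no explicit recovery code follows.  If the load
    // below faults, the platform signal handler is expected to recognize
    // the faulting pc recorded via *fault_pc and resume execution at
    // *continuation_pc, leaving errValue untouched in c_rarg1, roughly:
    //
    //   // in the signal handler (sketch):
    //   //   if (StubRoutines::is_safefetch_fault(pc))
    //   //     resume_at(StubRoutines::continuation_for_safefetch_fault(pc));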
    *fault_pc = __ pc();
    switch (size) {
      case 4:
        // int32_t
        __ ldrw(c_rarg1, Address(c_rarg0, 0));
        break;
      case 8:
        // int64_t
        __ ldr(c_rarg1, Address(c_rarg0, 0));
        break;
      default:
        ShouldNotReachHere();
    }

    // return errValue or *adr
    *continuation_pc = __ pc();
    __ mov(r0, c_rarg1);
    __ ret(lr);
  }
#endif

  /**
   * Arguments:
   *
   * Inputs:
   *   c_rarg0   - int   crc
   *   c_rarg1   - byte* buf
   *   c_rarg2   - int   length
   *
   * Output:
   *   r0        - int crc result
   *
   * Preserves:
   *   r13
   *
   */
  address generate_updateBytesCRC32() {
    assert(UseCRC32Intrinsics, "what are we doing here?");

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");

    address start = __ pc();

    const Register crc    = c_rarg0;  // crc
    const Register buf    = c_rarg1;  // source java byte array address
    const Register len    = c_rarg2;  // length
    const Register table0 = c_rarg3;  // crc_table address
    const Register table1 = c_rarg4;
    const Register table2 = c_rarg5;
    const Register table3 = c_rarg6;
    const Register tmp3   = c_rarg7;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    __ kernel_crc32(crc, buf, len,
                    table0, table1, table2, table3, rscratch1, rscratch2, tmp3);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }

  /**
   * Arguments:
   *
   * Input:
   *   c_rarg0   - x address
   *   c_rarg1   - x length
   *   c_rarg2   - y address
   *   c_rarg3   - y length
   *   c_rarg4   - z address
   *   c_rarg5   - z length
   */
  address generate_multiplyToLen() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "multiplyToLen");

    address start = __ pc();
    const Register x    = r0;
    const Register xlen = r1;
    const Register y    = r2;
    const Register ylen = r3;
    const Register z    = r4;
    const Register zlen = r5;

    const Register tmp1 = r10;
    const Register tmp2 = r11;
    const Register tmp3 = r12;
    const Register tmp4 = r13;
    const Register tmp5 = r14;
    const Register tmp6 = r15;
    const Register tmp7 = r16;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }
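
  // Note (a sketch of the intrinsic's contract as we understand it; the
  // Java-side sizing shown is illustrative): this stub backs
  // BigInteger.multiplyToLen, whose result array must be able to hold
  // xlen + ylen ints, e.g.
  //
  //   // int[] z = new int[xlen + ylen];  // 2-int x * 3-int y -> 5 ints
  //   // multiplyToLen(x, xlen, y, ylen, z, zlen);
  //
  // so zlen == xlen + ylen on entry and the product always fits in z.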
  // Continuation point for throwing of implicit exceptions that are
  // not handled in the current activation. Fabricates an exception
  // oop and initiates normal exception dispatching in this
  // frame. Since we need to preserve callee-saved values (currently
  // only for C2, but done for C1 as well) we need a callee-saved oop
  // map and therefore have to make these stubs into RuntimeStubs
  // rather than BufferBlobs.  If the compiler needs all registers to
  // be preserved between the fault point and the exception handler
  // then it must assume responsibility for that in
  // AbstractCompiler::continuation_for_implicit_null_exception or
  // continuation_for_implicit_division_by_zero_exception.  All other
  // implicit exceptions (e.g., NullPointerException or
  // AbstractMethodError on entry) are either at call sites or
  // otherwise assume that stack unwinding will be initiated, so
  // caller saved registers were assumed volatile in the compiler.

#undef __
#define __ masm->

  address generate_throw_exception(const char* name,
                                   address runtime_entry,
                                   Register arg1 = noreg,
                                   Register arg2 = noreg) {
    // Information about frame layout at time of blocking runtime call.
    // Note that we only have to preserve callee-saved registers since
    // the compilers are responsible for supplying a continuation point
    // if they expect all registers to be preserved.
    // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
    enum layout {
      rfp_off = 0,
      rfp_off2,
      return_off,
      return_off2,
      framesize // inclusive of return address
    };

    int insts_size = 512;
    int locs_size  = 64;

    CodeBuffer code(name, insts_size, locs_size);
    OopMapSet* oop_maps  = new OopMapSet();
    MacroAssembler* masm = new MacroAssembler(&code);

    address start = __ pc();

    // This is an inlined and slightly modified version of call_VM
    // which has the ability to fetch the return PC out of
    // thread-local storage and also sets up last_Java_sp slightly
    // differently than the real call_VM

    __ enter(); // Save FP and LR before call

    assert(is_even(framesize/2), "sp not 16-byte aligned");

    // lr and fp are already in place
    __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog

    int frame_complete = __ pc() - start;

    // Set up last_Java_sp and last_Java_fp
    address the_pc = __ pc();
    __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1);

    // Call runtime
    if (arg1 != noreg) {
      assert(arg2 != c_rarg1, "clobbered");
      __ mov(c_rarg1, arg1);
    }
    if (arg2 != noreg) {
      __ mov(c_rarg2, arg2);
    }
    __ mov(c_rarg0, rthread);
    BLOCK_COMMENT("call runtime_entry");
    __ mov(rscratch1, runtime_entry);
    __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);

    // Generate oop map
    OopMap* map = new OopMap(framesize, 0);

    oop_maps->add_gc_map(the_pc - start, map);

    __ reset_last_Java_frame(true, true);
    __ maybe_isb();

    __ leave();

    // check for pending exceptions
#ifdef ASSERT
    Label L;
    __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
    __ cbnz(rscratch1, L);
    __ should_not_reach_here();
    __ bind(L);
#endif // ASSERT
    __ b(RuntimeAddress(StubRoutines::forward_exception_entry()));


    // codeBlob framesize is in words (not VMRegImpl::slot_size)
    RuntimeStub* stub =
      RuntimeStub::new_runtime_stub(name,
                                    &code,
                                    frame_complete,
                                    (framesize >> (LogBytesPerWord - LogBytesPerInt)),
                                    oop_maps, false);
    return stub->entry_point();
  }
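
  // Worked arithmetic for the frame size above (our gloss, consistent
  // with the enum): the layout counts 32-bit slots, so framesize == 4
  // slots == two 64-bit words (saved rfp plus the return address) == 16
  // bytes, which keeps sp 16-byte aligned as the assert demands.  The
  // value passed to new_runtime_stub converts slots to words:
  //
  //   framesize >> (LogBytesPerWord - LogBytesPerInt)  ==  4 >> 1  ==  2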

  // Initialization
  void generate_initial() {
    // Generates the initial stubs and initializes the entry points.

    // Entry points that exist on all platforms.  Note: this is code
    // that could be shared among different platforms - however the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also comment in
    // stubRoutines.hpp.

    StubRoutines::_forward_exception_entry = generate_forward_exception();

    StubRoutines::_call_stub_entry =
      generate_call_stub(StubRoutines::_call_stub_return_address);

    // is referenced by megamorphic call
    StubRoutines::_catch_exception_entry = generate_catch_exception();

    // Build this early so it's available for the interpreter.
    StubRoutines::_throw_StackOverflowError_entry =
      generate_throw_exception("StackOverflowError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_StackOverflowError));
    if (UseCRC32Intrinsics) {
      // set table address before stub generation which uses it
      StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
    }
  }

  void generate_all() {
    // support for verify_oop (must happen after universe_init)
    StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
    StubRoutines::_throw_AbstractMethodError_entry =
      generate_throw_exception("AbstractMethodError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_AbstractMethodError));

    StubRoutines::_throw_IncompatibleClassChangeError_entry =
      generate_throw_exception("IncompatibleClassChangeError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_IncompatibleClassChangeError));

    StubRoutines::_throw_NullPointerException_at_call_entry =
      generate_throw_exception("NullPointerException at call throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_NullPointerException_at_call));

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }

#ifndef BUILTIN_SIM
    if (UseAESIntrinsics) {
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
    }

    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_implCompress   = generate_sha1_implCompress(false, "sha1_implCompress");
      StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true,  "sha1_implCompressMB");
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
    }

    // Safefetch stubs.
    generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
                       &StubRoutines::_safefetch32_fault_pc,
                       &StubRoutines::_safefetch32_continuation_pc);
    generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
                       &StubRoutines::_safefetchN_fault_pc,
                       &StubRoutines::_safefetchN_continuation_pc);
#endif
  }

 public:
  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
    if (all) {
      generate_all();
    } else {
      generate_initial();
    }
  }
}; // end class declaration

void StubGenerator_generate(CodeBuffer* code, bool all) {
  StubGenerator g(code, all);
}
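
// Note (a sketch of the expected callers; the exact call sites live in
// shared runtime code, not here): the VM runs this generator twice during
// startup, along the lines of
//
//   // early startup:            StubGenerator_generate(code1, false);
//   // after universe init:      StubGenerator_generate(code2, true);
//
// the first pass emitting the stubs the interpreter needs immediately
// (call stub, exception forwarding, CRC32), the second emitting
// everything else (arraycopy, AES, SHA, safefetch, throw stubs).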