/*
 * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif

#ifdef BUILTIN_SIM
#include "../../../../../../simulator/simulator.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->
#define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
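// A note for readers: TIMES_OOP is the address scale used when indexing an
// oop array. As a sketch (not code from this file), an element address can
// be formed as
//   Address(base, index, TIMES_OOP)
// which yields base + (index << 2) with compressed oops and
// base + (index << 3) otherwise.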
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(int& counter) {
    __ lea(rscratch2, ExternalAddress((address)&counter));
    __ ldrw(rscratch1, Address(rscratch2));
    __ addw(rscratch1, rscratch1, 1);
    __ strw(rscratch1, Address(rscratch2));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:  call wrapper address              address
  //    c_rarg1:  result                            address
  //    c_rarg2:  result type                       BasicType
  //    c_rarg3:  method                            Method*
  //    c_rarg4:  (interpreter) entry point         address
  //    c_rarg5:  parameters                        intptr_t*
  //    c_rarg6:  parameter size (in words)         int
  //    c_rarg7:  thread                            Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the c arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -27 [ argument word 1      ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper   (r0)  ]
  //  -7 [ result         (r1)  ]
  //  -6 [ result type    (r2)  ]
  //  -5 [ method         (r3)  ]
  //  -4 [ entry point    (r4)  ]
  //  -3 [ parameters     (r5)  ]
  //  -2 [ parameter size (r6)  ]
  //  -1 [ thread         (r7)  ]
  //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  //   1 [ saved lr       (r30) ]

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off  = -26,

    d15_off            = -26,
    d13_off            = -24,
    d11_off            = -22,
    d9_off             = -20,

    r28_off            = -18,
    r26_off            = -16,
    r24_off            = -14,
    r22_off            = -12,
    r20_off            = -10,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0,
    retaddr_off        =   1,
  };
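  // For illustration (just the enum above worked through with wordSize == 8):
  // sp_after_call sits at fp - 208, the thread slot at fp - 8, the saved fp
  // at fp itself and the saved lr at fp + 8. Only the even offsets get names
  // because the registers are saved and restored in stp/ldp pairs anchored
  // at the lower slot of each pair.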
  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    const Address sp_after_call(rfp, sp_after_call_off * wordSize);

    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off  * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);

    // stub code

    // we need a C prolog to bootstrap the x86 caller into the sim
    __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);

    address aarch64_entry = __ pc();

#ifdef BUILTIN_SIM
    // Save sender's SP for stack traces.
    __ mov(rscratch1, sp);
    __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
#endif
    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5, entry_point);
    __ stp(c_rarg2, c_rarg3, result_type);
    __ stp(c_rarg0, c_rarg1, call_wrapper);

    __ stp(r20, r19,  r20_save);
    __ stp(r22, r21,  r22_save);
    __ stp(r24, r23,  r24_save);
    __ stp(r26, r25,  r26_save);
    __ stp(r28, r27,  r28_save);

    __ stpd(v9,  v8,  d9_save);
    __ stpd(v11, v10, d11_save);
    __ stpd(v13, v12, d13_save);
    __ stpd(v15, v14, d15_save);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (unsigned)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing Method* and current sp
    //      rmethod: Method*
    //      r13: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r13, sp);
    __ blr(c_rarg4);

    // tell the simulator we have returned to the stub

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    if (NotifySimulator) {
      __ notify(Assembler::method_reentry);
    }
    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    __ ldr(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ldr(j_rarg1, result_type);
    __ cmp(j_rarg1, T_OBJECT);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(j_rarg1, T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    // restore callee-save registers
    __ ldpd(v15, v14, d15_save);
    __ ldpd(v13, v12, d13_save);
    __ ldpd(v11, v10, d11_save);
    __ ldpd(v9,  v8,  d9_save);

    __ ldp(r28, r27,  r28_save);
    __ ldp(r26, r25,  r26_save);
    __ ldp(r24, r23,  r24_save);
    __ ldp(r22, r21,  r22_save);
    __ ldp(r20, r19,  r20_save);

    __ ldp(c_rarg0, c_rarg1, call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3,  method);
    __ ldp(c_rarg4, c_rarg5, entry_point);
    __ ldp(c_rarg6, c_rarg7, parameter_size);

#ifndef PRODUCT
    // tell the simulator we are about to end Java execution
    if (NotifySimulator) {
      __ notify(Assembler::method_exit);
    }
#endif
    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT

    __ BIND(is_long);
    __ str(r0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    return start;
  }
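  // For reference, a sketch of how the VM invokes the blob generated above
  // (the authoritative typedef lives in stubRoutines.hpp):
  //
  //   typedef void (*CallStub)(address   link,            // call wrapper
  //                            intptr_t* result,
  //                            BasicType result_type,
  //                            Method*   method,
  //                            address   entry_point,
  //                            intptr_t* parameters,
  //                            int       size_of_parameters,
  //                            TRAPS);
  //
  // so the eight C arguments arrive in c_rarg0..c_rarg7 exactly as laid
  // out in the frame diagram above.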
  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // rsp.
  //
  // r0: exception oop

  // NOTE: this is used as a target from the signal handler so it
  // needs an x86 prolog which returns into the current simulator
  // executing the generated catch_exception code. so the prolog
  // needs to install rax in a sim register and adjust the sim's
  // restart pc to enter the generated code at the start position
  // then return from native to simulated execution.

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread       (rfp, thread_off        * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }
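  // Back in C++ the caller then typically tests for the exception recorded
  // above with the utilities/exceptions.hpp macros, roughly (a sketch, not
  // code from this file):
  //
  //   if (HAS_PENDING_EXCEPTION) {
  //     Handle ex(THREAD, PENDING_EXCEPTION);
  //     CLEAR_PENDING_EXCEPTION;
  //     ... // handle or rethrow ex
  //   }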
  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then calls into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);
    __ mov(r19, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }
  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {

    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is NULL it is OK

    // Check if the oop is in the right area of memory
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
    __ andr(c_rarg2, r0, c_rarg3);
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());

    // Compare c_rarg2 and c_rarg3.  We don't use a compare
    // instruction here because the flags register is live.
    __ eor(c_rarg2, c_rarg2, c_rarg3);
    __ cbnz(c_rarg2, error);

    // make sure klass is 'reasonable', which is not zero.
    __ load_klass(r0, r0); // get klass
    __ cbz(r0, error);     // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blrt(rscratch1, 3, 0, 1);

    return start;
  }

  void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }

  // Generate code for an array write pre barrier
  //
  //     addr  - starting address
  //     count - element count
  //     tmp   - scratch register
  //
  //     Destroy no registers except rscratch1 and rscratch2
  //
  void gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
    case BarrierSet::G1SATBCTLogging:
      // With G1, don't generate the call if we statically know that the target is uninitialized
      if (!dest_uninitialized) {
        __ push_call_clobbered_registers();
        if (count == c_rarg0) {
          if (addr == c_rarg1) {
            // exactly backwards!!
            __ mov(rscratch1, c_rarg0);
            __ mov(c_rarg0, c_rarg1);
            __ mov(c_rarg1, rscratch1);
          } else {
            __ mov(c_rarg1, count);
            __ mov(c_rarg0, addr);
          }
        } else {
          __ mov(c_rarg0, addr);
          __ mov(c_rarg1, count);
        }
        __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2);
        __ pop_call_clobbered_registers();
      }
      break;
    case BarrierSet::CardTableForRS:
    case BarrierSet::CardTableExtension:
    case BarrierSet::ModRef:
      break;
    default:
      ShouldNotReachHere();
    }
  }
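  // The leaf call above follows the usual two-argument VM-leaf convention;
  // the target is expected to look roughly like (a sketch of the BarrierSet
  // interface, see barrierSet.hpp for the real declaration)
  //
  //   static void static_write_ref_array_pre(HeapWord* start, size_t count);
  //
  // so the destination must land in c_rarg0 and the count in c_rarg1, which
  // is why the register shuffle above handles the "exactly backwards" case.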
  //
  // Generate code for an array write post barrier
  //
  //  Input:
  //     start   - register containing starting address of destination array
  //     end     - register containing ending address of destination array
  //     scratch - scratch register
  //
  //  The input registers are overwritten.
  //  The ending address is inclusive.
  void gen_write_ref_array_post_barrier(Register start, Register end, Register scratch) {
    assert_different_registers(start, end, scratch);
    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
    case BarrierSet::G1SATBCTLogging:

      {
        __ push_call_clobbered_registers();
        // must compute element count unless barrier set interface is changed (other platforms supply count)
        assert_different_registers(start, end, scratch);
        __ lea(scratch, Address(end, BytesPerHeapOop));
        __ sub(scratch, scratch, start);               // subtract start to get #bytes
        __ lsr(scratch, scratch, LogBytesPerHeapOop);  // convert to element count
        __ mov(c_rarg0, start);
        __ mov(c_rarg1, scratch);
        __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
        __ pop_call_clobbered_registers();
      }
      break;
    case BarrierSet::CardTableForRS:
    case BarrierSet::CardTableExtension:
      {
        CardTableModRefBS* ct = (CardTableModRefBS*)bs;
        assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");

        Label L_loop;

        __ lsr(start, start, CardTableModRefBS::card_shift);
        __ lsr(end, end, CardTableModRefBS::card_shift);
        __ sub(end, end, start); // number of bytes to copy

        const Register count = end; // 'end' register contains bytes count now
        __ load_byte_map_base(scratch);
        __ add(start, start, scratch);
        if (UseConcMarkSweepGC) {
          __ membar(__ StoreStore);
        }
        __ BIND(L_loop);
        __ strb(zr, Address(start, count));
        __ subs(count, count, 1);
        __ br(Assembler::GE, L_loop);
      }
      break;
    default:
      ShouldNotReachHere();
    }
  }

  // The inner part of zero_words().  This is the bulk operation,
  // zeroing words in blocks, possibly using DC ZVA to do it.  The
  // caller is responsible for zeroing the last few words.
  //
  // Inputs:
  // r10: the HeapWord-aligned base address of an array to zero.
  // r11: the count in HeapWords, r11 > 0.
  //
  // Returns r10 and r11, adjusted for the caller to clear.
  // r10: the base address of the tail of words left to clear.
  // r11: the number of words in the tail.
  //      r11 < MacroAssembler::zero_words_block_size.

  address generate_zero_blocks() {
    Label store_pair, loop_store_pair, done;
    Label base_aligned;

    Register base = r10, cnt = r11;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "zero_blocks");
    address start = __ pc();

    if (UseBlockZeroing) {
      int zva_length = VM_Version::zva_length();

      // Ensure ZVA length can be divided by 16. This is required by
      // the subsequent operations.
      assert (zva_length % 16 == 0, "Unexpected ZVA Length");

      __ tbz(base, 3, base_aligned);
      __ str(zr, Address(__ post(base, 8)));
      __ sub(cnt, cnt, 1);
      __ bind(base_aligned);

      // Ensure count >= zva_length * 2 so that it still deserves a zva after
      // alignment.
      Label small;
      int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
      __ cmp(cnt, low_limit >> 3);
      __ br(Assembler::LT, small);
      __ zero_dcache_blocks(base, cnt);
      __ bind(small);
    }
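    // (note on the fast path above: each "dc zva" issued inside
    // zero_dcache_blocks clears one zva_length-byte block with a single
    // instruction, which is why the base pointer is first 8-byte aligned
    // and the count is kept large enough to cover whole blocks)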
    {
      // Number of stp instructions we'll unroll
      const int unroll =
        MacroAssembler::zero_words_block_size / 2;
      // Clear the remaining blocks.
      Label loop;
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::LT, done);
      __ bind(loop);
      for (int i = 0; i < unroll; i++)
        __ stp(zr, zr, __ post(base, 16));
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::GE, loop);
      __ bind(done);
      __ add(cnt, cnt, unroll * 2);
    }

    __ ret(lr);

    return start;
  }


  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(Label &start, Register s, Register d, Register count,
                           copy_direction direction) {
    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;

    int offset;
    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r10, t6 = r11, t7 = r12;
    const Register stride = r13;

    assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1);

    Label again, drain;
    const char *stub_name;
    if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
    else
      stub_name = "backward_copy_longs";
    StubCodeMark mark(this, "StubRoutines", stub_name);
    __ align(CodeEntryAlignment);
    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, 8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }
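    // The main loop below is software-pipelined: each iteration stores the
    // eight words loaded by the previous iteration while issuing the loads
    // for the next one, so the loads and stores of successive 64-byte
    // blocks overlap.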
    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
        __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
      } else {
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        __ stp(t0, t1, Address(d, 2 * unit));
        __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.
      if (direction == copy_forwards) {
        __ sub(s, s, 16);
        __ sub(d, d, 8);
      }

      // Fill 8 registers
      //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 bit block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
      //
      // t0 at offset 0,  t1 at offset 8
      // t2 at offset 16, t3 at offset 24
      // t4 at offset 32, t5 at offset 40
      // t6 at offset 48, t7 at offset 56

      // for backwards copy s was not offset so the register contents
      // are at these offsets into the preceding 64 byte block
      // relative to that original input and so on for each successive
      // preceding 64 byte block when s is updated. this explains the
      // slightly counter-intuitive looking pattern of register usage
      // in the stp instructions for backwards copy.
      //
      // t0 at offset -16, t1 at offset -8
      // t2 at offset -32, t3 at offset -24
      // t4 at offset -48, t5 at offset -40
      // t6 at offset -64, t7 at offset -56

      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));

      __ subs(count, count, 16);
      __ br(Assembler::LO, drain);

      int prefetch = PrefetchCopyIntervalInBytes;
      bool use_stride = false;
      if (direction == copy_backwards) {
        use_stride = prefetch > 256;
        prefetch = -prefetch;
        if (use_stride) __ mov(stride, prefetch);
      }

      __ bind(again);

      if (PrefetchCopyIntervalInBytes > 0)
        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
        // allowing for the offset of -8 the store instructions place
        // registers into the target 64 bit block at the following
        // offsets
        //
        // t0 at offset 0
        // t1 at offset 8,  t2 at offset 16
        // t3 at offset 24, t4 at offset 32
        // t5 at offset 40, t6 at offset 48
        // t7 at offset 56

        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
        // d was not offset when we started so the registers are
        // written into the 64 bit block preceding d with the following
        // offsets
        //
        // t1 at offset -8
        // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
        // t7 at offset -56, t4 at offset -48
        // t6 at offset -64
        //
        // note that this matches the offsets previously noted for the
        // loads

        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);
      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
      } else {
        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
      }
      // now we need to copy any remaining part block which may
      // include a 4 word block subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ stp(t1, t2, Address(d, 2 * unit));
          __ str(t3, Address(__ pre(d, 4 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ stp(t3, t0, Address(d, 3 * unit));
          __ str(t2, Address(__ pre(d, 4 * unit)));
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ str(t1, Address(__ pre(d, 2 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ str(t0, Address(__ pre(d, 2 * unit)));
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;
    int unit = wordSize * direction;

    Label Lpair, Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.
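    // Worked example (an illustration, not extra semantics): with byte
    // granularity and count == 11 (0b1011), bit 3 is set so 8 bytes are
    // copied, bit 2 is clear so the 4-byte move is skipped, and bits 1
    // and 0 copy 2 bytes and 1 byte respectively -- 11 bytes in total.
    // For wider granularities the same bits are tested shifted down by
    // log2(granularity).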
    __ tbz(count, 3 - exact_log2(granularity), Lword);
    __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
    __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }

  Label copy_f, copy_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //

  void copy_memory(bool is_aligned, Register s, Register d,
                   Register count, Register tmp, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    // <= 96 bytes do inline. Direction doesn't matter because we always
    // load all the data before writing anything
    Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
    const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
    const Register send = r17, dend = r18;

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
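    // Size classes handled inline, in bytes (reading off the comparisons
    // below): 0..16 -> copy16, 17..32 -> copy32, 33..64 -> fall-through,
    // 65..80 (96 with SIMD) -> copy80, anything larger -> copy_big.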
    __ cmp(count, (UseSIMDForMemoryOps ? 96:80)/granularity);
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, 16/granularity);
    __ br(Assembler::LS, copy16);

    __ cmp(count, 64/granularity);
    __ br(Assembler::HI, copy80);

    __ cmp(count, 32/granularity);
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 0));
      __ ldpq(v2, v3, Address(send, -32));
      __ stpq(v0, v1, Address(d, 0));
      __ stpq(v2, v3, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(send, -32));
      __ ldp(t6, t7, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(dend, -32));
      __ stp(t6, t7, Address(dend, -16));
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    __ ldp(t0, t1, Address(s, 0));
    __ ldp(t2, t3, Address(send, -16));
    __ stp(t0, t1, Address(d, 0));
    __ stp(t2, t3, Address(dend, -16));
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
      __ ldpq(v4, v5, Address(send, -32));
      __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
      __ stpq(v4, v5, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(s, 32));
      __ ldp(t6, t7, Address(s, 48));
      __ ldp(t8, t9, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(d, 32));
      __ stp(t6, t7, Address(d, 48));
      __ stp(t8, t9, Address(dend, -16));
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, 8/granularity);
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    __ ldr(t0, Address(s, 0));
    __ ldr(t1, Address(send, -8));
    __ str(t0, Address(d, 0));
    __ str(t1, Address(dend, -8));
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean in the 1 byte case we load/store the same
          // byte 3 times.
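          // Concretely (an illustration): count == 3 copies s[0]->d[0],
          // s[2]->d[2] and, via base + count/2 == base + 1, s[1]->d[1];
          // count == 1 copies s[0] three times over; count == 2 copies
          // s[0] and s[1], plus a redundant second copy of s[1].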
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }

    __ bind(copy_big);
    if (is_backwards) {
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now we've got the small case out of the way we can align the
    // source address on a 2-word boundary.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
      __ tbz(s, exact_log2(wordSize), aligned);
      __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
      __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
      __ sub(count, count, wordSize/granularity);
    } else {
      if (is_backwards) {
        __ andr(rscratch2, s, 2 * wordSize - 1);
      } else {
        __ neg(rscratch2, s);
        __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
      }
      // rscratch2 is the byte adjustment needed to align s.
      __ cbz(rscratch2, aligned);
      int shift = exact_log2(granularity);
      if (shift)  __ lsr(rscratch2, rscratch2, shift);
      __ sub(count, count, rscratch2);

#if 0
      // ?? This code is only correct for a disjoint copy.  It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, rscratch2);
        __ sub(d, d, rscratch2);
      } else {
        __ add(s, s, rscratch2);
        __ add(d, d, rscratch2);
      }
#else
      copy_memory_small(s, d, rscratch2, rscratch1, step);
#endif
    }

    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes.  Adjust the
    // count and do a bulk copy of words.
    __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
    if (direction == copy_forwards)
      __ bl(copy_f);
    else
      __ bl(copy_b);

    // And the tail.
    copy_memory_small(s, d, count, tmp, step);

    if (granularity >= 8) __ bind(copy8);
    if (granularity >= 4) __ bind(copy4);
    __ bind(finish);
  }


  void clobber_registers() {
#ifdef ASSERT
    __ mov(rscratch1, (uint64_t)0xdeadbeef);
    __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
    for (Register r = r3; r <= r18; r++)
      if (r != rscratch1) __ mov(r, rscratch1);
#endif
  }
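  // (the mov/orr pair above materializes 0xdeadbeefdeadbeef: orr-ing the
  // value with itself shifted left 32 duplicates the low word into the
  // high word)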
  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers rscratch1 and rscratch2.
  void verify_oop_array (size_t size, Register a, Register count, Register temp) {
    Label loop, end;
    __ mov(rscratch1, a);
    __ mov(rscratch2, zr);
    __ bind(loop);
    __ cmp(rscratch2, count);
    __ br(Assembler::HS, end);
    if (size == (size_t)wordSize) {
      __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ verify_oop(temp);
    } else {
      // narrow oop: load into temp (callers pass r16) and decode in place
      __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ decode_heap_oop(temp); // calls verify_oop
    }
    __ add(rscratch2, rscratch2, size);
    __ b(loop);
    __ bind(end);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
                                 const char *name, bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    if (is_oop) {
      __ push(RegSet::of(d, count), sp);
      // no registers are destroyed by this call
      gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
    }
    copy_memory(aligned, s, d, count, rscratch1, size);
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
      gen_write_ref_array_post_barrier(d, count, rscratch1);
    }
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }
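  // A note on the overlap test in generate_conjoint_copy below: the
  // comparison is unsigned, so when d < s the subtraction d - s wraps to a
  // huge value and the HS branch is taken as well. That is exactly right:
  // a forward copy is destructive only when the destination starts inside
  // the source region, i.e. when 0 <= d - s < count*size in signed terms.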
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
                                 address *entry, const char *name,
                                 bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;

    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // use fwd copy when (d-s) above_equal (count*size)
    __ sub(rscratch1, d, s);
    __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
    __ br(Assembler::HS, nooverlap_target);

    if (is_oop) {
      __ push(RegSet::of(d, count), sp);
      // no registers are destroyed by this call
      gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
    }
    copy_memory(aligned, s, d, count, rscratch1, -size);
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
      gen_write_ref_array_post_barrier(d, count, rscratch1);
    }
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
  //
  address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
  }
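  // (the entry points produced by these helpers are ultimately installed in
  // the StubRoutines::_*_arraycopy table, from which both the interpreter
  // and compiled code pick up the copy routines)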
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                      address* entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_short_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_short_copy().
  //
  address generate_disjoint_short_copy(bool aligned,
                                       address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
                                       address *entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_int_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
  }
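  // (all of the jbyte/jshort/jint/jlong variants here are thin wrappers:
  // only the element size passed down to generate_disjoint_copy /
  // generate_conjoint_copy differs)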
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
                                     address *entry, const char *name,
                                     bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
  }


  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_long_copy(bool aligned, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_long_copy(bool aligned,
                                      address nooverlap_target, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_oop_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_oop_copy(bool aligned,
                                     address nooverlap_target, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
                                  name, dest_uninitialized);
  }
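  // (an oop array element is a 4-byte narrowOop when UseCompressedOops is
  // on and a full 8-byte oop otherwise, hence the sizeof(jint) /
  // sizeof(jlong) selection in the two oop-copy helpers above)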
1763 void generate_type_check(Register sub_klass,
1764 Register super_check_offset,
1765 Register super_klass,
1766 Label& L_success) {
1767 assert_different_registers(sub_klass, super_check_offset, super_klass);
1768
1769 BLOCK_COMMENT("type_check:");
1770
1771 Label L_miss;
1772
1773 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL,
1774 super_check_offset);
1775 __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1776
1777 // Fall through on failure!
1778 __ BIND(L_miss);
1779 }
1780
1781 //
1782 // Generate checkcasting array copy stub
1783 //
1784 // Input:
1785 // c_rarg0 - source array address
1786 // c_rarg1 - destination array address
1787 // c_rarg2 - element count, treated as ssize_t, can be zero
1788 // c_rarg3 - size_t ckoff (super_check_offset)
1789 // c_rarg4 - oop ckval (super_klass)
1790 //
1791 // Output:
1792 // r0 == 0 - success
1793 // r0 == -1^K - failure, where K is partial transfer count
1794 //
1795 address generate_checkcast_copy(const char *name, address *entry,
1796 bool dest_uninitialized = false) {
1797
1798 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1799
1800 // Input registers (after setup_arg_regs)
1801 const Register from = c_rarg0; // source array address
1802 const Register to = c_rarg1; // destination array address
1803 const Register count = c_rarg2; // elements count
1804 const Register ckoff = c_rarg3; // super_check_offset
1805 const Register ckval = c_rarg4; // super_klass
1806
1807 // Registers used as temps (r18, r19, r20 are save-on-entry)
1808 const Register count_save = r21; // orig elements count
1809 const Register start_to = r20; // destination array start address
1810 const Register copied_oop = r18; // actual oop copied
1811 const Register r19_klass = r19; // oop._klass
1812
1813 //---------------------------------------------------------------
1814 // Assembler stub will be used for this call to arraycopy
1815 // if the two arrays are subtypes of Object[] but the
1816 // destination array type is not equal to or a supertype
1817 // of the source type. Each element must be separately
1818 // checked.
1819
1820 assert_different_registers(from, to, count, ckoff, ckval, start_to,
1821 copied_oop, r19_klass, count_save);
1822
1823 __ align(CodeEntryAlignment);
1824 StubCodeMark mark(this, "StubRoutines", name);
1825 address start = __ pc();
1826
1827 __ enter(); // required for proper stackwalking of RuntimeStub frame
1828
1829 #ifdef ASSERT
1830 // caller guarantees that the arrays really are different
1831 // otherwise, we would have to make conjoint checks
1832 { Label L;
1833 array_overlap_test(L, TIMES_OOP);
1834 __ stop("checkcast_copy within a single array");
1835 __ bind(L);
1836 }
1837 #endif //ASSERT
1838
1839 // Caller of this entry point must set up the argument registers.
1840 if (entry != NULL) {
1841 *entry = __ pc();
1842 BLOCK_COMMENT("Entry:");
1843 }
1844
1845 // Empty array: Nothing to do.
1846 __ cbz(count, L_done);
1847
1848 __ push(RegSet::of(r18, r19, r20, r21), sp);
1849
1850 #ifdef ASSERT
1851 BLOCK_COMMENT("assert consistent ckoff/ckval");
1852 // The ckoff and ckval must be mutually consistent,
1853 // even though caller generates both.
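// In C terms (illustrative): assert(ckoff == ckval->super_check_offset()).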
1854 { Label L;
1855 int sco_offset = in_bytes(Klass::super_check_offset_offset());
1856 __ ldrw(start_to, Address(ckval, sco_offset));
1857 __ cmpw(ckoff, start_to);
1858 __ br(Assembler::EQ, L);
1859 __ stop("super_check_offset inconsistent");
1860 __ bind(L);
1861 }
1862 #endif //ASSERT
1863
1864 gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);
1865
1866 // save the original count
1867 __ mov(count_save, count);
1868
1869 // Copy from low to high addresses
1870 __ mov(start_to, to); // Save destination array start address
1871 __ b(L_load_element);
1872
1873 // ======== begin loop ========
1874 // (Loop is rotated; its entry is L_load_element.)
1875 // Loop control:
1876 // for (; count != 0; count--) {
1877 // copied_oop = load_heap_oop(from++);
1878 // ... generate_type_check ...;
1879 // store_heap_oop(to++, copied_oop);
1880 // }
1881 __ align(OptoLoopAlignment);
1882
1883 __ BIND(L_store_element);
1884 __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop); // store the oop
1885 __ sub(count, count, 1);
1886 __ cbz(count, L_do_card_marks);
1887
1888 // ======== loop entry is here ========
1889 __ BIND(L_load_element);
1890 __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop
1891 __ cbz(copied_oop, L_store_element);
1892
1893 __ load_klass(r19_klass, copied_oop); // query the object klass
1894 generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1895 // ======== end loop ========
1896
1897 // It was a real error; we must depend on the caller to finish the job.
1898 // Register count = remaining oops, count_orig = total oops.
1899 // Emit GC store barriers for the oops we have copied and report
1900 // their number to the caller.
1901
1902 __ subs(count, count_save, count); // K = partially copied oop count
1903 __ eon(count, count, zr); // report (-1^K) to caller
1904 __ br(Assembler::EQ, L_done_pop);
1905
1906 __ BIND(L_do_card_marks);
1907 __ add(to, to, -heapOopSize); // make an inclusive end pointer
1908 gen_write_ref_array_post_barrier(start_to, to, rscratch1);
1909
1910 __ bind(L_done_pop);
1911 __ pop(RegSet::of(r18, r19, r20, r21), sp);
1912 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1913
1914 __ bind(L_done);
1915 __ mov(r0, count);
1916 __ leave();
1917 __ ret(lr);
1918
1919 return start;
1920 }
1921
1922 // Perform range checks on the proposed arraycopy.
1923 // Kills temp, but nothing else.
1924 // Also, clean the sign bits of src_pos and dst_pos.
1925 void arraycopy_range_checks(Register src, // source array oop (c_rarg0)
1926 Register src_pos, // source position (c_rarg1)
1927 Register dst, // destination array oop (c_rarg2)
1928 Register dst_pos, // destination position (c_rarg3)
1929 Register length,
1930 Register temp,
1931 Label& L_failed) {
1932 BLOCK_COMMENT("arraycopy_range_checks:");
1933
1934 assert_different_registers(rscratch1, temp);
1935
1936 // if (src_pos + length > arrayOop(src)->length()) FAIL;
1937 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1938 __ addw(temp, length, src_pos);
1939 __ cmpw(temp, rscratch1);
1940 __ br(Assembler::HI, L_failed);
1941
1942 // if (dst_pos + length > arrayOop(dst)->length()) FAIL;
1943 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1944 __ addw(temp, length, dst_pos);
1945 __ cmpw(temp, rscratch1);
1946 __ br(Assembler::HI, L_failed);
1947
1948 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
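// On AArch64 a 32-bit register write zero-extends into the full 64-bit
// register, so "movw w, w" is a cheap way to clear bits 63:32.
// Illustrative C equivalent:
//   src_pos &= 0xffffffff;  dst_pos &= 0xffffffff;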
1949 __ movw(src_pos, src_pos); 1950 __ movw(dst_pos, dst_pos); 1951 1952 BLOCK_COMMENT("arraycopy_range_checks done"); 1953 } 1954 1955 // These stubs get called from some dumb test routine. 1956 // I'll write them properly when they're called from 1957 // something that's actually doing something. 1958 static void fake_arraycopy_stub(address src, address dst, int count) { 1959 assert(count == 0, "huh?"); 1960 } 1961 1962 1963 // 1964 // Generate 'unsafe' array copy stub 1965 // Though just as safe as the other stubs, it takes an unscaled 1966 // size_t argument instead of an element count. 1967 // 1968 // Input: 1969 // c_rarg0 - source array address 1970 // c_rarg1 - destination array address 1971 // c_rarg2 - byte count, treated as ssize_t, can be zero 1972 // 1973 // Examines the alignment of the operands and dispatches 1974 // to a long, int, short, or byte copy loop. 1975 // 1976 address generate_unsafe_copy(const char *name, 1977 address byte_copy_entry, 1978 address short_copy_entry, 1979 address int_copy_entry, 1980 address long_copy_entry) { 1981 Label L_long_aligned, L_int_aligned, L_short_aligned; 1982 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1983 1984 __ align(CodeEntryAlignment); 1985 StubCodeMark mark(this, "StubRoutines", name); 1986 address start = __ pc(); 1987 __ enter(); // required for proper stackwalking of RuntimeStub frame 1988 1989 // bump this on entry, not on exit: 1990 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 1991 1992 __ orr(rscratch1, s, d); 1993 __ orr(rscratch1, rscratch1, count); 1994 1995 __ andr(rscratch1, rscratch1, BytesPerLong-1); 1996 __ cbz(rscratch1, L_long_aligned); 1997 __ andr(rscratch1, rscratch1, BytesPerInt-1); 1998 __ cbz(rscratch1, L_int_aligned); 1999 __ tbz(rscratch1, 0, L_short_aligned); 2000 __ b(RuntimeAddress(byte_copy_entry)); 2001 2002 __ BIND(L_short_aligned); 2003 __ lsr(count, count, LogBytesPerShort); // size => short_count 2004 __ b(RuntimeAddress(short_copy_entry)); 2005 __ BIND(L_int_aligned); 2006 __ lsr(count, count, LogBytesPerInt); // size => int_count 2007 __ b(RuntimeAddress(int_copy_entry)); 2008 __ BIND(L_long_aligned); 2009 __ lsr(count, count, LogBytesPerLong); // size => long_count 2010 __ b(RuntimeAddress(long_copy_entry)); 2011 2012 return start; 2013 } 2014 2015 // 2016 // Generate generic array copy stubs 2017 // 2018 // Input: 2019 // c_rarg0 - src oop 2020 // c_rarg1 - src_pos (32-bits) 2021 // c_rarg2 - dst oop 2022 // c_rarg3 - dst_pos (32-bits) 2023 // c_rarg4 - element count (32-bits) 2024 // 2025 // Output: 2026 // r0 == 0 - success 2027 // r0 == -1^K - failure, where K is partial transfer count 2028 // 2029 address generate_generic_copy(const char *name, 2030 address byte_copy_entry, address short_copy_entry, 2031 address int_copy_entry, address oop_copy_entry, 2032 address long_copy_entry, address checkcast_copy_entry) { 2033 2034 Label L_failed, L_failed_0, L_objArray; 2035 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 2036 2037 // Input registers 2038 const Register src = c_rarg0; // source array oop 2039 const Register src_pos = c_rarg1; // source position 2040 const Register dst = c_rarg2; // destination array oop 2041 const Register dst_pos = c_rarg3; // destination position 2042 const Register length = c_rarg4; 2043 2044 StubCodeMark mark(this, "StubRoutines", name); 2045 2046 __ align(CodeEntryAlignment); 2047 address start = __ pc(); 2048 2049 __ enter(); // required for proper stackwalking of RuntimeStub frame 2050 2051 // bump this on entry, not on 
exit: 2052 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 2053 2054 //----------------------------------------------------------------------- 2055 // Assembler stub will be used for this call to arraycopy 2056 // if the following conditions are met: 2057 // 2058 // (1) src and dst must not be null. 2059 // (2) src_pos must not be negative. 2060 // (3) dst_pos must not be negative. 2061 // (4) length must not be negative. 2062 // (5) src klass and dst klass should be the same and not NULL. 2063 // (6) src and dst should be arrays. 2064 // (7) src_pos + length must not exceed length of src. 2065 // (8) dst_pos + length must not exceed length of dst. 2066 // 2067 2068 // if (src == NULL) return -1; 2069 __ cbz(src, L_failed); 2070 2071 // if (src_pos < 0) return -1; 2072 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 2073 2074 // if (dst == NULL) return -1; 2075 __ cbz(dst, L_failed); 2076 2077 // if (dst_pos < 0) return -1; 2078 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2079 2080 // registers used as temp 2081 const Register scratch_length = r16; // elements count to copy 2082 const Register scratch_src_klass = r17; // array klass 2083 const Register lh = r18; // layout helper 2084 2085 // if (length < 0) return -1; 2086 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2087 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2088 2089 __ load_klass(scratch_src_klass, src); 2090 #ifdef ASSERT 2091 // assert(src->klass() != NULL); 2092 { 2093 BLOCK_COMMENT("assert klasses not null {"); 2094 Label L1, L2; 2095 __ cbnz(scratch_src_klass, L2); // it is broken if klass is NULL 2096 __ bind(L1); 2097 __ stop("broken null klass"); 2098 __ bind(L2); 2099 __ load_klass(rscratch1, dst); 2100 __ cbz(rscratch1, L1); // this would be broken also 2101 BLOCK_COMMENT("} assert klasses not null done"); 2102 } 2103 #endif 2104 2105 // Load layout helper (32-bits) 2106 // 2107 // |array_tag| | header_size | element_type | |log2_element_size| 2108 // 32 30 24 16 8 2 0 2109 // 2110 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2111 // 2112 2113 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2114 2115 // Handle objArrays completely differently... 2116 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2117 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2118 __ movw(rscratch1, objArray_lh); 2119 __ eorw(rscratch2, lh, rscratch1); 2120 __ cbzw(rscratch2, L_objArray); 2121 2122 // if (src->klass() != dst->klass()) return -1; 2123 __ load_klass(rscratch2, dst); 2124 __ eor(rscratch2, rscratch2, scratch_src_klass); 2125 __ cbnz(rscratch2, L_failed); 2126 2127 // if (!src->is_Array()) return -1; 2128 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2129 2130 // At this point, it is known to be a typeArray (array_tag 0x3). 
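// The address arithmetic further below decodes the layout helper roughly
// as in this illustrative C sketch (constant names abbreviated from the
// Klass layout-helper constants):
//   int hsize = (lh >> _lh_header_size_shift) & _lh_header_size_mask;
//   int l2esz = lh & _lh_log2_element_size_mask;  // log2(element size), 0..3
//   from = src + hsize + (src_pos << l2esz);
//   to   = dst + hsize + (dst_pos << l2esz);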
2131 #ifdef ASSERT 2132 { 2133 BLOCK_COMMENT("assert primitive array {"); 2134 Label L; 2135 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2136 __ cmpw(lh, rscratch2); 2137 __ br(Assembler::GE, L); 2138 __ stop("must be a primitive array"); 2139 __ bind(L); 2140 BLOCK_COMMENT("} assert primitive array done"); 2141 } 2142 #endif 2143 2144 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2145 rscratch2, L_failed); 2146 2147 // TypeArrayKlass 2148 // 2149 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2150 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2151 // 2152 2153 const Register rscratch1_offset = rscratch1; // array offset 2154 const Register r18_elsize = lh; // element size 2155 2156 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2157 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2158 __ add(src, src, rscratch1_offset); // src array offset 2159 __ add(dst, dst, rscratch1_offset); // dst array offset 2160 BLOCK_COMMENT("choose copy loop based on element size"); 2161 2162 // next registers should be set before the jump to corresponding stub 2163 const Register from = c_rarg0; // source array address 2164 const Register to = c_rarg1; // destination array address 2165 const Register count = c_rarg2; // elements count 2166 2167 // 'from', 'to', 'count' registers should be set in such order 2168 // since they are the same as 'src', 'src_pos', 'dst'. 2169 2170 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2171 2172 // The possible values of elsize are 0-3, i.e. exact_log2(element 2173 // size in bytes). We do a simple bitwise binary search. 2174 __ BIND(L_copy_bytes); 2175 __ tbnz(r18_elsize, 1, L_copy_ints); 2176 __ tbnz(r18_elsize, 0, L_copy_shorts); 2177 __ lea(from, Address(src, src_pos));// src_addr 2178 __ lea(to, Address(dst, dst_pos));// dst_addr 2179 __ movw(count, scratch_length); // length 2180 __ b(RuntimeAddress(byte_copy_entry)); 2181 2182 __ BIND(L_copy_shorts); 2183 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2184 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2185 __ movw(count, scratch_length); // length 2186 __ b(RuntimeAddress(short_copy_entry)); 2187 2188 __ BIND(L_copy_ints); 2189 __ tbnz(r18_elsize, 0, L_copy_longs); 2190 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2191 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2192 __ movw(count, scratch_length); // length 2193 __ b(RuntimeAddress(int_copy_entry)); 2194 2195 __ BIND(L_copy_longs); 2196 #ifdef ASSERT 2197 { 2198 BLOCK_COMMENT("assert long copy {"); 2199 Label L; 2200 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize 2201 __ cmpw(r18_elsize, LogBytesPerLong); 2202 __ br(Assembler::EQ, L); 2203 __ stop("must be long copy, but elsize is wrong"); 2204 __ bind(L); 2205 BLOCK_COMMENT("} assert long copy done"); 2206 } 2207 #endif 2208 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2209 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2210 __ movw(count, scratch_length); // length 2211 __ b(RuntimeAddress(long_copy_entry)); 2212 2213 // ObjArrayKlass 2214 __ BIND(L_objArray); 2215 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2216 2217 Label L_plain_copy, L_checkcast_copy; 2218 // test array classes for subtyping 2219 __ load_klass(r18, dst); 2220 __ cmp(scratch_src_klass, r18); // usual case is exact 
equality 2221 __ br(Assembler::NE, L_checkcast_copy); 2222 2223 // Identically typed arrays can be copied without element-wise checks. 2224 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2225 rscratch2, L_failed); 2226 2227 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2228 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2229 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2230 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2231 __ movw(count, scratch_length); // length 2232 __ BIND(L_plain_copy); 2233 __ b(RuntimeAddress(oop_copy_entry)); 2234 2235 __ BIND(L_checkcast_copy); 2236 // live at this point: scratch_src_klass, scratch_length, r18 (dst_klass) 2237 { 2238 // Before looking at dst.length, make sure dst is also an objArray. 2239 __ ldrw(rscratch1, Address(r18, lh_offset)); 2240 __ movw(rscratch2, objArray_lh); 2241 __ eorw(rscratch1, rscratch1, rscratch2); 2242 __ cbnzw(rscratch1, L_failed); 2243 2244 // It is safe to examine both src.length and dst.length. 2245 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2246 r18, L_failed); 2247 2248 const Register rscratch2_dst_klass = rscratch2; 2249 __ load_klass(rscratch2_dst_klass, dst); // reload 2250 2251 // Marshal the base address arguments now, freeing registers. 2252 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2253 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2254 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2255 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2256 __ movw(count, length); // length (reloaded) 2257 Register sco_temp = c_rarg3; // this register is free now 2258 assert_different_registers(from, to, count, sco_temp, 2259 rscratch2_dst_klass, scratch_src_klass); 2260 // assert_clean_int(count, sco_temp); 2261 2262 // Generate the type check. 2263 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2264 __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset)); 2265 // assert_clean_int(sco_temp, r18); 2266 generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy); 2267 2268 // Fetch destination element klass from the ObjArrayKlass header. 2269 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2270 __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset)); 2271 __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset)); 2272 2273 // the checkcast_copy loop needs two extra arguments: 2274 assert(c_rarg3 == sco_temp, "#3 already in place"); 2275 // Set up arguments for checkcast_copy_entry. 2276 __ mov(c_rarg4, rscratch2_dst_klass); // dst.klass.element_klass 2277 __ b(RuntimeAddress(checkcast_copy_entry)); 2278 } 2279 2280 __ BIND(L_failed); 2281 __ mov(r0, -1); 2282 __ leave(); // required for proper stackwalking of RuntimeStub frame 2283 __ ret(lr); 2284 2285 return start; 2286 } 2287 2288 // 2289 // Generate stub for array fill. If "aligned" is true, the 2290 // "to" address is assumed to be heapword aligned. 
2291 //
2292 // Arguments for generated stub:
2293 // to: c_rarg0
2294 // value: c_rarg1
2295 // count: c_rarg2 treated as signed
2296 //
2297 address generate_fill(BasicType t, bool aligned, const char *name) {
2298 __ align(CodeEntryAlignment);
2299 StubCodeMark mark(this, "StubRoutines", name);
2300 address start = __ pc();
2301
2302 BLOCK_COMMENT("Entry:");
2303
2304 const Register to = c_rarg0; // destination array address
2305 const Register value = c_rarg1; // value
2306 const Register count = c_rarg2; // elements count
2307
2308 const Register bz_base = r10; // base for block_zero routine
2309 const Register cnt_words = r11; // temp register
2310
2311 __ enter();
2312
2313 Label L_fill_elements, L_exit1;
2314
2315 int shift = -1;
2316 switch (t) {
2317 case T_BYTE:
2318 shift = 0;
2319 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2320 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit
2321 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2322 __ br(Assembler::LO, L_fill_elements);
2323 break;
2324 case T_SHORT:
2325 shift = 1;
2326 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2327 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2328 __ br(Assembler::LO, L_fill_elements);
2329 break;
2330 case T_INT:
2331 shift = 2;
2332 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2333 __ br(Assembler::LO, L_fill_elements);
2334 break;
2335 default: ShouldNotReachHere();
2336 }
2337
2338 // Align the destination address at an 8-byte boundary.
2339 Label L_skip_align1, L_skip_align2, L_skip_align4;
2340 if (!aligned) {
2341 switch (t) {
2342 case T_BYTE:
2343 // One byte misalignment happens only for byte arrays.
2344 __ tbz(to, 0, L_skip_align1);
2345 __ strb(value, Address(__ post(to, 1)));
2346 __ subw(count, count, 1);
2347 __ bind(L_skip_align1);
2348 // Fallthrough
2349 case T_SHORT:
2350 // Two bytes misalignment happens only for byte and short (char) arrays.
2351 __ tbz(to, 1, L_skip_align2);
2352 __ strh(value, Address(__ post(to, 2)));
2353 __ subw(count, count, 2 >> shift);
2354 __ bind(L_skip_align2);
2355 // Fallthrough
2356 case T_INT:
2357 // Align to 8 bytes, we know we are 4 byte aligned to start.
2358 __ tbz(to, 2, L_skip_align4);
2359 __ strw(value, Address(__ post(to, 4)));
2360 __ subw(count, count, 4 >> shift);
2361 __ bind(L_skip_align4);
2362 break;
2363 default: ShouldNotReachHere();
2364 }
2365 }
2366
2367 //
2368 // Fill large chunks
2369 //
2370 __ lsrw(cnt_words, count, 3 - shift); // number of words
2371 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit
2372 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2373 if (UseBlockZeroing) {
2374 Label non_block_zeroing, rest;
2375 // If the fill value is zero we can use the fast zero_words().
2376 __ cbnz(value, non_block_zeroing);
2377 __ mov(bz_base, to);
2378 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2379 __ zero_words(bz_base, cnt_words);
2380 __ b(rest);
2381 __ bind(non_block_zeroing);
2382 __ fill_words(to, cnt_words, value);
2383 __ bind(rest);
2384 } else {
2385 __ fill_words(to, cnt_words, value);
2386 }
2387
2388 // Remaining count is less than 8 bytes. Fill it by a single store.
2389 // Note that the total length is no less than 8 bytes.
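// 'value' was widened to a full 64-bit pattern above (in effect
// v |= v << 8; v |= v << 16; v |= v << 32;), so the tail can be finished
// with one overlapping doubleword store that ends exactly at the last
// element. Illustrative C sketch:
//   *(uint64_t*)(to + (count << shift) - 8) = value;  // harmlessly rewrites
//                                                     // already-filled bytes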
2390 if (t == T_BYTE || t == T_SHORT) { 2391 Label L_exit1; 2392 __ cbzw(count, L_exit1); 2393 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2394 __ str(value, Address(to, -8)); // overwrite some elements 2395 __ bind(L_exit1); 2396 __ leave(); 2397 __ ret(lr); 2398 } 2399 2400 // Handle copies less than 8 bytes. 2401 Label L_fill_2, L_fill_4, L_exit2; 2402 __ bind(L_fill_elements); 2403 switch (t) { 2404 case T_BYTE: 2405 __ tbz(count, 0, L_fill_2); 2406 __ strb(value, Address(__ post(to, 1))); 2407 __ bind(L_fill_2); 2408 __ tbz(count, 1, L_fill_4); 2409 __ strh(value, Address(__ post(to, 2))); 2410 __ bind(L_fill_4); 2411 __ tbz(count, 2, L_exit2); 2412 __ strw(value, Address(to)); 2413 break; 2414 case T_SHORT: 2415 __ tbz(count, 0, L_fill_4); 2416 __ strh(value, Address(__ post(to, 2))); 2417 __ bind(L_fill_4); 2418 __ tbz(count, 1, L_exit2); 2419 __ strw(value, Address(to)); 2420 break; 2421 case T_INT: 2422 __ cbzw(count, L_exit2); 2423 __ strw(value, Address(to)); 2424 break; 2425 default: ShouldNotReachHere(); 2426 } 2427 __ bind(L_exit2); 2428 __ leave(); 2429 __ ret(lr); 2430 return start; 2431 } 2432 2433 void generate_arraycopy_stubs() { 2434 address entry; 2435 address entry_jbyte_arraycopy; 2436 address entry_jshort_arraycopy; 2437 address entry_jint_arraycopy; 2438 address entry_oop_arraycopy; 2439 address entry_jlong_arraycopy; 2440 address entry_checkcast_arraycopy; 2441 2442 generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards); 2443 generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards); 2444 2445 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2446 2447 //*** jbyte 2448 // Always need aligned and unaligned versions 2449 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2450 "jbyte_disjoint_arraycopy"); 2451 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2452 &entry_jbyte_arraycopy, 2453 "jbyte_arraycopy"); 2454 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2455 "arrayof_jbyte_disjoint_arraycopy"); 2456 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL, 2457 "arrayof_jbyte_arraycopy"); 2458 2459 //*** jshort 2460 // Always need aligned and unaligned versions 2461 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2462 "jshort_disjoint_arraycopy"); 2463 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2464 &entry_jshort_arraycopy, 2465 "jshort_arraycopy"); 2466 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, 2467 "arrayof_jshort_disjoint_arraycopy"); 2468 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL, 2469 "arrayof_jshort_arraycopy"); 2470 2471 //*** jint 2472 // Aligned versions 2473 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2474 "arrayof_jint_disjoint_arraycopy"); 2475 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2476 "arrayof_jint_arraycopy"); 2477 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 
2478 // entry_jint_arraycopy always points to the unaligned version 2479 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2480 "jint_disjoint_arraycopy"); 2481 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2482 &entry_jint_arraycopy, 2483 "jint_arraycopy"); 2484 2485 //*** jlong 2486 // It is always aligned 2487 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2488 "arrayof_jlong_disjoint_arraycopy"); 2489 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2490 "arrayof_jlong_arraycopy"); 2491 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2492 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2493 2494 //*** oops 2495 { 2496 // With compressed oops we need unaligned versions; notice that 2497 // we overwrite entry_oop_arraycopy. 2498 bool aligned = !UseCompressedOops; 2499 2500 StubRoutines::_arrayof_oop_disjoint_arraycopy 2501 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2502 /*dest_uninitialized*/false); 2503 StubRoutines::_arrayof_oop_arraycopy 2504 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2505 /*dest_uninitialized*/false); 2506 // Aligned versions without pre-barriers 2507 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2508 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2509 /*dest_uninitialized*/true); 2510 StubRoutines::_arrayof_oop_arraycopy_uninit 2511 = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit", 2512 /*dest_uninitialized*/true); 2513 } 2514 2515 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2516 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2517 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2518 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2519 2520 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2521 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, 2522 /*dest_uninitialized*/true); 2523 2524 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2525 entry_jbyte_arraycopy, 2526 entry_jshort_arraycopy, 2527 entry_jint_arraycopy, 2528 entry_jlong_arraycopy); 2529 2530 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2531 entry_jbyte_arraycopy, 2532 entry_jshort_arraycopy, 2533 entry_jint_arraycopy, 2534 entry_oop_arraycopy, 2535 entry_jlong_arraycopy, 2536 entry_checkcast_arraycopy); 2537 2538 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 2539 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2540 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2541 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2542 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 2543 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2544 } 2545 2546 void generate_math_stubs() { Unimplemented(); } 2547 2548 // Arguments: 2549 // 2550 // Inputs: 2551 // c_rarg0 - source byte array address 2552 // c_rarg1 - destination 
byte array address 2553 // c_rarg2 - K (key) in little endian int array 2554 // 2555 address generate_aescrypt_encryptBlock() { 2556 __ align(CodeEntryAlignment); 2557 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 2558 2559 Label L_doLast; 2560 2561 const Register from = c_rarg0; // source array address 2562 const Register to = c_rarg1; // destination array address 2563 const Register key = c_rarg2; // key array address 2564 const Register keylen = rscratch1; 2565 2566 address start = __ pc(); 2567 __ enter(); 2568 2569 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2570 2571 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2572 2573 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2574 __ rev32(v1, __ T16B, v1); 2575 __ rev32(v2, __ T16B, v2); 2576 __ rev32(v3, __ T16B, v3); 2577 __ rev32(v4, __ T16B, v4); 2578 __ aese(v0, v1); 2579 __ aesmc(v0, v0); 2580 __ aese(v0, v2); 2581 __ aesmc(v0, v0); 2582 __ aese(v0, v3); 2583 __ aesmc(v0, v0); 2584 __ aese(v0, v4); 2585 __ aesmc(v0, v0); 2586 2587 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2588 __ rev32(v1, __ T16B, v1); 2589 __ rev32(v2, __ T16B, v2); 2590 __ rev32(v3, __ T16B, v3); 2591 __ rev32(v4, __ T16B, v4); 2592 __ aese(v0, v1); 2593 __ aesmc(v0, v0); 2594 __ aese(v0, v2); 2595 __ aesmc(v0, v0); 2596 __ aese(v0, v3); 2597 __ aesmc(v0, v0); 2598 __ aese(v0, v4); 2599 __ aesmc(v0, v0); 2600 2601 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2602 __ rev32(v1, __ T16B, v1); 2603 __ rev32(v2, __ T16B, v2); 2604 2605 __ cmpw(keylen, 44); 2606 __ br(Assembler::EQ, L_doLast); 2607 2608 __ aese(v0, v1); 2609 __ aesmc(v0, v0); 2610 __ aese(v0, v2); 2611 __ aesmc(v0, v0); 2612 2613 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2614 __ rev32(v1, __ T16B, v1); 2615 __ rev32(v2, __ T16B, v2); 2616 2617 __ cmpw(keylen, 52); 2618 __ br(Assembler::EQ, L_doLast); 2619 2620 __ aese(v0, v1); 2621 __ aesmc(v0, v0); 2622 __ aese(v0, v2); 2623 __ aesmc(v0, v0); 2624 2625 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2626 __ rev32(v1, __ T16B, v1); 2627 __ rev32(v2, __ T16B, v2); 2628 2629 __ BIND(L_doLast); 2630 2631 __ aese(v0, v1); 2632 __ aesmc(v0, v0); 2633 __ aese(v0, v2); 2634 2635 __ ld1(v1, __ T16B, key); 2636 __ rev32(v1, __ T16B, v1); 2637 __ eor(v0, __ T16B, v0, v1); 2638 2639 __ st1(v0, __ T16B, to); 2640 2641 __ mov(r0, 0); 2642 2643 __ leave(); 2644 __ ret(lr); 2645 2646 return start; 2647 } 2648 2649 // Arguments: 2650 // 2651 // Inputs: 2652 // c_rarg0 - source byte array address 2653 // c_rarg1 - destination byte array address 2654 // c_rarg2 - K (key) in little endian int array 2655 // 2656 address generate_aescrypt_decryptBlock() { 2657 assert(UseAES, "need AES instructions and misaligned SSE support"); 2658 __ align(CodeEntryAlignment); 2659 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 2660 Label L_doLast; 2661 2662 const Register from = c_rarg0; // source array address 2663 const Register to = c_rarg1; // destination array address 2664 const Register key = c_rarg2; // key array address 2665 const Register keylen = rscratch1; 2666 2667 address start = __ pc(); 2668 __ enter(); // required for proper stackwalking of RuntimeStub frame 2669 2670 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2671 2672 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2673 2674 __ ld1(v5, __ T16B, __ post(key, 16)); 2675 __ rev32(v5, __ T16B, v5); 2676 2677 __ ld1(v1, v2, v3, v4, 
__ T16B, __ post(key, 64)); 2678 __ rev32(v1, __ T16B, v1); 2679 __ rev32(v2, __ T16B, v2); 2680 __ rev32(v3, __ T16B, v3); 2681 __ rev32(v4, __ T16B, v4); 2682 __ aesd(v0, v1); 2683 __ aesimc(v0, v0); 2684 __ aesd(v0, v2); 2685 __ aesimc(v0, v0); 2686 __ aesd(v0, v3); 2687 __ aesimc(v0, v0); 2688 __ aesd(v0, v4); 2689 __ aesimc(v0, v0); 2690 2691 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2692 __ rev32(v1, __ T16B, v1); 2693 __ rev32(v2, __ T16B, v2); 2694 __ rev32(v3, __ T16B, v3); 2695 __ rev32(v4, __ T16B, v4); 2696 __ aesd(v0, v1); 2697 __ aesimc(v0, v0); 2698 __ aesd(v0, v2); 2699 __ aesimc(v0, v0); 2700 __ aesd(v0, v3); 2701 __ aesimc(v0, v0); 2702 __ aesd(v0, v4); 2703 __ aesimc(v0, v0); 2704 2705 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2706 __ rev32(v1, __ T16B, v1); 2707 __ rev32(v2, __ T16B, v2); 2708 2709 __ cmpw(keylen, 44); 2710 __ br(Assembler::EQ, L_doLast); 2711 2712 __ aesd(v0, v1); 2713 __ aesimc(v0, v0); 2714 __ aesd(v0, v2); 2715 __ aesimc(v0, v0); 2716 2717 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2718 __ rev32(v1, __ T16B, v1); 2719 __ rev32(v2, __ T16B, v2); 2720 2721 __ cmpw(keylen, 52); 2722 __ br(Assembler::EQ, L_doLast); 2723 2724 __ aesd(v0, v1); 2725 __ aesimc(v0, v0); 2726 __ aesd(v0, v2); 2727 __ aesimc(v0, v0); 2728 2729 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2730 __ rev32(v1, __ T16B, v1); 2731 __ rev32(v2, __ T16B, v2); 2732 2733 __ BIND(L_doLast); 2734 2735 __ aesd(v0, v1); 2736 __ aesimc(v0, v0); 2737 __ aesd(v0, v2); 2738 2739 __ eor(v0, __ T16B, v0, v5); 2740 2741 __ st1(v0, __ T16B, to); 2742 2743 __ mov(r0, 0); 2744 2745 __ leave(); 2746 __ ret(lr); 2747 2748 return start; 2749 } 2750 2751 // Arguments: 2752 // 2753 // Inputs: 2754 // c_rarg0 - source byte array address 2755 // c_rarg1 - destination byte array address 2756 // c_rarg2 - K (key) in little endian int array 2757 // c_rarg3 - r vector byte array address 2758 // c_rarg4 - input length 2759 // 2760 // Output: 2761 // x0 - input length 2762 // 2763 address generate_cipherBlockChaining_encryptAESCrypt() { 2764 assert(UseAES, "need AES instructions and misaligned SSE support"); 2765 __ align(CodeEntryAlignment); 2766 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 2767 2768 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2769 2770 const Register from = c_rarg0; // source array address 2771 const Register to = c_rarg1; // destination array address 2772 const Register key = c_rarg2; // key array address 2773 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2774 // and left with the results of the last encryption block 2775 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2776 const Register keylen = rscratch1; 2777 2778 address start = __ pc(); 2779 2780 __ enter(); 2781 2782 __ movw(rscratch2, len_reg); 2783 2784 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2785 2786 __ ld1(v0, __ T16B, rvec); 2787 2788 __ cmpw(keylen, 52); 2789 __ br(Assembler::CC, L_loadkeys_44); 2790 __ br(Assembler::EQ, L_loadkeys_52); 2791 2792 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2793 __ rev32(v17, __ T16B, v17); 2794 __ rev32(v18, __ T16B, v18); 2795 __ BIND(L_loadkeys_52); 2796 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2797 __ rev32(v19, __ T16B, v19); 2798 __ rev32(v20, __ T16B, v20); 2799 __ BIND(L_loadkeys_44); 2800 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2801 __ rev32(v21, __ 
T16B, v21); 2802 __ rev32(v22, __ T16B, v22); 2803 __ rev32(v23, __ T16B, v23); 2804 __ rev32(v24, __ T16B, v24); 2805 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2806 __ rev32(v25, __ T16B, v25); 2807 __ rev32(v26, __ T16B, v26); 2808 __ rev32(v27, __ T16B, v27); 2809 __ rev32(v28, __ T16B, v28); 2810 __ ld1(v29, v30, v31, __ T16B, key); 2811 __ rev32(v29, __ T16B, v29); 2812 __ rev32(v30, __ T16B, v30); 2813 __ rev32(v31, __ T16B, v31); 2814 2815 __ BIND(L_aes_loop); 2816 __ ld1(v1, __ T16B, __ post(from, 16)); 2817 __ eor(v0, __ T16B, v0, v1); 2818 2819 __ br(Assembler::CC, L_rounds_44); 2820 __ br(Assembler::EQ, L_rounds_52); 2821 2822 __ aese(v0, v17); __ aesmc(v0, v0); 2823 __ aese(v0, v18); __ aesmc(v0, v0); 2824 __ BIND(L_rounds_52); 2825 __ aese(v0, v19); __ aesmc(v0, v0); 2826 __ aese(v0, v20); __ aesmc(v0, v0); 2827 __ BIND(L_rounds_44); 2828 __ aese(v0, v21); __ aesmc(v0, v0); 2829 __ aese(v0, v22); __ aesmc(v0, v0); 2830 __ aese(v0, v23); __ aesmc(v0, v0); 2831 __ aese(v0, v24); __ aesmc(v0, v0); 2832 __ aese(v0, v25); __ aesmc(v0, v0); 2833 __ aese(v0, v26); __ aesmc(v0, v0); 2834 __ aese(v0, v27); __ aesmc(v0, v0); 2835 __ aese(v0, v28); __ aesmc(v0, v0); 2836 __ aese(v0, v29); __ aesmc(v0, v0); 2837 __ aese(v0, v30); 2838 __ eor(v0, __ T16B, v0, v31); 2839 2840 __ st1(v0, __ T16B, __ post(to, 16)); 2841 2842 __ subw(len_reg, len_reg, 16); 2843 __ cbnzw(len_reg, L_aes_loop); 2844 2845 __ st1(v0, __ T16B, rvec); 2846 2847 __ mov(r0, rscratch2); 2848 2849 __ leave(); 2850 __ ret(lr); 2851 2852 return start; 2853 } 2854 2855 // Arguments: 2856 // 2857 // Inputs: 2858 // c_rarg0 - source byte array address 2859 // c_rarg1 - destination byte array address 2860 // c_rarg2 - K (key) in little endian int array 2861 // c_rarg3 - r vector byte array address 2862 // c_rarg4 - input length 2863 // 2864 // Output: 2865 // r0 - input length 2866 // 2867 address generate_cipherBlockChaining_decryptAESCrypt() { 2868 assert(UseAES, "need AES instructions and misaligned SSE support"); 2869 __ align(CodeEntryAlignment); 2870 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 2871 2872 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2873 2874 const Register from = c_rarg0; // source array address 2875 const Register to = c_rarg1; // destination array address 2876 const Register key = c_rarg2; // key array address 2877 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2878 // and left with the results of the last encryption block 2879 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2880 const Register keylen = rscratch1; 2881 2882 address start = __ pc(); 2883 2884 __ enter(); 2885 2886 __ movw(rscratch2, len_reg); 2887 2888 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2889 2890 __ ld1(v2, __ T16B, rvec); 2891 2892 __ ld1(v31, __ T16B, __ post(key, 16)); 2893 __ rev32(v31, __ T16B, v31); 2894 2895 __ cmpw(keylen, 52); 2896 __ br(Assembler::CC, L_loadkeys_44); 2897 __ br(Assembler::EQ, L_loadkeys_52); 2898 2899 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2900 __ rev32(v17, __ T16B, v17); 2901 __ rev32(v18, __ T16B, v18); 2902 __ BIND(L_loadkeys_52); 2903 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2904 __ rev32(v19, __ T16B, v19); 2905 __ rev32(v20, __ T16B, v20); 2906 __ BIND(L_loadkeys_44); 2907 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2908 __ rev32(v21, __ T16B, v21); 2909 
__ rev32(v22, __ T16B, v22); 2910 __ rev32(v23, __ T16B, v23); 2911 __ rev32(v24, __ T16B, v24); 2912 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2913 __ rev32(v25, __ T16B, v25); 2914 __ rev32(v26, __ T16B, v26); 2915 __ rev32(v27, __ T16B, v27); 2916 __ rev32(v28, __ T16B, v28); 2917 __ ld1(v29, v30, __ T16B, key); 2918 __ rev32(v29, __ T16B, v29); 2919 __ rev32(v30, __ T16B, v30); 2920 2921 __ BIND(L_aes_loop); 2922 __ ld1(v0, __ T16B, __ post(from, 16)); 2923 __ orr(v1, __ T16B, v0, v0); 2924 2925 __ br(Assembler::CC, L_rounds_44); 2926 __ br(Assembler::EQ, L_rounds_52); 2927 2928 __ aesd(v0, v17); __ aesimc(v0, v0); 2929 __ aesd(v0, v18); __ aesimc(v0, v0); 2930 __ BIND(L_rounds_52); 2931 __ aesd(v0, v19); __ aesimc(v0, v0); 2932 __ aesd(v0, v20); __ aesimc(v0, v0); 2933 __ BIND(L_rounds_44); 2934 __ aesd(v0, v21); __ aesimc(v0, v0); 2935 __ aesd(v0, v22); __ aesimc(v0, v0); 2936 __ aesd(v0, v23); __ aesimc(v0, v0); 2937 __ aesd(v0, v24); __ aesimc(v0, v0); 2938 __ aesd(v0, v25); __ aesimc(v0, v0); 2939 __ aesd(v0, v26); __ aesimc(v0, v0); 2940 __ aesd(v0, v27); __ aesimc(v0, v0); 2941 __ aesd(v0, v28); __ aesimc(v0, v0); 2942 __ aesd(v0, v29); __ aesimc(v0, v0); 2943 __ aesd(v0, v30); 2944 __ eor(v0, __ T16B, v0, v31); 2945 __ eor(v0, __ T16B, v0, v2); 2946 2947 __ st1(v0, __ T16B, __ post(to, 16)); 2948 __ orr(v2, __ T16B, v1, v1); 2949 2950 __ subw(len_reg, len_reg, 16); 2951 __ cbnzw(len_reg, L_aes_loop); 2952 2953 __ st1(v2, __ T16B, rvec); 2954 2955 __ mov(r0, rscratch2); 2956 2957 __ leave(); 2958 __ ret(lr); 2959 2960 return start; 2961 } 2962 2963 // Arguments: 2964 // 2965 // Inputs: 2966 // c_rarg0 - byte[] source+offset 2967 // c_rarg1 - int[] SHA.state 2968 // c_rarg2 - int offset 2969 // c_rarg3 - int limit 2970 // 2971 address generate_sha1_implCompress(bool multi_block, const char *name) { 2972 __ align(CodeEntryAlignment); 2973 StubCodeMark mark(this, "StubRoutines", name); 2974 address start = __ pc(); 2975 2976 Register buf = c_rarg0; 2977 Register state = c_rarg1; 2978 Register ofs = c_rarg2; 2979 Register limit = c_rarg3; 2980 2981 Label keys; 2982 Label sha1_loop; 2983 2984 // load the keys into v0..v3 2985 __ adr(rscratch1, keys); 2986 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 2987 // load 5 words state into v6, v7 2988 __ ldrq(v6, Address(state, 0)); 2989 __ ldrs(v7, Address(state, 16)); 2990 2991 2992 __ BIND(sha1_loop); 2993 // load 64 bytes of data into v16..v19 2994 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 2995 __ rev32(v16, __ T16B, v16); 2996 __ rev32(v17, __ T16B, v17); 2997 __ rev32(v18, __ T16B, v18); 2998 __ rev32(v19, __ T16B, v19); 2999 3000 // do the sha1 3001 __ addv(v4, __ T4S, v16, v0); 3002 __ orr(v20, __ T16B, v6, v6); 3003 3004 FloatRegister d0 = v16; 3005 FloatRegister d1 = v17; 3006 FloatRegister d2 = v18; 3007 FloatRegister d3 = v19; 3008 3009 for (int round = 0; round < 20; round++) { 3010 FloatRegister tmp1 = (round & 1) ? v4 : v5; 3011 FloatRegister tmp2 = (round & 1) ? v21 : v22; 3012 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 3013 FloatRegister tmp4 = (round & 1) ? v5 : v4; 3014 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? 
v2 : v3)); 3015 3016 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 3017 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 3018 __ sha1h(tmp2, __ T4S, v20); 3019 if (round < 5) 3020 __ sha1c(v20, __ T4S, tmp3, tmp4); 3021 else if (round < 10 || round >= 15) 3022 __ sha1p(v20, __ T4S, tmp3, tmp4); 3023 else 3024 __ sha1m(v20, __ T4S, tmp3, tmp4); 3025 if (round < 16) __ sha1su1(d0, __ T4S, d3); 3026 3027 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3028 } 3029 3030 __ addv(v7, __ T2S, v7, v21); 3031 __ addv(v6, __ T4S, v6, v20); 3032 3033 if (multi_block) { 3034 __ add(ofs, ofs, 64); 3035 __ cmp(ofs, limit); 3036 __ br(Assembler::LE, sha1_loop); 3037 __ mov(c_rarg0, ofs); // return ofs 3038 } 3039 3040 __ strq(v6, Address(state, 0)); 3041 __ strs(v7, Address(state, 16)); 3042 3043 __ ret(lr); 3044 3045 __ bind(keys); 3046 __ emit_int32(0x5a827999); 3047 __ emit_int32(0x6ed9eba1); 3048 __ emit_int32(0x8f1bbcdc); 3049 __ emit_int32(0xca62c1d6); 3050 3051 return start; 3052 } 3053 3054 3055 // Arguments: 3056 // 3057 // Inputs: 3058 // c_rarg0 - byte[] source+offset 3059 // c_rarg1 - int[] SHA.state 3060 // c_rarg2 - int offset 3061 // c_rarg3 - int limit 3062 // 3063 address generate_sha256_implCompress(bool multi_block, const char *name) { 3064 static const uint32_t round_consts[64] = { 3065 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3066 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3067 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3068 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3069 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3070 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3071 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3072 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3073 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3074 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3075 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3076 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3077 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3078 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3079 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3080 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3081 }; 3082 __ align(CodeEntryAlignment); 3083 StubCodeMark mark(this, "StubRoutines", name); 3084 address start = __ pc(); 3085 3086 Register buf = c_rarg0; 3087 Register state = c_rarg1; 3088 Register ofs = c_rarg2; 3089 Register limit = c_rarg3; 3090 3091 Label sha1_loop; 3092 3093 __ stpd(v8, v9, __ pre(sp, -32)); 3094 __ stpd(v10, v11, Address(sp, 16)); 3095 3096 // dga == v0 3097 // dgb == v1 3098 // dg0 == v2 3099 // dg1 == v3 3100 // dg2 == v4 3101 // t0 == v6 3102 // t1 == v7 3103 3104 // load 16 keys to v16..v31 3105 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3106 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3107 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3108 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3109 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3110 3111 // load 8 words (256 bits) state 3112 __ ldpq(v0, v1, state); 3113 3114 __ BIND(sha1_loop); 3115 // load 64 bytes of data into v8..v11 3116 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? 
__ post(buf, 64) : buf);
3117 __ rev32(v8, __ T16B, v8);
3118 __ rev32(v9, __ T16B, v9);
3119 __ rev32(v10, __ T16B, v10);
3120 __ rev32(v11, __ T16B, v11);
3121
3122 __ addv(v6, __ T4S, v8, v16);
3123 __ orr(v2, __ T16B, v0, v0);
3124 __ orr(v3, __ T16B, v1, v1);
3125
3126 FloatRegister d0 = v8;
3127 FloatRegister d1 = v9;
3128 FloatRegister d2 = v10;
3129 FloatRegister d3 = v11;
3130
3131
3132 for (int round = 0; round < 16; round++) {
3133 FloatRegister tmp1 = (round & 1) ? v6 : v7;
3134 FloatRegister tmp2 = (round & 1) ? v7 : v6;
3135 FloatRegister tmp3 = (round & 1) ? v2 : v4;
3136 FloatRegister tmp4 = (round & 1) ? v4 : v2;
3137
3138 if (round < 12) __ sha256su0(d0, __ T4S, d1);
3139 __ orr(v4, __ T16B, v2, v2);
3140 if (round < 15)
3141 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3142 __ sha256h(v2, __ T4S, v3, tmp2);
3143 __ sha256h2(v3, __ T4S, v4, tmp2);
3144 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3145
3146 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3147 }
3148
3149 __ addv(v0, __ T4S, v0, v2);
3150 __ addv(v1, __ T4S, v1, v3);
3151
3152 if (multi_block) {
3153 __ add(ofs, ofs, 64);
3154 __ cmp(ofs, limit);
3155 __ br(Assembler::LE, sha1_loop);
3156 __ mov(c_rarg0, ofs); // return ofs
3157 }
3158
3159 __ ldpd(v10, v11, Address(sp, 16));
3160 __ ldpd(v8, v9, __ post(sp, 32));
3161
3162 __ stpq(v0, v1, state);
3163
3164 __ ret(lr);
3165
3166 return start;
3167 }
3168
3169 #ifndef BUILTIN_SIM
3170 // Safefetch stubs.
3171 void generate_safefetch(const char* name, int size, address* entry,
3172 address* fault_pc, address* continuation_pc) {
3173 // safefetch signatures:
3174 // int SafeFetch32(int* adr, int errValue);
3175 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3176 //
3177 // arguments:
3178 // c_rarg0 = adr
3179 // c_rarg1 = errValue
3180 //
3181 // result:
3182 // r0 = *adr or errValue
3183
3184 StubCodeMark mark(this, "StubRoutines", name);
3185
3186 // Entry point, pc or function descriptor.
3187 *entry = __ pc();
3188
3189 // Load *adr into c_rarg1, may fault.
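// Illustrative contract (assuming the usual HotSpot safefetch
// signal-handler arrangement): a fault at *fault_pc is resolved by
// resuming at *continuation_pc, so the stub behaves like
//   int SafeFetch32(int* adr, int errValue) {
//     return readable(adr) ? *adr : errValue;  // conceptual only
//   }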
3190 *fault_pc = __ pc();
3191 switch (size) {
3192 case 4:
3193 // int32_t
3194 __ ldrw(c_rarg1, Address(c_rarg0, 0));
3195 break;
3196 case 8:
3197 // int64_t
3198 __ ldr(c_rarg1, Address(c_rarg0, 0));
3199 break;
3200 default:
3201 ShouldNotReachHere();
3202 }
3203
3204 // return errValue or *adr
3205 *continuation_pc = __ pc();
3206 __ mov(r0, c_rarg1);
3207 __ ret(lr);
3208 }
3209 #endif
3210
3211 /**
3212 * Arguments:
3213 *
3214 * Inputs:
3215 * c_rarg0 - int crc
3216 * c_rarg1 - byte* buf
3217 * c_rarg2 - int length
3218 *
3219 * Output:
3220 * r0 - int crc result
3221 */
3222 address generate_updateBytesCRC32() {
3223 assert(UseCRC32Intrinsics, "what are we doing here?");
3224
3225 __ align(CodeEntryAlignment);
3226 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3227
3228 address start = __ pc();
3229
3230 const Register crc = c_rarg0; // crc
3231 const Register buf = c_rarg1; // source java byte array address
3232 const Register len = c_rarg2; // length
3233 const Register table0 = c_rarg3; // crc_table address
3234 const Register table1 = c_rarg4;
3235 const Register table2 = c_rarg5;
3236 const Register table3 = c_rarg6;
3237 const Register tmp3 = c_rarg7;
3238
3239 BLOCK_COMMENT("Entry:");
3240 __ enter(); // required for proper stackwalking of RuntimeStub frame
3241
3242 __ kernel_crc32(crc, buf, len,
3243 table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3244
3245 __ leave(); // required for proper stackwalking of RuntimeStub frame
3246 __ ret(lr);
3247
3248 return start;
3249 }
3250
3251 /**
3252 * Arguments:
3253 *
3254 * Inputs:
3255 * c_rarg0 - int crc
3256 * c_rarg1 - byte* buf
3257 * c_rarg2 - int length
3258 * c_rarg3 - int* table
3259 *
3260 * Output:
3261 * r0 - int crc result
3262 */
3263 address generate_updateBytesCRC32C() {
3264 assert(UseCRC32CIntrinsics, "what are we doing here?");
3265
3266 __ align(CodeEntryAlignment);
3267 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3268
3269 address start = __ pc();
3270
3271 const Register crc = c_rarg0; // crc
3272 const Register buf = c_rarg1; // source java byte array address
3273 const Register len = c_rarg2; // length
3274 const Register table0 = c_rarg3; // crc_table address
3275 const Register table1 = c_rarg4;
3276 const Register table2 = c_rarg5;
3277 const Register table3 = c_rarg6;
3278 const Register tmp3 = c_rarg7;
3279
3280 BLOCK_COMMENT("Entry:");
3281 __ enter(); // required for proper stackwalking of RuntimeStub frame
3282
3283 __ kernel_crc32c(crc, buf, len,
3284 table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3285
3286 __ leave(); // required for proper stackwalking of RuntimeStub frame
3287 __ ret(lr);
3288
3289 return start;
3290 }
3291
3292 /**
3293 * Arguments:
3294 *
3295 * Inputs:
3296 * c_rarg0 - int adler
3297 * c_rarg1 - byte* buff
3298 * c_rarg2 - int len
3299 *
3300 * Output:
3301 * c_rarg0 - int adler result
3302 */
3303 address generate_updateBytesAdler32() {
3304 __ align(CodeEntryAlignment);
3305 StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3306 address start = __ pc();
3307
3308 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3309
3310 // Aliases
3311 Register adler = c_rarg0;
3312 Register s1 = c_rarg0;
3313 Register s2 = c_rarg3;
3314 Register buff = c_rarg1;
3315 Register len = c_rarg2;
3316 Register nmax = r4;
3317 Register base = r5;
3318 Register count = r6;
3319 Register temp0 = rscratch1;
3320 Register temp1 = rscratch2;
3321 Register temp2 = r7;
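// The modular reductions in this stub avoid division: BASE is 65521 and
// 2^16 == 15 (mod BASE), so a partial sum is folded as in this
// illustrative C sketch (the *15 is emitted below as a shift-by-4 minus
// the original), then finished with a conditional subtract:
//   s = (s >> 16) * 15 + (s & 0xffff);  // fold the high half
//   s = (s >> 16) * 15 + (s & 0xffff);  // fold once more
//   if (s >= 65521) s -= 65521;         // now s == original % 65521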
3322
3323 // Max number of bytes we can process before having to take the mod
3324 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3325 unsigned long BASE = 0xfff1;
3326 unsigned long NMAX = 0x15B0;
3327
3328 __ mov(base, BASE);
3329 __ mov(nmax, NMAX);
3330
3331 // s1 is initialized to the lower 16 bits of adler
3332 // s2 is initialized to the upper 16 bits of adler
3333 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff)
3334 __ uxth(s1, adler); // s1 = (adler & 0xffff)
3335
3336 // The pipelined loop needs at least 16 elements for 1 iteration
3337 // It does check this, but it is more effective to skip to the cleanup loop
3338 __ cmp(len, 16);
3339 __ br(Assembler::HS, L_nmax);
3340 __ cbz(len, L_combine);
3341
3342 __ bind(L_simple_by1_loop);
3343 __ ldrb(temp0, Address(__ post(buff, 1)));
3344 __ add(s1, s1, temp0);
3345 __ add(s2, s2, s1);
3346 __ subs(len, len, 1);
3347 __ br(Assembler::HI, L_simple_by1_loop);
3348
3349 // s1 = s1 % BASE
3350 __ subs(temp0, s1, base);
3351 __ csel(s1, temp0, s1, Assembler::HS);
3352
3353 // s2 = s2 % BASE
3354 __ lsr(temp0, s2, 16);
3355 __ lsl(temp1, temp0, 4);
3356 __ sub(temp1, temp1, temp0);
3357 __ add(s2, temp1, s2, ext::uxth);
3358
3359 __ subs(temp0, s2, base);
3360 __ csel(s2, temp0, s2, Assembler::HS);
3361
3362 __ b(L_combine);
3363
3364 __ bind(L_nmax);
3365 __ subs(len, len, nmax);
3366 __ sub(count, nmax, 16);
3367 __ br(Assembler::LO, L_by16);
3368
3369 __ bind(L_nmax_loop);
3370
3371 __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3372
3373 __ add(s1, s1, temp0, ext::uxtb);
3374 __ ubfx(temp2, temp0, 8, 8);
3375 __ add(s2, s2, s1);
3376 __ add(s1, s1, temp2);
3377 __ ubfx(temp2, temp0, 16, 8);
3378 __ add(s2, s2, s1);
3379 __ add(s1, s1, temp2);
3380 __ ubfx(temp2, temp0, 24, 8);
3381 __ add(s2, s2, s1);
3382 __ add(s1, s1, temp2);
3383 __ ubfx(temp2, temp0, 32, 8);
3384 __ add(s2, s2, s1);
3385 __ add(s1, s1, temp2);
3386 __ ubfx(temp2, temp0, 40, 8);
3387 __ add(s2, s2, s1);
3388 __ add(s1, s1, temp2);
3389 __ ubfx(temp2, temp0, 48, 8);
3390 __ add(s2, s2, s1);
3391 __ add(s1, s1, temp2);
3392 __ add(s2, s2, s1);
3393 __ add(s1, s1, temp0, Assembler::LSR, 56);
3394 __ add(s2, s2, s1);
3395
3396 __ add(s1, s1, temp1, ext::uxtb);
3397 __ ubfx(temp2, temp1, 8, 8);
3398 __ add(s2, s2, s1);
3399 __ add(s1, s1, temp2);
3400 __ ubfx(temp2, temp1, 16, 8);
3401 __ add(s2, s2, s1);
3402 __ add(s1, s1, temp2);
3403 __ ubfx(temp2, temp1, 24, 8);
3404 __ add(s2, s2, s1);
3405 __ add(s1, s1, temp2);
3406 __ ubfx(temp2, temp1, 32, 8);
3407 __ add(s2, s2, s1);
3408 __ add(s1, s1, temp2);
3409 __ ubfx(temp2, temp1, 40, 8);
3410 __ add(s2, s2, s1);
3411 __ add(s1, s1, temp2);
3412 __ ubfx(temp2, temp1, 48, 8);
3413 __ add(s2, s2, s1);
3414 __ add(s1, s1, temp2);
3415 __ add(s2, s2, s1);
3416 __ add(s1, s1, temp1, Assembler::LSR, 56);
3417 __ add(s2, s2, s1);
3418
3419 __ subs(count, count, 16);
3420 __ br(Assembler::HS, L_nmax_loop);
3421
3422 // s1 = s1 % BASE
3423 __ lsr(temp0, s1, 16);
3424 __ lsl(temp1, temp0, 4);
3425 __ sub(temp1, temp1, temp0);
3426 __ add(temp1, temp1, s1, ext::uxth);
3427
3428 __ lsr(temp0, temp1, 16);
3429 __ lsl(s1, temp0, 4);
3430 __ sub(s1, s1, temp0);
3431 __ add(s1, s1, temp1, ext::uxth);
3432
3433 __ subs(temp0, s1, base);
3434 __ csel(s1, temp0, s1, Assembler::HS);
3435
3436 // s2 = s2 % BASE
3437 __ lsr(temp0, s2, 16);
3438 __ lsl(temp1, temp0, 4);
3439 __ sub(temp1, temp1, temp0);
3440 __ add(temp1, temp1, s2, ext::uxth);
3441
3442 __ lsr(temp0, temp1, 16);
3443 __ lsl(s2, temp0, 4);
3444 __ sub(s2, s2, temp0);
3445 __ add(s2, s2, temp1, ext::uxth);
3446
3447 __ subs(temp0, s2, base);
3448 __ csel(s2, temp0, s2, Assembler::HS);
3449
3450 __ subs(len, len, nmax);
3451 __ sub(count, nmax, 16);
3452 __ br(Assembler::HS, L_nmax_loop);
3453
3454 __ bind(L_by16);
3455 __ adds(len, len, count);
3456 __ br(Assembler::LO, L_by1);
3457
3458 __ bind(L_by16_loop);
3459
3460 __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3461
3462 __ add(s1, s1, temp0, ext::uxtb);
3463 __ ubfx(temp2, temp0, 8, 8);
3464 __ add(s2, s2, s1);
3465 __ add(s1, s1, temp2);
3466 __ ubfx(temp2, temp0, 16, 8);
3467 __ add(s2, s2, s1);
3468 __ add(s1, s1, temp2);
3469 __ ubfx(temp2, temp0, 24, 8);
3470 __ add(s2, s2, s1);
3471 __ add(s1, s1, temp2);
3472 __ ubfx(temp2, temp0, 32, 8);
3473 __ add(s2, s2, s1);
3474 __ add(s1, s1, temp2);
3475 __ ubfx(temp2, temp0, 40, 8);
3476 __ add(s2, s2, s1);
3477 __ add(s1, s1, temp2);
3478 __ ubfx(temp2, temp0, 48, 8);
3479 __ add(s2, s2, s1);
3480 __ add(s1, s1, temp2);
3481 __ add(s2, s2, s1);
3482 __ add(s1, s1, temp0, Assembler::LSR, 56);
3483 __ add(s2, s2, s1);
3484
3485 __ add(s1, s1, temp1, ext::uxtb);
3486 __ ubfx(temp2, temp1, 8, 8);
3487 __ add(s2, s2, s1);
3488 __ add(s1, s1, temp2);
3489 __ ubfx(temp2, temp1, 16, 8);
3490 __ add(s2, s2, s1);
3491 __ add(s1, s1, temp2);
3492 __ ubfx(temp2, temp1, 24, 8);
3493 __ add(s2, s2, s1);
3494 __ add(s1, s1, temp2);
3495 __ ubfx(temp2, temp1, 32, 8);
3496 __ add(s2, s2, s1);
3497 __ add(s1, s1, temp2);
3498 __ ubfx(temp2, temp1, 40, 8);
3499 __ add(s2, s2, s1);
3500 __ add(s1, s1, temp2);
3501 __ ubfx(temp2, temp1, 48, 8);
3502 __ add(s2, s2, s1);
3503 __ add(s1, s1, temp2);
3504 __ add(s2, s2, s1);
3505 __ add(s1, s1, temp1, Assembler::LSR, 56);
3506 __ add(s2, s2, s1);
3507
3508 __ subs(len, len, 16);
3509 __ br(Assembler::HS, L_by16_loop);
3510
3511 __ bind(L_by1);
3512 __ adds(len, len, 15);
3513 __ br(Assembler::LO, L_do_mod);
3514
3515 __ bind(L_by1_loop);
3516 __ ldrb(temp0, Address(__ post(buff, 1)));
3517 __ add(s1, temp0, s1);
3518 __ add(s2, s2, s1);
3519 __ subs(len, len, 1);
3520 __ br(Assembler::HS, L_by1_loop);
3521
3522 __ bind(L_do_mod);
3523 // s1 = s1 % BASE
3524 __ lsr(temp0, s1, 16);
3525 __ lsl(temp1, temp0, 4);
3526 __ sub(temp1, temp1, temp0);
3527 __ add(temp1, temp1, s1, ext::uxth);
3528
3529 __ lsr(temp0, temp1, 16);
3530 __ lsl(s1, temp0, 4);
3531 __ sub(s1, s1, temp0);
3532 __ add(s1, s1, temp1, ext::uxth);
3533
3534 __ subs(temp0, s1, base);
3535 __ csel(s1, temp0, s1, Assembler::HS);
3536
3537 // s2 = s2 % BASE
3538 __ lsr(temp0, s2, 16);
3539 __ lsl(temp1, temp0, 4);
3540 __ sub(temp1, temp1, temp0);
3541 __ add(temp1, temp1, s2, ext::uxth);
3542
3543 __ lsr(temp0, temp1, 16);
3544 __ lsl(s2, temp0, 4);
3545 __ sub(s2, s2, temp0);
3546 __ add(s2, s2, temp1, ext::uxth);
3547
3548 __ subs(temp0, s2, base);
3549 __ csel(s2, temp0, s2, Assembler::HS);
3550
3551 // Combine lower bits and higher bits
3552 __ bind(L_combine);
3553 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3554
3555 __ ret(lr);
3556
3557 return start;
3558 }
3559
3560 /**
3561 * Arguments:
3562 *
3563 * Input:
3564 * c_rarg0 - x address
3565 * c_rarg1 - x length
3566 * c_rarg2 - y address
3567 * c_rarg3 - y length
3568 * c_rarg4 - z address
3569 * c_rarg5 - z length
3570 */
3571 address generate_multiplyToLen() {
3572 __ align(CodeEntryAlignment);
3573 StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3574
3575 address start = __ pc();
3576 const Register x = r0;
    const Register xlen  = r1;
    const Register y     = r2;
    const Register ylen  = r3;
    const Register z     = r4;
    const Register zlen  = r5;

    const Register tmp1  = r10;
    const Register tmp2  = r11;
    const Register tmp3  = r12;
    const Register tmp4  = r13;
    const Register tmp5  = r14;
    const Register tmp6  = r15;
    const Register tmp7  = r16;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }

  void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
                      FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
                      FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
    // Karatsuba multiplication performs a 128*128 -> 256-bit
    // multiplication in three 128-bit multiplications and a few
    // additions.
    //
    // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
    // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
    //
    // Inputs:
    //
    // A0 in a.d[0]     (subkey)
    // A1 in a.d[1]
    // (A1+A0) in a1_xor_a0.d[0]
    //
    // B0 in b.d[0]     (state)
    // B1 in b.d[1]

    __ ext(tmp1, __ T16B, b, b, 0x08);
    __ pmull2(result_hi, __ T1Q, b, a, __ T2D);      // A1*B1
    __ eor(tmp1, __ T16B, tmp1, b);                  // (B1+B0)
    __ pmull(result_lo, __ T1Q, b, a, __ T1D);       // A0*B0
    __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)

    __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
    __ eor(tmp3, __ T16B, result_hi, result_lo);     // A1*B1+A0*B0
    __ eor(tmp2, __ T16B, tmp2, tmp4);
    __ eor(tmp2, __ T16B, tmp2, tmp3);

    // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
    __ ins(result_hi, __ D, tmp2, 0, 1);
    __ ins(result_lo, __ D, tmp2, 1, 0);
  }

  void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
                    FloatRegister p, FloatRegister z, FloatRegister t1) {
    const FloatRegister t0 = result;

    // The GCM field polynomial f is z^128 + p(z), where p =
    // z^7+z^2+z+1.
    //
    //    z^128 === -p(z)  (mod (z^128 + p(z)))
    //
    // so, given that the product we're reducing is
    //    a == lo + hi * z^128
    // substituting,
    //      === lo - hi * p(z)  (mod (z^128 + p(z)))
    //
    // we reduce by multiplying hi by p(z) and subtracting the result
    // from (i.e. XORing it with) lo.  Because p has no nonzero high
    // bits we can do this with two 64-bit multiplications, lo*p and
    // hi*p.

    __ pmull2(t0, __ T1Q, hi, p, __ T2D);
    __ ext(t1, __ T16B, t0, z, 8);
    __ eor(hi, __ T16B, hi, t1);
    __ ext(t1, __ T16B, z, t0, 8);
    __ eor(lo, __ T16B, lo, t1);
    __ pmull(t0, __ T1Q, hi, p, __ T1D);
    __ eor(result, __ T16B, lo, t0);
  }

  /**
   * Arguments:
   *
   * Input:
   *   c_rarg0   - current state address
   *   c_rarg1   - H key address
   *   c_rarg2   - data address
   *   c_rarg3   - number of blocks
   *
   * Output:
   *   Updated state at c_rarg0
   */
  address generate_ghash_processBlocks() {
    // Bafflingly, GCM uses little-endian for the byte order, but
    // big-endian for the bit order.
    // For example, the polynomial 1 is
    // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
    //
    // So, we must either reverse the bytes in each word and do
    // everything big-endian or reverse the bits in each byte and do
    // it little-endian.  On AArch64 it's more idiomatic to reverse
    // the bits in each byte (we have an instruction, RBIT, to do
    // that) and keep the data in little-endian bit order throughout the
    // calculation, bit-reversing the inputs and outputs.

    StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
    __ align(wordSize * 2);
    address p = __ pc();
    __ emit_int64(0x87);  // The low-order bits of the field
                          // polynomial (i.e. p = z^7+z^2+z+1)
                          // repeated in the low and high parts of a
                          // 128-bit vector
    __ emit_int64(0x87);

    __ align(CodeEntryAlignment);
    address start = __ pc();

    Register state   = c_rarg0;
    Register subkeyH = c_rarg1;
    Register data    = c_rarg2;
    Register blocks  = c_rarg3;

    FloatRegister vzr = v30;
    __ eor(vzr, __ T16B, vzr, vzr); // zero register

    __ ldrq(v0, Address(state));
    __ ldrq(v1, Address(subkeyH));

    __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
    __ rbit(v0, __ T16B, v0);
    __ rev64(v1, __ T16B, v1);
    __ rbit(v1, __ T16B, v1);

    __ ldrq(v26, p);

    __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
    __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))

    {
      Label L_ghash_loop;
      __ bind(L_ghash_loop);

      __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
                                                 // reversing each byte
      __ rbit(v2, __ T16B, v2);
      __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state

      // Multiply state in v2 by subkey in v1
      ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
                     /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
                     /*temps*/v6, v20, v18, v21);
      // Reduce v7:v5 by the field polynomial
      ghash_reduce(v0, v5, v7, v26, vzr, v20);

      __ sub(blocks, blocks, 1);
      __ cbnz(blocks, L_ghash_loop);
    }

    // The bit-reversed result is at this point in v0
    __ rev64(v1, __ T16B, v0);
    __ rbit(v1, __ T16B, v1);

    __ st1(v1, __ T16B, state);
    __ ret(lr);

    return start;
  }

  // Continuation point for throwing of implicit exceptions that are
  // not handled in the current activation. Fabricates an exception
  // oop and initiates normal exception dispatching in this
  // frame. Since we need to preserve callee-saved values (currently
  // only for C2, but done for C1 as well) we need a callee-saved oop
  // map and therefore have to make these stubs into RuntimeStubs
  // rather than BufferBlobs.  If the compiler needs all registers to
  // be preserved between the fault point and the exception handler
  // then it must assume responsibility for that in
  // AbstractCompiler::continuation_for_implicit_null_exception or
  // continuation_for_implicit_division_by_zero_exception.  All other
  // implicit exceptions (e.g., NullPointerException or
  // AbstractMethodError on entry) are either at call sites or
  // otherwise assume that stack unwinding will be initiated, so
  // caller-saved registers were assumed volatile in the compiler.
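  //
  // In rough pseudocode, each stub produced by generate_throw_exception
  // below behaves approximately as follows (illustrative only; the
  // helpers named here are not real functions):
  //
  //   enter frame; record it as the last Java frame;
  //   runtime_entry(current_thread, arg1, arg2);    // fabricates the exception oop
  //   clear the last Java frame; leave frame;
  //   assert(thread->pending_exception() != NULL);  // debug builds only
  //   jump to StubRoutines::forward_exception_entry();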

#undef __
#define __ masm->

  address generate_throw_exception(const char* name,
                                   address runtime_entry,
                                   Register arg1 = noreg,
                                   Register arg2 = noreg) {
    // Information about frame layout at time of blocking runtime call.
    // Note that we only have to preserve callee-saved registers since
    // the compilers are responsible for supplying a continuation point
    // if they expect all registers to be preserved.
    // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
    enum layout {
      rfp_off = 0,
      rfp_off2,
      return_off,
      return_off2,
      framesize // inclusive of return address
    };

    int insts_size = 512;
    int locs_size  = 64;

    CodeBuffer code(name, insts_size, locs_size);
    OopMapSet* oop_maps  = new OopMapSet();
    MacroAssembler* masm = new MacroAssembler(&code);

    address start = __ pc();

    // This is an inlined and slightly modified version of call_VM
    // which has the ability to fetch the return PC out of
    // thread-local storage and also sets up last_Java_sp slightly
    // differently than the real call_VM

    __ enter(); // Save FP and LR before call

    assert(is_even(framesize/2), "sp not 16-byte aligned");

    // lr and fp are already in place
    __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog

    int frame_complete = __ pc() - start;

    // Set up last_Java_sp and last_Java_fp
    address the_pc = __ pc();
    __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1);

    // Call runtime
    if (arg1 != noreg) {
      assert(arg2 != c_rarg1, "clobbered");
      __ mov(c_rarg1, arg1);
    }
    if (arg2 != noreg) {
      __ mov(c_rarg2, arg2);
    }
    __ mov(c_rarg0, rthread);
    BLOCK_COMMENT("call runtime_entry");
    __ mov(rscratch1, runtime_entry);
    __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);

    // Generate oop map
    OopMap* map = new OopMap(framesize, 0);

    oop_maps->add_gc_map(the_pc - start, map);

    __ reset_last_Java_frame(true);
    __ maybe_isb();

    __ leave();

    // check for pending exceptions
#ifdef ASSERT
    Label L;
    __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
    __ cbnz(rscratch1, L);
    __ should_not_reach_here();
    __ bind(L);
#endif // ASSERT
    __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));


    // codeBlob framesize is in words (not VMRegImpl::slot_size)
    RuntimeStub* stub =
      RuntimeStub::new_runtime_stub(name,
                                    &code,
                                    frame_complete,
                                    (framesize >> (LogBytesPerWord - LogBytesPerInt)),
                                    oop_maps, false);
    return stub->entry_point();
  }

  class MontgomeryMultiplyGenerator : public MacroAssembler {

    Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
      Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;

    RegSet _toSave;
    bool _squaring;

  public:
    MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
      : MacroAssembler(as->code()), _squaring(squaring) {

      // Register allocation

      Register reg = c_rarg0;
      Pa_base = reg;       // Argument registers
      if (squaring)
        Pb_base = Pa_base;
      else
        Pb_base = ++reg;
      Pn_base = ++reg;
      Rlen = ++reg;
      inv = ++reg;
      Pm_base = ++reg;

      // Working registers:
      Ra = ++reg;  // The current digit of a, b, n, and m.
      Rb = ++reg;
      Rm = ++reg;
      Rn = ++reg;

      Pa = ++reg;  // Pointers to the current/next digit of a, b, n, and m.
      Pb = ++reg;
      Pm = ++reg;
      Pn = ++reg;

      t0 = ++reg;  // Three registers which form a
      t1 = ++reg;  // triple-precision accumulator.
      t2 = ++reg;

      Ri = ++reg;  // Inner and outer loop indexes.
      Rj = ++reg;

      Rhi_ab = ++reg;  // Product registers: low and high parts
      Rlo_ab = ++reg;  // of a*b and m*n.
      Rhi_mn = ++reg;
      Rlo_mn = ++reg;

      // r19 and up are callee-saved.
      _toSave = RegSet::range(r19, reg) + Pm_base;
    }

  private:
    void save_regs() {
      push(_toSave, sp);
    }

    void restore_regs() {
      pop(_toSave, sp);
    }

    template <typename T>
    void unroll_2(Register count, T block) {
      Label loop, end, odd;
      tbnz(count, 0, odd);
      cbz(count, end);
      align(16);
      bind(loop);
      (this->*block)();
      bind(odd);
      (this->*block)();
      subs(count, count, 2);
      br(Assembler::GT, loop);
      bind(end);
    }

    template <typename T>
    void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
      Label loop, end, odd;
      tbnz(count, 0, odd);
      cbz(count, end);
      align(16);
      bind(loop);
      (this->*block)(d, s, tmp);
      bind(odd);
      (this->*block)(d, s, tmp);
      subs(count, count, 2);
      br(Assembler::GT, loop);
      bind(end);
    }

    void pre1(RegisterOrConstant i) {
      block_comment("pre1");
      // Pa = Pa_base;
      // Pb = Pb_base + i;
      // Pm = Pm_base;
      // Pn = Pn_base + i;
      // Ra = *Pa;
      // Rb = *Pb;
      // Rm = *Pm;
      // Rn = *Pn;
      ldr(Ra, Address(Pa_base));
      ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
      ldr(Rm, Address(Pm_base));
      ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
      lea(Pa, Address(Pa_base));
      lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
      lea(Pm, Address(Pm_base));
      lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));

      // Zero the m*n result.
      mov(Rhi_mn, zr);
      mov(Rlo_mn, zr);
    }

    // The core multiply-accumulate step of a Montgomery
    // multiplication.  The idea is to schedule operations as a
    // pipeline so that instructions with long latencies (loads and
    // multiplies) have time to complete before their results are
    // used.  This most benefits in-order implementations of the
    // architecture but out-of-order ones also benefit.
    void step() {
      block_comment("step");
      // MACC(Ra, Rb, t0, t1, t2);
      // Ra = *++Pa;
      // Rb = *--Pb;
      umulh(Rhi_ab, Ra, Rb);
      mul(Rlo_ab, Ra, Rb);
      ldr(Ra, pre(Pa, wordSize));
      ldr(Rb, pre(Pb, -wordSize));
      acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
                                       // previous iteration.
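      // (MACC(A, B, t0, t1, t2) in the pseudocode comments denotes a
      // 64x64->128-bit multiply-accumulate into the triple-precision
      // accumulator t2:t1:t0.  A C sketch, assuming a compiler with
      // unsigned __int128:
      //
      //   unsigned __int128 p = (unsigned __int128)A * B;
      //   unsigned __int128 s = (unsigned __int128)t0 + (unsigned long)p;
      //   t0 = (unsigned long)s;
      //   s = (s >> 64) + t1 + (unsigned long)(p >> 64);
      //   t1 = (unsigned long)s;
      //   t2 += (unsigned long)(s >> 64);
      //
      // In the generated code this is spelled umulh/mul plus the acc()
      // helper defined below.)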
      // MACC(Rm, Rn, t0, t1, t2);
      // Rm = *++Pm;
      // Rn = *--Pn;
      umulh(Rhi_mn, Rm, Rn);
      mul(Rlo_mn, Rm, Rn);
      ldr(Rm, pre(Pm, wordSize));
      ldr(Rn, pre(Pn, -wordSize));
      acc(Rhi_ab, Rlo_ab, t0, t1, t2);
    }

    void post1() {
      block_comment("post1");

      // MACC(Ra, Rb, t0, t1, t2);
      // Ra = *++Pa;
      // Rb = *--Pb;
      umulh(Rhi_ab, Ra, Rb);
      mul(Rlo_ab, Ra, Rb);
      acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
      acc(Rhi_ab, Rlo_ab, t0, t1, t2);

      // *Pm = Rm = t0 * inv;
      mul(Rm, t0, inv);
      str(Rm, Address(Pm));

      // MACC(Rm, Rn, t0, t1, t2);
      // t0 = t1; t1 = t2; t2 = 0;
      umulh(Rhi_mn, Rm, Rn);

#ifndef PRODUCT
      // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
      {
        mul(Rlo_mn, Rm, Rn);
        add(Rlo_mn, t0, Rlo_mn);
        Label ok;
        cbz(Rlo_mn, ok); {
          stop("broken Montgomery multiply");
        } bind(ok);
      }
#endif
      // We have very carefully set things up so that
      // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
      // the lower half of Rm * Rn because we know the result already:
      // it must be -t0.  t0 + (-t0) must generate a carry iff
      // t0 != 0.  So, rather than do a mul and an adds we just set
      // the carry flag iff t0 is nonzero.
      //
      // mul(Rlo_mn, Rm, Rn);
      // adds(zr, t0, Rlo_mn);
      subs(zr, t0, 1); // Set carry iff t0 is nonzero
      adcs(t0, t1, Rhi_mn);
      adc(t1, t2, zr);
      mov(t2, zr);
    }

    void pre2(RegisterOrConstant i, RegisterOrConstant len) {
      block_comment("pre2");
      // Pa = Pa_base + i-len;
      // Pb = Pb_base + len;
      // Pm = Pm_base + i-len;
      // Pn = Pn_base + len;

      if (i.is_register()) {
        sub(Rj, i.as_register(), len);
      } else {
        mov(Rj, i.as_constant());
        sub(Rj, Rj, len);
      }
      // Rj == i-len

      lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
      lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
      lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
      lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));

      // Ra = *++Pa;
      // Rb = *--Pb;
      // Rm = *++Pm;
      // Rn = *--Pn;
      ldr(Ra, pre(Pa, wordSize));
      ldr(Rb, pre(Pb, -wordSize));
      ldr(Rm, pre(Pm, wordSize));
      ldr(Rn, pre(Pn, -wordSize));

      mov(Rhi_mn, zr);
      mov(Rlo_mn, zr);
    }

    void post2(RegisterOrConstant i, RegisterOrConstant len) {
      block_comment("post2");
      if (i.is_constant()) {
        mov(Rj, i.as_constant() - len.as_constant());
      } else {
        sub(Rj, i.as_register(), len);
      }

      adds(t0, t0, Rlo_mn); // The pending m*n, low part

      // As soon as we know the least significant digit of our result,
      // store it.
      // Pm_base[i-len] = t0;
      str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));

      // t0 = t1; t1 = t2; t2 = 0;
      adcs(t0, t1, Rhi_mn); // The pending m*n, high part
      adc(t1, t2, zr);
      mov(t2, zr);
    }

    // A carry in t0 after Montgomery multiplication means that we
    // should subtract multiples of n from our result in m.  We'll
    // keep doing that until there is no carry.
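    //
    // The sub() in the pseudocode below is a borrow-propagating
    // subtract of n from m that folds the final borrow into the carry
    // word; illustrative C only (there is no such helper here):
    //
    //   unsigned long sub(unsigned long Pm[], unsigned long Pn[],
    //                     unsigned long t0, int len) {
    //     unsigned long borrow = 0;
    //     for (int i = 0; i < len; i++) {
    //       unsigned long mi = Pm[i], ni = Pn[i];
    //       Pm[i] = mi - ni - borrow;
    //       borrow = (mi < ni) || (mi - ni < borrow);
    //     }
    //     return t0 - borrow;
    //   }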
    void normalize(RegisterOrConstant len) {
      block_comment("normalize");
      // while (t0)
      //   t0 = sub(Pm_base, Pn_base, t0, len);
      Label loop, post, again;
      Register cnt = t1, i = t2; // Re-use registers; we're done with them now
      cbz(t0, post); {
        bind(again); {
          mov(i, zr);
          mov(cnt, len);
          ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
          ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
          subs(zr, zr, zr); // set carry flag, i.e. no borrow
          align(16);
          bind(loop); {
            sbcs(Rm, Rm, Rn);
            str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
            add(i, i, 1);
            ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
            ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
            sub(cnt, cnt, 1);
          } cbnz(cnt, loop);
          sbc(t0, t0, zr);
        } cbnz(t0, again);
      } bind(post);
    }

    // Move memory at s to d, reversing words.
    //    Increments d to end of copied memory
    //    Destroys tmp1, tmp2
    //    Preserves len
    //    Leaves s pointing to the address which was in d at start
    void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
      assert(tmp1 < r19 && tmp2 < r19, "register corruption");

      lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
      mov(tmp1, len);
      unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
      sub(s, d, len, ext::uxtw, LogBytesPerWord);
    }
    // where
    void reverse1(Register d, Register s, Register tmp) {
      ldr(tmp, pre(s, -wordSize));
      ror(tmp, tmp, 32);
      str(tmp, post(d, wordSize));
    }

    void step_squaring() {
      // An extra ACC
      step();
      acc(Rhi_ab, Rlo_ab, t0, t1, t2);
    }

    void last_squaring(RegisterOrConstant i) {
      Label dont;
      // if ((i & 1) == 0) {
      tbnz(i.as_register(), 0, dont); {
        // MACC(Ra, Rb, t0, t1, t2);
        // Ra = *++Pa;
        // Rb = *--Pb;
        umulh(Rhi_ab, Ra, Rb);
        mul(Rlo_ab, Ra, Rb);
        acc(Rhi_ab, Rlo_ab, t0, t1, t2);
      } bind(dont);
    }

    void extra_step_squaring() {
      acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n

      // MACC(Rm, Rn, t0, t1, t2);
      // Rm = *++Pm;
      // Rn = *--Pn;
      umulh(Rhi_mn, Rm, Rn);
      mul(Rlo_mn, Rm, Rn);
      ldr(Rm, pre(Pm, wordSize));
      ldr(Rn, pre(Pn, -wordSize));
    }

    void post1_squaring() {
      acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n

      // *Pm = Rm = t0 * inv;
      mul(Rm, t0, inv);
      str(Rm, Address(Pm));

      // MACC(Rm, Rn, t0, t1, t2);
      // t0 = t1; t1 = t2; t2 = 0;
      umulh(Rhi_mn, Rm, Rn);

#ifndef PRODUCT
      // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
      {
        mul(Rlo_mn, Rm, Rn);
        add(Rlo_mn, t0, Rlo_mn);
        Label ok;
        cbz(Rlo_mn, ok); {
          stop("broken Montgomery multiply");
        } bind(ok);
      }
#endif
      // We have very carefully set things up so that
      // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
      // the lower half of Rm * Rn because we know the result already:
      // it must be -t0.  t0 + (-t0) must generate a carry iff
      // t0 != 0.  So, rather than do a mul and an adds we just set
      // the carry flag iff t0 is nonzero.
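      //
      // (The subs below relies on the AArch64 flag convention: t0 - 1
      // borrows, clearing the carry flag, only when t0 == 0, so C ends
      // up set exactly when t0 != 0, which is the same carry that
      // t0 + (-t0) would have produced.)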
      //
      // mul(Rlo_mn, Rm, Rn);
      // adds(zr, t0, Rlo_mn);
      subs(zr, t0, 1); // Set carry iff t0 is nonzero
      adcs(t0, t1, Rhi_mn);
      adc(t1, t2, zr);
      mov(t2, zr);
    }

    void acc(Register Rhi, Register Rlo,
             Register t0, Register t1, Register t2) {
      adds(t0, t0, Rlo);
      adcs(t1, t1, Rhi);
      adc(t2, t2, zr);
    }

  public:
    /**
     * Fast Montgomery multiplication.  The derivation of the
     * algorithm is in A Cryptographic Library for the Motorola
     * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
     *
     * Arguments:
     *
     * Inputs for multiplication:
     *   c_rarg0   - int array elements a
     *   c_rarg1   - int array elements b
     *   c_rarg2   - int array elements n (the modulus)
     *   c_rarg3   - int length
     *   c_rarg4   - int inv
     *   c_rarg5   - int array elements m (the result)
     *
     * Inputs for squaring:
     *   c_rarg0   - int array elements a
     *   c_rarg1   - int array elements n (the modulus)
     *   c_rarg2   - int length
     *   c_rarg3   - int inv
     *   c_rarg4   - int array elements m (the result)
     *
     */
    address generate_multiply() {
      Label argh, nothing;
      bind(argh);
      stop("MontgomeryMultiply total_allocation must be <= 8192");

      align(CodeEntryAlignment);
      address entry = pc();

      cbzw(Rlen, nothing);

      enter();

      // Make room.
      cmpw(Rlen, 512);
      br(Assembler::HI, argh);
      sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
      andr(sp, Ra, -2 * wordSize);

      lsrw(Rlen, Rlen, 1); // length in longwords = len/2

      {
        // Copy input args, reversing as we go.  We use Ra as a
        // temporary variable.
        reverse(Ra, Pa_base, Rlen, t0, t1);
        if (!_squaring)
          reverse(Ra, Pb_base, Rlen, t0, t1);
        reverse(Ra, Pn_base, Rlen, t0, t1);
      }

      // Push all call-saved registers and also Pm_base which we'll need
      // at the end.
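      // (_toSave was computed in the constructor: the callee-saved
      // range from r19 up to the last register allocated, plus Pm_base.)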
      save_regs();

#ifndef PRODUCT
      // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
      {
        ldr(Rn, Address(Pn_base, 0));
        mul(Rlo_mn, Rn, inv);
        cmp(Rlo_mn, -1);
        Label ok;
        br(EQ, ok); {
          stop("broken inverse in Montgomery multiply");
        } bind(ok);
      }
#endif

      mov(Pm_base, Ra);

      mov(t0, zr);
      mov(t1, zr);
      mov(t2, zr);

      block_comment("for (int i = 0; i < len; i++) {");
      mov(Ri, zr); {
        Label loop, end;
        cmpw(Ri, Rlen);
        br(Assembler::GE, end);

        bind(loop);
        pre1(Ri);

        block_comment("  for (j = i; j; j--) {"); {
          movw(Rj, Ri);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
        } block_comment("  } // j");

        post1();
        addw(Ri, Ri, 1);
        cmpw(Ri, Rlen);
        br(Assembler::LT, loop);
        bind(end);
        block_comment("} // i");
      }

      block_comment("for (int i = len; i < 2*len; i++) {");
      mov(Ri, Rlen); {
        Label loop, end;
        cmpw(Ri, Rlen, Assembler::LSL, 1);
        br(Assembler::GE, end);

        bind(loop);
        pre2(Ri, Rlen);

        block_comment("  for (j = len*2-i-1; j; j--) {"); {
          lslw(Rj, Rlen, 1);
          subw(Rj, Rj, Ri);
          subw(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
        } block_comment("  } // j");

        post2(Ri, Rlen);
        addw(Ri, Ri, 1);
        cmpw(Ri, Rlen, Assembler::LSL, 1);
        br(Assembler::LT, loop);
        bind(end);
      }
      block_comment("} // i");

      normalize(Rlen);

      mov(Ra, Pm_base);  // Save Pm_base in Ra
      restore_regs();    // Restore caller's Pm_base

      // Copy our result into caller's Pm_base
      reverse(Pm_base, Ra, Rlen, t0, t1);

      leave();
      bind(nothing);
      ret(lr);

      return entry;
    }
    // In C, approximately:

    // void
    // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
    //                     unsigned long Pn_base[], unsigned long Pm_base[],
    //                     unsigned long inv, int len) {
    //   unsigned long t0 = 0, t1 = 0, t2 = 0;  // Triple-precision accumulator
    //   unsigned long *Pa, *Pb, *Pn, *Pm;
    //   unsigned long Ra, Rb, Rn, Rm;

    //   int i;

    //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");

    //   for (i = 0; i < len; i++) {
    //     int j;

    //     Pa = Pa_base;
    //     Pb = Pb_base + i;
    //     Pm = Pm_base;
    //     Pn = Pn_base + i;

    //     Ra = *Pa;
    //     Rb = *Pb;
    //     Rm = *Pm;
    //     Rn = *Pn;

    //     int iters = i;
    //     for (j = 0; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
    //       MACC(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }

    //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
    //     MACC(Ra, Rb, t0, t1, t2);
    //     *Pm = Rm = t0 * inv;
    //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
    //     MACC(Rm, Rn, t0, t1, t2);

    //     assert(t0 == 0, "broken Montgomery multiply");

    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   for (i = len; i < 2*len; i++) {
    //     int j;

    //     Pa = Pa_base + i-len;
    //     Pb = Pb_base + len;
    //     Pm = Pm_base + i-len;
    //     Pn = Pn_base + len;

    //     Ra = *++Pa;
    //     Rb = *--Pb;
    //     Rm = *++Pm;
    //     Rn = *--Pn;

    //     int iters = len*2-i-1;
    //     for (j = i-len+1; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
    //       MACC(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }

    //     Pm_base[i-len] = t0;
    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   while (t0)
    //     t0 = sub(Pm_base, Pn_base, t0, len);
    // }

    /**
     * Fast Montgomery squaring.  This uses asymptotically 25% fewer
     * multiplies than Montgomery multiplication so it should be up to
     * 25% faster.  However, its loop control is more complex and it
     * may actually run slower on some machines.
     *
     * Arguments:
     *
     * Inputs:
     *   c_rarg0   - int array elements a
     *   c_rarg1   - int array elements n (the modulus)
     *   c_rarg2   - int length
     *   c_rarg3   - int inv
     *   c_rarg4   - int array elements m (the result)
     *
     */
    address generate_square() {
      Label argh;
      bind(argh);
      stop("MontgomeryMultiply total_allocation must be <= 8192");

      align(CodeEntryAlignment);
      address entry = pc();

      enter();

      // Make room.
      cmpw(Rlen, 512);
      br(Assembler::HI, argh);
      sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
      andr(sp, Ra, -2 * wordSize);

      lsrw(Rlen, Rlen, 1); // length in longwords = len/2

      {
        // Copy input args, reversing as we go.  We use Ra as a
        // temporary variable.
        reverse(Ra, Pa_base, Rlen, t0, t1);
        reverse(Ra, Pn_base, Rlen, t0, t1);
      }

      // Push all call-saved registers and also Pm_base which we'll need
      // at the end.
      save_regs();

      mov(Pm_base, Ra);

      mov(t0, zr);
      mov(t1, zr);
      mov(t2, zr);

      block_comment("for (int i = 0; i < len; i++) {");
      mov(Ri, zr); {
        Label loop, end;
        bind(loop);
        cmp(Ri, Rlen);
        br(Assembler::GE, end);

        pre1(Ri);

        block_comment("  for (j = (i+1)/2; j; j--) {"); {
          add(Rj, Ri, 1);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
        } block_comment("  } // j");

        last_squaring(Ri);

        block_comment("  for (j = i/2; j; j--) {"); {
          lsr(Rj, Ri, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
        } block_comment("  } // j");

        post1_squaring();
        add(Ri, Ri, 1);
        cmp(Ri, Rlen);
        br(Assembler::LT, loop);

        bind(end);
        block_comment("} // i");
      }

      block_comment("for (int i = len; i < 2*len; i++) {");
      mov(Ri, Rlen); {
        Label loop, end;
        bind(loop);
        cmp(Ri, Rlen, Assembler::LSL, 1);
        br(Assembler::GE, end);

        pre2(Ri, Rlen);

        block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
          lsl(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          sub(Rj, Rj, 1);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
        } block_comment("  } // j");

        last_squaring(Ri);

        block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
          lsl(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
        } block_comment("  } // j");

        post2(Ri, Rlen);
        add(Ri, Ri, 1);
        cmp(Ri, Rlen, Assembler::LSL, 1);

        br(Assembler::LT, loop);
        bind(end);
        block_comment("} // i");
      }

      normalize(Rlen);

      mov(Ra, Pm_base);  // Save Pm_base in Ra
      restore_regs();    // Restore caller's Pm_base

      // Copy our result into caller's Pm_base
      reverse(Pm_base, Ra, Rlen, t0, t1);

      leave();
      ret(lr);

      return entry;
    }
    // In C, approximately:

    // void
    // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
    //                   unsigned long Pm_base[], unsigned long inv, int len) {
    //   unsigned long t0 = 0, t1 = 0, t2 = 0;  // Triple-precision accumulator
    //   unsigned long *Pa, *Pb, *Pn, *Pm;
    //   unsigned long Ra, Rb, Rn, Rm;

    //   int i;

    //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");

    //   for (i = 0; i < len; i++) {
    //     int j;

    //     Pa = Pa_base;
    //     Pb = Pa_base + i;
    //     Pm = Pm_base;
    //     Pn = Pn_base + i;

    //     Ra = *Pa;
    //     Rb = *Pb;
    //     Rm = *Pm;
    //     Rn = *Pn;

    //     int iters = (i+1)/2;
    //     for (j = 0; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
    //       MACC2(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     if ((i & 1) == 0) {
    //       assert(Ra == Pa_base[j], "must be");
    //       MACC(Ra, Ra, t0, t1, t2);
    //     }
    //     iters = i/2;
    //     assert(iters == i-j, "must be");
    //     for (; iters--; j++) {
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }

    //     *Pm = Rm = t0 * inv;
    //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
    //     MACC(Rm, Rn, t0, t1, t2);

    //     assert(t0 == 0, "broken Montgomery multiply");

    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   for (i = len; i < 2*len; i++) {
    //     int start = i-len+1;
    //     int end = start + (len - start)/2;
    //     int j;

    //     Pa = Pa_base + i-len;
    //     Pb = Pa_base + len;
    //     Pm = Pm_base + i-len;
    //     Pn = Pn_base + len;

    //     Ra = *++Pa;
    //     Rb = *--Pb;
    //     Rm = *++Pm;
    //     Rn = *--Pn;

    //     int iters = (2*len-i-1)/2;
    //     assert(iters == end-start, "must be");
    //     for (j = start; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
    //       MACC2(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     if ((i & 1) == 0) {
    //       assert(Ra == Pa_base[j], "must be");
    //       MACC(Ra, Ra, t0, t1, t2);
    //     }
    //     iters = (2*len-i)/2;
    //     assert(iters == len-j, "must be");
    //     for (; iters--; j++) {
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     Pm_base[i-len] = t0;
    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   while (t0)
    //     t0 = sub(Pm_base, Pn_base, t0, len);
    // }
  };

  // Initialization
  void generate_initial() {
    // Generates the initial stubs and initializes the entry points

    // Entry points that exist on all platforms.  Note: this is code
    // that could be shared among different platforms - however the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator
    // structure.  See also comment in stubRoutines.hpp.

    StubRoutines::_forward_exception_entry = generate_forward_exception();

    StubRoutines::_call_stub_entry =
      generate_call_stub(StubRoutines::_call_stub_return_address);

    // is referenced by megamorphic call
    StubRoutines::_catch_exception_entry = generate_catch_exception();

    // Build this early so it's available for the interpreter.
    StubRoutines::_throw_StackOverflowError_entry =
      generate_throw_exception("StackOverflowError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::throw_StackOverflowError));
    StubRoutines::_throw_delayed_StackOverflowError_entry =
      generate_throw_exception("delayed StackOverflowError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::throw_delayed_StackOverflowError));
    if (UseCRC32Intrinsics) {
      // set table address before generating the stubs that use it
      StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
    }
  }

  void generate_all() {
    // support for verify_oop (must happen after universe_init)
    StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
    StubRoutines::_throw_AbstractMethodError_entry =
      generate_throw_exception("AbstractMethodError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_AbstractMethodError));

    StubRoutines::_throw_IncompatibleClassChangeError_entry =
      generate_throw_exception("IncompatibleClassChangeError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_IncompatibleClassChangeError));

    StubRoutines::_throw_NullPointerException_at_call_entry =
      generate_throw_exception("NullPointerException at call throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_NullPointerException_at_call));

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }

    if (UseMontgomeryMultiplyIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }

    if (UseMontgomerySquareIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      // We use generate_multiply() rather than generate_square()
      // because it's faster for the sizes of modulus we care about.
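      // (With squaring == true the constructor aliases Pb_base to
      // Pa_base, so generate_multiply() degenerates to computing a*a.)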
      StubRoutines::_montgomerySquare = g.generate_multiply();
    }

#ifndef BUILTIN_SIM
    // generate GHASH intrinsics code
    if (UseGHASHIntrinsics) {
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
    }

    if (UseAESIntrinsics) {
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
    }

    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_implCompress     = generate_sha1_implCompress(false,   "sha1_implCompress");
      StubRoutines::_sha1_implCompressMB   = generate_sha1_implCompress(true,    "sha1_implCompressMB");
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
    }

    if (UseCRC32CIntrinsics) {
      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
    }

    // generate Adler32 intrinsics code
    if (UseAdler32Intrinsics) {
      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
    }

    // Safefetch stubs.
    generate_safefetch("SafeFetch32", sizeof(int),     &StubRoutines::_safefetch32_entry,
                                                       &StubRoutines::_safefetch32_fault_pc,
                                                       &StubRoutines::_safefetch32_continuation_pc);
    generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
                                                       &StubRoutines::_safefetchN_fault_pc,
                                                       &StubRoutines::_safefetchN_continuation_pc);
#endif
    StubRoutines::aarch64::set_completed();
  }

 public:
  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
    if (all) {
      generate_all();
    } else {
      generate_initial();
    }
  }
}; // end class declaration

void StubGenerator_generate(CodeBuffer* code, bool all) {
  StubGenerator g(code, all);
}