/*
 * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "utilities/align.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif

#ifdef BUILTIN_SIM
#include "../../../../../../simulator/simulator.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->
#define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
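// n.b. TIMES_OOP scales an oop-array index register by the heap oop
// size: a sign-extended-word (sxtw) address mode with a shift of 2
// (compressed oops, 4-byte elements) or 3 (uncompressed, 8-byte
// elements).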

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(int& counter) {
    __ lea(rscratch2, ExternalAddress((address)&counter));
    __ ldrw(rscratch1, Address(rscratch2));
    __ addw(rscratch1, rscratch1, 1);
    __ strw(rscratch1, Address(rscratch2));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    c_rarg6:   parameter size (in words)              int
  //    c_rarg7:   thread                                 Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the C arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-r18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -27 [ argument word 1      ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper   (r0)  ]
  //  -7 [ result         (r1)  ]
  //  -6 [ result type    (r2)  ]
  //  -5 [ method         (r3)  ]
  //  -4 [ entry point    (r4)  ]
  //  -3 [ parameters     (r5)  ]
  //  -2 [ parameter size (r6)  ]
  //  -1 [ thread         (r7)  ]
  //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  //   1 [ saved lr       (r30) ]

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off  = -26,

    d15_off            = -26,
    d13_off            = -24,
    d11_off            = -22,
    d9_off             = -20,

    r28_off            = -18,
    r26_off            = -16,
    r24_off            = -14,
    r22_off            = -12,
    r20_off            = -10,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0,
    retaddr_off        =   1,
  };

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    const Address sp_after_call(rfp, sp_after_call_off * wordSize);

    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off  * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);

    // stub code

    // we need a C prolog to bootstrap the x86 caller into the sim
    __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);

    address aarch64_entry = __ pc();

#ifdef BUILTIN_SIM
    // Save sender's SP for stack traces.
    __ mov(rscratch1, sp);
    __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
#endif
    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);
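    // n.b. sp_after_call_off is negative, so this computes
    // sp = rfp - 26 words, the bottom of the register save area.
    // Only every other save slot is named in call_stub_layout because
    // the registers are saved and restored in pairs with stp/ldp; the
    // partner of each named slot lives at the next higher word offset.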

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5,  entry_point);
    __ stp(c_rarg2, c_rarg3,  result_type);
    __ stp(c_rarg0, c_rarg1,  call_wrapper);

    __ stp(r20, r19,   r20_save);
    __ stp(r22, r21,   r22_save);
    __ stp(r24, r23,   r24_save);
    __ stp(r26, r25,   r26_save);
    __ stp(r28, r27,   r28_save);

    __ stpd(v9,  v8,   d9_save);
    __ stpd(v11, v10,  d11_save);
    __ stpd(v13, v12,  d13_save);
    __ stpd(v15, v14,  d15_save);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (unsigned)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing methodOop, and current sp
    //      rmethod: Method*
    //      r13: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r13, sp);
    __ blr(c_rarg4);

    // tell the simulator we have returned to the stub

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    if (NotifySimulator) {
      __ notify(Assembler::method_reentry);
    }
    // save current address for use by exception handling code

    return_address = __ pc();
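    // n.b. this PC is recorded as StubRoutines::_call_stub_return_address;
    // stack walkers compare a frame's return pc against it to recognise
    // a return into the call stub (see the note above about the extra
    // notify instruction on the simulator).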

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    __ ldr(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ldr(j_rarg1, result_type);
    // n.b. T_OBJECT shares the T_LONG path: an oop is returned as a
    // full 64-bit value in r0
    __ cmp(j_rarg1, T_OBJECT);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(j_rarg1, T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    // restore callee-save registers
    __ ldpd(v15, v14,   d15_save);
    __ ldpd(v13, v12,   d13_save);
    __ ldpd(v11, v10,   d11_save);
    __ ldpd(v9,  v8,    d9_save);

    __ ldp(r28, r27,   r28_save);
    __ ldp(r26, r25,   r26_save);
    __ ldp(r24, r23,   r24_save);
    __ ldp(r22, r21,   r22_save);
    __ ldp(r20, r19,   r20_save);

    __ ldp(c_rarg0, c_rarg1,  call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3,  method);
    __ ldp(c_rarg4, c_rarg5,  entry_point);
    __ ldp(c_rarg6, c_rarg7,  parameter_size);

#ifndef PRODUCT
    // tell the simulator we are about to end Java execution
    if (NotifySimulator) {
      __ notify(Assembler::method_exit);
    }
#endif
    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT

    __ BIND(is_long);
    __ str(r0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee.  In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to set up the
  // rsp.
  //
  // r0: exception oop

  // NOTE: this is used as a target from the signal handler so it
  // needs an x86 prolog which returns into the current simulator
  // executing the generated catch_exception code. so the prolog
  // needs to install rax in a sim register and adjust the sim's
  // restart pc to enter the generated code at the start position
  // then return from native to simulated execution.

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread       (rfp, thread_off        * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);
    __ mov(r19, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {

    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is NULL it is OK

    // Check if the oop is in the right area of memory
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
    __ andr(c_rarg2, r0, c_rarg3);
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());

    // Compare c_rarg2 and c_rarg3.  We don't use a compare
    // instruction here because the flags register is live.
    __ eor(c_rarg2, c_rarg2, c_rarg3);
    __ cbnz(c_rarg2, error);
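    // n.b. the eor leaves the condition flags untouched: c_rarg2
    // becomes zero exactly when (oop & verify_oop_mask()) equals
    // verify_oop_bits(), so cbnz spots a mismatch without a cmp.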

    // make sure klass is 'reasonable', which is not zero.
    __ load_klass(r0, r0); // get klass
    __ cbz(r0, error);     // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blrt(rscratch1, 3, 0, 1);

    return start;
  }

  void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }

  // Generate code for an array write pre barrier
  //
  //     addr    - starting address
  //     count   - element count
  //     tmp     - scratch register
  //
  //     Destroy no registers except rscratch1 and rscratch2
  //
  void gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
    case BarrierSet::G1SATBCTLogging:
      // With G1, don't generate the call if we statically know that the target is uninitialized
      if (!dest_uninitialized) {
        __ push_call_clobbered_registers();
        if (count == c_rarg0) {
          if (addr == c_rarg1) {
            // exactly backwards!!
            __ mov(rscratch1, c_rarg0);
            __ mov(c_rarg0, c_rarg1);
            __ mov(c_rarg1, rscratch1);
          } else {
            __ mov(c_rarg1, count);
            __ mov(c_rarg0, addr);
          }
        } else {
          __ mov(c_rarg0, addr);
          __ mov(c_rarg1, count);
        }
        __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2);
        __ pop_call_clobbered_registers();
      }
      break;
    case BarrierSet::CardTableForRS:
    case BarrierSet::CardTableExtension:
    case BarrierSet::ModRef:
      break;
    default:
      ShouldNotReachHere();
    }
  }

  //
  // Generate code for an array write post barrier
  //
  //  Input:
  //     start    - register containing starting address of destination array
  //     end      - register containing ending address of destination array
  //     scratch  - scratch register
  //
  //  The input registers are overwritten.
  //  The ending address is inclusive.
  void gen_write_ref_array_post_barrier(Register start, Register end, Register scratch) {
    assert_different_registers(start, end, scratch);
    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
    case BarrierSet::G1SATBCTLogging:
      {
        __ push_call_clobbered_registers();
        // must compute element count unless barrier set interface is changed (other platforms supply count)
        assert_different_registers(start, end, scratch);
        __ lea(scratch, Address(end, BytesPerHeapOop));
        __ sub(scratch, scratch, start);               // subtract start to get #bytes
        __ lsr(scratch, scratch, LogBytesPerHeapOop);  // convert to element count
        __ mov(c_rarg0, start);
        __ mov(c_rarg1, scratch);
        __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
        __ pop_call_clobbered_registers();
      }
      break;
    case BarrierSet::CardTableForRS:
    case BarrierSet::CardTableExtension:
      {
        CardTableModRefBS* ct = (CardTableModRefBS*)bs;
        assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");

        Label L_loop;

        __ lsr(start, start, CardTableModRefBS::card_shift);
        __ lsr(end, end, CardTableModRefBS::card_shift);
        __ sub(end, end, start); // number of bytes to copy

        const Register count = end; // 'end' register contains bytes count now
        __ load_byte_map_base(scratch);
        __ add(start, start, scratch);
        if (UseConcMarkSweepGC) {
          __ membar(__ StoreStore);
        }
        __ BIND(L_loop);
        __ strb(zr, Address(start, count));
        __ subs(count, count, 1);
        __ br(Assembler::GE, L_loop);
      }
      break;
    default:
      ShouldNotReachHere();
    }
  }

  // The inner part of zero_words().  This is the bulk operation,
  // zeroing words in blocks, possibly using DC ZVA to do it.  The
  // caller is responsible for zeroing the last few words.
  //
  // Inputs:
  // r10: the HeapWord-aligned base address of an array to zero.
  // r11: the count in HeapWords, r11 > 0.
  //
  // Returns r10 and r11, adjusted for the caller to clear.
  // r10: the base address of the tail of words left to clear.
  // r11: the number of words in the tail.
  //      r11 < MacroAssembler::zero_words_block_size.

  address generate_zero_blocks() {
    Label store_pair, loop_store_pair, done;
    Label base_aligned;

    Register base = r10, cnt = r11;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "zero_blocks");
    address start = __ pc();

    if (UseBlockZeroing) {
      int zva_length = VM_Version::zva_length();

      // Ensure ZVA length can be divided by 16. This is required by
      // the subsequent operations.
      assert (zva_length % 16 == 0, "Unexpected ZVA Length");

      __ tbz(base, 3, base_aligned);
      __ str(zr, Address(__ post(base, 8)));
      __ sub(cnt, cnt, 1);
      __ bind(base_aligned);

      // Ensure count >= zva_length * 2 so that it still deserves a zva after
      // alignment.
      Label small;
      int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
      __ cmp(cnt, low_limit >> 3);
      __ br(Assembler::LT, small);
      __ zero_dcache_blocks(base, cnt);
      __ bind(small);
    }
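
    // n.b. when UseBlockZeroing is set, the tbz/str pair above only
    // establishes 16-byte alignment of the base; zero_dcache_blocks
    // is assumed to handle any further alignment up to the full ZVA
    // block size itself and to leave the leftover word count in cnt.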

    {
      // Number of stp instructions we'll unroll
      const int unroll =
        MacroAssembler::zero_words_block_size / 2;
      // Clear the remaining blocks.
      Label loop;
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::LT, done);
      __ bind(loop);
      for (int i = 0; i < unroll; i++)
        __ stp(zr, zr, __ post(base, 16));
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::GE, loop);
      __ bind(done);
      __ add(cnt, cnt, unroll * 2);
    }

    __ ret(lr);

    return start;
  }


  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(Label &start, Register s, Register d, Register count,
                           copy_direction direction) {
    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;

    int offset;
    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r10, t6 = r11, t7 = r12;
    const Register stride = r13;

    assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1);

    Label again, drain;
    const char *stub_name;
    if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
    else
      stub_name = "backward_copy_longs";
    StubCodeMark mark(this, "StubRoutines", stub_name);
    __ align(CodeEntryAlignment);
    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, 8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }

    __ bind(again);
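    // The loop below is software-pipelined: each iteration stores the
    // eight words loaded on the previous iteration and then loads the
    // next eight, so the loads stay one block ahead of the stores.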

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
        __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
      } else {
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        __ stp(t0, t1, Address(d, 2 * unit));
        __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
      //
      // when backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.
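      //
      // n.b. after these adjustments the initial lone str leaves the
      // destination 16-byte aligned (it was odd-word aligned on
      // entry), so every stp that follows is an aligned store; that
      // is the point of this variant.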

      if (direction == copy_forwards) {
        __ sub(s, s, 16);
        __ sub(d, d, 8);
      }

      // Fill 8 registers
      //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 bit block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
      //
      // t0 at offset 0,  t1 at offset 8
      // t2 at offset 16, t3 at offset 24
      // t4 at offset 32, t5 at offset 40
      // t6 at offset 48, t7 at offset 56

      // for backwards copy s was not offset so the register contents
      // are at these offsets into the preceding 64 byte block
      // relative to that original input and so on for each successive
      // preceding 64 byte block when s is updated. this explains the
      // slightly counter-intuitive looking pattern of register usage
      // in the stp instructions for backwards copy.
      //
      // t0 at offset -16, t1 at offset -8
      // t2 at offset -32, t3 at offset -24
      // t4 at offset -48, t5 at offset -40
      // t6 at offset -64, t7 at offset -56

      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));

      __ subs(count, count, 16);
      __ br(Assembler::LO, drain);

      int prefetch = PrefetchCopyIntervalInBytes;
      bool use_stride = false;
      if (direction == copy_backwards) {
        use_stride = prefetch > 256;
        prefetch = -prefetch;
        if (use_stride) __ mov(stride, prefetch);
      }

      __ bind(again);

      if (PrefetchCopyIntervalInBytes > 0)
        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
        // allowing for the offset of -8 the store instructions place
        // registers into the target 64 bit block at the following
        // offsets
        //
        // t0 at offset 0
        // t1 at offset 8,  t2 at offset 16
        // t3 at offset 24, t4 at offset 32
        // t5 at offset 40, t6 at offset 48
        // t7 at offset 56

        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
        // d was not offset when we started so the registers are
        // written into the 64 bit block preceding d with the following
        // offsets
        //
        // t1 at offset -8
        // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
        // t7 at offset -56, t4 at offset -48
        // t6 at offset -64
        //
        // note that this matches the offsets previously noted for the
        // loads

        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
      } else {
        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
      }
      // now we need to copy any remaining part block which may
      // include a 4 word subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ stp(t1, t2, Address(d, 2 * unit));
          __ str(t3, Address(__ pre(d, 4 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ stp(t3, t0, Address(d, 3 * unit));
          __ str(t2, Address(__ pre(d, 4 * unit)));
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ str(t1, Address(__ pre(d, 2 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ str(t0, Address(__ pre(d, 2 * unit)));
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;
    int unit = wordSize * direction;

    Label Lpair, Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.
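
    // worked example: for a byte copy (granularity == 1) with
    // count == 13 == 0b1101, the tests below fire on bits 3, 2 and 0,
    // moving 8 + 4 + 1 bytes.  for a short copy (granularity == 2)
    // each test shifts down one bit, so count == 5 == 0b101 shorts
    // moves four shorts then one short (8 + 2 bytes).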

    __ tbz(count, 3 - exact_log2(granularity), Lword);
    __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
    __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }

  Label copy_f, copy_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the
  // direction of copy.  If is_aligned is false, we align the source
  // address.
  //

  void copy_memory(bool is_aligned, Register s, Register d,
                   Register count, Register tmp, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    // <= 96 bytes is handled inline.  Direction doesn't matter
    // because we always load all the data before writing anything.
    Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
    const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
    const Register send = r17, dend = r18;

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
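
    // dispatch on the byte size of the copy: 0..16, 17..32, 33..64
    // and 65..80 (96 with SIMD) byte copies are each handled with
    // straight-line loads and stores; anything larger falls through
    // to copy_big.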
    __ cmp(count, (UseSIMDForMemoryOps ? 96 : 80) / granularity);
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, 16/granularity);
    __ br(Assembler::LS, copy16);

    __ cmp(count, 64/granularity);
    __ br(Assembler::HI, copy80);

    __ cmp(count, 32/granularity);
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 0));
      __ ldpq(v2, v3, Address(send, -32));
      __ stpq(v0, v1, Address(d, 0));
      __ stpq(v2, v3, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(send, -32));
      __ ldp(t6, t7, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(dend, -32));
      __ stp(t6, t7, Address(dend, -16));
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    __ ldp(t0, t1, Address(s, 0));
    __ ldp(t2, t3, Address(send, -16));
    __ stp(t0, t1, Address(d, 0));
    __ stp(t2, t3, Address(dend, -16));
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
      __ ldpq(v4, v5, Address(send, -32));
      __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
      __ stpq(v4, v5, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(s, 32));
      __ ldp(t6, t7, Address(s, 48));
      __ ldp(t8, t9, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(d, 32));
      __ stp(t6, t7, Address(d, 48));
      __ stp(t8, t9, Address(dend, -16));
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, 8/granularity);
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    __ ldr(t0, Address(s, 0));
    __ ldr(t1, Address(send, -8));
    __ str(t0, Address(d, 0));
    __ str(t1, Address(dend, -8));
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes.  Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean in the 1 byte case we load/store the same
          // byte 3 times.
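          // i.e. count == 1 copies s[0] three times over; count == 2
          // copies s[0] and s[1] (s[1] twice); count == 3 copies s[0],
          // s[2] and then s[1] via the base + count/2 slot.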
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }

    __ bind(copy_big);
    if (is_backwards) {
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now we've got the small case out of the way, we can align the
    // source address on a 2-word boundary.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
      __ tbz(s, exact_log2(wordSize), aligned);
      __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
      __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
      __ sub(count, count, wordSize/granularity);
    } else {
      if (is_backwards) {
        __ andr(rscratch2, s, 2 * wordSize - 1);
      } else {
        __ neg(rscratch2, s);
        __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
      }
      // rscratch2 is the byte adjustment needed to align s.
      __ cbz(rscratch2, aligned);
      int shift = exact_log2(granularity);
      if (shift)  __ lsr(rscratch2, rscratch2, shift);
      __ sub(count, count, rscratch2);

#if 0
      // ?? This code is only correct for a disjoint copy.  It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, rscratch2);
        __ sub(d, d, rscratch2);
      } else {
        __ add(s, s, rscratch2);
        __ add(d, d, rscratch2);
      }
#else
      copy_memory_small(s, d, rscratch2, rscratch1, step);
#endif
    }

    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes.  Adjust the
    // count and do a bulk copy of words.
    __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
    if (direction == copy_forwards)
      __ bl(copy_f);
    else
      __ bl(copy_b);

    // And the tail.
    copy_memory_small(s, d, count, tmp, step);

    if (granularity >= 8) __ bind(copy8);
    if (granularity >= 4) __ bind(copy4);
    __ bind(finish);
  }


  void clobber_registers() {
#ifdef ASSERT
    __ mov(rscratch1, (uint64_t)0xdeadbeef);
    __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
    for (Register r = r3; r <= r18; r++)
      if (r != rscratch1)  __ mov(r, rscratch1);
#endif
  }
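
  // n.b. clobber_registers deliberately writes the recognisable
  // pattern 0xdeadbeefdeadbeef into every register Java treats as
  // killable so that, in debug builds, code which wrongly relies on
  // a clobbered value fails fast and visibly.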

  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers rscratch1 and rscratch2.
  void verify_oop_array (size_t size, Register a, Register count, Register temp) {
    Label loop, end;
    __ mov(rscratch1, a);
    __ mov(rscratch2, zr);
    __ bind(loop);
    __ cmp(rscratch2, count);
    __ br(Assembler::HS, end);
    if (size == (size_t)wordSize) {
      __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ verify_oop(temp);
    } else {
      __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ decode_heap_oop(temp); // calls verify_oop
    }
    __ add(rscratch2, rscratch2, size);
    __ b(loop);
    __ bind(end);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
                                 const char *name, bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    if (is_oop) {
      __ push(RegSet::of(d, count), sp);
      // no registers are destroyed by this call
      gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
    }
    copy_memory(aligned, s, d, count, rscratch1, size);
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
      gen_write_ref_array_post_barrier(d, count, rscratch1);
    }
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
                                 address *entry, const char *name,
                                 bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;

    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // use fwd copy when (d-s) above_equal (count*size)
    // n.b. this is an unsigned compare: when d is below s the
    // subtraction wraps to a large unsigned value, so the forward
    // copy is taken in that case too, which is safe even if the
    // arrays overlap.
    __ sub(rscratch1, d, s);
    __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
    __ br(Assembler::HS, nooverlap_target);

    if (is_oop) {
      __ push(RegSet::of(d, count), sp);
      // no registers are destroyed by this call
      gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
    }
    copy_memory(aligned, s, d, count, rscratch1, -size);
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
      gen_write_ref_array_post_barrier(d, count, rscratch1);
    }
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
  //
  address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                      address* entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_short_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_short_copy().
  //
  address generate_disjoint_short_copy(bool aligned,
                                       address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
                                       address *entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_int_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
                                     address *entry, const char *name,
                                     bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
  }


  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_long_copy(bool aligned, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_long_copy(bool aligned,
                                      address nooverlap_target, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_oop_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_oop_copy(bool aligned,
                                     address nooverlap_target, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
                                  name, dest_uninitialized);
  }


  // Helper for generating a dynamic type check.
  // Smashes rscratch1.
1764 void generate_type_check(Register sub_klass,
1765                          Register super_check_offset,
1766                          Register super_klass,
1767                          Label& L_success) {
1768   assert_different_registers(sub_klass, super_check_offset, super_klass);
1769
1770   BLOCK_COMMENT("type_check:");
1771
1772   Label L_miss;
1773
1774   __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL,
1775                                    super_check_offset);
1776   __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1777
1778   // Fall through on failure!
1779   __ BIND(L_miss);
1780 }
1781
1782 //
1783 //  Generate checkcasting array copy stub
1784 //
1785 //  Input:
1786 //    c_rarg0   - source array address
1787 //    c_rarg1   - destination array address
1788 //    c_rarg2   - element count, treated as ssize_t, can be zero
1789 //    c_rarg3   - size_t ckoff (super_check_offset)
1790 //    c_rarg4   - oop ckval (super_klass)
1791 //
1792 //  Output:
1793 //    r0 ==  0  -  success
1794 //    r0 == -1^K - failure, where K is partial transfer count
1795 //
1796 address generate_checkcast_copy(const char *name, address *entry,
1797                                 bool dest_uninitialized = false) {
1798
1799   Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1800
1801   // Input registers (after setup_arg_regs)
1802   const Register from        = c_rarg0;   // source array address
1803   const Register to          = c_rarg1;   // destination array address
1804   const Register count       = c_rarg2;   // elements count
1805   const Register ckoff       = c_rarg3;   // super_check_offset
1806   const Register ckval       = c_rarg4;   // super_klass
1807
1808   // Registers used as temps (r18, r19, r20 are save-on-entry)
1809   const Register count_save  = r21;       // original elements count
1810   const Register start_to    = r20;       // destination array start address
1811   const Register copied_oop  = r18;       // actual oop copied
1812   const Register r19_klass   = r19;       // oop._klass
1813
1814   //---------------------------------------------------------------
1815   // Assembler stub will be used for this call to arraycopy
1816   // if the two arrays are subtypes of Object[] but the
1817   // destination array type is not equal to or a supertype
1818   // of the source type.  Each element must be separately
1819   // checked.
1820
1821   assert_different_registers(from, to, count, ckoff, ckval, start_to,
1822                              copied_oop, r19_klass, count_save);
1823
1824   __ align(CodeEntryAlignment);
1825   StubCodeMark mark(this, "StubRoutines", name);
1826   address start = __ pc();
1827
1828   __ enter(); // required for proper stackwalking of RuntimeStub frame
1829
1830 #ifdef ASSERT
1831   // caller guarantees that the arrays really are different
1832   // otherwise, we would have to make conjoint checks
1833   { Label L;
1834     array_overlap_test(L, TIMES_OOP);
1835     __ stop("checkcast_copy within a single array");
1836     __ bind(L);
1837   }
1838 #endif //ASSERT
1839
1840   // Caller of this entry point must set up the argument registers.
1841   if (entry != NULL) {
1842     *entry = __ pc();
1843     BLOCK_COMMENT("Entry:");
1844   }
1845
1846   // Empty array:  Nothing to do.
1847   __ cbz(count, L_done);
1848
1849   __ push(RegSet::of(r18, r19, r20, r21), sp);
1850
1851 #ifdef ASSERT
1852   BLOCK_COMMENT("assert consistent ckoff/ckval");
1853   // The ckoff and ckval must be mutually consistent,
1854   // even though caller generates both.
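    // In other words, the assertion below checks (as a sketch, using
    // the usual Klass accessor) that
    //
    //   ckoff == (size_t)ckval->super_check_offset()
    //
    // so the fast-path probe in generate_type_check examines exactly
    // the word the supplied super_klass expects it to.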
1855   { Label L;
1856     int sco_offset = in_bytes(Klass::super_check_offset_offset());
1857     __ ldrw(start_to, Address(ckval, sco_offset));
1858     __ cmpw(ckoff, start_to);
1859     __ br(Assembler::EQ, L);
1860     __ stop("super_check_offset inconsistent");
1861     __ bind(L);
1862   }
1863 #endif //ASSERT
1864
1865   gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);
1866
1867   // save the original count
1868   __ mov(count_save, count);
1869
1870   // Copy from low to high addresses
1871   __ mov(start_to, to);              // Save destination array start address
1872   __ b(L_load_element);
1873
1874   // ======== begin loop ========
1875   // (Loop is rotated; its entry is L_load_element.)
1876   // Loop control:
1877   //   for (; count != 0; count--) {
1878   //     copied_oop = load_heap_oop(from++);
1879   //     ... generate_type_check ...;
1880   //     store_heap_oop(to++, copied_oop);
1881   //   }
1882   __ align(OptoLoopAlignment);
1883
1884   __ BIND(L_store_element);
1885   __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop); // store the oop
1886   __ sub(count, count, 1);
1887   __ cbz(count, L_do_card_marks);
1888
1889   // ======== loop entry is here ========
1890   __ BIND(L_load_element);
1891   __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop
1892   __ cbz(copied_oop, L_store_element);
1893
1894   __ load_klass(r19_klass, copied_oop); // query the object klass
1895   generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1896   // ======== end loop ========
1897
1898   // It was a real error; we must depend on the caller to finish the job.
1899   // Register count = remaining oops, count_save = total oops.
1900   // Emit GC store barriers for the oops we have copied and report
1901   // their number to the caller.
1902
1903   __ subs(count, count_save, count);  // K = partially copied oop count
1904   __ eon(count, count, zr);           // report (-1^K) to caller
1905   __ br(Assembler::EQ, L_done_pop);
1906
1907   __ BIND(L_do_card_marks);
1908   __ add(to, to, -heapOopSize);       // make an inclusive end pointer
1909   gen_write_ref_array_post_barrier(start_to, to, rscratch1);
1910
1911   __ bind(L_done_pop);
1912   __ pop(RegSet::of(r18, r19, r20, r21), sp);
1913   inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1914
1915   __ bind(L_done);
1916   __ mov(r0, count);
1917   __ leave();
1918   __ ret(lr);
1919
1920   return start;
1921 }
1922
1923 // Perform range checks on the proposed arraycopy.
1924 // Kills temp, but nothing else.
1925 // Also, clean the sign bits of src_pos and dst_pos.
1926 void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1927                             Register src_pos, // source position (c_rarg1)
1928                             Register dst,     // destination array oop (c_rarg2)
1929                             Register dst_pos, // destination position (c_rarg3)
1930                             Register length,
1931                             Register temp,
1932                             Label& L_failed) {
1933   BLOCK_COMMENT("arraycopy_range_checks:");
1934
1935   assert_different_registers(rscratch1, temp);
1936
1937   // if (src_pos + length > arrayOop(src)->length()) FAIL;
1938   __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1939   __ addw(temp, length, src_pos);
1940   __ cmpw(temp, rscratch1);
1941   __ br(Assembler::HI, L_failed);
1942
1943   // if (dst_pos + length > arrayOop(dst)->length()) FAIL;
1944   __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1945   __ addw(temp, length, dst_pos);
1946   __ cmpw(temp, rscratch1);
1947   __ br(Assembler::HI, L_failed);
1948
1949   // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
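  // (On AArch64 a 32-bit move such as 'movw Wn, Wn' writes the whole
  // X register, zeroing the upper half; in C terms, a sketch:
  //
  //   src_pos = (uint64_t)(uint32_t)src_pos;
  //   dst_pos = (uint64_t)(uint32_t)dst_pos;
  //
  // after which both are safe to use in 64-bit address arithmetic.)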
1950 __ movw(src_pos, src_pos); 1951 __ movw(dst_pos, dst_pos); 1952 1953 BLOCK_COMMENT("arraycopy_range_checks done"); 1954 } 1955 1956 // These stubs get called from some dumb test routine. 1957 // I'll write them properly when they're called from 1958 // something that's actually doing something. 1959 static void fake_arraycopy_stub(address src, address dst, int count) { 1960 assert(count == 0, "huh?"); 1961 } 1962 1963 1964 // 1965 // Generate 'unsafe' array copy stub 1966 // Though just as safe as the other stubs, it takes an unscaled 1967 // size_t argument instead of an element count. 1968 // 1969 // Input: 1970 // c_rarg0 - source array address 1971 // c_rarg1 - destination array address 1972 // c_rarg2 - byte count, treated as ssize_t, can be zero 1973 // 1974 // Examines the alignment of the operands and dispatches 1975 // to a long, int, short, or byte copy loop. 1976 // 1977 address generate_unsafe_copy(const char *name, 1978 address byte_copy_entry, 1979 address short_copy_entry, 1980 address int_copy_entry, 1981 address long_copy_entry) { 1982 Label L_long_aligned, L_int_aligned, L_short_aligned; 1983 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1984 1985 __ align(CodeEntryAlignment); 1986 StubCodeMark mark(this, "StubRoutines", name); 1987 address start = __ pc(); 1988 __ enter(); // required for proper stackwalking of RuntimeStub frame 1989 1990 // bump this on entry, not on exit: 1991 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 1992 1993 __ orr(rscratch1, s, d); 1994 __ orr(rscratch1, rscratch1, count); 1995 1996 __ andr(rscratch1, rscratch1, BytesPerLong-1); 1997 __ cbz(rscratch1, L_long_aligned); 1998 __ andr(rscratch1, rscratch1, BytesPerInt-1); 1999 __ cbz(rscratch1, L_int_aligned); 2000 __ tbz(rscratch1, 0, L_short_aligned); 2001 __ b(RuntimeAddress(byte_copy_entry)); 2002 2003 __ BIND(L_short_aligned); 2004 __ lsr(count, count, LogBytesPerShort); // size => short_count 2005 __ b(RuntimeAddress(short_copy_entry)); 2006 __ BIND(L_int_aligned); 2007 __ lsr(count, count, LogBytesPerInt); // size => int_count 2008 __ b(RuntimeAddress(int_copy_entry)); 2009 __ BIND(L_long_aligned); 2010 __ lsr(count, count, LogBytesPerLong); // size => long_count 2011 __ b(RuntimeAddress(long_copy_entry)); 2012 2013 return start; 2014 } 2015 2016 // 2017 // Generate generic array copy stubs 2018 // 2019 // Input: 2020 // c_rarg0 - src oop 2021 // c_rarg1 - src_pos (32-bits) 2022 // c_rarg2 - dst oop 2023 // c_rarg3 - dst_pos (32-bits) 2024 // c_rarg4 - element count (32-bits) 2025 // 2026 // Output: 2027 // r0 == 0 - success 2028 // r0 == -1^K - failure, where K is partial transfer count 2029 // 2030 address generate_generic_copy(const char *name, 2031 address byte_copy_entry, address short_copy_entry, 2032 address int_copy_entry, address oop_copy_entry, 2033 address long_copy_entry, address checkcast_copy_entry) { 2034 2035 Label L_failed, L_failed_0, L_objArray; 2036 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 2037 2038 // Input registers 2039 const Register src = c_rarg0; // source array oop 2040 const Register src_pos = c_rarg1; // source position 2041 const Register dst = c_rarg2; // destination array oop 2042 const Register dst_pos = c_rarg3; // destination position 2043 const Register length = c_rarg4; 2044 2045 StubCodeMark mark(this, "StubRoutines", name); 2046 2047 __ align(CodeEntryAlignment); 2048 address start = __ pc(); 2049 2050 __ enter(); // required for proper stackwalking of RuntimeStub frame 2051 2052 // bump this on entry, not on 
exit: 2053 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 2054 2055 //----------------------------------------------------------------------- 2056 // Assembler stub will be used for this call to arraycopy 2057 // if the following conditions are met: 2058 // 2059 // (1) src and dst must not be null. 2060 // (2) src_pos must not be negative. 2061 // (3) dst_pos must not be negative. 2062 // (4) length must not be negative. 2063 // (5) src klass and dst klass should be the same and not NULL. 2064 // (6) src and dst should be arrays. 2065 // (7) src_pos + length must not exceed length of src. 2066 // (8) dst_pos + length must not exceed length of dst. 2067 // 2068 2069 // if (src == NULL) return -1; 2070 __ cbz(src, L_failed); 2071 2072 // if (src_pos < 0) return -1; 2073 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 2074 2075 // if (dst == NULL) return -1; 2076 __ cbz(dst, L_failed); 2077 2078 // if (dst_pos < 0) return -1; 2079 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2080 2081 // registers used as temp 2082 const Register scratch_length = r16; // elements count to copy 2083 const Register scratch_src_klass = r17; // array klass 2084 const Register lh = r18; // layout helper 2085 2086 // if (length < 0) return -1; 2087 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2088 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2089 2090 __ load_klass(scratch_src_klass, src); 2091 #ifdef ASSERT 2092 // assert(src->klass() != NULL); 2093 { 2094 BLOCK_COMMENT("assert klasses not null {"); 2095 Label L1, L2; 2096 __ cbnz(scratch_src_klass, L2); // it is broken if klass is NULL 2097 __ bind(L1); 2098 __ stop("broken null klass"); 2099 __ bind(L2); 2100 __ load_klass(rscratch1, dst); 2101 __ cbz(rscratch1, L1); // this would be broken also 2102 BLOCK_COMMENT("} assert klasses not null done"); 2103 } 2104 #endif 2105 2106 // Load layout helper (32-bits) 2107 // 2108 // |array_tag| | header_size | element_type | |log2_element_size| 2109 // 32 30 24 16 8 2 0 2110 // 2111 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2112 // 2113 2114 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2115 2116 // Handle objArrays completely differently... 2117 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2118 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2119 __ movw(rscratch1, objArray_lh); 2120 __ eorw(rscratch2, lh, rscratch1); 2121 __ cbzw(rscratch2, L_objArray); 2122 2123 // if (src->klass() != dst->klass()) return -1; 2124 __ load_klass(rscratch2, dst); 2125 __ eor(rscratch2, rscratch2, scratch_src_klass); 2126 __ cbnz(rscratch2, L_failed); 2127 2128 // if (!src->is_Array()) return -1; 2129 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2130 2131 // At this point, it is known to be a typeArray (array_tag 0x3). 
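    // For reference, a sketch of how the layout helper fields consulted
    // below decode in C (the shift/mask names are the Klass::_lh_*
    // constants the generated code uses):
    //
    //   int tag     = lh >> Klass::_lh_array_tag_shift;     // 0x3 == typeArray
    //   int hsize   = (lh >> Klass::_lh_header_size_shift)
    //                 & Klass::_lh_header_size_mask;        // header size, bytes
    //   int l2esize = (lh >> Klass::_lh_log2_element_size_shift)
    //                 & Klass::_lh_log2_element_size_mask;  // log2(element size)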
2132 #ifdef ASSERT 2133 { 2134 BLOCK_COMMENT("assert primitive array {"); 2135 Label L; 2136 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2137 __ cmpw(lh, rscratch2); 2138 __ br(Assembler::GE, L); 2139 __ stop("must be a primitive array"); 2140 __ bind(L); 2141 BLOCK_COMMENT("} assert primitive array done"); 2142 } 2143 #endif 2144 2145 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2146 rscratch2, L_failed); 2147 2148 // TypeArrayKlass 2149 // 2150 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2151 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2152 // 2153 2154 const Register rscratch1_offset = rscratch1; // array offset 2155 const Register r18_elsize = lh; // element size 2156 2157 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2158 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2159 __ add(src, src, rscratch1_offset); // src array offset 2160 __ add(dst, dst, rscratch1_offset); // dst array offset 2161 BLOCK_COMMENT("choose copy loop based on element size"); 2162 2163 // next registers should be set before the jump to corresponding stub 2164 const Register from = c_rarg0; // source array address 2165 const Register to = c_rarg1; // destination array address 2166 const Register count = c_rarg2; // elements count 2167 2168 // 'from', 'to', 'count' registers should be set in such order 2169 // since they are the same as 'src', 'src_pos', 'dst'. 2170 2171 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2172 2173 // The possible values of elsize are 0-3, i.e. exact_log2(element 2174 // size in bytes). We do a simple bitwise binary search. 2175 __ BIND(L_copy_bytes); 2176 __ tbnz(r18_elsize, 1, L_copy_ints); 2177 __ tbnz(r18_elsize, 0, L_copy_shorts); 2178 __ lea(from, Address(src, src_pos));// src_addr 2179 __ lea(to, Address(dst, dst_pos));// dst_addr 2180 __ movw(count, scratch_length); // length 2181 __ b(RuntimeAddress(byte_copy_entry)); 2182 2183 __ BIND(L_copy_shorts); 2184 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2185 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2186 __ movw(count, scratch_length); // length 2187 __ b(RuntimeAddress(short_copy_entry)); 2188 2189 __ BIND(L_copy_ints); 2190 __ tbnz(r18_elsize, 0, L_copy_longs); 2191 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2192 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2193 __ movw(count, scratch_length); // length 2194 __ b(RuntimeAddress(int_copy_entry)); 2195 2196 __ BIND(L_copy_longs); 2197 #ifdef ASSERT 2198 { 2199 BLOCK_COMMENT("assert long copy {"); 2200 Label L; 2201 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize 2202 __ cmpw(r18_elsize, LogBytesPerLong); 2203 __ br(Assembler::EQ, L); 2204 __ stop("must be long copy, but elsize is wrong"); 2205 __ bind(L); 2206 BLOCK_COMMENT("} assert long copy done"); 2207 } 2208 #endif 2209 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2210 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2211 __ movw(count, scratch_length); // length 2212 __ b(RuntimeAddress(long_copy_entry)); 2213 2214 // ObjArrayKlass 2215 __ BIND(L_objArray); 2216 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2217 2218 Label L_plain_copy, L_checkcast_copy; 2219 // test array classes for subtyping 2220 __ load_klass(r18, dst); 2221 __ cmp(scratch_src_klass, r18); // usual case is exact 
equality 2222 __ br(Assembler::NE, L_checkcast_copy); 2223 2224 // Identically typed arrays can be copied without element-wise checks. 2225 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2226 rscratch2, L_failed); 2227 2228 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2229 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2230 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2231 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2232 __ movw(count, scratch_length); // length 2233 __ BIND(L_plain_copy); 2234 __ b(RuntimeAddress(oop_copy_entry)); 2235 2236 __ BIND(L_checkcast_copy); 2237 // live at this point: scratch_src_klass, scratch_length, r18 (dst_klass) 2238 { 2239 // Before looking at dst.length, make sure dst is also an objArray. 2240 __ ldrw(rscratch1, Address(r18, lh_offset)); 2241 __ movw(rscratch2, objArray_lh); 2242 __ eorw(rscratch1, rscratch1, rscratch2); 2243 __ cbnzw(rscratch1, L_failed); 2244 2245 // It is safe to examine both src.length and dst.length. 2246 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2247 r18, L_failed); 2248 2249 const Register rscratch2_dst_klass = rscratch2; 2250 __ load_klass(rscratch2_dst_klass, dst); // reload 2251 2252 // Marshal the base address arguments now, freeing registers. 2253 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2254 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2255 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2256 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2257 __ movw(count, length); // length (reloaded) 2258 Register sco_temp = c_rarg3; // this register is free now 2259 assert_different_registers(from, to, count, sco_temp, 2260 rscratch2_dst_klass, scratch_src_klass); 2261 // assert_clean_int(count, sco_temp); 2262 2263 // Generate the type check. 2264 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2265 __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset)); 2266 // assert_clean_int(sco_temp, r18); 2267 generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy); 2268 2269 // Fetch destination element klass from the ObjArrayKlass header. 2270 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2271 __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset)); 2272 __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset)); 2273 2274 // the checkcast_copy loop needs two extra arguments: 2275 assert(c_rarg3 == sco_temp, "#3 already in place"); 2276 // Set up arguments for checkcast_copy_entry. 2277 __ mov(c_rarg4, rscratch2_dst_klass); // dst.klass.element_klass 2278 __ b(RuntimeAddress(checkcast_copy_entry)); 2279 } 2280 2281 __ BIND(L_failed); 2282 __ mov(r0, -1); 2283 __ leave(); // required for proper stackwalking of RuntimeStub frame 2284 __ ret(lr); 2285 2286 return start; 2287 } 2288 2289 // 2290 // Generate stub for array fill. If "aligned" is true, the 2291 // "to" address is assumed to be heapword aligned. 
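  //
  // In outline, the generated code below does (a C-like sketch of the
  // strategy, not literal code):
  //
  //   value = replicate(value, 64);        // bfi: 8/16/32 -> 64 bits
  //   store element-sized pieces until 'to' is 8-byte aligned;
  //   fill_words / zero_words for the bulk;
  //   one final (possibly overlapping) 8-byte store for the tail,
  //   or per-element stores when the whole fill is < 8 bytes.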
2292 //
2293 //  Arguments for generated stub:
2294 //    to:    c_rarg0
2295 //    value: c_rarg1
2296 //    count: c_rarg2 treated as signed
2297 //
2298 address generate_fill(BasicType t, bool aligned, const char *name) {
2299   __ align(CodeEntryAlignment);
2300   StubCodeMark mark(this, "StubRoutines", name);
2301   address start = __ pc();
2302
2303   BLOCK_COMMENT("Entry:");
2304
2305   const Register to        = c_rarg0;  // destination array address
2306   const Register value     = c_rarg1;  // value
2307   const Register count     = c_rarg2;  // elements count
2308
2309   const Register bz_base   = r10;      // base for block_zero routine
2310   const Register cnt_words = r11;      // temp register
2311
2312   __ enter();
2313
2314   Label L_fill_elements, L_exit1;
2315
2316   int shift = -1;
2317   switch (t) {
2318     case T_BYTE:
2319       shift = 0;
2320       __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2321       __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2322       __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2323       __ br(Assembler::LO, L_fill_elements);
2324       break;
2325     case T_SHORT:
2326       shift = 1;
2327       __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2328       __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2329       __ br(Assembler::LO, L_fill_elements);
2330       break;
2331     case T_INT:
2332       shift = 2;
2333       __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2334       __ br(Assembler::LO, L_fill_elements);
2335       break;
2336     default: ShouldNotReachHere();
2337   }
2338
2339   // Align destination address at an 8-byte boundary.
2340   Label L_skip_align1, L_skip_align2, L_skip_align4;
2341   if (!aligned) {
2342     switch (t) {
2343       case T_BYTE:
2344         // One byte misalignment happens only for byte arrays.
2345         __ tbz(to, 0, L_skip_align1);
2346         __ strb(value, Address(__ post(to, 1)));
2347         __ subw(count, count, 1);
2348         __ bind(L_skip_align1);
2349         // Fallthrough
2350       case T_SHORT:
2351         // Two bytes misalignment happens only for byte and short (char) arrays.
2352         __ tbz(to, 1, L_skip_align2);
2353         __ strh(value, Address(__ post(to, 2)));
2354         __ subw(count, count, 2 >> shift);
2355         __ bind(L_skip_align2);
2356         // Fallthrough
2357       case T_INT:
2358         // Align to 8 bytes, we know we are 4 byte aligned to start.
2359         __ tbz(to, 2, L_skip_align4);
2360         __ strw(value, Address(__ post(to, 4)));
2361         __ subw(count, count, 4 >> shift);
2362         __ bind(L_skip_align4);
2363         break;
2364       default: ShouldNotReachHere();
2365     }
2366   }
2367
2368   //
2369   //  Fill large chunks
2370   //
2371   __ lsrw(cnt_words, count, 3 - shift); // number of words
2372   __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2373   __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2374   if (UseBlockZeroing) {
2375     Label non_block_zeroing, rest;
2376     // If the fill value is zero we can use the fast zero_words().
2377     __ cbnz(value, non_block_zeroing);
2378     __ mov(bz_base, to);
2379     __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2380     __ zero_words(bz_base, cnt_words);
2381     __ b(rest);
2382     __ bind(non_block_zeroing);
2383     __ fill_words(to, cnt_words, value);
2384     __ bind(rest);
2385   } else {
2386     __ fill_words(to, cnt_words, value);
2387   }
2388
2389   // Remaining count is less than 8 bytes. Fill it by a single store.
2390   // Note that the total length is no less than 8 bytes.
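  // (Example of the overlapping trick, assuming a jshort fill with
  // count == 3 elements left and 'value' already replicated to 64 bits:
  //
  //   *(uint64_t*)(to + count*2 - 8) = value64;   // sketch
  //
  // rewrites two bytes that already hold the fill value and writes the
  // six remaining ones, which is why the total length must be >= 8.)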
2391 if (t == T_BYTE || t == T_SHORT) { 2392 Label L_exit1; 2393 __ cbzw(count, L_exit1); 2394 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2395 __ str(value, Address(to, -8)); // overwrite some elements 2396 __ bind(L_exit1); 2397 __ leave(); 2398 __ ret(lr); 2399 } 2400 2401 // Handle copies less than 8 bytes. 2402 Label L_fill_2, L_fill_4, L_exit2; 2403 __ bind(L_fill_elements); 2404 switch (t) { 2405 case T_BYTE: 2406 __ tbz(count, 0, L_fill_2); 2407 __ strb(value, Address(__ post(to, 1))); 2408 __ bind(L_fill_2); 2409 __ tbz(count, 1, L_fill_4); 2410 __ strh(value, Address(__ post(to, 2))); 2411 __ bind(L_fill_4); 2412 __ tbz(count, 2, L_exit2); 2413 __ strw(value, Address(to)); 2414 break; 2415 case T_SHORT: 2416 __ tbz(count, 0, L_fill_4); 2417 __ strh(value, Address(__ post(to, 2))); 2418 __ bind(L_fill_4); 2419 __ tbz(count, 1, L_exit2); 2420 __ strw(value, Address(to)); 2421 break; 2422 case T_INT: 2423 __ cbzw(count, L_exit2); 2424 __ strw(value, Address(to)); 2425 break; 2426 default: ShouldNotReachHere(); 2427 } 2428 __ bind(L_exit2); 2429 __ leave(); 2430 __ ret(lr); 2431 return start; 2432 } 2433 2434 void generate_arraycopy_stubs() { 2435 address entry; 2436 address entry_jbyte_arraycopy; 2437 address entry_jshort_arraycopy; 2438 address entry_jint_arraycopy; 2439 address entry_oop_arraycopy; 2440 address entry_jlong_arraycopy; 2441 address entry_checkcast_arraycopy; 2442 2443 generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards); 2444 generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards); 2445 2446 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2447 2448 //*** jbyte 2449 // Always need aligned and unaligned versions 2450 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2451 "jbyte_disjoint_arraycopy"); 2452 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2453 &entry_jbyte_arraycopy, 2454 "jbyte_arraycopy"); 2455 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2456 "arrayof_jbyte_disjoint_arraycopy"); 2457 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL, 2458 "arrayof_jbyte_arraycopy"); 2459 2460 //*** jshort 2461 // Always need aligned and unaligned versions 2462 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2463 "jshort_disjoint_arraycopy"); 2464 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2465 &entry_jshort_arraycopy, 2466 "jshort_arraycopy"); 2467 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, 2468 "arrayof_jshort_disjoint_arraycopy"); 2469 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL, 2470 "arrayof_jshort_arraycopy"); 2471 2472 //*** jint 2473 // Aligned versions 2474 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2475 "arrayof_jint_disjoint_arraycopy"); 2476 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2477 "arrayof_jint_arraycopy"); 2478 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 
2479 // entry_jint_arraycopy always points to the unaligned version 2480 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2481 "jint_disjoint_arraycopy"); 2482 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2483 &entry_jint_arraycopy, 2484 "jint_arraycopy"); 2485 2486 //*** jlong 2487 // It is always aligned 2488 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2489 "arrayof_jlong_disjoint_arraycopy"); 2490 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2491 "arrayof_jlong_arraycopy"); 2492 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2493 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2494 2495 //*** oops 2496 { 2497 // With compressed oops we need unaligned versions; notice that 2498 // we overwrite entry_oop_arraycopy. 2499 bool aligned = !UseCompressedOops; 2500 2501 StubRoutines::_arrayof_oop_disjoint_arraycopy 2502 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2503 /*dest_uninitialized*/false); 2504 StubRoutines::_arrayof_oop_arraycopy 2505 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2506 /*dest_uninitialized*/false); 2507 // Aligned versions without pre-barriers 2508 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2509 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2510 /*dest_uninitialized*/true); 2511 StubRoutines::_arrayof_oop_arraycopy_uninit 2512 = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit", 2513 /*dest_uninitialized*/true); 2514 } 2515 2516 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2517 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2518 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2519 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2520 2521 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2522 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, 2523 /*dest_uninitialized*/true); 2524 2525 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2526 entry_jbyte_arraycopy, 2527 entry_jshort_arraycopy, 2528 entry_jint_arraycopy, 2529 entry_jlong_arraycopy); 2530 2531 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2532 entry_jbyte_arraycopy, 2533 entry_jshort_arraycopy, 2534 entry_jint_arraycopy, 2535 entry_oop_arraycopy, 2536 entry_jlong_arraycopy, 2537 entry_checkcast_arraycopy); 2538 2539 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 2540 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2541 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2542 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2543 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 2544 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2545 } 2546 2547 void generate_math_stubs() { Unimplemented(); } 2548 2549 // Arguments: 2550 // 2551 // Inputs: 2552 // c_rarg0 - source byte array address 2553 // c_rarg1 - destination 
byte array address 2554 // c_rarg2 - K (key) in little endian int array 2555 // 2556 address generate_aescrypt_encryptBlock() { 2557 __ align(CodeEntryAlignment); 2558 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 2559 2560 Label L_doLast; 2561 2562 const Register from = c_rarg0; // source array address 2563 const Register to = c_rarg1; // destination array address 2564 const Register key = c_rarg2; // key array address 2565 const Register keylen = rscratch1; 2566 2567 address start = __ pc(); 2568 __ enter(); 2569 2570 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2571 2572 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2573 2574 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2575 __ rev32(v1, __ T16B, v1); 2576 __ rev32(v2, __ T16B, v2); 2577 __ rev32(v3, __ T16B, v3); 2578 __ rev32(v4, __ T16B, v4); 2579 __ aese(v0, v1); 2580 __ aesmc(v0, v0); 2581 __ aese(v0, v2); 2582 __ aesmc(v0, v0); 2583 __ aese(v0, v3); 2584 __ aesmc(v0, v0); 2585 __ aese(v0, v4); 2586 __ aesmc(v0, v0); 2587 2588 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2589 __ rev32(v1, __ T16B, v1); 2590 __ rev32(v2, __ T16B, v2); 2591 __ rev32(v3, __ T16B, v3); 2592 __ rev32(v4, __ T16B, v4); 2593 __ aese(v0, v1); 2594 __ aesmc(v0, v0); 2595 __ aese(v0, v2); 2596 __ aesmc(v0, v0); 2597 __ aese(v0, v3); 2598 __ aesmc(v0, v0); 2599 __ aese(v0, v4); 2600 __ aesmc(v0, v0); 2601 2602 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2603 __ rev32(v1, __ T16B, v1); 2604 __ rev32(v2, __ T16B, v2); 2605 2606 __ cmpw(keylen, 44); 2607 __ br(Assembler::EQ, L_doLast); 2608 2609 __ aese(v0, v1); 2610 __ aesmc(v0, v0); 2611 __ aese(v0, v2); 2612 __ aesmc(v0, v0); 2613 2614 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2615 __ rev32(v1, __ T16B, v1); 2616 __ rev32(v2, __ T16B, v2); 2617 2618 __ cmpw(keylen, 52); 2619 __ br(Assembler::EQ, L_doLast); 2620 2621 __ aese(v0, v1); 2622 __ aesmc(v0, v0); 2623 __ aese(v0, v2); 2624 __ aesmc(v0, v0); 2625 2626 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2627 __ rev32(v1, __ T16B, v1); 2628 __ rev32(v2, __ T16B, v2); 2629 2630 __ BIND(L_doLast); 2631 2632 __ aese(v0, v1); 2633 __ aesmc(v0, v0); 2634 __ aese(v0, v2); 2635 2636 __ ld1(v1, __ T16B, key); 2637 __ rev32(v1, __ T16B, v1); 2638 __ eor(v0, __ T16B, v0, v1); 2639 2640 __ st1(v0, __ T16B, to); 2641 2642 __ mov(r0, 0); 2643 2644 __ leave(); 2645 __ ret(lr); 2646 2647 return start; 2648 } 2649 2650 // Arguments: 2651 // 2652 // Inputs: 2653 // c_rarg0 - source byte array address 2654 // c_rarg1 - destination byte array address 2655 // c_rarg2 - K (key) in little endian int array 2656 // 2657 address generate_aescrypt_decryptBlock() { 2658 assert(UseAES, "need AES instructions and misaligned SSE support"); 2659 __ align(CodeEntryAlignment); 2660 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 2661 Label L_doLast; 2662 2663 const Register from = c_rarg0; // source array address 2664 const Register to = c_rarg1; // destination array address 2665 const Register key = c_rarg2; // key array address 2666 const Register keylen = rscratch1; 2667 2668 address start = __ pc(); 2669 __ enter(); // required for proper stackwalking of RuntimeStub frame 2670 2671 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2672 2673 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2674 2675 __ ld1(v5, __ T16B, __ post(key, 16)); 2676 __ rev32(v5, __ T16B, v5); 2677 2678 __ ld1(v1, v2, v3, v4, 
__ T16B, __ post(key, 64)); 2679 __ rev32(v1, __ T16B, v1); 2680 __ rev32(v2, __ T16B, v2); 2681 __ rev32(v3, __ T16B, v3); 2682 __ rev32(v4, __ T16B, v4); 2683 __ aesd(v0, v1); 2684 __ aesimc(v0, v0); 2685 __ aesd(v0, v2); 2686 __ aesimc(v0, v0); 2687 __ aesd(v0, v3); 2688 __ aesimc(v0, v0); 2689 __ aesd(v0, v4); 2690 __ aesimc(v0, v0); 2691 2692 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2693 __ rev32(v1, __ T16B, v1); 2694 __ rev32(v2, __ T16B, v2); 2695 __ rev32(v3, __ T16B, v3); 2696 __ rev32(v4, __ T16B, v4); 2697 __ aesd(v0, v1); 2698 __ aesimc(v0, v0); 2699 __ aesd(v0, v2); 2700 __ aesimc(v0, v0); 2701 __ aesd(v0, v3); 2702 __ aesimc(v0, v0); 2703 __ aesd(v0, v4); 2704 __ aesimc(v0, v0); 2705 2706 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2707 __ rev32(v1, __ T16B, v1); 2708 __ rev32(v2, __ T16B, v2); 2709 2710 __ cmpw(keylen, 44); 2711 __ br(Assembler::EQ, L_doLast); 2712 2713 __ aesd(v0, v1); 2714 __ aesimc(v0, v0); 2715 __ aesd(v0, v2); 2716 __ aesimc(v0, v0); 2717 2718 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2719 __ rev32(v1, __ T16B, v1); 2720 __ rev32(v2, __ T16B, v2); 2721 2722 __ cmpw(keylen, 52); 2723 __ br(Assembler::EQ, L_doLast); 2724 2725 __ aesd(v0, v1); 2726 __ aesimc(v0, v0); 2727 __ aesd(v0, v2); 2728 __ aesimc(v0, v0); 2729 2730 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2731 __ rev32(v1, __ T16B, v1); 2732 __ rev32(v2, __ T16B, v2); 2733 2734 __ BIND(L_doLast); 2735 2736 __ aesd(v0, v1); 2737 __ aesimc(v0, v0); 2738 __ aesd(v0, v2); 2739 2740 __ eor(v0, __ T16B, v0, v5); 2741 2742 __ st1(v0, __ T16B, to); 2743 2744 __ mov(r0, 0); 2745 2746 __ leave(); 2747 __ ret(lr); 2748 2749 return start; 2750 } 2751 2752 // Arguments: 2753 // 2754 // Inputs: 2755 // c_rarg0 - source byte array address 2756 // c_rarg1 - destination byte array address 2757 // c_rarg2 - K (key) in little endian int array 2758 // c_rarg3 - r vector byte array address 2759 // c_rarg4 - input length 2760 // 2761 // Output: 2762 // x0 - input length 2763 // 2764 address generate_cipherBlockChaining_encryptAESCrypt() { 2765 assert(UseAES, "need AES instructions and misaligned SSE support"); 2766 __ align(CodeEntryAlignment); 2767 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 2768 2769 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2770 2771 const Register from = c_rarg0; // source array address 2772 const Register to = c_rarg1; // destination array address 2773 const Register key = c_rarg2; // key array address 2774 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2775 // and left with the results of the last encryption block 2776 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2777 const Register keylen = rscratch1; 2778 2779 address start = __ pc(); 2780 2781 __ enter(); 2782 2783 __ movw(rscratch2, len_reg); 2784 2785 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2786 2787 __ ld1(v0, __ T16B, rvec); 2788 2789 __ cmpw(keylen, 52); 2790 __ br(Assembler::CC, L_loadkeys_44); 2791 __ br(Assembler::EQ, L_loadkeys_52); 2792 2793 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2794 __ rev32(v17, __ T16B, v17); 2795 __ rev32(v18, __ T16B, v18); 2796 __ BIND(L_loadkeys_52); 2797 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2798 __ rev32(v19, __ T16B, v19); 2799 __ rev32(v20, __ T16B, v20); 2800 __ BIND(L_loadkeys_44); 2801 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2802 __ rev32(v21, __ 
T16B, v21); 2803 __ rev32(v22, __ T16B, v22); 2804 __ rev32(v23, __ T16B, v23); 2805 __ rev32(v24, __ T16B, v24); 2806 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2807 __ rev32(v25, __ T16B, v25); 2808 __ rev32(v26, __ T16B, v26); 2809 __ rev32(v27, __ T16B, v27); 2810 __ rev32(v28, __ T16B, v28); 2811 __ ld1(v29, v30, v31, __ T16B, key); 2812 __ rev32(v29, __ T16B, v29); 2813 __ rev32(v30, __ T16B, v30); 2814 __ rev32(v31, __ T16B, v31); 2815 2816 __ BIND(L_aes_loop); 2817 __ ld1(v1, __ T16B, __ post(from, 16)); 2818 __ eor(v0, __ T16B, v0, v1); 2819 2820 __ br(Assembler::CC, L_rounds_44); 2821 __ br(Assembler::EQ, L_rounds_52); 2822 2823 __ aese(v0, v17); __ aesmc(v0, v0); 2824 __ aese(v0, v18); __ aesmc(v0, v0); 2825 __ BIND(L_rounds_52); 2826 __ aese(v0, v19); __ aesmc(v0, v0); 2827 __ aese(v0, v20); __ aesmc(v0, v0); 2828 __ BIND(L_rounds_44); 2829 __ aese(v0, v21); __ aesmc(v0, v0); 2830 __ aese(v0, v22); __ aesmc(v0, v0); 2831 __ aese(v0, v23); __ aesmc(v0, v0); 2832 __ aese(v0, v24); __ aesmc(v0, v0); 2833 __ aese(v0, v25); __ aesmc(v0, v0); 2834 __ aese(v0, v26); __ aesmc(v0, v0); 2835 __ aese(v0, v27); __ aesmc(v0, v0); 2836 __ aese(v0, v28); __ aesmc(v0, v0); 2837 __ aese(v0, v29); __ aesmc(v0, v0); 2838 __ aese(v0, v30); 2839 __ eor(v0, __ T16B, v0, v31); 2840 2841 __ st1(v0, __ T16B, __ post(to, 16)); 2842 2843 __ subw(len_reg, len_reg, 16); 2844 __ cbnzw(len_reg, L_aes_loop); 2845 2846 __ st1(v0, __ T16B, rvec); 2847 2848 __ mov(r0, rscratch2); 2849 2850 __ leave(); 2851 __ ret(lr); 2852 2853 return start; 2854 } 2855 2856 // Arguments: 2857 // 2858 // Inputs: 2859 // c_rarg0 - source byte array address 2860 // c_rarg1 - destination byte array address 2861 // c_rarg2 - K (key) in little endian int array 2862 // c_rarg3 - r vector byte array address 2863 // c_rarg4 - input length 2864 // 2865 // Output: 2866 // r0 - input length 2867 // 2868 address generate_cipherBlockChaining_decryptAESCrypt() { 2869 assert(UseAES, "need AES instructions and misaligned SSE support"); 2870 __ align(CodeEntryAlignment); 2871 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 2872 2873 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2874 2875 const Register from = c_rarg0; // source array address 2876 const Register to = c_rarg1; // destination array address 2877 const Register key = c_rarg2; // key array address 2878 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2879 // and left with the results of the last encryption block 2880 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2881 const Register keylen = rscratch1; 2882 2883 address start = __ pc(); 2884 2885 __ enter(); 2886 2887 __ movw(rscratch2, len_reg); 2888 2889 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2890 2891 __ ld1(v2, __ T16B, rvec); 2892 2893 __ ld1(v31, __ T16B, __ post(key, 16)); 2894 __ rev32(v31, __ T16B, v31); 2895 2896 __ cmpw(keylen, 52); 2897 __ br(Assembler::CC, L_loadkeys_44); 2898 __ br(Assembler::EQ, L_loadkeys_52); 2899 2900 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2901 __ rev32(v17, __ T16B, v17); 2902 __ rev32(v18, __ T16B, v18); 2903 __ BIND(L_loadkeys_52); 2904 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2905 __ rev32(v19, __ T16B, v19); 2906 __ rev32(v20, __ T16B, v20); 2907 __ BIND(L_loadkeys_44); 2908 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2909 __ rev32(v21, __ T16B, v21); 2910 
__ rev32(v22, __ T16B, v22); 2911 __ rev32(v23, __ T16B, v23); 2912 __ rev32(v24, __ T16B, v24); 2913 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2914 __ rev32(v25, __ T16B, v25); 2915 __ rev32(v26, __ T16B, v26); 2916 __ rev32(v27, __ T16B, v27); 2917 __ rev32(v28, __ T16B, v28); 2918 __ ld1(v29, v30, __ T16B, key); 2919 __ rev32(v29, __ T16B, v29); 2920 __ rev32(v30, __ T16B, v30); 2921 2922 __ BIND(L_aes_loop); 2923 __ ld1(v0, __ T16B, __ post(from, 16)); 2924 __ orr(v1, __ T16B, v0, v0); 2925 2926 __ br(Assembler::CC, L_rounds_44); 2927 __ br(Assembler::EQ, L_rounds_52); 2928 2929 __ aesd(v0, v17); __ aesimc(v0, v0); 2930 __ aesd(v0, v18); __ aesimc(v0, v0); 2931 __ BIND(L_rounds_52); 2932 __ aesd(v0, v19); __ aesimc(v0, v0); 2933 __ aesd(v0, v20); __ aesimc(v0, v0); 2934 __ BIND(L_rounds_44); 2935 __ aesd(v0, v21); __ aesimc(v0, v0); 2936 __ aesd(v0, v22); __ aesimc(v0, v0); 2937 __ aesd(v0, v23); __ aesimc(v0, v0); 2938 __ aesd(v0, v24); __ aesimc(v0, v0); 2939 __ aesd(v0, v25); __ aesimc(v0, v0); 2940 __ aesd(v0, v26); __ aesimc(v0, v0); 2941 __ aesd(v0, v27); __ aesimc(v0, v0); 2942 __ aesd(v0, v28); __ aesimc(v0, v0); 2943 __ aesd(v0, v29); __ aesimc(v0, v0); 2944 __ aesd(v0, v30); 2945 __ eor(v0, __ T16B, v0, v31); 2946 __ eor(v0, __ T16B, v0, v2); 2947 2948 __ st1(v0, __ T16B, __ post(to, 16)); 2949 __ orr(v2, __ T16B, v1, v1); 2950 2951 __ subw(len_reg, len_reg, 16); 2952 __ cbnzw(len_reg, L_aes_loop); 2953 2954 __ st1(v2, __ T16B, rvec); 2955 2956 __ mov(r0, rscratch2); 2957 2958 __ leave(); 2959 __ ret(lr); 2960 2961 return start; 2962 } 2963 2964 // Arguments: 2965 // 2966 // Inputs: 2967 // c_rarg0 - byte[] source+offset 2968 // c_rarg1 - int[] SHA.state 2969 // c_rarg2 - int offset 2970 // c_rarg3 - int limit 2971 // 2972 address generate_sha1_implCompress(bool multi_block, const char *name) { 2973 __ align(CodeEntryAlignment); 2974 StubCodeMark mark(this, "StubRoutines", name); 2975 address start = __ pc(); 2976 2977 Register buf = c_rarg0; 2978 Register state = c_rarg1; 2979 Register ofs = c_rarg2; 2980 Register limit = c_rarg3; 2981 2982 Label keys; 2983 Label sha1_loop; 2984 2985 // load the keys into v0..v3 2986 __ adr(rscratch1, keys); 2987 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 2988 // load 5 words state into v6, v7 2989 __ ldrq(v6, Address(state, 0)); 2990 __ ldrs(v7, Address(state, 16)); 2991 2992 2993 __ BIND(sha1_loop); 2994 // load 64 bytes of data into v16..v19 2995 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 2996 __ rev32(v16, __ T16B, v16); 2997 __ rev32(v17, __ T16B, v17); 2998 __ rev32(v18, __ T16B, v18); 2999 __ rev32(v19, __ T16B, v19); 3000 3001 // do the sha1 3002 __ addv(v4, __ T4S, v16, v0); 3003 __ orr(v20, __ T16B, v6, v6); 3004 3005 FloatRegister d0 = v16; 3006 FloatRegister d1 = v17; 3007 FloatRegister d2 = v18; 3008 FloatRegister d3 = v19; 3009 3010 for (int round = 0; round < 20; round++) { 3011 FloatRegister tmp1 = (round & 1) ? v4 : v5; 3012 FloatRegister tmp2 = (round & 1) ? v21 : v22; 3013 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 3014 FloatRegister tmp4 = (round & 1) ? v5 : v4; 3015 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? 
v2 : v3)); 3016 3017 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 3018 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 3019 __ sha1h(tmp2, __ T4S, v20); 3020 if (round < 5) 3021 __ sha1c(v20, __ T4S, tmp3, tmp4); 3022 else if (round < 10 || round >= 15) 3023 __ sha1p(v20, __ T4S, tmp3, tmp4); 3024 else 3025 __ sha1m(v20, __ T4S, tmp3, tmp4); 3026 if (round < 16) __ sha1su1(d0, __ T4S, d3); 3027 3028 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3029 } 3030 3031 __ addv(v7, __ T2S, v7, v21); 3032 __ addv(v6, __ T4S, v6, v20); 3033 3034 if (multi_block) { 3035 __ add(ofs, ofs, 64); 3036 __ cmp(ofs, limit); 3037 __ br(Assembler::LE, sha1_loop); 3038 __ mov(c_rarg0, ofs); // return ofs 3039 } 3040 3041 __ strq(v6, Address(state, 0)); 3042 __ strs(v7, Address(state, 16)); 3043 3044 __ ret(lr); 3045 3046 __ bind(keys); 3047 __ emit_int32(0x5a827999); 3048 __ emit_int32(0x6ed9eba1); 3049 __ emit_int32(0x8f1bbcdc); 3050 __ emit_int32(0xca62c1d6); 3051 3052 return start; 3053 } 3054 3055 3056 // Arguments: 3057 // 3058 // Inputs: 3059 // c_rarg0 - byte[] source+offset 3060 // c_rarg1 - int[] SHA.state 3061 // c_rarg2 - int offset 3062 // c_rarg3 - int limit 3063 // 3064 address generate_sha256_implCompress(bool multi_block, const char *name) { 3065 static const uint32_t round_consts[64] = { 3066 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3067 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3068 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3069 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3070 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3071 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3072 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3073 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3074 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3075 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3076 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3077 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3078 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3079 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3080 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3081 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3082 }; 3083 __ align(CodeEntryAlignment); 3084 StubCodeMark mark(this, "StubRoutines", name); 3085 address start = __ pc(); 3086 3087 Register buf = c_rarg0; 3088 Register state = c_rarg1; 3089 Register ofs = c_rarg2; 3090 Register limit = c_rarg3; 3091 3092 Label sha1_loop; 3093 3094 __ stpd(v8, v9, __ pre(sp, -32)); 3095 __ stpd(v10, v11, Address(sp, 16)); 3096 3097 // dga == v0 3098 // dgb == v1 3099 // dg0 == v2 3100 // dg1 == v3 3101 // dg2 == v4 3102 // t0 == v6 3103 // t1 == v7 3104 3105 // load 16 keys to v16..v31 3106 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3107 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3108 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3109 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3110 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3111 3112 // load 8 words (256 bits) state 3113 __ ldpq(v0, v1, state); 3114 3115 __ BIND(sha1_loop); 3116 // load 64 bytes of data into v8..v11 3117 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? 
__ post(buf, 64) : buf);
3118   __ rev32(v8, __ T16B, v8);
3119   __ rev32(v9, __ T16B, v9);
3120   __ rev32(v10, __ T16B, v10);
3121   __ rev32(v11, __ T16B, v11);
3122
3123   __ addv(v6, __ T4S, v8, v16);
3124   __ orr(v2, __ T16B, v0, v0);
3125   __ orr(v3, __ T16B, v1, v1);
3126
3127   FloatRegister d0 = v8;
3128   FloatRegister d1 = v9;
3129   FloatRegister d2 = v10;
3130   FloatRegister d3 = v11;
3131
3132
3133   for (int round = 0; round < 16; round++) {
3134     FloatRegister tmp1 = (round & 1) ? v6 : v7;
3135     FloatRegister tmp2 = (round & 1) ? v7 : v6;
3136     FloatRegister tmp3 = (round & 1) ? v2 : v4;
3137     FloatRegister tmp4 = (round & 1) ? v4 : v2;
3138
3139     if (round < 12) __ sha256su0(d0, __ T4S, d1);
3140     __ orr(v4, __ T16B, v2, v2);
3141     if (round < 15)
3142       __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3143     __ sha256h(v2, __ T4S, v3, tmp2);
3144     __ sha256h2(v3, __ T4S, v4, tmp2);
3145     if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3146
3147     tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3148   }
3149
3150   __ addv(v0, __ T4S, v0, v2);
3151   __ addv(v1, __ T4S, v1, v3);
3152
3153   if (multi_block) {
3154     __ add(ofs, ofs, 64);
3155     __ cmp(ofs, limit);
3156     __ br(Assembler::LE, sha1_loop);
3157     __ mov(c_rarg0, ofs); // return ofs
3158   }
3159
3160   __ ldpd(v10, v11, Address(sp, 16));
3161   __ ldpd(v8, v9, __ post(sp, 32));
3162
3163   __ stpq(v0, v1, state);
3164
3165   __ ret(lr);
3166
3167   return start;
3168 }
3169
3170 #ifndef BUILTIN_SIM
3171 // Safefetch stubs.
3172 void generate_safefetch(const char* name, int size, address* entry,
3173                         address* fault_pc, address* continuation_pc) {
3174   // safefetch signatures:
3175   //   int      SafeFetch32(int*      adr, int      errValue);
3176   //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3177   //
3178   // arguments:
3179   //   c_rarg0 = adr
3180   //   c_rarg1 = errValue
3181   //
3182   // result:
3183   //   r0 = *adr or errValue
3184
3185   StubCodeMark mark(this, "StubRoutines", name);
3186
3187   // Entry point, pc or function descriptor.
3188   *entry = __ pc();
3189
3190   // Load *adr into c_rarg1, may fault.
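  // (Callers use these stubs to probe memory that may be unmapped:
  // the VM's signal handler recognizes a fault at *fault_pc and
  // resumes at *continuation_pc with errValue still in c_rarg1.
  // Sketch of a call site:
  //
  //   int v = SafeFetch32(p, 0xBAD);  // yields 0xBAD if p faults
  // )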
3191   *fault_pc = __ pc();
3192   switch (size) {
3193     case 4:
3194       // int32_t
3195       __ ldrw(c_rarg1, Address(c_rarg0, 0));
3196       break;
3197     case 8:
3198       // int64_t
3199       __ ldr(c_rarg1, Address(c_rarg0, 0));
3200       break;
3201     default:
3202       ShouldNotReachHere();
3203   }
3204
3205   // return errValue or *adr
3206   *continuation_pc = __ pc();
3207   __ mov(r0, c_rarg1);
3208   __ ret(lr);
3209 }
3210 #endif
3211
3212 /**
3213  *  Arguments:
3214  *
3215  * Inputs:
3216  *   c_rarg0   - int crc
3217  *   c_rarg1   - byte* buf
3218  *   c_rarg2   - int length
3219  *
3220  * Output:
3221  *   r0   - int crc result
3222  */
3223 address generate_updateBytesCRC32() {
3224   assert(UseCRC32Intrinsics, "what are we doing here?");
3225
3226   __ align(CodeEntryAlignment);
3227   StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3228
3229   address start = __ pc();
3230
3231   const Register crc    = c_rarg0;  // crc
3232   const Register buf    = c_rarg1;  // source java byte array address
3233   const Register len    = c_rarg2;  // length
3234   const Register table0 = c_rarg3;  // crc_table address
3235   const Register table1 = c_rarg4;
3236   const Register table2 = c_rarg5;
3237   const Register table3 = c_rarg6;
3238   const Register tmp3   = c_rarg7;
3239
3240   BLOCK_COMMENT("Entry:");
3241   __ enter(); // required for proper stackwalking of RuntimeStub frame
3242
3243   __ kernel_crc32(crc, buf, len,
3244                   table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3245
3246   __ leave(); // required for proper stackwalking of RuntimeStub frame
3247   __ ret(lr);
3248
3249   return start;
3250 }
3251
3252 /**
3253  *  Arguments:
3254  *
3255  * Inputs:
3256  *   c_rarg0   - int crc
3257  *   c_rarg1   - byte* buf
3258  *   c_rarg2   - int length
3259  *   c_rarg3   - int* table
3260  *
3261  * Output:
3262  *   r0   - int crc result
3263  */
3264 address generate_updateBytesCRC32C() {
3265   assert(UseCRC32CIntrinsics, "what are we doing here?");
3266
3267   __ align(CodeEntryAlignment);
3268   StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3269
3270   address start = __ pc();
3271
3272   const Register crc    = c_rarg0;  // crc
3273   const Register buf    = c_rarg1;  // source java byte array address
3274   const Register len    = c_rarg2;  // length
3275   const Register table0 = c_rarg3;  // crc_table address
3276   const Register table1 = c_rarg4;
3277   const Register table2 = c_rarg5;
3278   const Register table3 = c_rarg6;
3279   const Register tmp3   = c_rarg7;
3280
3281   BLOCK_COMMENT("Entry:");
3282   __ enter(); // required for proper stackwalking of RuntimeStub frame
3283
3284   __ kernel_crc32c(crc, buf, len,
3285                    table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3286
3287   __ leave(); // required for proper stackwalking of RuntimeStub frame
3288   __ ret(lr);
3289
3290   return start;
3291 }
3292
3293 /**
3294  *  Arguments:
3295  *
3296  * Inputs:
3297  *   c_rarg0   - int adler
3298  *   c_rarg1   - byte* buff
3299  *   c_rarg2   - int len
3300  *
3301  * Output:
3302  *   c_rarg0   - int adler result
3303  */
3304 address generate_updateBytesAdler32() {
3305   __ align(CodeEntryAlignment);
3306   StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3307   address start = __ pc();
3308
3309   Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3310
3311   // Aliases
3312   Register adler  = c_rarg0;
3313   Register s1     = c_rarg0;
3314   Register s2     = c_rarg3;
3315   Register buff   = c_rarg1;
3316   Register len    = c_rarg2;
3317   Register nmax   = r4;
3318   Register base   = r5;
3319   Register count  = r6;
3320   Register temp0  = rscratch1;
3321   Register temp1  = rscratch2;
3322   Register temp2  = r7;
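  // Adler-32 keeps two running sums over the input bytes b[i], both
  // mod BASE = 65521: s1 = 1 + sum(b[i]) and s2 = the sum of the
  // successive s1 values.  Because BASE == 2^16 - 15, a value x can be
  // mostly reduced without a division; the lsr/lsl/sub/add sequences
  // below compute exactly this (C sketch):
  //
  //   x = (x & 0xffff) + 15 * (x >> 16);  // x mod BASE, almost
  //   if (x >= BASE) x -= BASE;           // final conditional subtract
  //
  // (applied twice where x can still exceed 16 bits after one pass).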
3323 3324 // Max number of bytes we can process before having to take the mod 3325 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 3326 unsigned long BASE = 0xfff1; 3327 unsigned long NMAX = 0x15B0; 3328 3329 __ mov(base, BASE); 3330 __ mov(nmax, NMAX); 3331 3332 // s1 is initialized to the lower 16 bits of adler 3333 // s2 is initialized to the upper 16 bits of adler 3334 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 3335 __ uxth(s1, adler); // s1 = (adler & 0xffff) 3336 3337 // The pipelined loop needs at least 16 elements for 1 iteration 3338 // It does check this, but it is more effective to skip to the cleanup loop 3339 __ cmp(len, 16); 3340 __ br(Assembler::HS, L_nmax); 3341 __ cbz(len, L_combine); 3342 3343 __ bind(L_simple_by1_loop); 3344 __ ldrb(temp0, Address(__ post(buff, 1))); 3345 __ add(s1, s1, temp0); 3346 __ add(s2, s2, s1); 3347 __ subs(len, len, 1); 3348 __ br(Assembler::HI, L_simple_by1_loop); 3349 3350 // s1 = s1 % BASE 3351 __ subs(temp0, s1, base); 3352 __ csel(s1, temp0, s1, Assembler::HS); 3353 3354 // s2 = s2 % BASE 3355 __ lsr(temp0, s2, 16); 3356 __ lsl(temp1, temp0, 4); 3357 __ sub(temp1, temp1, temp0); 3358 __ add(s2, temp1, s2, ext::uxth); 3359 3360 __ subs(temp0, s2, base); 3361 __ csel(s2, temp0, s2, Assembler::HS); 3362 3363 __ b(L_combine); 3364 3365 __ bind(L_nmax); 3366 __ subs(len, len, nmax); 3367 __ sub(count, nmax, 16); 3368 __ br(Assembler::LO, L_by16); 3369 3370 __ bind(L_nmax_loop); 3371 3372 __ ldp(temp0, temp1, Address(__ post(buff, 16))); 3373 3374 __ add(s1, s1, temp0, ext::uxtb); 3375 __ ubfx(temp2, temp0, 8, 8); 3376 __ add(s2, s2, s1); 3377 __ add(s1, s1, temp2); 3378 __ ubfx(temp2, temp0, 16, 8); 3379 __ add(s2, s2, s1); 3380 __ add(s1, s1, temp2); 3381 __ ubfx(temp2, temp0, 24, 8); 3382 __ add(s2, s2, s1); 3383 __ add(s1, s1, temp2); 3384 __ ubfx(temp2, temp0, 32, 8); 3385 __ add(s2, s2, s1); 3386 __ add(s1, s1, temp2); 3387 __ ubfx(temp2, temp0, 40, 8); 3388 __ add(s2, s2, s1); 3389 __ add(s1, s1, temp2); 3390 __ ubfx(temp2, temp0, 48, 8); 3391 __ add(s2, s2, s1); 3392 __ add(s1, s1, temp2); 3393 __ add(s2, s2, s1); 3394 __ add(s1, s1, temp0, Assembler::LSR, 56); 3395 __ add(s2, s2, s1); 3396 3397 __ add(s1, s1, temp1, ext::uxtb); 3398 __ ubfx(temp2, temp1, 8, 8); 3399 __ add(s2, s2, s1); 3400 __ add(s1, s1, temp2); 3401 __ ubfx(temp2, temp1, 16, 8); 3402 __ add(s2, s2, s1); 3403 __ add(s1, s1, temp2); 3404 __ ubfx(temp2, temp1, 24, 8); 3405 __ add(s2, s2, s1); 3406 __ add(s1, s1, temp2); 3407 __ ubfx(temp2, temp1, 32, 8); 3408 __ add(s2, s2, s1); 3409 __ add(s1, s1, temp2); 3410 __ ubfx(temp2, temp1, 40, 8); 3411 __ add(s2, s2, s1); 3412 __ add(s1, s1, temp2); 3413 __ ubfx(temp2, temp1, 48, 8); 3414 __ add(s2, s2, s1); 3415 __ add(s1, s1, temp2); 3416 __ add(s2, s2, s1); 3417 __ add(s1, s1, temp1, Assembler::LSR, 56); 3418 __ add(s2, s2, s1); 3419 3420 __ subs(count, count, 16); 3421 __ br(Assembler::HS, L_nmax_loop); 3422 3423 // s1 = s1 % BASE 3424 __ lsr(temp0, s1, 16); 3425 __ lsl(temp1, temp0, 4); 3426 __ sub(temp1, temp1, temp0); 3427 __ add(temp1, temp1, s1, ext::uxth); 3428 3429 __ lsr(temp0, temp1, 16); 3430 __ lsl(s1, temp0, 4); 3431 __ sub(s1, s1, temp0); 3432 __ add(s1, s1, temp1, ext:: uxth); 3433 3434 __ subs(temp0, s1, base); 3435 __ csel(s1, temp0, s1, Assembler::HS); 3436 3437 // s2 = s2 % BASE 3438 __ lsr(temp0, s2, 16); 3439 __ lsl(temp1, temp0, 4); 3440 __ sub(temp1, temp1, temp0); 3441 __ add(temp1, temp1, s2, ext::uxth); 3442 3443 __ lsr(temp0, temp1, 16); 
3444   __ lsl(s2, temp0, 4);
3445   __ sub(s2, s2, temp0);
3446   __ add(s2, s2, temp1, ext::uxth);
3447
3448   __ subs(temp0, s2, base);
3449   __ csel(s2, temp0, s2, Assembler::HS);
3450
3451   __ subs(len, len, nmax);
3452   __ sub(count, nmax, 16);
3453   __ br(Assembler::HS, L_nmax_loop);
3454
3455   __ bind(L_by16);
3456   __ adds(len, len, count);
3457   __ br(Assembler::LO, L_by1);
3458
3459   __ bind(L_by16_loop);
3460
3461   __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3462
3463   __ add(s1, s1, temp0, ext::uxtb);
3464   __ ubfx(temp2, temp0, 8, 8);
3465   __ add(s2, s2, s1);
3466   __ add(s1, s1, temp2);
3467   __ ubfx(temp2, temp0, 16, 8);
3468   __ add(s2, s2, s1);
3469   __ add(s1, s1, temp2);
3470   __ ubfx(temp2, temp0, 24, 8);
3471   __ add(s2, s2, s1);
3472   __ add(s1, s1, temp2);
3473   __ ubfx(temp2, temp0, 32, 8);
3474   __ add(s2, s2, s1);
3475   __ add(s1, s1, temp2);
3476   __ ubfx(temp2, temp0, 40, 8);
3477   __ add(s2, s2, s1);
3478   __ add(s1, s1, temp2);
3479   __ ubfx(temp2, temp0, 48, 8);
3480   __ add(s2, s2, s1);
3481   __ add(s1, s1, temp2);
3482   __ add(s2, s2, s1);
3483   __ add(s1, s1, temp0, Assembler::LSR, 56);
3484   __ add(s2, s2, s1);
3485
3486   __ add(s1, s1, temp1, ext::uxtb);
3487   __ ubfx(temp2, temp1, 8, 8);
3488   __ add(s2, s2, s1);
3489   __ add(s1, s1, temp2);
3490   __ ubfx(temp2, temp1, 16, 8);
3491   __ add(s2, s2, s1);
3492   __ add(s1, s1, temp2);
3493   __ ubfx(temp2, temp1, 24, 8);
3494   __ add(s2, s2, s1);
3495   __ add(s1, s1, temp2);
3496   __ ubfx(temp2, temp1, 32, 8);
3497   __ add(s2, s2, s1);
3498   __ add(s1, s1, temp2);
3499   __ ubfx(temp2, temp1, 40, 8);
3500   __ add(s2, s2, s1);
3501   __ add(s1, s1, temp2);
3502   __ ubfx(temp2, temp1, 48, 8);
3503   __ add(s2, s2, s1);
3504   __ add(s1, s1, temp2);
3505   __ add(s2, s2, s1);
3506   __ add(s1, s1, temp1, Assembler::LSR, 56);
3507   __ add(s2, s2, s1);
3508
3509   __ subs(len, len, 16);
3510   __ br(Assembler::HS, L_by16_loop);
3511
3512   __ bind(L_by1);
3513   __ adds(len, len, 15);
3514   __ br(Assembler::LO, L_do_mod);
3515
3516   __ bind(L_by1_loop);
3517   __ ldrb(temp0, Address(__ post(buff, 1)));
3518   __ add(s1, temp0, s1);
3519   __ add(s2, s2, s1);
3520   __ subs(len, len, 1);
3521   __ br(Assembler::HS, L_by1_loop);
3522
3523   __ bind(L_do_mod);
3524   // s1 = s1 % BASE
3525   __ lsr(temp0, s1, 16);
3526   __ lsl(temp1, temp0, 4);
3527   __ sub(temp1, temp1, temp0);
3528   __ add(temp1, temp1, s1, ext::uxth);
3529
3530   __ lsr(temp0, temp1, 16);
3531   __ lsl(s1, temp0, 4);
3532   __ sub(s1, s1, temp0);
3533   __ add(s1, s1, temp1, ext::uxth);
3534
3535   __ subs(temp0, s1, base);
3536   __ csel(s1, temp0, s1, Assembler::HS);
3537
3538   // s2 = s2 % BASE
3539   __ lsr(temp0, s2, 16);
3540   __ lsl(temp1, temp0, 4);
3541   __ sub(temp1, temp1, temp0);
3542   __ add(temp1, temp1, s2, ext::uxth);
3543
3544   __ lsr(temp0, temp1, 16);
3545   __ lsl(s2, temp0, 4);
3546   __ sub(s2, s2, temp0);
3547   __ add(s2, s2, temp1, ext::uxth);
3548
3549   __ subs(temp0, s2, base);
3550   __ csel(s2, temp0, s2, Assembler::HS);
3551
3552   // Combine lower bits and higher bits
3553   __ bind(L_combine);
3554   __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3555
3556   __ ret(lr);
3557
3558   return start;
3559 }
3560
3561 /**
3562  *  Arguments:
3563  *
3564  *  Input:
3565  *    c_rarg0   - x address
3566  *    c_rarg1   - x length
3567  *    c_rarg2   - y address
3568  *    c_rarg3   - y length
3569  *    c_rarg4   - z address
3570  *    c_rarg5   - z length
3571  */
3572 address generate_multiplyToLen() {
3573   __ align(CodeEntryAlignment);
3574   StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3575
3576   address start = __ pc();
3577   const Register x     = r0;
3578 const Register xlen = r1; 3579 const Register y = r2; 3580 const Register ylen = r3; 3581 const Register z = r4; 3582 const Register zlen = r5; 3583 3584 const Register tmp1 = r10; 3585 const Register tmp2 = r11; 3586 const Register tmp3 = r12; 3587 const Register tmp4 = r13; 3588 const Register tmp5 = r14; 3589 const Register tmp6 = r15; 3590 const Register tmp7 = r16; 3591 3592 BLOCK_COMMENT("Entry:"); 3593 __ enter(); // required for proper stackwalking of RuntimeStub frame 3594 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 3595 __ leave(); // required for proper stackwalking of RuntimeStub frame 3596 __ ret(lr); 3597 3598 return start; 3599 } 3600 3601 void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi, 3602 FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0, 3603 FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) { 3604 // Karatsuba multiplication performs a 128*128 -> 256-bit 3605 // multiplication in three 128-bit multiplications and a few 3606 // additions. 3607 // 3608 // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1) 3609 // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0 3610 // 3611 // Inputs: 3612 // 3613 // A0 in a.d[0] (subkey) 3614 // A1 in a.d[1] 3615 // (A1+A0) in a1_xor_a0.d[0] 3616 // 3617 // B0 in b.d[0] (state) 3618 // B1 in b.d[1] 3619 3620 __ ext(tmp1, __ T16B, b, b, 0x08); 3621 __ pmull2(result_hi, __ T1Q, b, a, __ T2D); // A1*B1 3622 __ eor(tmp1, __ T16B, tmp1, b); // (B1+B0) 3623 __ pmull(result_lo, __ T1Q, b, a, __ T1D); // A0*B0 3624 __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0) 3625 3626 __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08); 3627 __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0 3628 __ eor(tmp2, __ T16B, tmp2, tmp4); 3629 __ eor(tmp2, __ T16B, tmp2, tmp3); 3630 3631 // Register pair <result_hi:result_lo> holds the result of carry-less multiplication 3632 __ ins(result_hi, __ D, tmp2, 0, 1); 3633 __ ins(result_lo, __ D, tmp2, 1, 0); 3634 } 3635 3636 void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi, 3637 FloatRegister p, FloatRegister z, FloatRegister t1) { 3638 const FloatRegister t0 = result; 3639 3640 // The GCM field polynomial f is z^128 + p(z), where p = 3641 // z^7+z^2+z+1. 3642 // 3643 // z^128 === -p(z) (mod (z^128 + p(z))) 3644 // 3645 // so, given that the product we're reducing is 3646 // a == lo + hi * z^128 3647 // substituting, 3648 // === lo - hi * p(z) (mod (z^128 + p(z))) 3649 // 3650 // we reduce by multiplying hi by p(z) and subtracting the result 3651 // from (i.e. XORing it with) lo. Because p has no nonzero high 3652 // bits we can do this with two 64-bit multiplications, lo*p and 3653 // hi*p. 3654 3655 __ pmull2(t0, __ T1Q, hi, p, __ T2D); 3656 __ ext(t1, __ T16B, t0, z, 8); 3657 __ eor(hi, __ T16B, hi, t1); 3658 __ ext(t1, __ T16B, z, t0, 8); 3659 __ eor(lo, __ T16B, lo, t1); 3660 __ pmull(t0, __ T1Q, hi, p, __ T1D); 3661 __ eor(result, __ T16B, lo, t0); 3662 } 3663 3664 /** 3665 * Arguments: 3666 * 3667 * Input: 3668 * c_rarg0 - current state address 3669 * c_rarg1 - H key address 3670 * c_rarg2 - data address 3671 * c_rarg3 - number of blocks 3672 * 3673 * Output: 3674 * Updated state at c_rarg0 3675 */ 3676 address generate_ghash_processBlocks() { 3677 // Bafflingly, GCM uses little-endian for the byte order, but 3678 // big-endian for the bit order. 
For example, the polynomial 1 is
3679 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
3680 //
3681 // So, we must either reverse the bytes in each word and do
3682 // everything big-endian or reverse the bits in each byte and do
3683 // it little-endian. On AArch64 it's more idiomatic to reverse
3684 // the bits in each byte (we have an instruction, RBIT, to do
3685 // that) and keep the data in little-endian bit order throughout the
3686 // calculation, bit-reversing the inputs and outputs.
3687
3688 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
3689 __ align(wordSize * 2);
3690 address p = __ pc();
3691 __ emit_int64(0x87); // The low-order bits of the field
3692 // polynomial (i.e. p = z^7+z^2+z+1)
3693 // repeated in the low and high parts of a
3694 // 128-bit vector
3695 __ emit_int64(0x87);
3696
3697 __ align(CodeEntryAlignment);
3698 address start = __ pc();
3699
3700 Register state = c_rarg0;
3701 Register subkeyH = c_rarg1;
3702 Register data = c_rarg2;
3703 Register blocks = c_rarg3;
3704
3705 FloatRegister vzr = v30;
3706 __ eor(vzr, __ T16B, vzr, vzr); // zero register
3707
3708 __ ldrq(v0, Address(state));
3709 __ ldrq(v1, Address(subkeyH));
3710
3711 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH
3712 __ rbit(v0, __ T16B, v0);
3713 __ rev64(v1, __ T16B, v1);
3714 __ rbit(v1, __ T16B, v1);
3715
3716 __ ldrq(v26, p);
3717
3718 __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v16
3719 __ eor(v16, __ T16B, v16, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
3720
3721 {
3722 Label L_ghash_loop;
3723 __ bind(L_ghash_loop);
3724
3725 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
3726 // reversing each byte
3727 __ rbit(v2, __ T16B, v2);
3728 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state
3729
3730 // Multiply state in v2 by subkey in v1
3731 ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
3732 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
3733 /*temps*/v6, v20, v18, v21);
3734 // Reduce v7:v5 by the field polynomial
3735 ghash_reduce(v0, v5, v7, v26, vzr, v20);
3736
3737 __ sub(blocks, blocks, 1);
3738 __ cbnz(blocks, L_ghash_loop);
3739 }
3740
3741 // The bit-reversed result is at this point in v0
3742 __ rev64(v1, __ T16B, v0);
3743 __ rbit(v1, __ T16B, v1);
3744
3745 __ st1(v1, __ T16B, state);
3746 __ ret(lr);
3747
3748 return start;
3749 }
3750
3751 // Continuation point for throwing of implicit exceptions that are
3752 // not handled in the current activation. Fabricates an exception
3753 // oop and initiates normal exception dispatching in this
3754 // frame. Since we need to preserve callee-saved values (currently
3755 // only for C2, but done for C1 as well) we need a callee-saved oop
3756 // map and therefore have to make these stubs into RuntimeStubs
3757 // rather than BufferBlobs. If the compiler needs all registers to
3758 // be preserved between the fault point and the exception handler
3759 // then it must assume responsibility for that in
3760 // AbstractCompiler::continuation_for_implicit_null_exception or
3761 // continuation_for_implicit_division_by_zero_exception. All other
3762 // implicit exceptions (e.g., NullPointerException or
3763 // AbstractMethodError on entry) are either at call sites or
3764 // otherwise assume that stack unwinding will be initiated, so
3765 // caller-saved registers were assumed volatile in the compiler.
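// As a rough sketch (derived from the layout enum below, which counts
// 32-bit slots), the frame these stubs build is just the fp/lr pair
// pushed by enter():
//
//    [ return address (lr) ]   return_off / return_off2
//    [ saved rfp           ]   <--- rfp == sp   rfp_off / rfp_off2
//
// With framesize == 4 slots the prolog's sub(sp, rfp, ...) subtracts
// zero and leaves sp == rfp; the generic form presumably allows the
// layout to grow without rewriting the prolog.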
3766
3767 #undef __
3768 #define __ masm->
3769
3770 address generate_throw_exception(const char* name,
3771 address runtime_entry,
3772 Register arg1 = noreg,
3773 Register arg2 = noreg) {
3774 // Information about frame layout at time of blocking runtime call.
3775 // Note that we only have to preserve callee-saved registers since
3776 // the compilers are responsible for supplying a continuation point
3777 // if they expect all registers to be preserved.
3778 // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
3779 enum layout {
3780 rfp_off = 0,
3781 rfp_off2,
3782 return_off,
3783 return_off2,
3784 framesize // inclusive of return address
3785 };
3786
3787 int insts_size = 512;
3788 int locs_size = 64;
3789
3790 CodeBuffer code(name, insts_size, locs_size);
3791 OopMapSet* oop_maps = new OopMapSet();
3792 MacroAssembler* masm = new MacroAssembler(&code);
3793
3794 address start = __ pc();
3795
3796 // This is an inlined and slightly modified version of call_VM
3797 // which has the ability to fetch the return PC out of
3798 // thread-local storage and also sets up last_Java_sp slightly
3799 // differently from the real call_VM
3800
3801 __ enter(); // Save FP and LR before call
3802
3803 assert(is_even(framesize/2), "sp not 16-byte aligned");
3804
3805 // lr and fp are already in place
3806 __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
3807
3808 int frame_complete = __ pc() - start;
3809
3810 // Set up last_Java_sp and last_Java_fp
3811 address the_pc = __ pc();
3812 __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1);
3813
3814 // Call runtime
3815 if (arg1 != noreg) {
3816 assert(arg2 != c_rarg1, "clobbered");
3817 __ mov(c_rarg1, arg1);
3818 }
3819 if (arg2 != noreg) {
3820 __ mov(c_rarg2, arg2);
3821 }
3822 __ mov(c_rarg0, rthread);
3823 BLOCK_COMMENT("call runtime_entry");
3824 __ mov(rscratch1, runtime_entry);
3825 __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);
3826
3827 // Generate oop map
3828 OopMap* map = new OopMap(framesize, 0);
3829
3830 oop_maps->add_gc_map(the_pc - start, map);
3831
3832 __ reset_last_Java_frame(true);
3833 __ maybe_isb();
3834
3835 __ leave();
3836
3837 // check for pending exceptions
3838 #ifdef ASSERT
3839 Label L;
3840 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
3841 __ cbnz(rscratch1, L);
3842 __ should_not_reach_here();
3843 __ bind(L);
3844 #endif // ASSERT
3845 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3846
3847
3848 // codeBlob framesize is in words (not VMRegImpl::slot_size)
3849 RuntimeStub* stub =
3850 RuntimeStub::new_runtime_stub(name,
3851 &code,
3852 frame_complete,
3853 (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3854 oop_maps, false);
3855 return stub->entry_point();
3856 }
3857
3858 class MontgomeryMultiplyGenerator : public MacroAssembler {
3859
3860 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
3861 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
3862
3863 RegSet _toSave;
3864 bool _squaring;
3865
3866 public:
3867 MontgomeryMultiplyGenerator(Assembler *as, bool squaring)
3868 : MacroAssembler(as->code()), _squaring(squaring) {
3869
3870 // Register allocation
3871
3872 Register reg = c_rarg0;
3873 Pa_base = reg; // Argument registers
3874 if (squaring)
3875 Pb_base = Pa_base;
3876 else
3877 Pb_base = ++reg;
3878 Pn_base = ++reg;
3879 Rlen = ++reg;
3880 inv = ++reg;
3881 Pm_base = ++reg;
3882
3883 // Working registers:
3884 Ra = ++reg; // The current digit of a, b, n, and m.
3885 Rb = ++reg;
3886 Rm = ++reg;
3887 Rn = ++reg;
3888
3889 Pa = ++reg; // Pointers to the current/next digit of a, b, n, and m.
3890 Pb = ++reg;
3891 Pm = ++reg;
3892 Pn = ++reg;
3893
3894 t0 = ++reg; // Three registers which form a
3895 t1 = ++reg; // triple-precision accumulator.
3896 t2 = ++reg;
3897
3898 Ri = ++reg; // Inner and outer loop indexes.
3899 Rj = ++reg;
3900
3901 Rhi_ab = ++reg; // Product registers: low and high parts
3902 Rlo_ab = ++reg; // of a*b and m*n.
3903 Rhi_mn = ++reg;
3904 Rlo_mn = ++reg;
3905
3906 // r19 and up are callee-saved.
3907 _toSave = RegSet::range(r19, reg) + Pm_base;
3908 }
3909
3910 private:
3911 void save_regs() {
3912 push(_toSave, sp);
3913 }
3914
3915 void restore_regs() {
3916 pop(_toSave, sp);
3917 }
3918
3919 template <typename T>
3920 void unroll_2(Register count, T block) {
3921 Label loop, end, odd;
3922 tbnz(count, 0, odd);
3923 cbz(count, end);
3924 align(16);
3925 bind(loop);
3926 (this->*block)();
3927 bind(odd);
3928 (this->*block)();
3929 subs(count, count, 2);
3930 br(Assembler::GT, loop);
3931 bind(end);
3932 }
3933
3934 template <typename T>
3935 void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
3936 Label loop, end, odd;
3937 tbnz(count, 0, odd);
3938 cbz(count, end);
3939 align(16);
3940 bind(loop);
3941 (this->*block)(d, s, tmp);
3942 bind(odd);
3943 (this->*block)(d, s, tmp);
3944 subs(count, count, 2);
3945 br(Assembler::GT, loop);
3946 bind(end);
3947 }
3948
3949 void pre1(RegisterOrConstant i) {
3950 block_comment("pre1");
3951 // Pa = Pa_base;
3952 // Pb = Pb_base + i;
3953 // Pm = Pm_base;
3954 // Pn = Pn_base + i;
3955 // Ra = *Pa;
3956 // Rb = *Pb;
3957 // Rm = *Pm;
3958 // Rn = *Pn;
3959 ldr(Ra, Address(Pa_base));
3960 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
3961 ldr(Rm, Address(Pm_base));
3962 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3963 lea(Pa, Address(Pa_base));
3964 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
3965 lea(Pm, Address(Pm_base));
3966 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3967
3968 // Zero the m*n result.
3969 mov(Rhi_mn, zr);
3970 mov(Rlo_mn, zr);
3971 }
3972
3973 // The core multiply-accumulate step of a Montgomery
3974 // multiplication. The idea is to schedule operations as a
3975 // pipeline so that instructions with long latencies (loads and
3976 // multiplies) have time to complete before their results are
3977 // used. This benefits in-order implementations of the
3978 // architecture most, but out-of-order ones also benefit.
3979 void step() {
3980 block_comment("step");
3981 // MACC(Ra, Rb, t0, t1, t2);
3982 // Ra = *++Pa;
3983 // Rb = *--Pb;
3984 umulh(Rhi_ab, Ra, Rb);
3985 mul(Rlo_ab, Ra, Rb);
3986 ldr(Ra, pre(Pa, wordSize));
3987 ldr(Rb, pre(Pb, -wordSize));
3988 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
3989 // previous iteration.
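// MACC(A, B, t0, t1, t2) in these comments denotes a 64x64->128-bit
// multiply-accumulate into the triple-precision accumulator <t2:t1:t0>
// -- the umulh/mul pair plus acc() below. In C, approximately:
//   unsigned __int128 p = (unsigned __int128) A * B;
//   unsigned long lo = (unsigned long) p,
//                 hi = (unsigned long) (p >> 64);
//   t0 += lo;  hi += (t0 < lo);  // carry out of t0; hi cannot overflow
//   t1 += hi;  t2 += (t1 < hi);  // carry out of t1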
3990 // MACC(Rm, Rn, t0, t1, t2); 3991 // Rm = *++Pm; 3992 // Rn = *--Pn; 3993 umulh(Rhi_mn, Rm, Rn); 3994 mul(Rlo_mn, Rm, Rn); 3995 ldr(Rm, pre(Pm, wordSize)); 3996 ldr(Rn, pre(Pn, -wordSize)); 3997 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 3998 } 3999 4000 void post1() { 4001 block_comment("post1"); 4002 4003 // MACC(Ra, Rb, t0, t1, t2); 4004 // Ra = *++Pa; 4005 // Rb = *--Pb; 4006 umulh(Rhi_ab, Ra, Rb); 4007 mul(Rlo_ab, Ra, Rb); 4008 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 4009 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4010 4011 // *Pm = Rm = t0 * inv; 4012 mul(Rm, t0, inv); 4013 str(Rm, Address(Pm)); 4014 4015 // MACC(Rm, Rn, t0, t1, t2); 4016 // t0 = t1; t1 = t2; t2 = 0; 4017 umulh(Rhi_mn, Rm, Rn); 4018 4019 #ifndef PRODUCT 4020 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 4021 { 4022 mul(Rlo_mn, Rm, Rn); 4023 add(Rlo_mn, t0, Rlo_mn); 4024 Label ok; 4025 cbz(Rlo_mn, ok); { 4026 stop("broken Montgomery multiply"); 4027 } bind(ok); 4028 } 4029 #endif 4030 // We have very carefully set things up so that 4031 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 4032 // the lower half of Rm * Rn because we know the result already: 4033 // it must be -t0. t0 + (-t0) must generate a carry iff 4034 // t0 != 0. So, rather than do a mul and an adds we just set 4035 // the carry flag iff t0 is nonzero. 4036 // 4037 // mul(Rlo_mn, Rm, Rn); 4038 // adds(zr, t0, Rlo_mn); 4039 subs(zr, t0, 1); // Set carry iff t0 is nonzero 4040 adcs(t0, t1, Rhi_mn); 4041 adc(t1, t2, zr); 4042 mov(t2, zr); 4043 } 4044 4045 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 4046 block_comment("pre2"); 4047 // Pa = Pa_base + i-len; 4048 // Pb = Pb_base + len; 4049 // Pm = Pm_base + i-len; 4050 // Pn = Pn_base + len; 4051 4052 if (i.is_register()) { 4053 sub(Rj, i.as_register(), len); 4054 } else { 4055 mov(Rj, i.as_constant()); 4056 sub(Rj, Rj, len); 4057 } 4058 // Rj == i-len 4059 4060 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 4061 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 4062 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 4063 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 4064 4065 // Ra = *++Pa; 4066 // Rb = *--Pb; 4067 // Rm = *++Pm; 4068 // Rn = *--Pn; 4069 ldr(Ra, pre(Pa, wordSize)); 4070 ldr(Rb, pre(Pb, -wordSize)); 4071 ldr(Rm, pre(Pm, wordSize)); 4072 ldr(Rn, pre(Pn, -wordSize)); 4073 4074 mov(Rhi_mn, zr); 4075 mov(Rlo_mn, zr); 4076 } 4077 4078 void post2(RegisterOrConstant i, RegisterOrConstant len) { 4079 block_comment("post2"); 4080 if (i.is_constant()) { 4081 mov(Rj, i.as_constant()-len.as_constant()); 4082 } else { 4083 sub(Rj, i.as_register(), len); 4084 } 4085 4086 adds(t0, t0, Rlo_mn); // The pending m*n, low part 4087 4088 // As soon as we know the least significant digit of our result, 4089 // store it. 4090 // Pm_base[i-len] = t0; 4091 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 4092 4093 // t0 = t1; t1 = t2; t2 = 0; 4094 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 4095 adc(t1, t2, zr); 4096 mov(t2, zr); 4097 } 4098 4099 // A carry in t0 after Montgomery multiplication means that we 4100 // should subtract multiples of n from our result in m. We'll 4101 // keep doing that until there is no carry. 
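// The sub() used in the C sketches (here and in the "In C,
// approximately" blocks below) is a multiword borrow-propagating
// subtract of n from m; it returns t0 less the final borrow, matching
// the sbc(t0, t0, zr) at the end of normalize(). In C, approximately:
//   unsigned long sub(unsigned long m[], unsigned long n[],
//                     unsigned long t0, int len) {
//     unsigned long borrow = 0;
//     for (int i = 0; i < len; i++) {
//       unsigned long mi = m[i];
//       m[i] = mi - n[i] - borrow;
//       borrow = (mi < n[i]) || (mi == n[i] && borrow);
//     }
//     return t0 - borrow;
//   }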
4102 void normalize(RegisterOrConstant len) { 4103 block_comment("normalize"); 4104 // while (t0) 4105 // t0 = sub(Pm_base, Pn_base, t0, len); 4106 Label loop, post, again; 4107 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 4108 cbz(t0, post); { 4109 bind(again); { 4110 mov(i, zr); 4111 mov(cnt, len); 4112 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 4113 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4114 subs(zr, zr, zr); // set carry flag, i.e. no borrow 4115 align(16); 4116 bind(loop); { 4117 sbcs(Rm, Rm, Rn); 4118 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 4119 add(i, i, 1); 4120 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 4121 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4122 sub(cnt, cnt, 1); 4123 } cbnz(cnt, loop); 4124 sbc(t0, t0, zr); 4125 } cbnz(t0, again); 4126 } bind(post); 4127 } 4128 4129 // Move memory at s to d, reversing words. 4130 // Increments d to end of copied memory 4131 // Destroys tmp1, tmp2 4132 // Preserves len 4133 // Leaves s pointing to the address which was in d at start 4134 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 4135 assert(tmp1 < r19 && tmp2 < r19, "register corruption"); 4136 4137 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 4138 mov(tmp1, len); 4139 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 4140 sub(s, d, len, ext::uxtw, LogBytesPerWord); 4141 } 4142 // where 4143 void reverse1(Register d, Register s, Register tmp) { 4144 ldr(tmp, pre(s, -wordSize)); 4145 ror(tmp, tmp, 32); 4146 str(tmp, post(d, wordSize)); 4147 } 4148 4149 void step_squaring() { 4150 // An extra ACC 4151 step(); 4152 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4153 } 4154 4155 void last_squaring(RegisterOrConstant i) { 4156 Label dont; 4157 // if ((i & 1) == 0) { 4158 tbnz(i.as_register(), 0, dont); { 4159 // MACC(Ra, Rb, t0, t1, t2); 4160 // Ra = *++Pa; 4161 // Rb = *--Pb; 4162 umulh(Rhi_ab, Ra, Rb); 4163 mul(Rlo_ab, Ra, Rb); 4164 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4165 } bind(dont); 4166 } 4167 4168 void extra_step_squaring() { 4169 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 4170 4171 // MACC(Rm, Rn, t0, t1, t2); 4172 // Rm = *++Pm; 4173 // Rn = *--Pn; 4174 umulh(Rhi_mn, Rm, Rn); 4175 mul(Rlo_mn, Rm, Rn); 4176 ldr(Rm, pre(Pm, wordSize)); 4177 ldr(Rn, pre(Pn, -wordSize)); 4178 } 4179 4180 void post1_squaring() { 4181 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 4182 4183 // *Pm = Rm = t0 * inv; 4184 mul(Rm, t0, inv); 4185 str(Rm, Address(Pm)); 4186 4187 // MACC(Rm, Rn, t0, t1, t2); 4188 // t0 = t1; t1 = t2; t2 = 0; 4189 umulh(Rhi_mn, Rm, Rn); 4190 4191 #ifndef PRODUCT 4192 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 4193 { 4194 mul(Rlo_mn, Rm, Rn); 4195 add(Rlo_mn, t0, Rlo_mn); 4196 Label ok; 4197 cbz(Rlo_mn, ok); { 4198 stop("broken Montgomery multiply"); 4199 } bind(ok); 4200 } 4201 #endif 4202 // We have very carefully set things up so that 4203 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 4204 // the lower half of Rm * Rn because we know the result already: 4205 // it must be -t0. t0 + (-t0) must generate a carry iff 4206 // t0 != 0. So, rather than do a mul and an adds we just set 4207 // the carry flag iff t0 is nonzero. 
4208 // 4209 // mul(Rlo_mn, Rm, Rn); 4210 // adds(zr, t0, Rlo_mn); 4211 subs(zr, t0, 1); // Set carry iff t0 is nonzero 4212 adcs(t0, t1, Rhi_mn); 4213 adc(t1, t2, zr); 4214 mov(t2, zr); 4215 } 4216 4217 void acc(Register Rhi, Register Rlo, 4218 Register t0, Register t1, Register t2) { 4219 adds(t0, t0, Rlo); 4220 adcs(t1, t1, Rhi); 4221 adc(t2, t2, zr); 4222 } 4223 4224 public: 4225 /** 4226 * Fast Montgomery multiplication. The derivation of the 4227 * algorithm is in A Cryptographic Library for the Motorola 4228 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 4229 * 4230 * Arguments: 4231 * 4232 * Inputs for multiplication: 4233 * c_rarg0 - int array elements a 4234 * c_rarg1 - int array elements b 4235 * c_rarg2 - int array elements n (the modulus) 4236 * c_rarg3 - int length 4237 * c_rarg4 - int inv 4238 * c_rarg5 - int array elements m (the result) 4239 * 4240 * Inputs for squaring: 4241 * c_rarg0 - int array elements a 4242 * c_rarg1 - int array elements n (the modulus) 4243 * c_rarg2 - int length 4244 * c_rarg3 - int inv 4245 * c_rarg4 - int array elements m (the result) 4246 * 4247 */ 4248 address generate_multiply() { 4249 Label argh, nothing; 4250 bind(argh); 4251 stop("MontgomeryMultiply total_allocation must be <= 8192"); 4252 4253 align(CodeEntryAlignment); 4254 address entry = pc(); 4255 4256 cbzw(Rlen, nothing); 4257 4258 enter(); 4259 4260 // Make room. 4261 cmpw(Rlen, 512); 4262 br(Assembler::HI, argh); 4263 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 4264 andr(sp, Ra, -2 * wordSize); 4265 4266 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 4267 4268 { 4269 // Copy input args, reversing as we go. We use Ra as a 4270 // temporary variable. 4271 reverse(Ra, Pa_base, Rlen, t0, t1); 4272 if (!_squaring) 4273 reverse(Ra, Pb_base, Rlen, t0, t1); 4274 reverse(Ra, Pn_base, Rlen, t0, t1); 4275 } 4276 4277 // Push all call-saved registers and also Pm_base which we'll need 4278 // at the end. 
4279 save_regs(); 4280 4281 #ifndef PRODUCT 4282 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 4283 { 4284 ldr(Rn, Address(Pn_base, 0)); 4285 mul(Rlo_mn, Rn, inv); 4286 cmp(Rlo_mn, -1); 4287 Label ok; 4288 br(EQ, ok); { 4289 stop("broken inverse in Montgomery multiply"); 4290 } bind(ok); 4291 } 4292 #endif 4293 4294 mov(Pm_base, Ra); 4295 4296 mov(t0, zr); 4297 mov(t1, zr); 4298 mov(t2, zr); 4299 4300 block_comment("for (int i = 0; i < len; i++) {"); 4301 mov(Ri, zr); { 4302 Label loop, end; 4303 cmpw(Ri, Rlen); 4304 br(Assembler::GE, end); 4305 4306 bind(loop); 4307 pre1(Ri); 4308 4309 block_comment(" for (j = i; j; j--) {"); { 4310 movw(Rj, Ri); 4311 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 4312 } block_comment(" } // j"); 4313 4314 post1(); 4315 addw(Ri, Ri, 1); 4316 cmpw(Ri, Rlen); 4317 br(Assembler::LT, loop); 4318 bind(end); 4319 block_comment("} // i"); 4320 } 4321 4322 block_comment("for (int i = len; i < 2*len; i++) {"); 4323 mov(Ri, Rlen); { 4324 Label loop, end; 4325 cmpw(Ri, Rlen, Assembler::LSL, 1); 4326 br(Assembler::GE, end); 4327 4328 bind(loop); 4329 pre2(Ri, Rlen); 4330 4331 block_comment(" for (j = len*2-i-1; j; j--) {"); { 4332 lslw(Rj, Rlen, 1); 4333 subw(Rj, Rj, Ri); 4334 subw(Rj, Rj, 1); 4335 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 4336 } block_comment(" } // j"); 4337 4338 post2(Ri, Rlen); 4339 addw(Ri, Ri, 1); 4340 cmpw(Ri, Rlen, Assembler::LSL, 1); 4341 br(Assembler::LT, loop); 4342 bind(end); 4343 } 4344 block_comment("} // i"); 4345 4346 normalize(Rlen); 4347 4348 mov(Ra, Pm_base); // Save Pm_base in Ra 4349 restore_regs(); // Restore caller's Pm_base 4350 4351 // Copy our result into caller's Pm_base 4352 reverse(Pm_base, Ra, Rlen, t0, t1); 4353 4354 leave(); 4355 bind(nothing); 4356 ret(lr); 4357 4358 return entry; 4359 } 4360 // In C, approximately: 4361 4362 // void 4363 // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[], 4364 // unsigned long Pn_base[], unsigned long Pm_base[], 4365 // unsigned long inv, int len) { 4366 // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 4367 // unsigned long *Pa, *Pb, *Pn, *Pm; 4368 // unsigned long Ra, Rb, Rn, Rm; 4369 4370 // int i; 4371 4372 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 4373 4374 // for (i = 0; i < len; i++) { 4375 // int j; 4376 4377 // Pa = Pa_base; 4378 // Pb = Pb_base + i; 4379 // Pm = Pm_base; 4380 // Pn = Pn_base + i; 4381 4382 // Ra = *Pa; 4383 // Rb = *Pb; 4384 // Rm = *Pm; 4385 // Rn = *Pn; 4386 4387 // int iters = i; 4388 // for (j = 0; iters--; j++) { 4389 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 4390 // MACC(Ra, Rb, t0, t1, t2); 4391 // Ra = *++Pa; 4392 // Rb = *--Pb; 4393 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 4394 // MACC(Rm, Rn, t0, t1, t2); 4395 // Rm = *++Pm; 4396 // Rn = *--Pn; 4397 // } 4398 4399 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 4400 // MACC(Ra, Rb, t0, t1, t2); 4401 // *Pm = Rm = t0 * inv; 4402 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 4403 // MACC(Rm, Rn, t0, t1, t2); 4404 4405 // assert(t0 == 0, "broken Montgomery multiply"); 4406 4407 // t0 = t1; t1 = t2; t2 = 0; 4408 // } 4409 4410 // for (i = len; i < 2*len; i++) { 4411 // int j; 4412 4413 // Pa = Pa_base + i-len; 4414 // Pb = Pb_base + len; 4415 // Pm = Pm_base + i-len; 4416 // Pn = Pn_base + len; 4417 4418 // Ra = *++Pa; 4419 // Rb = *--Pb; 4420 // Rm = *++Pm; 4421 // Rn = *--Pn; 4422 4423 // int iters = len*2-i-1; 4424 // 
for (j = i-len+1; iters--; j++) { 4425 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 4426 // MACC(Ra, Rb, t0, t1, t2); 4427 // Ra = *++Pa; 4428 // Rb = *--Pb; 4429 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 4430 // MACC(Rm, Rn, t0, t1, t2); 4431 // Rm = *++Pm; 4432 // Rn = *--Pn; 4433 // } 4434 4435 // Pm_base[i-len] = t0; 4436 // t0 = t1; t1 = t2; t2 = 0; 4437 // } 4438 4439 // while (t0) 4440 // t0 = sub(Pm_base, Pn_base, t0, len); 4441 // } 4442 4443 /** 4444 * Fast Montgomery squaring. This uses asymptotically 25% fewer 4445 * multiplies than Montgomery multiplication so it should be up to 4446 * 25% faster. However, its loop control is more complex and it 4447 * may actually run slower on some machines. 4448 * 4449 * Arguments: 4450 * 4451 * Inputs: 4452 * c_rarg0 - int array elements a 4453 * c_rarg1 - int array elements n (the modulus) 4454 * c_rarg2 - int length 4455 * c_rarg3 - int inv 4456 * c_rarg4 - int array elements m (the result) 4457 * 4458 */ 4459 address generate_square() { 4460 Label argh; 4461 bind(argh); 4462 stop("MontgomeryMultiply total_allocation must be <= 8192"); 4463 4464 align(CodeEntryAlignment); 4465 address entry = pc(); 4466 4467 enter(); 4468 4469 // Make room. 4470 cmpw(Rlen, 512); 4471 br(Assembler::HI, argh); 4472 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 4473 andr(sp, Ra, -2 * wordSize); 4474 4475 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 4476 4477 { 4478 // Copy input args, reversing as we go. We use Ra as a 4479 // temporary variable. 4480 reverse(Ra, Pa_base, Rlen, t0, t1); 4481 reverse(Ra, Pn_base, Rlen, t0, t1); 4482 } 4483 4484 // Push all call-saved registers and also Pm_base which we'll need 4485 // at the end. 4486 save_regs(); 4487 4488 mov(Pm_base, Ra); 4489 4490 mov(t0, zr); 4491 mov(t1, zr); 4492 mov(t2, zr); 4493 4494 block_comment("for (int i = 0; i < len; i++) {"); 4495 mov(Ri, zr); { 4496 Label loop, end; 4497 bind(loop); 4498 cmp(Ri, Rlen); 4499 br(Assembler::GE, end); 4500 4501 pre1(Ri); 4502 4503 block_comment("for (j = (i+1)/2; j; j--) {"); { 4504 add(Rj, Ri, 1); 4505 lsr(Rj, Rj, 1); 4506 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 4507 } block_comment(" } // j"); 4508 4509 last_squaring(Ri); 4510 4511 block_comment(" for (j = i/2; j; j--) {"); { 4512 lsr(Rj, Ri, 1); 4513 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 4514 } block_comment(" } // j"); 4515 4516 post1_squaring(); 4517 add(Ri, Ri, 1); 4518 cmp(Ri, Rlen); 4519 br(Assembler::LT, loop); 4520 4521 bind(end); 4522 block_comment("} // i"); 4523 } 4524 4525 block_comment("for (int i = len; i < 2*len; i++) {"); 4526 mov(Ri, Rlen); { 4527 Label loop, end; 4528 bind(loop); 4529 cmp(Ri, Rlen, Assembler::LSL, 1); 4530 br(Assembler::GE, end); 4531 4532 pre2(Ri, Rlen); 4533 4534 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 4535 lsl(Rj, Rlen, 1); 4536 sub(Rj, Rj, Ri); 4537 sub(Rj, Rj, 1); 4538 lsr(Rj, Rj, 1); 4539 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 4540 } block_comment(" } // j"); 4541 4542 last_squaring(Ri); 4543 4544 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 4545 lsl(Rj, Rlen, 1); 4546 sub(Rj, Rj, Ri); 4547 lsr(Rj, Rj, 1); 4548 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 4549 } block_comment(" } // j"); 4550 4551 post2(Ri, Rlen); 4552 add(Ri, Ri, 1); 4553 cmp(Ri, Rlen, Assembler::LSL, 1); 4554 4555 br(Assembler::LT, loop); 4556 bind(end); 4557 block_comment("} // i"); 4558 } 4559 4560 normalize(Rlen); 4561 4562 mov(Ra, 
Pm_base); // Save Pm_base in Ra
4563 restore_regs(); // Restore caller's Pm_base
4564
4565 // Copy our result into caller's Pm_base
4566 reverse(Pm_base, Ra, Rlen, t0, t1);
4567
4568 leave();
4569 ret(lr);
4570
4571 return entry;
4572 }
4573 // In C, approximately:
4574
4575 // void
4576 // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
4577 // unsigned long Pm_base[], unsigned long inv, int len) {
4578 // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4579 // unsigned long *Pa, *Pb, *Pn, *Pm;
4580 // unsigned long Ra, Rb, Rn, Rm;
4581
4582 // int i;
4583
4584 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
4585
4586 // for (i = 0; i < len; i++) {
4587 // int j;
4588
4589 // Pa = Pa_base;
4590 // Pb = Pa_base + i;
4591 // Pm = Pm_base;
4592 // Pn = Pn_base + i;
4593
4594 // Ra = *Pa;
4595 // Rb = *Pb;
4596 // Rm = *Pm;
4597 // Rn = *Pn;
4598
4599 // int iters = (i+1)/2;
4600 // for (j = 0; iters--; j++) {
4601 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4602 // MACC2(Ra, Rb, t0, t1, t2);
4603 // Ra = *++Pa;
4604 // Rb = *--Pb;
4605 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4606 // MACC(Rm, Rn, t0, t1, t2);
4607 // Rm = *++Pm;
4608 // Rn = *--Pn;
4609 // }
4610 // if ((i & 1) == 0) {
4611 // assert(Ra == Pa_base[j], "must be");
4612 // MACC(Ra, Ra, t0, t1, t2);
4613 // }
4614 // iters = i/2;
4615 // assert(iters == i-j, "must be");
4616 // for (; iters--; j++) {
4617 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4618 // MACC(Rm, Rn, t0, t1, t2);
4619 // Rm = *++Pm;
4620 // Rn = *--Pn;
4621 // }
4622
4623 // *Pm = Rm = t0 * inv;
4624 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
4625 // MACC(Rm, Rn, t0, t1, t2);
4626
4627 // assert(t0 == 0, "broken Montgomery multiply");
4628
4629 // t0 = t1; t1 = t2; t2 = 0;
4630 // }
4631
4632 // for (i = len; i < 2*len; i++) {
4633 // int start = i-len+1;
4634 // int end = start + (len - start)/2;
4635 // int j;
4636
4637 // Pa = Pa_base + i-len;
4638 // Pb = Pa_base + len;
4639 // Pm = Pm_base + i-len;
4640 // Pn = Pn_base + len;
4641
4642 // Ra = *++Pa;
4643 // Rb = *--Pb;
4644 // Rm = *++Pm;
4645 // Rn = *--Pn;
4646
4647 // int iters = (2*len-i-1)/2;
4648 // assert(iters == end-start, "must be");
4649 // for (j = start; iters--; j++) {
4650 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4651 // MACC2(Ra, Rb, t0, t1, t2);
4652 // Ra = *++Pa;
4653 // Rb = *--Pb;
4654 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4655 // MACC(Rm, Rn, t0, t1, t2);
4656 // Rm = *++Pm;
4657 // Rn = *--Pn;
4658 // }
4659 // if ((i & 1) == 0) {
4660 // assert(Ra == Pa_base[j], "must be");
4661 // MACC(Ra, Ra, t0, t1, t2);
4662 // }
4663 // iters = (2*len-i)/2;
4664 // assert(iters == len-j, "must be");
4665 // for (; iters--; j++) {
4666 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4667 // MACC(Rm, Rn, t0, t1, t2);
4668 // Rm = *++Pm;
4669 // Rn = *--Pn;
4670 // }
4671 // Pm_base[i-len] = t0;
4672 // t0 = t1; t1 = t2; t2 = 0;
4673 // }
4674
4675 // while (t0)
4676 // t0 = sub(Pm_base, Pn_base, t0, len);
4677 // }
4678 };
4679
4680 // Initialization
4681 void generate_initial() {
4682 // Generates the initial stubs and initializes the entry points
4683
4684 // Entry points that exist on all platforms. Note: this is code
4685 // that could be shared among different platforms - however the
4686 // benefit seems to be smaller than the disadvantage of having a
4687 // much more complicated generator
structure. See also the comment in
4688 // stubRoutines.hpp.
4689
4690 StubRoutines::_forward_exception_entry = generate_forward_exception();
4691
4692 StubRoutines::_call_stub_entry =
4693 generate_call_stub(StubRoutines::_call_stub_return_address);
4694
4695 // is referenced by a megamorphic call
4696 StubRoutines::_catch_exception_entry = generate_catch_exception();
4697
4698 // Build this early so it's available for the interpreter.
4699 StubRoutines::_throw_StackOverflowError_entry =
4700 generate_throw_exception("StackOverflowError throw_exception",
4701 CAST_FROM_FN_PTR(address,
4702 SharedRuntime::throw_StackOverflowError));
4703 StubRoutines::_throw_delayed_StackOverflowError_entry =
4704 generate_throw_exception("delayed StackOverflowError throw_exception",
4705 CAST_FROM_FN_PTR(address,
4706 SharedRuntime::throw_delayed_StackOverflowError));
4707 if (UseCRC32Intrinsics) {
4708 // Set the table address before generating the stubs that use it
4709 StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
4710 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
4711 }
4712 }
4713
4714 void generate_all() {
4715 // support for verify_oop (must happen after universe_init)
4716 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
4717 StubRoutines::_throw_AbstractMethodError_entry =
4718 generate_throw_exception("AbstractMethodError throw_exception",
4719 CAST_FROM_FN_PTR(address,
4720 SharedRuntime::
4721 throw_AbstractMethodError));
4722
4723 StubRoutines::_throw_IncompatibleClassChangeError_entry =
4724 generate_throw_exception("IncompatibleClassChangeError throw_exception",
4725 CAST_FROM_FN_PTR(address,
4726 SharedRuntime::
4727 throw_IncompatibleClassChangeError));
4728
4729 StubRoutines::_throw_NullPointerException_at_call_entry =
4730 generate_throw_exception("NullPointerException at call throw_exception",
4731 CAST_FROM_FN_PTR(address,
4732 SharedRuntime::
4733 throw_NullPointerException_at_call));
4734
4735 // arraycopy stubs used by compilers
4736 generate_arraycopy_stubs();
4737
4738 if (UseMultiplyToLenIntrinsic) {
4739 StubRoutines::_multiplyToLen = generate_multiplyToLen();
4740 }
4741
4742 if (UseMontgomeryMultiplyIntrinsic) {
4743 StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
4744 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
4745 StubRoutines::_montgomeryMultiply = g.generate_multiply();
4746 }
4747
4748 if (UseMontgomerySquareIntrinsic) {
4749 StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
4750 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
4751 // We use generate_multiply() rather than generate_square()
4752 // because it's faster for the sizes of modulus we care about.
4753 StubRoutines::_montgomerySquare = g.generate_multiply(); 4754 } 4755 4756 #ifndef BUILTIN_SIM 4757 // generate GHASH intrinsics code 4758 if (UseGHASHIntrinsics) { 4759 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); 4760 } 4761 4762 if (UseAESIntrinsics) { 4763 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 4764 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 4765 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 4766 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 4767 } 4768 4769 if (UseSHA1Intrinsics) { 4770 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress"); 4771 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB"); 4772 } 4773 if (UseSHA256Intrinsics) { 4774 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress"); 4775 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB"); 4776 } 4777 4778 if (UseCRC32CIntrinsics) { 4779 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(); 4780 } 4781 4782 // generate Adler32 intrinsics code 4783 if (UseAdler32Intrinsics) { 4784 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); 4785 } 4786 4787 // Safefetch stubs. 4788 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry, 4789 &StubRoutines::_safefetch32_fault_pc, 4790 &StubRoutines::_safefetch32_continuation_pc); 4791 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry, 4792 &StubRoutines::_safefetchN_fault_pc, 4793 &StubRoutines::_safefetchN_continuation_pc); 4794 #endif 4795 StubRoutines::aarch64::set_completed(); 4796 } 4797 4798 public: 4799 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) { 4800 if (all) { 4801 generate_all(); 4802 } else { 4803 generate_initial(); 4804 } 4805 } 4806 }; // end class declaration 4807 4808 void StubGenerator_generate(CodeBuffer* code, bool all) { 4809 StubGenerator g(code, all); 4810 }
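// Invoked twice during VM startup (from StubRoutines' initialization
// in stubRoutines.cpp): first with all == false to generate the
// initial stubs the interpreter depends on, then with all == true to
// generate the remainder once the universe has been initialized.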