/*
 * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif

#ifdef BUILTIN_SIM
#include "../../../../../../simulator/simulator.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->
#define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
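// TIMES_OOP scales a 32-bit array index register by the in-heap oop
// size: 4 bytes with compressed oops, 8 otherwise (the index is
// sign-extended by sxtw before scaling).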

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(int& counter) {
    __ lea(rscratch2, ExternalAddress((address)&counter));
    __ ldrw(rscratch1, Address(rscratch2));
    __ addw(rscratch1, rscratch1, 1);
    __ strw(rscratch1, Address(rscratch2));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address              address
  //    c_rarg1:   result                            address
  //    c_rarg2:   result type                       BasicType
  //    c_rarg3:   method                            Method*
  //    c_rarg4:   (interpreter) entry point         address
  //    c_rarg5:   parameters                        intptr_t*
  //    c_rarg6:   parameter size (in words)         int
  //    c_rarg7:   thread                            Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the c arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -27 [ argument word 1      ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper   (r0)  ]
  //  -7 [ result         (r1)  ]
  //  -6 [ result type    (r2)  ]
  //  -5 [ method         (r3)  ]
  //  -4 [ entry point    (r4)  ]
  //  -3 [ parameters     (r5)  ]
  //  -2 [ parameter size (r6)  ]
  //  -1 [ thread         (r7)  ]
  //   0 [ saved fp      (r29)  ] <--- fp == saved sp (r31)
  //   1 [ saved lr      (r30)  ]

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off  = -26,

    d15_off            = -26,
    d13_off            = -24,
    d11_off            = -22,
    d9_off             = -20,

    r28_off            = -18,
    r26_off            = -16,
    r24_off            = -14,
    r22_off            = -12,
    r20_off            = -10,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0,
    retaddr_off        =   1,
  };

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    const Address sp_after_call(rfp, sp_after_call_off * wordSize);

    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off  * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);

    // stub code

    // we need a C prolog to bootstrap the x86 caller into the sim
    __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);

    address aarch64_entry = __ pc();

#ifdef BUILTIN_SIM
    // Save sender's SP for stack traces.
    __ mov(rscratch1, sp);
    __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
#endif
    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);
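    // enter() pushes fp and lr and then sets fp = sp, so the sub
    // above drops sp by 26 words in one step, reserving the whole
    // register-save area described in the layout comment above.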
    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5,  entry_point);
    __ stp(c_rarg2, c_rarg3,  result_type);
    __ stp(c_rarg0, c_rarg1,  call_wrapper);

    __ stp(r20, r19,   r20_save);
    __ stp(r22, r21,   r22_save);
    __ stp(r24, r23,   r24_save);
    __ stp(r26, r25,   r26_save);
    __ stp(r28, r27,   r28_save);

    __ stpd(v9,  v8,   d9_save);
    __ stpd(v11, v10,  d11_save);
    __ stpd(v13, v12,  d13_save);
    __ stpd(v15, v14,  d15_save);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (unsigned)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing the method, and current sp
    //      rmethod: Method*
    //      r13: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r13, sp);
    __ blr(c_rarg4);

    // tell the simulator we have returned to the stub

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    if (NotifySimulator) {
      __ notify(Assembler::method_reentry);
    }
    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
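    // note: a T_OBJECT result is handled like T_LONG below because an
    // oop comes back as a full 64-bit pointer in r0 (oops are only
    // compressed in the heap, not in registers), so it needs the same
    // 64-bit store.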
    __ ldr(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ldr(j_rarg1, result_type);
    __ cmp(j_rarg1, T_OBJECT);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(j_rarg1, T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    // restore callee-save registers
    __ ldpd(v15, v14,   d15_save);
    __ ldpd(v13, v12,   d13_save);
    __ ldpd(v11, v10,   d11_save);
    __ ldpd(v9,  v8,    d9_save);

    __ ldp(r28, r27,   r28_save);
    __ ldp(r26, r25,   r26_save);
    __ ldp(r24, r23,   r24_save);
    __ ldp(r22, r21,   r22_save);
    __ ldp(r20, r19,   r20_save);

    __ ldp(c_rarg0, c_rarg1,  call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3, method);
    __ ldp(c_rarg4, c_rarg5,  entry_point);
    __ ldp(c_rarg6, c_rarg7,  parameter_size);

#ifndef PRODUCT
    // tell the simulator we are about to end Java execution
    if (NotifySimulator) {
      __ notify(Assembler::method_exit);
    }
#endif
    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT

    __ BIND(is_long);
    __ str(r0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // rsp.
  //
  // r0: exception oop

  // NOTE: this is used as a target from the signal handler so it
  // needs an x86 prolog which returns into the current simulator
  // executing the generated catch_exception code. so the prolog
  // needs to install rax in a sim register and adjust the sim's
  // restart pc to enter the generated code at the start position
  // then return from native to simulated execution.

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread       (rfp, thread_off        * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);
    __ mov(r19, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {

    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is NULL it is OK

    // Check if the oop is in the right area of memory
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
    __ andr(c_rarg2, r0, c_rarg3);
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());

    // Compare c_rarg2 and c_rarg3.  We don't use a compare
    // instruction here because the flags register is live.
    __ eor(c_rarg2, c_rarg2, c_rarg3);
    __ cbnz(c_rarg2, error);
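    // (the eor result is zero exactly when the masked bits equal
    // verify_oop_bits, and cbnz branches without reading or writing
    // the condition flags)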

    // make sure klass is 'reasonable', i.e. non-zero
    __ load_klass(r0, r0); // get klass
    __ cbz(r0, error);     // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blrt(rscratch1, 3, 0, 1);

    return start;
  }

  void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }

  // Generate code for an array write pre barrier
  //
  //     addr    - starting address
  //     count   - element count
  //     tmp     - scratch register
  //
  //     Destroy no registers except rscratch1 and rscratch2
  //
  void gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
    case BarrierSet::G1SATBCTLogging:
      // With G1, don't generate the call if we statically know that the target is uninitialized
      if (!dest_uninitialized) {
        __ push_call_clobbered_registers();
        if (count == c_rarg0) {
          if (addr == c_rarg1) {
            // exactly backwards!!
            __ mov(rscratch1, c_rarg0);
            __ mov(c_rarg0, c_rarg1);
            __ mov(c_rarg1, rscratch1);
          } else {
            __ mov(c_rarg1, count);
            __ mov(c_rarg0, addr);
          }
        } else {
          __ mov(c_rarg0, addr);
          __ mov(c_rarg1, count);
        }
        __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2);
        __ pop_call_clobbered_registers();
      }
      break;
    case BarrierSet::CardTableForRS:
    case BarrierSet::CardTableExtension:
    case BarrierSet::ModRef:
      break;
    default:
      ShouldNotReachHere();

    }
  }

  //
  // Generate code for an array write post barrier
  //
  //  Input:
  //     start    - register containing starting address of destination array
  //     end      - register containing ending address of destination array
  //     scratch  - scratch register
  //
  //  The input registers are overwritten.
  //  The ending address is inclusive.
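  //
  //  For the card-table kinds this dirties every card spanned by
  //  [start, end]. A worked sketch, assuming the usual 512-byte cards
  //  (card_shift == 9): start == 0x1000 and end == 0x11f8 both shift
  //  down to card 8, the subtraction leaves a count of 0, and the
  //  loop still runs once because it tests GE after decrementing, so
  //  the single spanned card is dirtied.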
  void gen_write_ref_array_post_barrier(Register start, Register end, Register scratch) {
    assert_different_registers(start, end, scratch);
    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
    case BarrierSet::G1SATBCTLogging:

      {
        __ push_call_clobbered_registers();
        // must compute element count unless barrier set interface is changed (other platforms supply count)
        assert_different_registers(start, end, scratch);
        __ lea(scratch, Address(end, BytesPerHeapOop));
        __ sub(scratch, scratch, start);               // subtract start to get #bytes
        __ lsr(scratch, scratch, LogBytesPerHeapOop);  // convert to element count
        __ mov(c_rarg0, start);
        __ mov(c_rarg1, scratch);
        __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
        __ pop_call_clobbered_registers();
      }
      break;
    case BarrierSet::CardTableForRS:
    case BarrierSet::CardTableExtension:
      {
        CardTableModRefBS* ct = (CardTableModRefBS*)bs;
        assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");

        Label L_loop;

        __ lsr(start, start, CardTableModRefBS::card_shift);
        __ lsr(end, end, CardTableModRefBS::card_shift);
        __ sub(end, end, start); // number of bytes to copy

        const Register count = end; // 'end' register contains bytes count now
        __ load_byte_map_base(scratch);
        __ add(start, start, scratch);
        if (UseConcMarkSweepGC) {
          __ membar(__ StoreStore);
        }
        __ BIND(L_loop);
        __ strb(zr, Address(start, count));
        __ subs(count, count, 1);
        __ br(Assembler::GE, L_loop);
      }
      break;
    default:
      ShouldNotReachHere();

    }
  }

  address generate_zero_longs(Register base, Register cnt) {
    Register tmp = rscratch1;
    Register tmp2 = rscratch2;
    int zva_length = VM_Version::zva_length();
    Label initial_table_end, loop_zva;
    Label fini;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "zero_longs");
    address start = __ pc();

    // Base must be 16 byte aligned. If not just return and let caller handle it
    __ tst(base, 0x0f);
    __ br(Assembler::NE, fini);
    // Align base with ZVA length.
    __ neg(tmp, base);
    __ andr(tmp, tmp, zva_length - 1);

    // tmp: the number of bytes to be filled to align the base with ZVA length.
    __ add(base, base, tmp);
    __ sub(cnt, cnt, tmp, Assembler::ASR, 3);
    __ adr(tmp2, initial_table_end);
    __ sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
    __ br(tmp2);
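    // The computed branch above works because each stp in the table
    // below is a single 4-byte instruction that zeroes 16 bytes: to
    // fill tmp bytes we need tmp / 16 of them, i.e. we enter the
    // table (tmp >> 2) code bytes before initial_table_end.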

    for (int i = -zva_length + 16; i < 0; i += 16)
      __ stp(zr, zr, Address(base, i));
    __ bind(initial_table_end);

    __ sub(cnt, cnt, zva_length >> 3);
    __ bind(loop_zva);
    __ dc(Assembler::ZVA, base);
    __ subs(cnt, cnt, zva_length >> 3);
    __ add(base, base, zva_length);
    __ br(Assembler::GE, loop_zva);
    __ add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
    __ bind(fini);
    __ ret(lr);

    return start;
  }

  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(Label &start, Register s, Register d, Register count,
                           copy_direction direction) {
    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;

    int offset;
    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r10, t6 = r11, t7 = r12;
    const Register stride = r13;

    assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1);

    Label again, drain;
    const char *stub_name;
    if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
    else
      stub_name = "backward_copy_longs";
    StubCodeMark mark(this, "StubRoutines", stub_name);
    __ align(CodeEntryAlignment);
    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, 8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }
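
    // The loop below is software-pipelined: each iteration stores the
    // 8 words loaded on the previous pass and loads the next 8, so a
    // batch is always in flight and the final batch is flushed by the
    // stores at 'drain'.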
    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
        __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
      } else {
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        __ stp(t0, t1, Address(d, 2 * unit));
        __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.
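      //
      // (concretely, forwards: after the -8 byte adjustment of d
      // below, the first str at (d, 1 * unit) with an 8-byte unit
      // lands exactly on the original destination address)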

      if (direction == copy_forwards) {
        __ sub(s, s, 16);
        __ sub(d, d, 8);
      }

      // Fill 8 registers
      //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 bit block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
      //
      // t0 at offset 0,  t1 at offset 8
      // t2 at offset 16, t3 at offset 24
      // t4 at offset 32, t5 at offset 40
      // t6 at offset 48, t7 at offset 56

      // for backwards copy s was not offset so the register contents
      // are at these offsets into the preceding 64 byte block
      // relative to that original input and so on for each successive
      // preceding 64 byte block when s is updated. this explains the
      // slightly counter-intuitive looking pattern of register usage
      // in the stp instructions for backwards copy.
      //
      // t0 at offset -16, t1 at offset -8
      // t2 at offset -32, t3 at offset -24
      // t4 at offset -48, t5 at offset -40
      // t6 at offset -64, t7 at offset -56

      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));

      __ subs(count, count, 16);
      __ br(Assembler::LO, drain);

      int prefetch = PrefetchCopyIntervalInBytes;
      bool use_stride = false;
      if (direction == copy_backwards) {
        use_stride = prefetch > 256;
        prefetch = -prefetch;
        if (use_stride) __ mov(stride, prefetch);
      }

      __ bind(again);

      if (PrefetchCopyIntervalInBytes > 0)
        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
        // allowing for the offset of -8 the store instructions place
        // registers into the target 64 bit block at the following
        // offsets
        //
        // t0 at offset 0
        // t1 at offset 8,  t2 at offset 16
        // t3 at offset 24, t4 at offset 32
        // t5 at offset 40, t6 at offset 48
        // t7 at offset 56

        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
        // d was not offset when we started so the registers are
        // written into the 64 bit block preceding d with the following
        // offsets
        //
        // t1 at offset -8
        // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
        // t7 at offset -56, t4 at offset -48
        // t6 at offset -64
        //
        // note that this matches the offsets previously noted for the
        // loads

        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
      } else {
        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
      }
      // now we need to copy any remaining part block which may
      // include a 4 word block subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ stp(t1, t2, Address(d, 2 * unit));
          __ str(t3, Address(__ pre(d, 4 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ stp(t3, t0, Address(d, 3 * unit));
          __ str(t2, Address(__ pre(d, 4 * unit)));
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ str(t1, Address(__ pre(d, 2 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ str(t0, Address(__ pre(d, 2 * unit)));
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;
    int unit = wordSize * direction;

    Label Lpair, Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.
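    //
    // (the tests below peel off the residue one power of two at a
    // time: with byte granularity, bit 3 of count selects an 8-byte
    // move, bit 2 a 4-byte move, bit 1 a 2-byte move and bit 0 a
    // single byte, which together cover any residue of 0..15 bytes)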

    __ tbz(count, 3 - exact_log2(granularity), Lword);
    __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
    __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }

  Label copy_f, copy_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //

  void copy_memory(bool is_aligned, Register s, Register d,
                   Register count, Register tmp, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    // <= 96 bytes do inline. Direction doesn't matter because we always
    // load all the data before writing anything
    Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
    const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
    const Register send = r17, dend = r18;

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
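    // The inline cases below exploit the fact that the source and
    // destination may overlap: each case loads the first and last
    // chunks of the region before storing anything, so the stores can
    // never clobber data that is still to be read.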
    __ cmp(count, (UseSIMDForMemoryOps ? 96:80)/granularity);
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, 16/granularity);
    __ br(Assembler::LS, copy16);

    __ cmp(count, 64/granularity);
    __ br(Assembler::HI, copy80);

    __ cmp(count, 32/granularity);
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 0));
      __ ldpq(v2, v3, Address(send, -32));
      __ stpq(v0, v1, Address(d, 0));
      __ stpq(v2, v3, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(send, -32));
      __ ldp(t6, t7, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(dend, -32));
      __ stp(t6, t7, Address(dend, -16));
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    __ ldp(t0, t1, Address(s, 0));
    __ ldp(t2, t3, Address(send, -16));
    __ stp(t0, t1, Address(d, 0));
    __ stp(t2, t3, Address(dend, -16));
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
      __ ldpq(v4, v5, Address(send, -32));
      __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
      __ stpq(v4, v5, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(s, 32));
      __ ldp(t6, t7, Address(s, 48));
      __ ldp(t8, t9, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(d, 32));
      __ stp(t6, t7, Address(d, 48));
      __ stp(t8, t9, Address(dend, -16));
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, 8/granularity);
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    __ ldr(t0, Address(s, 0));
    __ ldr(t1, Address(send, -8));
    __ str(t0, Address(d, 0));
    __ str(t1, Address(dend, -8));
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean in the 1 byte case we load/store the same
          // byte 3 times.
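          // (e.g. count == 3: count/2 == 1, so s+0, s+1 and s+2 are
          // all covered -- first byte, middle byte, last byte)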
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }

    __ bind(copy_big);
    if (is_backwards) {
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now we've got the small case out of the way we can align the
    // source address on a 2-word boundary.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
      __ tbz(s, exact_log2(wordSize), aligned);
      __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
      __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
      __ sub(count, count, wordSize/granularity);
    } else {
      if (is_backwards) {
        __ andr(rscratch2, s, 2 * wordSize - 1);
      } else {
        __ neg(rscratch2, s);
        __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
      }
      // rscratch2 is the byte adjustment needed to align s.
      __ cbz(rscratch2, aligned);
      int shift = exact_log2(granularity);
      if (shift)  __ lsr(rscratch2, rscratch2, shift);
      __ sub(count, count, rscratch2);

#if 0
      // ?? This code is only correct for a disjoint copy.  It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, rscratch2);
        __ sub(d, d, rscratch2);
      } else {
        __ add(s, s, rscratch2);
        __ add(d, d, rscratch2);
      }
#else
      copy_memory_small(s, d, rscratch2, rscratch1, step);
#endif
    }

    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes.  Adjust the
    // count and do a bulk copy of words.
    __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
    if (direction == copy_forwards)
      __ bl(copy_f);
    else
      __ bl(copy_b);

    // And the tail.
    copy_memory_small(s, d, count, tmp, step);

    if (granularity >= 8) __ bind(copy8);
    if (granularity >= 4) __ bind(copy4);
    __ bind(finish);
  }


  void clobber_registers() {
#ifdef ASSERT
    __ mov(rscratch1, (uint64_t)0xdeadbeef);
    __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
    for (Register r = r3; r <= r18; r++)
      if (r != rscratch1)  __ mov(r, rscratch1);
#endif
  }

  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers rscratch1 and rscratch2.
  void verify_oop_array (size_t size, Register a, Register count, Register temp) {
    Label loop, end;
    __ mov(rscratch1, a);
    __ mov(rscratch2, zr);
    __ bind(loop);
    __ cmp(rscratch2, count);
    __ br(Assembler::HS, end);
    if (size == (size_t)wordSize) {
      __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ verify_oop(temp);
    } else {
      __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ decode_heap_oop(temp); // calls verify_oop
    }
    __ add(rscratch2, rscratch2, size);
    __ b(loop);
    __ bind(end);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
                                 const char *name, bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    if (is_oop) {
      __ push(RegSet::of(d, count), sp);
      // no registers are destroyed by this call
      gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
    }
    copy_memory(aligned, s, d, count, rscratch1, size);
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
      gen_write_ref_array_post_barrier(d, count, rscratch1);
    }
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
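  //
  // The overlap test below relies on unsigned arithmetic: if d < s,
  // d - s wraps to a huge unsigned value, so the single HS branch
  // also sends that case to the forward (no-overlap) copy, which is
  // safe for any overlap when d is below s.
  //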
  address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
                                 address *entry, const char *name,
                                 bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;

    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // use fwd copy when (d-s) above_equal (count*size)
    __ sub(rscratch1, d, s);
    __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
    __ br(Assembler::HS, nooverlap_target);

    if (is_oop) {
      __ push(RegSet::of(d, count), sp);
      // no registers are destroyed by this call
      gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
    }
    copy_memory(aligned, s, d, count, rscratch1, -size);
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
      gen_write_ref_array_post_barrier(d, count, rscratch1);
    }
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
  //
  address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                      address* entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_short_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_short_copy().
  //
  address generate_disjoint_short_copy(bool aligned,
                                       address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
                                       address *entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_int_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
                                     address *entry, const char *name,
                                     bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
  }


  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_long_copy(bool aligned, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_long_copy(bool aligned,
                                      address nooverlap_target, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_oop_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_oop_copy(bool aligned,
                                     address nooverlap_target, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
                                  name, dest_uninitialized);
  }


  // Helper for generating a dynamic type check.
  // Smashes rscratch1.
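  //
  // Roughly: the fast path compares the word of sub_klass at
  // super_check_offset against super_klass, which covers the primary
  // supers display and the secondary-super cache; only when that is
  // inconclusive does the slow path scan the secondary supers array
  // (see MacroAssembler::check_klass_subtype_fast_path).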
1737 void generate_type_check(Register sub_klass,
1738 Register super_check_offset,
1739 Register super_klass,
1740 Label& L_success) {
1741 assert_different_registers(sub_klass, super_check_offset, super_klass);
1742
1743 BLOCK_COMMENT("type_check:");
1744
1745 Label L_miss;
1746
1747 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL,
1748 super_check_offset);
1749 __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);
1750
1751 // Fall through on failure!
1752 __ BIND(L_miss);
1753 }
1754
1755 //
1756 // Generate checkcasting array copy stub
1757 //
1758 // Input:
1759 // c_rarg0 - source array address
1760 // c_rarg1 - destination array address
1761 // c_rarg2 - element count, treated as ssize_t, can be zero
1762 // c_rarg3 - size_t ckoff (super_check_offset)
1763 // c_rarg4 - oop ckval (super_klass)
1764 //
1765 // Output:
1766 // r0 == 0 - success
1767 // r0 == -1^K - failure, where K is partial transfer count
1768 //
1769 address generate_checkcast_copy(const char *name, address *entry,
1770 bool dest_uninitialized = false) {
1771
1772 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;
1773
1774 // Input registers (after setup_arg_regs)
1775 const Register from = c_rarg0; // source array address
1776 const Register to = c_rarg1; // destination array address
1777 const Register count = c_rarg2; // elements count
1778 const Register ckoff = c_rarg3; // super_check_offset
1779 const Register ckval = c_rarg4; // super_klass
1780
1781 // Registers used as temps (r18, r19, r20 are save-on-entry)
1782 const Register count_save = r21; // orig elements count
1783 const Register start_to = r20; // destination array start address
1784 const Register copied_oop = r18; // actual oop copied
1785 const Register r19_klass = r19; // oop._klass
1786
1787 //---------------------------------------------------------------
1788 // Assembler stub will be used for this call to arraycopy
1789 // if the two arrays are subtypes of Object[] but the
1790 // destination array type is not equal to or a supertype
1791 // of the source type. Each element must be separately
1792 // checked.
1793
1794 assert_different_registers(from, to, count, ckoff, ckval, start_to,
1795 copied_oop, r19_klass, count_save);
1796
1797 __ align(CodeEntryAlignment);
1798 StubCodeMark mark(this, "StubRoutines", name);
1799 address start = __ pc();
1800
1801 __ enter(); // required for proper stackwalking of RuntimeStub frame
1802
1803 #ifdef ASSERT
1804 // caller guarantees that the arrays really are different
1805 // otherwise, we would have to make conjoint checks
1806 { Label L;
1807 array_overlap_test(L, TIMES_OOP);
1808 __ stop("checkcast_copy within a single array");
1809 __ bind(L);
1810 }
1811 #endif //ASSERT
1812
1813 // Caller of this entry point must set up the argument registers.
1814 if (entry != NULL) {
1815 *entry = __ pc();
1816 BLOCK_COMMENT("Entry:");
1817 }
1818
1819 // Empty array: Nothing to do.
1820 __ cbz(count, L_done);
1821
1822 __ push(RegSet::of(r18, r19, r20, r21), sp);
1823
1824 #ifdef ASSERT
1825 BLOCK_COMMENT("assert consistent ckoff/ckval");
1826 // The ckoff and ckval must be mutually consistent,
1827 // even though caller generates both.
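// ("Consistent" here means ckoff == ckval->super_check_offset(); the
//  assert below reloads that offset from ckval and compares it with
//  the ckoff argument.)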
1828 { Label L;
1829 int sco_offset = in_bytes(Klass::super_check_offset_offset());
1830 __ ldrw(start_to, Address(ckval, sco_offset));
1831 __ cmpw(ckoff, start_to);
1832 __ br(Assembler::EQ, L);
1833 __ stop("super_check_offset inconsistent");
1834 __ bind(L);
1835 }
1836 #endif //ASSERT
1837
1838 gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);
1839
1840 // save the original count
1841 __ mov(count_save, count);
1842
1843 // Copy from low to high addresses
1844 __ mov(start_to, to); // Save destination array start address
1845 __ b(L_load_element);
1846
1847 // ======== begin loop ========
1848 // (Loop is rotated; its entry is L_load_element.)
1849 // Loop control:
1850 // for (; count != 0; count--) {
1851 // copied_oop = load_heap_oop(from++);
1852 // ... generate_type_check ...;
1853 // store_heap_oop(to++, copied_oop);
1854 // }
1855 __ align(OptoLoopAlignment);
1856
1857 __ BIND(L_store_element);
1858 __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop); // store the oop
1859 __ sub(count, count, 1);
1860 __ cbz(count, L_do_card_marks);
1861
1862 // ======== loop entry is here ========
1863 __ BIND(L_load_element);
1864 __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop
1865 __ cbz(copied_oop, L_store_element);
1866
1867 __ load_klass(r19_klass, copied_oop);// query the object klass
1868 generate_type_check(r19_klass, ckoff, ckval, L_store_element);
1869 // ======== end loop ========
1870
1871 // It was a real error; we must depend on the caller to finish the job.
1872 // Register count = remaining oops, count_save = total oops.
1873 // Emit GC store barriers for the oops we have copied and report
1874 // their number to the caller.
1875
1876 __ subs(count, count_save, count); // K = partially copied oop count
1877 __ eon(count, count, zr); // report (-1^K) to caller
1878 __ br(Assembler::EQ, L_done_pop);
1879
1880 __ BIND(L_do_card_marks);
1881 __ add(to, to, -heapOopSize); // make an inclusive end pointer
1882 gen_write_ref_array_post_barrier(start_to, to, rscratch1);
1883
1884 __ bind(L_done_pop);
1885 __ pop(RegSet::of(r18, r19, r20, r21), sp);
1886 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1887
1888 __ bind(L_done);
1889 __ mov(r0, count);
1890 __ leave();
1891 __ ret(lr);
1892
1893 return start;
1894 }
1895
1896 // Perform range checks on the proposed arraycopy.
1897 // Kills temp, but nothing else.
1898 // Also, clean the sign bits of src_pos and dst_pos.
1899 void arraycopy_range_checks(Register src, // source array oop (c_rarg0)
1900 Register src_pos, // source position (c_rarg1)
1901 Register dst, // destination array oop (c_rarg2)
1902 Register dst_pos, // destination position (c_rarg3)
1903 Register length,
1904 Register temp,
1905 Label& L_failed) {
1906 BLOCK_COMMENT("arraycopy_range_checks:");
1907
1908 assert_different_registers(rscratch1, temp);
1909
1910 // if (src_pos + length > arrayOop(src)->length()) FAIL;
1911 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1912 __ addw(temp, length, src_pos);
1913 __ cmpw(temp, rscratch1);
1914 __ br(Assembler::HI, L_failed);
1915
1916 // if (dst_pos + length > arrayOop(dst)->length()) FAIL;
1917 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1918 __ addw(temp, length, dst_pos);
1919 __ cmpw(temp, rscratch1);
1920 __ br(Assembler::HI, L_failed);
1921
1922 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
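// (A 32-bit movw of a register to itself is the idiomatic AArch64
//  zero-extension: writing the w-register zeroes the upper 32 bits
//  of the corresponding x-register.)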
1923 __ movw(src_pos, src_pos); 1924 __ movw(dst_pos, dst_pos); 1925 1926 BLOCK_COMMENT("arraycopy_range_checks done"); 1927 } 1928 1929 // These stubs get called from some dumb test routine. 1930 // I'll write them properly when they're called from 1931 // something that's actually doing something. 1932 static void fake_arraycopy_stub(address src, address dst, int count) { 1933 assert(count == 0, "huh?"); 1934 } 1935 1936 1937 // 1938 // Generate 'unsafe' array copy stub 1939 // Though just as safe as the other stubs, it takes an unscaled 1940 // size_t argument instead of an element count. 1941 // 1942 // Input: 1943 // c_rarg0 - source array address 1944 // c_rarg1 - destination array address 1945 // c_rarg2 - byte count, treated as ssize_t, can be zero 1946 // 1947 // Examines the alignment of the operands and dispatches 1948 // to a long, int, short, or byte copy loop. 1949 // 1950 address generate_unsafe_copy(const char *name, 1951 address byte_copy_entry, 1952 address short_copy_entry, 1953 address int_copy_entry, 1954 address long_copy_entry) { 1955 Label L_long_aligned, L_int_aligned, L_short_aligned; 1956 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1957 1958 __ align(CodeEntryAlignment); 1959 StubCodeMark mark(this, "StubRoutines", name); 1960 address start = __ pc(); 1961 __ enter(); // required for proper stackwalking of RuntimeStub frame 1962 1963 // bump this on entry, not on exit: 1964 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 1965 1966 __ orr(rscratch1, s, d); 1967 __ orr(rscratch1, rscratch1, count); 1968 1969 __ andr(rscratch1, rscratch1, BytesPerLong-1); 1970 __ cbz(rscratch1, L_long_aligned); 1971 __ andr(rscratch1, rscratch1, BytesPerInt-1); 1972 __ cbz(rscratch1, L_int_aligned); 1973 __ tbz(rscratch1, 0, L_short_aligned); 1974 __ b(RuntimeAddress(byte_copy_entry)); 1975 1976 __ BIND(L_short_aligned); 1977 __ lsr(count, count, LogBytesPerShort); // size => short_count 1978 __ b(RuntimeAddress(short_copy_entry)); 1979 __ BIND(L_int_aligned); 1980 __ lsr(count, count, LogBytesPerInt); // size => int_count 1981 __ b(RuntimeAddress(int_copy_entry)); 1982 __ BIND(L_long_aligned); 1983 __ lsr(count, count, LogBytesPerLong); // size => long_count 1984 __ b(RuntimeAddress(long_copy_entry)); 1985 1986 return start; 1987 } 1988 1989 // 1990 // Generate generic array copy stubs 1991 // 1992 // Input: 1993 // c_rarg0 - src oop 1994 // c_rarg1 - src_pos (32-bits) 1995 // c_rarg2 - dst oop 1996 // c_rarg3 - dst_pos (32-bits) 1997 // c_rarg4 - element count (32-bits) 1998 // 1999 // Output: 2000 // r0 == 0 - success 2001 // r0 == -1^K - failure, where K is partial transfer count 2002 // 2003 address generate_generic_copy(const char *name, 2004 address byte_copy_entry, address short_copy_entry, 2005 address int_copy_entry, address oop_copy_entry, 2006 address long_copy_entry, address checkcast_copy_entry) { 2007 2008 Label L_failed, L_failed_0, L_objArray; 2009 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 2010 2011 // Input registers 2012 const Register src = c_rarg0; // source array oop 2013 const Register src_pos = c_rarg1; // source position 2014 const Register dst = c_rarg2; // destination array oop 2015 const Register dst_pos = c_rarg3; // destination position 2016 const Register length = c_rarg4; 2017 2018 StubCodeMark mark(this, "StubRoutines", name); 2019 2020 __ align(CodeEntryAlignment); 2021 address start = __ pc(); 2022 2023 __ enter(); // required for proper stackwalking of RuntimeStub frame 2024 2025 // bump this on entry, not on 
exit: 2026 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 2027 2028 //----------------------------------------------------------------------- 2029 // Assembler stub will be used for this call to arraycopy 2030 // if the following conditions are met: 2031 // 2032 // (1) src and dst must not be null. 2033 // (2) src_pos must not be negative. 2034 // (3) dst_pos must not be negative. 2035 // (4) length must not be negative. 2036 // (5) src klass and dst klass should be the same and not NULL. 2037 // (6) src and dst should be arrays. 2038 // (7) src_pos + length must not exceed length of src. 2039 // (8) dst_pos + length must not exceed length of dst. 2040 // 2041 2042 // if (src == NULL) return -1; 2043 __ cbz(src, L_failed); 2044 2045 // if (src_pos < 0) return -1; 2046 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 2047 2048 // if (dst == NULL) return -1; 2049 __ cbz(dst, L_failed); 2050 2051 // if (dst_pos < 0) return -1; 2052 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2053 2054 // registers used as temp 2055 const Register scratch_length = r16; // elements count to copy 2056 const Register scratch_src_klass = r17; // array klass 2057 const Register lh = r18; // layout helper 2058 2059 // if (length < 0) return -1; 2060 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2061 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2062 2063 __ load_klass(scratch_src_klass, src); 2064 #ifdef ASSERT 2065 // assert(src->klass() != NULL); 2066 { 2067 BLOCK_COMMENT("assert klasses not null {"); 2068 Label L1, L2; 2069 __ cbnz(scratch_src_klass, L2); // it is broken if klass is NULL 2070 __ bind(L1); 2071 __ stop("broken null klass"); 2072 __ bind(L2); 2073 __ load_klass(rscratch1, dst); 2074 __ cbz(rscratch1, L1); // this would be broken also 2075 BLOCK_COMMENT("} assert klasses not null done"); 2076 } 2077 #endif 2078 2079 // Load layout helper (32-bits) 2080 // 2081 // |array_tag| | header_size | element_type | |log2_element_size| 2082 // 32 30 24 16 8 2 0 2083 // 2084 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2085 // 2086 2087 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2088 2089 // Handle objArrays completely differently... 2090 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2091 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2092 __ movw(rscratch1, objArray_lh); 2093 __ eorw(rscratch2, lh, rscratch1); 2094 __ cbzw(rscratch2, L_objArray); 2095 2096 // if (src->klass() != dst->klass()) return -1; 2097 __ load_klass(rscratch2, dst); 2098 __ eor(rscratch2, rscratch2, scratch_src_klass); 2099 __ cbnz(rscratch2, L_failed); 2100 2101 // if (!src->is_Array()) return -1; 2102 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2103 2104 // At this point, it is known to be a typeArray (array_tag 0x3). 
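// (All objArrays share one layout helper value, and that case was
//  dispatched to L_objArray above; since the sign bit of lh is set,
//  this is some kind of array, and a typeArray is the only remaining
//  possibility. For a jint array, for example, lh encodes array_tag
//  0x3, the header size, T_INT, and log2_element_size == 2.)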
2105 #ifdef ASSERT 2106 { 2107 BLOCK_COMMENT("assert primitive array {"); 2108 Label L; 2109 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2110 __ cmpw(lh, rscratch2); 2111 __ br(Assembler::GE, L); 2112 __ stop("must be a primitive array"); 2113 __ bind(L); 2114 BLOCK_COMMENT("} assert primitive array done"); 2115 } 2116 #endif 2117 2118 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2119 rscratch2, L_failed); 2120 2121 // TypeArrayKlass 2122 // 2123 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2124 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2125 // 2126 2127 const Register rscratch1_offset = rscratch1; // array offset 2128 const Register r18_elsize = lh; // element size 2129 2130 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2131 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2132 __ add(src, src, rscratch1_offset); // src array offset 2133 __ add(dst, dst, rscratch1_offset); // dst array offset 2134 BLOCK_COMMENT("choose copy loop based on element size"); 2135 2136 // next registers should be set before the jump to corresponding stub 2137 const Register from = c_rarg0; // source array address 2138 const Register to = c_rarg1; // destination array address 2139 const Register count = c_rarg2; // elements count 2140 2141 // 'from', 'to', 'count' registers should be set in such order 2142 // since they are the same as 'src', 'src_pos', 'dst'. 2143 2144 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2145 2146 // The possible values of elsize are 0-3, i.e. exact_log2(element 2147 // size in bytes). We do a simple bitwise binary search. 2148 __ BIND(L_copy_bytes); 2149 __ tbnz(r18_elsize, 1, L_copy_ints); 2150 __ tbnz(r18_elsize, 0, L_copy_shorts); 2151 __ lea(from, Address(src, src_pos));// src_addr 2152 __ lea(to, Address(dst, dst_pos));// dst_addr 2153 __ movw(count, scratch_length); // length 2154 __ b(RuntimeAddress(byte_copy_entry)); 2155 2156 __ BIND(L_copy_shorts); 2157 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2158 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2159 __ movw(count, scratch_length); // length 2160 __ b(RuntimeAddress(short_copy_entry)); 2161 2162 __ BIND(L_copy_ints); 2163 __ tbnz(r18_elsize, 0, L_copy_longs); 2164 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2165 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2166 __ movw(count, scratch_length); // length 2167 __ b(RuntimeAddress(int_copy_entry)); 2168 2169 __ BIND(L_copy_longs); 2170 #ifdef ASSERT 2171 { 2172 BLOCK_COMMENT("assert long copy {"); 2173 Label L; 2174 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize 2175 __ cmpw(r18_elsize, LogBytesPerLong); 2176 __ br(Assembler::EQ, L); 2177 __ stop("must be long copy, but elsize is wrong"); 2178 __ bind(L); 2179 BLOCK_COMMENT("} assert long copy done"); 2180 } 2181 #endif 2182 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2183 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2184 __ movw(count, scratch_length); // length 2185 __ b(RuntimeAddress(long_copy_entry)); 2186 2187 // ObjArrayKlass 2188 __ BIND(L_objArray); 2189 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2190 2191 Label L_plain_copy, L_checkcast_copy; 2192 // test array classes for subtyping 2193 __ load_klass(r18, dst); 2194 __ cmp(scratch_src_klass, r18); // usual case is exact 
equality 2195 __ br(Assembler::NE, L_checkcast_copy); 2196 2197 // Identically typed arrays can be copied without element-wise checks. 2198 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2199 rscratch2, L_failed); 2200 2201 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2202 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2203 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2204 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2205 __ movw(count, scratch_length); // length 2206 __ BIND(L_plain_copy); 2207 __ b(RuntimeAddress(oop_copy_entry)); 2208 2209 __ BIND(L_checkcast_copy); 2210 // live at this point: scratch_src_klass, scratch_length, r18 (dst_klass) 2211 { 2212 // Before looking at dst.length, make sure dst is also an objArray. 2213 __ ldrw(rscratch1, Address(r18, lh_offset)); 2214 __ movw(rscratch2, objArray_lh); 2215 __ eorw(rscratch1, rscratch1, rscratch2); 2216 __ cbnzw(rscratch1, L_failed); 2217 2218 // It is safe to examine both src.length and dst.length. 2219 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2220 r18, L_failed); 2221 2222 const Register rscratch2_dst_klass = rscratch2; 2223 __ load_klass(rscratch2_dst_klass, dst); // reload 2224 2225 // Marshal the base address arguments now, freeing registers. 2226 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2227 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2228 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2229 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2230 __ movw(count, length); // length (reloaded) 2231 Register sco_temp = c_rarg3; // this register is free now 2232 assert_different_registers(from, to, count, sco_temp, 2233 rscratch2_dst_klass, scratch_src_klass); 2234 // assert_clean_int(count, sco_temp); 2235 2236 // Generate the type check. 2237 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2238 __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset)); 2239 // assert_clean_int(sco_temp, r18); 2240 generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy); 2241 2242 // Fetch destination element klass from the ObjArrayKlass header. 2243 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2244 __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset)); 2245 __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset)); 2246 2247 // the checkcast_copy loop needs two extra arguments: 2248 assert(c_rarg3 == sco_temp, "#3 already in place"); 2249 // Set up arguments for checkcast_copy_entry. 2250 __ mov(c_rarg4, rscratch2_dst_klass); // dst.klass.element_klass 2251 __ b(RuntimeAddress(checkcast_copy_entry)); 2252 } 2253 2254 __ BIND(L_failed); 2255 __ mov(r0, -1); 2256 __ leave(); // required for proper stackwalking of RuntimeStub frame 2257 __ ret(lr); 2258 2259 return start; 2260 } 2261 2262 // 2263 // Generate stub for array fill. If "aligned" is true, the 2264 // "to" address is assumed to be heapword aligned. 
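// The fill value is replicated up to 64 bits (8->16->32 bits via bfi
// below, then 32->64 in the large-chunk code) so that the bulk of the
// fill can be written a word at a time; alignment prologues and short
// trailing runs are filled element by element.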
2265 // 2266 // Arguments for generated stub: 2267 // to: c_rarg0 2268 // value: c_rarg1 2269 // count: c_rarg2 treated as signed 2270 // 2271 address generate_fill(BasicType t, bool aligned, const char *name) { 2272 __ align(CodeEntryAlignment); 2273 StubCodeMark mark(this, "StubRoutines", name); 2274 address start = __ pc(); 2275 2276 BLOCK_COMMENT("Entry:"); 2277 2278 const Register to = c_rarg0; // source array address 2279 const Register value = c_rarg1; // value 2280 const Register count = c_rarg2; // elements count 2281 2282 const Register bz_base = r10; // base for block_zero routine 2283 const Register cnt_words = r11; // temp register 2284 2285 __ enter(); 2286 2287 Label L_fill_elements, L_exit1; 2288 2289 int shift = -1; 2290 switch (t) { 2291 case T_BYTE: 2292 shift = 0; 2293 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2294 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2295 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2296 __ br(Assembler::LO, L_fill_elements); 2297 break; 2298 case T_SHORT: 2299 shift = 1; 2300 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2301 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2302 __ br(Assembler::LO, L_fill_elements); 2303 break; 2304 case T_INT: 2305 shift = 2; 2306 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2307 __ br(Assembler::LO, L_fill_elements); 2308 break; 2309 default: ShouldNotReachHere(); 2310 } 2311 2312 // Align source address at 8 bytes address boundary. 2313 Label L_skip_align1, L_skip_align2, L_skip_align4; 2314 if (!aligned) { 2315 switch (t) { 2316 case T_BYTE: 2317 // One byte misalignment happens only for byte arrays. 2318 __ tbz(to, 0, L_skip_align1); 2319 __ strb(value, Address(__ post(to, 1))); 2320 __ subw(count, count, 1); 2321 __ bind(L_skip_align1); 2322 // Fallthrough 2323 case T_SHORT: 2324 // Two bytes misalignment happens only for byte and short (char) arrays. 2325 __ tbz(to, 1, L_skip_align2); 2326 __ strh(value, Address(__ post(to, 2))); 2327 __ subw(count, count, 2 >> shift); 2328 __ bind(L_skip_align2); 2329 // Fallthrough 2330 case T_INT: 2331 // Align to 8 bytes, we know we are 4 byte aligned to start. 2332 __ tbz(to, 2, L_skip_align4); 2333 __ strw(value, Address(__ post(to, 4))); 2334 __ subw(count, count, 4 >> shift); 2335 __ bind(L_skip_align4); 2336 break; 2337 default: ShouldNotReachHere(); 2338 } 2339 } 2340 2341 // 2342 // Fill large chunks 2343 // 2344 __ lsrw(cnt_words, count, 3 - shift); // number of words 2345 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2346 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2347 if (UseBlockZeroing) { 2348 Label non_block_zeroing, rest; 2349 Register tmp = rscratch1; 2350 // count >= BlockZeroingLowLimit && value == 0 2351 __ subs(tmp, cnt_words, BlockZeroingLowLimit >> 3); 2352 __ ccmp(value, 0 /* comparing value */, 0 /* NZCV */, Assembler::GE); 2353 __ br(Assembler::NE, non_block_zeroing); 2354 __ mov(bz_base, to); 2355 __ block_zero(bz_base, cnt_words, true); 2356 __ mov(to, bz_base); 2357 __ b(rest); 2358 __ bind(non_block_zeroing); 2359 __ fill_words(to, cnt_words, value); 2360 __ bind(rest); 2361 } 2362 else { 2363 __ fill_words(to, cnt_words, value); 2364 } 2365 2366 // Remaining count is less than 8 bytes. Fill it by a single store. 2367 // Note that the total length is no less than 8 bytes. 
2368 if (t == T_BYTE || t == T_SHORT) { 2369 Label L_exit1; 2370 __ cbzw(count, L_exit1); 2371 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2372 __ str(value, Address(to, -8)); // overwrite some elements 2373 __ bind(L_exit1); 2374 __ leave(); 2375 __ ret(lr); 2376 } 2377 2378 // Handle copies less than 8 bytes. 2379 Label L_fill_2, L_fill_4, L_exit2; 2380 __ bind(L_fill_elements); 2381 switch (t) { 2382 case T_BYTE: 2383 __ tbz(count, 0, L_fill_2); 2384 __ strb(value, Address(__ post(to, 1))); 2385 __ bind(L_fill_2); 2386 __ tbz(count, 1, L_fill_4); 2387 __ strh(value, Address(__ post(to, 2))); 2388 __ bind(L_fill_4); 2389 __ tbz(count, 2, L_exit2); 2390 __ strw(value, Address(to)); 2391 break; 2392 case T_SHORT: 2393 __ tbz(count, 0, L_fill_4); 2394 __ strh(value, Address(__ post(to, 2))); 2395 __ bind(L_fill_4); 2396 __ tbz(count, 1, L_exit2); 2397 __ strw(value, Address(to)); 2398 break; 2399 case T_INT: 2400 __ cbzw(count, L_exit2); 2401 __ strw(value, Address(to)); 2402 break; 2403 default: ShouldNotReachHere(); 2404 } 2405 __ bind(L_exit2); 2406 __ leave(); 2407 __ ret(lr); 2408 return start; 2409 } 2410 2411 void generate_arraycopy_stubs() { 2412 address entry; 2413 address entry_jbyte_arraycopy; 2414 address entry_jshort_arraycopy; 2415 address entry_jint_arraycopy; 2416 address entry_oop_arraycopy; 2417 address entry_jlong_arraycopy; 2418 address entry_checkcast_arraycopy; 2419 2420 generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards); 2421 generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards); 2422 2423 StubRoutines::aarch64::_zero_longs = generate_zero_longs(r10, r11); 2424 2425 //*** jbyte 2426 // Always need aligned and unaligned versions 2427 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2428 "jbyte_disjoint_arraycopy"); 2429 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2430 &entry_jbyte_arraycopy, 2431 "jbyte_arraycopy"); 2432 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2433 "arrayof_jbyte_disjoint_arraycopy"); 2434 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL, 2435 "arrayof_jbyte_arraycopy"); 2436 2437 //*** jshort 2438 // Always need aligned and unaligned versions 2439 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2440 "jshort_disjoint_arraycopy"); 2441 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2442 &entry_jshort_arraycopy, 2443 "jshort_arraycopy"); 2444 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, 2445 "arrayof_jshort_disjoint_arraycopy"); 2446 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL, 2447 "arrayof_jshort_arraycopy"); 2448 2449 //*** jint 2450 // Aligned versions 2451 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2452 "arrayof_jint_disjoint_arraycopy"); 2453 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2454 "arrayof_jint_arraycopy"); 2455 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 
2456 // entry_jint_arraycopy always points to the unaligned version 2457 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2458 "jint_disjoint_arraycopy"); 2459 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2460 &entry_jint_arraycopy, 2461 "jint_arraycopy"); 2462 2463 //*** jlong 2464 // It is always aligned 2465 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2466 "arrayof_jlong_disjoint_arraycopy"); 2467 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2468 "arrayof_jlong_arraycopy"); 2469 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2470 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2471 2472 //*** oops 2473 { 2474 // With compressed oops we need unaligned versions; notice that 2475 // we overwrite entry_oop_arraycopy. 2476 bool aligned = !UseCompressedOops; 2477 2478 StubRoutines::_arrayof_oop_disjoint_arraycopy 2479 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2480 /*dest_uninitialized*/false); 2481 StubRoutines::_arrayof_oop_arraycopy 2482 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2483 /*dest_uninitialized*/false); 2484 // Aligned versions without pre-barriers 2485 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2486 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2487 /*dest_uninitialized*/true); 2488 StubRoutines::_arrayof_oop_arraycopy_uninit 2489 = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit", 2490 /*dest_uninitialized*/true); 2491 } 2492 2493 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2494 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2495 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2496 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2497 2498 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2499 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, 2500 /*dest_uninitialized*/true); 2501 2502 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2503 entry_jbyte_arraycopy, 2504 entry_jshort_arraycopy, 2505 entry_jint_arraycopy, 2506 entry_jlong_arraycopy); 2507 2508 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2509 entry_jbyte_arraycopy, 2510 entry_jshort_arraycopy, 2511 entry_jint_arraycopy, 2512 entry_oop_arraycopy, 2513 entry_jlong_arraycopy, 2514 entry_checkcast_arraycopy); 2515 2516 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 2517 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2518 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2519 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2520 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 2521 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2522 } 2523 2524 void generate_math_stubs() { Unimplemented(); } 2525 2526 // Arguments: 2527 // 2528 // Inputs: 2529 // c_rarg0 - source byte array address 2530 // c_rarg1 - destination 
byte array address 2531 // c_rarg2 - K (key) in little endian int array 2532 // 2533 address generate_aescrypt_encryptBlock() { 2534 __ align(CodeEntryAlignment); 2535 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 2536 2537 Label L_doLast; 2538 2539 const Register from = c_rarg0; // source array address 2540 const Register to = c_rarg1; // destination array address 2541 const Register key = c_rarg2; // key array address 2542 const Register keylen = rscratch1; 2543 2544 address start = __ pc(); 2545 __ enter(); 2546 2547 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2548 2549 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2550 2551 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2552 __ rev32(v1, __ T16B, v1); 2553 __ rev32(v2, __ T16B, v2); 2554 __ rev32(v3, __ T16B, v3); 2555 __ rev32(v4, __ T16B, v4); 2556 __ aese(v0, v1); 2557 __ aesmc(v0, v0); 2558 __ aese(v0, v2); 2559 __ aesmc(v0, v0); 2560 __ aese(v0, v3); 2561 __ aesmc(v0, v0); 2562 __ aese(v0, v4); 2563 __ aesmc(v0, v0); 2564 2565 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2566 __ rev32(v1, __ T16B, v1); 2567 __ rev32(v2, __ T16B, v2); 2568 __ rev32(v3, __ T16B, v3); 2569 __ rev32(v4, __ T16B, v4); 2570 __ aese(v0, v1); 2571 __ aesmc(v0, v0); 2572 __ aese(v0, v2); 2573 __ aesmc(v0, v0); 2574 __ aese(v0, v3); 2575 __ aesmc(v0, v0); 2576 __ aese(v0, v4); 2577 __ aesmc(v0, v0); 2578 2579 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2580 __ rev32(v1, __ T16B, v1); 2581 __ rev32(v2, __ T16B, v2); 2582 2583 __ cmpw(keylen, 44); 2584 __ br(Assembler::EQ, L_doLast); 2585 2586 __ aese(v0, v1); 2587 __ aesmc(v0, v0); 2588 __ aese(v0, v2); 2589 __ aesmc(v0, v0); 2590 2591 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2592 __ rev32(v1, __ T16B, v1); 2593 __ rev32(v2, __ T16B, v2); 2594 2595 __ cmpw(keylen, 52); 2596 __ br(Assembler::EQ, L_doLast); 2597 2598 __ aese(v0, v1); 2599 __ aesmc(v0, v0); 2600 __ aese(v0, v2); 2601 __ aesmc(v0, v0); 2602 2603 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2604 __ rev32(v1, __ T16B, v1); 2605 __ rev32(v2, __ T16B, v2); 2606 2607 __ BIND(L_doLast); 2608 2609 __ aese(v0, v1); 2610 __ aesmc(v0, v0); 2611 __ aese(v0, v2); 2612 2613 __ ld1(v1, __ T16B, key); 2614 __ rev32(v1, __ T16B, v1); 2615 __ eor(v0, __ T16B, v0, v1); 2616 2617 __ st1(v0, __ T16B, to); 2618 2619 __ mov(r0, 0); 2620 2621 __ leave(); 2622 __ ret(lr); 2623 2624 return start; 2625 } 2626 2627 // Arguments: 2628 // 2629 // Inputs: 2630 // c_rarg0 - source byte array address 2631 // c_rarg1 - destination byte array address 2632 // c_rarg2 - K (key) in little endian int array 2633 // 2634 address generate_aescrypt_decryptBlock() { 2635 assert(UseAES, "need AES instructions and misaligned SSE support"); 2636 __ align(CodeEntryAlignment); 2637 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 2638 Label L_doLast; 2639 2640 const Register from = c_rarg0; // source array address 2641 const Register to = c_rarg1; // destination array address 2642 const Register key = c_rarg2; // key array address 2643 const Register keylen = rscratch1; 2644 2645 address start = __ pc(); 2646 __ enter(); // required for proper stackwalking of RuntimeStub frame 2647 2648 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2649 2650 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2651 2652 __ ld1(v5, __ T16B, __ post(key, 16)); 2653 __ rev32(v5, __ T16B, v5); 2654 2655 __ ld1(v1, v2, v3, v4, 
__ T16B, __ post(key, 64)); 2656 __ rev32(v1, __ T16B, v1); 2657 __ rev32(v2, __ T16B, v2); 2658 __ rev32(v3, __ T16B, v3); 2659 __ rev32(v4, __ T16B, v4); 2660 __ aesd(v0, v1); 2661 __ aesimc(v0, v0); 2662 __ aesd(v0, v2); 2663 __ aesimc(v0, v0); 2664 __ aesd(v0, v3); 2665 __ aesimc(v0, v0); 2666 __ aesd(v0, v4); 2667 __ aesimc(v0, v0); 2668 2669 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2670 __ rev32(v1, __ T16B, v1); 2671 __ rev32(v2, __ T16B, v2); 2672 __ rev32(v3, __ T16B, v3); 2673 __ rev32(v4, __ T16B, v4); 2674 __ aesd(v0, v1); 2675 __ aesimc(v0, v0); 2676 __ aesd(v0, v2); 2677 __ aesimc(v0, v0); 2678 __ aesd(v0, v3); 2679 __ aesimc(v0, v0); 2680 __ aesd(v0, v4); 2681 __ aesimc(v0, v0); 2682 2683 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2684 __ rev32(v1, __ T16B, v1); 2685 __ rev32(v2, __ T16B, v2); 2686 2687 __ cmpw(keylen, 44); 2688 __ br(Assembler::EQ, L_doLast); 2689 2690 __ aesd(v0, v1); 2691 __ aesimc(v0, v0); 2692 __ aesd(v0, v2); 2693 __ aesimc(v0, v0); 2694 2695 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2696 __ rev32(v1, __ T16B, v1); 2697 __ rev32(v2, __ T16B, v2); 2698 2699 __ cmpw(keylen, 52); 2700 __ br(Assembler::EQ, L_doLast); 2701 2702 __ aesd(v0, v1); 2703 __ aesimc(v0, v0); 2704 __ aesd(v0, v2); 2705 __ aesimc(v0, v0); 2706 2707 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2708 __ rev32(v1, __ T16B, v1); 2709 __ rev32(v2, __ T16B, v2); 2710 2711 __ BIND(L_doLast); 2712 2713 __ aesd(v0, v1); 2714 __ aesimc(v0, v0); 2715 __ aesd(v0, v2); 2716 2717 __ eor(v0, __ T16B, v0, v5); 2718 2719 __ st1(v0, __ T16B, to); 2720 2721 __ mov(r0, 0); 2722 2723 __ leave(); 2724 __ ret(lr); 2725 2726 return start; 2727 } 2728 2729 // Arguments: 2730 // 2731 // Inputs: 2732 // c_rarg0 - source byte array address 2733 // c_rarg1 - destination byte array address 2734 // c_rarg2 - K (key) in little endian int array 2735 // c_rarg3 - r vector byte array address 2736 // c_rarg4 - input length 2737 // 2738 // Output: 2739 // x0 - input length 2740 // 2741 address generate_cipherBlockChaining_encryptAESCrypt() { 2742 assert(UseAES, "need AES instructions and misaligned SSE support"); 2743 __ align(CodeEntryAlignment); 2744 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 2745 2746 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2747 2748 const Register from = c_rarg0; // source array address 2749 const Register to = c_rarg1; // destination array address 2750 const Register key = c_rarg2; // key array address 2751 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2752 // and left with the results of the last encryption block 2753 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2754 const Register keylen = rscratch1; 2755 2756 address start = __ pc(); 2757 2758 __ enter(); 2759 2760 __ movw(rscratch2, len_reg); 2761 2762 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2763 2764 __ ld1(v0, __ T16B, rvec); 2765 2766 __ cmpw(keylen, 52); 2767 __ br(Assembler::CC, L_loadkeys_44); 2768 __ br(Assembler::EQ, L_loadkeys_52); 2769 2770 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2771 __ rev32(v17, __ T16B, v17); 2772 __ rev32(v18, __ T16B, v18); 2773 __ BIND(L_loadkeys_52); 2774 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2775 __ rev32(v19, __ T16B, v19); 2776 __ rev32(v20, __ T16B, v20); 2777 __ BIND(L_loadkeys_44); 2778 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2779 __ rev32(v21, __ 
T16B, v21); 2780 __ rev32(v22, __ T16B, v22); 2781 __ rev32(v23, __ T16B, v23); 2782 __ rev32(v24, __ T16B, v24); 2783 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2784 __ rev32(v25, __ T16B, v25); 2785 __ rev32(v26, __ T16B, v26); 2786 __ rev32(v27, __ T16B, v27); 2787 __ rev32(v28, __ T16B, v28); 2788 __ ld1(v29, v30, v31, __ T16B, key); 2789 __ rev32(v29, __ T16B, v29); 2790 __ rev32(v30, __ T16B, v30); 2791 __ rev32(v31, __ T16B, v31); 2792 2793 __ BIND(L_aes_loop); 2794 __ ld1(v1, __ T16B, __ post(from, 16)); 2795 __ eor(v0, __ T16B, v0, v1); 2796 2797 __ br(Assembler::CC, L_rounds_44); 2798 __ br(Assembler::EQ, L_rounds_52); 2799 2800 __ aese(v0, v17); __ aesmc(v0, v0); 2801 __ aese(v0, v18); __ aesmc(v0, v0); 2802 __ BIND(L_rounds_52); 2803 __ aese(v0, v19); __ aesmc(v0, v0); 2804 __ aese(v0, v20); __ aesmc(v0, v0); 2805 __ BIND(L_rounds_44); 2806 __ aese(v0, v21); __ aesmc(v0, v0); 2807 __ aese(v0, v22); __ aesmc(v0, v0); 2808 __ aese(v0, v23); __ aesmc(v0, v0); 2809 __ aese(v0, v24); __ aesmc(v0, v0); 2810 __ aese(v0, v25); __ aesmc(v0, v0); 2811 __ aese(v0, v26); __ aesmc(v0, v0); 2812 __ aese(v0, v27); __ aesmc(v0, v0); 2813 __ aese(v0, v28); __ aesmc(v0, v0); 2814 __ aese(v0, v29); __ aesmc(v0, v0); 2815 __ aese(v0, v30); 2816 __ eor(v0, __ T16B, v0, v31); 2817 2818 __ st1(v0, __ T16B, __ post(to, 16)); 2819 2820 __ subw(len_reg, len_reg, 16); 2821 __ cbnzw(len_reg, L_aes_loop); 2822 2823 __ st1(v0, __ T16B, rvec); 2824 2825 __ mov(r0, rscratch2); 2826 2827 __ leave(); 2828 __ ret(lr); 2829 2830 return start; 2831 } 2832 2833 // Arguments: 2834 // 2835 // Inputs: 2836 // c_rarg0 - source byte array address 2837 // c_rarg1 - destination byte array address 2838 // c_rarg2 - K (key) in little endian int array 2839 // c_rarg3 - r vector byte array address 2840 // c_rarg4 - input length 2841 // 2842 // Output: 2843 // r0 - input length 2844 // 2845 address generate_cipherBlockChaining_decryptAESCrypt() { 2846 assert(UseAES, "need AES instructions and misaligned SSE support"); 2847 __ align(CodeEntryAlignment); 2848 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 2849 2850 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2851 2852 const Register from = c_rarg0; // source array address 2853 const Register to = c_rarg1; // destination array address 2854 const Register key = c_rarg2; // key array address 2855 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2856 // and left with the results of the last encryption block 2857 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2858 const Register keylen = rscratch1; 2859 2860 address start = __ pc(); 2861 2862 __ enter(); 2863 2864 __ movw(rscratch2, len_reg); 2865 2866 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2867 2868 __ ld1(v2, __ T16B, rvec); 2869 2870 __ ld1(v31, __ T16B, __ post(key, 16)); 2871 __ rev32(v31, __ T16B, v31); 2872 2873 __ cmpw(keylen, 52); 2874 __ br(Assembler::CC, L_loadkeys_44); 2875 __ br(Assembler::EQ, L_loadkeys_52); 2876 2877 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2878 __ rev32(v17, __ T16B, v17); 2879 __ rev32(v18, __ T16B, v18); 2880 __ BIND(L_loadkeys_52); 2881 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2882 __ rev32(v19, __ T16B, v19); 2883 __ rev32(v20, __ T16B, v20); 2884 __ BIND(L_loadkeys_44); 2885 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2886 __ rev32(v21, __ T16B, v21); 2887 
__ rev32(v22, __ T16B, v22); 2888 __ rev32(v23, __ T16B, v23); 2889 __ rev32(v24, __ T16B, v24); 2890 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2891 __ rev32(v25, __ T16B, v25); 2892 __ rev32(v26, __ T16B, v26); 2893 __ rev32(v27, __ T16B, v27); 2894 __ rev32(v28, __ T16B, v28); 2895 __ ld1(v29, v30, __ T16B, key); 2896 __ rev32(v29, __ T16B, v29); 2897 __ rev32(v30, __ T16B, v30); 2898 2899 __ BIND(L_aes_loop); 2900 __ ld1(v0, __ T16B, __ post(from, 16)); 2901 __ orr(v1, __ T16B, v0, v0); 2902 2903 __ br(Assembler::CC, L_rounds_44); 2904 __ br(Assembler::EQ, L_rounds_52); 2905 2906 __ aesd(v0, v17); __ aesimc(v0, v0); 2907 __ aesd(v0, v18); __ aesimc(v0, v0); 2908 __ BIND(L_rounds_52); 2909 __ aesd(v0, v19); __ aesimc(v0, v0); 2910 __ aesd(v0, v20); __ aesimc(v0, v0); 2911 __ BIND(L_rounds_44); 2912 __ aesd(v0, v21); __ aesimc(v0, v0); 2913 __ aesd(v0, v22); __ aesimc(v0, v0); 2914 __ aesd(v0, v23); __ aesimc(v0, v0); 2915 __ aesd(v0, v24); __ aesimc(v0, v0); 2916 __ aesd(v0, v25); __ aesimc(v0, v0); 2917 __ aesd(v0, v26); __ aesimc(v0, v0); 2918 __ aesd(v0, v27); __ aesimc(v0, v0); 2919 __ aesd(v0, v28); __ aesimc(v0, v0); 2920 __ aesd(v0, v29); __ aesimc(v0, v0); 2921 __ aesd(v0, v30); 2922 __ eor(v0, __ T16B, v0, v31); 2923 __ eor(v0, __ T16B, v0, v2); 2924 2925 __ st1(v0, __ T16B, __ post(to, 16)); 2926 __ orr(v2, __ T16B, v1, v1); 2927 2928 __ subw(len_reg, len_reg, 16); 2929 __ cbnzw(len_reg, L_aes_loop); 2930 2931 __ st1(v2, __ T16B, rvec); 2932 2933 __ mov(r0, rscratch2); 2934 2935 __ leave(); 2936 __ ret(lr); 2937 2938 return start; 2939 } 2940 2941 // Arguments: 2942 // 2943 // Inputs: 2944 // c_rarg0 - byte[] source+offset 2945 // c_rarg1 - int[] SHA.state 2946 // c_rarg2 - int offset 2947 // c_rarg3 - int limit 2948 // 2949 address generate_sha1_implCompress(bool multi_block, const char *name) { 2950 __ align(CodeEntryAlignment); 2951 StubCodeMark mark(this, "StubRoutines", name); 2952 address start = __ pc(); 2953 2954 Register buf = c_rarg0; 2955 Register state = c_rarg1; 2956 Register ofs = c_rarg2; 2957 Register limit = c_rarg3; 2958 2959 Label keys; 2960 Label sha1_loop; 2961 2962 // load the keys into v0..v3 2963 __ adr(rscratch1, keys); 2964 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 2965 // load 5 words state into v6, v7 2966 __ ldrq(v6, Address(state, 0)); 2967 __ ldrs(v7, Address(state, 16)); 2968 2969 2970 __ BIND(sha1_loop); 2971 // load 64 bytes of data into v16..v19 2972 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 2973 __ rev32(v16, __ T16B, v16); 2974 __ rev32(v17, __ T16B, v17); 2975 __ rev32(v18, __ T16B, v18); 2976 __ rev32(v19, __ T16B, v19); 2977 2978 // do the sha1 2979 __ addv(v4, __ T4S, v16, v0); 2980 __ orr(v20, __ T16B, v6, v6); 2981 2982 FloatRegister d0 = v16; 2983 FloatRegister d1 = v17; 2984 FloatRegister d2 = v18; 2985 FloatRegister d3 = v19; 2986 2987 for (int round = 0; round < 20; round++) { 2988 FloatRegister tmp1 = (round & 1) ? v4 : v5; 2989 FloatRegister tmp2 = (round & 1) ? v21 : v22; 2990 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 2991 FloatRegister tmp4 = (round & 1) ? v5 : v4; 2992 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? 
v2 : v3)); 2993 2994 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 2995 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 2996 __ sha1h(tmp2, __ T4S, v20); 2997 if (round < 5) 2998 __ sha1c(v20, __ T4S, tmp3, tmp4); 2999 else if (round < 10 || round >= 15) 3000 __ sha1p(v20, __ T4S, tmp3, tmp4); 3001 else 3002 __ sha1m(v20, __ T4S, tmp3, tmp4); 3003 if (round < 16) __ sha1su1(d0, __ T4S, d3); 3004 3005 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3006 } 3007 3008 __ addv(v7, __ T2S, v7, v21); 3009 __ addv(v6, __ T4S, v6, v20); 3010 3011 if (multi_block) { 3012 __ add(ofs, ofs, 64); 3013 __ cmp(ofs, limit); 3014 __ br(Assembler::LE, sha1_loop); 3015 __ mov(c_rarg0, ofs); // return ofs 3016 } 3017 3018 __ strq(v6, Address(state, 0)); 3019 __ strs(v7, Address(state, 16)); 3020 3021 __ ret(lr); 3022 3023 __ bind(keys); 3024 __ emit_int32(0x5a827999); 3025 __ emit_int32(0x6ed9eba1); 3026 __ emit_int32(0x8f1bbcdc); 3027 __ emit_int32(0xca62c1d6); 3028 3029 return start; 3030 } 3031 3032 3033 // Arguments: 3034 // 3035 // Inputs: 3036 // c_rarg0 - byte[] source+offset 3037 // c_rarg1 - int[] SHA.state 3038 // c_rarg2 - int offset 3039 // c_rarg3 - int limit 3040 // 3041 address generate_sha256_implCompress(bool multi_block, const char *name) { 3042 static const uint32_t round_consts[64] = { 3043 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3044 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3045 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3046 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3047 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3048 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3049 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3050 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3051 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3052 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3053 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3054 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3055 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3056 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3057 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3058 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3059 }; 3060 __ align(CodeEntryAlignment); 3061 StubCodeMark mark(this, "StubRoutines", name); 3062 address start = __ pc(); 3063 3064 Register buf = c_rarg0; 3065 Register state = c_rarg1; 3066 Register ofs = c_rarg2; 3067 Register limit = c_rarg3; 3068 3069 Label sha1_loop; 3070 3071 __ stpd(v8, v9, __ pre(sp, -32)); 3072 __ stpd(v10, v11, Address(sp, 16)); 3073 3074 // dga == v0 3075 // dgb == v1 3076 // dg0 == v2 3077 // dg1 == v3 3078 // dg2 == v4 3079 // t0 == v6 3080 // t1 == v7 3081 3082 // load 16 keys to v16..v31 3083 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3084 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3085 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3086 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3087 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3088 3089 // load 8 words (256 bits) state 3090 __ ldpq(v0, v1, state); 3091 3092 __ BIND(sha1_loop); 3093 // load 64 bytes of data into v8..v11 3094 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? 
__ post(buf, 64) : buf);
3095 __ rev32(v8, __ T16B, v8);
3096 __ rev32(v9, __ T16B, v9);
3097 __ rev32(v10, __ T16B, v10);
3098 __ rev32(v11, __ T16B, v11);
3099
3100 __ addv(v6, __ T4S, v8, v16);
3101 __ orr(v2, __ T16B, v0, v0);
3102 __ orr(v3, __ T16B, v1, v1);
3103
3104 FloatRegister d0 = v8;
3105 FloatRegister d1 = v9;
3106 FloatRegister d2 = v10;
3107 FloatRegister d3 = v11;
3108
3109
3110 for (int round = 0; round < 16; round++) {
3111 FloatRegister tmp1 = (round & 1) ? v6 : v7;
3112 FloatRegister tmp2 = (round & 1) ? v7 : v6;
3113 FloatRegister tmp3 = (round & 1) ? v2 : v4;
3114 FloatRegister tmp4 = (round & 1) ? v4 : v2;
3115
3116 if (round < 12) __ sha256su0(d0, __ T4S, d1);
3117 __ orr(v4, __ T16B, v2, v2);
3118 if (round < 15)
3119 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3120 __ sha256h(v2, __ T4S, v3, tmp2);
3121 __ sha256h2(v3, __ T4S, v4, tmp2);
3122 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3123
3124 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3125 }
3126
3127 __ addv(v0, __ T4S, v0, v2);
3128 __ addv(v1, __ T4S, v1, v3);
3129
3130 if (multi_block) {
3131 __ add(ofs, ofs, 64);
3132 __ cmp(ofs, limit);
3133 __ br(Assembler::LE, sha1_loop);
3134 __ mov(c_rarg0, ofs); // return ofs
3135 }
3136
3137 __ ldpd(v10, v11, Address(sp, 16));
3138 __ ldpd(v8, v9, __ post(sp, 32));
3139
3140 __ stpq(v0, v1, state);
3141
3142 __ ret(lr);
3143
3144 return start;
3145 }
3146
3147 #ifndef BUILTIN_SIM
3148 // Safefetch stubs.
3149 void generate_safefetch(const char* name, int size, address* entry,
3150 address* fault_pc, address* continuation_pc) {
3151 // safefetch signatures:
3152 // int SafeFetch32(int* adr, int errValue);
3153 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3154 //
3155 // arguments:
3156 // c_rarg0 = adr
3157 // c_rarg1 = errValue
3158 //
3159 // result:
3160 // r0 = *adr or errValue
3161
3162 StubCodeMark mark(this, "StubRoutines", name);
3163
3164 // Entry point.
3165 *entry = __ pc();
3166
3167 // Load *adr into c_rarg1, may fault.
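// If the load below faults, the signal handler resumes execution at
// *continuation_pc, where c_rarg1 still holds the caller's errValue;
// the continuation then copies it to r0 as the result.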
3168 *fault_pc = __ pc();
3169 switch (size) {
3170 case 4:
3171 // int32_t
3172 __ ldrw(c_rarg1, Address(c_rarg0, 0));
3173 break;
3174 case 8:
3175 // int64_t
3176 __ ldr(c_rarg1, Address(c_rarg0, 0));
3177 break;
3178 default:
3179 ShouldNotReachHere();
3180 }
3181
3182 // return errValue or *adr
3183 *continuation_pc = __ pc();
3184 __ mov(r0, c_rarg1);
3185 __ ret(lr);
3186 }
3187 #endif
3188
3189 /**
3190 * Arguments:
3191 *
3192 * Inputs:
3193 * c_rarg0 - int crc
3194 * c_rarg1 - byte* buf
3195 * c_rarg2 - int length
3196 *
3197 * Output:
3198 * r0 - int crc result
3199 */
3200 address generate_updateBytesCRC32() {
3201 assert(UseCRC32Intrinsics, "what are we doing here?");
3202
3203 __ align(CodeEntryAlignment);
3204 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3205
3206 address start = __ pc();
3207
3208 const Register crc = c_rarg0; // crc
3209 const Register buf = c_rarg1; // source java byte array address
3210 const Register len = c_rarg2; // length
3211 const Register table0 = c_rarg3; // crc_table address
3212 const Register table1 = c_rarg4;
3213 const Register table2 = c_rarg5;
3214 const Register table3 = c_rarg6;
3215 const Register tmp3 = c_rarg7;
3216
3217 BLOCK_COMMENT("Entry:");
3218 __ enter(); // required for proper stackwalking of RuntimeStub frame
3219
3220 __ kernel_crc32(crc, buf, len,
3221 table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3222
3223 __ leave(); // required for proper stackwalking of RuntimeStub frame
3224 __ ret(lr);
3225
3226 return start;
3227 }
3228
3229 /**
3230 * Arguments:
3231 *
3232 * Inputs:
3233 * c_rarg0 - int crc
3234 * c_rarg1 - byte* buf
3235 * c_rarg2 - int length
3236 * c_rarg3 - int* table
3237 *
3238 * Output:
3239 * r0 - int crc result
3240 */
3241 address generate_updateBytesCRC32C() {
3242 assert(UseCRC32CIntrinsics, "what are we doing here?");
3243
3244 __ align(CodeEntryAlignment);
3245 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3246
3247 address start = __ pc();
3248
3249 const Register crc = c_rarg0; // crc
3250 const Register buf = c_rarg1; // source java byte array address
3251 const Register len = c_rarg2; // length
3252 const Register table0 = c_rarg3; // crc_table address
3253 const Register table1 = c_rarg4;
3254 const Register table2 = c_rarg5;
3255 const Register table3 = c_rarg6;
3256 const Register tmp3 = c_rarg7;
3257
3258 BLOCK_COMMENT("Entry:");
3259 __ enter(); // required for proper stackwalking of RuntimeStub frame
3260
3261 __ kernel_crc32c(crc, buf, len,
3262 table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3263
3264 __ leave(); // required for proper stackwalking of RuntimeStub frame
3265 __ ret(lr);
3266
3267 return start;
3268 }
3269
3270 /**
3271 * Arguments:
3272 *
3273 * Inputs:
3274 * c_rarg0 - int adler
3275 * c_rarg1 - byte* buff
3276 * c_rarg2 - int len
3277 *
3278 * Output:
3279 * c_rarg0 - int adler result
3280 */
3281 address generate_updateBytesAdler32() {
3282 __ align(CodeEntryAlignment);
3283 StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3284 address start = __ pc();
3285
3286 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3287
3288 // Aliases
3289 Register adler = c_rarg0;
3290 Register s1 = c_rarg0;
3291 Register s2 = c_rarg3;
3292 Register buff = c_rarg1;
3293 Register len = c_rarg2;
3294 Register nmax = r4;
3295 Register base = r5;
3296 Register count = r6;
3297 Register temp0 = rscratch1;
3298 Register temp1 = rscratch2;
3299 Register temp2 = r7;
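// Adler-32 maintains two sums modulo BASE == 65521:
//   s1 = 1 + b[0] + b[1] + ... + b[n-1]
//   s2 = sum of the successive values of s1
// and the result is (s2 << 16) | s1.  The reductions below rely on
//   2^16 == 15 (mod 65521)
// so x mod BASE can be computed as 15 * (x >> 16) + (x & 0xffff),
// applied twice, followed by a single conditional subtract of BASE.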
3300 3301 // Max number of bytes we can process before having to take the mod 3302 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 3303 unsigned long BASE = 0xfff1; 3304 unsigned long NMAX = 0x15B0; 3305 3306 __ mov(base, BASE); 3307 __ mov(nmax, NMAX); 3308 3309 // s1 is initialized to the lower 16 bits of adler 3310 // s2 is initialized to the upper 16 bits of adler 3311 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 3312 __ uxth(s1, adler); // s1 = (adler & 0xffff) 3313 3314 // The pipelined loop needs at least 16 elements for 1 iteration 3315 // It does check this, but it is more effective to skip to the cleanup loop 3316 __ cmp(len, 16); 3317 __ br(Assembler::HS, L_nmax); 3318 __ cbz(len, L_combine); 3319 3320 __ bind(L_simple_by1_loop); 3321 __ ldrb(temp0, Address(__ post(buff, 1))); 3322 __ add(s1, s1, temp0); 3323 __ add(s2, s2, s1); 3324 __ subs(len, len, 1); 3325 __ br(Assembler::HI, L_simple_by1_loop); 3326 3327 // s1 = s1 % BASE 3328 __ subs(temp0, s1, base); 3329 __ csel(s1, temp0, s1, Assembler::HS); 3330 3331 // s2 = s2 % BASE 3332 __ lsr(temp0, s2, 16); 3333 __ lsl(temp1, temp0, 4); 3334 __ sub(temp1, temp1, temp0); 3335 __ add(s2, temp1, s2, ext::uxth); 3336 3337 __ subs(temp0, s2, base); 3338 __ csel(s2, temp0, s2, Assembler::HS); 3339 3340 __ b(L_combine); 3341 3342 __ bind(L_nmax); 3343 __ subs(len, len, nmax); 3344 __ sub(count, nmax, 16); 3345 __ br(Assembler::LO, L_by16); 3346 3347 __ bind(L_nmax_loop); 3348 3349 __ ldp(temp0, temp1, Address(__ post(buff, 16))); 3350 3351 __ add(s1, s1, temp0, ext::uxtb); 3352 __ ubfx(temp2, temp0, 8, 8); 3353 __ add(s2, s2, s1); 3354 __ add(s1, s1, temp2); 3355 __ ubfx(temp2, temp0, 16, 8); 3356 __ add(s2, s2, s1); 3357 __ add(s1, s1, temp2); 3358 __ ubfx(temp2, temp0, 24, 8); 3359 __ add(s2, s2, s1); 3360 __ add(s1, s1, temp2); 3361 __ ubfx(temp2, temp0, 32, 8); 3362 __ add(s2, s2, s1); 3363 __ add(s1, s1, temp2); 3364 __ ubfx(temp2, temp0, 40, 8); 3365 __ add(s2, s2, s1); 3366 __ add(s1, s1, temp2); 3367 __ ubfx(temp2, temp0, 48, 8); 3368 __ add(s2, s2, s1); 3369 __ add(s1, s1, temp2); 3370 __ add(s2, s2, s1); 3371 __ add(s1, s1, temp0, Assembler::LSR, 56); 3372 __ add(s2, s2, s1); 3373 3374 __ add(s1, s1, temp1, ext::uxtb); 3375 __ ubfx(temp2, temp1, 8, 8); 3376 __ add(s2, s2, s1); 3377 __ add(s1, s1, temp2); 3378 __ ubfx(temp2, temp1, 16, 8); 3379 __ add(s2, s2, s1); 3380 __ add(s1, s1, temp2); 3381 __ ubfx(temp2, temp1, 24, 8); 3382 __ add(s2, s2, s1); 3383 __ add(s1, s1, temp2); 3384 __ ubfx(temp2, temp1, 32, 8); 3385 __ add(s2, s2, s1); 3386 __ add(s1, s1, temp2); 3387 __ ubfx(temp2, temp1, 40, 8); 3388 __ add(s2, s2, s1); 3389 __ add(s1, s1, temp2); 3390 __ ubfx(temp2, temp1, 48, 8); 3391 __ add(s2, s2, s1); 3392 __ add(s1, s1, temp2); 3393 __ add(s2, s2, s1); 3394 __ add(s1, s1, temp1, Assembler::LSR, 56); 3395 __ add(s2, s2, s1); 3396 3397 __ subs(count, count, 16); 3398 __ br(Assembler::HS, L_nmax_loop); 3399 3400 // s1 = s1 % BASE 3401 __ lsr(temp0, s1, 16); 3402 __ lsl(temp1, temp0, 4); 3403 __ sub(temp1, temp1, temp0); 3404 __ add(temp1, temp1, s1, ext::uxth); 3405 3406 __ lsr(temp0, temp1, 16); 3407 __ lsl(s1, temp0, 4); 3408 __ sub(s1, s1, temp0); 3409 __ add(s1, s1, temp1, ext:: uxth); 3410 3411 __ subs(temp0, s1, base); 3412 __ csel(s1, temp0, s1, Assembler::HS); 3413 3414 // s2 = s2 % BASE 3415 __ lsr(temp0, s2, 16); 3416 __ lsl(temp1, temp0, 4); 3417 __ sub(temp1, temp1, temp0); 3418 __ add(temp1, temp1, s2, ext::uxth); 3419 3420 __ lsr(temp0, temp1, 16); 
3421 __ lsl(s2, temp0, 4);
3422 __ sub(s2, s2, temp0);
3423 __ add(s2, s2, temp1, ext::uxth);
3424
3425 __ subs(temp0, s2, base);
3426 __ csel(s2, temp0, s2, Assembler::HS);
3427
3428 __ subs(len, len, nmax);
3429 __ sub(count, nmax, 16);
3430 __ br(Assembler::HS, L_nmax_loop);
3431
3432 __ bind(L_by16);
3433 __ adds(len, len, count);
3434 __ br(Assembler::LO, L_by1);
3435
3436 __ bind(L_by16_loop);
3437
3438 __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3439
3440 __ add(s1, s1, temp0, ext::uxtb);
3441 __ ubfx(temp2, temp0, 8, 8);
3442 __ add(s2, s2, s1);
3443 __ add(s1, s1, temp2);
3444 __ ubfx(temp2, temp0, 16, 8);
3445 __ add(s2, s2, s1);
3446 __ add(s1, s1, temp2);
3447 __ ubfx(temp2, temp0, 24, 8);
3448 __ add(s2, s2, s1);
3449 __ add(s1, s1, temp2);
3450 __ ubfx(temp2, temp0, 32, 8);
3451 __ add(s2, s2, s1);
3452 __ add(s1, s1, temp2);
3453 __ ubfx(temp2, temp0, 40, 8);
3454 __ add(s2, s2, s1);
3455 __ add(s1, s1, temp2);
3456 __ ubfx(temp2, temp0, 48, 8);
3457 __ add(s2, s2, s1);
3458 __ add(s1, s1, temp2);
3459 __ add(s2, s2, s1);
3460 __ add(s1, s1, temp0, Assembler::LSR, 56);
3461 __ add(s2, s2, s1);
3462
3463 __ add(s1, s1, temp1, ext::uxtb);
3464 __ ubfx(temp2, temp1, 8, 8);
3465 __ add(s2, s2, s1);
3466 __ add(s1, s1, temp2);
3467 __ ubfx(temp2, temp1, 16, 8);
3468 __ add(s2, s2, s1);
3469 __ add(s1, s1, temp2);
3470 __ ubfx(temp2, temp1, 24, 8);
3471 __ add(s2, s2, s1);
3472 __ add(s1, s1, temp2);
3473 __ ubfx(temp2, temp1, 32, 8);
3474 __ add(s2, s2, s1);
3475 __ add(s1, s1, temp2);
3476 __ ubfx(temp2, temp1, 40, 8);
3477 __ add(s2, s2, s1);
3478 __ add(s1, s1, temp2);
3479 __ ubfx(temp2, temp1, 48, 8);
3480 __ add(s2, s2, s1);
3481 __ add(s1, s1, temp2);
3482 __ add(s2, s2, s1);
3483 __ add(s1, s1, temp1, Assembler::LSR, 56);
3484 __ add(s2, s2, s1);
3485
3486 __ subs(len, len, 16);
3487 __ br(Assembler::HS, L_by16_loop);
3488
3489 __ bind(L_by1);
3490 __ adds(len, len, 15);
3491 __ br(Assembler::LO, L_do_mod);
3492
3493 __ bind(L_by1_loop);
3494 __ ldrb(temp0, Address(__ post(buff, 1)));
3495 __ add(s1, temp0, s1);
3496 __ add(s2, s2, s1);
3497 __ subs(len, len, 1);
3498 __ br(Assembler::HS, L_by1_loop);
3499
3500 __ bind(L_do_mod);
3501 // s1 = s1 % BASE
3502 __ lsr(temp0, s1, 16);
3503 __ lsl(temp1, temp0, 4);
3504 __ sub(temp1, temp1, temp0);
3505 __ add(temp1, temp1, s1, ext::uxth);
3506
3507 __ lsr(temp0, temp1, 16);
3508 __ lsl(s1, temp0, 4);
3509 __ sub(s1, s1, temp0);
3510 __ add(s1, s1, temp1, ext::uxth);
3511
3512 __ subs(temp0, s1, base);
3513 __ csel(s1, temp0, s1, Assembler::HS);
3514
3515 // s2 = s2 % BASE
3516 __ lsr(temp0, s2, 16);
3517 __ lsl(temp1, temp0, 4);
3518 __ sub(temp1, temp1, temp0);
3519 __ add(temp1, temp1, s2, ext::uxth);
3520
3521 __ lsr(temp0, temp1, 16);
3522 __ lsl(s2, temp0, 4);
3523 __ sub(s2, s2, temp0);
3524 __ add(s2, s2, temp1, ext::uxth);
3525
3526 __ subs(temp0, s2, base);
3527 __ csel(s2, temp0, s2, Assembler::HS);
3528
3529 // Combine lower bits and higher bits
3530 __ bind(L_combine);
3531 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3532
3533 __ ret(lr);
3534
3535 return start;
3536 }
3537
3538 /**
3539 * Arguments:
3540 *
3541 * Input:
3542 * c_rarg0 - x address
3543 * c_rarg1 - x length
3544 * c_rarg2 - y address
3545 * c_rarg3 - y length
3546 * c_rarg4 - z address
3547 * c_rarg5 - z length
3548 */
3549 address generate_multiplyToLen() {
3550 __ align(CodeEntryAlignment);
3551 StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3552
3553 address start = __ pc();
3554 const Register x = r0;
  /**
   *  Arguments:
   *
   *  Input:
   *    c_rarg0   - x address
   *    c_rarg1   - x length
   *    c_rarg2   - y address
   *    c_rarg3   - y length
   *    c_rarg4   - z address
   *    c_rarg5   - z length
   */
  address generate_multiplyToLen() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "multiplyToLen");

    address start = __ pc();
    const Register x     = r0;
    const Register xlen  = r1;
    const Register y     = r2;
    const Register ylen  = r3;
    const Register z     = r4;
    const Register zlen  = r5;

    const Register tmp1  = r10;
    const Register tmp2  = r11;
    const Register tmp3  = r12;
    const Register tmp4  = r13;
    const Register tmp5  = r14;
    const Register tmp6  = r15;
    const Register tmp7  = r16;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ multiply_to_len(x, xlen, y, ylen, z, zlen,
                       tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }
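  // multiply_to_len() computes the full product of two big-endian int
  // arrays, as in BigInteger's multiplyToLen.  In C, approximately (an
  // illustrative schoolbook sketch, assuming zlen == xlen + ylen; the
  // stub's actual code is the hand-scheduled version in the macro
  // assembler):
  //
  //   void multiply_to_len(unsigned int x[], int xlen,
  //                        unsigned int y[], int ylen,
  //                        unsigned int z[], int zlen) {
  //     for (int i = 0; i < zlen; i++)
  //       z[i] = 0;
  //     for (int j = ylen - 1; j >= 0; j--) {
  //       unsigned long carry = 0;
  //       for (int i = xlen - 1, k = i + j + 1; i >= 0; i--, k--) {
  //         unsigned long p = (unsigned long)x[i] * y[j] + z[k] + carry;
  //         z[k] = (unsigned int)p;
  //         carry = p >> 32;
  //       }
  //       z[j] = (unsigned int)carry;
  //     }
  //   }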
  void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
                      FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
                      FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3,
                      FloatRegister tmp4) {
    // Karatsuba multiplication performs a 128*128 -> 256-bit
    // multiplication in three 128-bit multiplications and a few
    // additions.
    //
    // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
    // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
    //
    // Inputs:
    //
    // A0 in a.d[0]     (subkey)
    // A1 in a.d[1]
    // (A1+A0) in a1_xor_a0.d[0]
    //
    // B0 in b.d[0]     (state)
    // B1 in b.d[1]

    __ ext(tmp1, __ T16B, b, b, 0x08);
    __ pmull2(result_hi, __ T1Q, b, a, __ T2D);      // A1*B1
    __ eor(tmp1, __ T16B, tmp1, b);                  // (B1+B0)
    __ pmull(result_lo, __ T1Q, b, a, __ T1D);       // A0*B0
    __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)

    __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
    __ eor(tmp3, __ T16B, result_hi, result_lo);     // A1*B1+A0*B0
    __ eor(tmp2, __ T16B, tmp2, tmp4);
    __ eor(tmp2, __ T16B, tmp2, tmp3);

    // Register pair <result_hi:result_lo> holds the result of
    // the carry-less multiplication
    __ ins(result_hi, __ D, tmp2, 0, 1);
    __ ins(result_lo, __ D, tmp2, 1, 0);
  }

  void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
                    FloatRegister p, FloatRegister z, FloatRegister t1) {
    const FloatRegister t0 = result;

    // The GCM field polynomial f is z^128 + p(z), where p =
    // z^7+z^2+z+1.
    //
    //    z^128 === -p(z)  (mod (z^128 + p(z)))
    //
    // so, given that the product we're reducing is
    //    a == lo + hi * z^128
    // substituting,
    //      === lo - hi * p(z)  (mod (z^128 + p(z)))
    //
    // we reduce by multiplying hi by p(z) and subtracting the result
    // from (i.e. XORing it with) lo.  Because p has no nonzero high
    // bits we can do this with two 64-bit multiplications, lo*p and
    // hi*p.

    __ pmull2(t0, __ T1Q, hi, p, __ T2D);
    __ ext(t1, __ T16B, t0, z, 8);
    __ eor(hi, __ T16B, hi, t1);
    __ ext(t1, __ T16B, z, t0, 8);
    __ eor(lo, __ T16B, lo, t1);
    __ pmull(t0, __ T1Q, hi, p, __ T1D);
    __ eor(result, __ T16B, lo, t0);
  }

  /**
   *  Arguments:
   *
   *  Input:
   *  c_rarg0   - current state address
   *  c_rarg1   - H key address
   *  c_rarg2   - data address
   *  c_rarg3   - number of blocks
   *
   *  Output:
   *  Updated state at c_rarg0
   */
  address generate_ghash_processBlocks() {
    // Bafflingly, GCM uses little-endian for the byte order, but
    // big-endian for the bit order.  For example, the polynomial 1 is
    // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
    //
    // So, we must either reverse the bytes in each word and do
    // everything big-endian or reverse the bits in each byte and do
    // it little-endian.  On AArch64 it's more idiomatic to reverse
    // the bits in each byte (we have an instruction, RBIT, to do
    // that) and keep the data in little-endian bit order throughout
    // the calculation, bit-reversing the inputs and outputs.

    StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
    __ align(wordSize * 2);
    address p = __ pc();
    __ emit_int64(0x87);  // The low-order bits of the field
                          // polynomial (i.e. p = z^7+z^2+z+1)
                          // repeated in the low and high parts of a
                          // 128-bit vector
    __ emit_int64(0x87);

    __ align(CodeEntryAlignment);
    address start = __ pc();

    Register state   = c_rarg0;
    Register subkeyH = c_rarg1;
    Register data    = c_rarg2;
    Register blocks  = c_rarg3;

    FloatRegister vzr = v30;
    __ eor(vzr, __ T16B, vzr, vzr); // zero register

    __ ldrq(v0, Address(state));
    __ ldrq(v1, Address(subkeyH));

    __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH
    __ rbit(v0, __ T16B, v0);
    __ rev64(v1, __ T16B, v1);
    __ rbit(v1, __ T16B, v1);

    __ ldrq(v26, p);

    __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v16
    __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))

    {
      Label L_ghash_loop;
      __ bind(L_ghash_loop);

      __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
                                                 // reversing each byte
      __ rbit(v2, __ T16B, v2);
      __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state

      // Multiply state in v2 by subkey in v1
      ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
                     /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
                     /*temps*/v6, v20, v18, v21);
      // Reduce v7:v5 by the field polynomial
      ghash_reduce(v0, v5, v7, v26, vzr, v20);

      __ sub(blocks, blocks, 1);
      __ cbnz(blocks, L_ghash_loop);
    }

    // The bit-reversed result is at this point in v0
    __ rev64(v1, __ T16B, v0);
    __ rbit(v1, __ T16B, v1);

    __ st1(v1, __ T16B, state);
    __ ret(lr);

    return start;
  }
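  // For reference, one iteration of the GHASH loop above is, in C,
  // approximately the following (a sketch only: clmul is a
  // hypothetical 64x64 -> 128-bit carry-less multiply standing in for
  // PMULL, A1:A0 is the subkey, B1:B0 the combined state, and the
  // RBIT bit-reversal convention is glossed over):
  //
  //   unsigned __int128 b = state ^ block;                  // B1:B0
  //   // Karatsuba: three carry-less multiplies instead of four
  //   unsigned __int128 hi  = clmul(A1, B1);                // A1*B1
  //   unsigned __int128 lo  = clmul(A0, B0);                // A0*B0
  //   unsigned __int128 mid = clmul(A1 ^ A0, B1 ^ B0) ^ hi ^ lo;
  //   hi ^= mid >> 64;
  //   lo ^= mid << 64;
  //   // then reduce hi:lo mod z^128 + z^7 + z^2 + z + 1 as in
  //   // ghash_reduce(), using 0x87 for the low bits of p(z)
  //   state = reduce(hi, lo);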
#undef __
#define __ masm->

  address generate_throw_exception(const char* name,
                                   address runtime_entry,
                                   Register arg1 = noreg,
                                   Register arg2 = noreg) {
    // Information about frame layout at time of blocking runtime call.
    // Note that we only have to preserve callee-saved registers since
    // the compilers are responsible for supplying a continuation point
    // if they expect all registers to be preserved.
    // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
    enum layout {
      rfp_off = 0,
      rfp_off2,
      return_off,
      return_off2,
      framesize // inclusive of return address
    };

    int insts_size = 512;
    int locs_size  = 64;

    CodeBuffer code(name, insts_size, locs_size);
    OopMapSet* oop_maps  = new OopMapSet();
    MacroAssembler* masm = new MacroAssembler(&code);

    address start = __ pc();

    // This is an inlined and slightly modified version of call_VM
    // which has the ability to fetch the return PC out of
    // thread-local storage and also sets up last_Java_sp slightly
    // differently than the real call_VM

    __ enter(); // Save FP and LR before call

    assert(is_even(framesize/2), "sp not 16-byte aligned");

    // lr and fp are already in place
    __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog

    int frame_complete = __ pc() - start;

    // Set up last_Java_sp and last_Java_fp
    address the_pc = __ pc();
    __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1);

    // Call runtime
    if (arg1 != noreg) {
      assert(arg2 != c_rarg1, "clobbered");
      __ mov(c_rarg1, arg1);
    }
    if (arg2 != noreg) {
      __ mov(c_rarg2, arg2);
    }
    __ mov(c_rarg0, rthread);
    BLOCK_COMMENT("call runtime_entry");
    __ mov(rscratch1, runtime_entry);
    __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);

    // Generate oop map
    OopMap* map = new OopMap(framesize, 0);

    oop_maps->add_gc_map(the_pc - start, map);

    __ reset_last_Java_frame(true);
    __ maybe_isb();

    __ leave();

    // check for pending exceptions
#ifdef ASSERT
    Label L;
    __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
    __ cbnz(rscratch1, L);
    __ should_not_reach_here();
    __ bind(L);
#endif // ASSERT
    __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));

    // codeBlob framesize is in words (not VMRegImpl::slot_size)
    RuntimeStub* stub =
      RuntimeStub::new_runtime_stub(name,
                                    &code,
                                    frame_complete,
                                    (framesize >> (LogBytesPerWord - LogBytesPerInt)),
                                    oop_maps, false);
    return stub->entry_point();
  }
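  // In pseudo-C, the stub generated here approximately does the
  // following (a sketch of the control flow only, not literal code):
  //
  //   void throw_stub() {
  //     enter();                            // build a walkable frame
  //     set_last_Java_frame(sp, fp, NULL);
  //     runtime_entry(thread, arg1, arg2);  // fabricates the exception oop
  //     reset_last_Java_frame();
  //     leave();
  //     goto *StubRoutines::forward_exception_entry();
  //   }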
  class MontgomeryMultiplyGenerator : public MacroAssembler {

    Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
      Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;

    RegSet _toSave;
    bool _squaring;

  public:
    MontgomeryMultiplyGenerator(Assembler *as, bool squaring)
      : MacroAssembler(as->code()), _squaring(squaring) {

      // Register allocation

      Register reg = c_rarg0;
      Pa_base = reg;     // Argument registers
      if (squaring)
        Pb_base = Pa_base;
      else
        Pb_base = ++reg;
      Pn_base = ++reg;
      Rlen = ++reg;
      inv = ++reg;
      Pm_base = ++reg;

      // Working registers:
      Ra = ++reg;        // The current digit of a, b, n, and m.
      Rb = ++reg;
      Rm = ++reg;
      Rn = ++reg;

      Pa = ++reg;        // Pointers to the current/next digit of a, b, n, and m.
      Pb = ++reg;
      Pm = ++reg;
      Pn = ++reg;

      t0 = ++reg;        // Three registers which form a
      t1 = ++reg;        // triple-precision accumulator.
      t2 = ++reg;

      Ri = ++reg;        // Inner and outer loop indexes.
      Rj = ++reg;

      Rhi_ab = ++reg;    // Product registers: low and high parts
      Rlo_ab = ++reg;    // of a*b and m*n.
      Rhi_mn = ++reg;
      Rlo_mn = ++reg;

      // r19 and up are callee-saved.
      _toSave = RegSet::range(r19, reg) + Pm_base;
    }

  private:
    void save_regs() {
      push(_toSave, sp);
    }

    void restore_regs() {
      pop(_toSave, sp);
    }

    template <typename T>
    void unroll_2(Register count, T block) {
      Label loop, end, odd;
      tbnz(count, 0, odd);
      cbz(count, end);
      align(16);
      bind(loop);
      (this->*block)();
      bind(odd);
      (this->*block)();
      subs(count, count, 2);
      br(Assembler::GT, loop);
      bind(end);
    }

    template <typename T>
    void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
      Label loop, end, odd;
      tbnz(count, 0, odd);
      cbz(count, end);
      align(16);
      bind(loop);
      (this->*block)(d, s, tmp);
      bind(odd);
      (this->*block)(d, s, tmp);
      subs(count, count, 2);
      br(Assembler::GT, loop);
      bind(end);
    }
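    // The control flow unroll_2 emits runs the block twice per loop
    // iteration, with an odd leftover count entering at the second
    // copy.  In C, approximately (an illustrative sketch):
    //
    //   if (count & 1) goto odd;
    //   if (count == 0) goto end;
    //  loop:
    //   block();
    //  odd:
    //   block();
    //   count -= 2;
    //   if ((long)count > 0) goto loop;
    //  end:
    //   ;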
    void pre1(RegisterOrConstant i) {
      block_comment("pre1");
      // Pa = Pa_base;
      // Pb = Pb_base + i;
      // Pm = Pm_base;
      // Pn = Pn_base + i;
      // Ra = *Pa;
      // Rb = *Pb;
      // Rm = *Pm;
      // Rn = *Pn;
      ldr(Ra, Address(Pa_base));
      ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
      ldr(Rm, Address(Pm_base));
      ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
      lea(Pa, Address(Pa_base));
      lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
      lea(Pm, Address(Pm_base));
      lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));

      // Zero the m*n result.
      mov(Rhi_mn, zr);
      mov(Rlo_mn, zr);
    }

    // The core multiply-accumulate step of a Montgomery
    // multiplication.  The idea is to schedule operations as a
    // pipeline so that instructions with long latencies (loads and
    // multiplies) have time to complete before their results are
    // used.  This benefits in-order implementations of the
    // architecture the most, but out-of-order ones also gain.
    void step() {
      block_comment("step");
      // MACC(Ra, Rb, t0, t1, t2);
      // Ra = *++Pa;
      // Rb = *--Pb;
      umulh(Rhi_ab, Ra, Rb);
      mul(Rlo_ab, Ra, Rb);
      ldr(Ra, pre(Pa, wordSize));
      ldr(Rb, pre(Pb, -wordSize));
      acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
                                       // previous iteration.
      // MACC(Rm, Rn, t0, t1, t2);
      // Rm = *++Pm;
      // Rn = *--Pn;
      umulh(Rhi_mn, Rm, Rn);
      mul(Rlo_mn, Rm, Rn);
      ldr(Rm, pre(Pm, wordSize));
      ldr(Rn, pre(Pn, -wordSize));
      acc(Rhi_ab, Rlo_ab, t0, t1, t2);
    }

    void post1() {
      block_comment("post1");

      // MACC(Ra, Rb, t0, t1, t2);
      // Ra = *++Pa;
      // Rb = *--Pb;
      umulh(Rhi_ab, Ra, Rb);
      mul(Rlo_ab, Ra, Rb);
      acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n
      acc(Rhi_ab, Rlo_ab, t0, t1, t2);

      // *Pm = Rm = t0 * inv;
      mul(Rm, t0, inv);
      str(Rm, Address(Pm));

      // MACC(Rm, Rn, t0, t1, t2);
      // t0 = t1; t1 = t2; t2 = 0;
      umulh(Rhi_mn, Rm, Rn);

#ifndef PRODUCT
      // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
      {
        mul(Rlo_mn, Rm, Rn);
        add(Rlo_mn, t0, Rlo_mn);
        Label ok;
        cbz(Rlo_mn, ok); {
          stop("broken Montgomery multiply");
        } bind(ok);
      }
#endif
      // We have very carefully set things up so that
      // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
      // the lower half of Rm * Rn because we know the result already:
      // it must be -t0.  t0 + (-t0) must generate a carry iff
      // t0 != 0.  So, rather than do a mul and an adds we just set
      // the carry flag iff t0 is nonzero.
      //
      // mul(Rlo_mn, Rm, Rn);
      // adds(zr, t0, Rlo_mn);
      subs(zr, t0, 1); // Set carry iff t0 is nonzero
      adcs(t0, t1, Rhi_mn);
      adc(t1, t2, zr);
      mov(t2, zr);
    }

    void pre2(RegisterOrConstant i, RegisterOrConstant len) {
      block_comment("pre2");
      // Pa = Pa_base + i-len;
      // Pb = Pb_base + len;
      // Pm = Pm_base + i-len;
      // Pn = Pn_base + len;

      if (i.is_register()) {
        sub(Rj, i.as_register(), len);
      } else {
        mov(Rj, i.as_constant());
        sub(Rj, Rj, len);
      }
      // Rj == i-len

      lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
      lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
      lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
      lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));

      // Ra = *++Pa;
      // Rb = *--Pb;
      // Rm = *++Pm;
      // Rn = *--Pn;
      ldr(Ra, pre(Pa, wordSize));
      ldr(Rb, pre(Pb, -wordSize));
      ldr(Rm, pre(Pm, wordSize));
      ldr(Rn, pre(Pn, -wordSize));

      mov(Rhi_mn, zr);
      mov(Rlo_mn, zr);
    }

    void post2(RegisterOrConstant i, RegisterOrConstant len) {
      block_comment("post2");
      if (i.is_constant()) {
        mov(Rj, i.as_constant()-len.as_constant());
      } else {
        sub(Rj, i.as_register(), len);
      }

      adds(t0, t0, Rlo_mn); // The pending m*n, low part

      // As soon as we know the least significant digit of our result,
      // store it.
      // Pm_base[i-len] = t0;
      str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));

      // t0 = t1; t1 = t2; t2 = 0;
      adcs(t0, t1, Rhi_mn); // The pending m*n, high part
      adc(t1, t2, zr);
      mov(t2, zr);
    }

    // A carry in t0 after Montgomery multiplication means that we
    // should subtract multiples of n from our result in m.  We'll
    // keep doing that until there is no carry.
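    // The helper `sub' used in the pseudocode here and in the C
    // sketches near the end of this class is not defined in this
    // file; it is a multi-word subtract-with-borrow.  In C,
    // approximately (an assumed sketch matching the SBCS loop below):
    //
    //   unsigned long sub(unsigned long Pm_base[], unsigned long Pn_base[],
    //                     unsigned long t0, int len) {
    //     unsigned long borrow = 0;
    //     for (int i = 0; i < len; i++) {
    //       unsigned long x = Pm_base[i], y = Pn_base[i] + borrow;
    //       borrow = (y < Pn_base[i]) || (x < y);
    //       Pm_base[i] = x - y;
    //     }
    //     return t0 - borrow; // the carry, less the final borrow
    //   }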
    void normalize(RegisterOrConstant len) {
      block_comment("normalize");
      // while (t0)
      //   t0 = sub(Pm_base, Pn_base, t0, len);
      Label loop, post, again;
      Register cnt = t1, i = t2; // Re-use registers; we're done with them now
      cbz(t0, post); {
        bind(again); {
          mov(i, zr);
          mov(cnt, len);
          ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
          ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
          subs(zr, zr, zr); // set carry flag, i.e. no borrow
          align(16);
          bind(loop); {
            sbcs(Rm, Rm, Rn);
            str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
            add(i, i, 1);
            ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
            ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
            sub(cnt, cnt, 1);
          } cbnz(cnt, loop);
          sbc(t0, t0, zr);
        } cbnz(t0, again);
      } bind(post);
    }

    // Move memory at s to d, reversing words.
    //    Increments d to end of copied memory
    //    Destroys tmp1, tmp2
    //    Preserves len
    //    Leaves s pointing to the address which was in d at start
    void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
      assert(tmp1 < r19 && tmp2 < r19, "register corruption");

      lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
      mov(tmp1, len);
      unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
      sub(s, d, len, ext::uxtw, LogBytesPerWord);
    }
    // where
    void reverse1(Register d, Register s, Register tmp) {
      ldr(tmp, pre(s, -wordSize));
      ror(tmp, tmp, 32);
      str(tmp, post(d, wordSize));
    }
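    // In C, the effect of reverse() on the data is approximately the
    // following (a sketch only; the real code also adjusts d and s as
    // documented above):
    //
    //   for (int i = 0; i < len; i++) {
    //     unsigned long w = s[len - 1 - i];
    //     d[i] = (w << 32) | (w >> 32); // ror 32: swap the int halves
    //   }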
    void step_squaring() {
      // An extra ACC
      step();
      acc(Rhi_ab, Rlo_ab, t0, t1, t2);
    }

    void last_squaring(RegisterOrConstant i) {
      Label dont;
      // if ((i & 1) == 0) {
      tbnz(i.as_register(), 0, dont); {
        // MACC(Ra, Rb, t0, t1, t2);
        // Ra = *++Pa;
        // Rb = *--Pb;
        umulh(Rhi_ab, Ra, Rb);
        mul(Rlo_ab, Ra, Rb);
        acc(Rhi_ab, Rlo_ab, t0, t1, t2);
      } bind(dont);
    }

    void extra_step_squaring() {
      acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n

      // MACC(Rm, Rn, t0, t1, t2);
      // Rm = *++Pm;
      // Rn = *--Pn;
      umulh(Rhi_mn, Rm, Rn);
      mul(Rlo_mn, Rm, Rn);
      ldr(Rm, pre(Pm, wordSize));
      ldr(Rn, pre(Pn, -wordSize));
    }

    void post1_squaring() {
      acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n

      // *Pm = Rm = t0 * inv;
      mul(Rm, t0, inv);
      str(Rm, Address(Pm));

      // MACC(Rm, Rn, t0, t1, t2);
      // t0 = t1; t1 = t2; t2 = 0;
      umulh(Rhi_mn, Rm, Rn);

#ifndef PRODUCT
      // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
      {
        mul(Rlo_mn, Rm, Rn);
        add(Rlo_mn, t0, Rlo_mn);
        Label ok;
        cbz(Rlo_mn, ok); {
          stop("broken Montgomery multiply");
        } bind(ok);
      }
#endif
      // We have very carefully set things up so that
      // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
      // the lower half of Rm * Rn because we know the result already:
      // it must be -t0.  t0 + (-t0) must generate a carry iff
      // t0 != 0.  So, rather than do a mul and an adds we just set
      // the carry flag iff t0 is nonzero.
      //
      // mul(Rlo_mn, Rm, Rn);
      // adds(zr, t0, Rlo_mn);
      subs(zr, t0, 1); // Set carry iff t0 is nonzero
      adcs(t0, t1, Rhi_mn);
      adc(t1, t2, zr);
      mov(t2, zr);
    }

    void acc(Register Rhi, Register Rlo,
             Register t0, Register t1, Register t2) {
      adds(t0, t0, Rlo);
      adcs(t1, t1, Rhi);
      adc(t2, t2, zr);
    }
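    // acc() adds a 128-bit product <Rhi:Rlo> into the t2:t1:t0
    // triple-precision accumulator.  It is the accumulate half of the
    // MACC macro used by the pseudocode below, which is not defined
    // in this file; in C, MACC is approximately (an assumed sketch):
    //
    //   #define MACC(A, B, t0, t1, t2) do {                        \
    //     unsigned __int128 p = (unsigned __int128)(A) * (B) + t0; \
    //     t0 = (unsigned long)p;                                   \
    //     p = (p >> 64) + t1;                                      \
    //     t1 = (unsigned long)p;                                   \
    //     t2 += (unsigned long)(p >> 64);                          \
    //   } while (0)
    //
    // MACC2 is the same except that it accumulates the product twice
    // (2*A*B), which is what the squaring code uses for the symmetric
    // cross terms.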
  public:
    /**
     * Fast Montgomery multiplication.  The derivation of the
     * algorithm is in A Cryptographic Library for the Motorola
     * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
     *
     * Arguments:
     *
     * Inputs for multiplication:
     *   c_rarg0   - int array elements a
     *   c_rarg1   - int array elements b
     *   c_rarg2   - int array elements n (the modulus)
     *   c_rarg3   - int length
     *   c_rarg4   - int inv
     *   c_rarg5   - int array elements m (the result)
     *
     * Inputs for squaring:
     *   c_rarg0   - int array elements a
     *   c_rarg1   - int array elements n (the modulus)
     *   c_rarg2   - int length
     *   c_rarg3   - int inv
     *   c_rarg4   - int array elements m (the result)
     *
     */
    address generate_multiply() {
      Label argh, nothing;
      bind(argh);
      stop("MontgomeryMultiply total_allocation must be <= 8192");

      align(CodeEntryAlignment);
      address entry = pc();

      cbzw(Rlen, nothing);

      enter();

      // Make room.
      cmpw(Rlen, 512);
      br(Assembler::HI, argh);
      sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
      andr(sp, Ra, -2 * wordSize);

      lsrw(Rlen, Rlen, 1); // length in longwords = len/2

      {
        // Copy input args, reversing as we go.  We use Ra as a
        // temporary variable.
        reverse(Ra, Pa_base, Rlen, t0, t1);
        if (!_squaring)
          reverse(Ra, Pb_base, Rlen, t0, t1);
        reverse(Ra, Pn_base, Rlen, t0, t1);
      }

      // Push all call-saved registers and also Pm_base which we'll need
      // at the end.
      save_regs();

#ifndef PRODUCT
      // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
      {
        ldr(Rn, Address(Pn_base, 0));
        mul(Rlo_mn, Rn, inv);
        cmp(Rlo_mn, -1);
        Label ok;
        br(EQ, ok); {
          stop("broken inverse in Montgomery multiply");
        } bind(ok);
      }
#endif

      mov(Pm_base, Ra);

      mov(t0, zr);
      mov(t1, zr);
      mov(t2, zr);

      block_comment("for (int i = 0; i < len; i++) {");
      mov(Ri, zr); {
        Label loop, end;
        cmpw(Ri, Rlen);
        br(Assembler::GE, end);

        bind(loop);
        pre1(Ri);

        block_comment("  for (j = i; j; j--) {"); {
          movw(Rj, Ri);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
        } block_comment("  } // j");

        post1();
        addw(Ri, Ri, 1);
        cmpw(Ri, Rlen);
        br(Assembler::LT, loop);
        bind(end);
        block_comment("} // i");
      }

      block_comment("for (int i = len; i < 2*len; i++) {");
      mov(Ri, Rlen); {
        Label loop, end;
        cmpw(Ri, Rlen, Assembler::LSL, 1);
        br(Assembler::GE, end);

        bind(loop);
        pre2(Ri, Rlen);

        block_comment("  for (j = len*2-i-1; j; j--) {"); {
          lslw(Rj, Rlen, 1);
          subw(Rj, Rj, Ri);
          subw(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
        } block_comment("  } // j");

        post2(Ri, Rlen);
        addw(Ri, Ri, 1);
        cmpw(Ri, Rlen, Assembler::LSL, 1);
        br(Assembler::LT, loop);
        bind(end);
      }
      block_comment("} // i");

      normalize(Rlen);

      mov(Ra, Pm_base); // Save Pm_base in Ra
      restore_regs();   // Restore caller's Pm_base

      // Copy our result into caller's Pm_base
      reverse(Pm_base, Ra, Rlen, t0, t1);

      leave();
      bind(nothing);
      ret(lr);

      return entry;
    }
    // In C, approximately:

    // void
    // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
    //                     unsigned long Pn_base[], unsigned long Pm_base[],
    //                     unsigned long inv, int len) {
    //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
    //   unsigned long *Pa, *Pb, *Pn, *Pm;
    //   unsigned long Ra, Rb, Rn, Rm;

    //   int i;

    //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");

    //   for (i = 0; i < len; i++) {
    //     int j;

    //     Pa = Pa_base;
    //     Pb = Pb_base + i;
    //     Pm = Pm_base;
    //     Pn = Pn_base + i;

    //     Ra = *Pa;
    //     Rb = *Pb;
    //     Rm = *Pm;
    //     Rn = *Pn;

    //     int iters = i;
    //     for (j = 0; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
    //       MACC(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }

    //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
    //     MACC(Ra, Rb, t0, t1, t2);
    //     *Pm = Rm = t0 * inv;
    //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
    //     MACC(Rm, Rn, t0, t1, t2);

    //     assert(t0 == 0, "broken Montgomery multiply");

    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   for (i = len; i < 2*len; i++) {
    //     int j;

    //     Pa = Pa_base + i-len;
    //     Pb = Pb_base + len;
    //     Pm = Pm_base + i-len;
    //     Pn = Pn_base + len;

    //     Ra = *++Pa;
    //     Rb = *--Pb;
    //     Rm = *++Pm;
    //     Rn = *--Pn;

    //     int iters = len*2-i-1;
    //     for (j = i-len+1; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
    //       MACC(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }

    //     Pm_base[i-len] = t0;
    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   while (t0)
    //     t0 = sub(Pm_base, Pn_base, t0, len);
    // }

    /**
     * Fast Montgomery squaring.  This uses asymptotically 25% fewer
     * multiplies than Montgomery multiplication so it should be up to
     * 25% faster.  However, its loop control is more complex and it
     * may actually run slower on some machines.
     *
     * Arguments:
     *
     * Inputs:
     *   c_rarg0   - int array elements a
     *   c_rarg1   - int array elements n (the modulus)
     *   c_rarg2   - int length
     *   c_rarg3   - int inv
     *   c_rarg4   - int array elements m (the result)
     *
     */
    address generate_square() {
      Label argh;
      bind(argh);
      stop("MontgomeryMultiply total_allocation must be <= 8192");

      align(CodeEntryAlignment);
      address entry = pc();

      enter();

      // Make room.
      cmpw(Rlen, 512);
      br(Assembler::HI, argh);
      sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
      andr(sp, Ra, -2 * wordSize);

      lsrw(Rlen, Rlen, 1); // length in longwords = len/2

      {
        // Copy input args, reversing as we go.  We use Ra as a
        // temporary variable.
        reverse(Ra, Pa_base, Rlen, t0, t1);
        reverse(Ra, Pn_base, Rlen, t0, t1);
      }

      // Push all call-saved registers and also Pm_base which we'll need
      // at the end.
      save_regs();

      mov(Pm_base, Ra);

      mov(t0, zr);
      mov(t1, zr);
      mov(t2, zr);

      block_comment("for (int i = 0; i < len; i++) {");
      mov(Ri, zr); {
        Label loop, end;
        bind(loop);
        cmp(Ri, Rlen);
        br(Assembler::GE, end);

        pre1(Ri);

        block_comment("  for (j = (i+1)/2; j; j--) {"); {
          add(Rj, Ri, 1);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
        } block_comment("  } // j");

        last_squaring(Ri);

        block_comment("  for (j = i/2; j; j--) {"); {
          lsr(Rj, Ri, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
        } block_comment("  } // j");

        post1_squaring();
        add(Ri, Ri, 1);
        cmp(Ri, Rlen);
        br(Assembler::LT, loop);

        bind(end);
        block_comment("} // i");
      }

      block_comment("for (int i = len; i < 2*len; i++) {");
      mov(Ri, Rlen); {
        Label loop, end;
        bind(loop);
        cmp(Ri, Rlen, Assembler::LSL, 1);
        br(Assembler::GE, end);

        pre2(Ri, Rlen);

        block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
          lsl(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          sub(Rj, Rj, 1);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
        } block_comment("  } // j");

        last_squaring(Ri);

        block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
          lsl(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
        } block_comment("  } // j");

        post2(Ri, Rlen);
        add(Ri, Ri, 1);
        cmp(Ri, Rlen, Assembler::LSL, 1);

        br(Assembler::LT, loop);
        bind(end);
        block_comment("} // i");
      }

      normalize(Rlen);
      mov(Ra, Pm_base); // Save Pm_base in Ra
      restore_regs();   // Restore caller's Pm_base

      // Copy our result into caller's Pm_base
      reverse(Pm_base, Ra, Rlen, t0, t1);

      leave();
      ret(lr);

      return entry;
    }
    // In C, approximately:

    // void
    // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
    //                   unsigned long Pm_base[], unsigned long inv, int len) {
    //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
    //   unsigned long *Pa, *Pb, *Pn, *Pm;
    //   unsigned long Ra, Rb, Rn, Rm;

    //   int i;

    //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");

    //   for (i = 0; i < len; i++) {
    //     int j;

    //     Pa = Pa_base;
    //     Pb = Pa_base + i;
    //     Pm = Pm_base;
    //     Pn = Pn_base + i;

    //     Ra = *Pa;
    //     Rb = *Pb;
    //     Rm = *Pm;
    //     Rn = *Pn;

    //     int iters = (i+1)/2;
    //     for (j = 0; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
    //       MACC2(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     if ((i & 1) == 0) {
    //       assert(Ra == Pa_base[j], "must be");
    //       MACC(Ra, Ra, t0, t1, t2);
    //     }
    //     iters = i/2;
    //     assert(iters == i-j, "must be");
    //     for (; iters--; j++) {
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }

    //     *Pm = Rm = t0 * inv;
    //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
    //     MACC(Rm, Rn, t0, t1, t2);

    //     assert(t0 == 0, "broken Montgomery multiply");

    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   for (i = len; i < 2*len; i++) {
    //     int start = i-len+1;
    //     int end = start + (len - start)/2;
    //     int j;

    //     Pa = Pa_base + i-len;
    //     Pb = Pa_base + len;
    //     Pm = Pm_base + i-len;
    //     Pn = Pn_base + len;

    //     Ra = *++Pa;
    //     Rb = *--Pb;
    //     Rm = *++Pm;
    //     Rn = *--Pn;

    //     int iters = (2*len-i-1)/2;
    //     assert(iters == end-start, "must be");
    //     for (j = start; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
    //       MACC2(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     if ((i & 1) == 0) {
    //       assert(Ra == Pa_base[j], "must be");
    //       MACC(Ra, Ra, t0, t1, t2);
    //     }
    //     iters = (2*len-i)/2;
    //     assert(iters == len-j, "must be");
    //     for (; iters--; j++) {
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     Pm_base[i-len] = t0;
    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   while (t0)
    //     t0 = sub(Pm_base, Pn_base, t0, len);
    // }
  };
  // Initialization
  void generate_initial() {
    // Generates the initial stubs and initializes the entry points

    // Entry points that exist on all platforms.  Note: this is code
    // that could be shared among different platforms - however the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure.  See also the
    // comment in stubRoutines.hpp.

    StubRoutines::_forward_exception_entry = generate_forward_exception();

    StubRoutines::_call_stub_entry =
      generate_call_stub(StubRoutines::_call_stub_return_address);

    // This entry is referenced by megamorphic calls
    StubRoutines::_catch_exception_entry = generate_catch_exception();

    // Build this early so it's available for the interpreter.
    StubRoutines::_throw_StackOverflowError_entry =
      generate_throw_exception("StackOverflowError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_StackOverflowError));
    if (UseCRC32Intrinsics) {
      // Set the table address before generating the stubs that use it
      StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
    }
  }

  void generate_all() {
    // support for verify_oop (must happen after universe_init)
    StubRoutines::_verify_oop_subroutine_entry     = generate_verify_oop();
    StubRoutines::_throw_AbstractMethodError_entry =
      generate_throw_exception("AbstractMethodError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_AbstractMethodError));

    StubRoutines::_throw_IncompatibleClassChangeError_entry =
      generate_throw_exception("IncompatibleClassChangeError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_IncompatibleClassChangeError));

    StubRoutines::_throw_NullPointerException_at_call_entry =
      generate_throw_exception("NullPointerException at call throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_NullPointerException_at_call));

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }

    if (UseMontgomeryMultiplyIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }

    if (UseMontgomerySquareIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      // We use generate_multiply() rather than generate_square()
      // because it's faster for the sizes of modulus we care about.
      StubRoutines::_montgomerySquare = g.generate_multiply();
    }

#ifndef BUILTIN_SIM
    // generate GHASH intrinsics code
    if (UseGHASHIntrinsics) {
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
    }

    if (UseAESIntrinsics) {
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
    }

    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_implCompress   = generate_sha1_implCompress(false, "sha1_implCompress");
      StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true,  "sha1_implCompressMB");
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
    }

    if (UseCRC32CIntrinsics) {
      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
    }

    // generate Adler32 intrinsics code
    if (UseAdler32Intrinsics) {
      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
    }

    // Safefetch stubs.
    generate_safefetch("SafeFetch32", sizeof(int),      &StubRoutines::_safefetch32_entry,
                       &StubRoutines::_safefetch32_fault_pc,
                       &StubRoutines::_safefetch32_continuation_pc);
    generate_safefetch("SafeFetchN",  sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
                       &StubRoutines::_safefetchN_fault_pc,
                       &StubRoutines::_safefetchN_continuation_pc);
#endif
  }

 public:
  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
    if (all) {
      generate_all();
    } else {
      generate_initial();
    }
  }
}; // end class declaration

void StubGenerator_generate(CodeBuffer* code, bool all) {
  StubGenerator g(code, all);
}