1 /* 2 * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 23 * 24 */ 25 26 #include "precompiled.hpp" 27 #include "asm/macroAssembler.hpp" 28 #include "asm/macroAssembler.inline.hpp" 29 #include "interpreter/interpreter.hpp" 30 #include "nativeInst_aarch64.hpp" 31 #include "oops/instanceOop.hpp" 32 #include "oops/method.hpp" 33 #include "oops/objArrayKlass.hpp" 34 #include "oops/oop.inline.hpp" 35 #include "prims/methodHandles.hpp" 36 #include "runtime/frame.inline.hpp" 37 #include "runtime/handles.inline.hpp" 38 #include "runtime/sharedRuntime.hpp" 39 #include "runtime/stubCodeGenerator.hpp" 40 #include "runtime/stubRoutines.hpp" 41 #include "runtime/thread.inline.hpp" 42 #ifdef COMPILER2 43 #include "opto/runtime.hpp" 44 #endif 45 46 #ifdef BUILTIN_SIM 47 #include "../../../../../../simulator/simulator.hpp" 48 #endif 49 50 // Declaration and definition of StubGenerator (no .hpp file). 
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->
// Scaled-index addressing mode for oop arrays: index is a 32-bit int,
// scaled by the (possibly compressed) oop size.
#define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

// Bind a label and (in debug builds) emit a block comment naming it.
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  // Emit code that increments the 32-bit counter at &counter.
  // Clobbers rscratch1 and rscratch2 only. Not atomic: counts can be
  // lost under concurrency, which is acceptable for statistics.
  void inc_counter_np_(int& counter) {
    __ lea(rscratch2, ExternalAddress((address)&counter));
    __ ldrw(rscratch1, Address(rscratch2));
    __ addw(rscratch1, rscratch1, 1);
    __ strw(rscratch1, Address(rscratch2));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    c_rarg6:   parameter size (in words)              int
  //    c_rarg7:   thread                                 Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the c arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -27 [ argument word 1      ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper    (r0) ]
  //  -7 [ result          (r1) ]
  //  -6 [ result type     (r2) ]
  //  -5 [ method          (r3) ]
  //  -4 [ entry point     (r4) ]
  //  -3 [ parameters      (r5) ]
  //  -2 [ parameter size  (r6) ]
  //  -1 [ thread          (r7) ]
  //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  //   1 [ saved lr       (r30) ]

  // Call stub stack layout word offsets from fp.
  // Only every other saved-register slot is named here because the
  // registers are stored in pairs (stp/stpd); the partner register
  // lands in the implied adjacent slot (see diagram above).
  enum call_stub_layout {
    sp_after_call_off = -26,

    d15_off            = -26,
    d13_off            = -24,
    d11_off            = -22,
    d9_off             = -20,

    r28_off            = -18,
    r26_off            = -16,
    r24_off            = -14,
    r22_off            = -12,
    r20_off            = -10,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0,
    retaddr_off        =   1,
  };
  // Generate the call stub: the C-to-Java entry trampoline.
  // Saves all callee-save state, installs rthread/rmethod, copies the
  // Java parameters onto the stack, calls the (interpreter) entry
  // point, then stores the Java result through the result pointer and
  // restores state. return_address receives the PC that Java code
  // returns to (used by stack walking and exception handling).
  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    const Address sp_after_call(rfp, sp_after_call_off * wordSize);

    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off  * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);

    // stub code

    // we need a C prolog to bootstrap the x86 caller into the sim
    // (no-op outside the builtin simulator)
    __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);

    address aarch64_entry = __ pc();

#ifdef BUILTIN_SIM
    // Save sender's SP for stack traces.
    __ mov(rscratch1, sp);
    __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
#endif
    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5, entry_point);
    __ stp(c_rarg2, c_rarg3, result_type);
    __ stp(c_rarg0, c_rarg1, call_wrapper);

    // pairs are stored (lo, hi) so that e.g. r19 lands in the slot
    // below r20 -- see the frame layout diagram above the enum
    __ stp(r20, r19,  r20_save);
    __ stp(r22, r21,  r22_save);
    __ stp(r24, r23,  r24_save);
    __ stp(r26, r25,  r26_save);
    __ stp(r28, r27,  r28_save);

    __ stpd(v9,  v8,  d9_save);
    __ stpd(v11, v10, d11_save);
    __ stpd(v13, v12, d13_save);
    __ stpd(v15, v14, d15_save);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (unsigned)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    // reserve one stack slot per parameter word, keeping sp 16-byte aligned
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    // copy parameters from the incoming array onto the Java stack
    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing Method*, and current sp
    //      rmethod: Method*
    //      r13: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r13, sp);
    __ blr(c_rarg4);

    // tell the simulator we have returned to the stub

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    if (NotifySimulator) {
      __ notify(Assembler::method_reentry);
    }
    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    __ ldr(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ldr(j_rarg1, result_type);
    // T_OBJECT is stored as a full 64-bit word, same as T_LONG
    __ cmp(j_rarg1, T_OBJECT);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(j_rarg1, T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    // restore callee-save registers
    __ ldpd(v15, v14, d15_save);
    __ ldpd(v13, v12, d13_save);
    __ ldpd(v11, v10, d11_save);
    __ ldpd(v9,  v8,  d9_save);

    __ ldp(r28, r27,  r28_save);
    __ ldp(r26, r25,  r26_save);
    __ ldp(r24, r23,  r24_save);
    __ ldp(r22, r21,  r22_save);
    __ ldp(r20, r19,  r20_save);

    __ ldp(c_rarg0, c_rarg1, call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3,  method);
    __ ldp(c_rarg4, c_rarg5, entry_point);
    __ ldp(c_rarg6, c_rarg7, parameter_size);

#ifndef PRODUCT
    // tell the simulator we are about to end Java execution
    if (NotifySimulator) {
      __ notify(Assembler::method_exit);
    }
#endif
    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT

    __ BIND(is_long);
    __ str(r0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // rsp.
  //
  // r0: exception oop

  // NOTE: this is used as a target from the signal handler so it
  // needs an x86 prolog which returns into the current simulator
  // executing the generated catch_exception code. so the prolog
  // needs to install rax in a sim register and adjust the sim's
  // restart pc to enter the generated code at the start position
  // then return from native to simulated execution.
418 419 address generate_catch_exception() { 420 StubCodeMark mark(this, "StubRoutines", "catch_exception"); 421 address start = __ pc(); 422 423 // same as in generate_call_stub(): 424 const Address sp_after_call(rfp, sp_after_call_off * wordSize); 425 const Address thread (rfp, thread_off * wordSize); 426 427 #ifdef ASSERT 428 // verify that threads correspond 429 { 430 Label L, S; 431 __ ldr(rscratch1, thread); 432 __ cmp(rthread, rscratch1); 433 __ br(Assembler::NE, S); 434 __ get_thread(rscratch1); 435 __ cmp(rthread, rscratch1); 436 __ br(Assembler::EQ, L); 437 __ bind(S); 438 __ stop("StubRoutines::catch_exception: threads must correspond"); 439 __ bind(L); 440 } 441 #endif 442 443 // set pending exception 444 __ verify_oop(r0); 445 446 __ str(r0, Address(rthread, Thread::pending_exception_offset())); 447 __ mov(rscratch1, (address)__FILE__); 448 __ str(rscratch1, Address(rthread, Thread::exception_file_offset())); 449 __ movw(rscratch1, (int)__LINE__); 450 __ strw(rscratch1, Address(rthread, Thread::exception_line_offset())); 451 452 // complete return to VM 453 assert(StubRoutines::_call_stub_return_address != NULL, 454 "_call_stub_return_address must have been generated before"); 455 __ b(StubRoutines::_call_stub_return_address); 456 457 return start; 458 } 459 460 // Continuation point for runtime calls returning with a pending 461 // exception. The pending exception check happened in the runtime 462 // or native call stub. The pending exception in Thread is 463 // converted into a Java-level exception. 464 // 465 // Contract with Java-level exception handlers: 466 // r0: exception 467 // r3: throwing pc 468 // 469 // NOTE: At entry of this stub, exception-pc must be in LR !! 
470 471 // NOTE: this is always used as a jump target within generated code 472 // so it just needs to be generated code wiht no x86 prolog 473 474 address generate_forward_exception() { 475 StubCodeMark mark(this, "StubRoutines", "forward exception"); 476 address start = __ pc(); 477 478 // Upon entry, LR points to the return address returning into 479 // Java (interpreted or compiled) code; i.e., the return address 480 // becomes the throwing pc. 481 // 482 // Arguments pushed before the runtime call are still on the stack 483 // but the exception handler will reset the stack pointer -> 484 // ignore them. A potential result in registers can be ignored as 485 // well. 486 487 #ifdef ASSERT 488 // make sure this code is only executed if there is a pending exception 489 { 490 Label L; 491 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 492 __ cbnz(rscratch1, L); 493 __ stop("StubRoutines::forward exception: no pending exception (1)"); 494 __ bind(L); 495 } 496 #endif 497 498 // compute exception handler into r19 499 500 // call the VM to find the handler address associated with the 501 // caller address. pass thread in r0 and caller pc (ret address) 502 // in r1. n.b. the caller pc is in lr, unlike x86 where it is on 503 // the stack. 504 __ mov(c_rarg1, lr); 505 // lr will be trashed by the VM call so we move it to R19 506 // (callee-saved) because we also need to pass it to the handler 507 // returned by this call. 508 __ mov(r19, lr); 509 BLOCK_COMMENT("call exception_handler_for_return_address"); 510 __ call_VM_leaf(CAST_FROM_FN_PTR(address, 511 SharedRuntime::exception_handler_for_return_address), 512 rthread, c_rarg1); 513 // we should not really care that lr is no longer the callee 514 // address. we saved the value the handler needs in r19 so we can 515 // just copy it to r3. 
however, the C2 handler will push its own 516 // frame and then calls into the VM and the VM code asserts that 517 // the PC for the frame above the handler belongs to a compiled 518 // Java method. So, we restore lr here to satisfy that assert. 519 __ mov(lr, r19); 520 // setup r0 & r3 & clear pending exception 521 __ mov(r3, r19); 522 __ mov(r19, r0); 523 __ ldr(r0, Address(rthread, Thread::pending_exception_offset())); 524 __ str(zr, Address(rthread, Thread::pending_exception_offset())); 525 526 #ifdef ASSERT 527 // make sure exception is set 528 { 529 Label L; 530 __ cbnz(r0, L); 531 __ stop("StubRoutines::forward exception: no pending exception (2)"); 532 __ bind(L); 533 } 534 #endif 535 536 // continue at exception handler 537 // r0: exception 538 // r3: throwing pc 539 // r19: exception handler 540 __ verify_oop(r0); 541 __ br(r19); 542 543 return start; 544 } 545 546 // Non-destructive plausibility checks for oops 547 // 548 // Arguments: 549 // r0: oop to verify 550 // rscratch1: error message 551 // 552 // Stack after saving c_rarg3: 553 // [tos + 0]: saved c_rarg3 554 // [tos + 1]: saved c_rarg2 555 // [tos + 2]: saved lr 556 // [tos + 3]: saved rscratch2 557 // [tos + 4]: saved r0 558 // [tos + 5]: saved rscratch1 559 address generate_verify_oop() { 560 561 StubCodeMark mark(this, "StubRoutines", "verify_oop"); 562 address start = __ pc(); 563 564 Label exit, error; 565 566 // save c_rarg2 and c_rarg3 567 __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16))); 568 569 // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 570 __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 571 __ ldr(c_rarg3, Address(c_rarg2)); 572 __ add(c_rarg3, c_rarg3, 1); 573 __ str(c_rarg3, Address(c_rarg2)); 574 575 // object is in r0 576 // make sure object is 'reasonable' 577 __ cbz(r0, exit); // if obj is NULL it is OK 578 579 // Check if the oop is in the right area of memory 580 __ mov(c_rarg3, (intptr_t) 
Universe::verify_oop_mask()); 581 __ andr(c_rarg2, r0, c_rarg3); 582 __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits()); 583 584 // Compare c_rarg2 and c_rarg3. We don't use a compare 585 // instruction here because the flags register is live. 586 __ eor(c_rarg2, c_rarg2, c_rarg3); 587 __ cbnz(c_rarg2, error); 588 589 // make sure klass is 'reasonable', which is not zero. 590 __ load_klass(r0, r0); // get klass 591 __ cbz(r0, error); // if klass is NULL it is broken 592 593 // return if everything seems ok 594 __ bind(exit); 595 596 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 597 __ ret(lr); 598 599 // handle errors 600 __ bind(error); 601 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 602 603 __ push(RegSet::range(r0, r29), sp); 604 // debug(char* msg, int64_t pc, int64_t regs[]) 605 __ mov(c_rarg0, rscratch1); // pass address of error message 606 __ mov(c_rarg1, lr); // pass return address 607 __ mov(c_rarg2, sp); // pass address of regs on stack 608 #ifndef PRODUCT 609 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area"); 610 #endif 611 BLOCK_COMMENT("call MacroAssembler::debug"); 612 __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); 613 __ blrt(rscratch1, 3, 0, 1); 614 615 return start; 616 } 617 618 void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); } 619 620 // Generate code for an array write pre barrier 621 // 622 // addr - starting address 623 // count - element count 624 // tmp - scratch register 625 // 626 // Destroy no registers except rscratch1 and rscratch2 627 // 628 void gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) { 629 BarrierSet* bs = Universe::heap()->barrier_set(); 630 switch (bs->kind()) { 631 case BarrierSet::G1SATBCTLogging: 632 // With G1, don't generate the call if we statically know that the target in uninitialized 633 if (!dest_uninitialized) { 634 __ push_call_clobbered_registers(); 635 if 
(count == c_rarg0) { 636 if (addr == c_rarg1) { 637 // exactly backwards!! 638 __ mov(rscratch1, c_rarg0); 639 __ mov(c_rarg0, c_rarg1); 640 __ mov(c_rarg1, rscratch1); 641 } else { 642 __ mov(c_rarg1, count); 643 __ mov(c_rarg0, addr); 644 } 645 } else { 646 __ mov(c_rarg0, addr); 647 __ mov(c_rarg1, count); 648 } 649 __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2); 650 __ pop_call_clobbered_registers(); 651 break; 652 case BarrierSet::CardTableForRS: 653 case BarrierSet::CardTableExtension: 654 case BarrierSet::ModRef: 655 break; 656 default: 657 ShouldNotReachHere(); 658 659 } 660 } 661 } 662 663 // 664 // Generate code for an array write post barrier 665 // 666 // Input: 667 // start - register containing starting address of destination array 668 // end - register containing ending address of destination array 669 // scratch - scratch register 670 // 671 // The input registers are overwritten. 672 // The ending address is inclusive. 673 void gen_write_ref_array_post_barrier(Register start, Register end, Register scratch) { 674 assert_different_registers(start, end, scratch); 675 BarrierSet* bs = Universe::heap()->barrier_set(); 676 switch (bs->kind()) { 677 case BarrierSet::G1SATBCTLogging: 678 679 { 680 __ push_call_clobbered_registers(); 681 // must compute element count unless barrier set interface is changed (other platforms supply count) 682 assert_different_registers(start, end, scratch); 683 __ lea(scratch, Address(end, BytesPerHeapOop)); 684 __ sub(scratch, scratch, start); // subtract start to get #bytes 685 __ lsr(scratch, scratch, LogBytesPerHeapOop); // convert to element count 686 __ mov(c_rarg0, start); 687 __ mov(c_rarg1, scratch); 688 __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2); 689 __ pop_call_clobbered_registers(); 690 } 691 break; 692 case BarrierSet::CardTableForRS: 693 case BarrierSet::CardTableExtension: 694 { 695 CardTableModRefBS* ct = 
(CardTableModRefBS*)bs; 696 assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code"); 697 698 Label L_loop; 699 700 __ lsr(start, start, CardTableModRefBS::card_shift); 701 __ lsr(end, end, CardTableModRefBS::card_shift); 702 __ sub(end, end, start); // number of bytes to copy 703 704 const Register count = end; // 'end' register contains bytes count now 705 __ load_byte_map_base(scratch); 706 __ add(start, start, scratch); 707 if (UseConcMarkSweepGC) { 708 __ membar(__ StoreStore); 709 } 710 __ BIND(L_loop); 711 __ strb(zr, Address(start, count)); 712 __ subs(count, count, 1); 713 __ br(Assembler::GE, L_loop); 714 } 715 break; 716 default: 717 ShouldNotReachHere(); 718 719 } 720 } 721 722 address generate_zero_longs(Register base, Register cnt) { 723 Register tmp = rscratch1; 724 Register tmp2 = rscratch2; 725 int zva_length = VM_Version::zva_length(); 726 Label initial_table_end, loop_zva; 727 Label fini; 728 729 __ align(CodeEntryAlignment); 730 StubCodeMark mark(this, "StubRoutines", "zero_longs"); 731 address start = __ pc(); 732 733 // Base must be 16 byte aligned. If not just return and let caller handle it 734 __ tst(base, 0x0f); 735 __ br(Assembler::NE, fini); 736 // Align base with ZVA length. 737 __ neg(tmp, base); 738 __ andr(tmp, tmp, zva_length - 1); 739 740 // tmp: the number of bytes to be filled to align the base with ZVA length. 
741 __ add(base, base, tmp); 742 __ sub(cnt, cnt, tmp, Assembler::ASR, 3); 743 __ adr(tmp2, initial_table_end); 744 __ sub(tmp2, tmp2, tmp, Assembler::LSR, 2); 745 __ br(tmp2); 746 747 for (int i = -zva_length + 16; i < 0; i += 16) 748 __ stp(zr, zr, Address(base, i)); 749 __ bind(initial_table_end); 750 751 __ sub(cnt, cnt, zva_length >> 3); 752 __ bind(loop_zva); 753 __ dc(Assembler::ZVA, base); 754 __ subs(cnt, cnt, zva_length >> 3); 755 __ add(base, base, zva_length); 756 __ br(Assembler::GE, loop_zva); 757 __ add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA 758 __ bind(fini); 759 __ ret(lr); 760 761 return start; 762 } 763 764 typedef enum { 765 copy_forwards = 1, 766 copy_backwards = -1 767 } copy_direction; 768 769 // Bulk copy of blocks of 8 words. 770 // 771 // count is a count of words. 772 // 773 // Precondition: count >= 8 774 // 775 // Postconditions: 776 // 777 // The least significant bit of count contains the remaining count 778 // of words to copy. The rest of count is trash. 779 // 780 // s and d are adjusted to point to the remaining words to copy 781 // 782 void generate_copy_longs(Label &start, Register s, Register d, Register count, 783 copy_direction direction) { 784 int unit = wordSize * direction; 785 int bias = (UseSIMDForMemoryOps ? 
4:2) * wordSize; 786 787 int offset; 788 const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6, 789 t4 = r7, t5 = r10, t6 = r11, t7 = r12; 790 const Register stride = r13; 791 792 assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7); 793 assert_different_registers(s, d, count, rscratch1); 794 795 Label again, drain; 796 const char *stub_name; 797 if (direction == copy_forwards) 798 stub_name = "foward_copy_longs"; 799 else 800 stub_name = "backward_copy_longs"; 801 StubCodeMark mark(this, "StubRoutines", stub_name); 802 __ align(CodeEntryAlignment); 803 __ bind(start); 804 805 Label unaligned_copy_long; 806 if (AvoidUnalignedAccesses) { 807 __ tbnz(d, 3, unaligned_copy_long); 808 } 809 810 if (direction == copy_forwards) { 811 __ sub(s, s, bias); 812 __ sub(d, d, bias); 813 } 814 815 #ifdef ASSERT 816 // Make sure we are never given < 8 words 817 { 818 Label L; 819 __ cmp(count, 8); 820 __ br(Assembler::GE, L); 821 __ stop("genrate_copy_longs called with < 8 words"); 822 __ bind(L); 823 } 824 #endif 825 826 // Fill 8 registers 827 if (UseSIMDForMemoryOps) { 828 __ ldpq(v0, v1, Address(s, 4 * unit)); 829 __ ldpq(v2, v3, Address(__ pre(s, 8 * unit))); 830 } else { 831 __ ldp(t0, t1, Address(s, 2 * unit)); 832 __ ldp(t2, t3, Address(s, 4 * unit)); 833 __ ldp(t4, t5, Address(s, 6 * unit)); 834 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 835 } 836 837 __ subs(count, count, 16); 838 __ br(Assembler::LO, drain); 839 840 int prefetch = PrefetchCopyIntervalInBytes; 841 bool use_stride = false; 842 if (direction == copy_backwards) { 843 use_stride = prefetch > 256; 844 prefetch = -prefetch; 845 if (use_stride) __ mov(stride, prefetch); 846 } 847 848 __ bind(again); 849 850 if (PrefetchCopyIntervalInBytes > 0) 851 __ prfm(use_stride ? 
Address(s, stride) : Address(s, prefetch), PLDL1KEEP); 852 853 if (UseSIMDForMemoryOps) { 854 __ stpq(v0, v1, Address(d, 4 * unit)); 855 __ ldpq(v0, v1, Address(s, 4 * unit)); 856 __ stpq(v2, v3, Address(__ pre(d, 8 * unit))); 857 __ ldpq(v2, v3, Address(__ pre(s, 8 * unit))); 858 } else { 859 __ stp(t0, t1, Address(d, 2 * unit)); 860 __ ldp(t0, t1, Address(s, 2 * unit)); 861 __ stp(t2, t3, Address(d, 4 * unit)); 862 __ ldp(t2, t3, Address(s, 4 * unit)); 863 __ stp(t4, t5, Address(d, 6 * unit)); 864 __ ldp(t4, t5, Address(s, 6 * unit)); 865 __ stp(t6, t7, Address(__ pre(d, 8 * unit))); 866 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 867 } 868 869 __ subs(count, count, 8); 870 __ br(Assembler::HS, again); 871 872 // Drain 873 __ bind(drain); 874 if (UseSIMDForMemoryOps) { 875 __ stpq(v0, v1, Address(d, 4 * unit)); 876 __ stpq(v2, v3, Address(__ pre(d, 8 * unit))); 877 } else { 878 __ stp(t0, t1, Address(d, 2 * unit)); 879 __ stp(t2, t3, Address(d, 4 * unit)); 880 __ stp(t4, t5, Address(d, 6 * unit)); 881 __ stp(t6, t7, Address(__ pre(d, 8 * unit))); 882 } 883 884 { 885 Label L1, L2; 886 __ tbz(count, exact_log2(4), L1); 887 if (UseSIMDForMemoryOps) { 888 __ ldpq(v0, v1, Address(__ pre(s, 4 * unit))); 889 __ stpq(v0, v1, Address(__ pre(d, 4 * unit))); 890 } else { 891 __ ldp(t0, t1, Address(s, 2 * unit)); 892 __ ldp(t2, t3, Address(__ pre(s, 4 * unit))); 893 __ stp(t0, t1, Address(d, 2 * unit)); 894 __ stp(t2, t3, Address(__ pre(d, 4 * unit))); 895 } 896 __ bind(L1); 897 898 if (direction == copy_forwards) { 899 __ add(s, s, bias); 900 __ add(d, d, bias); 901 } 902 903 __ tbz(count, 1, L2); 904 __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards))); 905 __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards))); 906 __ bind(L2); 907 } 908 909 __ ret(lr); 910 911 if (AvoidUnalignedAccesses) { 912 Label drain, again; 913 // Register order for storing. Order is different for backward copy. 
914 915 __ bind(unaligned_copy_long); 916 917 // source address is even aligned, target odd aligned 918 // 919 // when forward copying word pairs we read long pairs at offsets 920 // {0, 2, 4, 6} (in long words). when backwards copying we read 921 // long pairs at offsets {-2, -4, -6, -8}. We adjust the source 922 // address by -2 in the forwards case so we can compute the 923 // source offsets for both as {2, 4, 6, 8} * unit where unit = 1 924 // or -1. 925 // 926 // when forward copying we need to store 1 word, 3 pairs and 927 // then 1 word at offsets {0, 1, 3, 5, 7}. Rather thna use a 928 // zero offset We adjust the destination by -1 which means we 929 // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores. 930 // 931 // When backwards copyng we need to store 1 word, 3 pairs and 932 // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use 933 // offsets {1, 3, 5, 7, 8} * unit. 934 935 if (direction == copy_forwards) { 936 __ sub(s, s, 16); 937 __ sub(d, d, 8); 938 } 939 940 // Fill 8 registers 941 // 942 // for forwards copy s was offset by -16 from the original input 943 // value of s so the register contents are at these offsets 944 // relative to the 64 bit block addressed by that original input 945 // and so on for each successive 64 byte block when s is updated 946 // 947 // t0 at offset 0, t1 at offset 8 948 // t2 at offset 16, t3 at offset 24 949 // t4 at offset 32, t5 at offset 40 950 // t6 at offset 48, t7 at offset 56 951 952 // for backwards copy s was not offset so the register contents 953 // are at these offsets into the preceding 64 byte block 954 // relative to that original input and so on for each successive 955 // preceding 64 byte block when s is updated. this explains the 956 // slightly counter-intuitive looking pattern of register usage 957 // in the stp instructions for backwards copy. 
958 // 959 // t0 at offset -16, t1 at offset -8 960 // t2 at offset -32, t3 at offset -24 961 // t4 at offset -48, t5 at offset -40 962 // t6 at offset -64, t7 at offset -56 963 964 __ ldp(t0, t1, Address(s, 2 * unit)); 965 __ ldp(t2, t3, Address(s, 4 * unit)); 966 __ ldp(t4, t5, Address(s, 6 * unit)); 967 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 968 969 __ subs(count, count, 16); 970 __ br(Assembler::LO, drain); 971 972 int prefetch = PrefetchCopyIntervalInBytes; 973 bool use_stride = false; 974 if (direction == copy_backwards) { 975 use_stride = prefetch > 256; 976 prefetch = -prefetch; 977 if (use_stride) __ mov(stride, prefetch); 978 } 979 980 __ bind(again); 981 982 if (PrefetchCopyIntervalInBytes > 0) 983 __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP); 984 985 if (direction == copy_forwards) { 986 // allowing for the offset of -8 the store instructions place 987 // registers into the target 64 bit block at the following 988 // offsets 989 // 990 // t0 at offset 0 991 // t1 at offset 8, t2 at offset 16 992 // t3 at offset 24, t4 at offset 32 993 // t5 at offset 40, t6 at offset 48 994 // t7 at offset 56 995 996 __ str(t0, Address(d, 1 * unit)); 997 __ stp(t1, t2, Address(d, 2 * unit)); 998 __ ldp(t0, t1, Address(s, 2 * unit)); 999 __ stp(t3, t4, Address(d, 4 * unit)); 1000 __ ldp(t2, t3, Address(s, 4 * unit)); 1001 __ stp(t5, t6, Address(d, 6 * unit)); 1002 __ ldp(t4, t5, Address(s, 6 * unit)); 1003 __ str(t7, Address(__ pre(d, 8 * unit))); 1004 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 1005 } else { 1006 // d was not offset when we started so the registers are 1007 // written into the 64 bit block preceding d with the following 1008 // offsets 1009 // 1010 // t1 at offset -8 1011 // t3 at offset -24, t0 at offset -16 1012 // t5 at offset -48, t2 at offset -32 1013 // t7 at offset -56, t4 at offset -48 1014 // t6 at offset -64 1015 // 1016 // note that this matches the offsets previously noted for the 1017 // loads 1018 
1019 __ str(t1, Address(d, 1 * unit)); 1020 __ stp(t3, t0, Address(d, 3 * unit)); 1021 __ ldp(t0, t1, Address(s, 2 * unit)); 1022 __ stp(t5, t2, Address(d, 5 * unit)); 1023 __ ldp(t2, t3, Address(s, 4 * unit)); 1024 __ stp(t7, t4, Address(d, 7 * unit)); 1025 __ ldp(t4, t5, Address(s, 6 * unit)); 1026 __ str(t6, Address(__ pre(d, 8 * unit))); 1027 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 1028 } 1029 1030 __ subs(count, count, 8); 1031 __ br(Assembler::HS, again); 1032 1033 // Drain 1034 // 1035 // this uses the same pattern of offsets and register arguments 1036 // as above 1037 __ bind(drain); 1038 if (direction == copy_forwards) { 1039 __ str(t0, Address(d, 1 * unit)); 1040 __ stp(t1, t2, Address(d, 2 * unit)); 1041 __ stp(t3, t4, Address(d, 4 * unit)); 1042 __ stp(t5, t6, Address(d, 6 * unit)); 1043 __ str(t7, Address(__ pre(d, 8 * unit))); 1044 } else { 1045 __ str(t1, Address(d, 1 * unit)); 1046 __ stp(t3, t0, Address(d, 3 * unit)); 1047 __ stp(t5, t2, Address(d, 5 * unit)); 1048 __ stp(t7, t4, Address(d, 7 * unit)); 1049 __ str(t6, Address(__ pre(d, 8 * unit))); 1050 } 1051 // now we need to copy any remaining part block which may 1052 // include a 4 word block subblock and/or a 2 word subblock. 
1053 // bits 2 and 1 in the count are the tell-tale for whetehr we 1054 // have each such subblock 1055 { 1056 Label L1, L2; 1057 __ tbz(count, exact_log2(4), L1); 1058 // this is the same as above but copying only 4 longs hence 1059 // with ony one intervening stp between the str instructions 1060 // but note that the offsets and registers still follow the 1061 // same pattern 1062 __ ldp(t0, t1, Address(s, 2 * unit)); 1063 __ ldp(t2, t3, Address(__ pre(s, 4 * unit))); 1064 if (direction == copy_forwards) { 1065 __ str(t0, Address(d, 1 * unit)); 1066 __ stp(t1, t2, Address(d, 2 * unit)); 1067 __ str(t3, Address(__ pre(d, 4 * unit))); 1068 } else { 1069 __ str(t1, Address(d, 1 * unit)); 1070 __ stp(t3, t0, Address(d, 3 * unit)); 1071 __ str(t2, Address(__ pre(d, 4 * unit))); 1072 } 1073 __ bind(L1); 1074 1075 __ tbz(count, 1, L2); 1076 // this is the same as above but copying only 2 longs hence 1077 // there is no intervening stp between the str instructions 1078 // but note that the offset and register patterns are still 1079 // the same 1080 __ ldp(t0, t1, Address(__ pre(s, 2 * unit))); 1081 if (direction == copy_forwards) { 1082 __ str(t0, Address(d, 1 * unit)); 1083 __ str(t1, Address(__ pre(d, 2 * unit))); 1084 } else { 1085 __ str(t1, Address(d, 1 * unit)); 1086 __ str(t0, Address(__ pre(d, 2 * unit))); 1087 } 1088 __ bind(L2); 1089 1090 // for forwards copy we need to re-adjust the offsets we 1091 // applied so that s and d are follow the last words written 1092 1093 if (direction == copy_forwards) { 1094 __ add(s, s, 16); 1095 __ add(d, d, 8); 1096 } 1097 1098 } 1099 1100 __ ret(lr); 1101 } 1102 } 1103 1104 // Small copy: less than 16 bytes. 1105 // 1106 // NB: Ignores all of the bits of count which represent more than 15 1107 // bytes, so a caller doesn't have to mask them. 
  // Arguments:
  //   s, d   - source/destination addresses; updated by the writeback
  //            addressing of __ adjust as data is moved
  //   count  - element count; only the low-order bits selecting < 8
  //            bytes' worth of elements are examined (see tbz tests)
  //   tmp    - scratch register the data passes through
  //   step   - signed element size in bytes; negative => copy with
  //            descending addresses
  void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);             // element size in bytes
    int direction = is_backwards ? -1 : 1;
    int unit = wordSize * direction;             // signed byte stride for a word move

    Label Lpair, Lword, Lint, Lshort, Lbyte;     // (Lpair is declared but unused here)

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.

    // Bit (3 - log2(granularity)) of count says whether a full 8-byte
    // word remains to be copied.
    __ tbz(count, 3 - exact_log2(granularity), Lword);
    __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
    __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      // Possible 4-byte remainder.
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    if (granularity <= sizeof (jshort)) {
      // Possible 2-byte remainder.
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    if (granularity <= sizeof (jbyte)) {
      // Possible single trailing byte.
      __ tbz(count, 0, Lbyte);
      __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }

  // Entry points of the bulk word-pair copy loops; reached via bl from
  // copy_memory below.
  Label copy_f, copy_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.
  // The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //
  // Small copies (<= 80 bytes, or 96 with SIMD) are done with inline
  // straight-line code; larger copies align s to a 2-word boundary and
  // then call the out-of-line bulk loop at copy_f/copy_b, finishing
  // the tail with copy_memory_small.

  void copy_memory(bool is_aligned, Register s, Register d,
                   Register count, Register tmp, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    int granularity = uabs(step);                // element size in bytes
    const Register t0 = r3, t1 = r4;

    // <= 96 bytes do inline. Direction doesn't matter because we always
    // load all the data before writing anything
    Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
    const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
    const Register send = r17, dend = r18;       // one-past-the-end pointers

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
    __ cmp(count, (UseSIMDForMemoryOps ? 96:80)/granularity);
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, 16/granularity);
    __ br(Assembler::LS, copy16);

    __ cmp(count, 64/granularity);
    __ br(Assembler::HI, copy80);

    __ cmp(count, 32/granularity);
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    // The head is copied from the start and the tail back from the
    // end, so the two stores may overlap in the middle; all loads
    // happen before any store.
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 0));
      __ ldpq(v2, v3, Address(send, -32));
      __ stpq(v0, v1, Address(d, 0));
      __ stpq(v2, v3, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(send, -32));
      __ ldp(t6, t7, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(dend, -32));
      __ stp(t6, t7, Address(dend, -16));
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    __ ldp(t0, t1, Address(s, 0));
    __ ldp(t2, t3, Address(send, -16));
    __ stp(t0, t1, Address(d, 0));
    __ stp(t2, t3, Address(dend, -16));
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
      __ ldpq(v4, v5, Address(send, -32));
      __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
      __ stpq(v4, v5, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(s, 32));
      __ ldp(t6, t7, Address(s, 48));
      __ ldp(t8, t9, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(d, 32));
      __ stp(t6, t7, Address(d, 48));
      __ stp(t8, t9, Address(dend, -16));
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, 8/granularity);
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    __ ldr(t0, Address(s, 0));
    __ ldr(t1, Address(send, -8));
    __ str(t0, Address(d, 0));
    __ str(t1, Address(dend, -8));
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does means in the 1 byte case we load/store the same
          // byte 3 times.
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }

    __ bind(copy_big);
    // For a backwards copy, point s and d past the last element so the
    // bulk loop can walk down with pre-decrement addressing.
    if (is_backwards) {
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now we've got the small case out of the way we can align the
    // source address on a 2-word boundary.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
      __ tbz(s, exact_log2(wordSize), aligned);
      __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
      __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
      __ sub(count, count, wordSize/granularity);
    } else {
      if (is_backwards) {
        __ andr(rscratch2, s, 2 * wordSize - 1);
      } else {
        __ neg(rscratch2, s);
        __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
      }
      // rscratch2 is the byte adjustment needed to align s.
      __ cbz(rscratch2, aligned);
      int shift = exact_log2(granularity);
      if (shift)  __ lsr(rscratch2, rscratch2, shift);
      __ sub(count, count, rscratch2);

#if 0
      // ?? This code is only correct for a disjoint copy.  It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, rscratch2);
        __ sub(d, d, rscratch2);
      } else {
        __ add(s, s, rscratch2);
        __ add(d, d, rscratch2);
      }
#else
      // Copy the unaligned head element by element.
      copy_memory_small(s, d, rscratch2, rscratch1, step);
#endif
    }

    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes.  Adjust the
    // count and do a bulk copy of words.
    __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
    if (direction == copy_forwards)
      __ bl(copy_f);
    else
      __ bl(copy_b);

    // And the tail.
    copy_memory_small(s, d, count, tmp, step);

    // For granularity >= 4, copy8/copy4 were never bound above; bind
    // them here so the early branches have a target (they fall through
    // to finish).
    if (granularity >= 8) __ bind(copy8);
    if (granularity >= 4) __ bind(copy4);
    __ bind(finish);
  }


  // Debug-only helper: fill the temp registers r3..r18 (other than
  // rscratch1) with a recognizable poison value so stale contents are
  // caught.  No-op in product builds.
  void clobber_registers() {
#ifdef ASSERT
    __ mov(rscratch1, (uint64_t)0xdeadbeef);
    __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
    for (Register r = r3; r <= r18; r++)
      if (r != rscratch1) __ mov(r, rscratch1);
#endif
  }

  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers rscratch1 and rscratch2.
1374 void verify_oop_array (size_t size, Register a, Register count, Register temp) { 1375 Label loop, end; 1376 __ mov(rscratch1, a); 1377 __ mov(rscratch2, zr); 1378 __ bind(loop); 1379 __ cmp(rscratch2, count); 1380 __ br(Assembler::HS, end); 1381 if (size == (size_t)wordSize) { 1382 __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1383 __ verify_oop(temp); 1384 } else { 1385 __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1386 __ decode_heap_oop(temp); // calls verify_oop 1387 } 1388 __ add(rscratch2, rscratch2, size); 1389 __ b(loop); 1390 __ bind(end); 1391 } 1392 1393 // Arguments: 1394 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1395 // ignored 1396 // is_oop - true => oop array, so generate store check code 1397 // name - stub name string 1398 // 1399 // Inputs: 1400 // c_rarg0 - source array address 1401 // c_rarg1 - destination array address 1402 // c_rarg2 - element count, treated as ssize_t, can be zero 1403 // 1404 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1405 // the hardware handle it. The two dwords within qwords that span 1406 // cache line boundaries will still be loaded and stored atomicly. 1407 // 1408 // Side Effects: 1409 // disjoint_int_copy_entry is set to the no-overlap entry point 1410 // used by generate_conjoint_int_oop_copy(). 
  //
  // Generates a stub that copies `count` elements of `size` bytes from
  // c_rarg0 to c_rarg1 assuming the ranges do not overlap.  Returns the
  // stub's start address; if `entry` is non-NULL it receives the
  // post-frame-setup entry point for use by the conjoint stub.
  address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
                                 const char *name, bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    if (is_oop) {
      // d and count are clobbered by copy_memory; preserve them for the
      // post-barrier / verification below.
      __ push(RegSet::of(d, count), sp);
      // no registers are destroyed by this call
      gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
    }
    copy_memory(aligned, s, d, count, rscratch1, size);
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
      gen_write_ref_array_post_barrier(d, count, rscratch1);
    }
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
1466 // 1467 address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target, 1468 address *entry, const char *name, 1469 bool dest_uninitialized = false) { 1470 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1471 1472 StubCodeMark mark(this, "StubRoutines", name); 1473 address start = __ pc(); 1474 __ enter(); 1475 1476 if (entry != NULL) { 1477 *entry = __ pc(); 1478 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1479 BLOCK_COMMENT("Entry:"); 1480 } 1481 1482 // use fwd copy when (d-s) above_equal (count*size) 1483 __ sub(rscratch1, d, s); 1484 __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size)); 1485 __ br(Assembler::HS, nooverlap_target); 1486 1487 if (is_oop) { 1488 __ push(RegSet::of(d, count), sp); 1489 // no registers are destroyed by this call 1490 gen_write_ref_array_pre_barrier(d, count, dest_uninitialized); 1491 } 1492 copy_memory(aligned, s, d, count, rscratch1, -size); 1493 if (is_oop) { 1494 __ pop(RegSet::of(d, count), sp); 1495 if (VerifyOops) 1496 verify_oop_array(size, d, count, r16); 1497 __ sub(count, count, 1); // make an inclusive end pointer 1498 __ lea(count, Address(d, count, Address::lsl(exact_log2(size)))); 1499 gen_write_ref_array_post_barrier(d, count, rscratch1); 1500 } 1501 __ leave(); 1502 __ mov(r0, zr); // return 0 1503 __ ret(lr); 1504 #ifdef BUILTIN_SIM 1505 { 1506 AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck); 1507 sim->notifyCompile(const_cast<char*>(name), start); 1508 } 1509 #endif 1510 return start; 1511 } 1512 1513 // Arguments: 1514 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1515 // ignored 1516 // name - stub name string 1517 // 1518 // Inputs: 1519 // c_rarg0 - source array address 1520 // c_rarg1 - destination array address 1521 // c_rarg2 - element count, treated as ssize_t, can be zero 1522 // 1523 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1524 // 
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
  //
  address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
    const bool not_oop = false;   // byte arrays never hold oops: no store checks
    return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
1558 // 1559 address generate_conjoint_byte_copy(bool aligned, address nooverlap_target, 1560 address* entry, const char *name) { 1561 const bool not_oop = false; 1562 return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name); 1563 } 1564 1565 // Arguments: 1566 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1567 // ignored 1568 // name - stub name string 1569 // 1570 // Inputs: 1571 // c_rarg0 - source array address 1572 // c_rarg1 - destination array address 1573 // c_rarg2 - element count, treated as ssize_t, can be zero 1574 // 1575 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1576 // let the hardware handle it. The two or four words within dwords 1577 // or qwords that span cache line boundaries will still be loaded 1578 // and stored atomically. 1579 // 1580 // Side Effects: 1581 // disjoint_short_copy_entry is set to the no-overlap entry point 1582 // used by generate_conjoint_short_copy(). 1583 // 1584 address generate_disjoint_short_copy(bool aligned, 1585 address* entry, const char *name) { 1586 const bool not_oop = false; 1587 return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name); 1588 } 1589 1590 // Arguments: 1591 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1592 // ignored 1593 // name - stub name string 1594 // 1595 // Inputs: 1596 // c_rarg0 - source array address 1597 // c_rarg1 - destination array address 1598 // c_rarg2 - element count, treated as ssize_t, can be zero 1599 // 1600 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1601 // let the hardware handle it. The two or four words within dwords 1602 // or qwords that span cache line boundaries will still be loaded 1603 // and stored atomically. 
1604 // 1605 address generate_conjoint_short_copy(bool aligned, address nooverlap_target, 1606 address *entry, const char *name) { 1607 const bool not_oop = false; 1608 return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name); 1609 1610 } 1611 // Arguments: 1612 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1613 // ignored 1614 // name - stub name string 1615 // 1616 // Inputs: 1617 // c_rarg0 - source array address 1618 // c_rarg1 - destination array address 1619 // c_rarg2 - element count, treated as ssize_t, can be zero 1620 // 1621 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1622 // the hardware handle it. The two dwords within qwords that span 1623 // cache line boundaries will still be loaded and stored atomicly. 1624 // 1625 // Side Effects: 1626 // disjoint_int_copy_entry is set to the no-overlap entry point 1627 // used by generate_conjoint_int_oop_copy(). 1628 // 1629 address generate_disjoint_int_copy(bool aligned, address *entry, 1630 const char *name, bool dest_uninitialized = false) { 1631 const bool not_oop = false; 1632 return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name); 1633 } 1634 1635 // Arguments: 1636 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1637 // ignored 1638 // name - stub name string 1639 // 1640 // Inputs: 1641 // c_rarg0 - source array address 1642 // c_rarg1 - destination array address 1643 // c_rarg2 - element count, treated as ssize_t, can be zero 1644 // 1645 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1646 // the hardware handle it. The two dwords within qwords that span 1647 // cache line boundaries will still be loaded and stored atomicly. 
1648 // 1649 address generate_conjoint_int_copy(bool aligned, address nooverlap_target, 1650 address *entry, const char *name, 1651 bool dest_uninitialized = false) { 1652 const bool not_oop = false; 1653 return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name); 1654 } 1655 1656 1657 // Arguments: 1658 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1659 // ignored 1660 // name - stub name string 1661 // 1662 // Inputs: 1663 // c_rarg0 - source array address 1664 // c_rarg1 - destination array address 1665 // c_rarg2 - element count, treated as size_t, can be zero 1666 // 1667 // Side Effects: 1668 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1669 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1670 // 1671 address generate_disjoint_long_copy(bool aligned, address *entry, 1672 const char *name, bool dest_uninitialized = false) { 1673 const bool not_oop = false; 1674 return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name); 1675 } 1676 1677 // Arguments: 1678 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1679 // ignored 1680 // name - stub name string 1681 // 1682 // Inputs: 1683 // c_rarg0 - source array address 1684 // c_rarg1 - destination array address 1685 // c_rarg2 - element count, treated as size_t, can be zero 1686 // 1687 address generate_conjoint_long_copy(bool aligned, 1688 address nooverlap_target, address *entry, 1689 const char *name, bool dest_uninitialized = false) { 1690 const bool not_oop = false; 1691 return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name); 1692 } 1693 1694 // Arguments: 1695 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1696 // ignored 1697 // name - stub name string 1698 // 1699 // Inputs: 1700 // c_rarg0 - source array address 1701 // c_rarg1 - destination array address 1702 // c_rarg2 - 
  //             element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_oop_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    // Element size depends on whether oops are compressed in the heap.
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_oop_copy(bool aligned,
                                     address nooverlap_target, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    // Element size depends on whether oops are compressed in the heap.
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
                                  name, dest_uninitialized);
  }


  // Helper for generating a dynamic type check.
  // Smashes rscratch1.
  // Branches to L_success if sub_klass is a subtype of super_klass;
  // falls through on failure.
  void generate_type_check(Register sub_klass,
                           Register super_check_offset,
                           Register super_klass,
                           Label& L_success) {
    assert_different_registers(sub_klass, super_check_offset, super_klass);

    BLOCK_COMMENT("type_check:");

    Label L_miss;

    // Fast path first; only on a miss fall into the slow path.
    __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
                                     super_check_offset);
    __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);

    // Fall through on failure!
    __ BIND(L_miss);
  }

  //
  //  Generate checkcasting array copy stub
  //
  //  Input:
  //    c_rarg0   - source array address
  //    c_rarg1   - destination array address
  //    c_rarg2   - element count, treated as ssize_t, can be zero
  //    c_rarg3   - size_t ckoff (super_check_offset)
  //    c_rarg4   - oop ckval (super_klass)
  //
  //  Output:
  //    r0 ==  0  -  success
  //    r0 == -1^K - failure, where K is partial transfer count
  //
  address generate_checkcast_copy(const char *name, address *entry,
                                  bool dest_uninitialized = false) {

    Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;

    // Input registers (after setup_arg_regs)
    const Register from        = c_rarg0;   // source array address
    const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // elements count
    const Register ckoff       = c_rarg3;   // super_check_offset
    const Register ckval       = c_rarg4;   // super_klass

    // Registers used as temps (r18, r19, r20 are save-on-entry)
    const Register count_save  = r21;       // orig elements count
    const Register start_to    = r20;       // destination array start address
    const Register copied_oop  = r18;       // actual oop copied
    const Register r19_klass   = r19;       // oop._klass

    //---------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the two arrays are subtypes of Object[] but the
    // destination array type is not equal to or a supertype
    // of the source type.  Each element must be separately
    // checked.

    assert_different_registers(from, to, count, ckoff, ckval, start_to,
                               copied_oop, r19_klass, count_save);

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef ASSERT
    // caller guarantees that the arrays really are different
    // otherwise, we would have to make conjoint checks
    { Label L;
      array_overlap_test(L, TIMES_OOP);
      __ stop("checkcast_copy within a single array");
      __ bind(L);
    }
#endif //ASSERT

    // Caller of this entry point must set up the argument registers.
    if (entry != NULL) {
      *entry = __ pc();
      BLOCK_COMMENT("Entry:");
    }

    // Empty array:  Nothing to do.
    __ cbz(count, L_done);

    // r18..r21 are callee-saved temps; restore them before returning.
    __ push(RegSet::of(r18, r19, r20, r21), sp);

#ifdef ASSERT
    BLOCK_COMMENT("assert consistent ckoff/ckval");
    // The ckoff and ckval must be mutually consistent,
    // even though caller generates both.
    { Label L;
      int sco_offset = in_bytes(Klass::super_check_offset_offset());
      __ ldrw(start_to, Address(ckval, sco_offset));
      __ cmpw(ckoff, start_to);
      __ br(Assembler::EQ, L);
      __ stop("super_check_offset inconsistent");
      __ bind(L);
    }
#endif //ASSERT

    gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);

    // save the original count
    __ mov(count_save, count);

    // Copy from low to high addresses
    __ mov(start_to, to);              // Save destination array start address
    __ b(L_load_element);

    // ======== begin loop ========
    // (Loop is rotated; its entry is L_load_element.)
    // Loop control:
    //   for (; count != 0; count--) {
    //     copied_oop = load_heap_oop(from++);
    //     ... generate_type_check ...;
    //     store_heap_oop(to++, copied_oop);
    //   }
    __ align(OptoLoopAlignment);

    __ BIND(L_store_element);
    __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop);  // store the oop
    __ sub(count, count, 1);
    __ cbz(count, L_do_card_marks);

    // ======== loop entry is here ========
    __ BIND(L_load_element);
    __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop
    // NULL elements need no type check.
    __ cbz(copied_oop, L_store_element);

    __ load_klass(r19_klass, copied_oop);// query the object klass
    generate_type_check(r19_klass, ckoff, ckval, L_store_element);
    // ======== end loop ========

    // It was a real error; we must depend on the caller to finish the job.
    // Register count = remaining oops, count_orig = total oops.
    // Emit GC store barriers for the oops we have copied and report
    // their number to the caller.

    __ subs(count, count_save, count);     // K = partially copied oop count
    __ eon(count, count, zr);                   // report (-1^K) to caller
    // Flags are from the subs above: EQ <=> K == 0, nothing was copied,
    // so skip the card marks entirely.
    __ br(Assembler::EQ, L_done_pop);

    __ BIND(L_do_card_marks);
    __ add(to, to, -heapOopSize);         // make an inclusive end pointer
    gen_write_ref_array_post_barrier(start_to, to, rscratch1);

    __ bind(L_done_pop);
    __ pop(RegSet::of(r18, r19, r20, r21), sp);
    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);

    __ bind(L_done);
    __ mov(r0, count);
    __ leave();
    __ ret(lr);

    return start;
  }

  // Perform range checks on the proposed arraycopy.
  // Kills temp (and rscratch1), but nothing else.
  // Also, clean the sign bits of src_pos and dst_pos.
  void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
                              Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
                              Register dst_pos, // destination position (c_rarg3)
                              Register length,  // element count
                              Register temp,    // scratch (clobbered)
                              Label& L_failed) {
    BLOCK_COMMENT("arraycopy_range_checks:");

    assert_different_registers(rscratch1, temp);

    //  if (src_pos + length > arrayOop(src)->length())  FAIL;
    __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, src_pos);
    // Unsigned compare: also catches a negative sum wrapped to a large
    // 32-bit value.
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
    __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, dst_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
    // movw zero-extends the 32-bit value into the 64-bit register.
    __ movw(src_pos, src_pos);
    __ movw(dst_pos, dst_pos);

    BLOCK_COMMENT("arraycopy_range_checks done");
  }

  // These stubs get called from some dumb test routine.
  // I'll write them properly when they're called from
  // something that's actually doing something.
  static void fake_arraycopy_stub(address src, address dst, int count) {
    assert(count == 0, "huh?");
  }


  //
  //  Generate 'unsafe' array copy stub
  //  Though just as safe as the other stubs, it takes an unscaled
  //  size_t argument instead of an element count.
  //
  //  Input:
  //    c_rarg0   - source array address
  //    c_rarg1   - destination array address
  //    c_rarg2   - byte count, treated as ssize_t, can be zero
  //
  // Examines the alignment of the operands and dispatches
  // to a long, int, short, or byte copy loop.
  //
  address generate_unsafe_copy(const char *name,
                               address byte_copy_entry,
                               address short_copy_entry,
                               address int_copy_entry,
                               address long_copy_entry) {
    Label L_long_aligned, L_int_aligned, L_short_aligned;
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // bump this on entry, not on exit:
    inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);

    // OR together source, destination and byte count: any low bit set
    // in the result limits the alignment of the whole operation.
    __ orr(rscratch1, s, d);
    __ orr(rscratch1, rscratch1, count);

    __ andr(rscratch1, rscratch1, BytesPerLong-1);
    __ cbz(rscratch1, L_long_aligned);
    __ andr(rscratch1, rscratch1, BytesPerInt-1);
    __ cbz(rscratch1, L_int_aligned);
    __ tbz(rscratch1, 0, L_short_aligned);
    // not even 2-byte aligned: fall back to the byte copy loop
    __ b(RuntimeAddress(byte_copy_entry));

    // Each aligned entry scales the byte count down to an element count
    // and tail-calls the corresponding element copy stub.
    __ BIND(L_short_aligned);
    __ lsr(count, count, LogBytesPerShort);  // size => short_count
    __ b(RuntimeAddress(short_copy_entry));
    __ BIND(L_int_aligned);
    __ lsr(count, count, LogBytesPerInt);    // size => int_count
    __ b(RuntimeAddress(int_copy_entry));
    __ BIND(L_long_aligned);
    __ lsr(count, count, LogBytesPerLong);   // size => long_count
    __ b(RuntimeAddress(long_copy_entry));

    return start;
  }

  //
  //  Generate generic array copy stubs
  //
  //  Input:
  //    c_rarg0    -  src oop
  //    c_rarg1    -  src_pos (32-bits)
  //    c_rarg2    -  dst oop
  //    c_rarg3    -  dst_pos (32-bits)
  //    c_rarg4    -  element count (32-bits)
  //
  //  Output:
  //    r0 ==  0  -  success
  //    r0 == -1^K - failure, where K is partial transfer count
  //
  address generate_generic_copy(const char *name,
                                address byte_copy_entry, address short_copy_entry,
                                address int_copy_entry, address oop_copy_entry,
                                address long_copy_entry, address checkcast_copy_entry) {

    Label L_failed, L_failed_0, L_objArray;
    Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;

    // Input registers
    const Register src      = c_rarg0;  // source array oop
    const Register src_pos  = c_rarg1;  // source position
    const Register dst      = c_rarg2;  // destination array oop
    const Register dst_pos  = c_rarg3;  // destination position
    const Register length   = c_rarg4;

    StubCodeMark mark(this, "StubRoutines", name);

    __ align(CodeEntryAlignment);
    address start = __ pc();

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // bump this on entry, not on exit:
    inc_counter_np(SharedRuntime::_generic_array_copy_ctr);

    //-----------------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the following conditions are met:
    //
    // (1) src and dst must not be null.
    // (2) src_pos must not be negative.
    // (3) dst_pos must not be negative.
    // (4) length  must not be negative.
    // (5) src klass and dst klass should be the same and not NULL.
    // (6) src and dst should be arrays.
    // (7) src_pos + length must not exceed length of src.
    // (8) dst_pos + length must not exceed length of dst.
    //

    //  if (src == NULL) return -1;
    __ cbz(src, L_failed);

    //  if (src_pos < 0) return -1;
    __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set

    //  if (dst == NULL) return -1;
    __ cbz(dst, L_failed);

    //  if (dst_pos < 0) return -1;
    __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set

    // registers used as temp
    const Register scratch_length    = r16; // elements count to copy
    const Register scratch_src_klass = r17; // array klass
    const Register lh                = r18; // layout helper

    //  if (length < 0) return -1;
    __ movw(scratch_length, length);        // length (elements count, 32-bits value)
    __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set

    __ load_klass(scratch_src_klass, src);
#ifdef ASSERT
    //  assert(src->klass() != NULL);
    {
      BLOCK_COMMENT("assert klasses not null {");
      Label L1, L2;
      __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
      __ bind(L1);
      __ stop("broken null klass");
      __ bind(L2);
      __ load_klass(rscratch1, dst);
      __ cbz(rscratch1, L1);     // this would be broken also
      BLOCK_COMMENT("} assert klasses not null done");
    }
#endif

    // Load layout helper (32-bits)
    //
    //  |array_tag|     | header_size | element_type |     |log2_element_size|
    // 32        30    24            16              8     2                 0
    //
    //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
    //

    const int lh_offset = in_bytes(Klass::layout_helper_offset());

    // Handle objArrays completely differently...
    const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
    __ ldrw(lh, Address(scratch_src_klass, lh_offset));
    __ movw(rscratch1, objArray_lh);
    // XOR-compare: rscratch2 == 0 iff the layout helpers match exactly
    __ eorw(rscratch2, lh, rscratch1);
    __ cbzw(rscratch2, L_objArray);

    //  if (src->klass() != dst->klass()) return -1;
    __ load_klass(rscratch2, dst);
    __ eor(rscratch2, rscratch2, scratch_src_klass);
    __ cbnz(rscratch2, L_failed);

    //  if (!src->is_Array()) return -1;
    // arrays have the layout-helper sign bit set
    __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)

    // At this point, it is known to be a typeArray (array_tag 0x3).
#ifdef ASSERT
    {
      BLOCK_COMMENT("assert primitive array {");
      Label L;
      __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
      __ cmpw(lh, rscratch2);
      __ br(Assembler::GE, L);
      __ stop("must be a primitive array");
      __ bind(L);
      BLOCK_COMMENT("} assert primitive array done");
    }
#endif

    arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
                           rscratch2, L_failed);

    // TypeArrayKlass
    //
    // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
    // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
    //

    const Register rscratch1_offset = rscratch1;    // array offset
    const Register r18_elsize = lh; // element size; aliases lh, which is
                                    // reduced to log2(element size) below

    // extract the header-size field from the layout helper
    __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
           exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
    __ add(src, src, rscratch1_offset);           // src array offset
    __ add(dst, dst, rscratch1_offset);           // dst array offset
    BLOCK_COMMENT("choose copy loop based on element size");

    // next registers should be set before the jump to corresponding stub
    const Register from     = c_rarg0;  // source array address
    const Register to       = c_rarg1;  // destination array address
    const Register count    = c_rarg2;  // elements count

    // 'from', 'to', 'count' registers should be set in such order
    // since they are the same as 'src', 'src_pos', 'dst'.

    assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");

    // The possible values of elsize are 0-3, i.e. exact_log2(element
    // size in bytes).  We do a simple bitwise binary search.
  __ BIND(L_copy_bytes);
    __ tbnz(r18_elsize, 1, L_copy_ints);
    __ tbnz(r18_elsize, 0, L_copy_shorts);
    __ lea(from, Address(src, src_pos));// src_addr
    __ lea(to,   Address(dst, dst_pos));// dst_addr
    __ movw(count, scratch_length); // length
    __ b(RuntimeAddress(byte_copy_entry));

  __ BIND(L_copy_shorts);
    __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
    __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
    __ movw(count, scratch_length); // length
    __ b(RuntimeAddress(short_copy_entry));

  __ BIND(L_copy_ints);
    __ tbnz(r18_elsize, 0, L_copy_longs);
    __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
    __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
    __ movw(count, scratch_length); // length
    __ b(RuntimeAddress(int_copy_entry));

  __ BIND(L_copy_longs);
#ifdef ASSERT
    {
      BLOCK_COMMENT("assert long copy {");
      Label L;
      __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
      __ cmpw(r18_elsize, LogBytesPerLong);
      __ br(Assembler::EQ, L);
      __ stop("must be long copy, but elsize is wrong");
      __ bind(L);
      BLOCK_COMMENT("} assert long copy done");
    }
#endif
    __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
    __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
    __ movw(count, scratch_length); // length
    __ b(RuntimeAddress(long_copy_entry));

    // ObjArrayKlass
  __ BIND(L_objArray);
    // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]

    Label L_plain_copy, L_checkcast_copy;
    // test array classes for subtyping
    __ load_klass(r18, dst);
    __ cmp(scratch_src_klass, r18); // usual case is exact equality
    __ br(Assembler::NE, L_checkcast_copy);

    // Identically typed arrays can be copied without element-wise checks.
    arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
                           rscratch2, L_failed);

    __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
    __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
    __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
    __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
    __ movw(count, scratch_length); // length
  __ BIND(L_plain_copy);
    __ b(RuntimeAddress(oop_copy_entry));

  __ BIND(L_checkcast_copy);
    // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
    {
      // Before looking at dst.length, make sure dst is also an objArray.
      __ ldrw(rscratch1, Address(r18, lh_offset));
      __ movw(rscratch2, objArray_lh);
      __ eorw(rscratch1, rscratch1, rscratch2);
      __ cbnzw(rscratch1, L_failed);

      // It is safe to examine both src.length and dst.length.
      arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
                             r18, L_failed);

      const Register rscratch2_dst_klass = rscratch2;
      __ load_klass(rscratch2_dst_klass, dst); // reload

      // Marshal the base address arguments now, freeing registers.
      __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
      __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
      __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
      __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
      __ movw(count, length);           // length (reloaded)
      Register sco_temp = c_rarg3;      // this register is free now
      assert_different_registers(from, to, count, sco_temp,
                                 rscratch2_dst_klass, scratch_src_klass);
      // assert_clean_int(count, sco_temp);

      // Generate the type check.
      const int sco_offset = in_bytes(Klass::super_check_offset_offset());
      __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
      // assert_clean_int(sco_temp, r18);
      // exact-match or supertype hit falls through to the plain oop copy
      generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy);

      // Fetch destination element klass from the ObjArrayKlass header.
      int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
      __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset));
      __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));

      // the checkcast_copy loop needs two extra arguments:
      assert(c_rarg3 == sco_temp, "#3 already in place");
      // Set up arguments for checkcast_copy_entry.
      __ mov(c_rarg4, rscratch2_dst_klass);  // dst.klass.element_klass
      __ b(RuntimeAddress(checkcast_copy_entry));
    }

  __ BIND(L_failed);
    __ mov(r0, -1);
    __ leave();   // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }

  //
  //  Generate stub for array fill. If "aligned" is true, the
  //  "to" address is assumed to be heapword aligned.
  //
  //  Arguments for generated stub:
  //    to:    c_rarg0
  //    value: c_rarg1
  //    count: c_rarg2 treated as signed
  //
  address generate_fill(BasicType t, bool aligned, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    BLOCK_COMMENT("Entry:");

    const Register to        = c_rarg0;  // source array address
    const Register value     = c_rarg1;  // value
    const Register count     = c_rarg2;  // elements count

    const Register bz_base   = r10;      // base for block_zero routine
    const Register cnt_words = r11;      // temp register

    __ enter();

    // NOTE(review): this outer L_exit1 is never bound or referenced;
    // the byte/short tail-store path below declares and uses its own
    // local L_exit1, which shadows this one.
    Label L_fill_elements, L_exit1;

    int shift = -1;
    switch (t) {
      case T_BYTE:
        shift = 0;
        __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
        // replicate the byte value across the low 32 bits
        __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
        __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
        __ br(Assembler::LO, L_fill_elements);
        break;
      case T_SHORT:
        shift = 1;
        __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
        __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
        __ br(Assembler::LO, L_fill_elements);
        break;
      case T_INT:
        shift = 2;
        __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
        __ br(Assembler::LO, L_fill_elements);
        break;
      default: ShouldNotReachHere();
    }

    // Align source address at 8 bytes address boundary.
    Label L_skip_align1, L_skip_align2, L_skip_align4;
    if (!aligned) {
      switch (t) {
        case T_BYTE:
          // One byte misalignment happens only for byte arrays.
          __ tbz(to, 0, L_skip_align1);
          __ strb(value, Address(__ post(to, 1)));
          __ subw(count, count, 1);
          __ bind(L_skip_align1);
          // Fallthrough
        case T_SHORT:
          // Two bytes misalignment happens only for byte and short (char) arrays.
          __ tbz(to, 1, L_skip_align2);
          __ strh(value, Address(__ post(to, 2)));
          __ subw(count, count, 2 >> shift);
          __ bind(L_skip_align2);
          // Fallthrough
        case T_INT:
          // Align to 8 bytes, we know we are 4 byte aligned to start.
          __ tbz(to, 2, L_skip_align4);
          __ strw(value, Address(__ post(to, 4)));
          __ subw(count, count, 4 >> shift);
          __ bind(L_skip_align4);
          break;
        default: ShouldNotReachHere();
      }
    }

    //
    //  Fill large chunks
    //
    __ lsrw(cnt_words, count, 3 - shift); // number of words
    __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
    __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
    if (UseBlockZeroing) {
      Label non_block_zeroing, rest;
      // count >= BlockZeroingLowLimit && value == 0
      __ cmp(cnt_words, BlockZeroingLowLimit >> 3);
      __ ccmp(value, 0 /* comparing value */, 0 /* NZCV */, Assembler::GE);
      __ br(Assembler::NE, non_block_zeroing);
      __ mov(bz_base, to);
      __ block_zero(bz_base, cnt_words, true);
      __ mov(to, bz_base);
      __ b(rest);
      __ bind(non_block_zeroing);
      __ fill_words(to, cnt_words, value);
      __ bind(rest);
    }
    else {
      __ fill_words(to, cnt_words, value);
    }

    // Remaining count is less than 8 bytes. Fill it by a single store.
    // Note that the total length is no less than 8 bytes.
    if (t == T_BYTE || t == T_SHORT) {
      Label L_exit1;
      __ cbzw(count, L_exit1);
      __ add(to, to, count, Assembler::LSL, shift); // points to the end
      __ str(value, Address(to, -8));    // overwrite some elements
      __ bind(L_exit1);
      __ leave();
      __ ret(lr);
    }

    // Handle copies less than 8 bytes.
    Label L_fill_2, L_fill_4, L_exit2;
    __ bind(L_fill_elements);
    switch (t) {
      case T_BYTE:
        // store 1, 2 and/or 4 bytes according to the count bits
        __ tbz(count, 0, L_fill_2);
        __ strb(value, Address(__ post(to, 1)));
        __ bind(L_fill_2);
        __ tbz(count, 1, L_fill_4);
        __ strh(value, Address(__ post(to, 2)));
        __ bind(L_fill_4);
        __ tbz(count, 2, L_exit2);
        __ strw(value, Address(to));
        break;
      case T_SHORT:
        __ tbz(count, 0, L_fill_4);
        __ strh(value, Address(__ post(to, 2)));
        __ bind(L_fill_4);
        __ tbz(count, 1, L_exit2);
        __ strw(value, Address(to));
        break;
      case T_INT:
        __ cbzw(count, L_exit2);
        __ strw(value, Address(to));
        break;
      default: ShouldNotReachHere();
    }
    __ bind(L_exit2);
    __ leave();
    __ ret(lr);
    return start;
  }

  // Generate all of the arraycopy-related stubs and publish their
  // entry points through StubRoutines.
  void generate_arraycopy_stubs() {
    address entry;
    address entry_jbyte_arraycopy;
    address entry_jshort_arraycopy;
    address entry_jint_arraycopy;
    address entry_oop_arraycopy;
    address entry_jlong_arraycopy;
    address entry_checkcast_arraycopy;

    generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
    generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);

    StubRoutines::aarch64::_zero_longs = generate_zero_longs(r10, r11);

    //*** jbyte
    // Always need aligned and unaligned versions
    StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
                                                                                  "jbyte_disjoint_arraycopy");
    StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
                                                                                  &entry_jbyte_arraycopy,
                                                                                  "jbyte_arraycopy");
    StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
                                                                                  "arrayof_jbyte_disjoint_arraycopy");
    StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
                                                                                  "arrayof_jbyte_arraycopy");

    //*** jshort
    // Always need aligned and unaligned versions
    StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
                                                                                    "jshort_disjoint_arraycopy");
    StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
                                                                                    &entry_jshort_arraycopy,
                                                                                    "jshort_arraycopy");
    StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
                                                                                    "arrayof_jshort_disjoint_arraycopy");
    StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
                                                                                    "arrayof_jshort_arraycopy");

    //*** jint
    // Aligned versions
    StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
                                                                                "arrayof_jint_disjoint_arraycopy");
    StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
                                                                                "arrayof_jint_arraycopy");
    // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
    // entry_jint_arraycopy always points to the unaligned version
    StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
                                                                                "jint_disjoint_arraycopy");
    StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
                                                                                &entry_jint_arraycopy,
                                                                                "jint_arraycopy");

    //*** jlong
    // It is always aligned
    StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
                                                                                  "arrayof_jlong_disjoint_arraycopy");
    StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
                                                                                  "arrayof_jlong_arraycopy");
    StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
    StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;

    //*** oops
    {
      // With compressed oops we need unaligned versions; notice that
      // we overwrite entry_oop_arraycopy.
      bool aligned = !UseCompressedOops;

      StubRoutines::_arrayof_oop_disjoint_arraycopy
        = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
                                     /*dest_uninitialized*/false);
      StubRoutines::_arrayof_oop_arraycopy
        = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
                                     /*dest_uninitialized*/false);
      // Aligned versions without pre-barriers
      StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
        = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
                                     /*dest_uninitialized*/true);
      StubRoutines::_arrayof_oop_arraycopy_uninit
        = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
                                     /*dest_uninitialized*/true);
    }

    StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
    StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
    StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
    StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;

    StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
    StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
                                                                        /*dest_uninitialized*/true);

    StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
                                                              entry_jbyte_arraycopy,
                                                              entry_jshort_arraycopy,
                                                              entry_jint_arraycopy,
                                                              entry_jlong_arraycopy);

    StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
                                                               entry_jbyte_arraycopy,
                                                               entry_jshort_arraycopy,
                                                               entry_jint_arraycopy,
                                                               entry_oop_arraycopy,
                                                               entry_jlong_arraycopy,
                                                               entry_checkcast_arraycopy);

    StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
    StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
    StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
    StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
    StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
    StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
  }

  void generate_math_stubs() { Unimplemented(); }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //
  // Encrypts one 16-byte block with the expanded AES key using the
  // AESE/AESMC instructions; key length (44/52/60 ints) selects the
  // number of rounds.
  address generate_aescrypt_encryptBlock() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");

    Label L_doLast;

    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register keylen      = rscratch1;

    address start = __ pc();
    __ enter();

    // key array length lives just before the key data
    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ ld1(v0, __ T16B, from); // get 16 bytes of input

    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);
    __ rev32(v3, __ T16B, v3);
    __ rev32(v4, __ T16B, v4);
    __ aese(v0, v1);
    __ aesmc(v0, v0);
    __ aese(v0, v2);
    __ aesmc(v0, v0);
    __ aese(v0, v3);
    __ aesmc(v0, v0);
    __ aese(v0, v4);
    __ aesmc(v0, v0);

    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);
    __ rev32(v3, __ T16B, v3);
    __ rev32(v4, __ T16B, v4);
    __ aese(v0, v1);
    __ aesmc(v0, v0);
    __ aese(v0, v2);
    __ aesmc(v0, v0);
    __ aese(v0, v3);
    __ aesmc(v0, v0);
    __ aese(v0, v4);
    __ aesmc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    // 44-int key schedule => AES-128: these are the last two round keys
    __ cmpw(keylen, 44);
    __ br(Assembler::EQ, L_doLast);

    __ aese(v0, v1);
    __ aesmc(v0, v0);
    __ aese(v0, v2);
    __ aesmc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    // 52-int key schedule => AES-192
    __ cmpw(keylen, 52);
    __ br(Assembler::EQ, L_doLast);

    __ aese(v0, v1);
    __ aesmc(v0, v0);
    __ aese(v0, v2);
    __ aesmc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    __ BIND(L_doLast);

    __ aese(v0, v1);
    __ aesmc(v0, v0);
    __ aese(v0, v2);

    // final AddRoundKey
    __ ld1(v1, __ T16B, key);
    __ rev32(v1, __ T16B, v1);
    __ eor(v0, __ T16B, v0, v1);

    __ st1(v0, __ T16B, to);

    __ mov(r0, 0);

    __ leave();
    __ ret(lr);

    return start;
  }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //
  // Decrypts one 16-byte block with the expanded AES key using the
  // AESD/AESIMC instructions.
  address generate_aescrypt_decryptBlock() {
    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
    Label L_doLast;

    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register keylen      = rscratch1;

    address start = __ pc();
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ ld1(v0, __ T16B, from); // get 16 bytes of input

    // first round key, applied last (saved in v5)
    __ ld1(v5, __ T16B, __ post(key, 16));
    __ rev32(v5, __ T16B, v5);

    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);
    __ rev32(v3, __ T16B, v3);
    __ rev32(v4, __ T16B, v4);
    __ aesd(v0, v1);
    __ aesimc(v0, v0);
    __ aesd(v0, v2);
    __ aesimc(v0, v0);
    __ aesd(v0, v3);
    __ aesimc(v0, v0);
    __ aesd(v0, v4);
    __ aesimc(v0, v0);

    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);
    __ rev32(v3, __ T16B, v3);
    __ rev32(v4, __ T16B, v4);
    __ aesd(v0, v1);
    __ aesimc(v0, v0);
    __ aesd(v0, v2);
    __ aesimc(v0, v0);
    __ aesd(v0, v3);
    __ aesimc(v0, v0);
    __ aesd(v0, v4);
    __ aesimc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    // 44-int key schedule => AES-128
    __ cmpw(keylen, 44);
    __ br(Assembler::EQ, L_doLast);

    __ aesd(v0, v1);
    __ aesimc(v0, v0);
    __ aesd(v0, v2);
    __ aesimc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    // 52-int key schedule => AES-192
    __ cmpw(keylen, 52);
    __ br(Assembler::EQ, L_doLast);

    __ aesd(v0, v1);
    __ aesimc(v0, v0);
    __ aesd(v0, v2);
    __ aesimc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    __ BIND(L_doLast);

    __ aesd(v0, v1);
    __ aesimc(v0, v0);
    __ aesd(v0, v2);

    // final AddRoundKey with the saved first round key
    __ eor(v0, __ T16B, v0, v5);

    __ st1(v0, __ T16B, to);

    __ mov(r0, 0);

    __ leave();
    __ ret(lr);

    return start;
  }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
array 2734 // c_rarg3 - r vector byte array address 2735 // c_rarg4 - input length 2736 // 2737 // Output: 2738 // x0 - input length 2739 // 2740 address generate_cipherBlockChaining_encryptAESCrypt() { 2741 assert(UseAES, "need AES instructions and misaligned SSE support"); 2742 __ align(CodeEntryAlignment); 2743 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 2744 2745 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2746 2747 const Register from = c_rarg0; // source array address 2748 const Register to = c_rarg1; // destination array address 2749 const Register key = c_rarg2; // key array address 2750 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2751 // and left with the results of the last encryption block 2752 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2753 const Register keylen = rscratch1; 2754 2755 address start = __ pc(); 2756 __ enter(); 2757 2758 __ mov(rscratch2, len_reg); 2759 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2760 2761 __ ld1(v0, __ T16B, rvec); 2762 2763 __ cmpw(keylen, 52); 2764 __ br(Assembler::CC, L_loadkeys_44); 2765 __ br(Assembler::EQ, L_loadkeys_52); 2766 2767 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2768 __ rev32(v17, __ T16B, v17); 2769 __ rev32(v18, __ T16B, v18); 2770 __ BIND(L_loadkeys_52); 2771 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2772 __ rev32(v19, __ T16B, v19); 2773 __ rev32(v20, __ T16B, v20); 2774 __ BIND(L_loadkeys_44); 2775 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2776 __ rev32(v21, __ T16B, v21); 2777 __ rev32(v22, __ T16B, v22); 2778 __ rev32(v23, __ T16B, v23); 2779 __ rev32(v24, __ T16B, v24); 2780 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2781 __ rev32(v25, __ T16B, v25); 2782 __ rev32(v26, __ T16B, v26); 2783 __ rev32(v27, __ T16B, v27); 2784 __ rev32(v28, __ T16B, v28); 
2785 __ ld1(v29, v30, v31, __ T16B, key); 2786 __ rev32(v29, __ T16B, v29); 2787 __ rev32(v30, __ T16B, v30); 2788 __ rev32(v31, __ T16B, v31); 2789 2790 __ BIND(L_aes_loop); 2791 __ ld1(v1, __ T16B, __ post(from, 16)); 2792 __ eor(v0, __ T16B, v0, v1); 2793 2794 __ br(Assembler::CC, L_rounds_44); 2795 __ br(Assembler::EQ, L_rounds_52); 2796 2797 __ aese(v0, v17); __ aesmc(v0, v0); 2798 __ aese(v0, v18); __ aesmc(v0, v0); 2799 __ BIND(L_rounds_52); 2800 __ aese(v0, v19); __ aesmc(v0, v0); 2801 __ aese(v0, v20); __ aesmc(v0, v0); 2802 __ BIND(L_rounds_44); 2803 __ aese(v0, v21); __ aesmc(v0, v0); 2804 __ aese(v0, v22); __ aesmc(v0, v0); 2805 __ aese(v0, v23); __ aesmc(v0, v0); 2806 __ aese(v0, v24); __ aesmc(v0, v0); 2807 __ aese(v0, v25); __ aesmc(v0, v0); 2808 __ aese(v0, v26); __ aesmc(v0, v0); 2809 __ aese(v0, v27); __ aesmc(v0, v0); 2810 __ aese(v0, v28); __ aesmc(v0, v0); 2811 __ aese(v0, v29); __ aesmc(v0, v0); 2812 __ aese(v0, v30); 2813 __ eor(v0, __ T16B, v0, v31); 2814 2815 __ st1(v0, __ T16B, __ post(to, 16)); 2816 __ sub(len_reg, len_reg, 16); 2817 __ cbnz(len_reg, L_aes_loop); 2818 2819 __ st1(v0, __ T16B, rvec); 2820 2821 __ mov(r0, rscratch2); 2822 2823 __ leave(); 2824 __ ret(lr); 2825 2826 return start; 2827 } 2828 2829 // Arguments: 2830 // 2831 // Inputs: 2832 // c_rarg0 - source byte array address 2833 // c_rarg1 - destination byte array address 2834 // c_rarg2 - K (key) in little endian int array 2835 // c_rarg3 - r vector byte array address 2836 // c_rarg4 - input length 2837 // 2838 // Output: 2839 // r0 - input length 2840 // 2841 address generate_cipherBlockChaining_decryptAESCrypt() { 2842 assert(UseAES, "need AES instructions and misaligned SSE support"); 2843 __ align(CodeEntryAlignment); 2844 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 2845 2846 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2847 2848 const Register from = c_rarg0; // source array address 2849 const Register 
to = c_rarg1; // destination array address 2850 const Register key = c_rarg2; // key array address 2851 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2852 // and left with the results of the last encryption block 2853 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2854 const Register keylen = rscratch1; 2855 2856 address start = __ pc(); 2857 __ enter(); 2858 2859 __ mov(rscratch2, len_reg); 2860 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2861 2862 __ ld1(v2, __ T16B, rvec); 2863 2864 __ ld1(v31, __ T16B, __ post(key, 16)); 2865 __ rev32(v31, __ T16B, v31); 2866 2867 __ cmpw(keylen, 52); 2868 __ br(Assembler::CC, L_loadkeys_44); 2869 __ br(Assembler::EQ, L_loadkeys_52); 2870 2871 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2872 __ rev32(v17, __ T16B, v17); 2873 __ rev32(v18, __ T16B, v18); 2874 __ BIND(L_loadkeys_52); 2875 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2876 __ rev32(v19, __ T16B, v19); 2877 __ rev32(v20, __ T16B, v20); 2878 __ BIND(L_loadkeys_44); 2879 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2880 __ rev32(v21, __ T16B, v21); 2881 __ rev32(v22, __ T16B, v22); 2882 __ rev32(v23, __ T16B, v23); 2883 __ rev32(v24, __ T16B, v24); 2884 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2885 __ rev32(v25, __ T16B, v25); 2886 __ rev32(v26, __ T16B, v26); 2887 __ rev32(v27, __ T16B, v27); 2888 __ rev32(v28, __ T16B, v28); 2889 __ ld1(v29, v30, __ T16B, key); 2890 __ rev32(v29, __ T16B, v29); 2891 __ rev32(v30, __ T16B, v30); 2892 2893 __ BIND(L_aes_loop); 2894 __ ld1(v0, __ T16B, __ post(from, 16)); 2895 __ orr(v1, __ T16B, v0, v0); 2896 2897 __ br(Assembler::CC, L_rounds_44); 2898 __ br(Assembler::EQ, L_rounds_52); 2899 2900 __ aesd(v0, v17); __ aesimc(v0, v0); 2901 __ aesd(v0, v18); __ aesimc(v0, v0); 2902 __ BIND(L_rounds_52); 2903 __ aesd(v0, v19); __ aesimc(v0, v0); 2904 __ aesd(v0, v20); 
    __ aesimc(v0, v0);
    __ BIND(L_rounds_44);
    __ aesd(v0, v21); __ aesimc(v0, v0);
    __ aesd(v0, v22); __ aesimc(v0, v0);
    __ aesd(v0, v23); __ aesimc(v0, v0);
    __ aesd(v0, v24); __ aesimc(v0, v0);
    __ aesd(v0, v25); __ aesimc(v0, v0);
    __ aesd(v0, v26); __ aesimc(v0, v0);
    __ aesd(v0, v27); __ aesimc(v0, v0);
    __ aesd(v0, v28); __ aesimc(v0, v0);
    __ aesd(v0, v29); __ aesimc(v0, v0);
    __ aesd(v0, v30);
    // Final AddRoundKey, then XOR with the chaining value (CBC).
    __ eor(v0, __ T16B, v0, v31);
    __ eor(v0, __ T16B, v0, v2);

    __ st1(v0, __ T16B, __ post(to, 16));
    // The saved ciphertext block becomes the next chaining value.
    __ orr(v2, __ T16B, v1, v1);

    __ sub(len_reg, len_reg, 16);
    __ cbnz(len_reg, L_aes_loop);

    // Store the final chaining value back into the caller's r-vector.
    __ st1(v2, __ T16B, rvec);

    // Return the original byte count (saved in rscratch2 on entry).
    __ mov(r0, rscratch2);

    __ leave();
    __ ret(lr);

    return start;
  }

  // SHA-1 compression stub (uses the ARMv8 SHA1 crypto instructions).
  //
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - byte[]  source+offset
  //   c_rarg1   - int[]   SHA.state
  //   c_rarg2   - int     offset
  //   c_rarg3   - int     limit
  //
  address generate_sha1_implCompress(bool multi_block, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Register buf   = c_rarg0;
    Register state = c_rarg1;
    Register ofs   = c_rarg2;
    Register limit = c_rarg3;

    Label keys;
    Label sha1_loop;

    // load the keys into v0..v3 (the four SHA-1 round constants,
    // each replicated across all lanes by ld4r)
    __ adr(rscratch1, keys);
    __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
    // load 5 words state into v6, v7
    __ ldrq(v6, Address(state, 0));
    __ ldrs(v7, Address(state, 16));


    __ BIND(sha1_loop);
    // load 64 bytes of data into v16..v19; only advance buf when
    // compressing multiple blocks
    __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
    __ rev32(v16, __ T16B, v16);
    __ rev32(v17, __ T16B, v17);
    __ rev32(v18, __ T16B, v18);
    __ rev32(v19, __ T16B, v19);

    // do the sha1
    __ addv(v4, __ T4S, v16, v0);
    __ orr(v20, __ T16B, v6, v6);

    FloatRegister d0 = v16;
    FloatRegister d1 = v17;
    FloatRegister d2 = v18;
    FloatRegister d3 = v19;

    // 20 iterations of 4 rounds each = 80 SHA-1 rounds.  The message
    // schedule registers d0..d3 rotate each iteration.
    for (int round = 0; round < 20; round++) {
      FloatRegister tmp1 = (round & 1) ? v4 : v5;
      FloatRegister tmp2 = (round & 1) ? v21 : v22;
      FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
      FloatRegister tmp4 = (round & 1) ? v5 : v4;
      FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));

      if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
      if (round < 19) __ addv(tmp1, __ T4S, d1, key);
      __ sha1h(tmp2, __ T4S, v20);
      // Choice/parity/majority function per the SHA-1 round ranges.
      if (round < 5)
        __ sha1c(v20, __ T4S, tmp3, tmp4);
      else if (round < 10 || round >= 15)
        __ sha1p(v20, __ T4S, tmp3, tmp4);
      else
        __ sha1m(v20, __ T4S, tmp3, tmp4);
      if (round < 16) __ sha1su1(d0, __ T4S, d3);

      tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
    }

    // Add this block's result into the running state.
    __ addv(v7, __ T2S, v7, v21);
    __ addv(v6, __ T4S, v6, v20);

    if (multi_block) {
      __ add(ofs, ofs, 64);
      __ cmp(ofs, limit);
      __ br(Assembler::LE, sha1_loop);
      __ mov(c_rarg0, ofs); // return ofs
    }

    __ strq(v6, Address(state, 0));
    __ strs(v7, Address(state, 16));

    __ ret(lr);

    // The four SHA-1 round constants, loaded by the ld4r above.
    __ bind(keys);
    __ emit_int32(0x5a827999);
    __ emit_int32(0x6ed9eba1);
    __ emit_int32(0x8f1bbcdc);
    __ emit_int32(0xca62c1d6);

    return start;
  }


  // SHA-256 compression stub (uses the ARMv8 SHA256 crypto instructions).
  //
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - byte[]  source+offset
  //   c_rarg1   - int[]   SHA.state
  //   c_rarg2   - int     offset
  //   c_rarg3   - int     limit
  //
  address generate_sha256_implCompress(bool multi_block, const char *name) {
    // The 64 SHA-256 round constants (K).
    static const uint32_t round_consts[64] = {
      0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
      0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
      0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
      0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
      0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
      0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
      0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
      0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
      0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
      0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
      0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
      0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
      0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
      0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
      0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
      0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
    };
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Register buf   = c_rarg0;
    Register state = c_rarg1;
    Register ofs   = c_rarg2;
    Register limit = c_rarg3;

    Label sha1_loop;

    // v8..v11 are callee-saved (bottom 64 bits) and used below for data.
    __ stpd(v8, v9, __ pre(sp, -32));
    __ stpd(v10, v11, Address(sp, 16));

    // dga == v0
    // dgb == v1
    // dg0 == v2
    // dg1 == v3
    // dg2 == v4
    // t0 == v6
    // t1 == v7

    // load 16 keys to v16..v31
    __ lea(rscratch1, ExternalAddress((address)round_consts));
    __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
    __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
    __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
    __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);

    // load 8 words (256 bits) state
    __ ldpq(v0, v1, state);

    __ BIND(sha1_loop);
    // load 64 bytes of data into v8..v11; only advance buf when
    // compressing multiple blocks
    __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
    __ rev32(v8, __ T16B, v8);
    __ rev32(v9, __ T16B, v9);
    __ rev32(v10, __ T16B, v10);
    __ rev32(v11, __ T16B, v11);

    __ addv(v6, __ T4S, v8, v16);
    __ orr(v2, __ T16B, v0, v0);
    __ orr(v3, __ T16B, v1, v1);

    FloatRegister d0 = v8;
    FloatRegister d1 = v9;
    FloatRegister d2 = v10;
    FloatRegister d3 = v11;


    // 16 iterations of 4 rounds each = 64 SHA-256 rounds.
    for (int round = 0; round < 16; round++) {
      FloatRegister tmp1 = (round & 1) ? v6 : v7;
      FloatRegister tmp2 = (round & 1) ? v7 : v6;
      FloatRegister tmp3 = (round & 1) ? v2 : v4;
      FloatRegister tmp4 = (round & 1) ? v4 : v2;

      if (round < 12) __ sha256su0(d0, __ T4S, d1);
       __ orr(v4, __ T16B, v2, v2);
      if (round < 15)
        // round + 17 indexes the key registers v17..v31 loaded above
        __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
      __ sha256h(v2, __ T4S, v3, tmp2);
      __ sha256h2(v3, __ T4S, v4, tmp2);
      if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);

      tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
    }

    // Add this block's result into the running state.
    __ addv(v0, __ T4S, v0, v2);
    __ addv(v1, __ T4S, v1, v3);

    if (multi_block) {
      __ add(ofs, ofs, 64);
      __ cmp(ofs, limit);
      __ br(Assembler::LE, sha1_loop);
      __ mov(c_rarg0, ofs); // return ofs
    }

    // Restore the callee-saved SIMD registers.
    __ ldpd(v10, v11, Address(sp, 16));
    __ ldpd(v8, v9, __ post(sp, 32));

    __ stpq(v0, v1, state);

    __ ret(lr);

    return start;
  }

#ifndef BUILTIN_SIM
  // Safefetch stubs.
  void generate_safefetch(const char* name, int size, address* entry,
                          address* fault_pc, address* continuation_pc) {
    // safefetch signatures:
    //   int      SafeFetch32(int*      adr, int      errValue);
    //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
    //
    // arguments:
    //   c_rarg0 = adr
    //   c_rarg1 = errValue
    //
    // result:
    //   r0 = *adr or errValue

    StubCodeMark mark(this, "StubRoutines", name);

    // Entry point, pc or function descriptor.
    *entry = __ pc();

    // Load *adr into c_rarg1, may fault.  The fault PC is published so
    // the signal handler can redirect execution to continuation_pc.
    *fault_pc = __ pc();
    switch (size) {
      case 4:
        // int32_t
        __ ldrw(c_rarg1, Address(c_rarg0, 0));
        break;
      case 8:
        // int64_t
        __ ldr(c_rarg1, Address(c_rarg0, 0));
        break;
      default:
        ShouldNotReachHere();
    }

    // return errValue or *adr
    *continuation_pc = __ pc();
    __ mov(r0, c_rarg1);
    __ ret(lr);
  }
#endif

  /**
   * Arguments:
   *
   * Inputs:
   *   c_rarg0   - int crc
   *   c_rarg1   - byte* buf
   *   c_rarg2   - int length
   *
   * Output:
   *   r0   - int crc result
   */
  address generate_updateBytesCRC32() {
    assert(UseCRC32Intrinsics, "what are we doing here?");

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");

    address start = __ pc();

    const Register crc    = c_rarg0;  // crc
    const Register buf    = c_rarg1;  // source java byte array address
    const Register len    = c_rarg2;  // length
    const Register table0 = c_rarg3;  // crc_table address
    const Register table1 = c_rarg4;
    const Register table2 = c_rarg5;
    const Register table3 = c_rarg6;
    const Register tmp3   = c_rarg7;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // All the real work is in the macro-assembler helper.
    __ kernel_crc32(crc, buf, len,
              table0, table1, table2, table3, rscratch1, rscratch2, tmp3);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }

  /**
   * Arguments:
   *
   * Inputs:
   *   c_rarg0   - int crc
   *   c_rarg1   - byte* buf
   *   c_rarg2   - int length
   *   c_rarg3   - int* table
   *
   * Output:
   *   r0   - int crc result
   */
  address generate_updateBytesCRC32C() {
    assert(UseCRC32CIntrinsics, "what are we doing here?");

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");

    address start = __ pc();

    const Register crc    = c_rarg0;  // crc
    const Register buf    = c_rarg1;  // source java byte array address
    const Register len    = c_rarg2;  // length
    const Register table0 = c_rarg3;  // crc_table address
    const Register table1 = c_rarg4;
    const Register table2 = c_rarg5;
    const Register table3 = c_rarg6;
    const Register tmp3   = c_rarg7;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // CRC32C uses the Castagnoli polynomial; helper mirrors kernel_crc32.
    __ kernel_crc32c(crc, buf, len,
              table0, table1, table2, table3, rscratch1, rscratch2, tmp3);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }

  /***
   *  Arguments:
   *
   *  Inputs:
   *   c_rarg0   - int   adler
   *   c_rarg1   - byte* buff
   *   c_rarg2   - int   len
   *
   * Output:
   *   c_rarg0   - int adler result
   */
  address generate_updateBytesAdler32() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
    address start = __ pc();

    Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;

    // Aliases
    Register adler  = c_rarg0;
    Register s1     = c_rarg0;
    Register s2     = c_rarg3;
    Register buff   = c_rarg1;
    Register len    = c_rarg2;
    Register nmax  = r4;
    Register base = r5;
    Register count = r6;
    Register temp0 = rscratch1;
    Register temp1 = rscratch2;
    Register temp2 = r7;

    // Max number of bytes we can process before having to take the mod
    // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
    unsigned long BASE = 0xfff1;
    unsigned long NMAX = 0x15B0;

    __ mov(base, BASE);
    __ mov(nmax, NMAX);

    // s1 is initialized to the lower 16 bits of adler
    // s2 is
    // initialized to the upper 16 bits of adler
    __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
    __ uxth(s1, adler);          // s1 = (adler & 0xffff)

    // The pipelined loop needs at least 16 elements for 1 iteration
    // It does check this, but it is more effective to skip to the cleanup loop
    __ cmp(len, 16);
    __ br(Assembler::HS, L_nmax);
    __ cbz(len, L_combine);

    // Byte-at-a-time loop for short inputs (< 16 bytes).
    __ bind(L_simple_by1_loop);
    __ ldrb(temp0, Address(__ post(buff, 1)));
    __ add(s1, s1, temp0);
    __ add(s2, s2, s1);
    __ subs(len, len, 1);
    __ br(Assembler::HI, L_simple_by1_loop);

    // s1 = s1 % BASE
    __ subs(temp0, s1, base);
    __ csel(s1, temp0, s1, Assembler::HS);

    // s2 = s2 % BASE
    // Multiplication-free reduction: x mod 0xfff1 folded via
    // 0x10000 === 0xf (mod 0xfff1), i.e. high16*15 + low16.
    __ lsr(temp0, s2, 16);
    __ lsl(temp1, temp0, 4);
    __ sub(temp1, temp1, temp0);
    __ add(s2, temp1, s2, ext::uxth);

    __ subs(temp0, s2, base);
    __ csel(s2, temp0, s2, Assembler::HS);

    __ b(L_combine);

    __ bind(L_nmax);
    __ subs(len, len, nmax);
    __ sub(count, nmax, 16);
    __ br(Assembler::LO, L_by16);

    // Main loop: 16 bytes per iteration, up to NMAX bytes between
    // modulo reductions.  Each byte does s1 += b; s2 += s1, unrolled
    // from the two 8-byte words loaded below.
    __ bind(L_nmax_loop);

    __ ldp(temp0, temp1, Address(__ post(buff, 16)));

    __ add(s1, s1, temp0, ext::uxtb);
    __ ubfx(temp2, temp0, 8, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp0, 16, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp0, 24, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp0, 32, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp0, 40, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp0, 48, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp0, Assembler::LSR, 56);
    __ add(s2, s2, s1);

    __ add(s1, s1, temp1, ext::uxtb);
    __ ubfx(temp2, temp1, 8, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp1, 16, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp1, 24, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp1, 32, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp1, 40, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp1, 48, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp1, Assembler::LSR, 56);
    __ add(s2, s2, s1);

    __ subs(count, count, 16);
    __ br(Assembler::HS, L_nmax_loop);

    // s1 = s1 % BASE
    // Two folding steps are needed here because s1 may be up to ~NMAX
    // byte-sums larger than 16 bits.
    __ lsr(temp0, s1, 16);
    __ lsl(temp1, temp0, 4);
    __ sub(temp1, temp1, temp0);
    __ add(temp1, temp1, s1, ext::uxth);

    __ lsr(temp0, temp1, 16);
    __ lsl(s1, temp0, 4);
    __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext::uxth);

    __ subs(temp0, s1, base);
    __ csel(s1, temp0, s1, Assembler::HS);

    // s2 = s2 % BASE
    __ lsr(temp0, s2, 16);
    __ lsl(temp1, temp0, 4);
    __ sub(temp1, temp1, temp0);
    __ add(temp1, temp1, s2, ext::uxth);

    __ lsr(temp0, temp1, 16);
    __ lsl(s2, temp0, 4);
    __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext::uxth);

    __ subs(temp0, s2, base);
    __ csel(s2, temp0, s2, Assembler::HS);

    __ subs(len, len, nmax);
    __ sub(count, nmax, 16);
    __ br(Assembler::HS, L_nmax_loop);

    __ bind(L_by16);
    __ adds(len, len, count);
    __ br(Assembler::LO, L_by1);

    // Tail loop: 16 bytes at a time for the remainder (< NMAX total,
    // so no intermediate reduction is required).
    __ bind(L_by16_loop);

    __ ldp(temp0, temp1, Address(__ post(buff, 16)));

    __ add(s1, s1, temp0, ext::uxtb);
    __ ubfx(temp2, temp0, 8, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp0, 16, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp0, 24, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp0, 32, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp0, 40, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp0, 48, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp0, Assembler::LSR, 56);
    __ add(s2, s2, s1);

    __ add(s1, s1, temp1, ext::uxtb);
    __ ubfx(temp2, temp1, 8, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp1, 16, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp1, 24, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp1, 32, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp1, 40, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp1, 48, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp1, Assembler::LSR, 56);
    __ add(s2, s2, s1);

    __ subs(len, len, 16);
    __ br(Assembler::HS, L_by16_loop);

    __ bind(L_by1);
    __ adds(len, len, 15);
    __ br(Assembler::LO, L_do_mod);

    // Final byte-at-a-time loop for the last < 16 bytes.
    __ bind(L_by1_loop);
    __ ldrb(temp0, Address(__ post(buff, 1)));
    __ add(s1, temp0, s1);
    __ add(s2, s2, s1);
    __ subs(len, len, 1);
    __ br(Assembler::HS, L_by1_loop);

    __ bind(L_do_mod);
    // s1 = s1 % BASE
    __ lsr(temp0, s1, 16);
    __ lsl(temp1, temp0, 4);
    __ sub(temp1, temp1, temp0);
    __ add(temp1, temp1, s1, ext::uxth);

    __ lsr(tem0, temp1, 16);
    __ lsl(s1, temp0, 4);
    __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext::uxth);

    __ subs(temp0, s1, base);
    __ csel(s1, temp0, s1, Assembler::HS);

    // s2 = s2 % BASE
    __ lsr(temp0, s2, 16);
    __ lsl(temp1, temp0, 4);
    __ sub(temp1, temp1, temp0);
    __ add(temp1, temp1, s2, ext::uxth);

    __ lsr(temp0, temp1, 16);
    __ lsl(s2, temp0, 4);
    __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext::uxth);

    __ subs(temp0, s2, base);
    __ csel(s2, temp0, s2, Assembler::HS);

    // Combine lower bits and higher bits
    __ bind(L_combine);
    __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)

    __ ret(lr);

    return start;
  }

  /**
   *  Arguments:
   *
   *  Input:
   *    c_rarg0   - x address
   *    c_rarg1   - x length
   *    c_rarg2   - y address
   *    c_rarg3   - y length
   *    c_rarg4   - z address
   *    c_rarg5   - z length
   */
  address generate_multiplyToLen() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "multiplyToLen");

    address start = __ pc();
    const Register x     = r0;
    const Register xlen  = r1;
    const Register y     = r2;
    const Register ylen  = r3;
    const Register z     = r4;
    const Register zlen  = r5;

    const Register tmp1  = r10;
    const Register tmp2  = r11;
    const Register tmp3  = r12;
    const Register tmp4  = r13;
    const Register tmp5  = r14;
    const Register tmp6  = r15;
    const Register tmp7  = r16;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    // The multi-precision multiply itself lives in the macro assembler.
    __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }

  // Emit a 128x128 -> 256-bit carry-less (GF(2)) multiply used by GHASH.
  void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
                      FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
                      FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
    // Karatsuba multiplication performs a 128*128 -> 256-bit
    // multiplication in three 128-bit multiplications and a few
    // additions.
    //
    // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
    // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
    //
    // Inputs:
    //
    // A0 in a.d[0]     (subkey)
    // A1 in a.d[1]
    // (A1+A0) in a1_xor_a0.d[0]
    //
    // B0 in b.d[0]     (state)
    // B1 in b.d[1]

    __ ext(tmp1, __ T16B, b, b, 0x08);
    __ pmull2(result_hi, __ T1Q, b, a, __ T2D);  // A1*B1
    __ eor(tmp1, __ T16B, tmp1, b);            // (B1+B0)
    __ pmull(result_lo,  __ T1Q, b, a, __ T1D);  // A0*B0
    __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)

    __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
    __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
    __ eor(tmp2, __ T16B, tmp2, tmp4);
    __ eor(tmp2, __ T16B, tmp2, tmp3);

    // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
    __ ins(result_hi, __ D, tmp2, 0, 1);
    __ ins(result_lo, __ D, tmp2, 1, 0);
  }

  // Reduce the 256-bit product <hi:lo> modulo the GCM field polynomial
  // into result.  z must contain zero; t1 is a temp.
  void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
                    FloatRegister p, FloatRegister z, FloatRegister t1) {
    const FloatRegister t0 = result;

    // The GCM field polynomial f is z^128 + p(z), where p =
    // z^7+z^2+z+1.
    //
    //    z^128 === -p(z)  (mod (z^128 + p(z)))
    //
    // so, given that the product we're reducing is
    //    a == lo + hi * z^128
    // substituting,
    //      === lo - hi * p(z)  (mod (z^128 + p(z)))
    //
    // we reduce by multiplying hi by p(z) and subtracting the result
    // from (i.e. XORing it with) lo.  Because p has no nonzero high
    // bits we can do this with two 64-bit multiplications, lo*p and
    // hi*p.

    __ pmull2(t0, __ T1Q, hi, p, __ T2D);
    __ ext(t1, __ T16B, t0, z, 8);
    __ eor(hi, __ T16B, hi, t1);
    __ ext(t1, __ T16B, z, t0, 8);
    __ eor(lo, __ T16B, lo, t1);
    __ pmull(t0, __ T1Q, hi, p, __ T1D);
    __ eor(result, __ T16B, lo, t0);
  }

  /**
   *  Arguments:
   *
   *  Input:
   *  c_rarg0 - current state address
   *  c_rarg1 - H key address
   *  c_rarg2 - data address
   *  c_rarg3 - number of blocks
   *
   *  Output:
   *  Updated state at c_rarg0
   */
  address generate_ghash_processBlocks() {
    // Bafflingly, GCM uses little-endian for the byte order, but
    // big-endian for the bit order.  For example, the polynomial 1 is
    // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
    //
    // So, we must either reverse the bytes in each word and do
    // everything big-endian or reverse the bits in each byte and do
    // it little-endian.  On AArch64 it's more idiomatic to reverse
    // the bits in each byte (we have an instruction, RBIT, to do
    // that) and keep the data in little-endian bit order throughout the
    // calculation, bit-reversing the inputs and outputs.

    StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
    // Emit the field-polynomial constant just before the entry point.
    __ align(wordSize * 2);
    address p = __ pc();
    __ emit_int64(0x87);  // The low-order bits of the field
                          // polynomial (i.e. p = z^7+z^2+z+1)
                          // repeated in the low and high parts of a
                          // 128-bit vector
    __ emit_int64(0x87);

    __ align(CodeEntryAlignment);
    address start = __ pc();

    Register state   = c_rarg0;
    Register subkeyH = c_rarg1;
    Register data    = c_rarg2;
    Register blocks  = c_rarg3;

    FloatRegister vzr = v30;
    __ eor(vzr, __ T16B, vzr, vzr); // zero register

    __ ldrq(v0, Address(state));
    __ ldrq(v1, Address(subkeyH));

    __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
    __ rbit(v0, __ T16B, v0);
    __ rev64(v1, __ T16B, v1);
    __ rbit(v1, __ T16B, v1);

    __ ldrq(v26, p);

    __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
    __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))

    {
      Label L_ghash_loop;
      __ bind(L_ghash_loop);

      __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
                                                 // reversing each byte
      __ rbit(v2, __ T16B, v2);
      __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state

      // Multiply state in v2 by subkey in v1
      ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
                     /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
                     /*temps*/v6, v20, v18, v21);
      // Reduce v7:v5 by the field polynomial
      ghash_reduce(v0, v5, v7, v26, vzr, v20);

      __ sub(blocks, blocks, 1);
      __ cbnz(blocks, L_ghash_loop);
    }

    // The bit-reversed result is at this point in v0
    __ rev64(v1, __ T16B, v0);
    __ rbit(v1, __ T16B, v1);

    __ st1(v1, __ T16B, state);
    __ ret(lr);

    return start;
  }

  // Continuation point for throwing of implicit exceptions that are
  // not handled in the current activation. Fabricates an exception
  // oop and initiates normal exception dispatching in this
  // frame.
  // Since we need to preserve callee-saved values (currently
  // only for C2, but done for C1 as well) we need a callee-saved oop
  // map and therefore have to make these stubs into RuntimeStubs
  // rather than BufferBlobs.  If the compiler needs all registers to
  // be preserved between the fault point and the exception handler
  // then it must assume responsibility for that in
  // AbstractCompiler::continuation_for_implicit_null_exception or
  // continuation_for_implicit_division_by_zero_exception.  All other
  // implicit exceptions (e.g., NullPointerException or
  // AbstractMethodError on entry) are either at call sites or
  // otherwise assume that stack unwinding will be initiated, so
  // caller saved registers were assumed volatile in the compiler.

#undef __
#define __ masm->

  // Generate a RuntimeStub that enters the VM via runtime_entry to
  // raise an exception, then dispatches to the forward-exception stub.
  // arg1/arg2, if supplied, are passed to the runtime entry in
  // c_rarg1/c_rarg2 (c_rarg0 is always the current thread).
  address generate_throw_exception(const char* name,
                                   address runtime_entry,
                                   Register arg1 = noreg,
                                   Register arg2 = noreg) {
    // Information about frame layout at time of blocking runtime call.
    // Note that we only have to preserve callee-saved registers since
    // the compilers are responsible for supplying a continuation point
    // if they expect all registers to be preserved.
    // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
    enum layout {
      rfp_off = 0,
      rfp_off2,
      return_off,
      return_off2,
      framesize // inclusive of return address
    };

    int insts_size = 512;
    int locs_size  = 64;

    CodeBuffer code(name, insts_size, locs_size);
    OopMapSet* oop_maps  = new OopMapSet();
    MacroAssembler* masm = new MacroAssembler(&code);

    address start = __ pc();

    // This is an inlined and slightly modified version of call_VM
    // which has the ability to fetch the return PC out of
    // thread-local storage and also sets up last_Java_sp slightly
    // differently than the real call_VM

    __ enter(); // Save FP and LR before call

    assert(is_even(framesize/2), "sp not 16-byte aligned");

    // lr and fp are already in place
    __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog

    int frame_complete = __ pc() - start;

    // Set up last_Java_sp and last_Java_fp
    address the_pc = __ pc();
    __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1);

    // Call runtime
    if (arg1 != noreg) {
      assert(arg2 != c_rarg1, "clobbered");
      __ mov(c_rarg1, arg1);
    }
    if (arg2 != noreg) {
      __ mov(c_rarg2, arg2);
    }
    __ mov(c_rarg0, rthread);
    BLOCK_COMMENT("call runtime_entry");
    __ mov(rscratch1, runtime_entry);
    __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);

    // Generate oop map
    OopMap* map = new OopMap(framesize, 0);

    oop_maps->add_gc_map(the_pc - start, map);

    __ reset_last_Java_frame(true, true);
    __ maybe_isb();

    __ leave();

    // check for pending exceptions
#ifdef ASSERT
    Label L;
    __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
    __ cbnz(rscratch1, L);
    __ should_not_reach_here();
    __ bind(L);
#endif // ASSERT
    __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));


    // codeBlob framesize is in words (not VMRegImpl::slot_size)
    RuntimeStub* stub =
      RuntimeStub::new_runtime_stub(name,
                                    &code,
                                    frame_complete,
                                    (framesize >> (LogBytesPerWord - LogBytesPerInt)),
                                    oop_maps, false);
    return stub->entry_point();
  }

  // Generator for the Montgomery multiply/square intrinsic.  Derives
  // from MacroAssembler so the emission helpers can be written without
  // the __ prefix.
  class MontgomeryMultiplyGenerator : public MacroAssembler {

    Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
      Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;

    RegSet _toSave;
    bool _squaring;

  public:
    MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
      : MacroAssembler(as->code()), _squaring(squaring) {

      // Register allocation

      // Arguments arrive in c_rarg0..; when squaring, a and b are the
      // same array so Pb_base aliases Pa_base and one register is freed.
      Register reg = c_rarg0;
      Pa_base = reg;       // Argument registers
      if (squaring)
        Pb_base = Pa_base;
      else
        Pb_base = ++reg;
      Pn_base = ++reg;
      Rlen= ++reg;
      inv = ++reg;
      Pm_base = ++reg;

      // Working registers:
      Ra = ++reg;        // The current digit of a, b, n, and m.
      Rb = ++reg;
      Rm = ++reg;
      Rn = ++reg;

      Pa = ++reg;        // Pointers to the current/next digit of a, b, n, and m.
      Pb = ++reg;
      Pm = ++reg;
      Pn = ++reg;

      t0 = ++reg;        // Three registers which form a
      t1 = ++reg;        // triple-precision accumulator.
      t2 = ++reg;

      Ri = ++reg;        // Inner and outer loop indexes.
      Rj = ++reg;

      Rhi_ab = ++reg;    // Product registers: low and high parts
      Rlo_ab = ++reg;    // of a*b and m*n.
      Rhi_mn = ++reg;
      Rlo_mn = ++reg;

      // r19 and up are callee-saved.
      _toSave = RegSet::range(r19, reg) + Pm_base;
    }

  private:
    // Push the register set computed in the constructor (the
    // callee-saved registers we use, plus Pm_base) so the generated
    // stub may clobber them freely.
    void save_regs() {
      push(_toSave, sp);
    }

    void restore_regs() {
      pop(_toSave, sp);
    }

    // Emit a loop unrolled by a factor of two.  "count" may be odd:
    // if its low bit is set we enter at the second copy of the body,
    // so exactly "count" iterations are executed; a zero count skips
    // the loop entirely.
    template <typename T>
    void unroll_2(Register count, T block) {
      Label loop, end, odd;
      tbnz(count, 0, odd);   // odd count: enter at the second body copy
      cbz(count, end);       // zero count: nothing to do
      align(16);
      bind(loop);
      (this->*block)();
      bind(odd);
      (this->*block)();
      subs(count, count, 2);
      br(Assembler::GT, loop);
      bind(end);
    }

    // Same unrolling scheme, for a loop body that takes three
    // register arguments.
    template <typename T>
    void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
      Label loop, end, odd;
      tbnz(count, 0, odd);
      cbz(count, end);
      align(16);
      bind(loop);
      (this->*block)(d, s, tmp);
      bind(odd);
      (this->*block)(d, s, tmp);
      subs(count, count, 2);
      br(Assembler::GT, loop);
      bind(end);
    }

    // Set up pointers and load the first operands for iteration i of
    // the first phase, and zero the pending m*n product.
    void pre1(RegisterOrConstant i) {
      block_comment("pre1");
      // Pa = Pa_base;
      // Pb = Pb_base + i;
      // Pm = Pm_base;
      // Pn = Pn_base + i;
      // Ra = *Pa;
      // Rb = *Pb;
      // Rm = *Pm;
      // Rn = *Pn;
      ldr(Ra, Address(Pa_base));
      ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
      ldr(Rm, Address(Pm_base));
      ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
      lea(Pa, Address(Pa_base));
      lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
      lea(Pm, Address(Pm_base));
      lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));

      // Zero the m*n result.
      mov(Rhi_mn, zr);
      mov(Rlo_mn, zr);
    }

    // The core multiply-accumulate step of a Montgomery
    // multiplication.  The idea is to schedule operations as a
    // pipeline so that instructions with long latencies (loads and
    // multiplies) have time to complete before their results are
    // used.  This most benefits in-order implementations of the
    // architecture but out-of-order ones also benefit.
    void step() {
      block_comment("step");
      // MACC(Ra, Rb, t0, t1, t2);
      // Ra = *++Pa;
      // Rb = *--Pb;
      umulh(Rhi_ab, Ra, Rb);
      mul(Rlo_ab, Ra, Rb);
      ldr(Ra, pre(Pa, wordSize));
      ldr(Rb, pre(Pb, -wordSize));
      acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
                                       // previous iteration.
      // MACC(Rm, Rn, t0, t1, t2);
      // Rm = *++Pm;
      // Rn = *--Pn;
      umulh(Rhi_mn, Rm, Rn);
      mul(Rlo_mn, Rm, Rn);
      ldr(Rm, pre(Pm, wordSize));
      ldr(Rn, pre(Pn, -wordSize));
      acc(Rhi_ab, Rlo_ab, t0, t1, t2);
    }

    // Finish iteration i of the first phase: fold in the last a*b
    // product, compute the next result digit m[i] = t0 * inv, store
    // it, and shift the triple-precision accumulator down one word
    // (t0 = t1; t1 = t2; t2 = 0).
    void post1() {
      block_comment("post1");

      // MACC(Ra, Rb, t0, t1, t2);
      // Ra = *++Pa;
      // Rb = *--Pb;
      umulh(Rhi_ab, Ra, Rb);
      mul(Rlo_ab, Ra, Rb);
      acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n
      acc(Rhi_ab, Rlo_ab, t0, t1, t2);

      // *Pm = Rm = t0 * inv;
      mul(Rm, t0, inv);
      str(Rm, Address(Pm));

      // MACC(Rm, Rn, t0, t1, t2);
      // t0 = t1; t1 = t2; t2 = 0;
      umulh(Rhi_mn, Rm, Rn);

#ifndef PRODUCT
      // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
      {
        mul(Rlo_mn, Rm, Rn);
        add(Rlo_mn, t0, Rlo_mn);
        Label ok;
        cbz(Rlo_mn, ok); {
          stop("broken Montgomery multiply");
        } bind(ok);
      }
#endif
      // We have very carefully set things up so that
      // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
      // the lower half of Rm * Rn because we know the result already:
      // it must be -t0.  t0 + (-t0) must generate a carry iff
      // t0 != 0.  So, rather than do a mul and an adds we just set
      // the carry flag iff t0 is nonzero.
      //
      // mul(Rlo_mn, Rm, Rn);
      // adds(zr, t0, Rlo_mn);
      subs(zr, t0, 1); // Set carry iff t0 is nonzero
      adcs(t0, t1, Rhi_mn);
      adc(t1, t2, zr);
      mov(t2, zr);
    }

    // Set up pointers and load the first operands for iteration i of
    // the second phase, and zero the pending m*n product.
    void pre2(RegisterOrConstant i, RegisterOrConstant len) {
      block_comment("pre2");
      // Pa = Pa_base + i-len;
      // Pb = Pb_base + len;
      // Pm = Pm_base + i-len;
      // Pn = Pn_base + len;

      if (i.is_register()) {
        sub(Rj, i.as_register(), len);
      } else {
        mov(Rj, i.as_constant());
        sub(Rj, Rj, len);
      }
      // Rj == i-len

      lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
      lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
      lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
      lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));

      // Ra = *++Pa;
      // Rb = *--Pb;
      // Rm = *++Pm;
      // Rn = *--Pn;
      ldr(Ra, pre(Pa, wordSize));
      ldr(Rb, pre(Pb, -wordSize));
      ldr(Rm, pre(Pm, wordSize));
      ldr(Rn, pre(Pn, -wordSize));

      mov(Rhi_mn, zr);
      mov(Rlo_mn, zr);
    }

    // Finish iteration i of the second phase: store the completed
    // low digit of the result at Pm_base[i-len] and shift the
    // accumulator down one word.
    void post2(RegisterOrConstant i, RegisterOrConstant len) {
      block_comment("post2");
      if (i.is_constant()) {
        mov(Rj, i.as_constant()-len.as_constant());
      } else {
        sub(Rj, i.as_register(), len);
      }

      adds(t0, t0, Rlo_mn); // The pending m*n, low part

      // As soon as we know the least significant digit of our result,
      // store it.
      // Pm_base[i-len] = t0;
      str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));

      // t0 = t1; t1 = t2; t2 = 0;
      adcs(t0, t1, Rhi_mn); // The pending m*n, high part
      adc(t1, t2, zr);
      mov(t2, zr);
    }

    // A carry in t0 after Montgomery multiplication means that we
    // should subtract multiples of n from our result in m.  We'll
    // keep doing that until there is no carry.
    void normalize(RegisterOrConstant len) {
      block_comment("normalize");
      // while (t0)
      //   t0 = sub(Pm_base, Pn_base, t0, len);
      Label loop, post, again;
      Register cnt = t1, i = t2; // Re-use registers; we're done with them now
      cbz(t0, post); {
        bind(again); {
          mov(i, zr);
          mov(cnt, len);
          ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
          ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
          subs(zr, zr, zr); // set carry flag, i.e. no borrow
          align(16);
          bind(loop); {
            sbcs(Rm, Rm, Rn);
            str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
            add(i, i, 1);
            ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
            ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
            sub(cnt, cnt, 1);
          } cbnz(cnt, loop);
          sbc(t0, t0, zr);
        } cbnz(t0, again);
      } bind(post);
    }

    // Move memory at s to d, reversing words.
    //    Increments d to end of copied memory
    //    Destroys tmp1, tmp2
    //    Preserves len
    //    Leaves s pointing to the address which was in d at start
    void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
      assert(tmp1 < r19 && tmp2 < r19, "register corruption");

      lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
      mov(tmp1, len);
      unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
      sub(s, d, len, ext::uxtw, LogBytesPerWord);
    }
    // where
    void reverse1(Register d, Register s, Register tmp) {
      // Copy one 64-bit word from s (walking backwards) to d (walking
      // forwards), exchanging its two 32-bit halves with a rotate.
      ldr(tmp, pre(s, -wordSize));
      ror(tmp, tmp, 32);
      str(tmp, post(d, wordSize));
    }

    // One inner-loop step of the squaring code: as step(), but the
    // a*b product is accumulated a second time (cf. MACC2 in the C
    // pseudocode below).
    void step_squaring() {
      // An extra ACC
      step();
      acc(Rhi_ab, Rlo_ab, t0, t1, t2);
    }

    // Iff i is even (low bit clear), accumulate one final a*b
    // product; cf. the "if ((i & 1) == 0)" branch in the C
    // pseudocode below.
    void last_squaring(RegisterOrConstant i) {
      Label dont;
      // if ((i & 1) == 0) {
      tbnz(i.as_register(), 0, dont); {
        // MACC(Ra, Rb, t0, t1, t2);
        // Ra = *++Pa;
        // Rb = *--Pb;
        umulh(Rhi_ab, Ra, Rb);
        mul(Rlo_ab, Ra, Rb);
        acc(Rhi_ab, Rlo_ab, t0, t1, t2);
      } bind(dont);
    }

    // Second half of the squaring inner loop: accumulate only the
    // m*n products (the a*b terms were already counted twice).
    void extra_step_squaring() {
      acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n

      // MACC(Rm, Rn, t0, t1, t2);
      // Rm = *++Pm;
      // Rn = *--Pn;
      umulh(Rhi_mn, Rm, Rn);
      mul(Rlo_mn, Rm, Rn);
      ldr(Rm, pre(Pm, wordSize));
      ldr(Rn, pre(Pn, -wordSize));
    }

    // As post1(), but without the final a*b product, which the
    // squaring loops have already accumulated.
    void post1_squaring() {
      acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n

      // *Pm = Rm = t0 * inv;
      mul(Rm, t0, inv);
      str(Rm, Address(Pm));

      // MACC(Rm, Rn, t0, t1, t2);
      // t0 = t1; t1 = t2; t2 = 0;
      umulh(Rhi_mn, Rm, Rn);

#ifndef PRODUCT
      // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
      {
        mul(Rlo_mn, Rm, Rn);
        add(Rlo_mn, t0, Rlo_mn);
        Label ok;
        cbz(Rlo_mn, ok); {
          stop("broken Montgomery multiply");
        } bind(ok);
      }
#endif
      // We have very carefully set things up so that
      // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
      // the lower half of Rm * Rn because we know the result already:
      // it must be -t0.  t0 + (-t0) must generate a carry iff
      // t0 != 0.  So, rather than do a mul and an adds we just set
      // the carry flag iff t0 is nonzero.
      //
      // mul(Rlo_mn, Rm, Rn);
      // adds(zr, t0, Rlo_mn);
      subs(zr, t0, 1); // Set carry iff t0 is nonzero
      adcs(t0, t1, Rhi_mn);
      adc(t1, t2, zr);
      mov(t2, zr);
    }

    // Add the 128-bit product Rhi:Rlo into the triple-precision
    // accumulator t2:t1:t0.
    void acc(Register Rhi, Register Rlo,
             Register t0, Register t1, Register t2) {
      adds(t0, t0, Rlo);
      adcs(t1, t1, Rhi);
      adc(t2, t2, zr);
    }

  public:
    /**
     * Fast Montgomery multiplication.  The derivation of the
     * algorithm is in A Cryptographic Library for the Motorola
     * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
     *
     * Arguments:
     *
     * Inputs for multiplication:
     *   c_rarg0   - int array elements a
     *   c_rarg1   - int array elements b
     *   c_rarg2   - int array elements n (the modulus)
     *   c_rarg3   - int length
     *   c_rarg4   - int inv
     *   c_rarg5   - int array elements m (the result)
     *
     * Inputs for squaring:
     *   c_rarg0   - int array elements a
     *   c_rarg1   - int array elements n (the modulus)
     *   c_rarg2   - int length
     *   c_rarg3   - int inv
     *   c_rarg4   - int array elements m (the result)
     *
     */
    address generate_multiply() {
      Label argh, nothing;
      bind(argh);
      stop("MontgomeryMultiply total_allocation must be <= 8192");

      align(CodeEntryAlignment);
      address entry = pc();

      // A zero-length operand means there is nothing to do.
      cbzw(Rlen, nothing);

      enter();

      // Make room.
      cmpw(Rlen, 512);
      br(Assembler::HI, argh);
      sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
      andr(sp, Ra, -2 * wordSize);

      lsrw(Rlen, Rlen, 1); // length in longwords = len/2

      {
        // Copy input args, reversing as we go.  We use Ra as a
        // temporary variable.
        reverse(Ra, Pa_base, Rlen, t0, t1);
        if (!_squaring)
          reverse(Ra, Pb_base, Rlen, t0, t1);
        reverse(Ra, Pn_base, Rlen, t0, t1);
      }

      // Push all call-saved registers and also Pm_base which we'll need
      // at the end.
      save_regs();

#ifndef PRODUCT
      // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
      {
        ldr(Rn, Address(Pn_base, 0));
        mul(Rlo_mn, Rn, inv);
        cmp(Rlo_mn, -1);
        Label ok;
        br(EQ, ok); {
          stop("broken inverse in Montgomery multiply");
        } bind(ok);
      }
#endif

      mov(Pm_base, Ra);

      mov(t0, zr);
      mov(t1, zr);
      mov(t2, zr);

      block_comment("for (int i = 0; i < len; i++) {");
      mov(Ri, zr); {
        Label loop, end;
        cmpw(Ri, Rlen);
        br(Assembler::GE, end);

        bind(loop);
        pre1(Ri);

        block_comment(" for (j = i; j; j--) {"); {
          movw(Rj, Ri);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
        } block_comment(" } // j");

        post1();
        addw(Ri, Ri, 1);
        cmpw(Ri, Rlen);
        br(Assembler::LT, loop);
        bind(end);
        block_comment("} // i");
      }

      block_comment("for (int i = len; i < 2*len; i++) {");
      mov(Ri, Rlen); {
        Label loop, end;
        cmpw(Ri, Rlen, Assembler::LSL, 1);
        br(Assembler::GE, end);

        bind(loop);
        pre2(Ri, Rlen);

        block_comment(" for (j = len*2-i-1; j; j--) {"); {
          lslw(Rj, Rlen, 1);
          subw(Rj, Rj, Ri);
          subw(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
        } block_comment(" } // j");

        post2(Ri, Rlen);
        addw(Ri, Ri, 1);
        cmpw(Ri, Rlen, Assembler::LSL, 1);
        br(Assembler::LT, loop);
        bind(end);
      }
      block_comment("} // i");

      // Clear any remaining carry by subtracting n from m.
      normalize(Rlen);

      mov(Ra, Pm_base);  // Save Pm_base in Ra
      restore_regs();  // Restore caller's Pm_base

      // Copy our result into caller's Pm_base
      reverse(Pm_base, Ra, Rlen, t0, t1);

      leave();
      bind(nothing);
      ret(lr);

      return entry;
    }
    // In C, approximately:

    // void
    // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
    //                     unsigned long Pn_base[], unsigned long Pm_base[],
    //                     unsigned long inv, int len) {
    //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
    //   unsigned long *Pa, *Pb, *Pn, *Pm;
    //   unsigned long Ra, Rb, Rn, Rm;

    //   int i;

    //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");

    //   for (i = 0; i < len; i++) {
    //     int j;

    //     Pa = Pa_base;
    //     Pb = Pb_base + i;
    //     Pm = Pm_base;
    //     Pn = Pn_base + i;

    //     Ra = *Pa;
    //     Rb = *Pb;
    //     Rm = *Pm;
    //     Rn = *Pn;

    //     int iters = i;
    //     for (j = 0; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
    //       MACC(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }

    //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
    //     MACC(Ra, Rb, t0, t1, t2);
    //     *Pm = Rm = t0 * inv;
    //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
    //     MACC(Rm, Rn, t0, t1, t2);

    //     assert(t0 == 0, "broken Montgomery multiply");

    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   for (i = len; i < 2*len; i++) {
    //     int j;

    //     Pa = Pa_base + i-len;
    //     Pb = Pb_base + len;
    //     Pm = Pm_base + i-len;
    //     Pn = Pn_base + len;

    //     Ra = *++Pa;
    //     Rb = *--Pb;
    //     Rm = *++Pm;
    //     Rn = *--Pn;

    //     int iters = len*2-i-1;
    //     for (j = i-len+1; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
    //       MACC(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }

    //     Pm_base[i-len] = t0;
    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   while (t0)
    //     t0 = sub(Pm_base, Pn_base, t0, len);
    // }

    /**
     * Fast Montgomery squaring.  This uses asymptotically 25% fewer
     * multiplies than Montgomery multiplication so it should be up to
     * 25% faster.  However, its loop control is more complex and it
     * may actually run slower on some machines.
     *
     * Arguments:
     *
     * Inputs:
     *   c_rarg0   - int array elements a
     *   c_rarg1   - int array elements n (the modulus)
     *   c_rarg2   - int length
     *   c_rarg3   - int inv
     *   c_rarg4   - int array elements m (the result)
     *
     */
    address generate_square() {
      Label argh;
      bind(argh);
      stop("MontgomeryMultiply total_allocation must be <= 8192");

      align(CodeEntryAlignment);
      address entry = pc();

      enter();

      // Make room.
      cmpw(Rlen, 512);
      br(Assembler::HI, argh);
      sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
      andr(sp, Ra, -2 * wordSize);

      lsrw(Rlen, Rlen, 1); // length in longwords = len/2

      {
        // Copy input args, reversing as we go.  We use Ra as a
        // temporary variable.
        reverse(Ra, Pa_base, Rlen, t0, t1);
        reverse(Ra, Pn_base, Rlen, t0, t1);
      }

      // Push all call-saved registers and also Pm_base which we'll need
      // at the end.
      save_regs();

      mov(Pm_base, Ra);

      mov(t0, zr);
      mov(t1, zr);
      mov(t2, zr);

      block_comment("for (int i = 0; i < len; i++) {");
      mov(Ri, zr); {
        Label loop, end;
        bind(loop);
        cmp(Ri, Rlen);
        br(Assembler::GE, end);

        pre1(Ri);

        block_comment("for (j = (i+1)/2; j; j--) {"); {
          add(Rj, Ri, 1);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
        } block_comment(" } // j");

        last_squaring(Ri);

        block_comment(" for (j = i/2; j; j--) {"); {
          lsr(Rj, Ri, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
        } block_comment(" } // j");

        post1_squaring();
        add(Ri, Ri, 1);
        cmp(Ri, Rlen);
        br(Assembler::LT, loop);

        bind(end);
        block_comment("} // i");
      }

      block_comment("for (int i = len; i < 2*len; i++) {");
      mov(Ri, Rlen); {
        Label loop, end;
        bind(loop);
        cmp(Ri, Rlen, Assembler::LSL, 1);
        br(Assembler::GE, end);

        pre2(Ri, Rlen);

        block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); {
          lsl(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          sub(Rj, Rj, 1);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
        } block_comment(" } // j");

        last_squaring(Ri);

        block_comment(" for (j = (2*len-i)/2; j; j--) {"); {
          lsl(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
        } block_comment(" } // j");

        post2(Ri, Rlen);
        add(Ri, Ri, 1);
        cmp(Ri, Rlen, Assembler::LSL, 1);

        br(Assembler::LT, loop);
        bind(end);
        block_comment("} // i");
      }

      // Clear any remaining carry by subtracting n from m.
      normalize(Rlen);

      mov(Ra, Pm_base);  // Save Pm_base in Ra
      restore_regs();  // Restore caller's Pm_base

      // Copy our result into caller's Pm_base
      reverse(Pm_base, Ra, Rlen, t0, t1);

      leave();
      ret(lr);

      return entry;
    }
    // In C, approximately:

    // void
    // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
    //                   unsigned long Pm_base[], unsigned long inv, int len) {
    //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
    //   unsigned long *Pa, *Pb, *Pn, *Pm;
    //   unsigned long Ra, Rb, Rn, Rm;

    //   int i;

    //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");

    //   for (i = 0; i < len; i++) {
    //     int j;

    //     Pa = Pa_base;
    //     Pb = Pa_base + i;
    //     Pm = Pm_base;
    //     Pn = Pn_base + i;

    //     Ra = *Pa;
    //     Rb = *Pb;
    //     Rm = *Pm;
    //     Rn = *Pn;

    //     int iters = (i+1)/2;
    //     for (j = 0; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
    //       MACC2(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     if ((i & 1) == 0) {
    //       assert(Ra == Pa_base[j], "must be");
    //       MACC(Ra, Ra, t0, t1, t2);
    //     }
    //     iters = i/2;
    //     assert(iters == i-j, "must be");
    //     for (; iters--; j++) {
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }

    //     *Pm = Rm = t0 * inv;
    //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
    //     MACC(Rm, Rn, t0, t1, t2);

    //     assert(t0 == 0, "broken Montgomery multiply");

    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   for (i = len; i < 2*len; i++) {
    //     int start = i-len+1;
    //     int end = start + (len - start)/2;
    //     int j;

    //     Pa = Pa_base + i-len;
    //     Pb = Pa_base + len;
    //     Pm = Pm_base + i-len;
    //     Pn = Pn_base + len;

    //     Ra = *++Pa;
    //     Rb = *--Pb;
    //     Rm = *++Pm;
    //     Rn = *--Pn;

    //     int iters = (2*len-i-1)/2;
    //     assert(iters == end-start, "must be");
    //     for (j = start; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
    //       MACC2(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     if ((i & 1) == 0) {
    //       assert(Ra == Pa_base[j], "must be");
    //       MACC(Ra, Ra, t0, t1, t2);
    //     }
    //     iters = (2*len-i)/2;
    //     assert(iters == len-j, "must be");
    //     for (; iters--; j++) {
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     Pm_base[i-len] = t0;
    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   while (t0)
    //     t0 = sub(Pm_base, Pn_base, t0, len);
    // }
  };

  // Initialization
  void generate_initial() {
    // Generate initial stubs and initializes the entry points

    // entry points that exist in all platforms Note: This is code
    // that could be shared among different platforms - however the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also comment in
    // stubRoutines.hpp.

    StubRoutines::_forward_exception_entry = generate_forward_exception();

    StubRoutines::_call_stub_entry =
      generate_call_stub(StubRoutines::_call_stub_return_address);

    // is referenced by megamorphic call
    StubRoutines::_catch_exception_entry = generate_catch_exception();

    // Build this early so it's available for the interpreter.
    StubRoutines::_throw_StackOverflowError_entry =
      generate_throw_exception("StackOverflowError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_StackOverflowError));
    if (UseCRC32Intrinsics) {
      // set table address before stub generation which use it
      StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
    }
  }

  // Generate the stubs that are deferred until after full VM
  // initialization (cf. generate_initial above).
  void generate_all() {
    // support for verify_oop (must happen after universe_init)
    StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
    StubRoutines::_throw_AbstractMethodError_entry =
      generate_throw_exception("AbstractMethodError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_AbstractMethodError));

    StubRoutines::_throw_IncompatibleClassChangeError_entry =
      generate_throw_exception("IncompatibleClassChangeError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_IncompatibleClassChangeError));

    StubRoutines::_throw_NullPointerException_at_call_entry =
      generate_throw_exception("NullPointerException at call throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_NullPointerException_at_call));

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }

    if (UseMontgomeryMultiplyIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }

    if (UseMontgomerySquareIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      // We use generate_multiply() rather than generate_square()
      // because it's faster for the sizes of modulus we care about.
      StubRoutines::_montgomerySquare = g.generate_multiply();
    }

#ifndef BUILTIN_SIM
    // generate GHASH intrinsics code
    if (UseGHASHIntrinsics) {
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
    }

    if (UseAESIntrinsics) {
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
    }

    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_implCompress   = generate_sha1_implCompress(false, "sha1_implCompress");
      StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true,  "sha1_implCompressMB");
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
    }

    if (UseCRC32CIntrinsics) {
      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
    }

    // generate Adler32 intrinsics code
    if (UseAdler32Intrinsics) {
      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
    }

    // Safefetch stubs.
    generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
                       &StubRoutines::_safefetch32_fault_pc,
                       &StubRoutines::_safefetch32_continuation_pc);
    generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
                       &StubRoutines::_safefetchN_fault_pc,
                       &StubRoutines::_safefetchN_continuation_pc);
#endif
  }

 public:
  // "all" selects the second (post-initialization) batch of stubs;
  // otherwise only the initial startup stubs are generated.
  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
    if (all) {
      generate_all();
    } else {
      generate_initial();
    }
  }
}; // end class declaration

// Entry point called by the VM to populate the stub code buffer.
void StubGenerator_generate(CodeBuffer* code, bool all) {
  StubGenerator g(code, all);
}