1 /* 2 * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 
23 * 24 */ 25 26 #include "precompiled.hpp" 27 #include "asm/macroAssembler.hpp" 28 #include "asm/macroAssembler.inline.hpp" 29 #include "interpreter/interpreter.hpp" 30 #include "nativeInst_aarch64.hpp" 31 #include "oops/instanceOop.hpp" 32 #include "oops/method.hpp" 33 #include "oops/objArrayKlass.hpp" 34 #include "oops/oop.inline.hpp" 35 #include "prims/methodHandles.hpp" 36 #include "runtime/frame.inline.hpp" 37 #include "runtime/handles.inline.hpp" 38 #include "runtime/sharedRuntime.hpp" 39 #include "runtime/stubCodeGenerator.hpp" 40 #include "runtime/stubRoutines.hpp" 41 #include "runtime/thread.inline.hpp" 42 #include "utilities/align.hpp" 43 #ifdef COMPILER2 44 #include "opto/runtime.hpp" 45 #endif 46 47 #ifdef BUILTIN_SIM 48 #include "../../../../../../simulator/simulator.hpp" 49 #endif 50 51 // Declaration and definition of StubGenerator (no .hpp file). 52 // For a more detailed description of the stub routine structure 53 // see the comment in stubRoutines.hpp 54 55 #undef __ 56 #define __ _masm-> 57 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 
4 : 8))

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

// Bind a label and, in debug builds, emit its name as a block comment
// so disassembly of the generated stub is readable.
#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  // Emit code to bump a 32-bit in-memory statistics counter.
  // The update is a plain load/add/store (not atomic), so counts may
  // be lost under concurrent execution; that is acceptable for stats.
  // Clobbers rscratch1 and rscratch2.
  void inc_counter_np_(int& counter) {
    __ lea(rscratch2, ExternalAddress((address)&counter));
    __ ldrw(rscratch1, Address(rscratch2));
    __ addw(rscratch1, rscratch1, 1);
    __ strw(rscratch1, Address(rscratch2));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    c_rarg6:   parameter size (in words)              int
  //    c_rarg7:   thread                                 Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the c arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -27 [ argument word 1      ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper    (r0) ]
  //  -7 [ result          (r1) ]
  //  -6 [ result type     (r2) ]
  //  -5 [ method          (r3) ]
  //  -4 [ entry point     (r4) ]
  //  -3 [ parameters      (r5) ]
  //  -2 [ parameter size  (r6) ]
  //  -1 [ thread          (r7) ]
  //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  //   1 [ saved lr       (r30) ]

  // Call stub stack layout word offsets from fp.
  // n.b. only the low half of each register pair is named: the stp/stpd
  // instructions below store two registers per slot pair, so e.g. d14
  // lands implicitly at d15_off + 1.
  enum call_stub_layout {
    sp_after_call_off  = -26,

    d15_off            = -26,
    d13_off            = -24,
    d11_off            = -22,
    d9_off             = -20,

    r28_off            = -18,
    r26_off            = -16,
    r24_off            = -14,
    r22_off            = -12,
    r20_off            = -10,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0,
    retaddr_off        =   1,
  };

  // Generate the single call_stub used to enter Java code from C.
  // return_address is set (out-param) to the pc Java execution
  // returns to, so the VM can recognize entry frames in stack walks.
  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    const Address sp_after_call(rfp, sp_after_call_off * wordSize);

    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);

    // stub code

    // we need a C prolog to bootstrap the x86 caller into the sim
    __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);

    address aarch64_entry = __ pc();

#ifdef BUILTIN_SIM
    // Save sender's SP for stack traces.
    __ mov(rscratch1, sp);
    __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
#endif
    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5,  entry_point);
    __ stp(c_rarg2, c_rarg3,  result_type);
    __ stp(c_rarg0, c_rarg1,  call_wrapper);

    __ stp(r20, r19,   r20_save);
    __ stp(r22, r21,   r22_save);
    __ stp(r24, r23,   r24_save);
    __ stp(r26, r25,   r26_save);
    __ stp(r28, r27,   r28_save);

    __ stpd(v9,  v8,   d9_save);
    __ stpd(v11, v10,  d11_save);
    __ stpd(v13, v12,  d13_save);
    __ stpd(v15, v14,  d15_save);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (unsigned)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);   // keep sp 16-byte aligned

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    // copy the parameter words onto the Java expression stack
    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing Method* and current sp
    //      rmethod: Method*
    //      r13: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r13, sp);
    __ blr(c_rarg4);

    // tell the simulator we have returned to the stub

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    if (NotifySimulator) {
      __ notify(Assembler::method_reentry);
    }
    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    __ ldr(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ldr(j_rarg1, result_type);
    __ cmp(j_rarg1, T_OBJECT);     // oops take the 64-bit store path too
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(j_rarg1, T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    // restore callee-save registers
    __ ldpd(v15, v14,   d15_save);
    __ ldpd(v13, v12,   d13_save);
    __ ldpd(v11, v10,   d11_save);
    __ ldpd(v9,  v8,    d9_save);

    __ ldp(r28, r27,   r28_save);
    __ ldp(r26, r25,   r26_save);
    __ ldp(r24, r23,   r24_save);
    __ ldp(r22, r21,   r22_save);
    __ ldp(r20, r19,   r20_save);

    __ ldp(c_rarg0, c_rarg1,  call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3,  method);
    __ ldp(c_rarg4, c_rarg5,  entry_point);
    __ ldp(c_rarg6, c_rarg7,  parameter_size);

#ifndef PRODUCT
    // tell the simulator we are about to end Java execution
    if (NotifySimulator) {
      __ notify(Assembler::method_exit);
    }
#endif
    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT

    __ BIND(is_long);
    __ str(r0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // rsp.
  //
  // r0: exception oop

  // NOTE: this is used as a target from the signal handler so it
  // needs an x86 prolog which returns into the current simulator
  // executing the generated catch_exception code. so the prolog
  // needs to install rax in a sim register and adjust the sim's
  // restart pc to enter the generated code at the start position
  // then return from native to simulated execution.
  // Record the thrown exception (in r0) as the thread's pending
  // exception, then resume at the call_stub return address so the
  // call stub's epilogue completes the return to the VM.
  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread       (rfp, thread_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    // record where the exception was noticed, for diagnostics
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!
  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then calls into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);                 // r3 <- throwing pc
    __ mov(r19, r0);                 // r19 <- handler address returned by the VM call
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {

    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // bump the verify_oop invocation counter
    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is NULL it is OK

    // Check if the oop is in the right area of memory
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
    __ andr(c_rarg2, r0, c_rarg3);
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());

    // Compare c_rarg2 and c_rarg3.  We don't use a compare
    // instruction here because the flags register is live.
    __ eor(c_rarg2, c_rarg2, c_rarg3);
    __ cbnz(c_rarg2, error);

    // make sure klass is 'reasonable', which is not zero.
    __ load_klass(r0, r0);  // get klass
    __ cbz(r0, error);      // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    // save all registers so debug64 can dump them
    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blrt(rscratch1, 3, 0, 1);

    return start;
  }

  // Overlap check reduced to an unconditional jump: copies on this
  // path always take the no-overlap case.
  void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }

  // Generate code for an array write pre barrier
  //
  //     addr       - starting address
  //     count      - element count
  //     tmp        - scratch register
  //     saved_regs - registers to be saved before calling static_write_ref_array_pre
  //
  // Callers must specify which registers to preserve in saved_regs.
  // Clobbers: r0-r18, v0-v7, v16-v31, except saved_regs.
630 // 631 void gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized, RegSet saved_regs) { 632 BarrierSet* bs = Universe::heap()->barrier_set(); 633 switch (bs->kind()) { 634 case BarrierSet::G1SATBCTLogging: 635 // With G1, don't generate the call if we statically know that the target in uninitialized 636 if (!dest_uninitialized) { 637 __ push(saved_regs, sp); 638 if (count == c_rarg0) { 639 if (addr == c_rarg1) { 640 // exactly backwards!! 641 __ mov(rscratch1, c_rarg0); 642 __ mov(c_rarg0, c_rarg1); 643 __ mov(c_rarg1, rscratch1); 644 } else { 645 __ mov(c_rarg1, count); 646 __ mov(c_rarg0, addr); 647 } 648 } else { 649 __ mov(c_rarg0, addr); 650 __ mov(c_rarg1, count); 651 } 652 __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2); 653 __ pop(saved_regs, sp); 654 break; 655 case BarrierSet::CardTableForRS: 656 case BarrierSet::CardTableExtension: 657 case BarrierSet::ModRef: 658 break; 659 default: 660 ShouldNotReachHere(); 661 662 } 663 } 664 } 665 666 // 667 // Generate code for an array write post barrier 668 // 669 // Input: 670 // start - register containing starting address of destination array 671 // end - register containing ending address of destination array 672 // scratch - scratch register 673 // saved_regs - registers to be saved before calling static_write_ref_array_post 674 // 675 // The input registers are overwritten. 676 // The ending address is inclusive. 677 // Callers must specify which registers to preserve in saved_regs. 678 // Clobbers: r0-r18, v0-v7, v16-v31, except saved_regs. 
  // Emits the GC post-write barrier for a bulk oop-array store.
  // G1 makes a runtime call with (start, element count); card-table
  // collectors dirty the covered card bytes inline.
  // start/end/scratch are all overwritten; 'end' is inclusive.
  void gen_write_ref_array_post_barrier(Register start, Register end, Register scratch, RegSet saved_regs) {
    assert_different_registers(start, end, scratch);
    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
      case BarrierSet::G1SATBCTLogging:

        {
          __ push(saved_regs, sp);
          // must compute element count unless barrier set interface is changed (other platforms supply count)
          assert_different_registers(start, end, scratch);  // (repeats the assert above)
          __ lea(scratch, Address(end, BytesPerHeapOop));   // end is inclusive, so step one past it
          __ sub(scratch, scratch, start);                  // subtract start to get #bytes
          __ lsr(scratch, scratch, LogBytesPerHeapOop);     // convert to element count
          __ mov(c_rarg0, start);
          __ mov(c_rarg1, scratch);
          __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
          __ pop(saved_regs, sp);
        }
        break;
      case BarrierSet::CardTableForRS:
      case BarrierSet::CardTableExtension:
        {
          CardTableModRefBS* ct = (CardTableModRefBS*)bs;
          assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");

          Label L_loop;

          // convert [start, end] addresses to card indices
          __ lsr(start, start, CardTableModRefBS::card_shift);
          __ lsr(end, end, CardTableModRefBS::card_shift);
          __ sub(end, end, start); // number of bytes to copy

          const Register count = end; // 'end' register contains bytes count now
          __ load_byte_map_base(scratch);
          __ add(start, start, scratch);
          if (UseConcMarkSweepGC) {
            // order the oop stores (done by the caller) before the
            // card-mark stores below
            __ membar(__ StoreStore);
          }
          // dirty each card byte from start+count down to start
          __ BIND(L_loop);
          __ strb(zr, Address(start, count));
          __ subs(count, count, 1);
          __ br(Assembler::GE, L_loop);
        }
        break;
      default:
        ShouldNotReachHere();

    }
  }

  // The inner part of zero_words().  This is the bulk operation,
  // zeroing words in blocks, possibly using DC ZVA to do it.  The
  // caller is responsible for zeroing the last few words.
  //
  // Inputs:
  // r10: the HeapWord-aligned base address of an array to zero.
  // r11: the count in HeapWords, r11 > 0.
  //
  // Returns r10 and r11, adjusted for the caller to clear.
  // r10: the base address of the tail of words left to clear.
  // r11: the number of words in the tail.
  //      r11 < MacroAssembler::zero_words_block_size.

  address generate_zero_blocks() {
    Label store_pair, loop_store_pair, done;
    Label base_aligned;

    Register base = r10, cnt = r11;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "zero_blocks");
    address start = __ pc();

    if (UseBlockZeroing) {
      int zva_length = VM_Version::zva_length();

      // Ensure ZVA length can be divided by 16. This is required by
      // the subsequent operations.
      assert (zva_length % 16 == 0, "Unexpected ZVA Length");

      // if bit 3 of base is set, store one word to make the base
      // 16-byte aligned before the block-zeroing path
      __ tbz(base, 3, base_aligned);
      __ str(zr, Address(__ post(base, 8)));
      __ sub(cnt, cnt, 1);
      __ bind(base_aligned);

      // Ensure count >= zva_length * 2 so that it still deserves a zva after
      // alignment.
      Label small;
      int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
      // low_limit is in bytes; >> 3 converts to 8-byte words to match cnt
      __ subs(rscratch1, cnt, low_limit >> 3);
      __ br(Assembler::LT, small);
      __ zero_dcache_blocks(base, cnt);
      __ bind(small);
    }

    {
      // Number of stp instructions we'll unroll
      const int unroll =
        MacroAssembler::zero_words_block_size / 2;
      // Clear the remaining blocks.
      Label loop;
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::LT, done);
      __ bind(loop);
      for (int i = 0; i < unroll; i++)
        __ stp(zr, zr, __ post(base, 16));
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::GE, loop);
      __ bind(done);
      // undo the last subtraction so cnt holds the true tail length
      __ add(cnt, cnt, unroll * 2);
    }

    __ ret(lr);

    return start;
  }


  // Direction of a bulk copy; also used as the sign of the per-step
  // address increment (in words).
  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
804 // 805 // Precondition: count >= 8 806 // 807 // Postconditions: 808 // 809 // The least significant bit of count contains the remaining count 810 // of words to copy. The rest of count is trash. 811 // 812 // s and d are adjusted to point to the remaining words to copy 813 // 814 void generate_copy_longs(Label &start, Register s, Register d, Register count, 815 copy_direction direction) { 816 int unit = wordSize * direction; 817 int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize; 818 819 int offset; 820 const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6, 821 t4 = r7, t5 = r10, t6 = r11, t7 = r12; 822 const Register stride = r13; 823 824 assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7); 825 assert_different_registers(s, d, count, rscratch1); 826 827 Label again, drain; 828 const char *stub_name; 829 if (direction == copy_forwards) 830 stub_name = "forward_copy_longs"; 831 else 832 stub_name = "backward_copy_longs"; 833 StubCodeMark mark(this, "StubRoutines", stub_name); 834 __ align(CodeEntryAlignment); 835 __ bind(start); 836 837 Label unaligned_copy_long; 838 if (AvoidUnalignedAccesses) { 839 __ tbnz(d, 3, unaligned_copy_long); 840 } 841 842 if (direction == copy_forwards) { 843 __ sub(s, s, bias); 844 __ sub(d, d, bias); 845 } 846 847 #ifdef ASSERT 848 // Make sure we are never given < 8 words 849 { 850 Label L; 851 __ cmp(count, 8); 852 __ br(Assembler::GE, L); 853 __ stop("genrate_copy_longs called with < 8 words"); 854 __ bind(L); 855 } 856 #endif 857 858 // Fill 8 registers 859 if (UseSIMDForMemoryOps) { 860 __ ldpq(v0, v1, Address(s, 4 * unit)); 861 __ ldpq(v2, v3, Address(__ pre(s, 8 * unit))); 862 } else { 863 __ ldp(t0, t1, Address(s, 2 * unit)); 864 __ ldp(t2, t3, Address(s, 4 * unit)); 865 __ ldp(t4, t5, Address(s, 6 * unit)); 866 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 867 } 868 869 __ subs(count, count, 16); 870 __ br(Assembler::LO, drain); 871 872 int prefetch = PrefetchCopyIntervalInBytes; 873 bool use_stride = false; 
874 if (direction == copy_backwards) { 875 use_stride = prefetch > 256; 876 prefetch = -prefetch; 877 if (use_stride) __ mov(stride, prefetch); 878 } 879 880 __ bind(again); 881 882 if (PrefetchCopyIntervalInBytes > 0) 883 __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP); 884 885 if (UseSIMDForMemoryOps) { 886 __ stpq(v0, v1, Address(d, 4 * unit)); 887 __ ldpq(v0, v1, Address(s, 4 * unit)); 888 __ stpq(v2, v3, Address(__ pre(d, 8 * unit))); 889 __ ldpq(v2, v3, Address(__ pre(s, 8 * unit))); 890 } else { 891 __ stp(t0, t1, Address(d, 2 * unit)); 892 __ ldp(t0, t1, Address(s, 2 * unit)); 893 __ stp(t2, t3, Address(d, 4 * unit)); 894 __ ldp(t2, t3, Address(s, 4 * unit)); 895 __ stp(t4, t5, Address(d, 6 * unit)); 896 __ ldp(t4, t5, Address(s, 6 * unit)); 897 __ stp(t6, t7, Address(__ pre(d, 8 * unit))); 898 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 899 } 900 901 __ subs(count, count, 8); 902 __ br(Assembler::HS, again); 903 904 // Drain 905 __ bind(drain); 906 if (UseSIMDForMemoryOps) { 907 __ stpq(v0, v1, Address(d, 4 * unit)); 908 __ stpq(v2, v3, Address(__ pre(d, 8 * unit))); 909 } else { 910 __ stp(t0, t1, Address(d, 2 * unit)); 911 __ stp(t2, t3, Address(d, 4 * unit)); 912 __ stp(t4, t5, Address(d, 6 * unit)); 913 __ stp(t6, t7, Address(__ pre(d, 8 * unit))); 914 } 915 916 { 917 Label L1, L2; 918 __ tbz(count, exact_log2(4), L1); 919 if (UseSIMDForMemoryOps) { 920 __ ldpq(v0, v1, Address(__ pre(s, 4 * unit))); 921 __ stpq(v0, v1, Address(__ pre(d, 4 * unit))); 922 } else { 923 __ ldp(t0, t1, Address(s, 2 * unit)); 924 __ ldp(t2, t3, Address(__ pre(s, 4 * unit))); 925 __ stp(t0, t1, Address(d, 2 * unit)); 926 __ stp(t2, t3, Address(__ pre(d, 4 * unit))); 927 } 928 __ bind(L1); 929 930 if (direction == copy_forwards) { 931 __ add(s, s, bias); 932 __ add(d, d, bias); 933 } 934 935 __ tbz(count, 1, L2); 936 __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards))); 937 __ stp(t0, t1, Address(__ adjust(d, 2 * unit, 
direction == copy_backwards))); 938 __ bind(L2); 939 } 940 941 __ ret(lr); 942 943 if (AvoidUnalignedAccesses) { 944 Label drain, again; 945 // Register order for storing. Order is different for backward copy. 946 947 __ bind(unaligned_copy_long); 948 949 // source address is even aligned, target odd aligned 950 // 951 // when forward copying word pairs we read long pairs at offsets 952 // {0, 2, 4, 6} (in long words). when backwards copying we read 953 // long pairs at offsets {-2, -4, -6, -8}. We adjust the source 954 // address by -2 in the forwards case so we can compute the 955 // source offsets for both as {2, 4, 6, 8} * unit where unit = 1 956 // or -1. 957 // 958 // when forward copying we need to store 1 word, 3 pairs and 959 // then 1 word at offsets {0, 1, 3, 5, 7}. Rather thna use a 960 // zero offset We adjust the destination by -1 which means we 961 // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores. 962 // 963 // When backwards copyng we need to store 1 word, 3 pairs and 964 // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use 965 // offsets {1, 3, 5, 7, 8} * unit. 966 967 if (direction == copy_forwards) { 968 __ sub(s, s, 16); 969 __ sub(d, d, 8); 970 } 971 972 // Fill 8 registers 973 // 974 // for forwards copy s was offset by -16 from the original input 975 // value of s so the register contents are at these offsets 976 // relative to the 64 bit block addressed by that original input 977 // and so on for each successive 64 byte block when s is updated 978 // 979 // t0 at offset 0, t1 at offset 8 980 // t2 at offset 16, t3 at offset 24 981 // t4 at offset 32, t5 at offset 40 982 // t6 at offset 48, t7 at offset 56 983 984 // for backwards copy s was not offset so the register contents 985 // are at these offsets into the preceding 64 byte block 986 // relative to that original input and so on for each successive 987 // preceding 64 byte block when s is updated. 
this explains the 988 // slightly counter-intuitive looking pattern of register usage 989 // in the stp instructions for backwards copy. 990 // 991 // t0 at offset -16, t1 at offset -8 992 // t2 at offset -32, t3 at offset -24 993 // t4 at offset -48, t5 at offset -40 994 // t6 at offset -64, t7 at offset -56 995 996 __ ldp(t0, t1, Address(s, 2 * unit)); 997 __ ldp(t2, t3, Address(s, 4 * unit)); 998 __ ldp(t4, t5, Address(s, 6 * unit)); 999 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 1000 1001 __ subs(count, count, 16); 1002 __ br(Assembler::LO, drain); 1003 1004 int prefetch = PrefetchCopyIntervalInBytes; 1005 bool use_stride = false; 1006 if (direction == copy_backwards) { 1007 use_stride = prefetch > 256; 1008 prefetch = -prefetch; 1009 if (use_stride) __ mov(stride, prefetch); 1010 } 1011 1012 __ bind(again); 1013 1014 if (PrefetchCopyIntervalInBytes > 0) 1015 __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP); 1016 1017 if (direction == copy_forwards) { 1018 // allowing for the offset of -8 the store instructions place 1019 // registers into the target 64 bit block at the following 1020 // offsets 1021 // 1022 // t0 at offset 0 1023 // t1 at offset 8, t2 at offset 16 1024 // t3 at offset 24, t4 at offset 32 1025 // t5 at offset 40, t6 at offset 48 1026 // t7 at offset 56 1027 1028 __ str(t0, Address(d, 1 * unit)); 1029 __ stp(t1, t2, Address(d, 2 * unit)); 1030 __ ldp(t0, t1, Address(s, 2 * unit)); 1031 __ stp(t3, t4, Address(d, 4 * unit)); 1032 __ ldp(t2, t3, Address(s, 4 * unit)); 1033 __ stp(t5, t6, Address(d, 6 * unit)); 1034 __ ldp(t4, t5, Address(s, 6 * unit)); 1035 __ str(t7, Address(__ pre(d, 8 * unit))); 1036 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 1037 } else { 1038 // d was not offset when we started so the registers are 1039 // written into the 64 bit block preceding d with the following 1040 // offsets 1041 // 1042 // t1 at offset -8 1043 // t3 at offset -24, t0 at offset -16 1044 // t5 at offset -48, t2 at offset 
-32 1045 // t7 at offset -56, t4 at offset -48 1046 // t6 at offset -64 1047 // 1048 // note that this matches the offsets previously noted for the 1049 // loads 1050 1051 __ str(t1, Address(d, 1 * unit)); 1052 __ stp(t3, t0, Address(d, 3 * unit)); 1053 __ ldp(t0, t1, Address(s, 2 * unit)); 1054 __ stp(t5, t2, Address(d, 5 * unit)); 1055 __ ldp(t2, t3, Address(s, 4 * unit)); 1056 __ stp(t7, t4, Address(d, 7 * unit)); 1057 __ ldp(t4, t5, Address(s, 6 * unit)); 1058 __ str(t6, Address(__ pre(d, 8 * unit))); 1059 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 1060 } 1061 1062 __ subs(count, count, 8); 1063 __ br(Assembler::HS, again); 1064 1065 // Drain 1066 // 1067 // this uses the same pattern of offsets and register arguments 1068 // as above 1069 __ bind(drain); 1070 if (direction == copy_forwards) { 1071 __ str(t0, Address(d, 1 * unit)); 1072 __ stp(t1, t2, Address(d, 2 * unit)); 1073 __ stp(t3, t4, Address(d, 4 * unit)); 1074 __ stp(t5, t6, Address(d, 6 * unit)); 1075 __ str(t7, Address(__ pre(d, 8 * unit))); 1076 } else { 1077 __ str(t1, Address(d, 1 * unit)); 1078 __ stp(t3, t0, Address(d, 3 * unit)); 1079 __ stp(t5, t2, Address(d, 5 * unit)); 1080 __ stp(t7, t4, Address(d, 7 * unit)); 1081 __ str(t6, Address(__ pre(d, 8 * unit))); 1082 } 1083 // now we need to copy any remaining part block which may 1084 // include a 4 word block subblock and/or a 2 word subblock. 
1085 // bits 2 and 1 in the count are the tell-tale for whetehr we 1086 // have each such subblock 1087 { 1088 Label L1, L2; 1089 __ tbz(count, exact_log2(4), L1); 1090 // this is the same as above but copying only 4 longs hence 1091 // with ony one intervening stp between the str instructions 1092 // but note that the offsets and registers still follow the 1093 // same pattern 1094 __ ldp(t0, t1, Address(s, 2 * unit)); 1095 __ ldp(t2, t3, Address(__ pre(s, 4 * unit))); 1096 if (direction == copy_forwards) { 1097 __ str(t0, Address(d, 1 * unit)); 1098 __ stp(t1, t2, Address(d, 2 * unit)); 1099 __ str(t3, Address(__ pre(d, 4 * unit))); 1100 } else { 1101 __ str(t1, Address(d, 1 * unit)); 1102 __ stp(t3, t0, Address(d, 3 * unit)); 1103 __ str(t2, Address(__ pre(d, 4 * unit))); 1104 } 1105 __ bind(L1); 1106 1107 __ tbz(count, 1, L2); 1108 // this is the same as above but copying only 2 longs hence 1109 // there is no intervening stp between the str instructions 1110 // but note that the offset and register patterns are still 1111 // the same 1112 __ ldp(t0, t1, Address(__ pre(s, 2 * unit))); 1113 if (direction == copy_forwards) { 1114 __ str(t0, Address(d, 1 * unit)); 1115 __ str(t1, Address(__ pre(d, 2 * unit))); 1116 } else { 1117 __ str(t1, Address(d, 1 * unit)); 1118 __ str(t0, Address(__ pre(d, 2 * unit))); 1119 } 1120 __ bind(L2); 1121 1122 // for forwards copy we need to re-adjust the offsets we 1123 // applied so that s and d are follow the last words written 1124 1125 if (direction == copy_forwards) { 1126 __ add(s, s, 16); 1127 __ add(d, d, 8); 1128 } 1129 1130 } 1131 1132 __ ret(lr); 1133 } 1134 } 1135 1136 // Small copy: less than 16 bytes. 1137 // 1138 // NB: Ignores all of the bits of count which represent more than 15 1139 // bytes, so a caller doesn't have to mask them. 
1140 1141 void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) { 1142 bool is_backwards = step < 0; 1143 size_t granularity = uabs(step); 1144 int direction = is_backwards ? -1 : 1; 1145 int unit = wordSize * direction; 1146 1147 Label Lpair, Lword, Lint, Lshort, Lbyte; 1148 1149 assert(granularity 1150 && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small"); 1151 1152 const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6; 1153 1154 // ??? I don't know if this bit-test-and-branch is the right thing 1155 // to do. It does a lot of jumping, resulting in several 1156 // mispredicted branches. It might make more sense to do this 1157 // with something like Duff's device with a single computed branch. 1158 1159 __ tbz(count, 3 - exact_log2(granularity), Lword); 1160 __ ldr(tmp, Address(__ adjust(s, unit, is_backwards))); 1161 __ str(tmp, Address(__ adjust(d, unit, is_backwards))); 1162 __ bind(Lword); 1163 1164 if (granularity <= sizeof (jint)) { 1165 __ tbz(count, 2 - exact_log2(granularity), Lint); 1166 __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards))); 1167 __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards))); 1168 __ bind(Lint); 1169 } 1170 1171 if (granularity <= sizeof (jshort)) { 1172 __ tbz(count, 1 - exact_log2(granularity), Lshort); 1173 __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards))); 1174 __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards))); 1175 __ bind(Lshort); 1176 } 1177 1178 if (granularity <= sizeof (jbyte)) { 1179 __ tbz(count, 0, Lbyte); 1180 __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards))); 1181 __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards))); 1182 __ bind(Lbyte); 1183 } 1184 } 1185 1186 Label copy_f, copy_b; 1187 1188 // All-singing all-dancing memory copy. 1189 // 1190 // Copy count units of memory from s to d. 
  // The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //

  void copy_memory(bool is_aligned, Register s, Register d,
                   Register count, Register tmp, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    // <= 96 bytes do inline. Direction doesn't matter because we always
    // load all the data before writing anything
    // NOTE(review): label 'copy128' appears to be declared but never bound
    // or branched to in this function -- confirm and remove.
    Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
    const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
    const Register send = r17, dend = r18;  // one-past-the-end of source/dest

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
    // More than 80 bytes (96 with SIMD) goes through the out-of-line bulk
    // word-copy loops at copy_big.
    __ cmp(count, (UseSIMDForMemoryOps ? 96:80)/granularity);
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, 16/granularity);
    __ br(Assembler::LS, copy16);

    __ cmp(count, 64/granularity);
    __ br(Assembler::HI, copy80);

    __ cmp(count, 32/granularity);
    __ br(Assembler::LS, copy32);

    // 33..64 bytes: copy the first and last 32 bytes; the two halves may
    // overlap in the middle, which is safe because every load precedes
    // every store.
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 0));
      __ ldpq(v2, v3, Address(send, -32));
      __ stpq(v0, v1, Address(d, 0));
      __ stpq(v2, v3, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(send, -32));
      __ ldp(t6, t7, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(dend, -32));
      __ stp(t6, t7, Address(dend, -16));
    }
    __ b(finish);

    // 17..32 bytes: first and last 16 bytes, possibly overlapping.
    __ bind(copy32);
    __ ldp(t0, t1, Address(s, 0));
    __ ldp(t2, t3, Address(send, -16));
    __ stp(t0, t1, Address(d, 0));
    __ stp(t2, t3, Address(dend, -16));
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
      __ ldpq(v4, v5, Address(send, -32));
      __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
      __ stpq(v4, v5, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(s, 32));
      __ ldp(t6, t7, Address(s, 48));
      __ ldp(t8, t9, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(d, 32));
      __ stp(t6, t7, Address(d, 48));
      __ stp(t8, t9, Address(dend, -16));
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, 8/granularity);
    __ br(Assembler::LO, copy8);

    // 8..16 bytes: first and last 8 bytes, possibly overlapping.
    __ ldr(t0, Address(s, 0));
    __ ldr(t1, Address(send, -8));
    __ str(t0, Address(d, 0));
    __ str(t1, Address(dend, -8));
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This means in the 1 byte case we load/store the same
          // byte 3 times.
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }

    __ bind(copy_big);
    // Backwards copy starts from the far (high) end of both arrays.
    if (is_backwards) {
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now we've got the small case out of the way we can align the
    // source address on a 2-word boundary.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
      __ tbz(s, exact_log2(wordSize), aligned);
      __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
      __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
      __ sub(count, count, wordSize/granularity);
    } else {
      if (is_backwards) {
        __ andr(rscratch2, s, 2 * wordSize - 1);
      } else {
        __ neg(rscratch2, s);
        __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
      }
      // rscratch2 is the byte adjustment needed to align s.
      __ cbz(rscratch2, aligned);
      // Convert the byte adjustment to an element count.
      int shift = exact_log2(granularity);
      if (shift)  __ lsr(rscratch2, rscratch2, shift);
      __ sub(count, count, rscratch2);

#if 0
      // ?? This code is only correct for a disjoint copy.  It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, rscratch2);
        __ sub(d, d, rscratch2);
      } else {
        __ add(s, s, rscratch2);
        __ add(d, d, rscratch2);
      }
#else
      copy_memory_small(s, d, rscratch2, rscratch1, step);
#endif
    }

    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes.  Adjust the
    // count and do a bulk copy of words.
    __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
    if (direction == copy_forwards)
      __ bl(copy_f);
    else
      __ bl(copy_b);

    // And the tail.
    copy_memory_small(s, d, count, tmp, step);

    // For granularity >= 8 (>= 4) the labels copy8 (copy4) were not bound
    // in the small-count paths above; bind them here so the forward
    // branches to them simply fall through to finish.
    if (granularity >= 8) __ bind(copy8);
    if (granularity >= 4) __ bind(copy4);
    __ bind(finish);
  }


  // Debug-build helper: fill r3..r18 (except rscratch1) with the pattern
  // 0xdeadbeefdeadbeef so stale register contents are easy to spot.
  void clobber_registers() {
#ifdef ASSERT
    __ mov(rscratch1, (uint64_t)0xdeadbeef);
    __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
    for (Register r = r3; r <= r18; r++)
      if (r != rscratch1) __ mov(r, rscratch1);
#endif
  }

  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers rscratch1 and rscratch2.
1406 void verify_oop_array (size_t size, Register a, Register count, Register temp) { 1407 Label loop, end; 1408 __ mov(rscratch1, a); 1409 __ mov(rscratch2, zr); 1410 __ bind(loop); 1411 __ cmp(rscratch2, count); 1412 __ br(Assembler::HS, end); 1413 if (size == (size_t)wordSize) { 1414 __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1415 __ verify_oop(temp); 1416 } else { 1417 __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1418 __ decode_heap_oop(temp); // calls verify_oop 1419 } 1420 __ add(rscratch2, rscratch2, size); 1421 __ b(loop); 1422 __ bind(end); 1423 } 1424 1425 // Arguments: 1426 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1427 // ignored 1428 // is_oop - true => oop array, so generate store check code 1429 // name - stub name string 1430 // 1431 // Inputs: 1432 // c_rarg0 - source array address 1433 // c_rarg1 - destination array address 1434 // c_rarg2 - element count, treated as ssize_t, can be zero 1435 // 1436 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1437 // the hardware handle it. The two dwords within qwords that span 1438 // cache line boundaries will still be loaded and stored atomicly. 1439 // 1440 // Side Effects: 1441 // disjoint_int_copy_entry is set to the no-overlap entry point 1442 // used by generate_conjoint_int_oop_copy(). 
  //
  // Shared generator for all disjoint (non-overlapping) array-copy stubs.
  // Returns the stub's start address; if 'entry' is non-NULL it receives
  // the post-frame entry point for use by the conjoint variants.
  address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
                                 const char *name, bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_reg = RegSet::of(s, d, count);
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    if (is_oop) {
      gen_write_ref_array_pre_barrier(d, count, dest_uninitialized, saved_reg);
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    copy_memory(aligned, s, d, count, rscratch1, size);
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
      gen_write_ref_array_post_barrier(d, count, rscratch1, RegSet());
    }
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      // Register the generated stub with the simulator by name.
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.
  // The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Shared generator for all conjoint (possibly overlapping) array-copy
  // stubs.  Dispatches to 'nooverlap_target' when the operands do not in
  // fact overlap in the backward-copy-required sense.
  address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
                                 address *entry, const char *name,
                                 bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_regs = RegSet::of(s, d, count);
    // NOTE(review): unlike generate_disjoint_copy there is no
    // align(CodeEntryAlignment) before the mark here -- confirm intentional.
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // use fwd copy when (d-s) above_equal (count*size)
    __ sub(rscratch1, d, s);
    __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
    __ br(Assembler::HS, nooverlap_target);

    if (is_oop) {
      gen_write_ref_array_pre_barrier(d, count, dest_uninitialized, saved_regs);
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    // Negative step => copy_memory runs backwards (high to low).
    copy_memory(aligned, s, d, count, rscratch1, -size);
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
      gen_write_ref_array_post_barrier(d, count, rscratch1, RegSet());
    }
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      // Register the generated stub with the simulator by name.
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination
array address 1554 // c_rarg2 - element count, treated as ssize_t, can be zero 1555 // 1556 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1557 // we let the hardware handle it. The one to eight bytes within words, 1558 // dwords or qwords that span cache line boundaries will still be loaded 1559 // and stored atomically. 1560 // 1561 // Side Effects: 1562 // disjoint_byte_copy_entry is set to the no-overlap entry point // 1563 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1564 // we let the hardware handle it. The one to eight bytes within words, 1565 // dwords or qwords that span cache line boundaries will still be loaded 1566 // and stored atomically. 1567 // 1568 // Side Effects: 1569 // disjoint_byte_copy_entry is set to the no-overlap entry point 1570 // used by generate_conjoint_byte_copy(). 1571 // 1572 address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) { 1573 const bool not_oop = false; 1574 return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name); 1575 } 1576 1577 // Arguments: 1578 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1579 // ignored 1580 // name - stub name string 1581 // 1582 // Inputs: 1583 // c_rarg0 - source array address 1584 // c_rarg1 - destination array address 1585 // c_rarg2 - element count, treated as ssize_t, can be zero 1586 // 1587 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1588 // we let the hardware handle it. The one to eight bytes within words, 1589 // dwords or qwords that span cache line boundaries will still be loaded 1590 // and stored atomically. 
1591 // 1592 address generate_conjoint_byte_copy(bool aligned, address nooverlap_target, 1593 address* entry, const char *name) { 1594 const bool not_oop = false; 1595 return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name); 1596 } 1597 1598 // Arguments: 1599 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1600 // ignored 1601 // name - stub name string 1602 // 1603 // Inputs: 1604 // c_rarg0 - source array address 1605 // c_rarg1 - destination array address 1606 // c_rarg2 - element count, treated as ssize_t, can be zero 1607 // 1608 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1609 // let the hardware handle it. The two or four words within dwords 1610 // or qwords that span cache line boundaries will still be loaded 1611 // and stored atomically. 1612 // 1613 // Side Effects: 1614 // disjoint_short_copy_entry is set to the no-overlap entry point 1615 // used by generate_conjoint_short_copy(). 1616 // 1617 address generate_disjoint_short_copy(bool aligned, 1618 address* entry, const char *name) { 1619 const bool not_oop = false; 1620 return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name); 1621 } 1622 1623 // Arguments: 1624 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1625 // ignored 1626 // name - stub name string 1627 // 1628 // Inputs: 1629 // c_rarg0 - source array address 1630 // c_rarg1 - destination array address 1631 // c_rarg2 - element count, treated as ssize_t, can be zero 1632 // 1633 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1634 // let the hardware handle it. The two or four words within dwords 1635 // or qwords that span cache line boundaries will still be loaded 1636 // and stored atomically. 
1637 // 1638 address generate_conjoint_short_copy(bool aligned, address nooverlap_target, 1639 address *entry, const char *name) { 1640 const bool not_oop = false; 1641 return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name); 1642 1643 } 1644 // Arguments: 1645 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1646 // ignored 1647 // name - stub name string 1648 // 1649 // Inputs: 1650 // c_rarg0 - source array address 1651 // c_rarg1 - destination array address 1652 // c_rarg2 - element count, treated as ssize_t, can be zero 1653 // 1654 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1655 // the hardware handle it. The two dwords within qwords that span 1656 // cache line boundaries will still be loaded and stored atomicly. 1657 // 1658 // Side Effects: 1659 // disjoint_int_copy_entry is set to the no-overlap entry point 1660 // used by generate_conjoint_int_oop_copy(). 1661 // 1662 address generate_disjoint_int_copy(bool aligned, address *entry, 1663 const char *name, bool dest_uninitialized = false) { 1664 const bool not_oop = false; 1665 return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name); 1666 } 1667 1668 // Arguments: 1669 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1670 // ignored 1671 // name - stub name string 1672 // 1673 // Inputs: 1674 // c_rarg0 - source array address 1675 // c_rarg1 - destination array address 1676 // c_rarg2 - element count, treated as ssize_t, can be zero 1677 // 1678 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1679 // the hardware handle it. The two dwords within qwords that span 1680 // cache line boundaries will still be loaded and stored atomicly. 
1681 // 1682 address generate_conjoint_int_copy(bool aligned, address nooverlap_target, 1683 address *entry, const char *name, 1684 bool dest_uninitialized = false) { 1685 const bool not_oop = false; 1686 return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name); 1687 } 1688 1689 1690 // Arguments: 1691 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1692 // ignored 1693 // name - stub name string 1694 // 1695 // Inputs: 1696 // c_rarg0 - source array address 1697 // c_rarg1 - destination array address 1698 // c_rarg2 - element count, treated as size_t, can be zero 1699 // 1700 // Side Effects: 1701 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1702 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1703 // 1704 address generate_disjoint_long_copy(bool aligned, address *entry, 1705 const char *name, bool dest_uninitialized = false) { 1706 const bool not_oop = false; 1707 return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name); 1708 } 1709 1710 // Arguments: 1711 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1712 // ignored 1713 // name - stub name string 1714 // 1715 // Inputs: 1716 // c_rarg0 - source array address 1717 // c_rarg1 - destination array address 1718 // c_rarg2 - element count, treated as size_t, can be zero 1719 // 1720 address generate_conjoint_long_copy(bool aligned, 1721 address nooverlap_target, address *entry, 1722 const char *name, bool dest_uninitialized = false) { 1723 const bool not_oop = false; 1724 return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name); 1725 } 1726 1727 // Arguments: 1728 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1729 // ignored 1730 // name - stub name string 1731 // 1732 // Inputs: 1733 // c_rarg0 - source array address 1734 // c_rarg1 - destination array address 1735 // c_rarg2 - 
  //               element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_oop_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    // Element width follows the oop encoding in use.
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_oop_copy(bool aligned,
                                     address nooverlap_target, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    // Element width follows the oop encoding in use.
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
                                  name, dest_uninitialized);
  }


  // Helper for generating a dynamic type check.
  // Smashes rscratch1.
  // Branches to L_success if sub_klass is a subtype of super_klass;
  // falls through on failure.
  void generate_type_check(Register sub_klass,
                           Register super_check_offset,
                           Register super_klass,
                           Label& L_success) {
    assert_different_registers(sub_klass, super_check_offset, super_klass);

    BLOCK_COMMENT("type_check:");

    Label L_miss;

    // Fast path: cache/self checks; falls into the slow path on a miss
    // that is not a definite failure.
    __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL,
                                     super_check_offset);
    __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);

    // Fall through on failure!
    __ BIND(L_miss);
  }

  //
  //  Generate checkcasting array copy stub
  //
  //  Input:
  //    c_rarg0   - source array address
  //    c_rarg1   - destination array address
  //    c_rarg2   - element count, treated as ssize_t, can be zero
  //    c_rarg3   - size_t ckoff (super_check_offset)
  //    c_rarg4   - oop ckval (super_klass)
  //
  //  Output:
  //    r0 ==  0  -  success
  //    r0 == -1^K - failure, where K is partial transfer count
  //
  address generate_checkcast_copy(const char *name, address *entry,
                                  bool dest_uninitialized = false) {

    Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;

    // Input registers (after setup_arg_regs)
    const Register from        = c_rarg0;   // source array address
    const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // elements count
    const Register ckoff       = c_rarg3;   // super_check_offset
    const Register ckval       = c_rarg4;   // super_klass

    RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
    RegSet wb_post_saved_regs = RegSet::of(count);

    // Registers used as temps (r18, r19, r20 are save-on-entry)
    const Register count_save  = r21;       // orig elements count
    const Register start_to    = r20;       // destination array start address
    const Register copied_oop  = r18;       // actual oop copied
    const Register r19_klass   = r19;       // oop._klass

    //---------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the two arrays are subtypes of Object[] but the
    // destination array type is not equal to or a supertype
    // of the source type.  Each element must be separately
    // checked.

    assert_different_registers(from, to, count, ckoff, ckval, start_to,
                               copied_oop, r19_klass, count_save);

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef ASSERT
    // caller guarantees that the arrays really are different
    // otherwise, we would have to make conjoint checks
    { Label L;
      array_overlap_test(L, TIMES_OOP);
      __ stop("checkcast_copy within a single array");
      __ bind(L);
    }
#endif //ASSERT

    // Caller of this entry point must set up the argument registers.
    if (entry != NULL) {
      *entry = __ pc();
      BLOCK_COMMENT("Entry:");
    }

    // Empty array:  Nothing to do.
    __ cbz(count, L_done);

    __ push(RegSet::of(r18, r19, r20, r21), sp);

#ifdef ASSERT
    BLOCK_COMMENT("assert consistent ckoff/ckval");
    // The ckoff and ckval must be mutually consistent,
    // even though caller generates both.
    { Label L;
      int sco_offset = in_bytes(Klass::super_check_offset_offset());
      __ ldrw(start_to, Address(ckval, sco_offset));
      __ cmpw(ckoff, start_to);
      __ br(Assembler::EQ, L);
      __ stop("super_check_offset inconsistent");
      __ bind(L);
    }
#endif //ASSERT

    gen_write_ref_array_pre_barrier(to, count, dest_uninitialized, wb_pre_saved_regs);

    // save the original count
    __ mov(count_save, count);

    // Copy from low to high addresses
    __ mov(start_to, to);              // Save destination array start address
    __ b(L_load_element);

    // ======== begin loop ========
    // (Loop is rotated; its entry is L_load_element.)
    // Loop control:
    //   for (; count != 0; count--) {
    //     copied_oop = load_heap_oop(from++);
    //     ... generate_type_check ...;
    //     store_heap_oop(to++, copied_oop);
    //   }
    __ align(OptoLoopAlignment);

    __ BIND(L_store_element);
    __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop);  // store the oop
    __ sub(count, count, 1);
    __ cbz(count, L_do_card_marks);

    // ======== loop entry is here ========
    __ BIND(L_load_element);
    __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop
    // NULL elements need no type check; store them directly.
    __ cbz(copied_oop, L_store_element);

    __ load_klass(r19_klass, copied_oop);// query the object klass
    generate_type_check(r19_klass, ckoff, ckval, L_store_element);
    // ======== end loop ========

    // It was a real error; we must depend on the caller to finish the job.
    // Register count = remaining oops, count_orig = total oops.
    // Emit GC store barriers for the oops we have copied and report
    // their number to the caller.

    __ subs(count, count_save, count);     // K = partially copied oop count
    __ eon(count, count, zr);              // report (-1^K) to caller
    __ br(Assembler::EQ, L_done_pop);

    __ BIND(L_do_card_marks);
    __ add(to, to, -heapOopSize);         // make an inclusive end pointer
    gen_write_ref_array_post_barrier(start_to, to, rscratch1, wb_post_saved_regs);

    __ bind(L_done_pop);
    __ pop(RegSet::of(r18, r19, r20, r21), sp);
    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);

    __ bind(L_done);
    // r0 holds 0 on success or -1^K on partial transfer (see above).
    __ mov(r0, count);
    __ leave();
    __ ret(lr);

    return start;
  }

  // Perform range checks on the proposed arraycopy.
  // Kills temp, but nothing else.
  // Also, clean the sign bits of src_pos and dst_pos.
  // Bounds-check src_pos+length and dst_pos+length against the array
  // lengths, branching to L_failed on any violation; then zero-extend
  // src_pos/dst_pos so their high 32 bits are clean for 64-bit addressing.
  void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
                              Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
                              Register dst_pos, // destination position (c_rarg3)
                              Register length,
                              Register temp,
                              Label& L_failed) {
    BLOCK_COMMENT("arraycopy_range_checks:");

    assert_different_registers(rscratch1, temp);

    //  if (src_pos + length > arrayOop(src)->length())  FAIL;
    __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, src_pos);
    // Unsigned-higher compare: a 32-bit overflow of the sum also fails.
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
    __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, dst_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
    // A 32-bit mov of a register onto itself zero-extends into the
    // upper half of the X register.
    __ movw(src_pos, src_pos);
    __ movw(dst_pos, dst_pos);

    BLOCK_COMMENT("arraycopy_range_checks done");
  }

  // These stubs get called from some dumb test routine.
  // I'll write them properly when they're called from
  // something that's actually doing something.
  static void fake_arraycopy_stub(address src, address dst, int count) {
    // Placeholder: only ever invoked with a zero element count.
    assert(count == 0, "huh?");
  }


  //
  //  Generate 'unsafe' array copy stub
  //  Though just as safe as the other stubs, it takes an unscaled
  //  size_t argument instead of an element count.
  //
  //  Input:
  //    c_rarg0   - source array address
  //    c_rarg1   - destination array address
  //    c_rarg2   - byte count, treated as ssize_t, can be zero
  //
  //  Examines the alignment of the operands and dispatches
  //  to a long, int, short, or byte copy loop.
  //
  address generate_unsafe_copy(const char *name,
                               address byte_copy_entry,
                               address short_copy_entry,
                               address int_copy_entry,
                               address long_copy_entry) {
    Label L_long_aligned, L_int_aligned, L_short_aligned;
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // bump this on entry, not on exit:
    inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);

    // OR source, destination and byte count together: the combined value
    // has a low bit set wherever ANY of the three is misaligned, so one
    // mask test per granularity suffices.
    __ orr(rscratch1, s, d);
    __ orr(rscratch1, rscratch1, count);

    __ andr(rscratch1, rscratch1, BytesPerLong-1);
    __ cbz(rscratch1, L_long_aligned);
    __ andr(rscratch1, rscratch1, BytesPerInt-1);
    __ cbz(rscratch1, L_int_aligned);
    __ tbz(rscratch1, 0, L_short_aligned);
    __ b(RuntimeAddress(byte_copy_entry));

    // Convert the raw byte count to an element count for the target stub.
    __ BIND(L_short_aligned);
    __ lsr(count, count, LogBytesPerShort);  // size => short_count
    __ b(RuntimeAddress(short_copy_entry));
    __ BIND(L_int_aligned);
    __ lsr(count, count, LogBytesPerInt);    // size => int_count
    __ b(RuntimeAddress(int_copy_entry));
    __ BIND(L_long_aligned);
    __ lsr(count, count, LogBytesPerLong);   // size => long_count
    __ b(RuntimeAddress(long_copy_entry));

    return start;
  }

  //
  //  Generate generic array copy stubs
  //
  //  Input:
  //    c_rarg0    -  src oop
  //    c_rarg1    -  src_pos (32-bits)
  //    c_rarg2    -  dst oop
  //    c_rarg3    -  dst_pos (32-bits)
  //    c_rarg4    -  element count (32-bits)
  //
  //  Output:
  //    r0 ==  0  -  success
  //    r0 == -1^K - failure, where K is partial transfer count
  //
  address generate_generic_copy(const char *name,
                                address byte_copy_entry, address short_copy_entry,
                                address int_copy_entry, address oop_copy_entry,
                                address long_copy_entry, address checkcast_copy_entry) {

    Label L_failed, L_failed_0, L_objArray;
    Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;

    // Input registers
    const Register src      = c_rarg0;  // source array oop
    const Register src_pos  = c_rarg1;  // source position
    const Register dst      = c_rarg2;  // destination array oop
    const Register dst_pos  = c_rarg3;  // destination position
    const Register length   = c_rarg4;

    StubCodeMark mark(this, "StubRoutines", name);

    __ align(CodeEntryAlignment);
    address start = __ pc();

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // bump this on entry, not on exit:
    inc_counter_np(SharedRuntime::_generic_array_copy_ctr);

    //-----------------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the following conditions are met:
    //
    // (1) src and dst must not be null.
    // (2) src_pos must not be negative.
    // (3) dst_pos must not be negative.
    // (4) length  must not be negative.
    // (5) src klass and dst klass should be the same and not NULL.
    // (6) src and dst should be arrays.
    // (7) src_pos + length must not exceed length of src.
    // (8) dst_pos + length must not exceed length of dst.
    //

    //  if (src == NULL) return -1;
    __ cbz(src, L_failed);

    //  if (src_pos < 0) return -1;
    __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set

    //  if (dst == NULL) return -1;
    __ cbz(dst, L_failed);

    //  if (dst_pos < 0) return -1;
    __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set

    // registers used as temp
    const Register scratch_length    = r16; // elements count to copy
    const Register scratch_src_klass = r17; // array klass
    const Register lh                = r18; // layout helper

    //  if (length < 0) return -1;
    __ movw(scratch_length, length);        // length (elements count, 32-bits value)
    __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set

    __ load_klass(scratch_src_klass, src);
#ifdef ASSERT
    //  assert(src->klass() != NULL);
    {
      BLOCK_COMMENT("assert klasses not null {");
      Label L1, L2;
      __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
      __ bind(L1);
      __ stop("broken null klass");
      __ bind(L2);
      __ load_klass(rscratch1, dst);
      __ cbz(rscratch1, L1);     // this would be broken also
      BLOCK_COMMENT("} assert klasses not null done");
    }
#endif

    // Load layout helper (32-bits)
    //
    //  |array_tag|     | header_size | element_type |     |log2_element_size|
    // 32        30    24            16              8     2                 0
    //
    //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
    //

    const int lh_offset = in_bytes(Klass::layout_helper_offset());

    // Handle objArrays completely differently...
    const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
    __ ldrw(lh, Address(scratch_src_klass, lh_offset));
    __ movw(rscratch1, objArray_lh);
    // XOR-compare: result is zero iff the layout helpers match exactly.
    __ eorw(rscratch2, lh, rscratch1);
    __ cbzw(rscratch2, L_objArray);

    //  if (src->klass() != dst->klass()) return -1;
    __ load_klass(rscratch2, dst);
    __ eor(rscratch2, rscratch2, scratch_src_klass);
    __ cbnz(rscratch2, L_failed);

    //  if (!src->is_Array()) return -1;
    // Array layout helpers are negative (array_tag in the top bits).
    __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)

    // At this point, it is known to be a typeArray (array_tag 0x3).
#ifdef ASSERT
    {
      BLOCK_COMMENT("assert primitive array {");
      Label L;
      __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
      __ cmpw(lh, rscratch2);
      __ br(Assembler::GE, L);
      __ stop("must be a primitive array");
      __ bind(L);
      BLOCK_COMMENT("} assert primitive array done");
    }
#endif

    arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
                           rscratch2, L_failed);

    // TypeArrayKlass
    //
    // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
    // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
    //

    const Register rscratch1_offset = rscratch1;    // array offset
    const Register r18_elsize = lh; // element size

    // Extract the header size field of the layout helper and add it to
    // both oops so src/dst now point at element 0.
    __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
           exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
    __ add(src, src, rscratch1_offset);           // src array offset
    __ add(dst, dst, rscratch1_offset);           // dst array offset
    BLOCK_COMMENT("choose copy loop based on element size");

    // next registers should be set before the jump to corresponding stub
    const Register from     = c_rarg0;  // source array address
    const Register to       = c_rarg1;  // destination array address
    const Register count    = c_rarg2;  // elements count

    // 'from', 'to', 'count' registers should be set in such order
    // since they are the same as 'src', 'src_pos', 'dst'.

    assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");

    // The possible values of elsize are 0-3, i.e. exact_log2(element
    // size in bytes).  We do a simple bitwise binary search.
  __ BIND(L_copy_bytes);
    __ tbnz(r18_elsize, 1, L_copy_ints);
    __ tbnz(r18_elsize, 0, L_copy_shorts);
    __ lea(from, Address(src, src_pos));// src_addr
    __ lea(to,   Address(dst, dst_pos));// dst_addr
    __ movw(count, scratch_length); // length
    __ b(RuntimeAddress(byte_copy_entry));

  __ BIND(L_copy_shorts);
    __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
    __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
    __ movw(count, scratch_length); // length
    __ b(RuntimeAddress(short_copy_entry));

  __ BIND(L_copy_ints);
    __ tbnz(r18_elsize, 0, L_copy_longs);
    __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
    __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
    __ movw(count, scratch_length); // length
    __ b(RuntimeAddress(int_copy_entry));

  __ BIND(L_copy_longs);
#ifdef ASSERT
    {
      BLOCK_COMMENT("assert long copy {");
      Label L;
      __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
      __ cmpw(r18_elsize, LogBytesPerLong);
      __ br(Assembler::EQ, L);
      __ stop("must be long copy, but elsize is wrong");
      __ bind(L);
      BLOCK_COMMENT("} assert long copy done");
    }
#endif
    __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
    __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
    __ movw(count, scratch_length); // length
    __ b(RuntimeAddress(long_copy_entry));

    // ObjArrayKlass
  __ BIND(L_objArray);
    // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]

    Label L_plain_copy, L_checkcast_copy;
    // test array classes for subtyping
    __ load_klass(r18, dst);
    __ cmp(scratch_src_klass, r18); // usual case is exact equality
    __ br(Assembler::NE, L_checkcast_copy);

    // Identically typed arrays can be copied without element-wise checks.
    arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
                           rscratch2, L_failed);

    __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
    __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
    __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
    __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
    __ movw(count, scratch_length); // length
  __ BIND(L_plain_copy);
    __ b(RuntimeAddress(oop_copy_entry));

  __ BIND(L_checkcast_copy);
    // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
    {
      // Before looking at dst.length, make sure dst is also an objArray.
      __ ldrw(rscratch1, Address(r18, lh_offset));
      __ movw(rscratch2, objArray_lh);
      __ eorw(rscratch1, rscratch1, rscratch2);
      __ cbnzw(rscratch1, L_failed);

      // It is safe to examine both src.length and dst.length.
      arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
                             r18, L_failed);

      const Register rscratch2_dst_klass = rscratch2;
      __ load_klass(rscratch2_dst_klass, dst); // reload

      // Marshal the base address arguments now, freeing registers.
      __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
      __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
      __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
      __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
      __ movw(count, length);           // length (reloaded)
      Register sco_temp = c_rarg3;      // this register is free now
      assert_different_registers(from, to, count, sco_temp,
                                 rscratch2_dst_klass, scratch_src_klass);
      // assert_clean_int(count, sco_temp);

      // Generate the type check.
      const int sco_offset = in_bytes(Klass::super_check_offset_offset());
      __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
      // assert_clean_int(sco_temp, r18);
      // Identical or subtype klasses take the fast plain-copy path.
      generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy);

      // Fetch destination element klass from the ObjArrayKlass header.
      int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
      __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset));
      __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));

      // the checkcast_copy loop needs two extra arguments:
      assert(c_rarg3 == sco_temp, "#3 already in place");
      // Set up arguments for checkcast_copy_entry.
      __ mov(c_rarg4, rscratch2_dst_klass);  // dst.klass.element_klass
      __ b(RuntimeAddress(checkcast_copy_entry));
    }

  __ BIND(L_failed);
    __ mov(r0, -1);
    __ leave();   // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }

  //
  //  Generate stub for array fill. If "aligned" is true, the
  //  "to" address is assumed to be heapword aligned.
  //
  //  Arguments for generated stub:
  //    to:    c_rarg0
  //    value: c_rarg1
  //    count: c_rarg2 treated as signed
  //
  // Fills 'count' elements of type 't' with 'value'.  The fill value is
  // replicated across a 64-bit register so large runs are written a word
  // (or a zero-block) at a time; short arrays fall through to an
  // element-at-a-time path.
  address generate_fill(BasicType t, bool aligned, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    BLOCK_COMMENT("Entry:");

    const Register to        = c_rarg0;  // source array address
    const Register value     = c_rarg1;  // value
    const Register count     = c_rarg2;  // elements count

    const Register bz_base = r10;        // base for block_zero routine
    const Register cnt_words = r11;      // temp register

    __ enter();

    // NOTE(review): this outer L_exit1 is shadowed by a local Label of the
    // same name further down; the outer one appears unused.
    Label L_fill_elements, L_exit1;

    int shift = -1;
    switch (t) {
      case T_BYTE:
        shift = 0;
        // 'bfi' doubles the value's width each time: replicate the byte
        // into all four bytes of the W register.
        __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
        __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
        __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
        __ br(Assembler::LO, L_fill_elements);
        break;
      case T_SHORT:
        shift = 1;
        __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
        __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
        __ br(Assembler::LO, L_fill_elements);
        break;
      case T_INT:
        shift = 2;
        __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
        __ br(Assembler::LO, L_fill_elements);
        break;
      default: ShouldNotReachHere();
    }

    // Align source address at 8 bytes address boundary.
    Label L_skip_align1, L_skip_align2, L_skip_align4;
    if (!aligned) {
      switch (t) {
        case T_BYTE:
          // One byte misalignment happens only for byte arrays.
          __ tbz(to, 0, L_skip_align1);
          __ strb(value, Address(__ post(to, 1)));
          __ subw(count, count, 1);
          __ bind(L_skip_align1);
          // Fallthrough
        case T_SHORT:
          // Two bytes misalignment happens only for byte and short (char) arrays.
          __ tbz(to, 1, L_skip_align2);
          __ strh(value, Address(__ post(to, 2)));
          __ subw(count, count, 2 >> shift);
          __ bind(L_skip_align2);
          // Fallthrough
        case T_INT:
          // Align to 8 bytes, we know we are 4 byte aligned to start.
          __ tbz(to, 2, L_skip_align4);
          __ strw(value, Address(__ post(to, 4)));
          __ subw(count, count, 4 >> shift);
          __ bind(L_skip_align4);
          break;
        default: ShouldNotReachHere();
      }
    }

    //
    //  Fill large chunks
    //
    __ lsrw(cnt_words, count, 3 - shift); // number of words
    __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
    // count -= cnt_words * (8 >> shift): leftover elements after the words.
    __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
    if (UseBlockZeroing) {
      Label non_block_zeroing, rest;
      // If the fill value is zero we can use the fast zero_words().
      __ cbnz(value, non_block_zeroing);
      __ mov(bz_base, to);
      __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
      __ zero_words(bz_base, cnt_words);
      __ b(rest);
      __ bind(non_block_zeroing);
      __ fill_words(to, cnt_words, value);
      __ bind(rest);
    } else {
      __ fill_words(to, cnt_words, value);
    }

    // Remaining count is less than 8 bytes. Fill it by a single store.
    // Note that the total length is no less than 8 bytes.
    if (t == T_BYTE || t == T_SHORT) {
      Label L_exit1;
      __ cbzw(count, L_exit1);
      __ add(to, to, count, Assembler::LSL, shift); // points to the end
      // A backwards 8-byte store re-writes a few already-filled elements,
      // which is harmless since they hold the same value.
      __ str(value, Address(to, -8));    // overwrite some elements
      __ bind(L_exit1);
      __ leave();
      __ ret(lr);
    }

    // Handle copies less than 8 bytes.
    // Element-at-a-time fill: test each bit of 'count' and store the
    // corresponding power-of-two number of bytes.
    Label L_fill_2, L_fill_4, L_exit2;
    __ bind(L_fill_elements);
    switch (t) {
      case T_BYTE:
        __ tbz(count, 0, L_fill_2);
        __ strb(value, Address(__ post(to, 1)));
        __ bind(L_fill_2);
        __ tbz(count, 1, L_fill_4);
        __ strh(value, Address(__ post(to, 2)));
        __ bind(L_fill_4);
        __ tbz(count, 2, L_exit2);
        __ strw(value, Address(to));
        break;
      case T_SHORT:
        __ tbz(count, 0, L_fill_4);
        __ strh(value, Address(__ post(to, 2)));
        __ bind(L_fill_4);
        __ tbz(count, 1, L_exit2);
        __ strw(value, Address(to));
        break;
      case T_INT:
        __ cbzw(count, L_exit2);
        __ strw(value, Address(to));
        break;
      default: ShouldNotReachHere();
    }
    __ bind(L_exit2);
    __ leave();
    __ ret(lr);
    return start;
  }

  // Generate and register every arraycopy/fill stub in StubRoutines.
  // Fast entry points captured into the entry_* locals are reused by the
  // unsafe and generic dispatch stubs generated at the end.
  void generate_arraycopy_stubs() {
    address entry;
    address entry_jbyte_arraycopy;
    address entry_jshort_arraycopy;
    address entry_jint_arraycopy;
    address entry_oop_arraycopy;
    address entry_jlong_arraycopy;
    address entry_checkcast_arraycopy;

    generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
    generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);

    StubRoutines::aarch64::_zero_blocks = generate_zero_blocks();

    //*** jbyte
    // Always need aligned and unaligned versions
    StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
                                                                                  "jbyte_disjoint_arraycopy");
    StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
                                                                                  &entry_jbyte_arraycopy,
                                                                                  "jbyte_arraycopy");
    StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
                                                                                  "arrayof_jbyte_disjoint_arraycopy");
    StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
                                                                                  "arrayof_jbyte_arraycopy");

    //*** jshort
    // Always need aligned and unaligned versions
    StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
                                                                                    "jshort_disjoint_arraycopy");
    StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
                                                                                    &entry_jshort_arraycopy,
                                                                                    "jshort_arraycopy");
    StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
                                                                                    "arrayof_jshort_disjoint_arraycopy");
    StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
                                                                                    "arrayof_jshort_arraycopy");

    //*** jint
    // Aligned versions
    StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
                                                                                "arrayof_jint_disjoint_arraycopy");
    StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
                                                                                "arrayof_jint_arraycopy");
    // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
    // entry_jint_arraycopy always points to the unaligned version
    StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
                                                                                "jint_disjoint_arraycopy");
    StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
                                                                                &entry_jint_arraycopy,
                                                                                "jint_arraycopy");

    //*** jlong
    // It is always aligned
    StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
                                                                                  "arrayof_jlong_disjoint_arraycopy");
    StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
                                                                                  "arrayof_jlong_arraycopy");
    // 8-byte elements are always 8-byte aligned, so the unaligned slots
    // simply alias the aligned stubs.
    StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
    StubRoutines::_jlong_arraycopy          = StubRoutines::_arrayof_jlong_arraycopy;

    //*** oops
    {
      // With compressed oops we need unaligned versions; notice that
      // we overwrite entry_oop_arraycopy.
      bool aligned = !UseCompressedOops;

      StubRoutines::_arrayof_oop_disjoint_arraycopy
        = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
                                     /*dest_uninitialized*/false);
      StubRoutines::_arrayof_oop_arraycopy
        = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
                                     /*dest_uninitialized*/false);
      // Aligned versions without pre-barriers
      StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
        = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
                                     /*dest_uninitialized*/true);
      StubRoutines::_arrayof_oop_arraycopy_uninit
        = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
                                     /*dest_uninitialized*/true);
    }

    StubRoutines::_oop_disjoint_arraycopy        = StubRoutines::_arrayof_oop_disjoint_arraycopy;
    StubRoutines::_oop_arraycopy                 = StubRoutines::_arrayof_oop_arraycopy;
    StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
    StubRoutines::_oop_arraycopy_uninit          = StubRoutines::_arrayof_oop_arraycopy_uninit;

    StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
    StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
                                                                        /*dest_uninitialized*/true);

    // Dispatch stubs built on top of the fast entry points captured above.
    StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
                                                              entry_jbyte_arraycopy,
                                                              entry_jshort_arraycopy,
                                                              entry_jint_arraycopy,
                                                              entry_jlong_arraycopy);

    StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
                                                               entry_jbyte_arraycopy,
                                                               entry_jshort_arraycopy,
                                                               entry_jint_arraycopy,
                                                               entry_oop_arraycopy,
                                                               entry_jlong_arraycopy,
                                                               entry_checkcast_arraycopy);

    StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
    StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
    StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
    StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
    StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
    StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
  }

  // No specialized math stubs on this platform.
  void generate_math_stubs() { Unimplemented(); }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0 - source byte array address
  //   c_rarg1 - destination byte array address
  //   c_rarg2 - K (key) in little endian int array
  //
  // Encrypts one 16-byte block with the expanded key using the AESE/AESMC
  // hardware instructions.  The number of rounds is selected from the
  // expanded key length read out of the key array header.
  // NOTE(review): unlike the sibling decrypt/CBC stubs, this one has no
  // assert(UseAES, ...) guard — confirm whether that is intentional.
  address generate_aescrypt_encryptBlock() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");

    Label L_doLast;

    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register keylen      = rscratch1;

    address start = __ pc();
    __ enter();

    // Expanded-key length in ints, read from the int[] array's length field.
    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ ld1(v0, __ T16B, from); // get 16 bytes of input

    // Load round keys four at a time, byte-reversing each 32-bit word
    // (the key array holds little-endian ints).
    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);
    __ rev32(v3, __ T16B, v3);
    __ rev32(v4, __ T16B, v4);
    __ aese(v0, v1);
    __ aesmc(v0, v0);
    __ aese(v0, v2);
    __ aesmc(v0, v0);
    __ aese(v0, v3);
    __ aesmc(v0, v0);
    __ aese(v0, v4);
    __ aesmc(v0, v0);

    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);
    __ rev32(v3, __ T16B, v3);
    __ rev32(v4, __ T16B, v4);
    __ aese(v0, v1);
    __ aesmc(v0, v0);
    __ aese(v0, v2);
    __ aesmc(v0, v0);
    __ aese(v0, v3);
    __ aesmc(v0,
v0);
    __ aese(v0, v4);
    __ aesmc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    // keylen == 44 ints: shortest expanded key — only the final rounds remain.
    __ cmpw(keylen, 44);
    __ br(Assembler::EQ, L_doLast);

    __ aese(v0, v1);
    __ aesmc(v0, v0);
    __ aese(v0, v2);
    __ aesmc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    __ cmpw(keylen, 52);
    __ br(Assembler::EQ, L_doLast);

    __ aese(v0, v1);
    __ aesmc(v0, v0);
    __ aese(v0, v2);
    __ aesmc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    __ BIND(L_doLast);

    // Last round: no MixColumns after the final AESE.
    __ aese(v0, v1);
    __ aesmc(v0, v0);
    __ aese(v0, v2);

    __ ld1(v1, __ T16B, key);
    __ rev32(v1, __ T16B, v1);
    __ eor(v0, __ T16B, v0, v1);

    __ st1(v0, __ T16B, to);

    // Return 0 for success.
    __ mov(r0, 0);

    __ leave();
    __ ret(lr);

    return start;
  }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0 - source byte array address
  //   c_rarg1 - destination byte array address
  //   c_rarg2 - K (key) in little endian int array
  //
  // Decrypts one 16-byte block using the AESD/AESIMC hardware instructions,
  // mirroring generate_aescrypt_encryptBlock above.
  address generate_aescrypt_decryptBlock() {
    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
    Label L_doLast;

    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register keylen      = rscratch1;

    address start = __ pc();
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // Expanded-key length in ints, read from the int[] array's length field.
    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ ld1(v0, __ T16B,
from); // get 16 bytes of input

    // v5 holds the first round key, applied last (decryption order).
    __ ld1(v5, __ T16B, __ post(key, 16));
    __ rev32(v5, __ T16B, v5);

    // Load round keys four at a time, byte-reversing each 32-bit word
    // (the key array holds little-endian ints).
    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);
    __ rev32(v3, __ T16B, v3);
    __ rev32(v4, __ T16B, v4);
    __ aesd(v0, v1);
    __ aesimc(v0, v0);
    __ aesd(v0, v2);
    __ aesimc(v0, v0);
    __ aesd(v0, v3);
    __ aesimc(v0, v0);
    __ aesd(v0, v4);
    __ aesimc(v0, v0);

    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);
    __ rev32(v3, __ T16B, v3);
    __ rev32(v4, __ T16B, v4);
    __ aesd(v0, v1);
    __ aesimc(v0, v0);
    __ aesd(v0, v2);
    __ aesimc(v0, v0);
    __ aesd(v0, v3);
    __ aesimc(v0, v0);
    __ aesd(v0, v4);
    __ aesimc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    // keylen == 44 ints: shortest expanded key — only the final rounds remain.
    __ cmpw(keylen, 44);
    __ br(Assembler::EQ, L_doLast);

    __ aesd(v0, v1);
    __ aesimc(v0, v0);
    __ aesd(v0, v2);
    __ aesimc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    __ cmpw(keylen, 52);
    __ br(Assembler::EQ, L_doLast);

    __ aesd(v0, v1);
    __ aesimc(v0, v0);
    __ aesd(v0, v2);
    __ aesimc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    __ BIND(L_doLast);

    // Last round: no InvMixColumns after the final AESD.
    __ aesd(v0, v1);
    __ aesimc(v0, v0);
    __ aesd(v0, v2);

    // Final whitening with the first round key saved in v5.
    __ eor(v0, __ T16B, v0, v5);

    __ st1(v0, __ T16B, to);

    // Return 0 for success.
    __ mov(r0, 0);

    __ leave();
    __ ret(lr);

    return start;
  }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0 - source byte array address
  //   c_rarg1 - destination byte array address
  //   c_rarg2 - K (key) in little endian int
  // array
  //   c_rarg3   - r vector byte array address
  //   c_rarg4   - input length
  //
  // Output:
  //   x0        - input length
  //
  // CBC encryption: each plaintext block is XORed with the previous
  // ciphertext block (initially the IV from rvec) before being encrypted,
  // so blocks are necessarily processed serially in a single loop.
  address generate_cipherBlockChaining_encryptAESCrypt() {
    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");

    Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;

    const Register from    = c_rarg0;  // source array address
    const Register to      = c_rarg1;  // destination array address
    const Register key     = c_rarg2;  // key array address
    const Register rvec    = c_rarg3;  // r byte array initialized from initvector array address
                                       // and left with the results of the last encryption block
    const Register len_reg = c_rarg4;  // src len (must be multiple of blocksize 16)
    const Register keylen  = rscratch1;

    address start = __ pc();

    __ enter();

    // Preserve the byte count so it can be returned in r0 at the end.
    __ movw(rscratch2, len_reg);

    // Number of ints in the expanded key: 44/52/60 for AES-128/192/256.
    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    // v0 starts out as the IV.
    __ ld1(v0, __ T16B, rvec);

    // Load the round keys into v17..v31; how many are needed depends on
    // the key length.  Keys are stored big-endian, hence the rev32 after
    // each load.  NOTE: the condition flags set by this cmpw are still
    // live at the two branches inside L_aes_loop below -- nothing in the
    // loop body sets the flags.
    __ cmpw(keylen, 52);
    __ br(Assembler::CC, L_loadkeys_44);
    __ br(Assembler::EQ, L_loadkeys_52);

    __ ld1(v17, v18, __ T16B, __ post(key, 32));
    __ rev32(v17, __ T16B, v17);
    __ rev32(v18, __ T16B, v18);
    __ BIND(L_loadkeys_52);
    __ ld1(v19, v20, __ T16B, __ post(key, 32));
    __ rev32(v19, __ T16B, v19);
    __ rev32(v20, __ T16B, v20);
    __ BIND(L_loadkeys_44);
    __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
    __ rev32(v21, __ T16B, v21);
    __ rev32(v22, __ T16B, v22);
    __ rev32(v23, __ T16B, v23);
    __ rev32(v24, __ T16B, v24);
    __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
    __ rev32(v25, __ T16B, v25);
    __ rev32(v26, __ T16B, v26);
    __ rev32(v27, __ T16B, v27);
    __ rev32(v28, __ T16B, v28);
    __ ld1(v29, v30, v31, __ T16B, key);
    __ rev32(v29, __ T16B, v29);
    __ rev32(v30, __ T16B, v30);
    __ rev32(v31, __ T16B, v31);

    __ BIND(L_aes_loop);
    __ ld1(v1, __ T16B, __ post(from, 16));   // next plaintext block
    __ eor(v0, __ T16B, v0, v1);              // XOR with previous ciphertext (or IV)

    // Branch on the key length using the flags from the cmpw above.
    __ br(Assembler::CC, L_rounds_44);
    __ br(Assembler::EQ, L_rounds_52);

    __ aese(v0, v17); __ aesmc(v0, v0);
    __ aese(v0, v18); __ aesmc(v0, v0);
    __ BIND(L_rounds_52);
    __ aese(v0, v19); __ aesmc(v0, v0);
    __ aese(v0, v20); __ aesmc(v0, v0);
    __ BIND(L_rounds_44);
    __ aese(v0, v21); __ aesmc(v0, v0);
    __ aese(v0, v22); __ aesmc(v0, v0);
    __ aese(v0, v23); __ aesmc(v0, v0);
    __ aese(v0, v24); __ aesmc(v0, v0);
    __ aese(v0, v25); __ aesmc(v0, v0);
    __ aese(v0, v26); __ aesmc(v0, v0);
    __ aese(v0, v27); __ aesmc(v0, v0);
    __ aese(v0, v28); __ aesmc(v0, v0);
    __ aese(v0, v29); __ aesmc(v0, v0);
    __ aese(v0, v30);
    __ eor(v0, __ T16B, v0, v31);             // final round: plain AddRoundKey

    __ st1(v0, __ T16B, __ post(to, 16));     // store ciphertext block

    __ subw(len_reg, len_reg, 16);            // subw: does not touch the flags
    __ cbnzw(len_reg, L_aes_loop);

    // Leave the last ciphertext block in rvec for chained invocations.
    __ st1(v0, __ T16B, rvec);

    __ mov(r0, rscratch2);                    // return the input length

    __ leave();
    __ ret(lr);

    return start;
  }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //   c_rarg3   - r vector byte array address
  //   c_rarg4   - input length
  //
  // Output:
  //   r0        - input length
  //
  address generate_cipherBlockChaining_decryptAESCrypt() {
    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");

    Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;

    const Register from    = c_rarg0;  // source array address
    const Register to      = c_rarg1;  // destination array address
    const Register key     = c_rarg2;  // key array address
    const Register rvec    = c_rarg3;  // r byte array initialized from initvector array address
                                       // and left with the results of the last encryption block
    const Register len_reg = c_rarg4;  // src len (must be multiple of blocksize 16)
    const Register keylen  = rscratch1;

    address start = __ pc();

    __ enter();

    // Preserve the byte count for the return value.
    __ movw(rscratch2, len_reg);

    // Number of ints in the expanded key: 44/52/60 for AES-128/192/256.
    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    // v2 holds the previous ciphertext block (initially the IV).
    __ ld1(v2, __ T16B, rvec);

    // First key goes in v31 (applied by the final eor); like all round
    // keys it is stored big-endian and is byte-reversed after loading.
    __ ld1(v31, __ T16B, __ post(key, 16));
    __ rev32(v31, __ T16B, v31);

    // Load the remaining round keys into v17..v30.  NOTE: the flags set
    // by this cmpw are still live at the branches inside L_aes_loop
    // below -- nothing in the loop body sets the condition flags.
    __ cmpw(keylen, 52);
    __ br(Assembler::CC, L_loadkeys_44);
    __ br(Assembler::EQ, L_loadkeys_52);

    __ ld1(v17, v18, __ T16B, __ post(key, 32));
    __ rev32(v17, __ T16B, v17);
    __ rev32(v18, __ T16B, v18);
    __ BIND(L_loadkeys_52);
    __ ld1(v19, v20, __ T16B, __ post(key, 32));
    __ rev32(v19, __ T16B, v19);
    __ rev32(v20, __ T16B, v20);
    __ BIND(L_loadkeys_44);
    __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
    __ rev32(v21, __ T16B, v21);
    __ rev32(v22, __ T16B, v22);
    __ rev32(v23, __ T16B, v23);
    __ rev32(v24, __ T16B, v24);
    __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
    __ rev32(v25, __ T16B, v25);
    __ rev32(v26, __ T16B, v26);
    __ rev32(v27, __ T16B, v27);
    __ rev32(v28, __ T16B, v28);
    __ ld1(v29, v30, __ T16B, key);
    __ rev32(v29, __ T16B, v29);
    __ rev32(v30, __ T16B, v30);

    __ BIND(L_aes_loop);
    __ ld1(v0, __ T16B, __ post(from, 16));   // next ciphertext block
    __ orr(v1, __ T16B, v0, v0);              // keep a copy: it is next round's chain value

    // Branch on the key length using the flags from the cmpw above.
    __ br(Assembler::CC, L_rounds_44);
    __ br(Assembler::EQ, L_rounds_52);

    __ aesd(v0, v17); __ aesimc(v0, v0);
    __ aesd(v0, v18); __ aesimc(v0, v0);
    __ BIND(L_rounds_52);
    __ aesd(v0, v19); __ aesimc(v0, v0);
    __ aesd(v0, v20); __ aesimc(v0, v0);
    __ BIND(L_rounds_44);
    __ aesd(v0, v21); __ aesimc(v0, v0);
    __ aesd(v0, v22); __ aesimc(v0, v0);
    __ aesd(v0, v23); __ aesimc(v0, v0);
    __ aesd(v0, v24); __ aesimc(v0, v0);
    __ aesd(v0, v25); __ aesimc(v0, v0);
    __ aesd(v0, v26); __ aesimc(v0, v0);
    __ aesd(v0, v27); __ aesimc(v0, v0);
    __ aesd(v0, v28); __ aesimc(v0, v0);
    __ aesd(v0, v29); __ aesimc(v0, v0);
    __ aesd(v0, v30);
    __ eor(v0, __ T16B, v0, v31);             // final AddRoundKey
    __ eor(v0, __ T16B, v0, v2);              // un-chain: XOR previous ciphertext (or IV)

    __ st1(v0, __ T16B, __ post(to, 16));     // store plaintext block
    __ orr(v2, __ T16B, v1, v1);              // saved ciphertext becomes the chain value

    __ subw(len_reg, len_reg, 16);            // subw: does not touch the flags
    __ cbnzw(len_reg, L_aes_loop);

    // Leave the last ciphertext block in rvec for chained invocations.
    __ st1(v2, __ T16B, rvec);

    __ mov(r0, rscratch2);                    // return the input length

    __ leave();
    __ ret(lr);

    return start;
  }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - byte[]  source+offset
  //   c_rarg1   - int[]   SHA.state
  //   c_rarg2   - int     offset
  //   c_rarg3   - int     limit
  //
  address generate_sha1_implCompress(bool multi_block, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Register buf   = c_rarg0;
    Register state = c_rarg1;
    Register ofs   = c_rarg2;
    Register limit = c_rarg3;

    Label keys;
    Label sha1_loop;

    // load the keys into v0..v3
    __ adr(rscratch1, keys);
    __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
    // load 5 words state into v6, v7
    __ ldrq(v6, Address(state, 0));
    __ ldrs(v7, Address(state, 16));

    __ BIND(sha1_loop);
    // load 64 bytes of data into v16..v19
    __ ld1(v16, v17, v18, v19, __ T4S, multi_block ?
    __ post(buf, 64) : buf);               // advance buf only in multi-block mode
    __ rev32(v16, __ T16B, v16);           // message words are big-endian
    __ rev32(v17, __ T16B, v17);
    __ rev32(v18, __ T16B, v18);
    __ rev32(v19, __ T16B, v19);

    // do the sha1
    __ addv(v4, __ T4S, v16, v0);
    __ orr(v20, __ T16B, v6, v6);          // working copy of the a..d state

    FloatRegister d0 = v16;
    FloatRegister d1 = v17;
    FloatRegister d2 = v18;
    FloatRegister d3 = v19;

    // 20 iterations of 4 SHA-1 rounds each = 80 rounds; d0..d3 rotate
    // through the message-schedule registers at the bottom of the loop.
    for (int round = 0; round < 20; round++) {
      FloatRegister tmp1 = (round & 1) ? v4 : v5;
      FloatRegister tmp2 = (round & 1) ? v21 : v22;
      FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
      FloatRegister tmp4 = (round & 1) ? v5 : v4;
      FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));

      if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);   // message schedule expansion
      if (round < 19) __ addv(tmp1, __ T4S, d1, key);
      __ sha1h(tmp2, __ T4S, v20);
      if (round < 5)
        __ sha1c(v20, __ T4S, tmp3, tmp4);   // rounds  0..19: choose
      else if (round < 10 || round >= 15)
        __ sha1p(v20, __ T4S, tmp3, tmp4);   // rounds 20..39, 60..79: parity
      else
        __ sha1m(v20, __ T4S, tmp3, tmp4);   // rounds 40..59: majority
      if (round < 16) __ sha1su1(d0, __ T4S, d3);

      tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
    }

    // Accumulate the block's result into the running digest.
    __ addv(v7, __ T2S, v7, v21);
    __ addv(v6, __ T4S, v6, v20);

    if (multi_block) {
      __ add(ofs, ofs, 64);
      __ cmp(ofs, limit);
      __ br(Assembler::LE, sha1_loop);
      __ mov(c_rarg0, ofs);   // return ofs
    }

    __ strq(v6, Address(state, 0));
    __ strs(v7, Address(state, 16));

    __ ret(lr);

    // The four SHA-1 round constants, replicated by the ld4r above.
    __ bind(keys);
    __ emit_int32(0x5a827999);
    __ emit_int32(0x6ed9eba1);
    __ emit_int32(0x8f1bbcdc);
    __ emit_int32(0xca62c1d6);

    return start;
  }


  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - byte[]  source+offset
  //   c_rarg1   - int[]   SHA.state
  //   c_rarg2   - int     offset
  //   c_rarg3   - int     limit
  //
  address generate_sha256_implCompress(bool multi_block, const char *name) {
    static
    // SHA-256 round constants K0..K63 (FIPS 180-4).
    const uint32_t round_consts[64] = {
      0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
      0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
      0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
      0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
      0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
      0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
      0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
      0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
      0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
      0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
      0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
      0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
      0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
      0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
      0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
      0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
    };
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Register buf   = c_rarg0;
    Register state = c_rarg1;
    Register ofs   = c_rarg2;
    Register limit = c_rarg3;

    Label sha1_loop;

    // v8..v11 are used below; save them here and restore before return.
    __ stpd(v8, v9, __ pre(sp, -32));
    __ stpd(v10, v11, Address(sp, 16));

    // dga == v0
    // dgb == v1
    // dg0 == v2
    // dg1 == v3
    // dg2 == v4
    // t0 == v6
    // t1 == v7

    // load 16 keys to v16..v31
    __ lea(rscratch1, ExternalAddress((address)round_consts));
    __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
    __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
    __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
    __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);

    // load 8 words (256 bits) state
    __ ldpq(v0, v1, state);

    __ BIND(sha1_loop);
    // load 64 bytes of data into v8..v11
    __ ld1(v8, v9, v10, v11, __ T4S, multi_block ?
    __ post(buf, 64) : buf);               // advance buf only in multi-block mode
    __ rev32(v8, __ T16B, v8);             // message words are big-endian
    __ rev32(v9, __ T16B, v9);
    __ rev32(v10, __ T16B, v10);
    __ rev32(v11, __ T16B, v11);

    __ addv(v6, __ T4S, v8, v16);          // first message-word + key sum
    __ orr(v2, __ T16B, v0, v0);           // working copies of the digest
    __ orr(v3, __ T16B, v1, v1);

    FloatRegister d0 = v8;
    FloatRegister d1 = v9;
    FloatRegister d2 = v10;
    FloatRegister d3 = v11;


    // 16 iterations of 4 SHA-256 rounds each = 64 rounds; d0..d3 rotate
    // through the message-schedule registers, and as_FloatRegister(round
    // + 17) steps through the preloaded keys v17..v31.
    for (int round = 0; round < 16; round++) {
      FloatRegister tmp1 = (round & 1) ? v6 : v7;
      FloatRegister tmp2 = (round & 1) ? v7 : v6;
      FloatRegister tmp3 = (round & 1) ? v2 : v4;
      FloatRegister tmp4 = (round & 1) ? v4 : v2;

      if (round < 12) __ sha256su0(d0, __ T4S, d1);   // message schedule expansion
      __ orr(v4, __ T16B, v2, v2);
      if (round < 15)
        __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
      __ sha256h(v2, __ T4S, v3, tmp2);
      __ sha256h2(v3, __ T4S, v4, tmp2);
      if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);

      tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
    }

    // Accumulate the block's result into the running digest.
    __ addv(v0, __ T4S, v0, v2);
    __ addv(v1, __ T4S, v1, v3);

    if (multi_block) {
      __ add(ofs, ofs, 64);
      __ cmp(ofs, limit);
      __ br(Assembler::LE, sha1_loop);
      __ mov(c_rarg0, ofs);   // return ofs
    }

    // Restore the saved vector registers (matches the stpd pair above).
    __ ldpd(v10, v11, Address(sp, 16));
    __ ldpd(v8, v9, __ post(sp, 32));

    __ stpq(v0, v1, state);

    __ ret(lr);

    return start;
  }

#ifndef BUILTIN_SIM
  // Safefetch stubs.
  void generate_safefetch(const char* name, int size, address* entry,
                          address* fault_pc, address* continuation_pc) {
    // safefetch signatures:
    //   int      SafeFetch32(int*      adr, int      errValue);
    //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
    //
    // arguments:
    //   c_rarg0 = adr
    //   c_rarg1 = errValue
    //
    // result:
    //   PPC_RET = *adr or errValue
    //
    // NOTE(review): the returned fault_pc/continuation_pc pair is
    // presumably consulted by the signal handler so a faulting load can
    // be resumed at continuation_pc -- confirm against the fault handler.

    StubCodeMark mark(this, "StubRoutines", name);

    // Entry point, pc or function descriptor.
    *entry = __ pc();

    // Load *adr into c_rarg1, may fault.
    *fault_pc = __ pc();
    switch (size) {
      case 4:
        // int32_t
        __ ldrw(c_rarg1, Address(c_rarg0, 0));
        break;
      case 8:
        // int64_t
        __ ldr(c_rarg1, Address(c_rarg0, 0));
        break;
      default:
        ShouldNotReachHere();
    }

    // return errValue or *adr
    *continuation_pc = __ pc();
    __ mov(r0, c_rarg1);
    __ ret(lr);
  }
#endif

  /**
   *  Arguments:
   *
   * Inputs:
   *   c_rarg0   - int   crc
   *   c_rarg1   - byte* buf
   *   c_rarg2   - int   length
   *
   * Output:
   *   rax       - int crc result
   */
  address generate_updateBytesCRC32() {
    assert(UseCRC32Intrinsics, "what are we doing here?");

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");

    address start = __ pc();

    const Register crc    = c_rarg0;  // crc
    const Register buf    = c_rarg1;  // source java byte array address
    const Register len    = c_rarg2;  // length
    const Register table0 = c_rarg3;  // crc_table address
    const Register table1 = c_rarg4;
    const Register table2 = c_rarg5;
    const Register table3 = c_rarg6;
    const Register tmp3   = c_rarg7;

    BLOCK_COMMENT("Entry:");
    __ enter();   // required for proper stackwalking of RuntimeStub frame

    // All the real work happens in the macro-assembler helper.
    __ kernel_crc32(crc, buf, len,
                    table0, table1, table2, table3, rscratch1, rscratch2, tmp3);

    __ leave();   // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }

  /**
   *  Arguments:
   *
   * Inputs:
   *   c_rarg0   - int   crc
   *   c_rarg1   - byte* buf
   *   c_rarg2   - int   length
   *   c_rarg3   - int*  table
   *
   * Output:
   *   r0        - int crc result
   */
  address generate_updateBytesCRC32C() {
    assert(UseCRC32CIntrinsics, "what are we doing here?");

    __ align(CodeEntryAlignment);
    StubCodeMark
    mark(this, "StubRoutines", "updateBytesCRC32C");

    address start = __ pc();

    const Register crc    = c_rarg0;  // crc
    const Register buf    = c_rarg1;  // source java byte array address
    const Register len    = c_rarg2;  // length
    const Register table0 = c_rarg3;  // crc_table address
    const Register table1 = c_rarg4;
    const Register table2 = c_rarg5;
    const Register table3 = c_rarg6;
    const Register tmp3   = c_rarg7;

    BLOCK_COMMENT("Entry:");
    __ enter();   // required for proper stackwalking of RuntimeStub frame

    // All the real work happens in the macro-assembler helper.
    __ kernel_crc32c(crc, buf, len,
                     table0, table1, table2, table3, rscratch1, rscratch2, tmp3);

    __ leave();   // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }

  /***
   *  Arguments:
   *
   *  Inputs:
   *   c_rarg0   - int   adler
   *   c_rarg1   - byte* buff
   *   c_rarg2   - int   len
   *
   * Output:
   *   c_rarg0   - int adler result
   */
  address generate_updateBytesAdler32() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
    address start = __ pc();

    Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;

    // Aliases (note s1 aliases adler: the low half is updated in place).
    Register adler = c_rarg0;
    Register s1    = c_rarg0;
    Register s2    = c_rarg3;
    Register buff  = c_rarg1;
    Register len   = c_rarg2;
    Register nmax  = r4;
    Register base  = r5;
    Register count = r6;
    Register temp0 = rscratch1;
    Register temp1 = rscratch2;
    Register temp2 = r7;

    // Max number of bytes we can process before having to take the mod
    // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
    unsigned long BASE = 0xfff1;
    unsigned long NMAX = 0x15B0;

    __ mov(base, BASE);
    __ mov(nmax, NMAX);

    // s1 is initialized to the lower 16 bits of adler
    // s2 is
    // initialized to the upper 16 bits of adler
    __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
    __ uxth(s1, adler);          // s1 = (adler & 0xffff)

    // The pipelined loop needs at least 16 elements for 1 iteration
    // It does check this, but it is more effective to skip to the cleanup loop
    __ cmp(len, 16);
    __ br(Assembler::HS, L_nmax);
    __ cbz(len, L_combine);

    // Fewer than 16 bytes: plain byte-at-a-time loop.
    __ bind(L_simple_by1_loop);
    __ ldrb(temp0, Address(__ post(buff, 1)));
    __ add(s1, s1, temp0);
    __ add(s2, s2, s1);
    __ subs(len, len, 1);
    __ br(Assembler::HI, L_simple_by1_loop);

    // s1 = s1 % BASE   (s1 < 2*BASE here, so one conditional subtract suffices)
    __ subs(temp0, s1, base);
    __ csel(s1, temp0, s1, Assembler::HS);

    // s2 = s2 % BASE
    // Reduction uses 2^16 mod 65521 == 15:
    //   s2 mod BASE ~ (s2 >> 16) * 15 + (s2 & 0xffff), then conditional subtract.
    __ lsr(temp0, s2, 16);
    __ lsl(temp1, temp0, 4);
    __ sub(temp1, temp1, temp0);       // temp1 = (s2 >> 16) * 15
    __ add(s2, temp1, s2, ext::uxth);

    __ subs(temp0, s2, base);
    __ csel(s2, temp0, s2, Assembler::HS);

    __ b(L_combine);

    __ bind(L_nmax);
    __ subs(len, len, nmax);
    __ sub(count, nmax, 16);
    __ br(Assembler::LO, L_by16);

    // Main pipelined loop: 16 bytes per iteration, with the mod taken
    // only every NMAX bytes (the most that cannot overflow 32 bits).
    __ bind(L_nmax_loop);

    __ ldp(temp0, temp1, Address(__ post(buff, 16)));

    // Unrolled byte-wise update from the two 64-bit words just loaded.
    __ add(s1, s1, temp0, ext::uxtb);
    __ ubfx(temp2, temp0, 8, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp0, 16, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp0, 24, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp0, 32, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp0, 40, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp0, 48, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp0, Assembler::LSR, 56);
    __ add(s2, s2, s1);

    __ add(s1, s1, temp1, ext::uxtb);
    __ ubfx(temp2, temp1, 8, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp1, 16, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp1, 24, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp1, 32, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp1, 40, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp1, 48, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp1, Assembler::LSR, 56);
    __ add(s2, s2, s1);

    __ subs(count, count, 16);
    __ br(Assembler::HS, L_nmax_loop);

    // s1 = s1 % BASE
    // Two reduction steps are needed: after NMAX bytes the sums can
    // exceed 2^16, so one high/low fold is not enough.
    __ lsr(temp0, s1, 16);
    __ lsl(temp1, temp0, 4);
    __ sub(temp1, temp1, temp0);
    __ add(temp1, temp1, s1, ext::uxth);

    __ lsr(temp0, temp1, 16);
    __ lsl(s1, temp0, 4);
    __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext:: uxth);

    __ subs(temp0, s1, base);
    __ csel(s1, temp0, s1, Assembler::HS);

    // s2 = s2 % BASE
    __ lsr(temp0, s2, 16);
    __ lsl(temp1, temp0, 4);
    __ sub(temp1, temp1, temp0);
    __ add(temp1, temp1, s2, ext::uxth);

    __ lsr(temp0, temp1, 16);
    __ lsl(s2, temp0, 4);
    __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext:: uxth);

    __ subs(temp0, s2, base);
    __ csel(s2, temp0, s2, Assembler::HS);

    __ subs(len, len, nmax);
    __ sub(count, nmax, 16);
    __ br(Assembler::HS, L_nmax_loop);

    // Remaining full 16-byte chunks (fewer than NMAX bytes left).
    __ bind(L_by16);
    __ adds(len, len, count);
    __ br(Assembler::LO, L_by1);

    __ bind(L_by16_loop);

    __ ldp(temp0, temp1, Address(__ post(buff, 16)));

    __ add(s1, s1, temp0, ext::uxtb);
    __ ubfx(temp2, temp0, 8, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp0, 16, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp0, 24, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp0, 32, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp0, 40, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp0, 48, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp0, Assembler::LSR, 56);
    __ add(s2, s2, s1);

    __ add(s1, s1, temp1, ext::uxtb);
    __ ubfx(temp2, temp1, 8, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp1, 16, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp1, 24, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp1, 32, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp1, 40, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp1, 48, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp1, Assembler::LSR, 56);
    __ add(s2, s2, s1);

    __ subs(len, len, 16);
    __ br(Assembler::HS, L_by16_loop);

    // Final 1..15 bytes, one at a time.
    __ bind(L_by1);
    __ adds(len, len, 15);
    __ br(Assembler::LO, L_do_mod);

    __ bind(L_by1_loop);
    __ ldrb(temp0, Address(__ post(buff, 1)));
    __ add(s1, temp0, s1);
    __ add(s2, s2, s1);
    __ subs(len, len, 1);
    __ br(Assembler::HS, L_by1_loop);

    __ bind(L_do_mod);
    // s1 = s1 % BASE
    __ lsr(temp0, s1, 16);
    __ lsl(temp1, temp0, 4);
    __ sub(temp1, temp1, temp0);
    __ add(temp1, temp1, s1, ext::uxth);

    __ lsr(temp0, temp1, 16);
    __ lsl(s1, temp0, 4);
    __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext:: uxth);

    __ subs(temp0, s1, base);
    __ csel(s1, temp0, s1, Assembler::HS);

    // s2 = s2 % BASE
    __ lsr(temp0, s2, 16);
    __ lsl(temp1, temp0, 4);
    __ sub(temp1, temp1, temp0);
    __ add(temp1, temp1, s2, ext::uxth);

    __ lsr(temp0, temp1, 16);
    __ lsl(s2, temp0, 4);
    __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext:: uxth);

    __ subs(temp0, s2, base);
    __ csel(s2, temp0, s2, Assembler::HS);

    // Combine lower bits and higher bits
    __ bind(L_combine);
    __ orr(s1, s1, s2, Assembler::LSL, 16);   // adler = s1 | (s2 << 16)

    __ ret(lr);

    return start;
  }

  /**
   *  Arguments:
   *
   *  Input:
   *    c_rarg0   - x address
   *    c_rarg1   - x length
   *    c_rarg2   - y address
   *    c_rarg3   - y length
   *    c_rarg4   - z address
   *    c_rarg5   - z length
   */
  address generate_multiplyToLen() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "multiplyToLen");

    address start = __ pc();
    const Register x     = r0;
    const Register xlen  = r1;
    const Register y     = r2;
    const Register ylen  = r3;
    const Register z     = r4;
    const Register zlen  = r5;

    const Register tmp1  = r10;
    const Register tmp2  = r11;
    const Register tmp3  = r12;
    const Register tmp4  = r13;
    const Register tmp5  = r14;
    const Register tmp6  = r15;
    const Register tmp7  = r16;

    BLOCK_COMMENT("Entry:");
    __ enter();   // required for proper stackwalking of RuntimeStub frame
    // The multi-precision multiply itself lives in the macro assembler.
    __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
    __ leave();   // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }

  address generate_squareToLen() {
    // squareToLen algorithm for sizes 1..127 described in java code works
    // faster than multiply_to_len on some CPUs and slower on others, but
    // multiply_to_len shows a bit better overall results
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "squareToLen");
    address start = __ pc();

    const Register x     = r0;
    const Register xlen  = r1;
    const Register z     = r2;
    const Register zlen  = r3;
    const Register y     = r4;   // == x
    const Register
    ylen  = r5;   // == xlen

    const Register tmp1  = r10;
    const Register tmp2  = r11;
    const Register tmp3  = r12;
    const Register tmp4  = r13;
    const Register tmp5  = r14;
    const Register tmp6  = r15;
    const Register tmp7  = r16;

    // y/ylen (r4/r5) are aliased onto x/xlen for the multiply call, so
    // preserve their caller values across it.
    RegSet spilled_regs = RegSet::of(y, ylen);
    BLOCK_COMMENT("Entry:");
    __ enter();
    __ push(spilled_regs, sp);
    __ mov(y, x);
    __ mov(ylen, xlen);
    // Squaring is performed as x * x via the generic multiply.
    __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
    __ pop(spilled_regs, sp);
    __ leave();
    __ ret(lr);
    return start;
  }

  address generate_mulAdd() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "mulAdd");

    address start = __ pc();

    const Register out    = r0;
    const Register in     = r1;
    const Register offset = r2;
    const Register len    = r3;
    const Register k      = r4;

    BLOCK_COMMENT("Entry:");
    __ enter();
    // All the real work is done by MacroAssembler::mul_add.
    __ mul_add(out, in, offset, len, k);
    __ leave();
    __ ret(lr);

    return start;
  }

  // Carry-less 128x128 -> 256-bit GHASH multiply, emitted inline.
  void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
                      FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
                      FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
    // Karatsuba multiplication performs a 128*128 -> 256-bit
    // multiplication in three 128-bit multiplications and a few
    // additions.
    //
    // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
    // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
    //
    // Inputs:
    //
    // A0 in a.d[0]     (subkey)
    // A1 in a.d[1]
    // (A1+A0) in a1_xor_a0.d[0]
    //
    // B0 in b.d[0]     (state)
    // B1 in b.d[1]

    __ ext(tmp1, __ T16B, b, b, 0x08);               // swap halves of b
    __ pmull2(result_hi, __ T1Q, b, a, __ T2D);      // A1*B1
    __ eor(tmp1, __ T16B, tmp1, b);                  // (B1+B0)
    __ pmull(result_lo,  __ T1Q, b, a, __ T1D);      // A0*B0
    __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)

    // Fold the middle Karatsuba term into the high and low halves.
    __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
    __ eor(tmp3, __ T16B, result_hi, result_lo);     // A1*B1+A0*B0
    __ eor(tmp2, __ T16B, tmp2, tmp4);
    __ eor(tmp2, __ T16B, tmp2, tmp3);

    // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
    __ ins(result_hi, __ D, tmp2, 0, 1);
    __ ins(result_lo, __ D, tmp2, 1, 0);
  }

  // Reduce the 256-bit carry-less product <hi:lo> modulo the GCM field
  // polynomial, leaving the 128-bit result in 'result' (aliased to t0).
  // 'p' holds the reduction polynomial and 'z' must be zero.
  void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
                    FloatRegister p, FloatRegister z, FloatRegister t1) {
    const FloatRegister t0 = result;

    // The GCM field polynomial f is z^128 + p(z), where p =
    // z^7+z^2+z+1.
    //
    //    z^128 === -p(z)  (mod (z^128 + p(z)))
    //
    // so, given that the product we're reducing is
    //    a == lo + hi * z^128
    // substituting,
    //      === lo - hi * p(z)  (mod (z^128 + p(z)))
    //
    // we reduce by multiplying hi by p(z) and subtracting the result
    // from (i.e. XORing it with) lo.  Because p has no nonzero high
    // bits we can do this with two 64-bit multiplications, lo*p and
    // hi*p.
    __ pmull2(t0, __ T1Q, hi, p, __ T2D);
    __ ext(t1, __ T16B, t0, z, 8);
    __ eor(hi, __ T16B, hi, t1);
    __ ext(t1, __ T16B, z, t0, 8);
    __ eor(lo, __ T16B, lo, t1);
    __ pmull(t0, __ T1Q, hi, p, __ T1D);
    __ eor(result, __ T16B, lo, t0);
  }

  // Sets r0 (result) to whether the byte array at r1 (length r2) contains
  // any byte with the top bit set.  A second entry point, returned via
  // has_negatives_long, skips the short-array handling (len > 15).
  address generate_has_negatives(address &has_negatives_long) {
    StubCodeMark mark(this, "StubRoutines", "has_negatives");
    const int large_loop_size = 64;
    const uint64_t UPPER_BIT_MASK=0x8080808080808080;   // top bit of every byte
    int dcache_line = VM_Version::dcache_line_size();

    Register ary1 = r1, len = r2, result = r0;

    __ align(CodeEntryAlignment);
    address entry = __ pc();

    __ enter();

    Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
          LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;

    __ cmp(len, 15);
    __ br(Assembler::GT, LEN_OVER_15);
    // The only case when execution falls into this code is when pointer is near
    // the end of memory page and we have to avoid reading next page
    __ add(ary1, ary1, len);
    __ subs(len, len, 8);
    __ br(Assembler::GT, LEN_OVER_8);
    // 0..8 bytes: one backwards load, then shift off the bytes that are
    // not part of the buffer.
    __ ldr(rscratch2, Address(ary1, -8));
    __ sub(rscratch1, zr, len, __ LSL, 3);   // LSL 3 is to get bits from bytes.
    __ lsrv(rscratch2, rscratch2, rscratch1);
    __ tst(rscratch2, UPPER_BIT_MASK);
    __ cset(result, Assembler::NE);
    __ leave();
    __ ret(lr);
    __ bind(LEN_OVER_8);
    // 9..15 bytes: two overlapping backwards loads.
    __ ldp(rscratch1, rscratch2, Address(ary1, -16));
    __ sub(len, len, 8);   // no data dep., then sub can be executed while loading
    __ tst(rscratch2, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_TRUE_NO_POP);
    __ sub(rscratch2, zr, len, __ LSL, 3);   // LSL 3 is to get bits from bytes
    __ lsrv(rscratch1, rscratch1, rscratch2);
    __ tst(rscratch1, UPPER_BIT_MASK);
    __ cset(result, Assembler::NE);
    __ leave();
    __ ret(lr);

    Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
    const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;

    has_negatives_long = __ pc();   // 2nd entry point

    __ enter();

    __ bind(LEN_OVER_15);
    __ push(spilled_regs, sp);
    __ andr(rscratch2, ary1, 15);   // check pointer for 16-byte alignment
    __ cbz(rscratch2, ALIGNED);
    // Check the unaligned head, then advance to a 16-byte boundary.
    __ ldp(tmp6, tmp1, Address(ary1));
    __ mov(tmp5, 16);
    __ sub(rscratch1, tmp5, rscratch2);   // amount of bytes until aligned address
    __ add(ary1, ary1, rscratch1);
    __ sub(len, len, rscratch1);
    __ orr(tmp6, tmp6, tmp1);
    __ tst(tmp6, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_TRUE);

    __ bind(ALIGNED);
    __ cmp(len, large_loop_size);
    __ br(Assembler::LT, CHECK_16);
    // Perform 16-byte load as early return in pre-loop to handle situation
    // when initially aligned large array has negative values at starting bytes,
    // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is
    // slower. Cases with negative bytes further ahead won't be affected that
    // much. In fact, it'll be faster due to early loads, less instructions and
    // less branches in LARGE_LOOP.
    __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
    __ sub(len, len, 16);
    __ orr(tmp6, tmp6, tmp1);
    __ tst(tmp6, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_TRUE);
    __ cmp(len, large_loop_size);
    __ br(Assembler::LT, CHECK_16);

    if (SoftwarePrefetchHintDistance >= 0
        && SoftwarePrefetchHintDistance >= dcache_line) {
      // initial prefetch
      __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
    }
    __ bind(LARGE_LOOP);
    if (SoftwarePrefetchHintDistance >= 0) {
      __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
    }
    // Issue load instructions first, since it can save few CPU/MEM cycles, also
    // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp)
    // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3
    // instructions per cycle and have less branches, but this approach disables
    // early return, thus, all 64 bytes are loaded and checked every time.
    __ ldp(tmp2, tmp3, Address(ary1));
    __ ldp(tmp4, tmp5, Address(ary1, 16));
    __ ldp(rscratch1, rscratch2, Address(ary1, 32));
    __ ldp(tmp6, tmp1, Address(ary1, 48));
    __ add(ary1, ary1, large_loop_size);
    __ sub(len, len, large_loop_size);
    // OR everything together; a set top bit anywhere survives to the tst.
    __ orr(tmp2, tmp2, tmp3);
    __ orr(tmp4, tmp4, tmp5);
    __ orr(rscratch1, rscratch1, rscratch2);
    __ orr(tmp6, tmp6, tmp1);
    __ orr(tmp2, tmp2, tmp4);
    __ orr(rscratch1, rscratch1, tmp6);
    __ orr(tmp2, tmp2, rscratch1);
    __ tst(tmp2, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_TRUE);
    __ cmp(len, large_loop_size);
    __ br(Assembler::GE, LARGE_LOOP);

    __ bind(CHECK_16);   // small 16-byte load pre-loop
    __ cmp(len, 16);
    __ br(Assembler::LT, POST_LOOP16);

    __ bind(LOOP16);   // small 16-byte load loop
    __ ldp(tmp2, tmp3, Address(__ post(ary1, 16)));
    __ sub(len, len, 16);
    __ orr(tmp2, tmp2, tmp3);
    __ tst(tmp2, UPPER_BIT_MASK);
    __
    br(Assembler::NE, RET_TRUE);
    __ cmp(len, 16);
    __ br(Assembler::GE, LOOP16); // 16-byte load loop end

    __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
    __ cmp(len, 8);
    __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
    __ ldr(tmp3, Address(__ post(ary1, 8)));
    __ sub(len, len, 8);
    __ tst(tmp3, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_TRUE);

    __ bind(POST_LOOP16_LOAD_TAIL);
    __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
    // Tail of 1..8 bytes: shift the unwanted high bytes out to the left
    // before testing the sign bits.
    __ ldr(tmp1, Address(ary1));
    __ mov(tmp2, 64);
    __ sub(tmp4, tmp2, len, __ LSL, 3);
    __ lslv(tmp1, tmp1, tmp4);
    __ tst(tmp1, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_TRUE);
    // Fallthrough

    __ bind(RET_FALSE);
    __ pop(spilled_regs, sp);
    __ leave();
    __ mov(result, zr);
    __ ret(lr);

    __ bind(RET_TRUE);
    __ pop(spilled_regs, sp);
    __ bind(RET_TRUE_NO_POP); // entered when spilled_regs were never pushed
    __ leave();
    __ mov(result, 1);
    __ ret(lr);

    __ bind(DONE);
    __ pop(spilled_regs, sp);
    __ leave();
    __ ret(lr);
    return entry;
  }

  // Convenience entry: large-array equality for byte arrays (1-byte elements).
  address generate_large_array_equals_byte() {
    return generate_large_array_equals(1);
  }

  // Convenience entry: large-array equality for char arrays (2-byte elements).
  address generate_large_array_equals_char() {
    return generate_large_array_equals(2);
  }

  // Compares two large arrays for equality, 64 bytes per iteration, with
  // optional software prefetch. Has a SIMD and a scalar variant selected by
  // UseSIMDForArrayEquals.
  //
  // a1 = r1 - array1 address
  // a2 = r2 - array2 address
  // result = r0 - return value. Already contains "false"
  // cnt1 = r4 - amount of elements left to check, reduced by elem_per_word
  address generate_large_array_equals(int elem_size) {
    StubCodeMark mark(this, "StubRoutines", elem_size == 1
                      ? "large_array_equals_byte"
                      : "large_array_equals_char");
    Register a1 = r1, a2 = r2, result = r0, cnt1 = r4, tmp1 = rscratch1,
        tmp2 = rscratch2, tmp3 = r6, tmp4 = r7;
    Label LARGE_LOOP, NOT_EQUAL;
    int elem_per_word = wordSize/elem_size;
    // Stay in the unrolled loop while there is at least this much data left,
    // so the prefetch distance (but at least 80 bytes) is always covered.
    int branchThreshold = MAX(80, SoftwarePrefetchHintDistance)/elem_size - elem_per_word;
    RegSet spilled_regs = RegSet::of(tmp3, tmp4);

    assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4);

    __ align(CodeEntryAlignment);
    address entry = __ pc();
    __ enter();

    if (!UseSIMDForArrayEquals) {
      // pre-loop: prime tmp1..tmp4 so the loop can overlap compares with loads
      __ push(spilled_regs, sp);
      __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize)));
      __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
    }
    __ bind(LARGE_LOOP); // unrolled to 64 bytes loop with possible prefetching
    if (SoftwarePrefetchHintDistance >= 0) {
      __ prfm(Address(a1, SoftwarePrefetchHintDistance));
      __ prfm(Address(a2, SoftwarePrefetchHintDistance));
    }
    if (UseSIMDForArrayEquals) {
      // 64 bytes per side per iteration; XOR then OR-reduce to one vector,
      // which is zero iff all 64 bytes matched.
      __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
      __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
      __ eor(v0, __ T2D, v0, v4);
      __ eor(v1, __ T2D, v1, v5);
      __ eor(v2, __ T2D, v2, v6);
      __ eor(v3, __ T2D, v3, v7);

      __ orr(v0, __ T2D, v0, v1);
      __ orr(v1, __ T2D, v2, v3);
      __ orr(v0, __ T2D, v0, v1);

      __ umov(tmp1, v0, __ D, 0);
      __ cbnz(tmp1, NOT_EQUAL);
      __ umov(tmp1, v0, __ D, 1);
      __ cbnz(tmp1, NOT_EQUAL);
      __ sub(cnt1, cnt1, 64/elem_size);
      __ cmp(cnt1, branchThreshold);
      __ br(__ GT, LARGE_LOOP);
    } else {
      // Scalar variant: software-pipelined — each XOR compares values loaded
      // on the previous step while the next loads are already in flight.
      __ eor(tmp1, tmp1, tmp2);
      __ ldr(tmp2, Address(__ post(a2, wordSize)));
      __ cbnz(tmp1, NOT_EQUAL);
      __ ldr(tmp1, Address(__ post(a1, wordSize)));
      __ eor(tmp3, tmp3, tmp4);
      __ ldr(tmp4, Address(__ post(a2, wordSize)));
      __ cbnz(tmp3, NOT_EQUAL);
      __ ldr(tmp3, Address(__ post(a1, wordSize)));

      __ eor(tmp1, tmp1, tmp2);
      __ ldr(tmp2, Address(__ post(a2, wordSize)));
      __ cbnz(tmp1, NOT_EQUAL);
      __ ldr(tmp1, Address(__ post(a1, wordSize)));
      __ eor(tmp3, tmp3, tmp4);
      __ ldr(tmp4, Address(__ post(a2, wordSize)));
      __ cbnz(tmp3, NOT_EQUAL);
      __ ldr(tmp3, Address(__ post(a1, wordSize)));

      __ eor(tmp1, tmp1, tmp2);
      __ ldr(tmp2, Address(__ post(a2, wordSize)));
      __ cbnz(tmp1, NOT_EQUAL);
      __ ldr(tmp1, Address(__ post(a1, wordSize)));
      __ eor(tmp3, tmp3, tmp4);
      __ ldr(tmp4, Address(__ post(a2, wordSize)));
      __ cbnz(tmp3, NOT_EQUAL);
      __ ldr(tmp3, Address(__ post(a1, wordSize)));

      // loads below are for next loop iteration
      __ eor(tmp1, tmp1, tmp2);
      __ ldr(tmp2, Address(__ post(a2, wordSize)));
      __ cbnz(tmp1, NOT_EQUAL);
      __ ldr(tmp1, Address(__ post(a1, wordSize)));
      __ eor(tmp3, tmp3, tmp4);
      __ ldr(tmp4, Address(__ post(a2, wordSize)));
      __ cbnz(tmp3, NOT_EQUAL);
      __ ldr(tmp3, Address(__ post(a1, wordSize)));

      __ sub(cnt1, cnt1, 8 * elem_per_word);
      // run this loop until we have memory to prefetch(but at least 64+16 bytes).
      __ cmp(cnt1, branchThreshold);
      __ br(Assembler::GT, LARGE_LOOP);
      // both a1 and a2 are shifted more than needed by wordSize and tmp1-tmp4
      // contains still-not-checked value. Check it in this post-loop, also update
      // cnt1 accordingly
      __ eor(tmp1, tmp1, tmp2);
      __ cbnz(tmp1, NOT_EQUAL);
      __ eor(tmp3, tmp3, tmp4);
      __ cbnz(tmp3, NOT_EQUAL);
      __ sub(cnt1, cnt1, 2 * elem_per_word);
    }

    __ mov(result, true); // falls through to NOT_EQUAL, where result stays set
    __ bind(NOT_EQUAL);
    if (!UseSIMDForArrayEquals) {
      __ pop(spilled_regs, sp);
    }
    __ leave();
    __ ret(lr);
    return entry;
  }

  /**
   * Arguments:
   *
   * Input:
   *   c_rarg0   - current state address
   *   c_rarg1   - H key address
   *   c_rarg2   - data address
   *   c_rarg3   - number of blocks
   *
   * Output:
   *   Updated state at c_rarg0
   */
  address generate_ghash_processBlocks() {
    // Bafflingly, GCM uses little-endian for the byte order, but
    // big-endian for the bit order.  For example, the polynomial 1 is
    // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
    //
    // So, we must either reverse the bytes in each word and do
    // everything big-endian or reverse the bits in each byte and do
    // it little-endian.  On AArch64 it's more idiomatic to reverse
    // the bits in each byte (we have an instruction, RBIT, to do
    // that) and keep the data in little-endian bit order throughout the
    // calculation, bit-reversing the inputs and outputs.

    StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
    // Emit the 128-bit reduction constant inline, just before the stub entry.
    __ align(wordSize * 2);
    address p = __ pc();
    __ emit_int64(0x87);  // The low-order bits of the field
                          // polynomial (i.e.
    // p = z^7+z^2+z+1)
    // repeated in the low and high parts of a
    // 128-bit vector
    __ emit_int64(0x87);

    __ align(CodeEntryAlignment);
    address start = __ pc();

    Register state   = c_rarg0;
    Register subkeyH = c_rarg1;
    Register data    = c_rarg2;
    Register blocks  = c_rarg3;

    FloatRegister vzr = v30;
    __ eor(vzr, __ T16B, vzr, vzr); // zero register

    __ ldrq(v0, Address(state));
    __ ldrq(v1, Address(subkeyH));

    __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
    __ rbit(v0, __ T16B, v0);
    __ rev64(v1, __ T16B, v1);
    __ rbit(v1, __ T16B, v1);

    __ ldrq(v26, p); // load the field-polynomial constant emitted above

    __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
    __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))

    {
      Label L_ghash_loop;
      __ bind(L_ghash_loop);

      __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
                                                 // reversing each byte
      __ rbit(v2, __ T16B, v2);
      __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state

      // Multiply state in v2 by subkey in v1
      ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
                     /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
                     /*temps*/v6, v20, v18, v21);
      // Reduce v7:v5 by the field polynomial
      ghash_reduce(v0, v5, v7, v26, vzr, v20);

      __ sub(blocks, blocks, 1);
      __ cbnz(blocks, L_ghash_loop);
    }

    // The bit-reversed result is at this point in v0
    __ rev64(v1, __ T16B, v0); // undo the input bit/byte reversal
    __ rbit(v1, __ T16B, v1);

    __ st1(v1, __ T16B, state);
    __ ret(lr);

    return start;
  }

  // Continuation point for throwing of implicit exceptions that are
  // not handled in the current activation. Fabricates an exception
  // oop and initiates normal exception dispatching in this
  // frame. Since we need to preserve callee-saved values (currently
  // only for C2, but done for C1 as well) we need a callee-saved oop
  // map and therefore have to make these stubs into RuntimeStubs
  // rather than BufferBlobs.  If the compiler needs all registers to
  // be preserved between the fault point and the exception handler
  // then it must assume responsibility for that in
  // AbstractCompiler::continuation_for_implicit_null_exception or
  // continuation_for_implicit_division_by_zero_exception.  All other
  // implicit exceptions (e.g., NullPointerException or
  // AbstractMethodError on entry) are either at call sites or
  // otherwise assume that stack unwinding will be initiated, so
  // caller saved registers were assumed volatile in the compiler.

#undef __
#define __ masm->

  // Generates a RuntimeStub that calls runtime_entry with rthread (plus up to
  // two optional arguments) and then dispatches the resulting pending
  // exception. Returns the stub's entry point.
  address generate_throw_exception(const char* name,
                                   address runtime_entry,
                                   Register arg1 = noreg,
                                   Register arg2 = noreg) {
    // Information about frame layout at time of blocking runtime call.
    // Note that we only have to preserve callee-saved registers since
    // the compilers are responsible for supplying a continuation point
    // if they expect all registers to be preserved.
    // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
    enum layout {
      rfp_off = 0,
      rfp_off2,
      return_off,
      return_off2,
      framesize // inclusive of return address
    };

    int insts_size = 512;
    int locs_size  = 64;

    CodeBuffer code(name, insts_size, locs_size);
    OopMapSet* oop_maps  = new OopMapSet();
    MacroAssembler* masm = new MacroAssembler(&code);

    address start = __ pc();

    // This is an inlined and slightly modified version of call_VM
    // which has the ability to fetch the return PC out of
    // thread-local storage and also sets up last_Java_sp slightly
    // differently than the real call_VM

    __ enter(); // Save FP and LR before call

    assert(is_even(framesize/2), "sp not 16-byte aligned");

    // lr and fp are already in place
    __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog

    int frame_complete = __ pc() - start;

    // Set up last_Java_sp and last_Java_fp
    address the_pc = __ pc();
    __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1);

    // Call runtime
    if (arg1 != noreg) {
      assert(arg2 != c_rarg1, "clobbered");
      __ mov(c_rarg1, arg1);
    }
    if (arg2 != noreg) {
      __ mov(c_rarg2, arg2);
    }
    __ mov(c_rarg0, rthread);
    BLOCK_COMMENT("call runtime_entry");
    __ mov(rscratch1, runtime_entry);
    __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);

    // Generate oop map
    OopMap* map = new OopMap(framesize, 0);

    oop_maps->add_gc_map(the_pc - start, map);

    __ reset_last_Java_frame(true);
    __ maybe_isb();

    __ leave();

    // check for pending exceptions
#ifdef ASSERT
    Label L;
    __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
    __ cbnz(rscratch1, L);
    __ should_not_reach_here();
    __ bind(L);
#endif // ASSERT
    __
far_jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 4195 4196 4197 // codeBlob framesize is in words (not VMRegImpl::slot_size) 4198 RuntimeStub* stub = 4199 RuntimeStub::new_runtime_stub(name, 4200 &code, 4201 frame_complete, 4202 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 4203 oop_maps, false); 4204 return stub->entry_point(); 4205 } 4206 4207 class MontgomeryMultiplyGenerator : public MacroAssembler { 4208 4209 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 4210 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 4211 4212 RegSet _toSave; 4213 bool _squaring; 4214 4215 public: 4216 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 4217 : MacroAssembler(as->code()), _squaring(squaring) { 4218 4219 // Register allocation 4220 4221 Register reg = c_rarg0; 4222 Pa_base = reg; // Argument registers 4223 if (squaring) 4224 Pb_base = Pa_base; 4225 else 4226 Pb_base = ++reg; 4227 Pn_base = ++reg; 4228 Rlen= ++reg; 4229 inv = ++reg; 4230 Pm_base = ++reg; 4231 4232 // Working registers: 4233 Ra = ++reg; // The current digit of a, b, n, and m. 4234 Rb = ++reg; 4235 Rm = ++reg; 4236 Rn = ++reg; 4237 4238 Pa = ++reg; // Pointers to the current/next digit of a, b, n, and m. 4239 Pb = ++reg; 4240 Pm = ++reg; 4241 Pn = ++reg; 4242 4243 t0 = ++reg; // Three registers which form a 4244 t1 = ++reg; // triple-precision accumuator. 4245 t2 = ++reg; 4246 4247 Ri = ++reg; // Inner and outer loop indexes. 4248 Rj = ++reg; 4249 4250 Rhi_ab = ++reg; // Product registers: low and high parts 4251 Rlo_ab = ++reg; // of a*b and m*n. 4252 Rhi_mn = ++reg; 4253 Rlo_mn = ++reg; 4254 4255 // r19 and up are callee-saved. 
4256 _toSave = RegSet::range(r19, reg) + Pm_base; 4257 } 4258 4259 private: 4260 void save_regs() { 4261 push(_toSave, sp); 4262 } 4263 4264 void restore_regs() { 4265 pop(_toSave, sp); 4266 } 4267 4268 template <typename T> 4269 void unroll_2(Register count, T block) { 4270 Label loop, end, odd; 4271 tbnz(count, 0, odd); 4272 cbz(count, end); 4273 align(16); 4274 bind(loop); 4275 (this->*block)(); 4276 bind(odd); 4277 (this->*block)(); 4278 subs(count, count, 2); 4279 br(Assembler::GT, loop); 4280 bind(end); 4281 } 4282 4283 template <typename T> 4284 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 4285 Label loop, end, odd; 4286 tbnz(count, 0, odd); 4287 cbz(count, end); 4288 align(16); 4289 bind(loop); 4290 (this->*block)(d, s, tmp); 4291 bind(odd); 4292 (this->*block)(d, s, tmp); 4293 subs(count, count, 2); 4294 br(Assembler::GT, loop); 4295 bind(end); 4296 } 4297 4298 void pre1(RegisterOrConstant i) { 4299 block_comment("pre1"); 4300 // Pa = Pa_base; 4301 // Pb = Pb_base + i; 4302 // Pm = Pm_base; 4303 // Pn = Pn_base + i; 4304 // Ra = *Pa; 4305 // Rb = *Pb; 4306 // Rm = *Pm; 4307 // Rn = *Pn; 4308 ldr(Ra, Address(Pa_base)); 4309 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 4310 ldr(Rm, Address(Pm_base)); 4311 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4312 lea(Pa, Address(Pa_base)); 4313 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 4314 lea(Pm, Address(Pm_base)); 4315 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4316 4317 // Zero the m*n result. 4318 mov(Rhi_mn, zr); 4319 mov(Rlo_mn, zr); 4320 } 4321 4322 // The core multiply-accumulate step of a Montgomery 4323 // multiplication. The idea is to schedule operations as a 4324 // pipeline so that instructions with long latencies (loads and 4325 // multiplies) have time to complete before their results are 4326 // used. 
This most benefits in-order implementations of the 4327 // architecture but out-of-order ones also benefit. 4328 void step() { 4329 block_comment("step"); 4330 // MACC(Ra, Rb, t0, t1, t2); 4331 // Ra = *++Pa; 4332 // Rb = *--Pb; 4333 umulh(Rhi_ab, Ra, Rb); 4334 mul(Rlo_ab, Ra, Rb); 4335 ldr(Ra, pre(Pa, wordSize)); 4336 ldr(Rb, pre(Pb, -wordSize)); 4337 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 4338 // previous iteration. 4339 // MACC(Rm, Rn, t0, t1, t2); 4340 // Rm = *++Pm; 4341 // Rn = *--Pn; 4342 umulh(Rhi_mn, Rm, Rn); 4343 mul(Rlo_mn, Rm, Rn); 4344 ldr(Rm, pre(Pm, wordSize)); 4345 ldr(Rn, pre(Pn, -wordSize)); 4346 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4347 } 4348 4349 void post1() { 4350 block_comment("post1"); 4351 4352 // MACC(Ra, Rb, t0, t1, t2); 4353 // Ra = *++Pa; 4354 // Rb = *--Pb; 4355 umulh(Rhi_ab, Ra, Rb); 4356 mul(Rlo_ab, Ra, Rb); 4357 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 4358 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4359 4360 // *Pm = Rm = t0 * inv; 4361 mul(Rm, t0, inv); 4362 str(Rm, Address(Pm)); 4363 4364 // MACC(Rm, Rn, t0, t1, t2); 4365 // t0 = t1; t1 = t2; t2 = 0; 4366 umulh(Rhi_mn, Rm, Rn); 4367 4368 #ifndef PRODUCT 4369 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 4370 { 4371 mul(Rlo_mn, Rm, Rn); 4372 add(Rlo_mn, t0, Rlo_mn); 4373 Label ok; 4374 cbz(Rlo_mn, ok); { 4375 stop("broken Montgomery multiply"); 4376 } bind(ok); 4377 } 4378 #endif 4379 // We have very carefully set things up so that 4380 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 4381 // the lower half of Rm * Rn because we know the result already: 4382 // it must be -t0. t0 + (-t0) must generate a carry iff 4383 // t0 != 0. So, rather than do a mul and an adds we just set 4384 // the carry flag iff t0 is nonzero. 
4385 // 4386 // mul(Rlo_mn, Rm, Rn); 4387 // adds(zr, t0, Rlo_mn); 4388 subs(zr, t0, 1); // Set carry iff t0 is nonzero 4389 adcs(t0, t1, Rhi_mn); 4390 adc(t1, t2, zr); 4391 mov(t2, zr); 4392 } 4393 4394 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 4395 block_comment("pre2"); 4396 // Pa = Pa_base + i-len; 4397 // Pb = Pb_base + len; 4398 // Pm = Pm_base + i-len; 4399 // Pn = Pn_base + len; 4400 4401 if (i.is_register()) { 4402 sub(Rj, i.as_register(), len); 4403 } else { 4404 mov(Rj, i.as_constant()); 4405 sub(Rj, Rj, len); 4406 } 4407 // Rj == i-len 4408 4409 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 4410 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 4411 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 4412 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 4413 4414 // Ra = *++Pa; 4415 // Rb = *--Pb; 4416 // Rm = *++Pm; 4417 // Rn = *--Pn; 4418 ldr(Ra, pre(Pa, wordSize)); 4419 ldr(Rb, pre(Pb, -wordSize)); 4420 ldr(Rm, pre(Pm, wordSize)); 4421 ldr(Rn, pre(Pn, -wordSize)); 4422 4423 mov(Rhi_mn, zr); 4424 mov(Rlo_mn, zr); 4425 } 4426 4427 void post2(RegisterOrConstant i, RegisterOrConstant len) { 4428 block_comment("post2"); 4429 if (i.is_constant()) { 4430 mov(Rj, i.as_constant()-len.as_constant()); 4431 } else { 4432 sub(Rj, i.as_register(), len); 4433 } 4434 4435 adds(t0, t0, Rlo_mn); // The pending m*n, low part 4436 4437 // As soon as we know the least significant digit of our result, 4438 // store it. 4439 // Pm_base[i-len] = t0; 4440 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 4441 4442 // t0 = t1; t1 = t2; t2 = 0; 4443 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 4444 adc(t1, t2, zr); 4445 mov(t2, zr); 4446 } 4447 4448 // A carry in t0 after Montgomery multiplication means that we 4449 // should subtract multiples of n from our result in m. We'll 4450 // keep doing that until there is no carry. 
4451 void normalize(RegisterOrConstant len) { 4452 block_comment("normalize"); 4453 // while (t0) 4454 // t0 = sub(Pm_base, Pn_base, t0, len); 4455 Label loop, post, again; 4456 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 4457 cbz(t0, post); { 4458 bind(again); { 4459 mov(i, zr); 4460 mov(cnt, len); 4461 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 4462 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4463 subs(zr, zr, zr); // set carry flag, i.e. no borrow 4464 align(16); 4465 bind(loop); { 4466 sbcs(Rm, Rm, Rn); 4467 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 4468 add(i, i, 1); 4469 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 4470 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4471 sub(cnt, cnt, 1); 4472 } cbnz(cnt, loop); 4473 sbc(t0, t0, zr); 4474 } cbnz(t0, again); 4475 } bind(post); 4476 } 4477 4478 // Move memory at s to d, reversing words. 4479 // Increments d to end of copied memory 4480 // Destroys tmp1, tmp2 4481 // Preserves len 4482 // Leaves s pointing to the address which was in d at start 4483 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 4484 assert(tmp1 < r19 && tmp2 < r19, "register corruption"); 4485 4486 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 4487 mov(tmp1, len); 4488 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 4489 sub(s, d, len, ext::uxtw, LogBytesPerWord); 4490 } 4491 // where 4492 void reverse1(Register d, Register s, Register tmp) { 4493 ldr(tmp, pre(s, -wordSize)); 4494 ror(tmp, tmp, 32); 4495 str(tmp, post(d, wordSize)); 4496 } 4497 4498 void step_squaring() { 4499 // An extra ACC 4500 step(); 4501 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4502 } 4503 4504 void last_squaring(RegisterOrConstant i) { 4505 Label dont; 4506 // if ((i & 1) == 0) { 4507 tbnz(i.as_register(), 0, dont); { 4508 // MACC(Ra, Rb, t0, t1, t2); 4509 // Ra = *++Pa; 4510 // Rb = *--Pb; 4511 
umulh(Rhi_ab, Ra, Rb); 4512 mul(Rlo_ab, Ra, Rb); 4513 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4514 } bind(dont); 4515 } 4516 4517 void extra_step_squaring() { 4518 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 4519 4520 // MACC(Rm, Rn, t0, t1, t2); 4521 // Rm = *++Pm; 4522 // Rn = *--Pn; 4523 umulh(Rhi_mn, Rm, Rn); 4524 mul(Rlo_mn, Rm, Rn); 4525 ldr(Rm, pre(Pm, wordSize)); 4526 ldr(Rn, pre(Pn, -wordSize)); 4527 } 4528 4529 void post1_squaring() { 4530 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 4531 4532 // *Pm = Rm = t0 * inv; 4533 mul(Rm, t0, inv); 4534 str(Rm, Address(Pm)); 4535 4536 // MACC(Rm, Rn, t0, t1, t2); 4537 // t0 = t1; t1 = t2; t2 = 0; 4538 umulh(Rhi_mn, Rm, Rn); 4539 4540 #ifndef PRODUCT 4541 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 4542 { 4543 mul(Rlo_mn, Rm, Rn); 4544 add(Rlo_mn, t0, Rlo_mn); 4545 Label ok; 4546 cbz(Rlo_mn, ok); { 4547 stop("broken Montgomery multiply"); 4548 } bind(ok); 4549 } 4550 #endif 4551 // We have very carefully set things up so that 4552 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 4553 // the lower half of Rm * Rn because we know the result already: 4554 // it must be -t0. t0 + (-t0) must generate a carry iff 4555 // t0 != 0. So, rather than do a mul and an adds we just set 4556 // the carry flag iff t0 is nonzero. 4557 // 4558 // mul(Rlo_mn, Rm, Rn); 4559 // adds(zr, t0, Rlo_mn); 4560 subs(zr, t0, 1); // Set carry iff t0 is nonzero 4561 adcs(t0, t1, Rhi_mn); 4562 adc(t1, t2, zr); 4563 mov(t2, zr); 4564 } 4565 4566 void acc(Register Rhi, Register Rlo, 4567 Register t0, Register t1, Register t2) { 4568 adds(t0, t0, Rlo); 4569 adcs(t1, t1, Rhi); 4570 adc(t2, t2, zr); 4571 } 4572 4573 public: 4574 /** 4575 * Fast Montgomery multiplication. The derivation of the 4576 * algorithm is in A Cryptographic Library for the Motorola 4577 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 
4578 * 4579 * Arguments: 4580 * 4581 * Inputs for multiplication: 4582 * c_rarg0 - int array elements a 4583 * c_rarg1 - int array elements b 4584 * c_rarg2 - int array elements n (the modulus) 4585 * c_rarg3 - int length 4586 * c_rarg4 - int inv 4587 * c_rarg5 - int array elements m (the result) 4588 * 4589 * Inputs for squaring: 4590 * c_rarg0 - int array elements a 4591 * c_rarg1 - int array elements n (the modulus) 4592 * c_rarg2 - int length 4593 * c_rarg3 - int inv 4594 * c_rarg4 - int array elements m (the result) 4595 * 4596 */ 4597 address generate_multiply() { 4598 Label argh, nothing; 4599 bind(argh); 4600 stop("MontgomeryMultiply total_allocation must be <= 8192"); 4601 4602 align(CodeEntryAlignment); 4603 address entry = pc(); 4604 4605 cbzw(Rlen, nothing); 4606 4607 enter(); 4608 4609 // Make room. 4610 cmpw(Rlen, 512); 4611 br(Assembler::HI, argh); 4612 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 4613 andr(sp, Ra, -2 * wordSize); 4614 4615 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 4616 4617 { 4618 // Copy input args, reversing as we go. We use Ra as a 4619 // temporary variable. 4620 reverse(Ra, Pa_base, Rlen, t0, t1); 4621 if (!_squaring) 4622 reverse(Ra, Pb_base, Rlen, t0, t1); 4623 reverse(Ra, Pn_base, Rlen, t0, t1); 4624 } 4625 4626 // Push all call-saved registers and also Pm_base which we'll need 4627 // at the end. 
4628 save_regs(); 4629 4630 #ifndef PRODUCT 4631 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 4632 { 4633 ldr(Rn, Address(Pn_base, 0)); 4634 mul(Rlo_mn, Rn, inv); 4635 cmp(Rlo_mn, -1); 4636 Label ok; 4637 br(EQ, ok); { 4638 stop("broken inverse in Montgomery multiply"); 4639 } bind(ok); 4640 } 4641 #endif 4642 4643 mov(Pm_base, Ra); 4644 4645 mov(t0, zr); 4646 mov(t1, zr); 4647 mov(t2, zr); 4648 4649 block_comment("for (int i = 0; i < len; i++) {"); 4650 mov(Ri, zr); { 4651 Label loop, end; 4652 cmpw(Ri, Rlen); 4653 br(Assembler::GE, end); 4654 4655 bind(loop); 4656 pre1(Ri); 4657 4658 block_comment(" for (j = i; j; j--) {"); { 4659 movw(Rj, Ri); 4660 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 4661 } block_comment(" } // j"); 4662 4663 post1(); 4664 addw(Ri, Ri, 1); 4665 cmpw(Ri, Rlen); 4666 br(Assembler::LT, loop); 4667 bind(end); 4668 block_comment("} // i"); 4669 } 4670 4671 block_comment("for (int i = len; i < 2*len; i++) {"); 4672 mov(Ri, Rlen); { 4673 Label loop, end; 4674 cmpw(Ri, Rlen, Assembler::LSL, 1); 4675 br(Assembler::GE, end); 4676 4677 bind(loop); 4678 pre2(Ri, Rlen); 4679 4680 block_comment(" for (j = len*2-i-1; j; j--) {"); { 4681 lslw(Rj, Rlen, 1); 4682 subw(Rj, Rj, Ri); 4683 subw(Rj, Rj, 1); 4684 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 4685 } block_comment(" } // j"); 4686 4687 post2(Ri, Rlen); 4688 addw(Ri, Ri, 1); 4689 cmpw(Ri, Rlen, Assembler::LSL, 1); 4690 br(Assembler::LT, loop); 4691 bind(end); 4692 } 4693 block_comment("} // i"); 4694 4695 normalize(Rlen); 4696 4697 mov(Ra, Pm_base); // Save Pm_base in Ra 4698 restore_regs(); // Restore caller's Pm_base 4699 4700 // Copy our result into caller's Pm_base 4701 reverse(Pm_base, Ra, Rlen, t0, t1); 4702 4703 leave(); 4704 bind(nothing); 4705 ret(lr); 4706 4707 return entry; 4708 } 4709 // In C, approximately: 4710 4711 // void 4712 // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[], 4713 // unsigned long Pn_base[], unsigned long 
Pm_base[], 4714 // unsigned long inv, int len) { 4715 // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 4716 // unsigned long *Pa, *Pb, *Pn, *Pm; 4717 // unsigned long Ra, Rb, Rn, Rm; 4718 4719 // int i; 4720 4721 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 4722 4723 // for (i = 0; i < len; i++) { 4724 // int j; 4725 4726 // Pa = Pa_base; 4727 // Pb = Pb_base + i; 4728 // Pm = Pm_base; 4729 // Pn = Pn_base + i; 4730 4731 // Ra = *Pa; 4732 // Rb = *Pb; 4733 // Rm = *Pm; 4734 // Rn = *Pn; 4735 4736 // int iters = i; 4737 // for (j = 0; iters--; j++) { 4738 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 4739 // MACC(Ra, Rb, t0, t1, t2); 4740 // Ra = *++Pa; 4741 // Rb = *--Pb; 4742 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 4743 // MACC(Rm, Rn, t0, t1, t2); 4744 // Rm = *++Pm; 4745 // Rn = *--Pn; 4746 // } 4747 4748 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 4749 // MACC(Ra, Rb, t0, t1, t2); 4750 // *Pm = Rm = t0 * inv; 4751 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 4752 // MACC(Rm, Rn, t0, t1, t2); 4753 4754 // assert(t0 == 0, "broken Montgomery multiply"); 4755 4756 // t0 = t1; t1 = t2; t2 = 0; 4757 // } 4758 4759 // for (i = len; i < 2*len; i++) { 4760 // int j; 4761 4762 // Pa = Pa_base + i-len; 4763 // Pb = Pb_base + len; 4764 // Pm = Pm_base + i-len; 4765 // Pn = Pn_base + len; 4766 4767 // Ra = *++Pa; 4768 // Rb = *--Pb; 4769 // Rm = *++Pm; 4770 // Rn = *--Pn; 4771 4772 // int iters = len*2-i-1; 4773 // for (j = i-len+1; iters--; j++) { 4774 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 4775 // MACC(Ra, Rb, t0, t1, t2); 4776 // Ra = *++Pa; 4777 // Rb = *--Pb; 4778 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 4779 // MACC(Rm, Rn, t0, t1, t2); 4780 // Rm = *++Pm; 4781 // Rn = *--Pn; 4782 // } 4783 4784 // Pm_base[i-len] = t0; 4785 // t0 = t1; t1 = t2; t2 = 0; 4786 // } 4787 4788 // while (t0) 4789 // t0 = 
sub(Pm_base, Pn_base, t0, len); 4790 // } 4791 4792 /** 4793 * Fast Montgomery squaring. This uses asymptotically 25% fewer 4794 * multiplies than Montgomery multiplication so it should be up to 4795 * 25% faster. However, its loop control is more complex and it 4796 * may actually run slower on some machines. 4797 * 4798 * Arguments: 4799 * 4800 * Inputs: 4801 * c_rarg0 - int array elements a 4802 * c_rarg1 - int array elements n (the modulus) 4803 * c_rarg2 - int length 4804 * c_rarg3 - int inv 4805 * c_rarg4 - int array elements m (the result) 4806 * 4807 */ 4808 address generate_square() { 4809 Label argh; 4810 bind(argh); 4811 stop("MontgomeryMultiply total_allocation must be <= 8192"); 4812 4813 align(CodeEntryAlignment); 4814 address entry = pc(); 4815 4816 enter(); 4817 4818 // Make room. 4819 cmpw(Rlen, 512); 4820 br(Assembler::HI, argh); 4821 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 4822 andr(sp, Ra, -2 * wordSize); 4823 4824 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 4825 4826 { 4827 // Copy input args, reversing as we go. We use Ra as a 4828 // temporary variable. 4829 reverse(Ra, Pa_base, Rlen, t0, t1); 4830 reverse(Ra, Pn_base, Rlen, t0, t1); 4831 } 4832 4833 // Push all call-saved registers and also Pm_base which we'll need 4834 // at the end. 
4835 save_regs(); 4836 4837 mov(Pm_base, Ra); 4838 4839 mov(t0, zr); 4840 mov(t1, zr); 4841 mov(t2, zr); 4842 4843 block_comment("for (int i = 0; i < len; i++) {"); 4844 mov(Ri, zr); { 4845 Label loop, end; 4846 bind(loop); 4847 cmp(Ri, Rlen); 4848 br(Assembler::GE, end); 4849 4850 pre1(Ri); 4851 4852 block_comment("for (j = (i+1)/2; j; j--) {"); { 4853 add(Rj, Ri, 1); 4854 lsr(Rj, Rj, 1); 4855 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 4856 } block_comment(" } // j"); 4857 4858 last_squaring(Ri); 4859 4860 block_comment(" for (j = i/2; j; j--) {"); { 4861 lsr(Rj, Ri, 1); 4862 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 4863 } block_comment(" } // j"); 4864 4865 post1_squaring(); 4866 add(Ri, Ri, 1); 4867 cmp(Ri, Rlen); 4868 br(Assembler::LT, loop); 4869 4870 bind(end); 4871 block_comment("} // i"); 4872 } 4873 4874 block_comment("for (int i = len; i < 2*len; i++) {"); 4875 mov(Ri, Rlen); { 4876 Label loop, end; 4877 bind(loop); 4878 cmp(Ri, Rlen, Assembler::LSL, 1); 4879 br(Assembler::GE, end); 4880 4881 pre2(Ri, Rlen); 4882 4883 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 4884 lsl(Rj, Rlen, 1); 4885 sub(Rj, Rj, Ri); 4886 sub(Rj, Rj, 1); 4887 lsr(Rj, Rj, 1); 4888 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 4889 } block_comment(" } // j"); 4890 4891 last_squaring(Ri); 4892 4893 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 4894 lsl(Rj, Rlen, 1); 4895 sub(Rj, Rj, Ri); 4896 lsr(Rj, Rj, 1); 4897 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 4898 } block_comment(" } // j"); 4899 4900 post2(Ri, Rlen); 4901 add(Ri, Ri, 1); 4902 cmp(Ri, Rlen, Assembler::LSL, 1); 4903 4904 br(Assembler::LT, loop); 4905 bind(end); 4906 block_comment("} // i"); 4907 } 4908 4909 normalize(Rlen); 4910 4911 mov(Ra, Pm_base); // Save Pm_base in Ra 4912 restore_regs(); // Restore caller's Pm_base 4913 4914 // Copy our result into caller's Pm_base 4915 reverse(Pm_base, Ra, Rlen, t0, t1); 4916 4917 leave(); 
4918 ret(lr); 4919 4920 return entry; 4921 } 4922 // In C, approximately: 4923 4924 // void 4925 // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[], 4926 // unsigned long Pm_base[], unsigned long inv, int len) { 4927 // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 4928 // unsigned long *Pa, *Pb, *Pn, *Pm; 4929 // unsigned long Ra, Rb, Rn, Rm; 4930 4931 // int i; 4932 4933 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 4934 4935 // for (i = 0; i < len; i++) { 4936 // int j; 4937 4938 // Pa = Pa_base; 4939 // Pb = Pa_base + i; 4940 // Pm = Pm_base; 4941 // Pn = Pn_base + i; 4942 4943 // Ra = *Pa; 4944 // Rb = *Pb; 4945 // Rm = *Pm; 4946 // Rn = *Pn; 4947 4948 // int iters = (i+1)/2; 4949 // for (j = 0; iters--; j++) { 4950 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 4951 // MACC2(Ra, Rb, t0, t1, t2); 4952 // Ra = *++Pa; 4953 // Rb = *--Pb; 4954 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 4955 // MACC(Rm, Rn, t0, t1, t2); 4956 // Rm = *++Pm; 4957 // Rn = *--Pn; 4958 // } 4959 // if ((i & 1) == 0) { 4960 // assert(Ra == Pa_base[j], "must be"); 4961 // MACC(Ra, Ra, t0, t1, t2); 4962 // } 4963 // iters = i/2; 4964 // assert(iters == i-j, "must be"); 4965 // for (; iters--; j++) { 4966 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 4967 // MACC(Rm, Rn, t0, t1, t2); 4968 // Rm = *++Pm; 4969 // Rn = *--Pn; 4970 // } 4971 4972 // *Pm = Rm = t0 * inv; 4973 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 4974 // MACC(Rm, Rn, t0, t1, t2); 4975 4976 // assert(t0 == 0, "broken Montgomery multiply"); 4977 4978 // t0 = t1; t1 = t2; t2 = 0; 4979 // } 4980 4981 // for (i = len; i < 2*len; i++) { 4982 // int start = i-len+1; 4983 // int end = start + (len - start)/2; 4984 // int j; 4985 4986 // Pa = Pa_base + i-len; 4987 // Pb = Pa_base + len; 4988 // Pm = Pm_base + i-len; 4989 // Pn = Pn_base + len; 4990 4991 // Ra = *++Pa; 4992 // Rb = *--Pb; 4993 
  //     Rm = *++Pm;
  //     Rn = *--Pn;

  //     int iters = (2*len-i-1)/2;
  //     assert(iters == end-start, "must be");
  //     for (j = start; iters--; j++) {
  //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
  //       MACC2(Ra, Rb, t0, t1, t2);
  //       Ra = *++Pa;
  //       Rb = *--Pb;
  //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
  //       MACC(Rm, Rn, t0, t1, t2);
  //       Rm = *++Pm;
  //       Rn = *--Pn;
  //     }
  //     if ((i & 1) == 0) {
  //       assert(Ra == Pa_base[j], "must be");
  //       MACC(Ra, Ra, t0, t1, t2);
  //     }
  //     iters = (2*len-i)/2;
  //     assert(iters == len-j, "must be");
  //     for (; iters--; j++) {
  //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
  //       MACC(Rm, Rn, t0, t1, t2);
  //       Rm = *++Pm;
  //       Rn = *--Pn;
  //     }
  //     Pm_base[i-len] = t0;
  //     t0 = t1; t1 = t2; t2 = 0;
  //   }

  //   while (t0)
  //     t0 = sub(Pm_base, Pn_base, t0, len);
  // }
  }; // end of MontgomeryMultiplyGenerator (instantiated in generate_all() below)

  // Initialization
  //
  // Stub generation happens in two phases (selected by the 'all' flag
  // passed to the constructor below): generate_initial() emits the stubs
  // needed early in VM startup, generate_all() emits the rest later.
  // Each generate_* helper emits machine code into the shared CodeBuffer
  // and returns its entry address, which is stored into the matching
  // StubRoutines static so the rest of the VM can reach the stub.
  void generate_initial() {
    // Generate initial stubs and initialize the entry points.

    // Entry points that exist in all platforms.  Note: this is code
    // that could be shared among different platforms - however the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also comment in
    // stubRoutines.hpp.

    StubRoutines::_forward_exception_entry = generate_forward_exception();

    StubRoutines::_call_stub_entry =
      generate_call_stub(StubRoutines::_call_stub_return_address);

    // is referenced by megamorphic call
    StubRoutines::_catch_exception_entry = generate_catch_exception();

    // Build this early so it's available for the interpreter.
    StubRoutines::_throw_StackOverflowError_entry =
      generate_throw_exception("StackOverflowError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::throw_StackOverflowError));
    StubRoutines::_throw_delayed_StackOverflowError_entry =
      generate_throw_exception("delayed StackOverflowError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::throw_delayed_StackOverflowError));
    if (UseCRC32Intrinsics) {
      // Set the table address before stub generation, which uses it.
      StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
    }

    if (UseCRC32CIntrinsics) {
      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
    }
  }

  // Second-phase generation: stubs that must wait for fuller VM
  // initialization (see the verify_oop note below) plus the optional
  // intrinsic stubs, each gated by its Use*Intrinsic(s) flag.
  void generate_all() {
    // support for verify_oop (must happen after universe_init)
    StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();

    StubRoutines::_throw_AbstractMethodError_entry =
      generate_throw_exception("AbstractMethodError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_AbstractMethodError));

    StubRoutines::_throw_IncompatibleClassChangeError_entry =
      generate_throw_exception("IncompatibleClassChangeError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_IncompatibleClassChangeError));

    StubRoutines::_throw_NullPointerException_at_call_entry =
      generate_throw_exception("NullPointerException at call throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_NullPointerException_at_call));

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    // has negatives stub for large arrays.
    StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);

    // array equals stubs for large arrays.
    StubRoutines::aarch64::_large_array_equals_byte = generate_large_array_equals_byte();
    StubRoutines::aarch64::_large_array_equals_char = generate_large_array_equals_char();

    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }

    if (UseSquareToLenIntrinsic) {
      StubRoutines::_squareToLen = generate_squareToLen();
    }

    if (UseMulAddIntrinsic) {
      StubRoutines::_mulAdd = generate_mulAdd();
    }

    if (UseMontgomeryMultiplyIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }

    if (UseMontgomerySquareIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      // We use generate_multiply() rather than generate_square()
      // because it's faster for the sizes of modulus we care about.
      StubRoutines::_montgomerySquare = g.generate_multiply();
    }

    // Crypto, Adler32 and SafeFetch stubs are not generated for the
    // builtin-simulator build.
#ifndef BUILTIN_SIM
    // generate GHASH intrinsics code
    if (UseGHASHIntrinsics) {
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
    }

    if (UseAESIntrinsics) {
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
    }

    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_implCompress   = generate_sha1_implCompress(false, "sha1_implCompress");
      StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true,  "sha1_implCompressMB");
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
    }

    // generate Adler32 intrinsics code
    if (UseAdler32Intrinsics) {
      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
    }

    // Safefetch stubs.
    generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
                       &StubRoutines::_safefetch32_fault_pc,
                       &StubRoutines::_safefetch32_continuation_pc);
    generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
                       &StubRoutines::_safefetchN_fault_pc,
                       &StubRoutines::_safefetchN_continuation_pc);
#endif
    StubRoutines::aarch64::set_completed();
  }

 public:
  // all == false: first-phase call, emits only the initial stubs;
  // all == true:  second-phase call, emits the remaining stubs.
  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
    if (all) {
      generate_all();
    } else {
      generate_initial();
    }
  }
}; // end class declaration

// File-level entry point: runs the stub generator over the given
// code buffer; all stub entry addresses are published via the
// StubRoutines statics as a side effect of construction.
void StubGenerator_generate(CodeBuffer* code, bool all) {
  StubGenerator g(code, all);
}