1 /* 2 * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 
23 * 24 */ 25 26 #include "precompiled.hpp" 27 #include "asm/macroAssembler.hpp" 28 #include "asm/macroAssembler.inline.hpp" 29 #include "interpreter/interpreter.hpp" 30 #include "nativeInst_aarch64.hpp" 31 #include "oops/instanceOop.hpp" 32 #include "oops/method.hpp" 33 #include "oops/objArrayKlass.hpp" 34 #include "oops/oop.inline.hpp" 35 #include "prims/methodHandles.hpp" 36 #include "runtime/frame.inline.hpp" 37 #include "runtime/handles.inline.hpp" 38 #include "runtime/sharedRuntime.hpp" 39 #include "runtime/stubCodeGenerator.hpp" 40 #include "runtime/stubRoutines.hpp" 41 #include "runtime/thread.inline.hpp" 42 #include "utilities/align.hpp" 43 #ifdef COMPILER2 44 #include "opto/runtime.hpp" 45 #endif 46 47 #ifdef BUILTIN_SIM 48 #include "../../../../../../simulator/simulator.hpp" 49 #endif 50 51 // Declaration and definition of StubGenerator (no .hpp file). 52 // For a more detailed description of the stub routine structure 53 // see the comment in stubRoutines.hpp 54 55 #undef __ 56 #define __ _masm-> 57 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 
4 : 8))

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  // Emit code that bumps the given 32-bit in-memory counter by one.
  // This is a plain load/add/store sequence with no atomic update, so
  // concurrent increments can be lost; the counters are diagnostic
  // statistics only.  Clobbers rscratch1 and rscratch2.
  void inc_counter_np_(int& counter) {
    __ lea(rscratch2, ExternalAddress((address)&counter));
    __ ldrw(rscratch1, Address(rscratch2));
    __ addw(rscratch1, rscratch1, 1);
    __ strw(rscratch1, Address(rscratch2));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    c_rarg6:   parameter size (in words)              int
  //    c_rarg7:   thread                                 Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the c arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //     ...
  // -27 [ argument word 1      ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper    (r0) ]
  //  -7 [ result          (r1) ]
  //  -6 [ result type     (r2) ]
  //  -5 [ method          (r3) ]
  //  -4 [ entry point     (r4) ]
  //  -3 [ parameters      (r5) ]
  //  -2 [ parameter size  (r6) ]
  //  -1 [ thread          (r7) ]
  //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  //   1 [ saved lr       (r30) ]

  // Call stub stack layout word offsets from fp.
  // n.b. registers are saved in adjacent pairs with stp/stpd below, so
  // only the lower-addressed member of each pair needs a named offset
  // here (e.g. r20_save covers both r20 at -10 and r19 at -9).
  enum call_stub_layout {
    sp_after_call_off = -26,

    d15_off = -26,
    d13_off = -24,
    d11_off = -22,
    d9_off = -20,

    r28_off = -18,
    r26_off = -16,
    r24_off = -14,
    r22_off = -12,
    r20_off = -10,
    call_wrapper_off = -8,
    result_off = -7,
    result_type_off = -6,
    method_off = -5,
    entry_point_off = -4,
    parameter_size_off = -2,
    thread_off = -1,
    fp_f = 0,
    retaddr_off = 1,
  };

  // Generate the call stub used by JavaCalls to enter Java from C.
  // Saves the C callee-save state per the layout above, installs
  // rthread/rmethod, copies the Java parameters onto the stack, calls
  // the entry point, then stores the result according to result_type
  // and restores the saved state.  return_address (out) receives the
  // pc that Java code returns to; it is also the resume point used by
  // generate_catch_exception().
  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    const Address sp_after_call(rfp, sp_after_call_off * wordSize);

    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);

    // stub code

    // we need a C prolog to bootstrap the x86 caller into the sim
    __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);

    address aarch64_entry = __ pc();

#ifdef BUILTIN_SIM
    // Save sender's SP for stack traces.
    __ mov(rscratch1, sp);
    __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
#endif
    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    // (arguments are stored as stp pairs matching the layout comment:
    //  e.g. c_rarg4/c_rarg5 land in the entry_point/parameters slots)
    __ str(c_rarg7, thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5, entry_point);
    __ stp(c_rarg2, c_rarg3, result_type);
    __ stp(c_rarg0, c_rarg1, call_wrapper);

    __ stp(r20, r19, r20_save);
    __ stp(r22, r21, r22_save);
    __ stp(r24, r23, r24_save);
    __ stp(r26, r25, r26_save);
    __ stp(r28, r27, r28_save);

    __ stpd(v9, v8, d9_save);
    __ stpd(v11, v10, d11_save);
    __ stpd(v13, v12, d13_save);
    __ stpd(v15, v14, d15_save);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (unsigned)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize); // keep sp 16-byte aligned

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    // push parameters one word at a time, decrementing the count
    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing methdoOop, and current sp
    //      rmethod: Method*
    //      r13: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r13, sp);
    __ blr(c_rarg4);

    // tell the simulator we have returned to the stub

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    if (NotifySimulator) {
      __ notify(Assembler::method_reentry);
    }
    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    __ ldr(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ldr(j_rarg1, result_type);
    // a T_OBJECT result is stored with the same 64-bit str as T_LONG,
    // so both types branch to is_long
    __ cmp(j_rarg1, T_OBJECT);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(j_rarg1, T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    // restore callee-save registers
    __ ldpd(v15, v14, d15_save);
    __ ldpd(v13, v12, d13_save);
    __ ldpd(v11, v10, d11_save);
    __ ldpd(v9, v8, d9_save);

    __ ldp(r28, r27, r28_save);
    __ ldp(r26, r25, r26_save);
    __ ldp(r24, r23, r24_save);
    __ ldp(r22, r21, r22_save);
    __ ldp(r20, r19, r20_save);

    // reload the original C arguments from their frame slots
    __ ldp(c_rarg0, c_rarg1, call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3, method);
    __ ldp(c_rarg4, c_rarg5, entry_point);
    __ ldp(c_rarg6, c_rarg7, parameter_size);

#ifndef PRODUCT
    // tell the simulator we are about to end Java execution
    if (NotifySimulator) {
      __ notify(Assembler::method_exit);
    }
#endif
    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT

    __ BIND(is_long);
    __ str(r0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // rsp.
  //
  // r0: exception oop

  // NOTE: this is used as a target from the signal handler so it
  // needs an x86 prolog which returns into the current simulator
  // executing the generated catch_exception code. so the prolog
  // needs to install rax in a sim register and adjust the sim's
  // restart pc to enter the generated code at the start position
  // then return from native to simulated execution.

  // Generate the stub that catches an exception thrown during a Java
  // call made via generate_call_stub().  Records the exception oop
  // (in r0 on entry) plus this file/line as the thread's pending
  // exception, then jumps back to the call stub's saved return
  // address to complete the normal return-to-VM path.
  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread        (rfp, thread_off         * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    // record where the exception was raised (this stub's source location)
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog

  // Forward a pending exception to the Java-level handler: asks the
  // VM for the handler matching the throwing pc (in LR on entry),
  // clears the thread's pending exception, and branches to the
  // handler with r0 = exception oop and r3 = throwing pc.
  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then calls into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    // (the VM call above left the handler address in r0)
    __ mov(r3, r19);                        // r3  <- throwing pc
    __ mov(r19, r0);                        // r19 <- handler address
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {

    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // bump the verify_oop counter (non-atomic; stats only)
    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is NULL it is OK

    // Check if the oop is in the right area of memory:
    // (oop & verify_oop_mask) must equal verify_oop_bits
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
    __ andr(c_rarg2, r0, c_rarg3);
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());

    // Compare c_rarg2 and c_rarg3.  We don't use a compare
    // instruction here because the flags register is live.
    __ eor(c_rarg2, c_rarg2, c_rarg3);
    __ cbnz(c_rarg2, error);

    // make sure klass is 'reasonable', which is not zero.
    __ load_klass(r0, r0); // get klass
    __ cbz(r0, error);     // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    // save all GPRs so debug64 can dump them
    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blrt(rscratch1, 3, 0, 1);

    return start;
  }

  // Emits an unconditional branch to L_no_overlap, i.e. no runtime
  // overlap test is generated on this port.
  // NOTE(review): the `sf` parameter is unused here; presumably the
  // conjoint copy stubs handle overlap by copy direction instead —
  // confirm against the callers before relying on this.
  void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }

  // Generate code for an array write pre barrier
  //
  //     addr     - starting address
  //     count    - element count
  //     tmp      - scratch register
  //     saved_regs - registers to be saved before calling static_write_ref_array_pre
  //
  //     Callers must specify which registers to preserve in saved_regs.
  //     Clobbers: r0-r18, v0-v7, v16-v31, except saved_regs.
630 // 631 void gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized, RegSet saved_regs) { 632 BarrierSet* bs = Universe::heap()->barrier_set(); 633 switch (bs->kind()) { 634 case BarrierSet::G1SATBCTLogging: 635 // With G1, don't generate the call if we statically know that the target in uninitialized 636 if (!dest_uninitialized) { 637 __ push(saved_regs, sp); 638 if (count == c_rarg0) { 639 if (addr == c_rarg1) { 640 // exactly backwards!! 641 __ mov(rscratch1, c_rarg0); 642 __ mov(c_rarg0, c_rarg1); 643 __ mov(c_rarg1, rscratch1); 644 } else { 645 __ mov(c_rarg1, count); 646 __ mov(c_rarg0, addr); 647 } 648 } else { 649 __ mov(c_rarg0, addr); 650 __ mov(c_rarg1, count); 651 } 652 __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2); 653 __ pop(saved_regs, sp); 654 break; 655 case BarrierSet::CardTableForRS: 656 case BarrierSet::CardTableExtension: 657 case BarrierSet::ModRef: 658 break; 659 default: 660 ShouldNotReachHere(); 661 662 } 663 } 664 } 665 666 // 667 // Generate code for an array write post barrier 668 // 669 // Input: 670 // start - register containing starting address of destination array 671 // end - register containing ending address of destination array 672 // scratch - scratch register 673 // saved_regs - registers to be saved before calling static_write_ref_array_post 674 // 675 // The input registers are overwritten. 676 // The ending address is inclusive. 677 // Callers must specify which registers to preserve in saved_regs. 678 // Clobbers: r0-r18, v0-v7, v16-v31, except saved_regs. 
  // Emit the GC post-barrier for a reference-array write covering the
  // inclusive address range [start, end].  G1 calls out to
  // static_write_ref_array_post with a recomputed element count;
  // card-table collectors dirty the covered card bytes inline.
  void gen_write_ref_array_post_barrier(Register start, Register end, Register scratch, RegSet saved_regs) {
    assert_different_registers(start, end, scratch);
    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
    case BarrierSet::G1SATBCTLogging:
      {
        __ push(saved_regs, sp);
        // must compute element count unless barrier set interface is changed (other platforms supply count)
        assert_different_registers(start, end, scratch);
        // scratch = (end + oopSize - start) >> log2(oopSize), i.e. the
        // element count for the inclusive range [start, end]
        __ lea(scratch, Address(end, BytesPerHeapOop));
        __ sub(scratch, scratch, start);               // subtract start to get #bytes
        __ lsr(scratch, scratch, LogBytesPerHeapOop);  // convert to element count
        __ mov(c_rarg0, start);
        __ mov(c_rarg1, scratch);
        __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
        __ pop(saved_regs, sp);
      }
      break;
    case BarrierSet::CardTableForRS:
    case BarrierSet::CardTableExtension:
      {
        CardTableModRefBS* ct = (CardTableModRefBS*)bs;
        assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");

        Label L_loop;

        // convert [start, end] addresses to card indices
        __ lsr(start, start, CardTableModRefBS::card_shift);
        __ lsr(end, end, CardTableModRefBS::card_shift);
        __ sub(end, end, start); // number of bytes to copy

        const Register count = end; // 'end' register contains bytes count now
        __ load_byte_map_base(scratch);
        __ add(start, start, scratch);
        if (UseConcMarkSweepGC) {
          // CMS requires the stores above to be visible before the
          // card marks below
          __ membar(__ StoreStore);
        }
        // dirty each card byte from index `count` down to 0 inclusive
        __ BIND(L_loop);
        __ strb(zr, Address(start, count));
        __ subs(count, count, 1);
        __ br(Assembler::GE, L_loop);
      }
      break;
    default:
      ShouldNotReachHere();

    }
  }

  // The inner part of zero_words().  This is the bulk operation,
  // zeroing words in blocks, possibly using DC ZVA to do it.  The
  // caller is responsible for zeroing the last few words.
  //
  // Inputs:
  // r10: the HeapWord-aligned base address of an array to zero.
  // r11: the count in HeapWords, r11 > 0.
  //
  // Returns r10 and r11, adjusted for the caller to clear.
  // r10: the base address of the tail of words left to clear.
  // r11: the number of words in the tail.
  //      r11 < MacroAssembler::zero_words_block_size.

  address generate_zero_blocks() {
    Label store_pair, loop_store_pair, done;
    Label base_aligned;

    Register base = r10, cnt = r11;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "zero_blocks");
    address start = __ pc();

    if (UseBlockZeroing) {
      int zva_length = VM_Version::zva_length();

      // Ensure ZVA length can be divided by 16. This is required by
      // the subsequent operations.
      assert (zva_length % 16 == 0, "Unexpected ZVA Length");

      // align base to 16 bytes: if bit 3 is set, store one word and
      // advance, adjusting the remaining count
      __ tbz(base, 3, base_aligned);
      __ str(zr, Address(__ post(base, 8)));
      __ sub(cnt, cnt, 1);
      __ bind(base_aligned);

      // Ensure count >= zva_length * 2 so that it still deserves a zva after
      // alignment.
      Label small;
      int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
      __ subs(rscratch1, cnt, low_limit >> 3);
      __ br(Assembler::LT, small);
      __ zero_dcache_blocks(base, cnt);
      __ bind(small);
    }

    {
      // Number of stp instructions we'll unroll
      const int unroll =
        MacroAssembler::zero_words_block_size / 2;
      // Clear the remaining blocks: each iteration stores
      // unroll * 2 words (one stp pair per unrolled instruction).
      Label loop;
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::LT, done);
      __ bind(loop);
      for (int i = 0; i < unroll; i++)
        __ stp(zr, zr, __ post(base, 16));
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::GE, loop);
      __ bind(done);
      // undo the final over-subtraction so cnt holds the tail count
      __ add(cnt, cnt, unroll * 2);
    }

    __ ret(lr);

    return start;
  }


  // Direction multiplier for the block copy routines: word offsets
  // are scaled by +1 (forwards) or -1 (backwards).
  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
804 // 805 // Precondition: count >= 8 806 // 807 // Postconditions: 808 // 809 // The least significant bit of count contains the remaining count 810 // of words to copy. The rest of count is trash. 811 // 812 // s and d are adjusted to point to the remaining words to copy 813 // 814 void generate_copy_longs(Label &start, Register s, Register d, Register count, 815 copy_direction direction) { 816 int unit = wordSize * direction; 817 int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize; 818 819 int offset; 820 const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6, 821 t4 = r7, t5 = r10, t6 = r11, t7 = r12; 822 const Register stride = r13; 823 824 assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7); 825 assert_different_registers(s, d, count, rscratch1); 826 827 Label again, drain; 828 const char *stub_name; 829 if (direction == copy_forwards) 830 stub_name = "forward_copy_longs"; 831 else 832 stub_name = "backward_copy_longs"; 833 StubCodeMark mark(this, "StubRoutines", stub_name); 834 __ align(CodeEntryAlignment); 835 __ bind(start); 836 837 Label unaligned_copy_long; 838 if (AvoidUnalignedAccesses) { 839 __ tbnz(d, 3, unaligned_copy_long); 840 } 841 842 if (direction == copy_forwards) { 843 __ sub(s, s, bias); 844 __ sub(d, d, bias); 845 } 846 847 #ifdef ASSERT 848 // Make sure we are never given < 8 words 849 { 850 Label L; 851 __ cmp(count, 8); 852 __ br(Assembler::GE, L); 853 __ stop("genrate_copy_longs called with < 8 words"); 854 __ bind(L); 855 } 856 #endif 857 858 // Fill 8 registers 859 if (UseSIMDForMemoryOps) { 860 __ ldpq(v0, v1, Address(s, 4 * unit)); 861 __ ldpq(v2, v3, Address(__ pre(s, 8 * unit))); 862 } else { 863 __ ldp(t0, t1, Address(s, 2 * unit)); 864 __ ldp(t2, t3, Address(s, 4 * unit)); 865 __ ldp(t4, t5, Address(s, 6 * unit)); 866 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 867 } 868 869 __ subs(count, count, 16); 870 __ br(Assembler::LO, drain); 871 872 int prefetch = PrefetchCopyIntervalInBytes; 873 bool use_stride = false; 
874 if (direction == copy_backwards) { 875 use_stride = prefetch > 256; 876 prefetch = -prefetch; 877 if (use_stride) __ mov(stride, prefetch); 878 } 879 880 __ bind(again); 881 882 if (PrefetchCopyIntervalInBytes > 0) 883 __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP); 884 885 if (UseSIMDForMemoryOps) { 886 __ stpq(v0, v1, Address(d, 4 * unit)); 887 __ ldpq(v0, v1, Address(s, 4 * unit)); 888 __ stpq(v2, v3, Address(__ pre(d, 8 * unit))); 889 __ ldpq(v2, v3, Address(__ pre(s, 8 * unit))); 890 } else { 891 __ stp(t0, t1, Address(d, 2 * unit)); 892 __ ldp(t0, t1, Address(s, 2 * unit)); 893 __ stp(t2, t3, Address(d, 4 * unit)); 894 __ ldp(t2, t3, Address(s, 4 * unit)); 895 __ stp(t4, t5, Address(d, 6 * unit)); 896 __ ldp(t4, t5, Address(s, 6 * unit)); 897 __ stp(t6, t7, Address(__ pre(d, 8 * unit))); 898 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 899 } 900 901 __ subs(count, count, 8); 902 __ br(Assembler::HS, again); 903 904 // Drain 905 __ bind(drain); 906 if (UseSIMDForMemoryOps) { 907 __ stpq(v0, v1, Address(d, 4 * unit)); 908 __ stpq(v2, v3, Address(__ pre(d, 8 * unit))); 909 } else { 910 __ stp(t0, t1, Address(d, 2 * unit)); 911 __ stp(t2, t3, Address(d, 4 * unit)); 912 __ stp(t4, t5, Address(d, 6 * unit)); 913 __ stp(t6, t7, Address(__ pre(d, 8 * unit))); 914 } 915 916 { 917 Label L1, L2; 918 __ tbz(count, exact_log2(4), L1); 919 if (UseSIMDForMemoryOps) { 920 __ ldpq(v0, v1, Address(__ pre(s, 4 * unit))); 921 __ stpq(v0, v1, Address(__ pre(d, 4 * unit))); 922 } else { 923 __ ldp(t0, t1, Address(s, 2 * unit)); 924 __ ldp(t2, t3, Address(__ pre(s, 4 * unit))); 925 __ stp(t0, t1, Address(d, 2 * unit)); 926 __ stp(t2, t3, Address(__ pre(d, 4 * unit))); 927 } 928 __ bind(L1); 929 930 if (direction == copy_forwards) { 931 __ add(s, s, bias); 932 __ add(d, d, bias); 933 } 934 935 __ tbz(count, 1, L2); 936 __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards))); 937 __ stp(t0, t1, Address(__ adjust(d, 2 * unit, 
direction == copy_backwards))); 938 __ bind(L2); 939 } 940 941 __ ret(lr); 942 943 if (AvoidUnalignedAccesses) { 944 Label drain, again; 945 // Register order for storing. Order is different for backward copy. 946 947 __ bind(unaligned_copy_long); 948 949 // source address is even aligned, target odd aligned 950 // 951 // when forward copying word pairs we read long pairs at offsets 952 // {0, 2, 4, 6} (in long words). when backwards copying we read 953 // long pairs at offsets {-2, -4, -6, -8}. We adjust the source 954 // address by -2 in the forwards case so we can compute the 955 // source offsets for both as {2, 4, 6, 8} * unit where unit = 1 956 // or -1. 957 // 958 // when forward copying we need to store 1 word, 3 pairs and 959 // then 1 word at offsets {0, 1, 3, 5, 7}. Rather thna use a 960 // zero offset We adjust the destination by -1 which means we 961 // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores. 962 // 963 // When backwards copyng we need to store 1 word, 3 pairs and 964 // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use 965 // offsets {1, 3, 5, 7, 8} * unit. 966 967 if (direction == copy_forwards) { 968 __ sub(s, s, 16); 969 __ sub(d, d, 8); 970 } 971 972 // Fill 8 registers 973 // 974 // for forwards copy s was offset by -16 from the original input 975 // value of s so the register contents are at these offsets 976 // relative to the 64 bit block addressed by that original input 977 // and so on for each successive 64 byte block when s is updated 978 // 979 // t0 at offset 0, t1 at offset 8 980 // t2 at offset 16, t3 at offset 24 981 // t4 at offset 32, t5 at offset 40 982 // t6 at offset 48, t7 at offset 56 983 984 // for backwards copy s was not offset so the register contents 985 // are at these offsets into the preceding 64 byte block 986 // relative to that original input and so on for each successive 987 // preceding 64 byte block when s is updated. 
this explains the 988 // slightly counter-intuitive looking pattern of register usage 989 // in the stp instructions for backwards copy. 990 // 991 // t0 at offset -16, t1 at offset -8 992 // t2 at offset -32, t3 at offset -24 993 // t4 at offset -48, t5 at offset -40 994 // t6 at offset -64, t7 at offset -56 995 996 __ ldp(t0, t1, Address(s, 2 * unit)); 997 __ ldp(t2, t3, Address(s, 4 * unit)); 998 __ ldp(t4, t5, Address(s, 6 * unit)); 999 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 1000 1001 __ subs(count, count, 16); 1002 __ br(Assembler::LO, drain); 1003 1004 int prefetch = PrefetchCopyIntervalInBytes; 1005 bool use_stride = false; 1006 if (direction == copy_backwards) { 1007 use_stride = prefetch > 256; 1008 prefetch = -prefetch; 1009 if (use_stride) __ mov(stride, prefetch); 1010 } 1011 1012 __ bind(again); 1013 1014 if (PrefetchCopyIntervalInBytes > 0) 1015 __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP); 1016 1017 if (direction == copy_forwards) { 1018 // allowing for the offset of -8 the store instructions place 1019 // registers into the target 64 bit block at the following 1020 // offsets 1021 // 1022 // t0 at offset 0 1023 // t1 at offset 8, t2 at offset 16 1024 // t3 at offset 24, t4 at offset 32 1025 // t5 at offset 40, t6 at offset 48 1026 // t7 at offset 56 1027 1028 __ str(t0, Address(d, 1 * unit)); 1029 __ stp(t1, t2, Address(d, 2 * unit)); 1030 __ ldp(t0, t1, Address(s, 2 * unit)); 1031 __ stp(t3, t4, Address(d, 4 * unit)); 1032 __ ldp(t2, t3, Address(s, 4 * unit)); 1033 __ stp(t5, t6, Address(d, 6 * unit)); 1034 __ ldp(t4, t5, Address(s, 6 * unit)); 1035 __ str(t7, Address(__ pre(d, 8 * unit))); 1036 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 1037 } else { 1038 // d was not offset when we started so the registers are 1039 // written into the 64 bit block preceding d with the following 1040 // offsets 1041 // 1042 // t1 at offset -8 1043 // t3 at offset -24, t0 at offset -16 1044 // t5 at offset -48, t2 at offset 
-32 1045 // t7 at offset -56, t4 at offset -48 1046 // t6 at offset -64 1047 // 1048 // note that this matches the offsets previously noted for the 1049 // loads 1050 1051 __ str(t1, Address(d, 1 * unit)); 1052 __ stp(t3, t0, Address(d, 3 * unit)); 1053 __ ldp(t0, t1, Address(s, 2 * unit)); 1054 __ stp(t5, t2, Address(d, 5 * unit)); 1055 __ ldp(t2, t3, Address(s, 4 * unit)); 1056 __ stp(t7, t4, Address(d, 7 * unit)); 1057 __ ldp(t4, t5, Address(s, 6 * unit)); 1058 __ str(t6, Address(__ pre(d, 8 * unit))); 1059 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 1060 } 1061 1062 __ subs(count, count, 8); 1063 __ br(Assembler::HS, again); 1064 1065 // Drain 1066 // 1067 // this uses the same pattern of offsets and register arguments 1068 // as above 1069 __ bind(drain); 1070 if (direction == copy_forwards) { 1071 __ str(t0, Address(d, 1 * unit)); 1072 __ stp(t1, t2, Address(d, 2 * unit)); 1073 __ stp(t3, t4, Address(d, 4 * unit)); 1074 __ stp(t5, t6, Address(d, 6 * unit)); 1075 __ str(t7, Address(__ pre(d, 8 * unit))); 1076 } else { 1077 __ str(t1, Address(d, 1 * unit)); 1078 __ stp(t3, t0, Address(d, 3 * unit)); 1079 __ stp(t5, t2, Address(d, 5 * unit)); 1080 __ stp(t7, t4, Address(d, 7 * unit)); 1081 __ str(t6, Address(__ pre(d, 8 * unit))); 1082 } 1083 // now we need to copy any remaining part block which may 1084 // include a 4 word block subblock and/or a 2 word subblock. 
1085 // bits 2 and 1 in the count are the tell-tale for whetehr we 1086 // have each such subblock 1087 { 1088 Label L1, L2; 1089 __ tbz(count, exact_log2(4), L1); 1090 // this is the same as above but copying only 4 longs hence 1091 // with ony one intervening stp between the str instructions 1092 // but note that the offsets and registers still follow the 1093 // same pattern 1094 __ ldp(t0, t1, Address(s, 2 * unit)); 1095 __ ldp(t2, t3, Address(__ pre(s, 4 * unit))); 1096 if (direction == copy_forwards) { 1097 __ str(t0, Address(d, 1 * unit)); 1098 __ stp(t1, t2, Address(d, 2 * unit)); 1099 __ str(t3, Address(__ pre(d, 4 * unit))); 1100 } else { 1101 __ str(t1, Address(d, 1 * unit)); 1102 __ stp(t3, t0, Address(d, 3 * unit)); 1103 __ str(t2, Address(__ pre(d, 4 * unit))); 1104 } 1105 __ bind(L1); 1106 1107 __ tbz(count, 1, L2); 1108 // this is the same as above but copying only 2 longs hence 1109 // there is no intervening stp between the str instructions 1110 // but note that the offset and register patterns are still 1111 // the same 1112 __ ldp(t0, t1, Address(__ pre(s, 2 * unit))); 1113 if (direction == copy_forwards) { 1114 __ str(t0, Address(d, 1 * unit)); 1115 __ str(t1, Address(__ pre(d, 2 * unit))); 1116 } else { 1117 __ str(t1, Address(d, 1 * unit)); 1118 __ str(t0, Address(__ pre(d, 2 * unit))); 1119 } 1120 __ bind(L2); 1121 1122 // for forwards copy we need to re-adjust the offsets we 1123 // applied so that s and d are follow the last words written 1124 1125 if (direction == copy_forwards) { 1126 __ add(s, s, 16); 1127 __ add(d, d, 8); 1128 } 1129 1130 } 1131 1132 __ ret(lr); 1133 } 1134 } 1135 1136 // Small copy: less than 16 bytes. 1137 // 1138 // NB: Ignores all of the bits of count which represent more than 15 1139 // bytes, so a caller doesn't have to mask them. 
  // Copy a sub-16-byte (sub-two-word) tail, one power-of-two-sized
  // chunk at a time, testing the corresponding bit of count to decide
  // whether each chunk is present.  s and d are advanced as the copy
  // proceeds; tmp is a scratch register.
  void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);          // element size in bytes
    int direction = is_backwards ? -1 : 1;
    int unit = wordSize * direction;          // signed word-sized stride

    Label Lpair, Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.

    // One word (8 bytes), if bit (3 - log2(granularity)) of count is set.
    __ tbz(count, 3 - exact_log2(granularity), Lword);
    __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
    __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
    __ bind(Lword);

    // One int (4 bytes), only possible if elements are int-sized or smaller.
    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    // One short (2 bytes).
    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    // One byte.
    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }

  // Entry points of the shared bulk word-copy loops, bound elsewhere;
  // copy_memory below calls them with __ bl().
  Label copy_f, copy_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //

  void copy_memory(bool is_aligned, Register s, Register d,
                   Register count, Register tmp, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    // <= 96 bytes do inline.  Direction doesn't matter because we always
    // load all the data before writing anything
    // (copy128 is declared but not bound in this routine)
    Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
    const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
    const Register send = r17, dend = r18;  // one-past-the-end of source/dest

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
    __ cmp(count, (UseSIMDForMemoryOps ? 96:80)/granularity);
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, 16/granularity);
    __ br(Assembler::LS, copy16);

    __ cmp(count, 64/granularity);
    __ br(Assembler::HI, copy80);

    __ cmp(count, 32/granularity);
    __ br(Assembler::LS, copy32);

    // 33..64 bytes.  The head is copied from s/d and the (possibly
    // overlapping) tail from send/dend, so all loads happen before
    // any store.
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 0));
      __ ldpq(v2, v3, Address(send, -32));
      __ stpq(v0, v1, Address(d, 0));
      __ stpq(v2, v3, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(send, -32));
      __ ldp(t6, t7, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(dend, -32));
      __ stp(t6, t7, Address(dend, -16));
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    __ ldp(t0, t1, Address(s, 0));
    __ ldp(t2, t3, Address(send, -16));
    __ stp(t0, t1, Address(d, 0));
    __ stp(t2, t3, Address(dend, -16));
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
      __ ldpq(v4, v5, Address(send, -32));
      __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
      __ stpq(v4, v5, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(s, 32));
      __ ldp(t6, t7, Address(s, 48));
      __ ldp(t8, t9, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(d, 32));
      __ stp(t6, t7, Address(d, 48));
      __ stp(t8, t9, Address(dend, -16));
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, 8/granularity);
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    __ ldr(t0, Address(s, 0));
    __ ldr(t1, Address(send, -8));
    __ str(t0, Address(d, 0));
    __ str(t1, Address(dend, -8));
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean in the 1 byte case we load/store the same
          // byte 3 times.
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }

    __ bind(copy_big);
    // For a backwards copy, point s and d one past the last element so
    // that pre-decrement addressing works below.
    if (is_backwards) {
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now we've got the small case out of the way we can align the
    // source address on a 2-word boundary.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
      __ tbz(s, exact_log2(wordSize), aligned);
      __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
      __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
      __ sub(count, count, wordSize/granularity);
    } else {
      if (is_backwards) {
        __ andr(rscratch2, s, 2 * wordSize - 1);
      } else {
        __ neg(rscratch2, s);
        __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
      }
      // rscratch2 is the byte adjustment needed to align s.
      __ cbz(rscratch2, aligned);
      int shift = exact_log2(granularity);
      if (shift)  __ lsr(rscratch2, rscratch2, shift);
      __ sub(count, count, rscratch2);

#if 0
      // ?? This code is only correct for a disjoint copy.  It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, rscratch2);
        __ sub(d, d, rscratch2);
      } else {
        __ add(s, s, rscratch2);
        __ add(d, d, rscratch2);
      }
#else
      copy_memory_small(s, d, rscratch2, rscratch1, step);
#endif
    }

    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes.  Adjust the
    // count and do a bulk copy of words.
    __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
    if (direction == copy_forwards)
      __ bl(copy_f);
    else
      __ bl(copy_b);

    // And the tail.
    copy_memory_small(s, d, count, tmp, step);

    // Bind the small-size labels that were never reached above for
    // large granularities, then fall through to finish.
    if (granularity >= 8) __ bind(copy8);
    if (granularity >= 4) __ bind(copy4);
    __ bind(finish);
  }

  // Debug-only: fill every usable temp register with a recognizable
  // poison value so stale contents are easy to spot in a crash dump.
  void clobber_registers() {
#ifdef ASSERT
    __ mov(rscratch1, (uint64_t)0xdeadbeef);
    __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
    for (Register r = r3; r <= r18; r++)
      if (r != rscratch1) __ mov(r, rscratch1);
#endif
  }

  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers rscratch1 and rscratch2.
1406 void verify_oop_array (size_t size, Register a, Register count, Register temp) { 1407 Label loop, end; 1408 __ mov(rscratch1, a); 1409 __ mov(rscratch2, zr); 1410 __ bind(loop); 1411 __ cmp(rscratch2, count); 1412 __ br(Assembler::HS, end); 1413 if (size == (size_t)wordSize) { 1414 __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1415 __ verify_oop(temp); 1416 } else { 1417 __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1418 __ decode_heap_oop(temp); // calls verify_oop 1419 } 1420 __ add(rscratch2, rscratch2, size); 1421 __ b(loop); 1422 __ bind(end); 1423 } 1424 1425 // Arguments: 1426 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1427 // ignored 1428 // is_oop - true => oop array, so generate store check code 1429 // name - stub name string 1430 // 1431 // Inputs: 1432 // c_rarg0 - source array address 1433 // c_rarg1 - destination array address 1434 // c_rarg2 - element count, treated as ssize_t, can be zero 1435 // 1436 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1437 // the hardware handle it. The two dwords within qwords that span 1438 // cache line boundaries will still be loaded and stored atomicly. 1439 // 1440 // Side Effects: 1441 // disjoint_int_copy_entry is set to the no-overlap entry point 1442 // used by generate_conjoint_int_oop_copy(). 
  //
  // Generate the common body of all disjoint (no-overlap) array copy
  // stubs.  size is the element size in bytes; if is_oop, GC write
  // barriers are emitted around the copy.  Returns the stub's start
  // address; if entry is non-NULL, *entry receives the post-frame-setup
  // entry point.  Always returns 0 in r0 (full copy performed).
  address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
                                 const char *name, bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_reg = RegSet::of(s, d, count);
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    if (is_oop) {
      gen_write_ref_array_pre_barrier(d, count, dest_uninitialized, saved_reg);
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    copy_memory(aligned, s, d, count, rscratch1, size);
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
      gen_write_ref_array_post_barrier(d, count, rscratch1, RegSet());
    }
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      // Let the simulator map this stub's name to its address.
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.
  // The two dwords within qwords that span cache line boundaries will
  // still be loaded and stored atomically.
  //
  // Generate the common body of all conjoint (possibly-overlapping)
  // array copy stubs.  Tail-calls the disjoint stub at nooverlap_target
  // when the ranges cannot overlap destructively; otherwise copies
  // backwards.
  address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
                                 address *entry, const char *name,
                                 bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_regs = RegSet::of(s, d, count);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // use fwd copy when (d-s) above_equal (count*size)
    __ sub(rscratch1, d, s);
    __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
    __ br(Assembler::HS, nooverlap_target);

    if (is_oop) {
      gen_write_ref_array_pre_barrier(d, count, dest_uninitialized, saved_regs);
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    copy_memory(aligned, s, d, count, rscratch1, -size);  // negative step => backwards
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
      gen_write_ref_array_post_barrier(d, count, rscratch1, RegSet());
    }
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      // Let the simulator map this stub's name to its address.
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
array address 1554 // c_rarg2 - element count, treated as ssize_t, can be zero 1555 // 1556 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1557 // we let the hardware handle it. The one to eight bytes within words, 1558 // dwords or qwords that span cache line boundaries will still be loaded 1559 // and stored atomically. 1560 // 1561 // Side Effects: 1562 // disjoint_byte_copy_entry is set to the no-overlap entry point // 1563 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1564 // we let the hardware handle it. The one to eight bytes within words, 1565 // dwords or qwords that span cache line boundaries will still be loaded 1566 // and stored atomically. 1567 // 1568 // Side Effects: 1569 // disjoint_byte_copy_entry is set to the no-overlap entry point 1570 // used by generate_conjoint_byte_copy(). 1571 // 1572 address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) { 1573 const bool not_oop = false; 1574 return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name); 1575 } 1576 1577 // Arguments: 1578 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1579 // ignored 1580 // name - stub name string 1581 // 1582 // Inputs: 1583 // c_rarg0 - source array address 1584 // c_rarg1 - destination array address 1585 // c_rarg2 - element count, treated as ssize_t, can be zero 1586 // 1587 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1588 // we let the hardware handle it. The one to eight bytes within words, 1589 // dwords or qwords that span cache line boundaries will still be loaded 1590 // and stored atomically. 
1591 // 1592 address generate_conjoint_byte_copy(bool aligned, address nooverlap_target, 1593 address* entry, const char *name) { 1594 const bool not_oop = false; 1595 return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name); 1596 } 1597 1598 // Arguments: 1599 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1600 // ignored 1601 // name - stub name string 1602 // 1603 // Inputs: 1604 // c_rarg0 - source array address 1605 // c_rarg1 - destination array address 1606 // c_rarg2 - element count, treated as ssize_t, can be zero 1607 // 1608 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1609 // let the hardware handle it. The two or four words within dwords 1610 // or qwords that span cache line boundaries will still be loaded 1611 // and stored atomically. 1612 // 1613 // Side Effects: 1614 // disjoint_short_copy_entry is set to the no-overlap entry point 1615 // used by generate_conjoint_short_copy(). 1616 // 1617 address generate_disjoint_short_copy(bool aligned, 1618 address* entry, const char *name) { 1619 const bool not_oop = false; 1620 return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name); 1621 } 1622 1623 // Arguments: 1624 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1625 // ignored 1626 // name - stub name string 1627 // 1628 // Inputs: 1629 // c_rarg0 - source array address 1630 // c_rarg1 - destination array address 1631 // c_rarg2 - element count, treated as ssize_t, can be zero 1632 // 1633 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1634 // let the hardware handle it. The two or four words within dwords 1635 // or qwords that span cache line boundaries will still be loaded 1636 // and stored atomically. 
1637 // 1638 address generate_conjoint_short_copy(bool aligned, address nooverlap_target, 1639 address *entry, const char *name) { 1640 const bool not_oop = false; 1641 return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name); 1642 1643 } 1644 // Arguments: 1645 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1646 // ignored 1647 // name - stub name string 1648 // 1649 // Inputs: 1650 // c_rarg0 - source array address 1651 // c_rarg1 - destination array address 1652 // c_rarg2 - element count, treated as ssize_t, can be zero 1653 // 1654 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1655 // the hardware handle it. The two dwords within qwords that span 1656 // cache line boundaries will still be loaded and stored atomicly. 1657 // 1658 // Side Effects: 1659 // disjoint_int_copy_entry is set to the no-overlap entry point 1660 // used by generate_conjoint_int_oop_copy(). 1661 // 1662 address generate_disjoint_int_copy(bool aligned, address *entry, 1663 const char *name, bool dest_uninitialized = false) { 1664 const bool not_oop = false; 1665 return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name); 1666 } 1667 1668 // Arguments: 1669 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1670 // ignored 1671 // name - stub name string 1672 // 1673 // Inputs: 1674 // c_rarg0 - source array address 1675 // c_rarg1 - destination array address 1676 // c_rarg2 - element count, treated as ssize_t, can be zero 1677 // 1678 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1679 // the hardware handle it. The two dwords within qwords that span 1680 // cache line boundaries will still be loaded and stored atomicly. 
1681 // 1682 address generate_conjoint_int_copy(bool aligned, address nooverlap_target, 1683 address *entry, const char *name, 1684 bool dest_uninitialized = false) { 1685 const bool not_oop = false; 1686 return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name); 1687 } 1688 1689 1690 // Arguments: 1691 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1692 // ignored 1693 // name - stub name string 1694 // 1695 // Inputs: 1696 // c_rarg0 - source array address 1697 // c_rarg1 - destination array address 1698 // c_rarg2 - element count, treated as size_t, can be zero 1699 // 1700 // Side Effects: 1701 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1702 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1703 // 1704 address generate_disjoint_long_copy(bool aligned, address *entry, 1705 const char *name, bool dest_uninitialized = false) { 1706 const bool not_oop = false; 1707 return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name); 1708 } 1709 1710 // Arguments: 1711 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1712 // ignored 1713 // name - stub name string 1714 // 1715 // Inputs: 1716 // c_rarg0 - source array address 1717 // c_rarg1 - destination array address 1718 // c_rarg2 - element count, treated as size_t, can be zero 1719 // 1720 address generate_conjoint_long_copy(bool aligned, 1721 address nooverlap_target, address *entry, 1722 const char *name, bool dest_uninitialized = false) { 1723 const bool not_oop = false; 1724 return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name); 1725 } 1726 1727 // Arguments: 1728 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1729 // ignored 1730 // name - stub name string 1731 // 1732 // Inputs: 1733 // c_rarg0 - source array address 1734 // c_rarg1 - destination array address 1735 // c_rarg2 - 
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_oop_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    // Element size depends on whether oops are compressed.
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_oop_copy(bool aligned,
                                     address nooverlap_target, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    // Element size depends on whether oops are compressed.
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
                                  name, dest_uninitialized);
  }


  // Helper for generating a dynamic type check.
  // Branches to L_success if sub_klass is a subtype of super_klass;
  // falls through on failure.  Smashes rscratch1.
  void generate_type_check(Register sub_klass,
                           Register super_check_offset,
                           Register super_klass,
                           Label& L_success) {
    assert_different_registers(sub_klass, super_check_offset, super_klass);

    BLOCK_COMMENT("type_check:");

    Label L_miss;

    __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
                                     super_check_offset);
    __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);

    // Fall through on failure!
    __ BIND(L_miss);
  }

  //
  // Generate checkcasting array copy stub
  //
  // Input:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //   c_rarg3   - size_t ckoff (super_check_offset)
  //   c_rarg4   - oop ckval (super_klass)
  //
  // Output:
  //   r0 ==  0  -  success
  //   r0 == -1^K - failure, where K is partial transfer count
  //
  address generate_checkcast_copy(const char *name, address *entry,
                                  bool dest_uninitialized = false) {

    Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;

    // Input registers (after setup_arg_regs)
    const Register from        = c_rarg0;   // source array address
    const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // elements count
    const Register ckoff       = c_rarg3;   // super_check_offset
    const Register ckval       = c_rarg4;   // super_klass

    RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
    RegSet wb_post_saved_regs = RegSet::of(count);

    // Registers used as temps (r18, r19, r20 are save-on-entry)
    const Register count_save  = r21;       // orig elements count
    const Register start_to    = r20;       // destination array start address
    const Register copied_oop  = r18;       // actual oop copied
    const Register r19_klass   = r19;       // oop._klass

    //---------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the two arrays are subtypes of Object[] but the
    // destination array type is not equal to or a supertype
    // of the source type.  Each element must be separately
    // checked.

    assert_different_registers(from, to, count, ckoff, ckval, start_to,
                               copied_oop, r19_klass, count_save);

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef ASSERT
    // caller guarantees that the arrays really are different
    // otherwise, we would have to make conjoint checks
    { Label L;
      array_overlap_test(L, TIMES_OOP);
      __ stop("checkcast_copy within a single array");
      __ bind(L);
    }
#endif //ASSERT

    // Caller of this entry point must set up the argument registers.
    if (entry != NULL) {
      *entry = __ pc();
      BLOCK_COMMENT("Entry:");
    }

    // Empty array:  Nothing to do.
    __ cbz(count, L_done);

    __ push(RegSet::of(r18, r19, r20, r21), sp);

#ifdef ASSERT
    BLOCK_COMMENT("assert consistent ckoff/ckval");
    // The ckoff and ckval must be mutually consistent,
    // even though caller generates both.
    { Label L;
      int sco_offset = in_bytes(Klass::super_check_offset_offset());
      __ ldrw(start_to, Address(ckval, sco_offset));
      __ cmpw(ckoff, start_to);
      __ br(Assembler::EQ, L);
      __ stop("super_check_offset inconsistent");
      __ bind(L);
    }
#endif //ASSERT

    gen_write_ref_array_pre_barrier(to, count, dest_uninitialized, wb_pre_saved_regs);

    // save the original count
    __ mov(count_save, count);

    // Copy from low to high addresses
    __ mov(start_to, to);              // Save destination array start address
    __ b(L_load_element);

    // ======== begin loop ========
    // (Loop is rotated; its entry is L_load_element.)
    // Loop control:
    //   for (; count != 0; count--) {
    //     copied_oop = load_heap_oop(from++);
    //     ... generate_type_check ...;
    //     store_heap_oop(to++, copied_oop);
    //   }
    __ align(OptoLoopAlignment);

    __ BIND(L_store_element);
    __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop);  // store the oop
    __ sub(count, count, 1);
    __ cbz(count, L_do_card_marks);

    // ======== loop entry is here ========
    __ BIND(L_load_element);
    __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop
    __ cbz(copied_oop, L_store_element);  // null oops need no type check

    __ load_klass(r19_klass, copied_oop);// query the object klass
    generate_type_check(r19_klass, ckoff, ckval, L_store_element);
    // ======== end loop ========

    // It was a real error; we must depend on the caller to finish the job.
    // Register count = remaining oops, count_orig = total oops.
    // Emit GC store barriers for the oops we have copied and report
    // their number to the caller.

    __ subs(count, count_save, count);     // K = partially copied oop count
    __ eon(count, count, zr);                   // report (-1^K) to caller
    __ br(Assembler::EQ, L_done_pop);

    __ BIND(L_do_card_marks);
    __ add(to, to, -heapOopSize);         // make an inclusive end pointer
    gen_write_ref_array_post_barrier(start_to, to, rscratch1, wb_post_saved_regs);

    __ bind(L_done_pop);
    __ pop(RegSet::of(r18, r19, r20, r21), sp);
    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);

    __ bind(L_done);
    __ mov(r0, count);
    __ leave();
    __ ret(lr);

    return start;
  }

  // Perform range checks on the proposed arraycopy.
  // Kills temp, but nothing else.
  // Also, clean the sign bits of src_pos and dst_pos.
  void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
                              Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
                              Register dst_pos, // destination position (c_rarg3)
                              Register length,
                              Register temp,
                              Label& L_failed) {
    BLOCK_COMMENT("arraycopy_range_checks:");

    assert_different_registers(rscratch1, temp);

    //  if (src_pos + length > arrayOop(src)->length())  FAIL;
    __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, src_pos);
    __ cmpw(temp, rscratch1);
    // HI is an unsigned compare, so it also catches 32-bit wrap-around of
    // src_pos + length.
    __ br(Assembler::HI, L_failed);

    //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
    __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, dst_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
    // A 32-bit mov writes the W register and zero-extends into the upper
    // half of the X register, clearing bits 63:32.
    __ movw(src_pos, src_pos);
    __ movw(dst_pos, dst_pos);

    BLOCK_COMMENT("arraycopy_range_checks done");
  }

  // These stubs get called from some dumb test routine.
  // I'll write them properly when they're called from
  // something that's actually doing something.
  static void fake_arraycopy_stub(address src, address dst, int count) {
    assert(count == 0, "huh?");
  }


  //
  //  Generate 'unsafe' array copy stub
  //  Though just as safe as the other stubs, it takes an unscaled
  //  size_t argument instead of an element count.
  //
  //  Input:
  //    c_rarg0   - source array address
  //    c_rarg1   - destination array address
  //    c_rarg2   - byte count, treated as ssize_t, can be zero
  //
  // Examines the alignment of the operands and dispatches
  // to a long, int, short, or byte copy loop.
  //
  address generate_unsafe_copy(const char *name,
                               address byte_copy_entry,
                               address short_copy_entry,
                               address int_copy_entry,
                               address long_copy_entry) {
    Label L_long_aligned, L_int_aligned, L_short_aligned;
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // bump this on entry, not on exit:
    inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);

    // OR the two addresses and the byte count together: any misalignment
    // in any of the three shows up as a set low bit in the result.
    __ orr(rscratch1, s, d);
    __ orr(rscratch1, rscratch1, count);

    __ andr(rscratch1, rscratch1, BytesPerLong-1);
    __ cbz(rscratch1, L_long_aligned);
    __ andr(rscratch1, rscratch1, BytesPerInt-1);
    __ cbz(rscratch1, L_int_aligned);
    __ tbz(rscratch1, 0, L_short_aligned);
    __ b(RuntimeAddress(byte_copy_entry));

    __ BIND(L_short_aligned);
    __ lsr(count, count, LogBytesPerShort);  // size => short_count
    __ b(RuntimeAddress(short_copy_entry));
    __ BIND(L_int_aligned);
    __ lsr(count, count, LogBytesPerInt);    // size => int_count
    __ b(RuntimeAddress(int_copy_entry));
    __ BIND(L_long_aligned);
    __ lsr(count, count, LogBytesPerLong);   // size => long_count
    __ b(RuntimeAddress(long_copy_entry));

    return start;
  }

  //
  //  Generate generic array copy stubs
  //
  //  Input:
  //    c_rarg0    -  src oop
  //    c_rarg1    -  src_pos (32-bits)
  //    c_rarg2    -  dst oop
  //    c_rarg3    -  dst_pos (32-bits)
  //    c_rarg4    -  element count (32-bits)
  //
  //  Output:
  //    r0 ==  0  -  success
  //    r0 == -1^K - failure, where K is partial transfer count
  //
  address generate_generic_copy(const char *name,
                                address byte_copy_entry, address short_copy_entry,
                                address int_copy_entry, address oop_copy_entry,
                                address long_copy_entry, address checkcast_copy_entry) {

    Label L_failed, L_failed_0, L_objArray;
    Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;

    // Input registers
    const Register src        = c_rarg0;  // source array oop
    const Register src_pos    = c_rarg1;  // source position
    const Register dst        = c_rarg2;  // destination array oop
    const Register dst_pos    = c_rarg3;  // destination position
    const Register length     = c_rarg4;

    StubCodeMark mark(this, "StubRoutines", name);

    __ align(CodeEntryAlignment);
    address start = __ pc();

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // bump this on entry, not on exit:
    inc_counter_np(SharedRuntime::_generic_array_copy_ctr);

    //-----------------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the following conditions are met:
    //
    // (1) src and dst must not be null.
    // (2) src_pos must not be negative.
    // (3) dst_pos must not be negative.
    // (4) length  must not be negative.
    // (5) src klass and dst klass should be the same and not NULL.
    // (6) src and dst should be arrays.
    // (7) src_pos + length must not exceed length of src.
    // (8) dst_pos + length must not exceed length of dst.
    //

    //  if (src == NULL) return -1;
    __ cbz(src, L_failed);

    //  if (src_pos < 0) return -1;
    __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set

    //  if (dst == NULL) return -1;
    __ cbz(dst, L_failed);

    //  if (dst_pos < 0) return -1;
    __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set

    // registers used as temp
    const Register scratch_length    = r16; // elements count to copy
    const Register scratch_src_klass = r17; // array klass
    const Register lh                = r18; // layout helper

    //  if (length < 0) return -1;
    __ movw(scratch_length, length);        // length (elements count, 32-bits value)
    __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set

    __ load_klass(scratch_src_klass, src);
#ifdef ASSERT
    //  assert(src->klass() != NULL);
    {
      BLOCK_COMMENT("assert klasses not null {");
      Label L1, L2;
      __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
      __ bind(L1);
      __ stop("broken null klass");
      __ bind(L2);
      __ load_klass(rscratch1, dst);
      __ cbz(rscratch1, L1);     // this would be broken also
      BLOCK_COMMENT("} assert klasses not null done");
    }
#endif

    // Load layout helper (32-bits)
    //
    //  |array_tag|     | header_size | element_type |     |log2_element_size|
    // 32        30    24            16              8     2                 0
    //
    //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
    //

    const int lh_offset = in_bytes(Klass::layout_helper_offset());

    // Handle objArrays completely differently...
    const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
    __ ldrw(lh, Address(scratch_src_klass, lh_offset));
    __ movw(rscratch1, objArray_lh);
    // XOR is zero iff the layout helpers are identical.
    __ eorw(rscratch2, lh, rscratch1);
    __ cbzw(rscratch2, L_objArray);

    //  if (src->klass() != dst->klass()) return -1;
    __ load_klass(rscratch2, dst);
    __ eor(rscratch2, rscratch2, scratch_src_klass);
    __ cbnz(rscratch2, L_failed);

    //  if (!src->is_Array()) return -1;
    // Array layout helpers have the sign bit set (array_tag in the top bits).
    __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)

    // At this point, it is known to be a typeArray (array_tag 0x3).
#ifdef ASSERT
    {
      BLOCK_COMMENT("assert primitive array {");
      Label L;
      __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
      __ cmpw(lh, rscratch2);
      __ br(Assembler::GE, L);
      __ stop("must be a primitive array");
      __ bind(L);
      BLOCK_COMMENT("} assert primitive array done");
    }
#endif

    arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
                           rscratch2, L_failed);

    // TypeArrayKlass
    //
    // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
    // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
    //

    const Register rscratch1_offset = rscratch1;    // array offset
    // Note: r18_elsize aliases lh (r18); the header-size field has already
    // been consumed, only the log2 element size bits are inspected below.
    const Register r18_elsize = lh; // element size

    __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
           exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
    __ add(src, src, rscratch1_offset);           // src array offset
    __ add(dst, dst, rscratch1_offset);           // dst array offset
    BLOCK_COMMENT("choose copy loop based on element size");

    // next registers should be set before the jump to corresponding stub
    const Register from     = c_rarg0;  // source array address
    const Register to       = c_rarg1;  // destination array address
    const Register count    = c_rarg2;  // elements count

    // 'from', 'to', 'count' registers should be set in such order
    // since they are the same as 'src', 'src_pos', 'dst'.

    assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");

    // The possible values of elsize are 0-3, i.e. exact_log2(element
    // size in bytes).  We do a simple bitwise binary search.
  __ BIND(L_copy_bytes);
    __ tbnz(r18_elsize, 1, L_copy_ints);
    __ tbnz(r18_elsize, 0, L_copy_shorts);
    __ lea(from, Address(src, src_pos));// src_addr
    __ lea(to,   Address(dst, dst_pos));// dst_addr
    __ movw(count, scratch_length); // length
    __ b(RuntimeAddress(byte_copy_entry));

  __ BIND(L_copy_shorts);
    __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
    __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
    __ movw(count, scratch_length); // length
    __ b(RuntimeAddress(short_copy_entry));

  __ BIND(L_copy_ints);
    __ tbnz(r18_elsize, 0, L_copy_longs);
    __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
    __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
    __ movw(count, scratch_length); // length
    __ b(RuntimeAddress(int_copy_entry));

  __ BIND(L_copy_longs);
#ifdef ASSERT
    {
      BLOCK_COMMENT("assert long copy {");
      Label L;
      __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
      __ cmpw(r18_elsize, LogBytesPerLong);
      __ br(Assembler::EQ, L);
      __ stop("must be long copy, but elsize is wrong");
      __ bind(L);
      BLOCK_COMMENT("} assert long copy done");
    }
#endif
    __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
    __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
    __ movw(count, scratch_length); // length
    __ b(RuntimeAddress(long_copy_entry));

    // ObjArrayKlass
  __ BIND(L_objArray);
    // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]

    Label L_plain_copy, L_checkcast_copy;
    // test array classes for subtyping
    __ load_klass(r18, dst);
    __ cmp(scratch_src_klass, r18); // usual case is exact equality
    __ br(Assembler::NE, L_checkcast_copy);

    // Identically typed arrays can be copied without element-wise checks.
    arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
                           rscratch2, L_failed);

    __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
    __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
    __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
    __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
    __ movw(count, scratch_length); // length
  __ BIND(L_plain_copy);
    __ b(RuntimeAddress(oop_copy_entry));

  __ BIND(L_checkcast_copy);
    // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
    {
      // Before looking at dst.length, make sure dst is also an objArray.
      __ ldrw(rscratch1, Address(r18, lh_offset));
      __ movw(rscratch2, objArray_lh);
      __ eorw(rscratch1, rscratch1, rscratch2);
      __ cbnzw(rscratch1, L_failed);

      // It is safe to examine both src.length and dst.length.
      arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
                             r18, L_failed);

      const Register rscratch2_dst_klass = rscratch2;
      __ load_klass(rscratch2_dst_klass, dst); // reload

      // Marshal the base address arguments now, freeing registers.
      __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
      __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
      __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
      __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
      __ movw(count, length);           // length (reloaded)
      Register sco_temp = c_rarg3;      // this register is free now
      assert_different_registers(from, to, count, sco_temp,
                                 rscratch2_dst_klass, scratch_src_klass);
      // assert_clean_int(count, sco_temp);

      // Generate the type check.
      const int sco_offset = in_bytes(Klass::super_check_offset_offset());
      __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
      // assert_clean_int(sco_temp, r18);
      generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy);

      // Fetch destination element klass from the ObjArrayKlass header.
      int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
      __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset));
      __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));

      // the checkcast_copy loop needs two extra arguments:
      assert(c_rarg3 == sco_temp, "#3 already in place");
      // Set up arguments for checkcast_copy_entry.
      __ mov(c_rarg4, rscratch2_dst_klass);  // dst.klass.element_klass
      __ b(RuntimeAddress(checkcast_copy_entry));
    }

  __ BIND(L_failed);
    __ mov(r0, -1);
    __ leave();   // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }

  //
  //  Generate stub for array fill.  If "aligned" is true, the
  //  "to" address is assumed to be heapword aligned.
2301 // 2302 // Arguments for generated stub: 2303 // to: c_rarg0 2304 // value: c_rarg1 2305 // count: c_rarg2 treated as signed 2306 // 2307 address generate_fill(BasicType t, bool aligned, const char *name) { 2308 __ align(CodeEntryAlignment); 2309 StubCodeMark mark(this, "StubRoutines", name); 2310 address start = __ pc(); 2311 2312 BLOCK_COMMENT("Entry:"); 2313 2314 const Register to = c_rarg0; // source array address 2315 const Register value = c_rarg1; // value 2316 const Register count = c_rarg2; // elements count 2317 2318 const Register bz_base = r10; // base for block_zero routine 2319 const Register cnt_words = r11; // temp register 2320 2321 __ enter(); 2322 2323 Label L_fill_elements, L_exit1; 2324 2325 int shift = -1; 2326 switch (t) { 2327 case T_BYTE: 2328 shift = 0; 2329 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2330 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2331 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2332 __ br(Assembler::LO, L_fill_elements); 2333 break; 2334 case T_SHORT: 2335 shift = 1; 2336 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2337 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2338 __ br(Assembler::LO, L_fill_elements); 2339 break; 2340 case T_INT: 2341 shift = 2; 2342 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2343 __ br(Assembler::LO, L_fill_elements); 2344 break; 2345 default: ShouldNotReachHere(); 2346 } 2347 2348 // Align source address at 8 bytes address boundary. 2349 Label L_skip_align1, L_skip_align2, L_skip_align4; 2350 if (!aligned) { 2351 switch (t) { 2352 case T_BYTE: 2353 // One byte misalignment happens only for byte arrays. 2354 __ tbz(to, 0, L_skip_align1); 2355 __ strb(value, Address(__ post(to, 1))); 2356 __ subw(count, count, 1); 2357 __ bind(L_skip_align1); 2358 // Fallthrough 2359 case T_SHORT: 2360 // Two bytes misalignment happens only for byte and short (char) arrays. 
2361 __ tbz(to, 1, L_skip_align2); 2362 __ strh(value, Address(__ post(to, 2))); 2363 __ subw(count, count, 2 >> shift); 2364 __ bind(L_skip_align2); 2365 // Fallthrough 2366 case T_INT: 2367 // Align to 8 bytes, we know we are 4 byte aligned to start. 2368 __ tbz(to, 2, L_skip_align4); 2369 __ strw(value, Address(__ post(to, 4))); 2370 __ subw(count, count, 4 >> shift); 2371 __ bind(L_skip_align4); 2372 break; 2373 default: ShouldNotReachHere(); 2374 } 2375 } 2376 2377 // 2378 // Fill large chunks 2379 // 2380 __ lsrw(cnt_words, count, 3 - shift); // number of words 2381 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2382 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2383 if (UseBlockZeroing) { 2384 Label non_block_zeroing, rest; 2385 // If the fill value is zero we can use the fast zero_words(). 2386 __ cbnz(value, non_block_zeroing); 2387 __ mov(bz_base, to); 2388 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2389 __ zero_words(bz_base, cnt_words); 2390 __ b(rest); 2391 __ bind(non_block_zeroing); 2392 __ fill_words(to, cnt_words, value); 2393 __ bind(rest); 2394 } else { 2395 __ fill_words(to, cnt_words, value); 2396 } 2397 2398 // Remaining count is less than 8 bytes. Fill it by a single store. 2399 // Note that the total length is no less than 8 bytes. 2400 if (t == T_BYTE || t == T_SHORT) { 2401 Label L_exit1; 2402 __ cbzw(count, L_exit1); 2403 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2404 __ str(value, Address(to, -8)); // overwrite some elements 2405 __ bind(L_exit1); 2406 __ leave(); 2407 __ ret(lr); 2408 } 2409 2410 // Handle copies less than 8 bytes. 
2411 Label L_fill_2, L_fill_4, L_exit2; 2412 __ bind(L_fill_elements); 2413 switch (t) { 2414 case T_BYTE: 2415 __ tbz(count, 0, L_fill_2); 2416 __ strb(value, Address(__ post(to, 1))); 2417 __ bind(L_fill_2); 2418 __ tbz(count, 1, L_fill_4); 2419 __ strh(value, Address(__ post(to, 2))); 2420 __ bind(L_fill_4); 2421 __ tbz(count, 2, L_exit2); 2422 __ strw(value, Address(to)); 2423 break; 2424 case T_SHORT: 2425 __ tbz(count, 0, L_fill_4); 2426 __ strh(value, Address(__ post(to, 2))); 2427 __ bind(L_fill_4); 2428 __ tbz(count, 1, L_exit2); 2429 __ strw(value, Address(to)); 2430 break; 2431 case T_INT: 2432 __ cbzw(count, L_exit2); 2433 __ strw(value, Address(to)); 2434 break; 2435 default: ShouldNotReachHere(); 2436 } 2437 __ bind(L_exit2); 2438 __ leave(); 2439 __ ret(lr); 2440 return start; 2441 } 2442 2443 void generate_arraycopy_stubs() { 2444 address entry; 2445 address entry_jbyte_arraycopy; 2446 address entry_jshort_arraycopy; 2447 address entry_jint_arraycopy; 2448 address entry_oop_arraycopy; 2449 address entry_jlong_arraycopy; 2450 address entry_checkcast_arraycopy; 2451 2452 generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards); 2453 generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards); 2454 2455 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2456 2457 //*** jbyte 2458 // Always need aligned and unaligned versions 2459 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2460 "jbyte_disjoint_arraycopy"); 2461 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2462 &entry_jbyte_arraycopy, 2463 "jbyte_arraycopy"); 2464 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2465 "arrayof_jbyte_disjoint_arraycopy"); 2466 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL, 2467 "arrayof_jbyte_arraycopy"); 2468 2469 //*** jshort 2470 // Always need aligned and unaligned versions 2471 
    StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
                                                                                    "jshort_disjoint_arraycopy");
    StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
                                                                                    &entry_jshort_arraycopy,
                                                                                    "jshort_arraycopy");
    StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
                                                                                    "arrayof_jshort_disjoint_arraycopy");
    StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
                                                                                    "arrayof_jshort_arraycopy");

    //*** jint
    // Aligned versions
    StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
                                                                                "arrayof_jint_disjoint_arraycopy");
    StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
                                                                                "arrayof_jint_arraycopy");
    // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
    // entry_jint_arraycopy always points to the unaligned version
    StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
                                                                                "jint_disjoint_arraycopy");
    StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
                                                                                &entry_jint_arraycopy,
                                                                                "jint_arraycopy");

    //*** jlong
    // It is always aligned
    StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
                                                                                  "arrayof_jlong_disjoint_arraycopy");
    StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
                                                                                  "arrayof_jlong_arraycopy");
    StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
    StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;

    //*** oops
    {
      // With compressed oops we need unaligned versions; notice that
      // we overwrite entry_oop_arraycopy.
      bool aligned = !UseCompressedOops;

      StubRoutines::_arrayof_oop_disjoint_arraycopy
        = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
                                     /*dest_uninitialized*/false);
      StubRoutines::_arrayof_oop_arraycopy
        = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
                                     /*dest_uninitialized*/false);
      // Aligned versions without pre-barriers
      StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
        = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
                                     /*dest_uninitialized*/true);
      StubRoutines::_arrayof_oop_arraycopy_uninit
        = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
                                     /*dest_uninitialized*/true);
    }

    StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
    StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
    StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
    StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;

    StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
    StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
                                                                        /*dest_uninitialized*/true);

    StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
                                                              entry_jbyte_arraycopy,
                                                              entry_jshort_arraycopy,
                                                              entry_jint_arraycopy,
                                                              entry_jlong_arraycopy);

    StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
                                                               entry_jbyte_arraycopy,
                                                               entry_jshort_arraycopy,
                                                               entry_jint_arraycopy,
                                                               entry_oop_arraycopy,
                                                               entry_jlong_arraycopy,
                                                               entry_checkcast_arraycopy);

    StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
    StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
    StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
    StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
    StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
    StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
  }

  void generate_math_stubs() { Unimplemented(); }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //
  // Encrypts a single 16-byte block with the expanded key at c_rarg2,
  // using the AESE/AESMC instructions.
  address generate_aescrypt_encryptBlock() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");

    Label L_doLast;

    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register keylen      = rscratch1;

    address start = __ pc();
    __ enter();

    // Expanded-key length in ints — 44/52/60 correspond to the standard
    // AES-128/192/256 key schedules; compared against 44 and 52 below to
    // select the round count.
    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ ld1(v0, __ T16B, from); // get 16 bytes of input

    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);
    __ rev32(v3, __ T16B, v3);
    __ rev32(v4, __ T16B, v4);
    __ aese(v0, v1);
    __ aesmc(v0, v0);
    __ aese(v0, v2);
    __ aesmc(v0, v0);
    __ aese(v0, v3);
    __ aesmc(v0, v0);
    __ aese(v0, v4);
    __ aesmc(v0, v0);

    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);
    __ rev32(v3, __ T16B, v3);
    __ rev32(v4, __ T16B, v4);
    __ aese(v0, v1);
    __ aesmc(v0, v0);
    __ aese(v0, v2);
    __ aesmc(v0, v0);
    __ aese(v0, v3);
    __ aesmc(v0,
             v0);
    __ aese(v0, v4);
    __ aesmc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    // keylen == 44 (AES-128): these are the last two rounds.
    __ cmpw(keylen, 44);
    __ br(Assembler::EQ, L_doLast);

    __ aese(v0, v1);
    __ aesmc(v0, v0);
    __ aese(v0, v2);
    __ aesmc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    // keylen == 52 (AES-192): stop after these two extra rounds.
    __ cmpw(keylen, 52);
    __ br(Assembler::EQ, L_doLast);

    __ aese(v0, v1);
    __ aesmc(v0, v0);
    __ aese(v0, v2);
    __ aesmc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    __ BIND(L_doLast);

    // Final round: no MixColumns, then XOR with the last round key.
    __ aese(v0, v1);
    __ aesmc(v0, v0);
    __ aese(v0, v2);

    __ ld1(v1, __ T16B, key);
    __ rev32(v1, __ T16B, v1);
    __ eor(v0, __ T16B, v0, v1);

    __ st1(v0, __ T16B, to);

    __ mov(r0, 0);

    __ leave();
    __ ret(lr);

    return start;
  }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //
  // Decrypts a single 16-byte block with the expanded key at c_rarg2,
  // using the AESD/AESIMC instructions.
  address generate_aescrypt_decryptBlock() {
    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
    Label L_doLast;

    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register keylen      = rscratch1;

    address start = __ pc();
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // Expanded-key length in ints (44/52/60); selects the round count below.
    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ ld1(v0, __ T16B,
           from); // get 16 bytes of input

    // v5 holds the first round key, applied last on the decrypt path.
    __ ld1(v5, __ T16B, __ post(key, 16));
    __ rev32(v5, __ T16B, v5);

    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);
    __ rev32(v3, __ T16B, v3);
    __ rev32(v4, __ T16B, v4);
    __ aesd(v0, v1);
    __ aesimc(v0, v0);
    __ aesd(v0, v2);
    __ aesimc(v0, v0);
    __ aesd(v0, v3);
    __ aesimc(v0, v0);
    __ aesd(v0, v4);
    __ aesimc(v0, v0);

    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);
    __ rev32(v3, __ T16B, v3);
    __ rev32(v4, __ T16B, v4);
    __ aesd(v0, v1);
    __ aesimc(v0, v0);
    __ aesd(v0, v2);
    __ aesimc(v0, v0);
    __ aesd(v0, v3);
    __ aesimc(v0, v0);
    __ aesd(v0, v4);
    __ aesimc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    // keylen == 44 (AES-128): these are the last two rounds.
    __ cmpw(keylen, 44);
    __ br(Assembler::EQ, L_doLast);

    __ aesd(v0, v1);
    __ aesimc(v0, v0);
    __ aesd(v0, v2);
    __ aesimc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    // keylen == 52 (AES-192): stop after these two extra rounds.
    __ cmpw(keylen, 52);
    __ br(Assembler::EQ, L_doLast);

    __ aesd(v0, v1);
    __ aesimc(v0, v0);
    __ aesd(v0, v2);
    __ aesimc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    __ BIND(L_doLast);

    // Final round, then XOR with the first round key saved in v5.
    __ aesd(v0, v1);
    __ aesimc(v0, v0);
    __ aesd(v0, v2);

    __ eor(v0, __ T16B, v0, v5);

    __ st1(v0, __ T16B, to);

    __ mov(r0, 0);

    __ leave();
    __ ret(lr);

    return start;
  }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int
  //               array
  //   c_rarg3   - r vector byte array address
  //   c_rarg4   - input length
  //
  // Output:
  //   x0        - input length
  //
  // CBC-mode encryption: each 16-byte block is XORed with the previous
  // ciphertext block (initially the r vector) before being encrypted; the
  // final ciphertext block is written back to the r vector.
  address generate_cipherBlockChaining_encryptAESCrypt() {
    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");

    Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;

    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
                                           // and left with the results of the last encryption block
    const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
    const Register keylen      = rscratch1;

    address start = __ pc();

    __ enter();

    // Preserve the byte count for the return value; len_reg is consumed
    // by the loop below.
    __ movw(rscratch2, len_reg);

    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ ld1(v0, __ T16B, rvec);

    // All round keys are pre-loaded into v17..v31; how many depends on the
    // key size (keylen 44/52/60 ints).  NOTE: the condition flags set here
    // are also consumed by the CC/EQ branches inside L_aes_loop — no
    // instruction in the loop body modifies NZCV.
    __ cmpw(keylen, 52);
    __ br(Assembler::CC, L_loadkeys_44);
    __ br(Assembler::EQ, L_loadkeys_52);

    __ ld1(v17, v18, __ T16B, __ post(key, 32));
    __ rev32(v17, __ T16B, v17);
    __ rev32(v18, __ T16B, v18);
    __ BIND(L_loadkeys_52);
    __ ld1(v19, v20, __ T16B, __ post(key, 32));
    __ rev32(v19, __ T16B, v19);
    __ rev32(v20, __ T16B, v20);
    __ BIND(L_loadkeys_44);
    __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
    __ rev32(v21, __ T16B, v21);
    __ rev32(v22, __ T16B, v22);
    __ rev32(v23, __ T16B, v23);
    __ rev32(v24, __ T16B, v24);
    __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
    __ rev32(v25, __ T16B, v25);
    __ rev32(v26, __ T16B, v26);
    __ rev32(v27, __ T16B, v27);
    __ rev32(v28, __ T16B, v28);
    __ ld1(v29, v30, v31, __ T16B, key);
    __ rev32(v29, __ T16B, v29);
    __ rev32(v30, __ T16B, v30);
    __ rev32(v31, __ T16B, v31);

    __ BIND(L_aes_loop);
    __ ld1(v1, __ T16B, __ post(from, 16));
    // CBC chaining: XOR plaintext with previous ciphertext (or IV).
    __ eor(v0, __ T16B, v0, v1);

    // Flags still hold the result of cmpw(keylen, 52) from above.
    __ br(Assembler::CC, L_rounds_44);
    __ br(Assembler::EQ, L_rounds_52);

    __ aese(v0, v17); __ aesmc(v0, v0);
    __ aese(v0, v18); __ aesmc(v0, v0);
    __ BIND(L_rounds_52);
    __ aese(v0, v19); __ aesmc(v0, v0);
    __ aese(v0, v20); __ aesmc(v0, v0);
    __ BIND(L_rounds_44);
    __ aese(v0, v21); __ aesmc(v0, v0);
    __ aese(v0, v22); __ aesmc(v0, v0);
    __ aese(v0, v23); __ aesmc(v0, v0);
    __ aese(v0, v24); __ aesmc(v0, v0);
    __ aese(v0, v25); __ aesmc(v0, v0);
    __ aese(v0, v26); __ aesmc(v0, v0);
    __ aese(v0, v27); __ aesmc(v0, v0);
    __ aese(v0, v28); __ aesmc(v0, v0);
    __ aese(v0, v29); __ aesmc(v0, v0);
    __ aese(v0, v30);
    __ eor(v0, __ T16B, v0, v31);

    __ st1(v0, __ T16B, __ post(to, 16));

    __ subw(len_reg, len_reg, 16);
    __ cbnzw(len_reg, L_aes_loop);

    // Save the last ciphertext block back into the r vector for chaining.
    __ st1(v0, __ T16B, rvec);

    __ mov(r0, rscratch2);

    __ leave();
    __ ret(lr);

    return start;
  }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //   c_rarg3   - r vector byte array address
  //   c_rarg4   - input length
  //
  // Output:
  //   r0        - input length
  //
  address generate_cipherBlockChaining_decryptAESCrypt() {
    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");

    Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;

    const Register from        = c_rarg0;  // source array address
2885 const Register to = c_rarg1; // destination array address 2886 const Register key = c_rarg2; // key array address 2887 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2888 // and left with the results of the last encryption block 2889 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2890 const Register keylen = rscratch1; 2891 2892 address start = __ pc(); 2893 2894 __ enter(); 2895 2896 __ movw(rscratch2, len_reg); 2897 2898 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2899 2900 __ ld1(v2, __ T16B, rvec); 2901 2902 __ ld1(v31, __ T16B, __ post(key, 16)); 2903 __ rev32(v31, __ T16B, v31); 2904 2905 __ cmpw(keylen, 52); 2906 __ br(Assembler::CC, L_loadkeys_44); 2907 __ br(Assembler::EQ, L_loadkeys_52); 2908 2909 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2910 __ rev32(v17, __ T16B, v17); 2911 __ rev32(v18, __ T16B, v18); 2912 __ BIND(L_loadkeys_52); 2913 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2914 __ rev32(v19, __ T16B, v19); 2915 __ rev32(v20, __ T16B, v20); 2916 __ BIND(L_loadkeys_44); 2917 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2918 __ rev32(v21, __ T16B, v21); 2919 __ rev32(v22, __ T16B, v22); 2920 __ rev32(v23, __ T16B, v23); 2921 __ rev32(v24, __ T16B, v24); 2922 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2923 __ rev32(v25, __ T16B, v25); 2924 __ rev32(v26, __ T16B, v26); 2925 __ rev32(v27, __ T16B, v27); 2926 __ rev32(v28, __ T16B, v28); 2927 __ ld1(v29, v30, __ T16B, key); 2928 __ rev32(v29, __ T16B, v29); 2929 __ rev32(v30, __ T16B, v30); 2930 2931 __ BIND(L_aes_loop); 2932 __ ld1(v0, __ T16B, __ post(from, 16)); 2933 __ orr(v1, __ T16B, v0, v0); 2934 2935 __ br(Assembler::CC, L_rounds_44); 2936 __ br(Assembler::EQ, L_rounds_52); 2937 2938 __ aesd(v0, v17); __ aesimc(v0, v0); 2939 __ aesd(v0, v18); __ aesimc(v0, v0); 2940 __ BIND(L_rounds_52); 2941 __ aesd(v0, v19); __ 
aesimc(v0, v0); 2942 __ aesd(v0, v20); __ aesimc(v0, v0); 2943 __ BIND(L_rounds_44); 2944 __ aesd(v0, v21); __ aesimc(v0, v0); 2945 __ aesd(v0, v22); __ aesimc(v0, v0); 2946 __ aesd(v0, v23); __ aesimc(v0, v0); 2947 __ aesd(v0, v24); __ aesimc(v0, v0); 2948 __ aesd(v0, v25); __ aesimc(v0, v0); 2949 __ aesd(v0, v26); __ aesimc(v0, v0); 2950 __ aesd(v0, v27); __ aesimc(v0, v0); 2951 __ aesd(v0, v28); __ aesimc(v0, v0); 2952 __ aesd(v0, v29); __ aesimc(v0, v0); 2953 __ aesd(v0, v30); 2954 __ eor(v0, __ T16B, v0, v31); 2955 __ eor(v0, __ T16B, v0, v2); 2956 2957 __ st1(v0, __ T16B, __ post(to, 16)); 2958 __ orr(v2, __ T16B, v1, v1); 2959 2960 __ subw(len_reg, len_reg, 16); 2961 __ cbnzw(len_reg, L_aes_loop); 2962 2963 __ st1(v2, __ T16B, rvec); 2964 2965 __ mov(r0, rscratch2); 2966 2967 __ leave(); 2968 __ ret(lr); 2969 2970 return start; 2971 } 2972 2973 // Arguments: 2974 // 2975 // Inputs: 2976 // c_rarg0 - byte[] source+offset 2977 // c_rarg1 - int[] SHA.state 2978 // c_rarg2 - int offset 2979 // c_rarg3 - int limit 2980 // 2981 address generate_sha1_implCompress(bool multi_block, const char *name) { 2982 __ align(CodeEntryAlignment); 2983 StubCodeMark mark(this, "StubRoutines", name); 2984 address start = __ pc(); 2985 2986 Register buf = c_rarg0; 2987 Register state = c_rarg1; 2988 Register ofs = c_rarg2; 2989 Register limit = c_rarg3; 2990 2991 Label keys; 2992 Label sha1_loop; 2993 2994 // load the keys into v0..v3 2995 __ adr(rscratch1, keys); 2996 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 2997 // load 5 words state into v6, v7 2998 __ ldrq(v6, Address(state, 0)); 2999 __ ldrs(v7, Address(state, 16)); 3000 3001 3002 __ BIND(sha1_loop); 3003 // load 64 bytes of data into v16..v19 3004 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 3005 __ rev32(v16, __ T16B, v16); 3006 __ rev32(v17, __ T16B, v17); 3007 __ rev32(v18, __ T16B, v18); 3008 __ rev32(v19, __ T16B, v19); 3009 3010 // do the sha1 3011 __ addv(v4, __ T4S, v16, v0); 3012 __ orr(v20, __ T16B, v6, v6); 3013 3014 FloatRegister d0 = v16; 3015 FloatRegister d1 = v17; 3016 FloatRegister d2 = v18; 3017 FloatRegister d3 = v19; 3018 3019 for (int round = 0; round < 20; round++) { 3020 FloatRegister tmp1 = (round & 1) ? v4 : v5; 3021 FloatRegister tmp2 = (round & 1) ? v21 : v22; 3022 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 3023 FloatRegister tmp4 = (round & 1) ? v5 : v4; 3024 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3)); 3025 3026 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 3027 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 3028 __ sha1h(tmp2, __ T4S, v20); 3029 if (round < 5) 3030 __ sha1c(v20, __ T4S, tmp3, tmp4); 3031 else if (round < 10 || round >= 15) 3032 __ sha1p(v20, __ T4S, tmp3, tmp4); 3033 else 3034 __ sha1m(v20, __ T4S, tmp3, tmp4); 3035 if (round < 16) __ sha1su1(d0, __ T4S, d3); 3036 3037 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3038 } 3039 3040 __ addv(v7, __ T2S, v7, v21); 3041 __ addv(v6, __ T4S, v6, v20); 3042 3043 if (multi_block) { 3044 __ add(ofs, ofs, 64); 3045 __ cmp(ofs, limit); 3046 __ br(Assembler::LE, sha1_loop); 3047 __ mov(c_rarg0, ofs); // return ofs 3048 } 3049 3050 __ strq(v6, Address(state, 0)); 3051 __ strs(v7, Address(state, 16)); 3052 3053 __ ret(lr); 3054 3055 __ bind(keys); 3056 __ emit_int32(0x5a827999); 3057 __ emit_int32(0x6ed9eba1); 3058 __ emit_int32(0x8f1bbcdc); 3059 __ emit_int32(0xca62c1d6); 3060 3061 return start; 3062 } 3063 3064 3065 // Arguments: 3066 // 3067 // Inputs: 3068 // c_rarg0 - byte[] source+offset 3069 // c_rarg1 - int[] SHA.state 3070 // c_rarg2 - int offset 3071 // c_rarg3 - int limit 3072 // 3073 address generate_sha256_implCompress(bool multi_block, const char *name) { 3074 static 
const uint32_t round_consts[64] = { 3075 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3076 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3077 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3078 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3079 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3080 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3081 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3082 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3083 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3084 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3085 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3086 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3087 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3088 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3089 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3090 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3091 }; 3092 __ align(CodeEntryAlignment); 3093 StubCodeMark mark(this, "StubRoutines", name); 3094 address start = __ pc(); 3095 3096 Register buf = c_rarg0; 3097 Register state = c_rarg1; 3098 Register ofs = c_rarg2; 3099 Register limit = c_rarg3; 3100 3101 Label sha1_loop; 3102 3103 __ stpd(v8, v9, __ pre(sp, -32)); 3104 __ stpd(v10, v11, Address(sp, 16)); 3105 3106 // dga == v0 3107 // dgb == v1 3108 // dg0 == v2 3109 // dg1 == v3 3110 // dg2 == v4 3111 // t0 == v6 3112 // t1 == v7 3113 3114 // load 16 keys to v16..v31 3115 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3116 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3117 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3118 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3119 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3120 3121 // load 8 words (256 bits) state 3122 __ ldpq(v0, v1, state); 3123 3124 __ BIND(sha1_loop); 3125 // load 64 bytes of data into v8..v11 3126 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 3127 __ rev32(v8, __ T16B, v8); 3128 __ rev32(v9, __ T16B, v9); 3129 __ rev32(v10, __ T16B, v10); 3130 __ rev32(v11, __ T16B, v11); 3131 3132 __ addv(v6, __ T4S, v8, v16); 3133 __ orr(v2, __ T16B, v0, v0); 3134 __ orr(v3, __ T16B, v1, v1); 3135 3136 FloatRegister d0 = v8; 3137 FloatRegister d1 = v9; 3138 FloatRegister d2 = v10; 3139 FloatRegister d3 = v11; 3140 3141 3142 for (int round = 0; round < 16; round++) { 3143 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3144 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3145 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3146 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3147 3148 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3149 __ orr(v4, __ T16B, v2, v2); 3150 if (round < 15) 3151 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3152 __ sha256h(v2, __ T4S, v3, tmp2); 3153 __ sha256h2(v3, __ T4S, v4, tmp2); 3154 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3155 3156 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3157 } 3158 3159 __ addv(v0, __ T4S, v0, v2); 3160 __ addv(v1, __ T4S, v1, v3); 3161 3162 if (multi_block) { 3163 __ add(ofs, ofs, 64); 3164 __ cmp(ofs, limit); 3165 __ br(Assembler::LE, sha1_loop); 3166 __ mov(c_rarg0, ofs); // return ofs 3167 } 3168 3169 __ ldpd(v10, v11, Address(sp, 16)); 3170 __ ldpd(v8, v9, __ post(sp, 32)); 3171 3172 __ stpq(v0, v1, state); 3173 3174 __ ret(lr); 3175 3176 return start; 3177 } 3178 3179 #ifndef BUILTIN_SIM 3180 // Safefetch stubs. 3181 void generate_safefetch(const char* name, int size, address* entry, 3182 address* fault_pc, address* continuation_pc) { 3183 // safefetch signatures: 3184 // int SafeFetch32(int* adr, int errValue); 3185 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue); 3186 // 3187 // arguments: 3188 // c_rarg0 = adr 3189 // c_rarg1 = errValue 3190 // 3191 // result: 3192 // PPC_RET = *adr or errValue 3193 3194 StubCodeMark mark(this, "StubRoutines", name); 3195 3196 // Entry point, pc or function descriptor. 
3197 *entry = __ pc(); 3198 3199 // Load *adr into c_rarg1, may fault. 3200 *fault_pc = __ pc(); 3201 switch (size) { 3202 case 4: 3203 // int32_t 3204 __ ldrw(c_rarg1, Address(c_rarg0, 0)); 3205 break; 3206 case 8: 3207 // int64_t 3208 __ ldr(c_rarg1, Address(c_rarg0, 0)); 3209 break; 3210 default: 3211 ShouldNotReachHere(); 3212 } 3213 3214 // return errValue or *adr 3215 *continuation_pc = __ pc(); 3216 __ mov(r0, c_rarg1); 3217 __ ret(lr); 3218 } 3219 #endif 3220 3221 /** 3222 * Arguments: 3223 * 3224 * Inputs: 3225 * c_rarg0 - int crc 3226 * c_rarg1 - byte* buf 3227 * c_rarg2 - int length 3228 * 3229 * Ouput: 3230 * rax - int crc result 3231 */ 3232 address generate_updateBytesCRC32() { 3233 assert(UseCRC32Intrinsics, "what are we doing here?"); 3234 3235 __ align(CodeEntryAlignment); 3236 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32"); 3237 3238 address start = __ pc(); 3239 3240 const Register crc = c_rarg0; // crc 3241 const Register buf = c_rarg1; // source java byte array address 3242 const Register len = c_rarg2; // length 3243 const Register table0 = c_rarg3; // crc_table address 3244 const Register table1 = c_rarg4; 3245 const Register table2 = c_rarg5; 3246 const Register table3 = c_rarg6; 3247 const Register tmp3 = c_rarg7; 3248 3249 BLOCK_COMMENT("Entry:"); 3250 __ enter(); // required for proper stackwalking of RuntimeStub frame 3251 3252 __ kernel_crc32(crc, buf, len, 3253 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 3254 3255 __ leave(); // required for proper stackwalking of RuntimeStub frame 3256 __ ret(lr); 3257 3258 return start; 3259 } 3260 3261 /** 3262 * Arguments: 3263 * 3264 * Inputs: 3265 * c_rarg0 - int crc 3266 * c_rarg1 - byte* buf 3267 * c_rarg2 - int length 3268 * c_rarg3 - int* table 3269 * 3270 * Ouput: 3271 * r0 - int crc result 3272 */ 3273 address generate_updateBytesCRC32C() { 3274 assert(UseCRC32CIntrinsics, "what are we doing here?"); 3275 3276 __ align(CodeEntryAlignment); 3277 StubCodeMark 
mark(this, "StubRoutines", "updateBytesCRC32C"); 3278 3279 address start = __ pc(); 3280 3281 const Register crc = c_rarg0; // crc 3282 const Register buf = c_rarg1; // source java byte array address 3283 const Register len = c_rarg2; // length 3284 const Register table0 = c_rarg3; // crc_table address 3285 const Register table1 = c_rarg4; 3286 const Register table2 = c_rarg5; 3287 const Register table3 = c_rarg6; 3288 const Register tmp3 = c_rarg7; 3289 3290 BLOCK_COMMENT("Entry:"); 3291 __ enter(); // required for proper stackwalking of RuntimeStub frame 3292 3293 __ kernel_crc32c(crc, buf, len, 3294 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 3295 3296 __ leave(); // required for proper stackwalking of RuntimeStub frame 3297 __ ret(lr); 3298 3299 return start; 3300 } 3301 3302 /*** 3303 * Arguments: 3304 * 3305 * Inputs: 3306 * c_rarg0 - int adler 3307 * c_rarg1 - byte* buff 3308 * c_rarg2 - int len 3309 * 3310 * Output: 3311 * c_rarg0 - int adler result 3312 */ 3313 address generate_updateBytesAdler32() { 3314 __ align(CodeEntryAlignment); 3315 StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32"); 3316 address start = __ pc(); 3317 3318 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 3319 3320 // Aliases 3321 Register adler = c_rarg0; 3322 Register s1 = c_rarg0; 3323 Register s2 = c_rarg3; 3324 Register buff = c_rarg1; 3325 Register len = c_rarg2; 3326 Register nmax = r4; 3327 Register base = r5; 3328 Register count = r6; 3329 Register temp0 = rscratch1; 3330 Register temp1 = rscratch2; 3331 Register temp2 = r7; 3332 3333 // Max number of bytes we can process before having to take the mod 3334 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 3335 unsigned long BASE = 0xfff1; 3336 unsigned long NMAX = 0x15B0; 3337 3338 __ mov(base, BASE); 3339 __ mov(nmax, NMAX); 3340 3341 // s1 is initialized to the lower 16 bits of adler 3342 // s2 is 
initialized to the upper 16 bits of adler 3343 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 3344 __ uxth(s1, adler); // s1 = (adler & 0xffff) 3345 3346 // The pipelined loop needs at least 16 elements for 1 iteration 3347 // It does check this, but it is more effective to skip to the cleanup loop 3348 __ cmp(len, 16); 3349 __ br(Assembler::HS, L_nmax); 3350 __ cbz(len, L_combine); 3351 3352 __ bind(L_simple_by1_loop); 3353 __ ldrb(temp0, Address(__ post(buff, 1))); 3354 __ add(s1, s1, temp0); 3355 __ add(s2, s2, s1); 3356 __ subs(len, len, 1); 3357 __ br(Assembler::HI, L_simple_by1_loop); 3358 3359 // s1 = s1 % BASE 3360 __ subs(temp0, s1, base); 3361 __ csel(s1, temp0, s1, Assembler::HS); 3362 3363 // s2 = s2 % BASE 3364 __ lsr(temp0, s2, 16); 3365 __ lsl(temp1, temp0, 4); 3366 __ sub(temp1, temp1, temp0); 3367 __ add(s2, temp1, s2, ext::uxth); 3368 3369 __ subs(temp0, s2, base); 3370 __ csel(s2, temp0, s2, Assembler::HS); 3371 3372 __ b(L_combine); 3373 3374 __ bind(L_nmax); 3375 __ subs(len, len, nmax); 3376 __ sub(count, nmax, 16); 3377 __ br(Assembler::LO, L_by16); 3378 3379 __ bind(L_nmax_loop); 3380 3381 __ ldp(temp0, temp1, Address(__ post(buff, 16))); 3382 3383 __ add(s1, s1, temp0, ext::uxtb); 3384 __ ubfx(temp2, temp0, 8, 8); 3385 __ add(s2, s2, s1); 3386 __ add(s1, s1, temp2); 3387 __ ubfx(temp2, temp0, 16, 8); 3388 __ add(s2, s2, s1); 3389 __ add(s1, s1, temp2); 3390 __ ubfx(temp2, temp0, 24, 8); 3391 __ add(s2, s2, s1); 3392 __ add(s1, s1, temp2); 3393 __ ubfx(temp2, temp0, 32, 8); 3394 __ add(s2, s2, s1); 3395 __ add(s1, s1, temp2); 3396 __ ubfx(temp2, temp0, 40, 8); 3397 __ add(s2, s2, s1); 3398 __ add(s1, s1, temp2); 3399 __ ubfx(temp2, temp0, 48, 8); 3400 __ add(s2, s2, s1); 3401 __ add(s1, s1, temp2); 3402 __ add(s2, s2, s1); 3403 __ add(s1, s1, temp0, Assembler::LSR, 56); 3404 __ add(s2, s2, s1); 3405 3406 __ add(s1, s1, temp1, ext::uxtb); 3407 __ ubfx(temp2, temp1, 8, 8); 3408 __ add(s2, s2, s1); 3409 __ add(s1, s1, temp2); 3410 
__ ubfx(temp2, temp1, 16, 8); 3411 __ add(s2, s2, s1); 3412 __ add(s1, s1, temp2); 3413 __ ubfx(temp2, temp1, 24, 8); 3414 __ add(s2, s2, s1); 3415 __ add(s1, s1, temp2); 3416 __ ubfx(temp2, temp1, 32, 8); 3417 __ add(s2, s2, s1); 3418 __ add(s1, s1, temp2); 3419 __ ubfx(temp2, temp1, 40, 8); 3420 __ add(s2, s2, s1); 3421 __ add(s1, s1, temp2); 3422 __ ubfx(temp2, temp1, 48, 8); 3423 __ add(s2, s2, s1); 3424 __ add(s1, s1, temp2); 3425 __ add(s2, s2, s1); 3426 __ add(s1, s1, temp1, Assembler::LSR, 56); 3427 __ add(s2, s2, s1); 3428 3429 __ subs(count, count, 16); 3430 __ br(Assembler::HS, L_nmax_loop); 3431 3432 // s1 = s1 % BASE 3433 __ lsr(temp0, s1, 16); 3434 __ lsl(temp1, temp0, 4); 3435 __ sub(temp1, temp1, temp0); 3436 __ add(temp1, temp1, s1, ext::uxth); 3437 3438 __ lsr(temp0, temp1, 16); 3439 __ lsl(s1, temp0, 4); 3440 __ sub(s1, s1, temp0); 3441 __ add(s1, s1, temp1, ext:: uxth); 3442 3443 __ subs(temp0, s1, base); 3444 __ csel(s1, temp0, s1, Assembler::HS); 3445 3446 // s2 = s2 % BASE 3447 __ lsr(temp0, s2, 16); 3448 __ lsl(temp1, temp0, 4); 3449 __ sub(temp1, temp1, temp0); 3450 __ add(temp1, temp1, s2, ext::uxth); 3451 3452 __ lsr(temp0, temp1, 16); 3453 __ lsl(s2, temp0, 4); 3454 __ sub(s2, s2, temp0); 3455 __ add(s2, s2, temp1, ext:: uxth); 3456 3457 __ subs(temp0, s2, base); 3458 __ csel(s2, temp0, s2, Assembler::HS); 3459 3460 __ subs(len, len, nmax); 3461 __ sub(count, nmax, 16); 3462 __ br(Assembler::HS, L_nmax_loop); 3463 3464 __ bind(L_by16); 3465 __ adds(len, len, count); 3466 __ br(Assembler::LO, L_by1); 3467 3468 __ bind(L_by16_loop); 3469 3470 __ ldp(temp0, temp1, Address(__ post(buff, 16))); 3471 3472 __ add(s1, s1, temp0, ext::uxtb); 3473 __ ubfx(temp2, temp0, 8, 8); 3474 __ add(s2, s2, s1); 3475 __ add(s1, s1, temp2); 3476 __ ubfx(temp2, temp0, 16, 8); 3477 __ add(s2, s2, s1); 3478 __ add(s1, s1, temp2); 3479 __ ubfx(temp2, temp0, 24, 8); 3480 __ add(s2, s2, s1); 3481 __ add(s1, s1, temp2); 3482 __ ubfx(temp2, temp0, 32, 8); 3483 __ 
add(s2, s2, s1); 3484 __ add(s1, s1, temp2); 3485 __ ubfx(temp2, temp0, 40, 8); 3486 __ add(s2, s2, s1); 3487 __ add(s1, s1, temp2); 3488 __ ubfx(temp2, temp0, 48, 8); 3489 __ add(s2, s2, s1); 3490 __ add(s1, s1, temp2); 3491 __ add(s2, s2, s1); 3492 __ add(s1, s1, temp0, Assembler::LSR, 56); 3493 __ add(s2, s2, s1); 3494 3495 __ add(s1, s1, temp1, ext::uxtb); 3496 __ ubfx(temp2, temp1, 8, 8); 3497 __ add(s2, s2, s1); 3498 __ add(s1, s1, temp2); 3499 __ ubfx(temp2, temp1, 16, 8); 3500 __ add(s2, s2, s1); 3501 __ add(s1, s1, temp2); 3502 __ ubfx(temp2, temp1, 24, 8); 3503 __ add(s2, s2, s1); 3504 __ add(s1, s1, temp2); 3505 __ ubfx(temp2, temp1, 32, 8); 3506 __ add(s2, s2, s1); 3507 __ add(s1, s1, temp2); 3508 __ ubfx(temp2, temp1, 40, 8); 3509 __ add(s2, s2, s1); 3510 __ add(s1, s1, temp2); 3511 __ ubfx(temp2, temp1, 48, 8); 3512 __ add(s2, s2, s1); 3513 __ add(s1, s1, temp2); 3514 __ add(s2, s2, s1); 3515 __ add(s1, s1, temp1, Assembler::LSR, 56); 3516 __ add(s2, s2, s1); 3517 3518 __ subs(len, len, 16); 3519 __ br(Assembler::HS, L_by16_loop); 3520 3521 __ bind(L_by1); 3522 __ adds(len, len, 15); 3523 __ br(Assembler::LO, L_do_mod); 3524 3525 __ bind(L_by1_loop); 3526 __ ldrb(temp0, Address(__ post(buff, 1))); 3527 __ add(s1, temp0, s1); 3528 __ add(s2, s2, s1); 3529 __ subs(len, len, 1); 3530 __ br(Assembler::HS, L_by1_loop); 3531 3532 __ bind(L_do_mod); 3533 // s1 = s1 % BASE 3534 __ lsr(temp0, s1, 16); 3535 __ lsl(temp1, temp0, 4); 3536 __ sub(temp1, temp1, temp0); 3537 __ add(temp1, temp1, s1, ext::uxth); 3538 3539 __ lsr(temp0, temp1, 16); 3540 __ lsl(s1, temp0, 4); 3541 __ sub(s1, s1, temp0); 3542 __ add(s1, s1, temp1, ext:: uxth); 3543 3544 __ subs(temp0, s1, base); 3545 __ csel(s1, temp0, s1, Assembler::HS); 3546 3547 // s2 = s2 % BASE 3548 __ lsr(temp0, s2, 16); 3549 __ lsl(temp1, temp0, 4); 3550 __ sub(temp1, temp1, temp0); 3551 __ add(temp1, temp1, s2, ext::uxth); 3552 3553 __ lsr(temp0, temp1, 16); 3554 __ lsl(s2, temp0, 4); 3555 __ sub(s2, s2, temp0); 
3556 __ add(s2, s2, temp1, ext:: uxth); 3557 3558 __ subs(temp0, s2, base); 3559 __ csel(s2, temp0, s2, Assembler::HS); 3560 3561 // Combine lower bits and higher bits 3562 __ bind(L_combine); 3563 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) 3564 3565 __ ret(lr); 3566 3567 return start; 3568 } 3569 3570 /** 3571 * Arguments: 3572 * 3573 * Input: 3574 * c_rarg0 - x address 3575 * c_rarg1 - x length 3576 * c_rarg2 - y address 3577 * c_rarg3 - y lenth 3578 * c_rarg4 - z address 3579 * c_rarg5 - z length 3580 */ 3581 address generate_multiplyToLen() { 3582 __ align(CodeEntryAlignment); 3583 StubCodeMark mark(this, "StubRoutines", "multiplyToLen"); 3584 3585 address start = __ pc(); 3586 const Register x = r0; 3587 const Register xlen = r1; 3588 const Register y = r2; 3589 const Register ylen = r3; 3590 const Register z = r4; 3591 const Register zlen = r5; 3592 3593 const Register tmp1 = r10; 3594 const Register tmp2 = r11; 3595 const Register tmp3 = r12; 3596 const Register tmp4 = r13; 3597 const Register tmp5 = r14; 3598 const Register tmp6 = r15; 3599 const Register tmp7 = r16; 3600 3601 BLOCK_COMMENT("Entry:"); 3602 __ enter(); // required for proper stackwalking of RuntimeStub frame 3603 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 3604 __ leave(); // required for proper stackwalking of RuntimeStub frame 3605 __ ret(lr); 3606 3607 return start; 3608 } 3609 3610 address generate_squareToLen() { 3611 // squareToLen algorithm for sizes 1..127 described in java code works 3612 // faster than multiply_to_len on some CPUs and slower on others, but 3613 // multiply_to_len shows a bit better overall results 3614 __ align(CodeEntryAlignment); 3615 StubCodeMark mark(this, "StubRoutines", "squareToLen"); 3616 address start = __ pc(); 3617 3618 const Register x = r0; 3619 const Register xlen = r1; 3620 const Register z = r2; 3621 const Register zlen = r3; 3622 const Register y = r4; // == x 3623 const Register 
ylen = r5; // == xlen 3624 3625 const Register tmp1 = r10; 3626 const Register tmp2 = r11; 3627 const Register tmp3 = r12; 3628 const Register tmp4 = r13; 3629 const Register tmp5 = r14; 3630 const Register tmp6 = r15; 3631 const Register tmp7 = r16; 3632 3633 RegSet spilled_regs = RegSet::of(y, ylen); 3634 BLOCK_COMMENT("Entry:"); 3635 __ enter(); 3636 __ push(spilled_regs, sp); 3637 __ mov(y, x); 3638 __ mov(ylen, xlen); 3639 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 3640 __ pop(spilled_regs, sp); 3641 __ leave(); 3642 __ ret(lr); 3643 return start; 3644 } 3645 3646 address generate_mulAdd() { 3647 __ align(CodeEntryAlignment); 3648 StubCodeMark mark(this, "StubRoutines", "mulAdd"); 3649 3650 address start = __ pc(); 3651 3652 const Register out = r0; 3653 const Register in = r1; 3654 const Register offset = r2; 3655 const Register len = r3; 3656 const Register k = r4; 3657 3658 BLOCK_COMMENT("Entry:"); 3659 __ enter(); 3660 __ mul_add(out, in, offset, len, k); 3661 __ leave(); 3662 __ ret(lr); 3663 3664 return start; 3665 } 3666 3667 void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi, 3668 FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0, 3669 FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) { 3670 // Karatsuba multiplication performs a 128*128 -> 256-bit 3671 // multiplication in three 128-bit multiplications and a few 3672 // additions. 
3673 // 3674 // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1) 3675 // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0 3676 // 3677 // Inputs: 3678 // 3679 // A0 in a.d[0] (subkey) 3680 // A1 in a.d[1] 3681 // (A1+A0) in a1_xor_a0.d[0] 3682 // 3683 // B0 in b.d[0] (state) 3684 // B1 in b.d[1] 3685 3686 __ ext(tmp1, __ T16B, b, b, 0x08); 3687 __ pmull2(result_hi, __ T1Q, b, a, __ T2D); // A1*B1 3688 __ eor(tmp1, __ T16B, tmp1, b); // (B1+B0) 3689 __ pmull(result_lo, __ T1Q, b, a, __ T1D); // A0*B0 3690 __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0) 3691 3692 __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08); 3693 __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0 3694 __ eor(tmp2, __ T16B, tmp2, tmp4); 3695 __ eor(tmp2, __ T16B, tmp2, tmp3); 3696 3697 // Register pair <result_hi:result_lo> holds the result of carry-less multiplication 3698 __ ins(result_hi, __ D, tmp2, 0, 1); 3699 __ ins(result_lo, __ D, tmp2, 1, 0); 3700 } 3701 3702 void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi, 3703 FloatRegister p, FloatRegister z, FloatRegister t1) { 3704 const FloatRegister t0 = result; 3705 3706 // The GCM field polynomial f is z^128 + p(z), where p = 3707 // z^7+z^2+z+1. 3708 // 3709 // z^128 === -p(z) (mod (z^128 + p(z))) 3710 // 3711 // so, given that the product we're reducing is 3712 // a == lo + hi * z^128 3713 // substituting, 3714 // === lo - hi * p(z) (mod (z^128 + p(z))) 3715 // 3716 // we reduce by multiplying hi by p(z) and subtracting the result 3717 // from (i.e. XORing it with) lo. Because p has no nonzero high 3718 // bits we can do this with two 64-bit multiplications, lo*p and 3719 // hi*p. 
3720 3721 __ pmull2(t0, __ T1Q, hi, p, __ T2D); 3722 __ ext(t1, __ T16B, t0, z, 8); 3723 __ eor(hi, __ T16B, hi, t1); 3724 __ ext(t1, __ T16B, z, t0, 8); 3725 __ eor(lo, __ T16B, lo, t1); 3726 __ pmull(t0, __ T1Q, hi, p, __ T1D); 3727 __ eor(result, __ T16B, lo, t0); 3728 } 3729 3730 address generate_has_negatives(address &has_negatives_long) { 3731 StubCodeMark mark(this, "StubRoutines", "has_negatives"); 3732 const int large_loop_size = 64; 3733 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 3734 int dcache_line = VM_Version::dcache_line_size(); 3735 3736 Register ary1 = r1, len = r2, result = r0; 3737 3738 __ align(CodeEntryAlignment); 3739 address entry = __ pc(); 3740 3741 __ enter(); 3742 3743 Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE, 3744 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 3745 3746 __ cmp(len, 15); 3747 __ br(Assembler::GT, LEN_OVER_15); 3748 // The only case when execution falls into this code is when pointer is near 3749 // the end of memory page and we have to avoid reading next page 3750 __ add(ary1, ary1, len); 3751 __ subs(len, len, 8); 3752 __ br(Assembler::GT, LEN_OVER_8); 3753 __ ldr(rscratch2, Address(ary1, -8)); 3754 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 
3755 __ lsrv(rscratch2, rscratch2, rscratch1); 3756 __ tst(rscratch2, UPPER_BIT_MASK); 3757 __ cset(result, Assembler::NE); 3758 __ leave(); 3759 __ ret(lr); 3760 __ bind(LEN_OVER_8); 3761 __ ldp(rscratch1, rscratch2, Address(ary1, -16)); 3762 __ sub(len, len, 8); // no data dep., then sub can be executed while loading 3763 __ tst(rscratch2, UPPER_BIT_MASK); 3764 __ br(Assembler::NE, RET_TRUE_NO_POP); 3765 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes 3766 __ lsrv(rscratch1, rscratch1, rscratch2); 3767 __ tst(rscratch1, UPPER_BIT_MASK); 3768 __ cset(result, Assembler::NE); 3769 __ leave(); 3770 __ ret(lr); 3771 3772 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10; 3773 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6; 3774 3775 has_negatives_long = __ pc(); // 2nd entry point 3776 3777 __ enter(); 3778 3779 __ bind(LEN_OVER_15); 3780 __ push(spilled_regs, sp); 3781 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment 3782 __ cbz(rscratch2, ALIGNED); 3783 __ ldp(tmp6, tmp1, Address(ary1)); 3784 __ mov(tmp5, 16); 3785 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address 3786 __ add(ary1, ary1, rscratch1); 3787 __ sub(len, len, rscratch1); 3788 __ orr(tmp6, tmp6, tmp1); 3789 __ tst(tmp6, UPPER_BIT_MASK); 3790 __ br(Assembler::NE, RET_TRUE); 3791 3792 __ bind(ALIGNED); 3793 __ cmp(len, large_loop_size); 3794 __ br(Assembler::LT, CHECK_16); 3795 // Perform 16-byte load as early return in pre-loop to handle situation 3796 // when initially aligned large array has negative values at starting bytes, 3797 // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is 3798 // slower. Cases with negative bytes further ahead won't be affected that 3799 // much. In fact, it'll be faster due to early loads, less instructions and 3800 // less branches in LARGE_LOOP. 
3801 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16))); 3802 __ sub(len, len, 16); 3803 __ orr(tmp6, tmp6, tmp1); 3804 __ tst(tmp6, UPPER_BIT_MASK); 3805 __ br(Assembler::NE, RET_TRUE); 3806 __ cmp(len, large_loop_size); 3807 __ br(Assembler::LT, CHECK_16); 3808 3809 if (SoftwarePrefetchHintDistance >= 0 3810 && SoftwarePrefetchHintDistance >= dcache_line) { 3811 // initial prefetch 3812 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line)); 3813 } 3814 __ bind(LARGE_LOOP); 3815 if (SoftwarePrefetchHintDistance >= 0) { 3816 __ prfm(Address(ary1, SoftwarePrefetchHintDistance)); 3817 } 3818 // Issue load instructions first, since it can save few CPU/MEM cycles, also 3819 // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp) 3820 // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3 3821 // instructions per cycle and have less branches, but this approach disables 3822 // early return, thus, all 64 bytes are loaded and checked every time. 3823 __ ldp(tmp2, tmp3, Address(ary1)); 3824 __ ldp(tmp4, tmp5, Address(ary1, 16)); 3825 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 3826 __ ldp(tmp6, tmp1, Address(ary1, 48)); 3827 __ add(ary1, ary1, large_loop_size); 3828 __ sub(len, len, large_loop_size); 3829 __ orr(tmp2, tmp2, tmp3); 3830 __ orr(tmp4, tmp4, tmp5); 3831 __ orr(rscratch1, rscratch1, rscratch2); 3832 __ orr(tmp6, tmp6, tmp1); 3833 __ orr(tmp2, tmp2, tmp4); 3834 __ orr(rscratch1, rscratch1, tmp6); 3835 __ orr(tmp2, tmp2, rscratch1); 3836 __ tst(tmp2, UPPER_BIT_MASK); 3837 __ br(Assembler::NE, RET_TRUE); 3838 __ cmp(len, large_loop_size); 3839 __ br(Assembler::GE, LARGE_LOOP); 3840 3841 __ bind(CHECK_16); // small 16-byte load pre-loop 3842 __ cmp(len, 16); 3843 __ br(Assembler::LT, POST_LOOP16); 3844 3845 __ bind(LOOP16); // small 16-byte load loop 3846 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 3847 __ sub(len, len, 16); 3848 __ orr(tmp2, tmp2, tmp3); 3849 __ tst(tmp2, UPPER_BIT_MASK); 3850 __ 
br(Assembler::NE, RET_TRUE);
    __ cmp(len, 16);
    __ br(Assembler::GE, LOOP16); // 16-byte load loop end

    __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally
    __ cmp(len, 8);
    __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL);
    __ ldr(tmp3, Address(__ post(ary1, 8)));
    __ sub(len, len, 8);
    __ tst(tmp3, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_TRUE);

    __ bind(POST_LOOP16_LOAD_TAIL);
    __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0
    __ ldr(tmp1, Address(ary1));
    __ mov(tmp2, 64);
    __ sub(tmp4, tmp2, len, __ LSL, 3); // 64 - len*8 = number of bits to discard
    __ lslv(tmp1, tmp1, tmp4);          // shift out the bytes beyond the tail
    __ tst(tmp1, UPPER_BIT_MASK);
    __ br(Assembler::NE, RET_TRUE);
    // Fallthrough

    __ bind(RET_FALSE);
    __ pop(spilled_regs, sp);
    __ leave();
    __ mov(result, zr);
    __ ret(lr);

    __ bind(RET_TRUE);
    __ pop(spilled_regs, sp);
    __ bind(RET_TRUE_NO_POP);
    __ leave();
    __ mov(result, 1);
    __ ret(lr);

    __ bind(DONE);
    __ pop(spilled_regs, sp);
    __ leave();
    __ ret(lr);
    return entry;
  }

  /**
   *  Arguments:
   *
   *  Input:
   *  c_rarg0   - current state address
   *  c_rarg1   - H key address
   *  c_rarg2   - data address
   *  c_rarg3   - number of blocks
   *
   *  Output:
   *  Updated state at c_rarg0
   */
  address generate_ghash_processBlocks() {
    // Bafflingly, GCM uses little-endian for the byte order, but
    // big-endian for the bit order.  For example, the polynomial 1 is
    // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
    //
    // So, we must either reverse the bytes in each word and do
    // everything big-endian or reverse the bits in each byte and do
    // it little-endian.  On AArch64 it's more idiomatic to reverse
    // the bits in each byte (we have an instruction, RBIT, to do
    // that) and keep the data in little-endian bit order through the
    // calculation, bit-reversing the inputs and outputs.

    StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
    __ align(wordSize * 2);
    address p = __ pc();
    __ emit_int64(0x87);  // The low-order bits of the field
                          // polynomial (i.e. p = z^7+z^2+z+1)
                          // repeated in the low and high parts of a
                          // 128-bit vector
    __ emit_int64(0x87);

    __ align(CodeEntryAlignment);
    address start = __ pc();

    Register state   = c_rarg0;
    Register subkeyH = c_rarg1;
    Register data    = c_rarg2;
    Register blocks  = c_rarg3;

    FloatRegister vzr = v30;
    __ eor(vzr, __ T16B, vzr, vzr); // zero register

    __ ldrq(v0, Address(state));
    __ ldrq(v1, Address(subkeyH));

    __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
    __ rbit(v0, __ T16B, v0);
    __ rev64(v1, __ T16B, v1);
    __ rbit(v1, __ T16B, v1);

    __ ldrq(v26, p);                    // field polynomial constant emitted above

    __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
    __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))

    {
      Label L_ghash_loop;
      __ bind(L_ghash_loop);

      __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
                                                 // reversing each byte
      __ rbit(v2, __ T16B, v2);
      __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state

      // Multiply state in v2 by subkey in v1
      ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
                     /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
                     /*temps*/v6, v20, v18, v21);
      // Reduce v7:v5 by the field polynomial
      ghash_reduce(v0, v5, v7, v26, vzr, v20);

      __ sub(blocks, blocks, 1);
      __ cbnz(blocks, L_ghash_loop);
    }

    // The bit-reversed result is at this point in v0
    __ rev64(v1, __ T16B, v0);
    __ rbit(v1, __ T16B, v1);

    __ st1(v1, __ T16B, state);
    __ ret(lr);

    return start;
  }

  // Continuation point for throwing of implicit exceptions that are
  // not handled in the current activation. Fabricates an exception
  // oop and initiates normal exception dispatching in this
  // frame. Since we need to preserve callee-saved values (currently
  // only for C2, but done for C1 as well) we need a callee-saved oop
  // map and therefore have to make these stubs into RuntimeStubs
  // rather than BufferBlobs.  If the compiler needs all registers to
  // be preserved between the fault point and the exception handler
  // then it must assume responsibility for that in
  // AbstractCompiler::continuation_for_implicit_null_exception or
  // continuation_for_implicit_division_by_zero_exception.  All other
  // implicit exceptions (e.g., NullPointerException or
  // AbstractMethodError on entry) are either at call sites or
  // otherwise assume that stack unwinding will be initiated, so
  // caller saved registers were assumed volatile in the compiler.

#undef __
#define __ masm->

  // Builds a RuntimeStub that calls runtime_entry(rthread[, arg1[, arg2]])
  // and then jumps to the shared forward_exception stub; the runtime call
  // is expected to have posted a pending exception.
  address generate_throw_exception(const char* name,
                                   address runtime_entry,
                                   Register arg1 = noreg,
                                   Register arg2 = noreg) {
    // Information about frame layout at time of blocking runtime call.
    // Note that we only have to preserve callee-saved registers since
    // the compilers are responsible for supplying a continuation point
    // if they expect all registers to be preserved.
    // n.b.
// aarch64 asserts that frame::arg_reg_save_area_bytes == 0
    enum layout {
      rfp_off = 0,
      rfp_off2,
      return_off,
      return_off2,
      framesize // inclusive of return address
    };

    int insts_size = 512;
    int locs_size  = 64;

    CodeBuffer code(name, insts_size, locs_size);
    OopMapSet* oop_maps  = new OopMapSet();
    MacroAssembler* masm = new MacroAssembler(&code);

    address start = __ pc();

    // This is an inlined and slightly modified version of call_VM
    // which has the ability to fetch the return PC out of
    // thread-local storage and also sets up last_Java_sp slightly
    // differently than the real call_VM

    __ enter(); // Save FP and LR before call

    assert(is_even(framesize/2), "sp not 16-byte aligned");

    // lr and fp are already in place
    __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog

    int frame_complete = __ pc() - start;

    // Set up last_Java_sp and last_Java_fp
    address the_pc = __ pc();
    __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1);

    // Call runtime: c_rarg0 is always the current thread; optional
    // extra arguments go in c_rarg1/c_rarg2.
    if (arg1 != noreg) {
      assert(arg2 != c_rarg1, "clobbered");
      __ mov(c_rarg1, arg1);
    }
    if (arg2 != noreg) {
      __ mov(c_rarg2, arg2);
    }
    __ mov(c_rarg0, rthread);
    BLOCK_COMMENT("call runtime_entry");
    __ mov(rscratch1, runtime_entry);
    __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);

    // Generate oop map
    OopMap* map = new OopMap(framesize, 0);

    oop_maps->add_gc_map(the_pc - start, map);

    __ reset_last_Java_frame(true);
    __ maybe_isb();

    __ leave();

    // check for pending exceptions
#ifdef ASSERT
    Label L;
    __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
    __ cbnz(rscratch1, L);
    __ should_not_reach_here();
    __ bind(L);
#endif // ASSERT
    __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));


    // codeBlob framesize is in words (not VMRegImpl::slot_size)
    RuntimeStub* stub =
      RuntimeStub::new_runtime_stub(name,
                                    &code,
                                    frame_complete,
                                    (framesize >> (LogBytesPerWord - LogBytesPerInt)),
                                    oop_maps, false);
    return stub->entry_point();
  }

  // Generator for the Montgomery multiply/square intrinsic stubs.
  // Derives from MacroAssembler so the stub body can be written as
  // bare instruction calls (no "__" prefix).
  class MontgomeryMultiplyGenerator : public MacroAssembler {

    Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
      Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;

    RegSet _toSave;
    bool _squaring;

  public:
    MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
      : MacroAssembler(as->code()), _squaring(squaring) {

      // Register allocation: assign consecutive registers starting at
      // c_rarg0.  When squaring, a and b are the same array, so Pb_base
      // aliases Pa_base and one argument register is saved.

      Register reg = c_rarg0;
      Pa_base = reg;       // Argument registers
      if (squaring)
        Pb_base = Pa_base;
      else
        Pb_base = ++reg;
      Pn_base = ++reg;
      Rlen= ++reg;
      inv = ++reg;
      Pm_base = ++reg;

      // Working registers:
      Ra =  ++reg;        // The current digit of a, b, n, and m.
      Rb =  ++reg;
      Rm =  ++reg;
      Rn =  ++reg;

      Pa =  ++reg;        // Pointers to the current/next digit of a, b, n, and m.
      Pb =  ++reg;
      Pm =  ++reg;
      Pn =  ++reg;

      t0 =  ++reg;        // Three registers which form a
      t1 =  ++reg;        // triple-precision accumulator.
      t2 =  ++reg;

      Ri =  ++reg;        // Inner and outer loop indexes.
      Rj =  ++reg;

      Rhi_ab = ++reg;     // Product registers: low and high parts
      Rlo_ab = ++reg;     // of a*b and m*n.
      Rhi_mn = ++reg;
      Rlo_mn = ++reg;

      // r19 and up are callee-saved.
_toSave = RegSet::range(r19, reg) + Pm_base;
    }

  private:
    void save_regs() {
      push(_toSave, sp);
    }

    void restore_regs() {
      pop(_toSave, sp);
    }

    // Emit `block` count times, two copies per loop iteration; an odd
    // count enters at the second copy so exactly `count` copies run.
    template <typename T>
    void unroll_2(Register count, T block) {
      Label loop, end, odd;
      tbnz(count, 0, odd);
      cbz(count, end);
      align(16);
      bind(loop);
      (this->*block)();
      bind(odd);
      (this->*block)();
      subs(count, count, 2);
      br(Assembler::GT, loop);
      bind(end);
    }

    // Same as above for a member function taking (d, s, tmp).
    template <typename T>
    void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
      Label loop, end, odd;
      tbnz(count, 0, odd);
      cbz(count, end);
      align(16);
      bind(loop);
      (this->*block)(d, s, tmp);
      bind(odd);
      (this->*block)(d, s, tmp);
      subs(count, count, 2);
      br(Assembler::GT, loop);
      bind(end);
    }

    // Set up pointers and load first digits for outer-loop iteration i
    // of the first (i < len) phase.
    void pre1(RegisterOrConstant i) {
      block_comment("pre1");
      // Pa = Pa_base;
      // Pb = Pb_base + i;
      // Pm = Pm_base;
      // Pn = Pn_base + i;
      // Ra = *Pa;
      // Rb = *Pb;
      // Rm = *Pm;
      // Rn = *Pn;
      ldr(Ra, Address(Pa_base));
      ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
      ldr(Rm, Address(Pm_base));
      ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
      lea(Pa, Address(Pa_base));
      lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
      lea(Pm, Address(Pm_base));
      lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));

      // Zero the m*n result.
      mov(Rhi_mn, zr);
      mov(Rlo_mn, zr);
    }

    // The core multiply-accumulate step of a Montgomery
    // multiplication.  The idea is to schedule operations as a
    // pipeline so that instructions with long latencies (loads and
    // multiplies) have time to complete before their results are
    // used.  This most benefits in-order implementations of the
    // architecture but out-of-order ones also benefit.
    void step() {
      block_comment("step");
      // MACC(Ra, Rb, t0, t1, t2);
      // Ra = *++Pa;
      // Rb = *--Pb;
      umulh(Rhi_ab, Ra, Rb);
      mul(Rlo_ab, Ra, Rb);
      ldr(Ra, pre(Pa, wordSize));
      ldr(Rb, pre(Pb, -wordSize));
      acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
                                       // previous iteration.
      // MACC(Rm, Rn, t0, t1, t2);
      // Rm = *++Pm;
      // Rn = *--Pn;
      umulh(Rhi_mn, Rm, Rn);
      mul(Rlo_mn, Rm, Rn);
      ldr(Rm, pre(Pm, wordSize));
      ldr(Rn, pre(Pn, -wordSize));
      acc(Rhi_ab, Rlo_ab, t0, t1, t2);
    }

    // Finish an iteration of the first phase: fold in the pending
    // products, compute the new m digit, and shift the accumulator.
    void post1() {
      block_comment("post1");

      // MACC(Ra, Rb, t0, t1, t2);
      // Ra = *++Pa;
      // Rb = *--Pb;
      umulh(Rhi_ab, Ra, Rb);
      mul(Rlo_ab, Ra, Rb);
      acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
      acc(Rhi_ab, Rlo_ab, t0, t1, t2);

      // *Pm = Rm = t0 * inv;
      mul(Rm, t0, inv);
      str(Rm, Address(Pm));

      // MACC(Rm, Rn, t0, t1, t2);
      // t0 = t1; t1 = t2; t2 = 0;
      umulh(Rhi_mn, Rm, Rn);

#ifndef PRODUCT
      // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
      {
        mul(Rlo_mn, Rm, Rn);
        add(Rlo_mn, t0, Rlo_mn);
        Label ok;
        cbz(Rlo_mn, ok); {
          stop("broken Montgomery multiply");
        } bind(ok);
      }
#endif
      // We have very carefully set things up so that
      // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
      // the lower half of Rm * Rn because we know the result already:
      // it must be -t0.  t0 + (-t0) must generate a carry iff
      // t0 != 0.  So, rather than do a mul and an adds we just set
      // the carry flag iff t0 is nonzero.
      //
      // mul(Rlo_mn, Rm, Rn);
      // adds(zr, t0, Rlo_mn);
      subs(zr, t0, 1); // Set carry iff t0 is nonzero
      adcs(t0, t1, Rhi_mn);
      adc(t1, t2, zr);
      mov(t2, zr);
    }

    // Set up pointers and pre-load digits for outer-loop iteration i
    // of the second (i >= len) phase.
    void pre2(RegisterOrConstant i, RegisterOrConstant len) {
      block_comment("pre2");
      // Pa = Pa_base + i-len;
      // Pb = Pb_base + len;
      // Pm = Pm_base + i-len;
      // Pn = Pn_base + len;

      if (i.is_register()) {
        sub(Rj, i.as_register(), len);
      } else {
        mov(Rj, i.as_constant());
        sub(Rj, Rj, len);
      }
      // Rj == i-len

      lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
      lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
      lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
      lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));

      // Ra = *++Pa;
      // Rb = *--Pb;
      // Rm = *++Pm;
      // Rn = *--Pn;
      ldr(Ra, pre(Pa, wordSize));
      ldr(Rb, pre(Pb, -wordSize));
      ldr(Rm, pre(Pm, wordSize));
      ldr(Rn, pre(Pn, -wordSize));

      mov(Rhi_mn, zr);
      mov(Rlo_mn, zr);
    }

    // Finish an iteration of the second phase: store the completed
    // result digit and shift the accumulator.
    void post2(RegisterOrConstant i, RegisterOrConstant len) {
      block_comment("post2");
      if (i.is_constant()) {
        mov(Rj, i.as_constant()-len.as_constant());
      } else {
        sub(Rj, i.as_register(), len);
      }

      adds(t0, t0, Rlo_mn); // The pending m*n, low part

      // As soon as we know the least significant digit of our result,
      // store it.
      // Pm_base[i-len] = t0;
      str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));

      // t0 = t1; t1 = t2; t2 = 0;
      adcs(t0, t1, Rhi_mn); // The pending m*n, high part
      adc(t1, t2, zr);
      mov(t2, zr);
    }

    // A carry in t0 after Montgomery multiplication means that we
    // should subtract multiples of n from our result in m.  We'll
    // keep doing that until there is no carry.
    void normalize(RegisterOrConstant len) {
      block_comment("normalize");
      // while (t0)
      //   t0 = sub(Pm_base, Pn_base, t0, len);
      Label loop, post, again;
      Register cnt = t1, i = t2; // Re-use registers; we're done with them now
      cbz(t0, post); {
        bind(again); {
          mov(i, zr);
          mov(cnt, len);
          ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
          ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
          subs(zr, zr, zr); // set carry flag, i.e. no borrow
          align(16);
          bind(loop); {
            sbcs(Rm, Rm, Rn);
            str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
            add(i, i, 1);
            ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
            ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
            sub(cnt, cnt, 1);
          } cbnz(cnt, loop);
          sbc(t0, t0, zr);
        } cbnz(t0, again);
      } bind(post);
    }

    // Move memory at s to d, reversing words.
    //    Increments d to end of copied memory
    //    Destroys tmp1, tmp2
    //    Preserves len
    //    Leaves s pointing to the address which was in d at start
    void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
      assert(tmp1 < r19 && tmp2 < r19, "register corruption");

      lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
      mov(tmp1, len);
      unroll_2(tmp1,  &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
      sub(s, d, len, ext::uxtw, LogBytesPerWord);
    }
    // where
    void reverse1(Register d, Register s, Register tmp) {
      ldr(tmp, pre(s, -wordSize));
      ror(tmp, tmp, 32); // swap the two 32-bit halves of each word
      str(tmp, post(d, wordSize));
    }

    void step_squaring() {
      // An extra ACC
      step();
      acc(Rhi_ab, Rlo_ab, t0, t1, t2);
    }

    // Fold in the final a*b product of an even-indexed squaring row.
    void last_squaring(RegisterOrConstant i) {
      Label dont;
      // if ((i & 1) == 0) {
      tbnz(i.as_register(), 0, dont); {
        // MACC(Ra, Rb, t0, t1, t2);
        // Ra = *++Pa;
        // Rb = *--Pb;
        umulh(Rhi_ab, Ra, Rb);
        mul(Rlo_ab, Ra, Rb);
        acc(Rhi_ab, Rlo_ab, t0, t1, t2);
      } bind(dont);
    }

    void extra_step_squaring() {
      acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n

      // MACC(Rm, Rn, t0, t1, t2);
      // Rm = *++Pm;
      // Rn = *--Pn;
      umulh(Rhi_mn, Rm, Rn);
      mul(Rlo_mn, Rm, Rn);
      ldr(Rm, pre(Pm, wordSize));
      ldr(Rn, pre(Pn, -wordSize));
    }

    void post1_squaring() {
      acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n

      // *Pm = Rm = t0 * inv;
      mul(Rm, t0, inv);
      str(Rm, Address(Pm));

      // MACC(Rm, Rn, t0, t1, t2);
      // t0 = t1; t1 = t2; t2 = 0;
      umulh(Rhi_mn, Rm, Rn);

#ifndef PRODUCT
      // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
      {
        mul(Rlo_mn, Rm, Rn);
        add(Rlo_mn, t0, Rlo_mn);
        Label ok;
        cbz(Rlo_mn, ok); {
          stop("broken Montgomery multiply");
        } bind(ok);
      }
#endif
      // We have very carefully set things up so that
      // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
      // the lower half of Rm * Rn because we know the result already:
      // it must be -t0.  t0 + (-t0) must generate a carry iff
      // t0 != 0.  So, rather than do a mul and an adds we just set
      // the carry flag iff t0 is nonzero.
      //
      // mul(Rlo_mn, Rm, Rn);
      // adds(zr, t0, Rlo_mn);
      subs(zr, t0, 1); // Set carry iff t0 is nonzero
      adcs(t0, t1, Rhi_mn);
      adc(t1, t2, zr);
      mov(t2, zr);
    }

    // Add Rhi:Rlo into the triple-precision accumulator t2:t1:t0.
    void acc(Register Rhi, Register Rlo,
             Register t0, Register t1, Register t2) {
      adds(t0, t0, Rlo);
      adcs(t1, t1, Rhi);
      adc(t2, t2, zr);
    }

  public:
    /**
     * Fast Montgomery multiplication.  The derivation of the
     * algorithm is in A Cryptographic Library for the Motorola
     * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
     *
     * Arguments:
     *
     * Inputs for multiplication:
     *   c_rarg0   - int array elements a
     *   c_rarg1   - int array elements b
     *   c_rarg2   - int array elements n (the modulus)
     *   c_rarg3   - int length
     *   c_rarg4   - int inv
     *   c_rarg5   - int array elements m (the result)
     *
     * Inputs for squaring:
     *   c_rarg0   - int array elements a
     *   c_rarg1   - int array elements n (the modulus)
     *   c_rarg2   - int length
     *   c_rarg3   - int inv
     *   c_rarg4   - int array elements m (the result)
     *
     */
    address generate_multiply() {
      Label argh, nothing;
      bind(argh);
      stop("MontgomeryMultiply total_allocation must be <= 8192");

      align(CodeEntryAlignment);
      address entry = pc();

      cbzw(Rlen, nothing); // zero-length input: nothing to do

      enter();

      // Make room.
      cmpw(Rlen, 512);
      br(Assembler::HI, argh);
      sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
      andr(sp, Ra, -2 * wordSize); // keep sp 16-byte aligned

      lsrw(Rlen, Rlen, 1);  // length in longwords = len/2

      {
        // Copy input args, reversing as we go.  We use Ra as a
        // temporary variable.
        reverse(Ra, Pa_base, Rlen, t0, t1);
        if (!_squaring)
          reverse(Ra, Pb_base, Rlen, t0, t1);
        reverse(Ra, Pn_base, Rlen, t0, t1);
      }

      // Push all call-saved registers and also Pm_base which we'll need
      // at the end.
      save_regs();

#ifndef PRODUCT
      // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
      {
        ldr(Rn, Address(Pn_base, 0));
        mul(Rlo_mn, Rn, inv);
        cmp(Rlo_mn, -1);
        Label ok;
        br(EQ, ok); {
          stop("broken inverse in Montgomery multiply");
        } bind(ok);
      }
#endif

      mov(Pm_base, Ra);

      mov(t0, zr);
      mov(t1, zr);
      mov(t2, zr);

      block_comment("for (int i = 0; i < len; i++) {");
      mov(Ri, zr); {
        Label loop, end;
        cmpw(Ri, Rlen);
        br(Assembler::GE, end);

        bind(loop);
        pre1(Ri);

        block_comment(" for (j = i; j; j--) {"); {
          movw(Rj, Ri);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
        } block_comment(" } // j");

        post1();
        addw(Ri, Ri, 1);
        cmpw(Ri, Rlen);
        br(Assembler::LT, loop);
        bind(end);
        block_comment("} // i");
      }

      block_comment("for (int i = len; i < 2*len; i++) {");
      mov(Ri, Rlen); {
        Label loop, end;
        cmpw(Ri, Rlen, Assembler::LSL, 1);
        br(Assembler::GE, end);

        bind(loop);
        pre2(Ri, Rlen);

        block_comment(" for (j = len*2-i-1; j; j--) {"); {
          lslw(Rj, Rlen, 1);
          subw(Rj, Rj, Ri);
          subw(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
        } block_comment(" } // j");

        post2(Ri, Rlen);
        addw(Ri, Ri, 1);
        cmpw(Ri, Rlen, Assembler::LSL, 1);
        br(Assembler::LT, loop);
        bind(end);
      }
      block_comment("} // i");

      normalize(Rlen);

      mov(Ra, Pm_base);  // Save Pm_base in Ra
      restore_regs();  // Restore caller's Pm_base

      // Copy our result into caller's Pm_base
      reverse(Pm_base, Ra, Rlen, t0, t1);

      leave();
      bind(nothing);
      ret(lr);

      return entry;
    }
    // In C, approximately:

    // void
    // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
    //                     unsigned long Pn_base[], unsigned long Pm_base[],
    //                     unsigned long inv, int len) {
    //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
    //   unsigned long *Pa, *Pb, *Pn, *Pm;
    //   unsigned long Ra, Rb, Rn, Rm;

    //   int i;

    //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");

    //   for (i = 0; i < len; i++) {
    //     int j;

    //     Pa = Pa_base;
    //     Pb = Pb_base + i;
    //     Pm = Pm_base;
    //     Pn = Pn_base + i;

    //     Ra = *Pa;
    //     Rb = *Pb;
    //     Rm = *Pm;
    //     Rn = *Pn;

    //     int iters = i;
    //     for (j = 0; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
    //       MACC(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }

    //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
    //     MACC(Ra, Rb, t0, t1, t2);
    //     *Pm = Rm = t0 * inv;
    //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
    //     MACC(Rm, Rn, t0, t1, t2);

    //     assert(t0 == 0, "broken Montgomery multiply");

    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   for (i = len; i < 2*len; i++) {
    //     int j;

    //     Pa = Pa_base + i-len;
    //     Pb = Pb_base + len;
    //     Pm = Pm_base + i-len;
    //     Pn = Pn_base + len;

    //     Ra = *++Pa;
    //     Rb = *--Pb;
    //     Rm = *++Pm;
    //     Rn = *--Pn;

    //     int iters = len*2-i-1;
    //     for (j = i-len+1; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
    //       MACC(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }

    //     Pm_base[i-len] = t0;
    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   while (t0)
    //     t0 = sub(Pm_base, Pn_base, t0, len);
    // }

    /**
     * Fast Montgomery squaring.  This uses asymptotically 25% fewer
     * multiplies than Montgomery multiplication so it should be up to
     * 25% faster.  However, its loop control is more complex and it
     * may actually run slower on some machines.
     *
     * Arguments:
     *
     * Inputs:
     *   c_rarg0   - int array elements a
     *   c_rarg1   - int array elements n (the modulus)
     *   c_rarg2   - int length
     *   c_rarg3   - int inv
     *   c_rarg4   - int array elements m (the result)
     *
     */
    address generate_square() {
      Label argh;
      bind(argh);
      stop("MontgomeryMultiply total_allocation must be <= 8192");

      align(CodeEntryAlignment);
      address entry = pc();

      enter();

      // Make room.
      cmpw(Rlen, 512);
      br(Assembler::HI, argh);
      sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
      andr(sp, Ra, -2 * wordSize); // keep sp 16-byte aligned

      lsrw(Rlen, Rlen, 1);  // length in longwords = len/2

      {
        // Copy input args, reversing as we go.  We use Ra as a
        // temporary variable.
        reverse(Ra, Pa_base, Rlen, t0, t1);
        reverse(Ra, Pn_base, Rlen, t0, t1);
      }

      // Push all call-saved registers and also Pm_base which we'll need
      // at the end.
      save_regs();

      mov(Pm_base, Ra);

      mov(t0, zr);
      mov(t1, zr);
      mov(t2, zr);

      block_comment("for (int i = 0; i < len; i++) {");
      mov(Ri, zr); {
        Label loop, end;
        bind(loop);
        cmp(Ri, Rlen);
        br(Assembler::GE, end);

        pre1(Ri);

        block_comment("for (j = (i+1)/2; j; j--) {"); {
          add(Rj, Ri, 1);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
        } block_comment(" } // j");

        last_squaring(Ri);

        block_comment(" for (j = i/2; j; j--) {"); {
          lsr(Rj, Ri, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
        } block_comment(" } // j");

        post1_squaring();
        add(Ri, Ri, 1);
        cmp(Ri, Rlen);
        br(Assembler::LT, loop);

        bind(end);
        block_comment("} // i");
      }

      block_comment("for (int i = len; i < 2*len; i++) {");
      mov(Ri, Rlen); {
        Label loop, end;
        bind(loop);
        cmp(Ri, Rlen, Assembler::LSL, 1);
        br(Assembler::GE, end);

        pre2(Ri, Rlen);

        block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); {
          lsl(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          sub(Rj, Rj, 1);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
        } block_comment(" } // j");

        last_squaring(Ri);

        block_comment(" for (j = (2*len-i)/2; j; j--) {"); {
          lsl(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
        } block_comment(" } // j");

        post2(Ri, Rlen);
        add(Ri, Ri, 1);
        cmp(Ri, Rlen, Assembler::LSL, 1);

        br(Assembler::LT, loop);
        bind(end);
        block_comment("} // i");
      }

      normalize(Rlen);

      mov(Ra, Pm_base);  // Save Pm_base in Ra
      restore_regs();  // Restore caller's Pm_base

      // Copy our result into caller's Pm_base
      reverse(Pm_base, Ra, Rlen, t0, t1);

      leave();
      ret(lr);

      return entry;
    }
    // In C, approximately:

    // void
    // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
    //                   unsigned long Pm_base[], unsigned long inv, int len) {
    //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
    //   unsigned long *Pa, *Pb, *Pn, *Pm;
    //   unsigned long Ra, Rb, Rn, Rm;

    //   int i;

    //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");

    //   for (i = 0; i < len; i++) {
    //     int j;

    //     Pa = Pa_base;
    //     Pb = Pa_base + i;
    //     Pm = Pm_base;
    //     Pn = Pn_base + i;

    //     Ra = *Pa;
    //     Rb = *Pb;
    //     Rm = *Pm;
    //     Rn = *Pn;

    //     int iters = (i+1)/2;
    //     for (j = 0; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
    //       MACC2(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     if ((i & 1) == 0) {
    //       assert(Ra == Pa_base[j], "must be");
    //       MACC(Ra, Ra, t0, t1, t2);
    //     }
    //     iters = i/2;
    //     assert(iters == i-j, "must be");
    //     for (; iters--; j++) {
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }

    //     *Pm = Rm = t0 * inv;
    //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
    //     MACC(Rm, Rn, t0, t1, t2);

    //     assert(t0 == 0, "broken Montgomery multiply");

    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   for (i = len; i < 2*len; i++) {
    //     int start = i-len+1;
    //     int end = start + (len - start)/2;
    //     int j;

    //     Pa = Pa_base + i-len;
    //     Pb = Pa_base + len;
    //     Pm = Pm_base + i-len;
    //     Pn = Pn_base + len;

    //     Ra = *++Pa;
    //     Rb = *--Pb;
// Rm = *++Pm; 4872 // Rn = *--Pn; 4873 4874 // int iters = (2*len-i-1)/2; 4875 // assert(iters == end-start, "must be"); 4876 // for (j = start; iters--; j++) { 4877 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 4878 // MACC2(Ra, Rb, t0, t1, t2); 4879 // Ra = *++Pa; 4880 // Rb = *--Pb; 4881 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 4882 // MACC(Rm, Rn, t0, t1, t2); 4883 // Rm = *++Pm; 4884 // Rn = *--Pn; 4885 // } 4886 // if ((i & 1) == 0) { 4887 // assert(Ra == Pa_base[j], "must be"); 4888 // MACC(Ra, Ra, t0, t1, t2); 4889 // } 4890 // iters = (2*len-i)/2; 4891 // assert(iters == len-j, "must be"); 4892 // for (; iters--; j++) { 4893 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 4894 // MACC(Rm, Rn, t0, t1, t2); 4895 // Rm = *++Pm; 4896 // Rn = *--Pn; 4897 // } 4898 // Pm_base[i-len] = t0; 4899 // t0 = t1; t1 = t2; t2 = 0; 4900 // } 4901 4902 // while (t0) 4903 // t0 = sub(Pm_base, Pn_base, t0, len); 4904 // } 4905 }; 4906 4907 4908 // Initialization 4909 void generate_initial() { 4910 // Generate initial stubs and initializes the entry points 4911 4912 // entry points that exist in all platforms Note: This is code 4913 // that could be shared among different platforms - however the 4914 // benefit seems to be smaller than the disadvantage of having a 4915 // much more complicated generator structure. See also comment in 4916 // stubRoutines.hpp. 4917 4918 StubRoutines::_forward_exception_entry = generate_forward_exception(); 4919 4920 StubRoutines::_call_stub_entry = 4921 generate_call_stub(StubRoutines::_call_stub_return_address); 4922 4923 // is referenced by megamorphic call 4924 StubRoutines::_catch_exception_entry = generate_catch_exception(); 4925 4926 // Build this early so it's available for the interpreter. 
4927 StubRoutines::_throw_StackOverflowError_entry = 4928 generate_throw_exception("StackOverflowError throw_exception", 4929 CAST_FROM_FN_PTR(address, 4930 SharedRuntime::throw_StackOverflowError)); 4931 StubRoutines::_throw_delayed_StackOverflowError_entry = 4932 generate_throw_exception("delayed StackOverflowError throw_exception", 4933 CAST_FROM_FN_PTR(address, 4934 SharedRuntime::throw_delayed_StackOverflowError)); 4935 if (UseCRC32Intrinsics) { 4936 // set table address before stub generation which use it 4937 StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table; 4938 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32(); 4939 } 4940 4941 if (UseCRC32CIntrinsics) { 4942 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(); 4943 } 4944 } 4945 4946 void generate_all() { 4947 // support for verify_oop (must happen after universe_init) 4948 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); 4949 StubRoutines::_throw_AbstractMethodError_entry = 4950 generate_throw_exception("AbstractMethodError throw_exception", 4951 CAST_FROM_FN_PTR(address, 4952 SharedRuntime:: 4953 throw_AbstractMethodError)); 4954 4955 StubRoutines::_throw_IncompatibleClassChangeError_entry = 4956 generate_throw_exception("IncompatibleClassChangeError throw_exception", 4957 CAST_FROM_FN_PTR(address, 4958 SharedRuntime:: 4959 throw_IncompatibleClassChangeError)); 4960 4961 StubRoutines::_throw_NullPointerException_at_call_entry = 4962 generate_throw_exception("NullPointerException at call throw_exception", 4963 CAST_FROM_FN_PTR(address, 4964 SharedRuntime:: 4965 throw_NullPointerException_at_call)); 4966 4967 // arraycopy stubs used by compilers 4968 generate_arraycopy_stubs(); 4969 4970 // has negatives stub for large arrays. 
4971 StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long); 4972 4973 if (UseMultiplyToLenIntrinsic) { 4974 StubRoutines::_multiplyToLen = generate_multiplyToLen(); 4975 } 4976 4977 if (UseSquareToLenIntrinsic) { 4978 StubRoutines::_squareToLen = generate_squareToLen(); 4979 } 4980 4981 if (UseMulAddIntrinsic) { 4982 StubRoutines::_mulAdd = generate_mulAdd(); 4983 } 4984 4985 if (UseMontgomeryMultiplyIntrinsic) { 4986 StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply"); 4987 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false); 4988 StubRoutines::_montgomeryMultiply = g.generate_multiply(); 4989 } 4990 4991 if (UseMontgomerySquareIntrinsic) { 4992 StubCodeMark mark(this, "StubRoutines", "montgomerySquare"); 4993 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true); 4994 // We use generate_multiply() rather than generate_square() 4995 // because it's faster for the sizes of modulus we care about. 4996 StubRoutines::_montgomerySquare = g.generate_multiply(); 4997 } 4998 4999 #ifndef BUILTIN_SIM 5000 // generate GHASH intrinsics code 5001 if (UseGHASHIntrinsics) { 5002 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); 5003 } 5004 5005 if (UseAESIntrinsics) { 5006 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 5007 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 5008 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 5009 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 5010 } 5011 5012 if (UseSHA1Intrinsics) { 5013 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress"); 5014 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB"); 5015 } 5016 if (UseSHA256Intrinsics) { 5017 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, 
"sha256_implCompress"); 5018 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB"); 5019 } 5020 5021 // generate Adler32 intrinsics code 5022 if (UseAdler32Intrinsics) { 5023 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); 5024 } 5025 5026 // Safefetch stubs. 5027 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry, 5028 &StubRoutines::_safefetch32_fault_pc, 5029 &StubRoutines::_safefetch32_continuation_pc); 5030 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry, 5031 &StubRoutines::_safefetchN_fault_pc, 5032 &StubRoutines::_safefetchN_continuation_pc); 5033 #endif 5034 StubRoutines::aarch64::set_completed(); 5035 } 5036 5037 public: 5038 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) { 5039 if (all) { 5040 generate_all(); 5041 } else { 5042 generate_initial(); 5043 } 5044 } 5045 }; // end class declaration 5046 5047 void StubGenerator_generate(CodeBuffer* code, bool all) { 5048 StubGenerator g(code, all); 5049 }