/*
 * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif

#ifdef BUILTIN_SIM
#include "../../../../../../simulator/simulator.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->
#define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
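// n.b. TIMES_OOP builds the scaled-index addressing mode used for oop
// arrays below. A minimal sketch (illustrative registers only): with
// compressed oops element i lives at base + (i << 2), otherwise at
// base + (i << 3), so something like
//
//   __ lea(r0, Address(base, i, TIMES_OOP));
//
// would form the address of element i.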
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(int& counter) {
    __ lea(rscratch2, ExternalAddress((address)&counter));
    __ ldrw(rscratch1, Address(rscratch2));
    __ addw(rscratch1, rscratch1, 1);
    __ strw(rscratch1, Address(rscratch2));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    c_rarg6:   parameter size (in words)              int
  //    c_rarg7:   thread                                 Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the c arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-r18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -27 [ argument word 1      ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper   (r0)  ]
  //  -7 [ result         (r1)  ]
  //  -6 [ result type    (r2)  ]
  //  -5 [ method         (r3)  ]
  //  -4 [ entry point    (r4)  ]
  //  -3 [ parameters     (r5)  ]
  //  -2 [ parameter size (r6)  ]
  //  -1 [ thread         (r7)  ]
  //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  //   1 [ saved lr       (r30) ]

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off = -26,

    d15_off            = -26,
    d13_off            = -24,
    d11_off            = -22,
    d9_off             = -20,

    r28_off            = -18,
    r26_off            = -16,
    r24_off            = -14,
    r22_off            = -12,
    r20_off            = -10,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0,
    retaddr_off        =   1,
  };
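  // A worked example of the layout above (a sketch, not asserted
  // anywhere): offsets are in words relative to rfp, so
  //
  //   Address(rfp, thread_off * wordSize)  == rfp - 8    // saved Thread*
  //   Address(rfp, d15_off * wordSize)     == rfp - 208  // v15/v14 pair
  //
  // and only every other slot needs a named offset because registers
  // are saved two at a time with stp/stpd.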
  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    const Address sp_after_call(rfp, sp_after_call_off * wordSize);

    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off  * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);

    // stub code

    // we need a C prolog to bootstrap the x86 caller into the sim
    __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);

    address aarch64_entry = __ pc();

#ifdef BUILTIN_SIM
    // Save sender's SP for stack traces.
    __ mov(rscratch1, sp);
    __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
#endif
    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5, entry_point);
    __ stp(c_rarg2, c_rarg3, result_type);
    __ stp(c_rarg0, c_rarg1, call_wrapper);

    __ stp(r20, r19, r20_save);
    __ stp(r22, r21, r22_save);
    __ stp(r24, r23, r24_save);
    __ stp(r26, r25, r26_save);
    __ stp(r28, r27, r28_save);

    __ stpd(v9,  v8,  d9_save);
    __ stpd(v11, v10, d11_save);
    __ stpd(v13, v12, d13_save);
    __ stpd(v15, v14, d15_save);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (unsigned)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);
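    // n.b. a minimal sketch of the alignment math above, assuming an
    // illustrative parameter count of 3 in c_rarg6:
    //   rscratch1 = sp - 3 * 8      == sp - 24
    //   sp        = rscratch1 & -16 -- rounded down to 16 bytes
    // so sp keeps the 16-byte alignment AAPCS64 requires once the
    // parameters have been pushed.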
    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing method, and current sp
    //      rmethod: Method*
    //      r13: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r13, sp);
    __ blr(c_rarg4);

    // tell the simulator we have returned to the stub

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    if (NotifySimulator) {
      __ notify(Assembler::method_reentry);
    }
    // save current address for use by exception handling code

    return_address = __ pc();
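    // n.b. in the dispatch below T_OBJECT deliberately shares the
    // is_long path: an oop result, like a jlong, comes back in r0 and
    // needs a full 64-bit str into the result slot, so one store
    // serves both types (a reading of the code, not an extra
    // contract).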
    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    __ ldr(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ldr(j_rarg1, result_type);
    __ cmp(j_rarg1, T_OBJECT);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(j_rarg1, T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    // restore callee-save registers
    __ ldpd(v15, v14, d15_save);
    __ ldpd(v13, v12, d13_save);
    __ ldpd(v11, v10, d11_save);
    __ ldpd(v9,  v8,  d9_save);

    __ ldp(r28, r27, r28_save);
    __ ldp(r26, r25, r26_save);
    __ ldp(r24, r23, r24_save);
    __ ldp(r22, r21, r22_save);
    __ ldp(r20, r19, r20_save);

    __ ldp(c_rarg0, c_rarg1, call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3,  method);
    __ ldp(c_rarg4, c_rarg5, entry_point);
    __ ldp(c_rarg6, c_rarg7, parameter_size);

#ifndef PRODUCT
    // tell the simulator we are about to end Java execution
    if (NotifySimulator) {
      __ notify(Assembler::method_exit);
    }
#endif
    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT

    __ BIND(is_long);
    __ str(r0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    return start;
  }
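  // n.b. a hedged sketch of how the stub above is reached (the
  // authoritative signature lives in stubRoutines.hpp): the VM invokes
  // it through a function pointer, roughly
  //
  //   StubRoutines::call_stub()(call_wrapper, result, result_type,
  //                             method, entry_point, parameters,
  //                             parameter_size, thread);
  //
  // so the eight arguments arrive in c_rarg0..c_rarg7 exactly as
  // documented at the head of generate_call_stub.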
  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to set up the
  // sp.
  //
  // r0: exception oop

  // NOTE: this is used as a target from the signal handler so it
  // needs an x86 prolog which returns into the current simulator
  // executing the generated catch_exception code. so the prolog
  // needs to install rax in a sim register and adjust the sim's
  // restart pc to enter the generated code at the start position
  // then return from native to simulated execution.

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread       (rfp, thread_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then calls into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);
    __ mov(r19, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {

    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is NULL it is OK

    // Check if the oop is in the right area of memory
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
    __ andr(c_rarg2, r0, c_rarg3);
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());

    // Compare c_rarg2 and c_rarg3.  We don't use a compare
    // instruction here because the flags register is live.
    __ eor(c_rarg2, c_rarg2, c_rarg3);
    __ cbnz(c_rarg2, error);
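    // A worked example of the check above, with illustrative values
    // only: if Universe::verify_oop_mask() were 0x7 and
    // Universe::verify_oop_bits() were 0x0, any putative oop with one
    // of its low three bits set would leave (r0 & mask) ^ bits
    // non-zero and fall into the error path.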
    // make sure klass is 'reasonable', i.e. not zero.
    __ load_klass(r0, r0); // get klass
    __ cbz(r0, error);     // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blrt(rscratch1, 3, 0, 1);

    return start;
  }

  void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }

  // Generate code for an array write pre barrier
  //
  //     addr  - starting address
  //     count - element count
  //     tmp   - scratch register
  //
  //     Destroy no registers except rscratch1 and rscratch2
  //
  void gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
    case BarrierSet::G1SATBCTLogging:
      // With G1, don't generate the call if we statically know that the target is uninitialized
      if (!dest_uninitialized) {
        __ push_call_clobbered_registers();
        if (count == c_rarg0) {
          if (addr == c_rarg1) {
            // exactly backwards!!
            __ mov(rscratch1, c_rarg0);
            __ mov(c_rarg0, c_rarg1);
            __ mov(c_rarg1, rscratch1);
          } else {
            __ mov(c_rarg1, count);
            __ mov(c_rarg0, addr);
          }
        } else {
          __ mov(c_rarg0, addr);
          __ mov(c_rarg1, count);
        }
        __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2);
        __ pop_call_clobbered_registers();
      }
      break;
    case BarrierSet::CardTableForRS:
    case BarrierSet::CardTableExtension:
    case BarrierSet::ModRef:
      break;
    default:
      ShouldNotReachHere();
    }
  }

  //
  // Generate code for an array write post barrier
  //
  //  Input:
  //     start   - register containing starting address of destination array
  //     end     - register containing ending address of destination array
  //     scratch - scratch register
  //
  //  The input registers are overwritten.
  //  The ending address is inclusive.
  void gen_write_ref_array_post_barrier(Register start, Register end, Register scratch) {
    assert_different_registers(start, end, scratch);
    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
    case BarrierSet::G1SATBCTLogging:
      {
        __ push_call_clobbered_registers();
        // must compute element count unless barrier set interface is changed (other platforms supply count)
        assert_different_registers(start, end, scratch);
        __ lea(scratch, Address(end, BytesPerHeapOop));
        __ sub(scratch, scratch, start);               // subtract start to get #bytes
        __ lsr(scratch, scratch, LogBytesPerHeapOop);  // convert to element count
        __ mov(c_rarg0, start);
        __ mov(c_rarg1, scratch);
        __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
        __ pop_call_clobbered_registers();
      }
      break;
    case BarrierSet::CardTableForRS:
    case BarrierSet::CardTableExtension:
      {
        CardTableModRefBS* ct = (CardTableModRefBS*)bs;
        assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");

        Label L_loop;

        __ lsr(start, start, CardTableModRefBS::card_shift);
        __ lsr(end, end, CardTableModRefBS::card_shift);
        __ sub(end, end, start); // number of bytes to copy

        const Register count = end; // 'end' register contains bytes count now
        __ load_byte_map_base(scratch);
        __ add(start, start, scratch);
        if (UseConcMarkSweepGC) {
          __ membar(__ StoreStore);
        }
        __ BIND(L_loop);
        __ strb(zr, Address(start, count));
        __ subs(count, count, 1);
        __ br(Assembler::GE, L_loop);
      }
      break;
    default:
      ShouldNotReachHere();
    }
  }
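  // A minimal sketch of the card table arithmetic above, assuming the
  // usual 512-byte cards (card_shift == 9): a copy covering
  // [0x1000, 0x13ff] gives start >> 9 == 0x8 and end >> 9 == 0x9, so
  // the loop dirties exactly two cards; keeping 'end' inclusive is
  // what stops a copy ending just short of a card boundary from
  // touching one card too many.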
  address generate_zero_longs(Register base, Register cnt) {
    Register tmp = rscratch1;
    Register tmp2 = rscratch2;
    int zva_length = VM_Version::zva_length();
    Label initial_table_end, loop_zva;
    Label fini;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "zero_longs");
    address start = __ pc();

    // Base must be 16 byte aligned. If not just return and let caller handle it
    __ tst(base, 0x0f);
    __ br(Assembler::NE, fini);
    // Align base with ZVA length.
    __ neg(tmp, base);
    __ andr(tmp, tmp, zva_length - 1);

    // tmp: the number of bytes to be filled to align the base with ZVA length.
    __ add(base, base, tmp);
    __ sub(cnt, cnt, tmp, Assembler::ASR, 3);
    __ adr(tmp2, initial_table_end);
    __ sub(tmp2, tmp2, tmp, Assembler::LSR, 2);
    __ br(tmp2);

    for (int i = -zva_length + 16; i < 0; i += 16)
      __ stp(zr, zr, Address(base, i));
    __ bind(initial_table_end);

    __ sub(cnt, cnt, zva_length >> 3);
    __ bind(loop_zva);
    __ dc(Assembler::ZVA, base);
    __ subs(cnt, cnt, zva_length >> 3);
    __ add(base, base, zva_length);
    __ br(Assembler::GE, loop_zva);
    __ add(cnt, cnt, zva_length >> 3); // count not zeroed by DC ZVA
    __ bind(fini);
    __ ret(lr);

    return start;
  }
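  // n.b. a sketch of the computed entry above: each stp in the table
  // zeroes 16 bytes and occupies 4 bytes of code, so pre-filling 'tmp'
  // bytes means entering the table tmp/16 instructions -- i.e.
  // (tmp >> 2) bytes -- before initial_table_end, which is exactly
  // what the sub(tmp2, tmp2, tmp, LSR, 2) computes.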
  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(Label &start, Register s, Register d, Register count,
                           copy_direction direction) {
    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;

    int offset;
    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r10, t6 = r11, t7 = r12;
    const Register stride = r13;

    assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1);

    Label again, drain;
    const char *stub_name;
    if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
    else
      stub_name = "backward_copy_longs";
    StubCodeMark mark(this, "StubRoutines", stub_name);
    __ align(CodeEntryAlignment);
    __ bind(start);
    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, 8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }
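    // The loop below is software-pipelined: each iteration stores the
    // eight words loaded by the previous iteration while loading the
    // next eight, so loads and stores of successive blocks overlap in
    // flight (the drain block after the loop writes out the final
    // in-flight values).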
    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
        __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
      } else {
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        __ stp(t0, t1, Address(d, 2 * unit));
        __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L2);
    }

    __ ret(lr);
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;
    int unit = wordSize * direction;

    Label Lpair, Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.
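    // A worked example of the bit tests below, for a byte copy
    // (granularity == 1): with count == 13 == 0b1101 we copy 8 bytes
    // (bit 3 set), then 4 bytes (bit 2 set), skip the 2-byte move
    // (bit 1 clear) and finish with a single byte (bit 0 set).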
    __ tbz(count, 3 - exact_log2(granularity), Lword);
    __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
    __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }

  Label copy_f, copy_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //

  void copy_memory(bool is_aligned, Register s, Register d,
                   Register count, Register tmp, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    // <= 96 bytes do inline. Direction doesn't matter because we always
    // load all the data before writing anything
    Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
    const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
    const Register send = r17, dend = r18;

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
    __ cmp(count, (UseSIMDForMemoryOps ? 96:80)/granularity);
    __ br(Assembler::HI, copy_big);
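    // For the small cases below the code reads both the head and the
    // tail of the region before storing anything; e.g. the 17..32 byte
    // case loads the first and last 16 bytes, which may overlap in the
    // middle, so every size class needs only a fixed number of wide
    // loads and stores and no byte loop.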
    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, 16/granularity);
    __ br(Assembler::LS, copy16);

    __ cmp(count, 64/granularity);
    __ br(Assembler::HI, copy80);

    __ cmp(count, 32/granularity);
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 0));
      __ ldpq(v2, v3, Address(send, -32));
      __ stpq(v0, v1, Address(d, 0));
      __ stpq(v2, v3, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(send, -32));
      __ ldp(t6, t7, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(dend, -32));
      __ stp(t6, t7, Address(dend, -16));
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    __ ldp(t0, t1, Address(s, 0));
    __ ldp(t2, t3, Address(send, -16));
    __ stp(t0, t1, Address(d, 0));
    __ stp(t2, t3, Address(dend, -16));
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 0));
      __ ldpq(v2, v3, Address(s, 32));
      __ ldpq(v4, v5, Address(send, -32));
      __ stpq(v0, v1, Address(d, 0));
      __ stpq(v2, v3, Address(d, 32));
      __ stpq(v4, v5, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(s, 32));
      __ ldp(t6, t7, Address(s, 48));
      __ ldp(t8, t9, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(d, 32));
      __ stp(t6, t7, Address(d, 48));
      __ stp(t8, t9, Address(dend, -16));
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, 8/granularity);
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    __ ldr(t0, Address(s, 0));
    __ ldr(t1, Address(send, -8));
    __ str(t0, Address(d, 0));
    __ str(t1, Address(dend, -8));
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean in the 1 byte case we load/store the same
          // byte 3 times.
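          // Illustrative cases of the trick below (count in bytes):
          //   count == 1: s[0]->d[0] three times (all three loads alias)
          //   count == 2: s[0]->d[0], s[1]->d[1], middle s[1]->d[1]
          //   count == 3: s[0]->d[0], s[2]->d[2], middle s[1]->d[1]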
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }

    __ bind(copy_big);
    if (is_backwards) {
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now we've got the small case out of the way we can align the
    // source address on a 2-word boundary.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
      __ tbz(s, exact_log2(wordSize), aligned);
      __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
      __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
      __ sub(count, count, wordSize/granularity);
    } else {
      if (is_backwards) {
        __ andr(rscratch2, s, 2 * wordSize - 1);
      } else {
        __ neg(rscratch2, s);
        __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
      }
      // rscratch2 is the byte adjustment needed to align s.
      __ cbz(rscratch2, aligned);
      int shift = exact_log2(granularity);
      if (shift)  __ lsr(rscratch2, rscratch2, shift);
      __ sub(count, count, rscratch2);

#if 0
      // ?? This code is only correct for a disjoint copy.  It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, rscratch2);
        __ sub(d, d, rscratch2);
      } else {
        __ add(s, s, rscratch2);
        __ add(d, d, rscratch2);
      }
#else
      copy_memory_small(s, d, rscratch2, rscratch1, step);
#endif
    }

    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes.  Adjust the
    // count and do a bulk copy of words.
    __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
    if (direction == copy_forwards)
      __ bl(copy_f);
    else
      __ bl(copy_b);

    // And the tail.
    copy_memory_small(s, d, count, tmp, step);

    if (granularity >= 8) __ bind(copy8);
    if (granularity >= 4) __ bind(copy4);
    __ bind(finish);
  }


  void clobber_registers() {
#ifdef ASSERT
    __ mov(rscratch1, (uint64_t)0xdeadbeef);
    __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
    for (Register r = r3; r <= r18; r++)
      if (r != rscratch1) __ mov(r, rscratch1);
#endif
  }
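  // n.b. the mov/orr pair above splices 0xdeadbeef into both halves of
  // a 64-bit pattern, 0xdeadbeefdeadbeef, and sprays it over the
  // temporary registers so that any stale value picked up by broken
  // stub code is easy to spot in a debugger.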
  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers rscratch1 and rscratch2.
  void verify_oop_array (size_t size, Register a, Register count, Register temp) {
    Label loop, end;
    __ mov(rscratch1, a);
    __ mov(rscratch2, zr);
    __ bind(loop);
    __ cmp(rscratch2, count);
    __ br(Assembler::HS, end);
    if (size == (size_t)wordSize) {
      __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ verify_oop(temp);
    } else {
      __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ decode_heap_oop(temp); // calls verify_oop
    }
    __ add(rscratch2, rscratch2, size);
    __ b(loop);
    __ bind(end);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
                                 const char *name, bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    if (is_oop) {
      __ push(RegSet::of(d, count), sp);
      // no registers are destroyed by this call
      gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
    }
    copy_memory(aligned, s, d, count, rscratch1, size);
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
      gen_write_ref_array_post_barrier(d, count, rscratch1);
    }
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
                                 address *entry, const char *name,
                                 bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;

    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // use fwd copy when (d-s) above_equal (count*size)
    __ sub(rscratch1, d, s);
    __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
    __ br(Assembler::HS, nooverlap_target);
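    // A sketch of the test above with byte elements: copying 16 bytes
    // from s == 0x1000 to d == 0x1008 gives d - s == 8, which is below
    // 16, so we fall through to the (backwards) conjoint copy; with
    // d == 0x0ff8 the unsigned difference wraps to a huge value,
    // compares above_equal, and the forward no-overlap entry is safely
    // taken even though the regions overlap.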
    if (is_oop) {
      __ push(RegSet::of(d, count), sp);
      // no registers are destroyed by this call
      gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
    }
    copy_memory(aligned, s, d, count, rscratch1, -size);
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
      gen_write_ref_array_post_barrier(d, count, rscratch1);
    }
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
  //
  address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
  }
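  // n.b. a hedged sketch of how these wrappers get wired up (the
  // authoritative code is in the stub generation later in this file):
  // roughly
  //
  //   StubRoutines::_jbyte_disjoint_arraycopy
  //     = generate_disjoint_byte_copy(false, &entry, "jbyte_disjoint_arraycopy");
  //
  // with 'entry' then handed to the matching conjoint stub as its
  // no-overlap target.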
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                      address* entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_short_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_short_copy().
  //
  address generate_disjoint_short_copy(bool aligned,
                                       address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
                                       address *entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_int_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
                                     address *entry, const char *name,
                                     bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_long_copy(bool aligned, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_long_copy(bool aligned,
                                      address nooverlap_target, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_oop_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_oop_copy(bool aligned,
                                     address nooverlap_target, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
                                  name, dest_uninitialized);
  }


  // Helper for generating a dynamic type check.
  // Smashes rscratch1.
  void generate_type_check(Register sub_klass,
                           Register super_check_offset,
                           Register super_klass,
                           Label& L_success) {
    assert_different_registers(sub_klass, super_check_offset, super_klass);

    BLOCK_COMMENT("type_check:");

    Label L_miss;

    __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL,
                                     super_check_offset);
    __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);

    // Fall through on failure!
    __ BIND(L_miss);
  }
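  // n.b. a sketch of the protocol above: the fast path compares the
  // word at sub_klass + super_check_offset against super_klass -- a
  // cache hit means success -- and only on a miss does the slow path
  // scan the secondary supers array; failure falls through so the
  // caller decides what a miss means.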
  //
  //  Generate checkcasting array copy stub
  //
  //  Input:
  //    c_rarg0   - source array address
  //    c_rarg1   - destination array address
  //    c_rarg2   - element count, treated as ssize_t, can be zero
  //    c_rarg3   - size_t ckoff (super_check_offset)
  //    c_rarg4   - oop ckval (super_klass)
  //
  //  Output:
  //    r0 ==  0  -  success
  //    r0 == -1^K - failure, where K is partial transfer count
  //
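  // A worked example of the failure encoding (illustrative numbers):
  // if 3 of 10 elements are copied before a type check fails, the stub
  // returns -1 ^ 3 == ~3 == -4 and the caller recovers the partial
  // count as ~r0 == 3; a fully successful copy returns 0.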
  address generate_checkcast_copy(const char *name, address *entry,
                                  bool dest_uninitialized = false) {

    Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;

    // Input registers (after setup_arg_regs)
    const Register from        = c_rarg0;   // source array address
    const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // elements count
    const Register ckoff       = c_rarg3;   // super_check_offset
    const Register ckval       = c_rarg4;   // super_klass

    // Registers used as temps (r18, r19, r20 are save-on-entry)
    const Register count_save  = r21;       // orig elements count
    const Register start_to    = r20;       // destination array start address
    const Register copied_oop  = r18;       // actual oop copied
    const Register r19_klass   = r19;       // oop._klass

    //---------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the two arrays are subtypes of Object[] but the
    // destination array type is not equal to or a supertype
    // of the source type.  Each element must be separately
    // checked.

    assert_different_registers(from, to, count, ckoff, ckval, start_to,
                               copied_oop, r19_klass, count_save);

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef ASSERT
    // caller guarantees that the arrays really are different
    // otherwise, we would have to make conjoint checks
    { Label L;
      array_overlap_test(L, TIMES_OOP);
      __ stop("checkcast_copy within a single array");
      __ bind(L);
    }
#endif //ASSERT

    // Caller of this entry point must set up the argument registers.
    if (entry != NULL) {
      *entry = __ pc();
      BLOCK_COMMENT("Entry:");
    }

    // Empty array:  Nothing to do.
    __ cbz(count, L_done);

    __ push(RegSet::of(r18, r19, r20, r21), sp);

#ifdef ASSERT
    BLOCK_COMMENT("assert consistent ckoff/ckval");
    // The ckoff and ckval must be mutually consistent,
    // even though caller generates both.
    { Label L;
      int sco_offset = in_bytes(Klass::super_check_offset_offset());
      __ ldrw(start_to, Address(ckval, sco_offset));
      __ cmpw(ckoff, start_to);
      __ br(Assembler::EQ, L);
      __ stop("super_check_offset inconsistent");
      __ bind(L);
    }
#endif //ASSERT

    gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);

    // save the original count
    __ mov(count_save, count);

    // Copy from low to high addresses
    __ mov(start_to, to);              // Save destination array start address
    __ b(L_load_element);

    // ======== begin loop ========
    // (Loop is rotated; its entry is L_load_element.)
    // Loop control:
    //   for (; count != 0; count--) {
    //     copied_oop = load_heap_oop(from++);
    //     ... generate_type_check ...;
    //     store_heap_oop(to++, copied_oop);
    //   }
    __ align(OptoLoopAlignment);

    __ BIND(L_store_element);
    __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop);  // store the oop
    __ sub(count, count, 1);
    __ cbz(count, L_do_card_marks);

    // ======== loop entry is here ========
    __ BIND(L_load_element);
    __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop
    __ cbz(copied_oop, L_store_element);

    __ load_klass(r19_klass, copied_oop);// query the object klass
    generate_type_check(r19_klass, ckoff, ckval, L_store_element);
    // ======== end loop ========

    // It was a real error; we must depend on the caller to finish the job.
    // Register count = remaining oops, count_orig = total oops.
    // Emit GC store barriers for the oops we have copied and report
    // their number to the caller.

    __ subs(count, count_save, count); // K = partially copied oop count
    __ eon(count, count, zr);          // report (-1^K) to caller
    __ br(Assembler::EQ, L_done_pop);

    __ BIND(L_do_card_marks);
    __ add(to, to, -heapOopSize);      // make an inclusive end pointer
    gen_write_ref_array_post_barrier(start_to, to, rscratch1);

    __ bind(L_done_pop);
    __ pop(RegSet::of(r18, r19, r20, r21), sp);
    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);

    __ bind(L_done);
    __ mov(r0, count);
    __ leave();
    __ ret(lr);

    return start;
  }

  // Perform range checks on the proposed arraycopy.
  // Kills temp, but nothing else.
  // Also, clean the sign bits of src_pos and dst_pos.
  void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
                              Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
                              Register dst_pos, // destination position (c_rarg3)
                              Register length,
                              Register temp,
                              Label& L_failed) {
    BLOCK_COMMENT("arraycopy_range_checks:");

    assert_different_registers(rscratch1, temp);

    //  if (src_pos + length > arrayOop(src)->length())  FAIL;
    __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, src_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
    __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, dst_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
    __ movw(src_pos, src_pos);
    __ movw(dst_pos, dst_pos);

    BLOCK_COMMENT("arraycopy_range_checks done");
  }
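  // A sketch of the checks above with illustrative numbers: for
  // src.length == 10, src_pos == 7 and length == 5 the addw yields 12,
  // which is unsigned-higher than 10, so we branch to L_failed; the
  // trailing movw pair zero-extends src_pos/dst_pos to 64 bits so that
  // later address arithmetic cannot pick up stale high bits.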
1727 __ movw(src_pos, src_pos); 1728 __ movw(dst_pos, dst_pos); 1729 1730 BLOCK_COMMENT("arraycopy_range_checks done"); 1731 } 1732 1733 // These stubs get called from some dumb test routine. 1734 // I'll write them properly when they're called from 1735 // something that's actually doing something. 1736 static void fake_arraycopy_stub(address src, address dst, int count) { 1737 assert(count == 0, "huh?"); 1738 } 1739 1740 1741 // 1742 // Generate 'unsafe' array copy stub 1743 // Though just as safe as the other stubs, it takes an unscaled 1744 // size_t argument instead of an element count. 1745 // 1746 // Input: 1747 // c_rarg0 - source array address 1748 // c_rarg1 - destination array address 1749 // c_rarg2 - byte count, treated as ssize_t, can be zero 1750 // 1751 // Examines the alignment of the operands and dispatches 1752 // to a long, int, short, or byte copy loop. 1753 // 1754 address generate_unsafe_copy(const char *name, 1755 address byte_copy_entry, 1756 address short_copy_entry, 1757 address int_copy_entry, 1758 address long_copy_entry) { 1759 Label L_long_aligned, L_int_aligned, L_short_aligned; 1760 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1761 1762 __ align(CodeEntryAlignment); 1763 StubCodeMark mark(this, "StubRoutines", name); 1764 address start = __ pc(); 1765 __ enter(); // required for proper stackwalking of RuntimeStub frame 1766 1767 // bump this on entry, not on exit: 1768 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 1769 1770 __ orr(rscratch1, s, d); 1771 __ orr(rscratch1, rscratch1, count); 1772 1773 __ andr(rscratch1, rscratch1, BytesPerLong-1); 1774 __ cbz(rscratch1, L_long_aligned); 1775 __ andr(rscratch1, rscratch1, BytesPerInt-1); 1776 __ cbz(rscratch1, L_int_aligned); 1777 __ tbz(rscratch1, 0, L_short_aligned); 1778 __ b(RuntimeAddress(byte_copy_entry)); 1779 1780 __ BIND(L_short_aligned); 1781 __ lsr(count, count, LogBytesPerShort); // size => short_count 1782 __ b(RuntimeAddress(short_copy_entry)); 1783 __ BIND(L_int_aligned); 1784 __ lsr(count, count, LogBytesPerInt); // size => int_count 1785 __ b(RuntimeAddress(int_copy_entry)); 1786 __ BIND(L_long_aligned); 1787 __ lsr(count, count, LogBytesPerLong); // size => long_count 1788 __ b(RuntimeAddress(long_copy_entry)); 1789 1790 return start; 1791 } 1792 1793 // 1794 // Generate generic array copy stubs 1795 // 1796 // Input: 1797 // c_rarg0 - src oop 1798 // c_rarg1 - src_pos (32-bits) 1799 // c_rarg2 - dst oop 1800 // c_rarg3 - dst_pos (32-bits) 1801 // c_rarg4 - element count (32-bits) 1802 // 1803 // Output: 1804 // r0 == 0 - success 1805 // r0 == -1^K - failure, where K is partial transfer count 1806 // 1807 address generate_generic_copy(const char *name, 1808 address byte_copy_entry, address short_copy_entry, 1809 address int_copy_entry, address oop_copy_entry, 1810 address long_copy_entry, address checkcast_copy_entry) { 1811 1812 Label L_failed, L_failed_0, L_objArray; 1813 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 1814 1815 // Input registers 1816 const Register src = c_rarg0; // source array oop 1817 const Register src_pos = c_rarg1; // source position 1818 const Register dst = c_rarg2; // destination array oop 1819 const Register dst_pos = c_rarg3; // destination position 1820 const Register length = c_rarg4; 1821 1822 StubCodeMark mark(this, "StubRoutines", name); 1823 1824 __ align(CodeEntryAlignment); 1825 address start = __ pc(); 1826 1827 __ enter(); // required for proper stackwalking of RuntimeStub frame 1828 1829 // bump this on entry, not on 
exit: 1830 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 1831 1832 //----------------------------------------------------------------------- 1833 // Assembler stub will be used for this call to arraycopy 1834 // if the following conditions are met: 1835 // 1836 // (1) src and dst must not be null. 1837 // (2) src_pos must not be negative. 1838 // (3) dst_pos must not be negative. 1839 // (4) length must not be negative. 1840 // (5) src klass and dst klass should be the same and not NULL. 1841 // (6) src and dst should be arrays. 1842 // (7) src_pos + length must not exceed length of src. 1843 // (8) dst_pos + length must not exceed length of dst. 1844 // 1845 1846 // if (src == NULL) return -1; 1847 __ cbz(src, L_failed); 1848 1849 // if (src_pos < 0) return -1; 1850 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 1851 1852 // if (dst == NULL) return -1; 1853 __ cbz(dst, L_failed); 1854 1855 // if (dst_pos < 0) return -1; 1856 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 1857 1858 // registers used as temp 1859 const Register scratch_length = r16; // elements count to copy 1860 const Register scratch_src_klass = r17; // array klass 1861 const Register lh = r18; // layout helper 1862 1863 // if (length < 0) return -1; 1864 __ movw(scratch_length, length); // length (elements count, 32-bits value) 1865 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 1866 1867 __ load_klass(scratch_src_klass, src); 1868 #ifdef ASSERT 1869 // assert(src->klass() != NULL); 1870 { 1871 BLOCK_COMMENT("assert klasses not null {"); 1872 Label L1, L2; 1873 __ cbnz(scratch_src_klass, L2); // it is broken if klass is NULL 1874 __ bind(L1); 1875 __ stop("broken null klass"); 1876 __ bind(L2); 1877 __ load_klass(rscratch1, dst); 1878 __ cbz(rscratch1, L1); // this would be broken also 1879 BLOCK_COMMENT("} assert klasses not null done"); 1880 } 1881 #endif 1882 1883 // Load layout helper (32-bits) 1884 // 1885 // |array_tag| | header_size | element_type | |log2_element_size| 1886 // 32 30 24 16 8 2 0 1887 // 1888 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 1889 // 1890 1891 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 1892 1893 // Handle objArrays completely differently... 1894 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 1895 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 1896 __ movw(rscratch1, objArray_lh); 1897 __ eorw(rscratch2, lh, rscratch1); 1898 __ cbzw(rscratch2, L_objArray); 1899 1900 // if (src->klass() != dst->klass()) return -1; 1901 __ load_klass(rscratch2, dst); 1902 __ eor(rscratch2, rscratch2, scratch_src_klass); 1903 __ cbnz(rscratch2, L_failed); 1904 1905 // if (!src->is_Array()) return -1; 1906 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 1907 1908 // At this point, it is known to be a typeArray (array_tag 0x3). 
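// In C++ terms, the address computation performed below is roughly
// (a sketch built from the layout-helper fields diagrammed above):
//   int hsize = (lh >> Klass::_lh_header_size_shift) & Klass::_lh_header_size_mask;
//   int l2es  =  lh & Klass::_lh_log2_element_size_mask;   // log2(element size)
//   from = src + hsize + (src_pos << l2es);
//   to   = dst + hsize + (dst_pos << l2es);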
1909 #ifdef ASSERT
1910   {
1911     BLOCK_COMMENT("assert primitive array {");
1912     Label L;
1913     __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
1914     __ cmpw(lh, rscratch2);
1915     __ br(Assembler::GE, L);
1916     __ stop("must be a primitive array");
1917     __ bind(L);
1918     BLOCK_COMMENT("} assert primitive array done");
1919   }
1920 #endif
1921
1922   arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
1923                          rscratch2, L_failed);
1924
1925   // TypeArrayKlass
1926   //
1927   // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
1928   // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
1929   //
1930
1931   const Register rscratch1_offset = rscratch1;    // array offset
1932   const Register r18_elsize = lh;                 // element size
1933
1934   __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
1935           exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
1936   __ add(src, src, rscratch1_offset);           // src array offset
1937   __ add(dst, dst, rscratch1_offset);           // dst array offset
1938   BLOCK_COMMENT("choose copy loop based on element size");
1939
1940   // The following registers must be set before jumping to the corresponding copy stub.
1941   const Register from  = c_rarg0;  // source array address
1942   const Register to    = c_rarg1;  // destination array address
1943   const Register count = c_rarg2;  // elements count
1944
1945   // 'from', 'to' and 'count' must be assigned in exactly this order,
1946   // because they alias the incoming 'src', 'src_pos' and 'dst'.
1947
1948   assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");
1949
1950   // The possible values of elsize are 0-3, i.e. exact_log2(element
1951   // size in bytes).  We do a simple bitwise binary search.
1952   __ BIND(L_copy_bytes);
1953   __ tbnz(r18_elsize, 1, L_copy_ints);
1954   __ tbnz(r18_elsize, 0, L_copy_shorts);
1955   __ lea(from, Address(src, src_pos));          // src_addr
1956   __ lea(to,   Address(dst, dst_pos));          // dst_addr
1957   __ movw(count, scratch_length);               // length
1958   __ b(RuntimeAddress(byte_copy_entry));
1959
1960   __ BIND(L_copy_shorts);
1961   __ lea(from, Address(src, src_pos, Address::lsl(1)));  // src_addr
1962   __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));  // dst_addr
1963   __ movw(count, scratch_length);               // length
1964   __ b(RuntimeAddress(short_copy_entry));
1965
1966   __ BIND(L_copy_ints);
1967   __ tbnz(r18_elsize, 0, L_copy_longs);
1968   __ lea(from, Address(src, src_pos, Address::lsl(2)));  // src_addr
1969   __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));  // dst_addr
1970   __ movw(count, scratch_length);               // length
1971   __ b(RuntimeAddress(int_copy_entry));
1972
1973   __ BIND(L_copy_longs);
1974 #ifdef ASSERT
1975   {
1976     BLOCK_COMMENT("assert long copy {");
1977     Label L;
1978     __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
1979     __ cmpw(r18_elsize, LogBytesPerLong);
1980     __ br(Assembler::EQ, L);
1981     __ stop("must be long copy, but elsize is wrong");
1982     __ bind(L);
1983     BLOCK_COMMENT("} assert long copy done");
1984   }
1985 #endif
1986   __ lea(from, Address(src, src_pos, Address::lsl(3)));  // src_addr
1987   __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));  // dst_addr
1988   __ movw(count, scratch_length);               // length
1989   __ b(RuntimeAddress(long_copy_entry));
1990
1991   // ObjArrayKlass
1992   __ BIND(L_objArray);
1993   // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]
1994
1995   Label L_plain_copy, L_checkcast_copy;
1996   // test array classes for subtyping
1997   __ load_klass(r18, dst);
1998   __ cmp(scratch_src_klass, r18); // usual case is exact
equality 1999 __ br(Assembler::NE, L_checkcast_copy); 2000 2001 // Identically typed arrays can be copied without element-wise checks. 2002 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2003 rscratch2, L_failed); 2004 2005 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2006 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2007 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2008 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2009 __ movw(count, scratch_length); // length 2010 __ BIND(L_plain_copy); 2011 __ b(RuntimeAddress(oop_copy_entry)); 2012 2013 __ BIND(L_checkcast_copy); 2014 // live at this point: scratch_src_klass, scratch_length, r18 (dst_klass) 2015 { 2016 // Before looking at dst.length, make sure dst is also an objArray. 2017 __ ldrw(rscratch1, Address(r18, lh_offset)); 2018 __ movw(rscratch2, objArray_lh); 2019 __ eorw(rscratch1, rscratch1, rscratch2); 2020 __ cbnzw(rscratch1, L_failed); 2021 2022 // It is safe to examine both src.length and dst.length. 2023 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2024 r18, L_failed); 2025 2026 const Register rscratch2_dst_klass = rscratch2; 2027 __ load_klass(rscratch2_dst_klass, dst); // reload 2028 2029 // Marshal the base address arguments now, freeing registers. 2030 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2031 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2032 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2033 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2034 __ movw(count, length); // length (reloaded) 2035 Register sco_temp = c_rarg3; // this register is free now 2036 assert_different_registers(from, to, count, sco_temp, 2037 rscratch2_dst_klass, scratch_src_klass); 2038 // assert_clean_int(count, sco_temp); 2039 2040 // Generate the type check. 2041 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2042 __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset)); 2043 // assert_clean_int(sco_temp, r18); 2044 generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy); 2045 2046 // Fetch destination element klass from the ObjArrayKlass header. 2047 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2048 __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset)); 2049 __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset)); 2050 2051 // the checkcast_copy loop needs two extra arguments: 2052 assert(c_rarg3 == sco_temp, "#3 already in place"); 2053 // Set up arguments for checkcast_copy_entry. 2054 __ mov(c_rarg4, rscratch2_dst_klass); // dst.klass.element_klass 2055 __ b(RuntimeAddress(checkcast_copy_entry)); 2056 } 2057 2058 __ BIND(L_failed); 2059 __ mov(r0, -1); 2060 __ leave(); // required for proper stackwalking of RuntimeStub frame 2061 __ ret(lr); 2062 2063 return start; 2064 } 2065 2066 // 2067 // Generate stub for array fill. If "aligned" is true, the 2068 // "to" address is assumed to be heapword aligned. 
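// (When "aligned" is false, the stub first aligns "to" to an 8-byte
// boundary by storing individual leading elements.)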
2069 //
2070 //  Arguments for generated stub:
2071 //    to:    c_rarg0
2072 //    value: c_rarg1
2073 //    count: c_rarg2 treated as signed
2074 //
2075 address generate_fill(BasicType t, bool aligned, const char *name) {
2076   __ align(CodeEntryAlignment);
2077   StubCodeMark mark(this, "StubRoutines", name);
2078   address start = __ pc();
2079
2080   BLOCK_COMMENT("Entry:");
2081
2082   const Register to        = c_rarg0;  // destination array address
2083   const Register value     = c_rarg1;  // fill value
2084   const Register count     = c_rarg2;  // elements count
2085
2086   const Register bz_base   = r10;      // base for block_zero routine
2087   const Register cnt_words = r11;      // temp register
2088
2089   __ enter();
2090
2091   Label L_fill_elements, L_exit1;
2092
2093   int shift = -1;
2094   switch (t) {
2095     case T_BYTE:
2096       shift = 0;
2097       __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2098       __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2099       __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2100       __ br(Assembler::LO, L_fill_elements);
2101       break;
2102     case T_SHORT:
2103       shift = 1;
2104       __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2105       __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2106       __ br(Assembler::LO, L_fill_elements);
2107       break;
2108     case T_INT:
2109       shift = 2;
2110       __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2111       __ br(Assembler::LO, L_fill_elements);
2112       break;
2113     default: ShouldNotReachHere();
2114   }
2115
2116   // Align destination address at an 8-byte boundary.
2117   Label L_skip_align1, L_skip_align2, L_skip_align4;
2118   if (!aligned) {
2119     switch (t) {
2120       case T_BYTE:
2121         // One byte misalignment happens only for byte arrays.
2122         __ tbz(to, 0, L_skip_align1);
2123         __ strb(value, Address(__ post(to, 1)));
2124         __ subw(count, count, 1);
2125         __ bind(L_skip_align1);
2126         // Fallthrough
2127       case T_SHORT:
2128         // Two bytes misalignment happens only for byte and short (char) arrays.
2129         __ tbz(to, 1, L_skip_align2);
2130         __ strh(value, Address(__ post(to, 2)));
2131         __ subw(count, count, 2 >> shift);
2132         __ bind(L_skip_align2);
2133         // Fallthrough
2134       case T_INT:
2135         // Align to 8 bytes; we know we are 4-byte aligned to start.
2136         __ tbz(to, 2, L_skip_align4);
2137         __ strw(value, Address(__ post(to, 4)));
2138         __ subw(count, count, 4 >> shift);
2139         __ bind(L_skip_align4);
2140         break;
2141       default: ShouldNotReachHere();
2142     }
2143   }
2144
2145   //
2146   //  Fill large chunks
2147   //
2148   __ lsrw(cnt_words, count, 3 - shift); // number of words
2149   __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
2150   __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2151   if (UseBlockZeroing) {
2152     Label non_block_zeroing, rest;
2153     // count >= BlockZeroingLowLimit && value == 0
2154     __ cmp(cnt_words, BlockZeroingLowLimit >> 3);
2155     __ ccmp(value, 0 /* comparing value */, 0 /* NZCV */, Assembler::GE);
2156     __ br(Assembler::NE, non_block_zeroing);
2157     __ mov(bz_base, to);
2158     __ block_zero(bz_base, cnt_words, true);
2159     __ mov(to, bz_base);
2160     __ b(rest);
2161     __ bind(non_block_zeroing);
2162     __ fill_words(to, cnt_words, value);
2163     __ bind(rest);
2164   }
2165   else {
2166     __ fill_words(to, cnt_words, value);
2167   }
2168
2169   // Remaining count is less than 8 bytes. Fill it by a single store.
2170   // Note that the total length is no less than 8 bytes.
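// A sketch of the tail trick used below for byte/short fills: point
// 'to' at the end of the buffer and store the replicated value over
// the final 8 bytes,
//   *(uint64_t *)(to + (count << shift) - 8) = value;
// rewriting up to 7 already-filled bytes, which is harmless because
// 'value' was replicated into every byte lane above.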
2171   if (t == T_BYTE || t == T_SHORT) {
2172     Label L_exit1;
2173     __ cbzw(count, L_exit1);
2174     __ add(to, to, count, Assembler::LSL, shift); // points to the end
2175     __ str(value, Address(to, -8));    // overwrite some elements
2176     __ bind(L_exit1);
2177     __ leave();
2178     __ ret(lr);
2179   }
2180
2181   // Handle fills of less than 8 bytes.
2182   Label L_fill_2, L_fill_4, L_exit2;
2183   __ bind(L_fill_elements);
2184   switch (t) {
2185     case T_BYTE:
2186       __ tbz(count, 0, L_fill_2);
2187       __ strb(value, Address(__ post(to, 1)));
2188       __ bind(L_fill_2);
2189       __ tbz(count, 1, L_fill_4);
2190       __ strh(value, Address(__ post(to, 2)));
2191       __ bind(L_fill_4);
2192       __ tbz(count, 2, L_exit2);
2193       __ strw(value, Address(to));
2194       break;
2195     case T_SHORT:
2196       __ tbz(count, 0, L_fill_4);
2197       __ strh(value, Address(__ post(to, 2)));
2198       __ bind(L_fill_4);
2199       __ tbz(count, 1, L_exit2);
2200       __ strw(value, Address(to));
2201       break;
2202     case T_INT:
2203       __ cbzw(count, L_exit2);
2204       __ strw(value, Address(to));
2205       break;
2206     default: ShouldNotReachHere();
2207   }
2208   __ bind(L_exit2);
2209   __ leave();
2210   __ ret(lr);
2211   return start;
2212 }
2213
2214 void generate_arraycopy_stubs() {
2215   address entry;
2216   address entry_jbyte_arraycopy;
2217   address entry_jshort_arraycopy;
2218   address entry_jint_arraycopy;
2219   address entry_oop_arraycopy;
2220   address entry_jlong_arraycopy;
2221   address entry_checkcast_arraycopy;
2222
2223   generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
2224   generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);
2225
2226   StubRoutines::aarch64::_zero_longs = generate_zero_longs(r10, r11);
2227
2228   //*** jbyte
2229   // Always need aligned and unaligned versions
2230   StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
2231                                                                                 "jbyte_disjoint_arraycopy");
2232   StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
2233                                                                                 &entry_jbyte_arraycopy,
2234                                                                                 "jbyte_arraycopy");
2235   StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
2236                                                                                 "arrayof_jbyte_disjoint_arraycopy");
2237   StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
2238                                                                                 "arrayof_jbyte_arraycopy");
2239
2240   //*** jshort
2241   // Always need aligned and unaligned versions
2242   StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
2243                                                                                   "jshort_disjoint_arraycopy");
2244   StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
2245                                                                                   &entry_jshort_arraycopy,
2246                                                                                   "jshort_arraycopy");
2247   StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
2248                                                                                   "arrayof_jshort_disjoint_arraycopy");
2249   StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
2250                                                                                   "arrayof_jshort_arraycopy");
2251
2252   //*** jint
2253   // Aligned versions
2254   StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
2255                                                                               "arrayof_jint_disjoint_arraycopy");
2256   StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
2257                                                                               "arrayof_jint_arraycopy");
2258   // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
2259 // entry_jint_arraycopy always points to the unaligned version 2260 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2261 "jint_disjoint_arraycopy"); 2262 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2263 &entry_jint_arraycopy, 2264 "jint_arraycopy"); 2265 2266 //*** jlong 2267 // It is always aligned 2268 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2269 "arrayof_jlong_disjoint_arraycopy"); 2270 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2271 "arrayof_jlong_arraycopy"); 2272 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2273 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2274 2275 //*** oops 2276 { 2277 // With compressed oops we need unaligned versions; notice that 2278 // we overwrite entry_oop_arraycopy. 2279 bool aligned = !UseCompressedOops; 2280 2281 StubRoutines::_arrayof_oop_disjoint_arraycopy 2282 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2283 /*dest_uninitialized*/false); 2284 StubRoutines::_arrayof_oop_arraycopy 2285 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2286 /*dest_uninitialized*/false); 2287 // Aligned versions without pre-barriers 2288 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2289 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2290 /*dest_uninitialized*/true); 2291 StubRoutines::_arrayof_oop_arraycopy_uninit 2292 = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit", 2293 /*dest_uninitialized*/true); 2294 } 2295 2296 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2297 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2298 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2299 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2300 2301 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2302 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, 2303 /*dest_uninitialized*/true); 2304 2305 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2306 entry_jbyte_arraycopy, 2307 entry_jshort_arraycopy, 2308 entry_jint_arraycopy, 2309 entry_jlong_arraycopy); 2310 2311 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2312 entry_jbyte_arraycopy, 2313 entry_jshort_arraycopy, 2314 entry_jint_arraycopy, 2315 entry_oop_arraycopy, 2316 entry_jlong_arraycopy, 2317 entry_checkcast_arraycopy); 2318 2319 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 2320 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2321 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2322 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2323 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 2324 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2325 } 2326 2327 void generate_math_stubs() { Unimplemented(); } 2328 2329 // Arguments: 2330 // 2331 // Inputs: 2332 // c_rarg0 - source byte array address 2333 // c_rarg1 - destination 
byte array address
2334 //   c_rarg2   - K (key) in little endian int array
2335 //
2336 address generate_aescrypt_encryptBlock() {
2337   __ align(CodeEntryAlignment);
2338   StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");
2339
2340   Label L_doLast;
2341
2342   const Register from   = c_rarg0; // source array address
2343   const Register to     = c_rarg1; // destination array address
2344   const Register key    = c_rarg2; // key array address
2345   const Register keylen = rscratch1;
2346
2347   address start = __ pc();
2348   __ enter();
2349
2350   __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); // 44/52/60 ints for AES-128/192/256
2351
2352   __ ld1(v0, __ T16B, from); // get 16 bytes of input
2353
2354   __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2355   __ rev32(v1, __ T16B, v1);
2356   __ rev32(v2, __ T16B, v2);
2357   __ rev32(v3, __ T16B, v3);
2358   __ rev32(v4, __ T16B, v4);
2359   __ aese(v0, v1);
2360   __ aesmc(v0, v0);
2361   __ aese(v0, v2);
2362   __ aesmc(v0, v0);
2363   __ aese(v0, v3);
2364   __ aesmc(v0, v0);
2365   __ aese(v0, v4);
2366   __ aesmc(v0, v0);
2367
2368   __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2369   __ rev32(v1, __ T16B, v1);
2370   __ rev32(v2, __ T16B, v2);
2371   __ rev32(v3, __ T16B, v3);
2372   __ rev32(v4, __ T16B, v4);
2373   __ aese(v0, v1);
2374   __ aesmc(v0, v0);
2375   __ aese(v0, v2);
2376   __ aesmc(v0, v0);
2377   __ aese(v0, v3);
2378   __ aesmc(v0, v0);
2379   __ aese(v0, v4);
2380   __ aesmc(v0, v0);
2381
2382   __ ld1(v1, v2, __ T16B, __ post(key, 32));
2383   __ rev32(v1, __ T16B, v1);
2384   __ rev32(v2, __ T16B, v2);
2385
2386   __ cmpw(keylen, 44);
2387   __ br(Assembler::EQ, L_doLast);
2388
2389   __ aese(v0, v1);
2390   __ aesmc(v0, v0);
2391   __ aese(v0, v2);
2392   __ aesmc(v0, v0);
2393
2394   __ ld1(v1, v2, __ T16B, __ post(key, 32));
2395   __ rev32(v1, __ T16B, v1);
2396   __ rev32(v2, __ T16B, v2);
2397
2398   __ cmpw(keylen, 52);
2399   __ br(Assembler::EQ, L_doLast);
2400
2401   __ aese(v0, v1);
2402   __ aesmc(v0, v0);
2403   __ aese(v0, v2);
2404   __ aesmc(v0, v0);
2405
2406   __ ld1(v1, v2, __ T16B, __ post(key, 32));
2407   __ rev32(v1, __ T16B, v1);
2408   __ rev32(v2, __ T16B, v2);
2409
2410   __ BIND(L_doLast);
2411
2412   __ aese(v0, v1);
2413   __ aesmc(v0, v0);
2414   __ aese(v0, v2);
2415
2416   __ ld1(v1, __ T16B, key);
2417   __ rev32(v1, __ T16B, v1);
2418   __ eor(v0, __ T16B, v0, v1);
2419
2420   __ st1(v0, __ T16B, to);
2421
2422   __ mov(r0, 0);
2423
2424   __ leave();
2425   __ ret(lr);
2426
2427   return start;
2428 }
2429
2430 // Arguments:
2431 //
2432 // Inputs:
2433 //   c_rarg0   - source byte array address
2434 //   c_rarg1   - destination byte array address
2435 //   c_rarg2   - K (key) in little endian int array
2436 //
2437 address generate_aescrypt_decryptBlock() {
2438   assert(UseAES, "need AES instructions");
2439   __ align(CodeEntryAlignment);
2440   StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
2441   Label L_doLast;
2442
2443   const Register from   = c_rarg0; // source array address
2444   const Register to     = c_rarg1; // destination array address
2445   const Register key    = c_rarg2; // key array address
2446   const Register keylen = rscratch1;
2447
2448   address start = __ pc();
2449   __ enter(); // required for proper stackwalking of RuntimeStub frame
2450
2451   __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); // 44/52/60 ints for AES-128/192/256
2452
2453   __ ld1(v0, __ T16B, from); // get 16 bytes of input
2454
2455   __ ld1(v5, __ T16B, __ post(key, 16));
2456   __ rev32(v5, __ T16B, v5);
2457
2458   __ ld1(v1, v2, v3, v4,
__ T16B, __ post(key, 64));
2459   __ rev32(v1, __ T16B, v1);
2460   __ rev32(v2, __ T16B, v2);
2461   __ rev32(v3, __ T16B, v3);
2462   __ rev32(v4, __ T16B, v4);
2463   __ aesd(v0, v1);
2464   __ aesimc(v0, v0);
2465   __ aesd(v0, v2);
2466   __ aesimc(v0, v0);
2467   __ aesd(v0, v3);
2468   __ aesimc(v0, v0);
2469   __ aesd(v0, v4);
2470   __ aesimc(v0, v0);
2471
2472   __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
2473   __ rev32(v1, __ T16B, v1);
2474   __ rev32(v2, __ T16B, v2);
2475   __ rev32(v3, __ T16B, v3);
2476   __ rev32(v4, __ T16B, v4);
2477   __ aesd(v0, v1);
2478   __ aesimc(v0, v0);
2479   __ aesd(v0, v2);
2480   __ aesimc(v0, v0);
2481   __ aesd(v0, v3);
2482   __ aesimc(v0, v0);
2483   __ aesd(v0, v4);
2484   __ aesimc(v0, v0);
2485
2486   __ ld1(v1, v2, __ T16B, __ post(key, 32));
2487   __ rev32(v1, __ T16B, v1);
2488   __ rev32(v2, __ T16B, v2);
2489
2490   __ cmpw(keylen, 44);
2491   __ br(Assembler::EQ, L_doLast);
2492
2493   __ aesd(v0, v1);
2494   __ aesimc(v0, v0);
2495   __ aesd(v0, v2);
2496   __ aesimc(v0, v0);
2497
2498   __ ld1(v1, v2, __ T16B, __ post(key, 32));
2499   __ rev32(v1, __ T16B, v1);
2500   __ rev32(v2, __ T16B, v2);
2501
2502   __ cmpw(keylen, 52);
2503   __ br(Assembler::EQ, L_doLast);
2504
2505   __ aesd(v0, v1);
2506   __ aesimc(v0, v0);
2507   __ aesd(v0, v2);
2508   __ aesimc(v0, v0);
2509
2510   __ ld1(v1, v2, __ T16B, __ post(key, 32));
2511   __ rev32(v1, __ T16B, v1);
2512   __ rev32(v2, __ T16B, v2);
2513
2514   __ BIND(L_doLast);
2515
2516   __ aesd(v0, v1);
2517   __ aesimc(v0, v0);
2518   __ aesd(v0, v2);
2519
2520   __ eor(v0, __ T16B, v0, v5);
2521
2522   __ st1(v0, __ T16B, to);
2523
2524   __ mov(r0, 0);
2525
2526   __ leave();
2527   __ ret(lr);
2528
2529   return start;
2530 }
2531
2532 // Arguments:
2533 //
2534 // Inputs:
2535 //   c_rarg0   - source byte array address
2536 //   c_rarg1   - destination byte array address
2537 //   c_rarg2   - K (key) in little endian int array
2538 //   c_rarg3   - r vector byte array address
2539 //   c_rarg4   - input length
2540 //
2541 // Output:
2542 //   r0        - input length
2543 //
2544 address generate_cipherBlockChaining_encryptAESCrypt() {
2545   assert(UseAES, "need AES instructions");
2546   __ align(CodeEntryAlignment);
2547   StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");
2548
2549   Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2550
2551   const Register from    = c_rarg0;  // source array address
2552   const Register to      = c_rarg1;  // destination array address
2553   const Register key     = c_rarg2;  // key array address
2554   const Register rvec    = c_rarg3;  // r byte array: initialized from the init vector,
2555                                      // left holding the last encrypted block on exit
2556   const Register len_reg = c_rarg4;  // src len (must be multiple of blocksize 16)
2557   const Register keylen  = rscratch1;
2558
2559   address start = __ pc();
2560   __ enter();
2561
2562   __ mov(rscratch2, len_reg);
2563   __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); // 44/52/60 ints for AES-128/192/256
2564
2565   __ ld1(v0, __ T16B, rvec);
2566
2567   __ cmpw(keylen, 52);
2568   __ br(Assembler::CC, L_loadkeys_44);
2569   __ br(Assembler::EQ, L_loadkeys_52);
2570
2571   __ ld1(v17, v18, __ T16B, __ post(key, 32));
2572   __ rev32(v17, __ T16B, v17);
2573   __ rev32(v18, __ T16B, v18);
2574   __ BIND(L_loadkeys_52);
2575   __ ld1(v19, v20, __ T16B, __ post(key, 32));
2576   __ rev32(v19, __ T16B, v19);
2577   __ rev32(v20, __ T16B, v20);
2578   __ BIND(L_loadkeys_44);
2579   __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2580   __ rev32(v21, __ T16B, v21);
2581   __ rev32(v22, __ T16B, v22);
2582   __ rev32(v23, __ T16B, v23);
2583   __ rev32(v24, __ T16B, v24);
2584   __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
2585   __ rev32(v25, __ T16B, v25);
2586   __ rev32(v26, __ T16B, v26);
2587   __ rev32(v27, __ T16B, v27);
2588   __ rev32(v28, __ T16B, v28);
2589   __ ld1(v29, v30, v31, __ T16B, key);
2590   __ rev32(v29, __ T16B, v29);
2591   __ rev32(v30, __ T16B, v30);
2592   __ rev32(v31, __ T16B, v31);
2593
2594   __ BIND(L_aes_loop);
2595   __ ld1(v1, __ T16B, __ post(from, 16));
2596   __ eor(v0, __ T16B, v0, v1);
2597
2598   __ br(Assembler::CC, L_rounds_44);
2599   __ br(Assembler::EQ, L_rounds_52);
2600
2601   __ aese(v0, v17); __ aesmc(v0, v0);
2602   __ aese(v0, v18); __ aesmc(v0, v0);
2603   __ BIND(L_rounds_52);
2604   __ aese(v0, v19); __ aesmc(v0, v0);
2605   __ aese(v0, v20); __ aesmc(v0, v0);
2606   __ BIND(L_rounds_44);
2607   __ aese(v0, v21); __ aesmc(v0, v0);
2608   __ aese(v0, v22); __ aesmc(v0, v0);
2609   __ aese(v0, v23); __ aesmc(v0, v0);
2610   __ aese(v0, v24); __ aesmc(v0, v0);
2611   __ aese(v0, v25); __ aesmc(v0, v0);
2612   __ aese(v0, v26); __ aesmc(v0, v0);
2613   __ aese(v0, v27); __ aesmc(v0, v0);
2614   __ aese(v0, v28); __ aesmc(v0, v0);
2615   __ aese(v0, v29); __ aesmc(v0, v0);
2616   __ aese(v0, v30);
2617   __ eor(v0, __ T16B, v0, v31);
2618
2619   __ st1(v0, __ T16B, __ post(to, 16));
2620   __ sub(len_reg, len_reg, 16);
2621   __ cbnz(len_reg, L_aes_loop);
2622
2623   __ st1(v0, __ T16B, rvec);
2624
2625   __ mov(r0, rscratch2);
2626
2627   __ leave();
2628   __ ret(lr);
2629
2630   return start;
2631 }
2632
2633 // Arguments:
2634 //
2635 // Inputs:
2636 //   c_rarg0   - source byte array address
2637 //   c_rarg1   - destination byte array address
2638 //   c_rarg2   - K (key) in little endian int array
2639 //   c_rarg3   - r vector byte array address
2640 //   c_rarg4   - input length
2641 //
2642 // Output:
2643 //   r0        - input length
2644 //
2645 address generate_cipherBlockChaining_decryptAESCrypt() {
2646   assert(UseAES, "need AES instructions");
2647   __ align(CodeEntryAlignment);
2648   StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");
2649
2650   Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;
2651
2652   const Register from    = c_rarg0;  // source array address
2653   const Register to      = c_rarg1;  // destination array address
2654   const Register key     = c_rarg2;  // key array address
2655   const Register rvec    = c_rarg3;  // r byte array: initialized from the init vector,
2656                                      // left holding the last input (ciphertext) block on exit
2657   const Register len_reg = c_rarg4;  // src len (must be multiple of blocksize 16)
2658   const Register keylen  = rscratch1;
2659
2660   address start = __ pc();
2661   __ enter();
2662
2663   __ mov(rscratch2, len_reg);
2664   __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); // 44/52/60 ints for AES-128/192/256
2665
2666   __ ld1(v2, __ T16B, rvec);
2667
2668   __ ld1(v31, __ T16B, __ post(key, 16));
2669   __ rev32(v31, __ T16B, v31);
2670
2671   __ cmpw(keylen, 52);
2672   __ br(Assembler::CC, L_loadkeys_44);
2673   __ br(Assembler::EQ, L_loadkeys_52);
2674
2675   __ ld1(v17, v18, __ T16B, __ post(key, 32));
2676   __ rev32(v17, __ T16B, v17);
2677   __ rev32(v18, __ T16B, v18);
2678   __ BIND(L_loadkeys_52);
2679   __ ld1(v19, v20, __ T16B, __ post(key, 32));
2680   __ rev32(v19, __ T16B, v19);
2681   __ rev32(v20, __ T16B, v20);
2682   __ BIND(L_loadkeys_44);
2683   __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
2684   __ rev32(v21, __ T16B, v21);
2685   __ rev32(v22, __ T16B, v22);
2686 __ rev32(v23, __ T16B, v23); 2687 __ rev32(v24, __ T16B, v24); 2688 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2689 __ rev32(v25, __ T16B, v25); 2690 __ rev32(v26, __ T16B, v26); 2691 __ rev32(v27, __ T16B, v27); 2692 __ rev32(v28, __ T16B, v28); 2693 __ ld1(v29, v30, __ T16B, key); 2694 __ rev32(v29, __ T16B, v29); 2695 __ rev32(v30, __ T16B, v30); 2696 2697 __ BIND(L_aes_loop); 2698 __ ld1(v0, __ T16B, __ post(from, 16)); 2699 __ orr(v1, __ T16B, v0, v0); 2700 2701 __ br(Assembler::CC, L_rounds_44); 2702 __ br(Assembler::EQ, L_rounds_52); 2703 2704 __ aesd(v0, v17); __ aesimc(v0, v0); 2705 __ aesd(v0, v18); __ aesimc(v0, v0); 2706 __ BIND(L_rounds_52); 2707 __ aesd(v0, v19); __ aesimc(v0, v0); 2708 __ aesd(v0, v20); __ aesimc(v0, v0); 2709 __ BIND(L_rounds_44); 2710 __ aesd(v0, v21); __ aesimc(v0, v0); 2711 __ aesd(v0, v22); __ aesimc(v0, v0); 2712 __ aesd(v0, v23); __ aesimc(v0, v0); 2713 __ aesd(v0, v24); __ aesimc(v0, v0); 2714 __ aesd(v0, v25); __ aesimc(v0, v0); 2715 __ aesd(v0, v26); __ aesimc(v0, v0); 2716 __ aesd(v0, v27); __ aesimc(v0, v0); 2717 __ aesd(v0, v28); __ aesimc(v0, v0); 2718 __ aesd(v0, v29); __ aesimc(v0, v0); 2719 __ aesd(v0, v30); 2720 __ eor(v0, __ T16B, v0, v31); 2721 __ eor(v0, __ T16B, v0, v2); 2722 2723 __ st1(v0, __ T16B, __ post(to, 16)); 2724 __ orr(v2, __ T16B, v1, v1); 2725 2726 __ sub(len_reg, len_reg, 16); 2727 __ cbnz(len_reg, L_aes_loop); 2728 2729 __ st1(v2, __ T16B, rvec); 2730 2731 __ mov(r0, rscratch2); 2732 2733 __ leave(); 2734 __ ret(lr); 2735 2736 return start; 2737 } 2738 2739 // Arguments: 2740 // 2741 // Inputs: 2742 // c_rarg0 - byte[] source+offset 2743 // c_rarg1 - int[] SHA.state 2744 // c_rarg2 - int offset 2745 // c_rarg3 - int limit 2746 // 2747 address generate_sha1_implCompress(bool multi_block, const char *name) { 2748 __ align(CodeEntryAlignment); 2749 StubCodeMark mark(this, "StubRoutines", name); 2750 address start = __ pc(); 2751 2752 Register buf = c_rarg0; 2753 Register state = c_rarg1; 2754 Register ofs = c_rarg2; 2755 Register limit = c_rarg3; 2756 2757 Label keys; 2758 Label sha1_loop; 2759 2760 // load the keys into v0..v3 2761 __ adr(rscratch1, keys); 2762 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 2763 // load 5 words state into v6, v7 2764 __ ldrq(v6, Address(state, 0)); 2765 __ ldrs(v7, Address(state, 16)); 2766 2767 2768 __ BIND(sha1_loop); 2769 // load 64 bytes of data into v16..v19 2770 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 2771 __ rev32(v16, __ T16B, v16); 2772 __ rev32(v17, __ T16B, v17); 2773 __ rev32(v18, __ T16B, v18); 2774 __ rev32(v19, __ T16B, v19); 2775 2776 // do the sha1 2777 __ addv(v4, __ T4S, v16, v0); 2778 __ orr(v20, __ T16B, v6, v6); 2779 2780 FloatRegister d0 = v16; 2781 FloatRegister d1 = v17; 2782 FloatRegister d2 = v18; 2783 FloatRegister d3 = v19; 2784 2785 for (int round = 0; round < 20; round++) { 2786 FloatRegister tmp1 = (round & 1) ? v4 : v5; 2787 FloatRegister tmp2 = (round & 1) ? v21 : v22; 2788 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 2789 FloatRegister tmp4 = (round & 1) ? v5 : v4; 2790 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? 
v2 : v3)); 2791 2792 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 2793 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 2794 __ sha1h(tmp2, __ T4S, v20); 2795 if (round < 5) 2796 __ sha1c(v20, __ T4S, tmp3, tmp4); 2797 else if (round < 10 || round >= 15) 2798 __ sha1p(v20, __ T4S, tmp3, tmp4); 2799 else 2800 __ sha1m(v20, __ T4S, tmp3, tmp4); 2801 if (round < 16) __ sha1su1(d0, __ T4S, d3); 2802 2803 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 2804 } 2805 2806 __ addv(v7, __ T2S, v7, v21); 2807 __ addv(v6, __ T4S, v6, v20); 2808 2809 if (multi_block) { 2810 __ add(ofs, ofs, 64); 2811 __ cmp(ofs, limit); 2812 __ br(Assembler::LE, sha1_loop); 2813 __ mov(c_rarg0, ofs); // return ofs 2814 } 2815 2816 __ strq(v6, Address(state, 0)); 2817 __ strs(v7, Address(state, 16)); 2818 2819 __ ret(lr); 2820 2821 __ bind(keys); 2822 __ emit_int32(0x5a827999); 2823 __ emit_int32(0x6ed9eba1); 2824 __ emit_int32(0x8f1bbcdc); 2825 __ emit_int32(0xca62c1d6); 2826 2827 return start; 2828 } 2829 2830 2831 // Arguments: 2832 // 2833 // Inputs: 2834 // c_rarg0 - byte[] source+offset 2835 // c_rarg1 - int[] SHA.state 2836 // c_rarg2 - int offset 2837 // c_rarg3 - int limit 2838 // 2839 address generate_sha256_implCompress(bool multi_block, const char *name) { 2840 static const uint32_t round_consts[64] = { 2841 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 2842 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 2843 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 2844 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 2845 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 2846 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 2847 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 2848 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 2849 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 2850 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 2851 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 2852 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 2853 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 2854 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 2855 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 2856 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 2857 }; 2858 __ align(CodeEntryAlignment); 2859 StubCodeMark mark(this, "StubRoutines", name); 2860 address start = __ pc(); 2861 2862 Register buf = c_rarg0; 2863 Register state = c_rarg1; 2864 Register ofs = c_rarg2; 2865 Register limit = c_rarg3; 2866 2867 Label sha1_loop; 2868 2869 __ stpd(v8, v9, __ pre(sp, -32)); 2870 __ stpd(v10, v11, Address(sp, 16)); 2871 2872 // dga == v0 2873 // dgb == v1 2874 // dg0 == v2 2875 // dg1 == v3 2876 // dg2 == v4 2877 // t0 == v6 2878 // t1 == v7 2879 2880 // load 16 keys to v16..v31 2881 __ lea(rscratch1, ExternalAddress((address)round_consts)); 2882 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 2883 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 2884 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 2885 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 2886 2887 // load 8 words (256 bits) state 2888 __ ldpq(v0, v1, state); 2889 2890 __ BIND(sha1_loop); 2891 // load 64 bytes of data into v8..v11 2892 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? 
__ post(buf, 64) : buf);
2893   __ rev32(v8, __ T16B, v8);
2894   __ rev32(v9, __ T16B, v9);
2895   __ rev32(v10, __ T16B, v10);
2896   __ rev32(v11, __ T16B, v11);
2897
2898   __ addv(v6, __ T4S, v8, v16);
2899   __ orr(v2, __ T16B, v0, v0);
2900   __ orr(v3, __ T16B, v1, v1);
2901
2902   FloatRegister d0 = v8;
2903   FloatRegister d1 = v9;
2904   FloatRegister d2 = v10;
2905   FloatRegister d3 = v11;
2906
2907
2908   for (int round = 0; round < 16; round++) {
2909     FloatRegister tmp1 = (round & 1) ? v6 : v7;
2910     FloatRegister tmp2 = (round & 1) ? v7 : v6;
2911     FloatRegister tmp3 = (round & 1) ? v2 : v4;
2912     FloatRegister tmp4 = (round & 1) ? v4 : v2;
2913
2914     if (round < 12) __ sha256su0(d0, __ T4S, d1);
2915     __ orr(v4, __ T16B, v2, v2);
2916     if (round < 15)
2917       __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
2918     __ sha256h(v2, __ T4S, v3, tmp2);
2919     __ sha256h2(v3, __ T4S, v4, tmp2);
2920     if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
2921
2922     tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
2923   }
2924
2925   __ addv(v0, __ T4S, v0, v2);
2926   __ addv(v1, __ T4S, v1, v3);
2927
2928   if (multi_block) {
2929     __ add(ofs, ofs, 64);
2930     __ cmp(ofs, limit);
2931     __ br(Assembler::LE, sha1_loop);
2932     __ mov(c_rarg0, ofs); // return ofs
2933   }
2934
2935   __ ldpd(v10, v11, Address(sp, 16));
2936   __ ldpd(v8, v9, __ post(sp, 32));
2937
2938   __ stpq(v0, v1, state);
2939
2940   __ ret(lr);
2941
2942   return start;
2943 }
2944
2945 #ifndef BUILTIN_SIM
2946 // Safefetch stubs.
2947 void generate_safefetch(const char* name, int size, address* entry,
2948                         address* fault_pc, address* continuation_pc) {
2949   // safefetch signatures:
2950   //   int      SafeFetch32(int*      adr, int      errValue);
2951   //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
2952   //
2953   // arguments:
2954   //   c_rarg0 = adr
2955   //   c_rarg1 = errValue
2956   //
2957   // result:
2958   //   r0 = *adr or errValue
2959
2960   StubCodeMark mark(this, "StubRoutines", name);
2961
2962   // Entry point, pc or function descriptor.
2963   *entry = __ pc();
2964
2965   // Load *adr into c_rarg1, may fault.
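// (If this load traps, the signal handler recognizes *fault_pc and
//  resumes execution at *continuation_pc, so the untouched errValue
//  in c_rarg1 is returned instead of *adr.)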
2966   *fault_pc = __ pc();
2967   switch (size) {
2968     case 4:
2969       // int32_t
2970       __ ldrw(c_rarg1, Address(c_rarg0, 0));
2971       break;
2972     case 8:
2973       // int64_t
2974       __ ldr(c_rarg1, Address(c_rarg0, 0));
2975       break;
2976     default:
2977       ShouldNotReachHere();
2978   }
2979
2980   // return errValue or *adr
2981   *continuation_pc = __ pc();
2982   __ mov(r0, c_rarg1);
2983   __ ret(lr);
2984 }
2985 #endif
2986
2987 /**
2988  *  Arguments:
2989  *
2990  * Inputs:
2991  *   c_rarg0   - int crc
2992  *   c_rarg1   - byte* buf
2993  *   c_rarg2   - int length
2994  *
2995  * Output:
2996  *   r0   - int crc result
2997  */
2998 address generate_updateBytesCRC32() {
2999   assert(UseCRC32Intrinsics, "what are we doing here?");
3000
3001   __ align(CodeEntryAlignment);
3002   StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3003
3004   address start = __ pc();
3005
3006   const Register crc    = c_rarg0;  // crc
3007   const Register buf    = c_rarg1;  // source java byte array address
3008   const Register len    = c_rarg2;  // length
3009   const Register table0 = c_rarg3;  // crc_table address
3010   const Register table1 = c_rarg4;
3011   const Register table2 = c_rarg5;
3012   const Register table3 = c_rarg6;
3013   const Register tmp3   = c_rarg7;
3014
3015   BLOCK_COMMENT("Entry:");
3016   __ enter(); // required for proper stackwalking of RuntimeStub frame
3017
3018   __ kernel_crc32(crc, buf, len,
3019                   table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3020
3021   __ leave(); // required for proper stackwalking of RuntimeStub frame
3022   __ ret(lr);
3023
3024   return start;
3025 }
3026
3027 /**
3028  *  Arguments:
3029  *
3030  * Inputs:
3031  *   c_rarg0   - int crc
3032  *   c_rarg1   - byte* buf
3033  *   c_rarg2   - int length
3034  *   c_rarg3   - int* table
3035  *
3036  * Output:
3037  *   r0   - int crc result
3038  */
3039 address generate_updateBytesCRC32C() {
3040   assert(UseCRC32CIntrinsics, "what are we doing here?");
3041
3042   __ align(CodeEntryAlignment);
3043   StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3044
3045   address start = __ pc();
3046
3047   const Register crc    = c_rarg0;  // crc
3048   const Register buf    = c_rarg1;  // source java byte array address
3049   const Register len    = c_rarg2;  // length
3050   const Register table0 = c_rarg3;  // crc_table address
3051   const Register table1 = c_rarg4;
3052   const Register table2 = c_rarg5;
3053   const Register table3 = c_rarg6;
3054   const Register tmp3   = c_rarg7;
3055
3056   BLOCK_COMMENT("Entry:");
3057   __ enter(); // required for proper stackwalking of RuntimeStub frame
3058
3059   __ kernel_crc32c(crc, buf, len,
3060                    table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3061
3062   __ leave(); // required for proper stackwalking of RuntimeStub frame
3063   __ ret(lr);
3064
3065   return start;
3066 }
3067
3068 /***
3069  *  Arguments:
3070  *
3071  *  Inputs:
3072  *   c_rarg0   - int   adler
3073  *   c_rarg1   - byte* buff
3074  *   c_rarg2   - int   len
3075  *
3076  * Output:
3077  *   c_rarg0   - int adler result
3078  */
3079 address generate_updateBytesAdler32() {
3080   __ align(CodeEntryAlignment);
3081   StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3082   address start = __ pc();
3083
3084   Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3085
3086   // Aliases
3087   Register adler = c_rarg0;
3088   Register s1    = c_rarg0;
3089   Register s2    = c_rarg3;
3090   Register buff  = c_rarg1;
3091   Register len   = c_rarg2;
3092   Register nmax  = r4;
3093   Register base  = r5;
3094   Register count = r6;
3095   Register temp0 = rscratch1;
3096   Register temp1 = rscratch2;
3097   Register temp2 = r7;
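// Reference recurrence implemented below (a sketch):
//   s1 = adler & 0xffff;  s2 = (adler >> 16) & 0xffff;
//   for (each input byte b) { s1 += b; s2 += s1; }   // both mod 65521
//   return (s2 << 16) | s1;
// The "mod 65521" steps are strength-reduced using the identity
// 2^16 % 65521 == 15, so s % 65521 folds as
//   s = (s >> 16) * 15 + (s & 0xffff)
// applied twice, followed by one conditional subtract of 65521.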
3098
3099   // Max number of bytes we can process before having to take the mod
3100   // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3101   unsigned long BASE = 0xfff1;
3102   unsigned long NMAX = 0x15B0;
3103
3104   __ mov(base, BASE);
3105   __ mov(nmax, NMAX);
3106
3107   // s1 is initialized to the lower 16 bits of adler
3108   // s2 is initialized to the upper 16 bits of adler
3109   __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
3110   __ uxth(s1, adler);          // s1 = (adler & 0xffff)
3111
3112   // The pipelined loop needs at least 16 elements per iteration.
3113   // It does check this, but it is cheaper to branch straight to the cleanup loop for short inputs.
3114   __ cmp(len, 16);
3115   __ br(Assembler::HS, L_nmax);
3116   __ cbz(len, L_combine);
3117
3118   __ bind(L_simple_by1_loop);
3119   __ ldrb(temp0, Address(__ post(buff, 1)));
3120   __ add(s1, s1, temp0);
3121   __ add(s2, s2, s1);
3122   __ subs(len, len, 1);
3123   __ br(Assembler::HI, L_simple_by1_loop);
3124
3125   // s1 = s1 % BASE
3126   __ subs(temp0, s1, base);
3127   __ csel(s1, temp0, s1, Assembler::HS);
3128
3129   // s2 = s2 % BASE
3130   __ lsr(temp0, s2, 16);
3131   __ lsl(temp1, temp0, 4);
3132   __ sub(temp1, temp1, temp0);
3133   __ add(s2, temp1, s2, ext::uxth);
3134
3135   __ subs(temp0, s2, base);
3136   __ csel(s2, temp0, s2, Assembler::HS);
3137
3138   __ b(L_combine);
3139
3140   __ bind(L_nmax);
3141   __ subs(len, len, nmax);
3142   __ sub(count, nmax, 16);
3143   __ br(Assembler::LO, L_by16);
3144
3145   __ bind(L_nmax_loop);
3146
3147   __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3148
3149   __ add(s1, s1, temp0, ext::uxtb);
3150   __ ubfx(temp2, temp0, 8, 8);
3151   __ add(s2, s2, s1);
3152   __ add(s1, s1, temp2);
3153   __ ubfx(temp2, temp0, 16, 8);
3154   __ add(s2, s2, s1);
3155   __ add(s1, s1, temp2);
3156   __ ubfx(temp2, temp0, 24, 8);
3157   __ add(s2, s2, s1);
3158   __ add(s1, s1, temp2);
3159   __ ubfx(temp2, temp0, 32, 8);
3160   __ add(s2, s2, s1);
3161   __ add(s1, s1, temp2);
3162   __ ubfx(temp2, temp0, 40, 8);
3163   __ add(s2, s2, s1);
3164   __ add(s1, s1, temp2);
3165   __ ubfx(temp2, temp0, 48, 8);
3166   __ add(s2, s2, s1);
3167   __ add(s1, s1, temp2);
3168   __ add(s2, s2, s1);
3169   __ add(s1, s1, temp0, Assembler::LSR, 56);
3170   __ add(s2, s2, s1);
3171
3172   __ add(s1, s1, temp1, ext::uxtb);
3173   __ ubfx(temp2, temp1, 8, 8);
3174   __ add(s2, s2, s1);
3175   __ add(s1, s1, temp2);
3176   __ ubfx(temp2, temp1, 16, 8);
3177   __ add(s2, s2, s1);
3178   __ add(s1, s1, temp2);
3179   __ ubfx(temp2, temp1, 24, 8);
3180   __ add(s2, s2, s1);
3181   __ add(s1, s1, temp2);
3182   __ ubfx(temp2, temp1, 32, 8);
3183   __ add(s2, s2, s1);
3184   __ add(s1, s1, temp2);
3185   __ ubfx(temp2, temp1, 40, 8);
3186   __ add(s2, s2, s1);
3187   __ add(s1, s1, temp2);
3188   __ ubfx(temp2, temp1, 48, 8);
3189   __ add(s2, s2, s1);
3190   __ add(s1, s1, temp2);
3191   __ add(s2, s2, s1);
3192   __ add(s1, s1, temp1, Assembler::LSR, 56);
3193   __ add(s2, s2, s1);
3194
3195   __ subs(count, count, 16);
3196   __ br(Assembler::HS, L_nmax_loop);
3197
3198   // s1 = s1 % BASE
3199   __ lsr(temp0, s1, 16);
3200   __ lsl(temp1, temp0, 4);
3201   __ sub(temp1, temp1, temp0);
3202   __ add(temp1, temp1, s1, ext::uxth);
3203
3204   __ lsr(temp0, temp1, 16);
3205   __ lsl(s1, temp0, 4);
3206   __ sub(s1, s1, temp0);
3207   __ add(s1, s1, temp1, ext::uxth);
3208
3209   __ subs(temp0, s1, base);
3210   __ csel(s1, temp0, s1, Assembler::HS);
3211
3212   // s2 = s2 % BASE
3213   __ lsr(temp0, s2, 16);
3214   __ lsl(temp1, temp0, 4);
3215   __ sub(temp1, temp1, temp0);
3216   __ add(temp1, temp1, s2, ext::uxth);
3217
3218   __ lsr(temp0, temp1, 16);
3219   __ lsl(s2, temp0, 4);
3220   __ sub(s2, s2, temp0);
3221   __ add(s2, s2, temp1, ext::uxth);
3222
3223   __ subs(temp0, s2, base);
3224   __ csel(s2, temp0, s2, Assembler::HS);
3225
3226   __ subs(len, len, nmax);
3227   __ sub(count, nmax, 16);
3228   __ br(Assembler::HS, L_nmax_loop);
3229
3230   __ bind(L_by16);
3231   __ adds(len, len, count);
3232   __ br(Assembler::LO, L_by1);
3233
3234   __ bind(L_by16_loop);
3235
3236   __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3237
3238   __ add(s1, s1, temp0, ext::uxtb);
3239   __ ubfx(temp2, temp0, 8, 8);
3240   __ add(s2, s2, s1);
3241   __ add(s1, s1, temp2);
3242   __ ubfx(temp2, temp0, 16, 8);
3243   __ add(s2, s2, s1);
3244   __ add(s1, s1, temp2);
3245   __ ubfx(temp2, temp0, 24, 8);
3246   __ add(s2, s2, s1);
3247   __ add(s1, s1, temp2);
3248   __ ubfx(temp2, temp0, 32, 8);
3249   __ add(s2, s2, s1);
3250   __ add(s1, s1, temp2);
3251   __ ubfx(temp2, temp0, 40, 8);
3252   __ add(s2, s2, s1);
3253   __ add(s1, s1, temp2);
3254   __ ubfx(temp2, temp0, 48, 8);
3255   __ add(s2, s2, s1);
3256   __ add(s1, s1, temp2);
3257   __ add(s2, s2, s1);
3258   __ add(s1, s1, temp0, Assembler::LSR, 56);
3259   __ add(s2, s2, s1);
3260
3261   __ add(s1, s1, temp1, ext::uxtb);
3262   __ ubfx(temp2, temp1, 8, 8);
3263   __ add(s2, s2, s1);
3264   __ add(s1, s1, temp2);
3265   __ ubfx(temp2, temp1, 16, 8);
3266   __ add(s2, s2, s1);
3267   __ add(s1, s1, temp2);
3268   __ ubfx(temp2, temp1, 24, 8);
3269   __ add(s2, s2, s1);
3270   __ add(s1, s1, temp2);
3271   __ ubfx(temp2, temp1, 32, 8);
3272   __ add(s2, s2, s1);
3273   __ add(s1, s1, temp2);
3274   __ ubfx(temp2, temp1, 40, 8);
3275   __ add(s2, s2, s1);
3276   __ add(s1, s1, temp2);
3277   __ ubfx(temp2, temp1, 48, 8);
3278   __ add(s2, s2, s1);
3279   __ add(s1, s1, temp2);
3280   __ add(s2, s2, s1);
3281   __ add(s1, s1, temp1, Assembler::LSR, 56);
3282   __ add(s2, s2, s1);
3283
3284   __ subs(len, len, 16);
3285   __ br(Assembler::HS, L_by16_loop);
3286
3287   __ bind(L_by1);
3288   __ adds(len, len, 15);
3289   __ br(Assembler::LO, L_do_mod);
3290
3291   __ bind(L_by1_loop);
3292   __ ldrb(temp0, Address(__ post(buff, 1)));
3293   __ add(s1, temp0, s1);
3294   __ add(s2, s2, s1);
3295   __ subs(len, len, 1);
3296   __ br(Assembler::HS, L_by1_loop);
3297
3298   __ bind(L_do_mod);
3299   // s1 = s1 % BASE
3300   __ lsr(temp0, s1, 16);
3301   __ lsl(temp1, temp0, 4);
3302   __ sub(temp1, temp1, temp0);
3303   __ add(temp1, temp1, s1, ext::uxth);
3304
3305   __ lsr(temp0, temp1, 16);
3306   __ lsl(s1, temp0, 4);
3307   __ sub(s1, s1, temp0);
3308   __ add(s1, s1, temp1, ext::uxth);
3309
3310   __ subs(temp0, s1, base);
3311   __ csel(s1, temp0, s1, Assembler::HS);
3312
3313   // s2 = s2 % BASE
3314   __ lsr(temp0, s2, 16);
3315   __ lsl(temp1, temp0, 4);
3316   __ sub(temp1, temp1, temp0);
3317   __ add(temp1, temp1, s2, ext::uxth);
3318
3319   __ lsr(temp0, temp1, 16);
3320   __ lsl(s2, temp0, 4);
3321   __ sub(s2, s2, temp0);
3322   __ add(s2, s2, temp1, ext::uxth);
3323
3324   __ subs(temp0, s2, base);
3325   __ csel(s2, temp0, s2, Assembler::HS);
3326
3327   // Combine lower bits and higher bits
3328   __ bind(L_combine);
3329   __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3330
3331   __ ret(lr);
3332
3333   return start;
3334 }
3335
3336 /**
3337  *  Arguments:
3338  *
3339  *  Input:
3340  *    c_rarg0   - x address
3341  *    c_rarg1   - x length
3342  *    c_rarg2   - y address
3343  *    c_rarg3   - y length
3344  *    c_rarg4   - z address
3345  *    c_rarg5   - z length
3346  */
3347 address generate_multiplyToLen() {
3348   __ align(CodeEntryAlignment);
3349   StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3350
3351   address start = __ pc();
3352   const Register x = r0;
3353 const Register xlen = r1; 3354 const Register y = r2; 3355 const Register ylen = r3; 3356 const Register z = r4; 3357 const Register zlen = r5; 3358 3359 const Register tmp1 = r10; 3360 const Register tmp2 = r11; 3361 const Register tmp3 = r12; 3362 const Register tmp4 = r13; 3363 const Register tmp5 = r14; 3364 const Register tmp6 = r15; 3365 const Register tmp7 = r16; 3366 3367 BLOCK_COMMENT("Entry:"); 3368 __ enter(); // required for proper stackwalking of RuntimeStub frame 3369 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 3370 __ leave(); // required for proper stackwalking of RuntimeStub frame 3371 __ ret(lr); 3372 3373 return start; 3374 } 3375 3376 void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi, 3377 FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0, 3378 FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) { 3379 // Karatsuba multiplication performs a 128*128 -> 256-bit 3380 // multiplication in three 128-bit multiplications and a few 3381 // additions. 3382 // 3383 // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1) 3384 // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0 3385 // 3386 // Inputs: 3387 // 3388 // A0 in a.d[0] (subkey) 3389 // A1 in a.d[1] 3390 // (A1+A0) in a1_xor_a0.d[0] 3391 // 3392 // B0 in b.d[0] (state) 3393 // B1 in b.d[1] 3394 3395 __ ext(tmp1, __ T16B, b, b, 0x08); 3396 __ pmull2(result_hi, __ T1Q, b, a, __ T2D); // A1*B1 3397 __ eor(tmp1, __ T16B, tmp1, b); // (B1+B0) 3398 __ pmull(result_lo, __ T1Q, b, a, __ T1D); // A0*B0 3399 __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0) 3400 3401 __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08); 3402 __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0 3403 __ eor(tmp2, __ T16B, tmp2, tmp4); 3404 __ eor(tmp2, __ T16B, tmp2, tmp3); 3405 3406 // Register pair <result_hi:result_lo> holds the result of carry-less multiplication 3407 __ ins(result_hi, __ D, tmp2, 0, 1); 3408 __ ins(result_lo, __ D, tmp2, 1, 0); 3409 } 3410 3411 void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi, 3412 FloatRegister p, FloatRegister z, FloatRegister t1) { 3413 const FloatRegister t0 = result; 3414 3415 // The GCM field polynomial f is z^128 + p(z), where p = 3416 // z^7+z^2+z+1. 3417 // 3418 // z^128 === -p(z) (mod (z^128 + p(z))) 3419 // 3420 // so, given that the product we're reducing is 3421 // a == lo + hi * z^128 3422 // substituting, 3423 // === lo - hi * p(z) (mod (z^128 + p(z))) 3424 // 3425 // we reduce by multiplying hi by p(z) and subtracting the result 3426 // from (i.e. XORing it with) lo. Because p has no nonzero high 3427 // bits we can do this with two 64-bit multiplications, lo*p and 3428 // hi*p. 3429 3430 __ pmull2(t0, __ T1Q, hi, p, __ T2D); 3431 __ ext(t1, __ T16B, t0, z, 8); 3432 __ eor(hi, __ T16B, hi, t1); 3433 __ ext(t1, __ T16B, z, t0, 8); 3434 __ eor(lo, __ T16B, lo, t1); 3435 __ pmull(t0, __ T1Q, hi, p, __ T1D); 3436 __ eor(result, __ T16B, lo, t0); 3437 } 3438 3439 /** 3440 * Arguments: 3441 * 3442 * Input: 3443 * c_rarg0 - current state address 3444 * c_rarg1 - H key address 3445 * c_rarg2 - data address 3446 * c_rarg3 - number of blocks 3447 * 3448 * Output: 3449 * Updated state at c_rarg0 3450 */ 3451 address generate_ghash_processBlocks() { 3452 // Bafflingly, GCM uses little-endian for the byte order, but 3453 // big-endian for the bit order. 
For example, the polynomial 1 is
3454 //   represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
3455 //
3456 //   So, we must either reverse the bytes in each word and do
3457 //   everything big-endian or reverse the bits in each byte and do
3458 //   it little-endian.  On AArch64 it's more idiomatic to reverse
3459 //   the bits in each byte (we have an instruction, RBIT, to do
3460 //   that) and keep the data in little-endian bit order throughout the
3461 //   calculation, bit-reversing the inputs and outputs.
3462
3463   StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
3464   __ align(wordSize * 2);
3465   address p = __ pc();
3466   __ emit_int64(0x87);  // The low-order bits of the field
3467                         // polynomial (i.e. p = z^7+z^2+z+1)
3468                         // repeated in the low and high parts of a
3469                         // 128-bit vector
3470   __ emit_int64(0x87);
3471
3472   __ align(CodeEntryAlignment);
3473   address start = __ pc();
3474
3475   Register state   = c_rarg0;
3476   Register subkeyH = c_rarg1;
3477   Register data    = c_rarg2;
3478   Register blocks  = c_rarg3;
3479
3480   FloatRegister vzr = v30;
3481   __ eor(vzr, __ T16B, vzr, vzr); // zero register
3482
3483   __ ldrq(v0, Address(state));
3484   __ ldrq(v1, Address(subkeyH));
3485
3486   __ rev64(v0, __ T16B, v0);      // Bit-reverse words in state and subkeyH
3487   __ rbit(v0, __ T16B, v0);
3488   __ rev64(v1, __ T16B, v1);
3489   __ rbit(v1, __ T16B, v1);
3490
3491   __ ldrq(v26, p);
3492
3493   __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
3494   __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
3495
3496   {
3497     Label L_ghash_loop;
3498     __ bind(L_ghash_loop);
3499
3500     __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
3501                                                // reversing each byte
3502     __ rbit(v2, __ T16B, v2);
3503     __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
3504
3505     // Multiply state in v2 by subkey in v1
3506     ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
3507                    /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
3508                    /*temps*/v6, v20, v18, v21);
3509     // Reduce v7:v5 by the field polynomial
3510     ghash_reduce(v0, v5, v7, v26, vzr, v20);
3511
3512     __ sub(blocks, blocks, 1);
3513     __ cbnz(blocks, L_ghash_loop);
3514   }
3515
3516   // The bit-reversed result is at this point in v0
3517   __ rev64(v1, __ T16B, v0);
3518   __ rbit(v1, __ T16B, v1);
3519
3520   __ st1(v1, __ T16B, state);
3521   __ ret(lr);
3522
3523   return start;
3524 }
3525
3526 // Continuation point for throwing of implicit exceptions that are
3527 // not handled in the current activation. Fabricates an exception
3528 // oop and initiates normal exception dispatching in this
3529 // frame. Since we need to preserve callee-saved values (currently
3530 // only for C2, but done for C1 as well) we need a callee-saved oop
3531 // map and therefore have to make these stubs into RuntimeStubs
3532 // rather than BufferBlobs.  If the compiler needs all registers to
3533 // be preserved between the fault point and the exception handler
3534 // then it must assume responsibility for that in
3535 // AbstractCompiler::continuation_for_implicit_null_exception or
3536 // continuation_for_implicit_division_by_zero_exception.  All other
3537 // implicit exceptions (e.g., NullPointerException or
3538 // AbstractMethodError on entry) are either at call sites or
3539 // otherwise assume that stack unwinding will be initiated, so
3540 // caller saved registers were assumed volatile in the compiler.
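// Typical registration of the stubs this helper generates (a sketch;
// in the full file this happens in generate_all(), which lies outside
// this excerpt):
//   StubRoutines::_throw_StackOverflowError_entry =
//     generate_throw_exception("StackOverflowError throw_exception",
//                              CAST_FROM_FN_PTR(address,
//                                SharedRuntime::throw_StackOverflowError));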
3541
3542 #undef __
3543 #define __ masm->
3544
3545 address generate_throw_exception(const char* name,
3546 address runtime_entry,
3547 Register arg1 = noreg,
3548 Register arg2 = noreg) {
3549 // Information about frame layout at time of blocking runtime call.
3550 // Note that we only have to preserve callee-saved registers since
3551 // the compilers are responsible for supplying a continuation point
3552 // if they expect all registers to be preserved.
3553 // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
3554 enum layout {
3555 rfp_off = 0,
3556 rfp_off2,
3557 return_off,
3558 return_off2,
3559 framesize // inclusive of return address
3560 };
3561
3562 int insts_size = 512;
3563 int locs_size = 64;
3564
3565 CodeBuffer code(name, insts_size, locs_size);
3566 OopMapSet* oop_maps = new OopMapSet();
3567 MacroAssembler* masm = new MacroAssembler(&code);
3568
3569 address start = __ pc();
3570
3571 // This is an inlined and slightly modified version of call_VM
3572 // which has the ability to fetch the return PC out of
3573 // thread-local storage and also sets up last_Java_sp slightly
3574 // differently than the real call_VM.
3575
3576 __ enter(); // Save FP and LR before call
3577
3578 assert(is_even(framesize/2), "sp not 16-byte aligned");
3579
3580 // lr and fp are already in place
3581 __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
3582
3583 int frame_complete = __ pc() - start;
3584
3585 // Set up last_Java_sp and last_Java_fp
3586 address the_pc = __ pc();
3587 __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1);
3588
3589 // Call runtime
3590 if (arg1 != noreg) {
3591 assert(arg2 != c_rarg1, "clobbered");
3592 __ mov(c_rarg1, arg1);
3593 }
3594 if (arg2 != noreg) {
3595 __ mov(c_rarg2, arg2);
3596 }
3597 __ mov(c_rarg0, rthread);
3598 BLOCK_COMMENT("call runtime_entry");
3599 __ mov(rscratch1, runtime_entry);
3600 __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);
3601
3602 // Generate oop map
3603 OopMap* map = new OopMap(framesize, 0);
3604
3605 oop_maps->add_gc_map(the_pc - start, map);
3606
3607 __ reset_last_Java_frame(true, true);
3608 __ maybe_isb();
3609
3610 __ leave();
3611
3612 // check for pending exceptions
3613 #ifdef ASSERT
3614 Label L;
3615 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
3616 __ cbnz(rscratch1, L);
3617 __ should_not_reach_here();
3618 __ bind(L);
3619 #endif // ASSERT
3620 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
3621
3622
3623 // codeBlob framesize is in words (not VMRegImpl::slot_size)
3624 RuntimeStub* stub =
3625 RuntimeStub::new_runtime_stub(name,
3626 &code,
3627 frame_complete,
3628 (framesize >> (LogBytesPerWord - LogBytesPerInt)),
3629 oop_maps, false);
3630 return stub->entry_point();
3631 }
3632
3633 class MontgomeryMultiplyGenerator : public MacroAssembler {
3634
3635 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
3636 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
3637
3638 RegSet _toSave;
3639 bool _squaring;
3640
3641 public:
3642 MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
3643 : MacroAssembler(as->code()), _squaring(squaring) {
3644
3645 // Register allocation
3646
3647 Register reg = c_rarg0;
3648 Pa_base = reg; // Argument registers
3649 if (squaring)
3650 Pb_base = Pa_base;
3651 else
3652 Pb_base = ++reg;
3653 Pn_base = ++reg;
3654 Rlen = ++reg;
3655 inv = ++reg;
3656 Pm_base = ++reg;
3657
3658 // Working registers:
3659 Ra = ++reg; // The current digit of a, b, n,
and m.
3660 Rb = ++reg;
3661 Rm = ++reg;
3662 Rn = ++reg;
3663
3664 Pa = ++reg; // Pointers to the current/next digit of a, b, n, and m.
3665 Pb = ++reg;
3666 Pm = ++reg;
3667 Pn = ++reg;
3668
3669 t0 = ++reg; // Three registers which form a
3670 t1 = ++reg; // triple-precision accumulator.
3671 t2 = ++reg;
3672
3673 Ri = ++reg; // Inner and outer loop indexes.
3674 Rj = ++reg;
3675
3676 Rhi_ab = ++reg; // Product registers: low and high parts
3677 Rlo_ab = ++reg; // of a*b and m*n.
3678 Rhi_mn = ++reg;
3679 Rlo_mn = ++reg;
3680
3681 // r19 and up are callee-saved.
3682 _toSave = RegSet::range(r19, reg) + Pm_base;
3683 }
3684
3685 private:
3686 void save_regs() {
3687 push(_toSave, sp);
3688 }
3689
3690 void restore_regs() {
3691 pop(_toSave, sp);
3692 }
3693
3694 template <typename T>
3695 void unroll_2(Register count, T block) {
3696 Label loop, end, odd;
3697 tbnz(count, 0, odd);
3698 cbz(count, end);
3699 align(16);
3700 bind(loop);
3701 (this->*block)();
3702 bind(odd);
3703 (this->*block)();
3704 subs(count, count, 2);
3705 br(Assembler::GT, loop);
3706 bind(end);
3707 }
3708
3709 template <typename T>
3710 void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
3711 Label loop, end, odd;
3712 tbnz(count, 0, odd);
3713 cbz(count, end);
3714 align(16);
3715 bind(loop);
3716 (this->*block)(d, s, tmp);
3717 bind(odd);
3718 (this->*block)(d, s, tmp);
3719 subs(count, count, 2);
3720 br(Assembler::GT, loop);
3721 bind(end);
3722 }
3723
3724 void pre1(RegisterOrConstant i) {
3725 block_comment("pre1");
3726 // Pa = Pa_base;
3727 // Pb = Pb_base + i;
3728 // Pm = Pm_base;
3729 // Pn = Pn_base + i;
3730 // Ra = *Pa;
3731 // Rb = *Pb;
3732 // Rm = *Pm;
3733 // Rn = *Pn;
3734 ldr(Ra, Address(Pa_base));
3735 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
3736 ldr(Rm, Address(Pm_base));
3737 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3738 lea(Pa, Address(Pa_base));
3739 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
3740 lea(Pm, Address(Pm_base));
3741 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3742
3743 // Zero the m*n result.
3744 mov(Rhi_mn, zr);
3745 mov(Rlo_mn, zr);
3746 }
3747
3748 // The core multiply-accumulate step of a Montgomery
3749 // multiplication. The idea is to schedule operations as a
3750 // pipeline so that instructions with long latencies (loads and
3751 // multiplies) have time to complete before their results are
3752 // used. This benefits in-order implementations of the
3753 // architecture the most, but out-of-order ones also benefit.
3754 void step() {
3755 block_comment("step");
3756 // MACC(Ra, Rb, t0, t1, t2);
3757 // Ra = *++Pa;
3758 // Rb = *--Pb;
3759 umulh(Rhi_ab, Ra, Rb);
3760 mul(Rlo_ab, Ra, Rb);
3761 ldr(Ra, pre(Pa, wordSize));
3762 ldr(Rb, pre(Pb, -wordSize));
3763 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
3764 // previous iteration.
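    // (A note on notation: MACC(A, B, T0, T1, T2) in these pseudocode
    // comments denotes a 64x64 -> 128-bit multiply-accumulate into the
    // triple-precision accumulator <T2:T1:T0>. In C, approximately --
    // a sketch only; no such macro is defined in this file:
    //
    //   #define MACC(A, B, T0, T1, T2) do {                      \
    //     unsigned __int128 p = (unsigned __int128)(A) * (B);    \
    //     unsigned long lo = (unsigned long)p;                   \
    //     unsigned long hi = (unsigned long)(p >> 64);           \
    //     unsigned long c = ((T0) += lo) < lo;  /* carry of T0 */\
    //     unsigned long c2 = ((T1) += c) < c;                    \
    //     c2 |= ((T1) += hi) < hi;              /* carry of T1 */\
    //     (T2) += c2;                                            \
    //   } while (0)
    //
    // MACC2, used by the squaring code, is the same except that it
    // accumulates 2*A*B. The umulh/mul/acc sequences in this
    // generator correspond to this accumulate step.)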
3765 // MACC(Rm, Rn, t0, t1, t2); 3766 // Rm = *++Pm; 3767 // Rn = *--Pn; 3768 umulh(Rhi_mn, Rm, Rn); 3769 mul(Rlo_mn, Rm, Rn); 3770 ldr(Rm, pre(Pm, wordSize)); 3771 ldr(Rn, pre(Pn, -wordSize)); 3772 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 3773 } 3774 3775 void post1() { 3776 block_comment("post1"); 3777 3778 // MACC(Ra, Rb, t0, t1, t2); 3779 // Ra = *++Pa; 3780 // Rb = *--Pb; 3781 umulh(Rhi_ab, Ra, Rb); 3782 mul(Rlo_ab, Ra, Rb); 3783 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 3784 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 3785 3786 // *Pm = Rm = t0 * inv; 3787 mul(Rm, t0, inv); 3788 str(Rm, Address(Pm)); 3789 3790 // MACC(Rm, Rn, t0, t1, t2); 3791 // t0 = t1; t1 = t2; t2 = 0; 3792 umulh(Rhi_mn, Rm, Rn); 3793 3794 #ifndef PRODUCT 3795 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 3796 { 3797 mul(Rlo_mn, Rm, Rn); 3798 add(Rlo_mn, t0, Rlo_mn); 3799 Label ok; 3800 cbz(Rlo_mn, ok); { 3801 stop("broken Montgomery multiply"); 3802 } bind(ok); 3803 } 3804 #endif 3805 // We have very carefully set things up so that 3806 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 3807 // the lower half of Rm * Rn because we know the result already: 3808 // it must be -t0. t0 + (-t0) must generate a carry iff 3809 // t0 != 0. So, rather than do a mul and an adds we just set 3810 // the carry flag iff t0 is nonzero. 3811 // 3812 // mul(Rlo_mn, Rm, Rn); 3813 // adds(zr, t0, Rlo_mn); 3814 subs(zr, t0, 1); // Set carry iff t0 is nonzero 3815 adcs(t0, t1, Rhi_mn); 3816 adc(t1, t2, zr); 3817 mov(t2, zr); 3818 } 3819 3820 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 3821 block_comment("pre2"); 3822 // Pa = Pa_base + i-len; 3823 // Pb = Pb_base + len; 3824 // Pm = Pm_base + i-len; 3825 // Pn = Pn_base + len; 3826 3827 if (i.is_register()) { 3828 sub(Rj, i.as_register(), len); 3829 } else { 3830 mov(Rj, i.as_constant()); 3831 sub(Rj, Rj, len); 3832 } 3833 // Rj == i-len 3834 3835 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 3836 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 3837 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 3838 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 3839 3840 // Ra = *++Pa; 3841 // Rb = *--Pb; 3842 // Rm = *++Pm; 3843 // Rn = *--Pn; 3844 ldr(Ra, pre(Pa, wordSize)); 3845 ldr(Rb, pre(Pb, -wordSize)); 3846 ldr(Rm, pre(Pm, wordSize)); 3847 ldr(Rn, pre(Pn, -wordSize)); 3848 3849 mov(Rhi_mn, zr); 3850 mov(Rlo_mn, zr); 3851 } 3852 3853 void post2(RegisterOrConstant i, RegisterOrConstant len) { 3854 block_comment("post2"); 3855 if (i.is_constant()) { 3856 mov(Rj, i.as_constant()-len.as_constant()); 3857 } else { 3858 sub(Rj, i.as_register(), len); 3859 } 3860 3861 adds(t0, t0, Rlo_mn); // The pending m*n, low part 3862 3863 // As soon as we know the least significant digit of our result, 3864 // store it. 3865 // Pm_base[i-len] = t0; 3866 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 3867 3868 // t0 = t1; t1 = t2; t2 = 0; 3869 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 3870 adc(t1, t2, zr); 3871 mov(t2, zr); 3872 } 3873 3874 // A carry in t0 after Montgomery multiplication means that we 3875 // should subtract multiples of n from our result in m. We'll 3876 // keep doing that until there is no carry. 
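  // In C, approximately -- a sketch of the "sub" helper named in the
  // pseudocode below; no such function is defined in this file:
  //
  //   // Subtract n from m once, rippling the borrow through, and
  //   // propagate the final borrow into t0.
  //   unsigned long sub(unsigned long Pm_base[], unsigned long Pn_base[],
  //                     unsigned long t0, int len) {
  //     unsigned long borrow = 0;
  //     for (int i = 0; i < len; i++) {
  //       unsigned long m = Pm_base[i], n = Pn_base[i];
  //       Pm_base[i] = m - n - borrow;
  //       borrow = (m < n) | ((m == n) & borrow);
  //     }
  //     return t0 - borrow;
  //   }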
3877 void normalize(RegisterOrConstant len) { 3878 block_comment("normalize"); 3879 // while (t0) 3880 // t0 = sub(Pm_base, Pn_base, t0, len); 3881 Label loop, post, again; 3882 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 3883 cbz(t0, post); { 3884 bind(again); { 3885 mov(i, zr); 3886 mov(cnt, len); 3887 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 3888 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 3889 subs(zr, zr, zr); // set carry flag, i.e. no borrow 3890 align(16); 3891 bind(loop); { 3892 sbcs(Rm, Rm, Rn); 3893 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 3894 add(i, i, 1); 3895 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 3896 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 3897 sub(cnt, cnt, 1); 3898 } cbnz(cnt, loop); 3899 sbc(t0, t0, zr); 3900 } cbnz(t0, again); 3901 } bind(post); 3902 } 3903 3904 // Move memory at s to d, reversing words. 3905 // Increments d to end of copied memory 3906 // Destroys tmp1, tmp2 3907 // Preserves len 3908 // Leaves s pointing to the address which was in d at start 3909 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 3910 assert(tmp1 < r19 && tmp2 < r19, "register corruption"); 3911 3912 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 3913 mov(tmp1, len); 3914 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 3915 sub(s, d, len, ext::uxtw, LogBytesPerWord); 3916 } 3917 // where 3918 void reverse1(Register d, Register s, Register tmp) { 3919 ldr(tmp, pre(s, -wordSize)); 3920 ror(tmp, tmp, 32); 3921 str(tmp, post(d, wordSize)); 3922 } 3923 3924 void step_squaring() { 3925 // An extra ACC 3926 step(); 3927 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 3928 } 3929 3930 void last_squaring(RegisterOrConstant i) { 3931 Label dont; 3932 // if ((i & 1) == 0) { 3933 tbnz(i.as_register(), 0, dont); { 3934 // MACC(Ra, Rb, t0, t1, t2); 3935 // Ra = *++Pa; 3936 // Rb = *--Pb; 3937 umulh(Rhi_ab, Ra, Rb); 3938 mul(Rlo_ab, Ra, Rb); 3939 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 3940 } bind(dont); 3941 } 3942 3943 void extra_step_squaring() { 3944 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 3945 3946 // MACC(Rm, Rn, t0, t1, t2); 3947 // Rm = *++Pm; 3948 // Rn = *--Pn; 3949 umulh(Rhi_mn, Rm, Rn); 3950 mul(Rlo_mn, Rm, Rn); 3951 ldr(Rm, pre(Pm, wordSize)); 3952 ldr(Rn, pre(Pn, -wordSize)); 3953 } 3954 3955 void post1_squaring() { 3956 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 3957 3958 // *Pm = Rm = t0 * inv; 3959 mul(Rm, t0, inv); 3960 str(Rm, Address(Pm)); 3961 3962 // MACC(Rm, Rn, t0, t1, t2); 3963 // t0 = t1; t1 = t2; t2 = 0; 3964 umulh(Rhi_mn, Rm, Rn); 3965 3966 #ifndef PRODUCT 3967 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 3968 { 3969 mul(Rlo_mn, Rm, Rn); 3970 add(Rlo_mn, t0, Rlo_mn); 3971 Label ok; 3972 cbz(Rlo_mn, ok); { 3973 stop("broken Montgomery multiply"); 3974 } bind(ok); 3975 } 3976 #endif 3977 // We have very carefully set things up so that 3978 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 3979 // the lower half of Rm * Rn because we know the result already: 3980 // it must be -t0. t0 + (-t0) must generate a carry iff 3981 // t0 != 0. So, rather than do a mul and an adds we just set 3982 // the carry flag iff t0 is nonzero. 
3983 //
3984 // mul(Rlo_mn, Rm, Rn);
3985 // adds(zr, t0, Rlo_mn);
3986 subs(zr, t0, 1); // Set carry iff t0 is nonzero
3987 adcs(t0, t1, Rhi_mn);
3988 adc(t1, t2, zr);
3989 mov(t2, zr);
3990 }
3991
3992 void acc(Register Rhi, Register Rlo,
3993 Register t0, Register t1, Register t2) {
3994 adds(t0, t0, Rlo);
3995 adcs(t1, t1, Rhi);
3996 adc(t2, t2, zr);
3997 }
3998
3999 public:
4000 /**
4001 * Fast Montgomery multiplication. The derivation of the
4002 * algorithm is in A Cryptographic Library for the Motorola
4003 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
4004 *
4005 * Arguments:
4006 *
4007 * Inputs for multiplication:
4008 * c_rarg0 - int array elements a
4009 * c_rarg1 - int array elements b
4010 * c_rarg2 - int array elements n (the modulus)
4011 * c_rarg3 - int length
4012 * c_rarg4 - int inv
4013 * c_rarg5 - int array elements m (the result)
4014 *
4015 * Inputs for squaring:
4016 * c_rarg0 - int array elements a
4017 * c_rarg1 - int array elements n (the modulus)
4018 * c_rarg2 - int length
4019 * c_rarg3 - int inv
4020 * c_rarg4 - int array elements m (the result)
4021 *
4022 */
4023 address generate_multiply() {
4024 Label argh, nothing;
4025 bind(argh);
4026 stop("MontgomeryMultiply total_allocation must be <= 8192");
4027
4028 align(CodeEntryAlignment);
4029 address entry = pc();
4030
4031 cbzw(Rlen, nothing);
4032
4033 enter();
4034
4035 // Make room.
4036 cmpw(Rlen, 512);
4037 br(Assembler::HI, argh);
4038 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
4039 andr(sp, Ra, -2 * wordSize);
4040
4041 lsrw(Rlen, Rlen, 1); // length in longwords = len/2
4042
4043 {
4044 // Copy input args, reversing as we go. We use Ra as a
4045 // temporary variable.
4046 reverse(Ra, Pa_base, Rlen, t0, t1);
4047 if (!_squaring)
4048 reverse(Ra, Pb_base, Rlen, t0, t1);
4049 reverse(Ra, Pn_base, Rlen, t0, t1);
4050 }
4051
4052 // Push all callee-saved registers and also Pm_base which we'll need
4053 // at the end.
4054 save_regs(); 4055 4056 #ifndef PRODUCT 4057 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 4058 { 4059 ldr(Rn, Address(Pn_base, 0)); 4060 mul(Rlo_mn, Rn, inv); 4061 cmp(Rlo_mn, -1); 4062 Label ok; 4063 br(EQ, ok); { 4064 stop("broken inverse in Montgomery multiply"); 4065 } bind(ok); 4066 } 4067 #endif 4068 4069 mov(Pm_base, Ra); 4070 4071 mov(t0, zr); 4072 mov(t1, zr); 4073 mov(t2, zr); 4074 4075 block_comment("for (int i = 0; i < len; i++) {"); 4076 mov(Ri, zr); { 4077 Label loop, end; 4078 cmpw(Ri, Rlen); 4079 br(Assembler::GE, end); 4080 4081 bind(loop); 4082 pre1(Ri); 4083 4084 block_comment(" for (j = i; j; j--) {"); { 4085 movw(Rj, Ri); 4086 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 4087 } block_comment(" } // j"); 4088 4089 post1(); 4090 addw(Ri, Ri, 1); 4091 cmpw(Ri, Rlen); 4092 br(Assembler::LT, loop); 4093 bind(end); 4094 block_comment("} // i"); 4095 } 4096 4097 block_comment("for (int i = len; i < 2*len; i++) {"); 4098 mov(Ri, Rlen); { 4099 Label loop, end; 4100 cmpw(Ri, Rlen, Assembler::LSL, 1); 4101 br(Assembler::GE, end); 4102 4103 bind(loop); 4104 pre2(Ri, Rlen); 4105 4106 block_comment(" for (j = len*2-i-1; j; j--) {"); { 4107 lslw(Rj, Rlen, 1); 4108 subw(Rj, Rj, Ri); 4109 subw(Rj, Rj, 1); 4110 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 4111 } block_comment(" } // j"); 4112 4113 post2(Ri, Rlen); 4114 addw(Ri, Ri, 1); 4115 cmpw(Ri, Rlen, Assembler::LSL, 1); 4116 br(Assembler::LT, loop); 4117 bind(end); 4118 } 4119 block_comment("} // i"); 4120 4121 normalize(Rlen); 4122 4123 mov(Ra, Pm_base); // Save Pm_base in Ra 4124 restore_regs(); // Restore caller's Pm_base 4125 4126 // Copy our result into caller's Pm_base 4127 reverse(Pm_base, Ra, Rlen, t0, t1); 4128 4129 leave(); 4130 bind(nothing); 4131 ret(lr); 4132 4133 return entry; 4134 } 4135 // In C, approximately: 4136 4137 // void 4138 // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[], 4139 // unsigned long Pn_base[], unsigned long Pm_base[], 4140 // unsigned long inv, int len) { 4141 // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 4142 // unsigned long *Pa, *Pb, *Pn, *Pm; 4143 // unsigned long Ra, Rb, Rn, Rm; 4144 4145 // int i; 4146 4147 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 4148 4149 // for (i = 0; i < len; i++) { 4150 // int j; 4151 4152 // Pa = Pa_base; 4153 // Pb = Pb_base + i; 4154 // Pm = Pm_base; 4155 // Pn = Pn_base + i; 4156 4157 // Ra = *Pa; 4158 // Rb = *Pb; 4159 // Rm = *Pm; 4160 // Rn = *Pn; 4161 4162 // int iters = i; 4163 // for (j = 0; iters--; j++) { 4164 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 4165 // MACC(Ra, Rb, t0, t1, t2); 4166 // Ra = *++Pa; 4167 // Rb = *--Pb; 4168 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 4169 // MACC(Rm, Rn, t0, t1, t2); 4170 // Rm = *++Pm; 4171 // Rn = *--Pn; 4172 // } 4173 4174 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 4175 // MACC(Ra, Rb, t0, t1, t2); 4176 // *Pm = Rm = t0 * inv; 4177 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 4178 // MACC(Rm, Rn, t0, t1, t2); 4179 4180 // assert(t0 == 0, "broken Montgomery multiply"); 4181 4182 // t0 = t1; t1 = t2; t2 = 0; 4183 // } 4184 4185 // for (i = len; i < 2*len; i++) { 4186 // int j; 4187 4188 // Pa = Pa_base + i-len; 4189 // Pb = Pb_base + len; 4190 // Pm = Pm_base + i-len; 4191 // Pn = Pn_base + len; 4192 4193 // Ra = *++Pa; 4194 // Rb = *--Pb; 4195 // Rm = *++Pm; 4196 // Rn = *--Pn; 4197 4198 // int iters = len*2-i-1; 4199 // 
for (j = i-len+1; iters--; j++) { 4200 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 4201 // MACC(Ra, Rb, t0, t1, t2); 4202 // Ra = *++Pa; 4203 // Rb = *--Pb; 4204 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 4205 // MACC(Rm, Rn, t0, t1, t2); 4206 // Rm = *++Pm; 4207 // Rn = *--Pn; 4208 // } 4209 4210 // Pm_base[i-len] = t0; 4211 // t0 = t1; t1 = t2; t2 = 0; 4212 // } 4213 4214 // while (t0) 4215 // t0 = sub(Pm_base, Pn_base, t0, len); 4216 // } 4217 4218 /** 4219 * Fast Montgomery squaring. This uses asymptotically 25% fewer 4220 * multiplies than Montgomery multiplication so it should be up to 4221 * 25% faster. However, its loop control is more complex and it 4222 * may actually run slower on some machines. 4223 * 4224 * Arguments: 4225 * 4226 * Inputs: 4227 * c_rarg0 - int array elements a 4228 * c_rarg1 - int array elements n (the modulus) 4229 * c_rarg2 - int length 4230 * c_rarg3 - int inv 4231 * c_rarg4 - int array elements m (the result) 4232 * 4233 */ 4234 address generate_square() { 4235 Label argh; 4236 bind(argh); 4237 stop("MontgomeryMultiply total_allocation must be <= 8192"); 4238 4239 align(CodeEntryAlignment); 4240 address entry = pc(); 4241 4242 enter(); 4243 4244 // Make room. 4245 cmpw(Rlen, 512); 4246 br(Assembler::HI, argh); 4247 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 4248 andr(sp, Ra, -2 * wordSize); 4249 4250 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 4251 4252 { 4253 // Copy input args, reversing as we go. We use Ra as a 4254 // temporary variable. 4255 reverse(Ra, Pa_base, Rlen, t0, t1); 4256 reverse(Ra, Pn_base, Rlen, t0, t1); 4257 } 4258 4259 // Push all call-saved registers and also Pm_base which we'll need 4260 // at the end. 4261 save_regs(); 4262 4263 mov(Pm_base, Ra); 4264 4265 mov(t0, zr); 4266 mov(t1, zr); 4267 mov(t2, zr); 4268 4269 block_comment("for (int i = 0; i < len; i++) {"); 4270 mov(Ri, zr); { 4271 Label loop, end; 4272 bind(loop); 4273 cmp(Ri, Rlen); 4274 br(Assembler::GE, end); 4275 4276 pre1(Ri); 4277 4278 block_comment("for (j = (i+1)/2; j; j--) {"); { 4279 add(Rj, Ri, 1); 4280 lsr(Rj, Rj, 1); 4281 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 4282 } block_comment(" } // j"); 4283 4284 last_squaring(Ri); 4285 4286 block_comment(" for (j = i/2; j; j--) {"); { 4287 lsr(Rj, Ri, 1); 4288 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 4289 } block_comment(" } // j"); 4290 4291 post1_squaring(); 4292 add(Ri, Ri, 1); 4293 cmp(Ri, Rlen); 4294 br(Assembler::LT, loop); 4295 4296 bind(end); 4297 block_comment("} // i"); 4298 } 4299 4300 block_comment("for (int i = len; i < 2*len; i++) {"); 4301 mov(Ri, Rlen); { 4302 Label loop, end; 4303 bind(loop); 4304 cmp(Ri, Rlen, Assembler::LSL, 1); 4305 br(Assembler::GE, end); 4306 4307 pre2(Ri, Rlen); 4308 4309 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 4310 lsl(Rj, Rlen, 1); 4311 sub(Rj, Rj, Ri); 4312 sub(Rj, Rj, 1); 4313 lsr(Rj, Rj, 1); 4314 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 4315 } block_comment(" } // j"); 4316 4317 last_squaring(Ri); 4318 4319 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 4320 lsl(Rj, Rlen, 1); 4321 sub(Rj, Rj, Ri); 4322 lsr(Rj, Rj, 1); 4323 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 4324 } block_comment(" } // j"); 4325 4326 post2(Ri, Rlen); 4327 add(Ri, Ri, 1); 4328 cmp(Ri, Rlen, Assembler::LSL, 1); 4329 4330 br(Assembler::LT, loop); 4331 bind(end); 4332 block_comment("} // i"); 4333 } 4334 4335 normalize(Rlen); 4336 4337 mov(Ra, 
Pm_base); // Save Pm_base in Ra
4338 restore_regs(); // Restore caller's Pm_base
4339
4340 // Copy our result into caller's Pm_base
4341 reverse(Pm_base, Ra, Rlen, t0, t1);
4342
4343 leave();
4344 ret(lr);
4345
4346 return entry;
4347 }
4348 // In C, approximately:
4349
4350 // void
4351 // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
4352 // unsigned long Pm_base[], unsigned long inv, int len) {
4353 // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4354 // unsigned long *Pa, *Pb, *Pn, *Pm;
4355 // unsigned long Ra, Rb, Rn, Rm;
4356
4357 // int i;
4358
4359 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
4360
4361 // for (i = 0; i < len; i++) {
4362 // int j;
4363
4364 // Pa = Pa_base;
4365 // Pb = Pa_base + i;
4366 // Pm = Pm_base;
4367 // Pn = Pn_base + i;
4368
4369 // Ra = *Pa;
4370 // Rb = *Pb;
4371 // Rm = *Pm;
4372 // Rn = *Pn;
4373
4374 // int iters = (i+1)/2;
4375 // for (j = 0; iters--; j++) {
4376 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4377 // MACC2(Ra, Rb, t0, t1, t2);
4378 // Ra = *++Pa;
4379 // Rb = *--Pb;
4380 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4381 // MACC(Rm, Rn, t0, t1, t2);
4382 // Rm = *++Pm;
4383 // Rn = *--Pn;
4384 // }
4385 // if ((i & 1) == 0) {
4386 // assert(Ra == Pa_base[j], "must be");
4387 // MACC(Ra, Ra, t0, t1, t2);
4388 // }
4389 // iters = i/2;
4390 // assert(iters == i-j, "must be");
4391 // for (; iters--; j++) {
4392 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4393 // MACC(Rm, Rn, t0, t1, t2);
4394 // Rm = *++Pm;
4395 // Rn = *--Pn;
4396 // }
4397
4398 // *Pm = Rm = t0 * inv;
4399 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
4400 // MACC(Rm, Rn, t0, t1, t2);
4401
4402 // assert(t0 == 0, "broken Montgomery multiply");
4403
4404 // t0 = t1; t1 = t2; t2 = 0;
4405 // }
4406
4407 // for (i = len; i < 2*len; i++) {
4408 // int start = i-len+1;
4409 // int end = start + (len - start)/2;
4410 // int j;
4411
4412 // Pa = Pa_base + i-len;
4413 // Pb = Pa_base + len;
4414 // Pm = Pm_base + i-len;
4415 // Pn = Pn_base + len;
4416
4417 // Ra = *++Pa;
4418 // Rb = *--Pb;
4419 // Rm = *++Pm;
4420 // Rn = *--Pn;
4421
4422 // int iters = (2*len-i-1)/2;
4423 // assert(iters == end-start, "must be");
4424 // for (j = start; iters--; j++) {
4425 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4426 // MACC2(Ra, Rb, t0, t1, t2);
4427 // Ra = *++Pa;
4428 // Rb = *--Pb;
4429 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4430 // MACC(Rm, Rn, t0, t1, t2);
4431 // Rm = *++Pm;
4432 // Rn = *--Pn;
4433 // }
4434 // if ((i & 1) == 0) {
4435 // assert(Ra == Pa_base[j], "must be");
4436 // MACC(Ra, Ra, t0, t1, t2);
4437 // }
4438 // iters = (2*len-i)/2;
4439 // assert(iters == len-j, "must be");
4440 // for (; iters--; j++) {
4441 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4442 // MACC(Rm, Rn, t0, t1, t2);
4443 // Rm = *++Pm;
4444 // Rn = *--Pn;
4445 // }
4446 // Pm_base[i-len] = t0;
4447 // t0 = t1; t1 = t2; t2 = 0;
4448 // }
4449
4450 // while (t0)
4451 // t0 = sub(Pm_base, Pn_base, t0, len);
4452 // }
4453 };
4454
4455 // Initialization
4456 void generate_initial() {
4457 // Generate initial stubs and initialize the entry points
4458
4459 // Entry points that exist on all platforms. Note: This is code
4460 // that could be shared among different platforms - however the
4461 // benefit seems to be smaller than the disadvantage of having a
4462 // much more complicated generator
structure. See also comment in
4463 // stubRoutines.hpp.
4464
4465 StubRoutines::_forward_exception_entry = generate_forward_exception();
4466
4467 StubRoutines::_call_stub_entry =
4468 generate_call_stub(StubRoutines::_call_stub_return_address);
4469
4470 // is referenced by megamorphic calls
4471 StubRoutines::_catch_exception_entry = generate_catch_exception();
4472
4473 // Build this early so it's available for the interpreter.
4474 StubRoutines::_throw_StackOverflowError_entry =
4475 generate_throw_exception("StackOverflowError throw_exception",
4476 CAST_FROM_FN_PTR(address,
4477 SharedRuntime::
4478 throw_StackOverflowError));
4479 if (UseCRC32Intrinsics) {
4480 // set table address before generating the stub which uses it
4481 StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
4482 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
4483 }
4484 }
4485
4486 void generate_all() {
4487 // support for verify_oop (must happen after universe_init)
4488 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
4489 StubRoutines::_throw_AbstractMethodError_entry =
4490 generate_throw_exception("AbstractMethodError throw_exception",
4491 CAST_FROM_FN_PTR(address,
4492 SharedRuntime::
4493 throw_AbstractMethodError));
4494
4495 StubRoutines::_throw_IncompatibleClassChangeError_entry =
4496 generate_throw_exception("IncompatibleClassChangeError throw_exception",
4497 CAST_FROM_FN_PTR(address,
4498 SharedRuntime::
4499 throw_IncompatibleClassChangeError));
4500
4501 StubRoutines::_throw_NullPointerException_at_call_entry =
4502 generate_throw_exception("NullPointerException at call throw_exception",
4503 CAST_FROM_FN_PTR(address,
4504 SharedRuntime::
4505 throw_NullPointerException_at_call));
4506
4507 // arraycopy stubs used by compilers
4508 generate_arraycopy_stubs();
4509
4510 if (UseMultiplyToLenIntrinsic) {
4511 StubRoutines::_multiplyToLen = generate_multiplyToLen();
4512 }
4513
4514 if (UseMontgomeryMultiplyIntrinsic) {
4515 StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
4516 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
4517 StubRoutines::_montgomeryMultiply = g.generate_multiply();
4518 }
4519
4520 if (UseMontgomerySquareIntrinsic) {
4521 StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
4522 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
4523 // We use generate_multiply() rather than generate_square()
4524 // because it's faster for the modulus sizes we care about.
4525 StubRoutines::_montgomerySquare = g.generate_multiply(); 4526 } 4527 4528 #ifndef BUILTIN_SIM 4529 // generate GHASH intrinsics code 4530 if (UseGHASHIntrinsics) { 4531 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); 4532 } 4533 4534 if (UseAESIntrinsics) { 4535 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 4536 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 4537 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 4538 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 4539 } 4540 4541 if (UseSHA1Intrinsics) { 4542 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress"); 4543 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB"); 4544 } 4545 if (UseSHA256Intrinsics) { 4546 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress"); 4547 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB"); 4548 } 4549 4550 if (UseCRC32CIntrinsics) { 4551 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(); 4552 } 4553 4554 // generate Adler32 intrinsics code 4555 if (UseAdler32Intrinsics) { 4556 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); 4557 } 4558 4559 // Safefetch stubs. 4560 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry, 4561 &StubRoutines::_safefetch32_fault_pc, 4562 &StubRoutines::_safefetch32_continuation_pc); 4563 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry, 4564 &StubRoutines::_safefetchN_fault_pc, 4565 &StubRoutines::_safefetchN_continuation_pc); 4566 #endif 4567 } 4568 4569 public: 4570 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) { 4571 if (all) { 4572 generate_all(); 4573 } else { 4574 generate_initial(); 4575 } 4576 } 4577 }; // end class declaration 4578 4579 void StubGenerator_generate(CodeBuffer* code, bool all) { 4580 StubGenerator g(code, all); 4581 }
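
// In C++, approximately, how this entry point is driven during VM
// startup (a sketch; the actual sequence lives in
// runtime/stubRoutines.cpp):
//
//   void StubRoutines::initialize1() {          // early in VM startup
//     ...
//     StubGenerator_generate(&buffer, false);   // -> generate_initial()
//   }
//
//   void StubRoutines::initialize2() {          // after universe_init
//     ...
//     StubGenerator_generate(&buffer, true);    // -> generate_all()
//   }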