1 /* 2 * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 
23 * 24 */ 25 26 #include "precompiled.hpp" 27 #include "asm/macroAssembler.hpp" 28 #include "asm/macroAssembler.inline.hpp" 29 #include "interpreter/interpreter.hpp" 30 #include "nativeInst_aarch64.hpp" 31 #include "oops/instanceOop.hpp" 32 #include "oops/method.hpp" 33 #include "oops/objArrayKlass.hpp" 34 #include "oops/oop.inline.hpp" 35 #include "prims/methodHandles.hpp" 36 #include "runtime/frame.inline.hpp" 37 #include "runtime/handles.inline.hpp" 38 #include "runtime/sharedRuntime.hpp" 39 #include "runtime/stubCodeGenerator.hpp" 40 #include "runtime/stubRoutines.hpp" 41 #include "runtime/thread.inline.hpp" 42 #include "utilities/align.hpp" 43 #ifdef COMPILER2 44 #include "opto/runtime.hpp" 45 #endif 46 47 #ifdef BUILTIN_SIM 48 #include "../../../../../../simulator/simulator.hpp" 49 #endif 50 51 // Declaration and definition of StubGenerator (no .hpp file). 52 // For a more detailed description of the stub routine structure 53 // see the comment in stubRoutines.hpp 54 55 #undef __ 56 #define __ _masm-> 57 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 
4 : 8))

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  // Emit code to bump a 32-bit statistics counter located at the
  // address of 'counter' (non-product builds only).  The increment is
  // a plain load/add/store, i.e. not atomic.  Clobbers rscratch1 and
  // rscratch2.
  void inc_counter_np_(int& counter) {
    __ lea(rscratch2, ExternalAddress((address)&counter));
    __ ldrw(rscratch1, Address(rscratch2));
    __ addw(rscratch1, rscratch1, 1);
    __ strw(rscratch1, Address(rscratch2));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    c_rarg6:   parameter size (in words)              int
  //    c_rarg7:   thread                                 Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the c arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -27 [ argument word 1      ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper   (r0)  ]
  //  -7 [ result         (r1)  ]
  //  -6 [ result type    (r2)  ]
  //  -5 [ method         (r3)  ]
  //  -4 [ entry point    (r4)  ]
  //  -3 [ parameters     (r5)  ]
  //  -2 [ parameter size (r6)  ]
  //  -1 [ thread (r7)          ]
  //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  //   1 [ saved lr       (r30) ]

  // Call stub stack layout word offsets from fp.
  // Only the even (pair-base) offsets are named for the register pairs
  // saved with stp/stpd below; the odd slots are implied by the pairing.
  enum call_stub_layout {
    sp_after_call_off = -26,

    d15_off            = -26,
    d13_off            = -24,
    d11_off            = -22,
    d9_off             = -20,

    r28_off            = -18,
    r26_off            = -16,
    r24_off            = -14,
    r22_off            = -12,
    r20_off            = -10,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0, // NOTE(review): looks like a typo for 'fp_off' — confirm no other references before renaming
    retaddr_off        =   1,
  };

  // Generate the call stub used to enter Java from C (see the argument
  // contract in the comment block above).  'return_address' is an out
  // parameter: it is set to the in-stub PC that returning Java code
  // lands on, which stack walking / exception handling compares against.
  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    const Address sp_after_call (rfp, sp_after_call_off * wordSize);

    const Address call_wrapper  (rfp, call_wrapper_off * wordSize);
    const Address result        (rfp, result_off * wordSize);
    const Address result_type   (rfp, result_type_off * wordSize);
    const Address method        (rfp, method_off * wordSize);
    const Address entry_point   (rfp, entry_point_off * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);

    // stub code

    // we need a C prolog to bootstrap the x86 caller into the sim
    __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);

    address aarch64_entry = __ pc();

#ifdef BUILTIN_SIM
    // Save sender's SP for stack traces.
    __ mov(rscratch1, sp);
    __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
#endif
    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7, thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5, entry_point);
    __ stp(c_rarg2, c_rarg3, result_type);
    __ stp(c_rarg0, c_rarg1, call_wrapper);

    __ stp(r20, r19, r20_save);
    __ stp(r22, r21, r22_save);
    __ stp(r24, r23, r24_save);
    __ stp(r26, r25, r26_save);
    __ stp(r28, r27, r28_save);

    __ stpd(v9,  v8,  d9_save);
    __ stpd(v11, v10, d11_save);
    __ stpd(v13, v12, d13_save);
    __ stpd(v15, v14, d15_save);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (unsigned)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize); // keep sp 16-byte aligned

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    // copy parameters from the caller-supplied array onto the Java
    // expression stack, one word at a time
    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing methdoOop, and current sp
    //      rmethod: Method*
    //      r13: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r13, sp);
    __ blr(c_rarg4);

    // tell the simulator we have returned to the stub

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    if (NotifySimulator) {
      __ notify(Assembler::method_reentry);
    }
    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    __ ldr(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ldr(j_rarg1, result_type);
    // T_OBJECT results share the is_long path: both store all 64 bits of r0
    __ cmp(j_rarg1, T_OBJECT);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(j_rarg1, T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    // restore callee-save registers
    __ ldpd(v15, v14, d15_save);
    __ ldpd(v13, v12, d13_save);
    __ ldpd(v11, v10, d11_save);
    __ ldpd(v9,  v8,  d9_save);

    __ ldp(r28, r27, r28_save);
    __ ldp(r26, r25, r26_save);
    __ ldp(r24, r23, r24_save);
    __ ldp(r22, r21, r22_save);
    __ ldp(r20, r19, r20_save);

    __ ldp(c_rarg0, c_rarg1, call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3, method);
    __ ldp(c_rarg4, c_rarg5, entry_point);
    __ ldp(c_rarg6, c_rarg7, parameter_size);

#ifndef PRODUCT
    // tell the simulator we are about to end Java execution
    if (NotifySimulator) {
      __ notify(Assembler::method_exit);
    }
#endif
    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT

    __ BIND(is_long);
    __ str(r0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // rsp.
  //
  // r0: exception oop

  // NOTE: this is used as a target from the signal handler so it
  // needs an x86 prolog which returns into the current simulator
  // executing the generated catch_exception code. so the prolog
  // needs to install rax in a sim register and adjust the sim's
  // restart pc to enter the generated code at the start position
  // then return from native to simulated execution.

  // Record the incoming exception oop (r0) as the thread's pending
  // exception, note the throw location for diagnostics, and jump back
  // to the call stub's return point so the VM sees the exception.
  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread       (rfp, thread_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    // record this source file/line as the "throw" site for debugging
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!
  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog

  // Convert the thread's pending exception into a Java-level exception
  // dispatch: find the handler for the throwing pc (in LR on entry),
  // load the exception oop into r0, put the throwing pc in r3, clear
  // the pending exception, and jump to the handler.
  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then calls into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    // (the VM call above returned the handler address in r0)
    __ mov(r3, r19);   // r3 <- throwing pc (saved lr)
    __ mov(r19, r0);   // r19 <- handler address
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  // NOTE(review): the layout comment above lists more slots than the
  // code below saves (only c_rarg3/c_rarg2) — looks stale; confirm.
  address generate_verify_oop() {

    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // bump the global verify_oop counter
    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is NULL it is OK

    // Check if the oop is in the right area of memory
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
    __ andr(c_rarg2, r0, c_rarg3);
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());

    // Compare c_rarg2 and c_rarg3.  We don't use a compare
    // instruction here because the flags register is live.
    __ eor(c_rarg2, c_rarg2, c_rarg3);
    __ cbnz(c_rarg2, error);

    // make sure klass is 'reasonable', which is not zero.
    __ load_klass(r0, r0); // get klass
    __ cbz(r0, error);     // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    // save all registers so debug64 can dump them
    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blrt(rscratch1, 3, 0, 1);

    return start;
  }

  // NOTE(review): unconditionally branches to L_no_overlap — no actual
  // source/destination overlap check is performed here; confirm this
  // simplification is intentional for this port.
  void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }

  // Generate code for an array write pre barrier
  //
  //     addr       - starting address
  //     count      - element count
  //     tmp        - scratch register
  //     saved_regs - registers to be saved before calling static_write_ref_array_pre
  //
  // Callers must specify which registers to preserve in saved_regs.
  // Clobbers: r0-r18, v0-v7, v16-v31, except saved_regs.
  //
  void gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized, RegSet saved_regs) {
    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
    case BarrierSet::G1SATBCTLogging:
      // With G1, don't generate the call if we statically know that the target in uninitialized
      if (!dest_uninitialized) {
        __ push(saved_regs, sp);
        // marshal (addr, count) into (c_rarg0, c_rarg1), taking care
        // when the inputs already occupy those registers
        if (count == c_rarg0) {
          if (addr == c_rarg1) {
            // exactly backwards!!
            __ mov(rscratch1, c_rarg0);
            __ mov(c_rarg0, c_rarg1);
            __ mov(c_rarg1, rscratch1);
          } else {
            __ mov(c_rarg1, count);
            __ mov(c_rarg0, addr);
          }
        } else {
          __ mov(c_rarg0, addr);
          __ mov(c_rarg1, count);
        }
        __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2);
        __ pop(saved_regs, sp);
        break;
      // NOTE(review): the case labels below are nested inside the
      // if-block above.  This is legal C++ (switch labels may appear in
      // nested blocks) and the 'break' statements still exit the
      // switch, but the brace structure is easy to misread.
      case BarrierSet::CardTableForRS:
      case BarrierSet::CardTableExtension:
      case BarrierSet::ModRef:
        break;
      default:
        ShouldNotReachHere();

      }
    }
  }

  //
  // Generate code for an array write post barrier
  //
  //  Input:
  //     start      - register containing starting address of destination array
  //     end        - register containing ending address of destination array
  //     scratch    - scratch register
  //     saved_regs - registers to be saved before calling static_write_ref_array_post
  //
  //  The input registers are overwritten.
  //  The ending address is inclusive.
  //  Callers must specify which registers to preserve in saved_regs.
  //  Clobbers: r0-r18, v0-v7, v16-v31, except saved_regs.
  void gen_write_ref_array_post_barrier(Register start, Register end, Register scratch, RegSet saved_regs) {
    assert_different_registers(start, end, scratch);
    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
      case BarrierSet::G1SATBCTLogging:

        {
          __ push(saved_regs, sp);
          // must compute element count unless barrier set interface is changed (other platforms supply count)
          assert_different_registers(start, end, scratch);
          // scratch <- (end + oop_size - start) >> log(oop_size), i.e. element count
          __ lea(scratch, Address(end, BytesPerHeapOop));
          __ sub(scratch, scratch, start);               // subtract start to get #bytes
          __ lsr(scratch, scratch, LogBytesPerHeapOop);  // convert to element count
          __ mov(c_rarg0, start);
          __ mov(c_rarg1, scratch);
          __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
          __ pop(saved_regs, sp);
        }
        break;
      case BarrierSet::CardTableForRS:
      case BarrierSet::CardTableExtension:
        {
          CardTableModRefBS* ct = (CardTableModRefBS*)bs;
          assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");

          Label L_loop;

          // convert [start, end] addresses to card indexes, then to a
          // count of card bytes to dirty
          __ lsr(start, start, CardTableModRefBS::card_shift);
          __ lsr(end, end, CardTableModRefBS::card_shift);
          __ sub(end, end, start); // number of bytes to copy

          const Register count = end; // 'end' register contains bytes count now
          __ load_byte_map_base(scratch);
          __ add(start, start, scratch);
          if (UseConcMarkSweepGC) {
            __ membar(__ StoreStore);
          }
          // dirty each card from (start + count) down to start, inclusive
          __ BIND(L_loop);
          __ strb(zr, Address(start, count));
          __ subs(count, count, 1);
          __ br(Assembler::GE, L_loop);
        }
        break;
      default:
        ShouldNotReachHere();

    }
  }

  // The inner part of zero_words().  This is the bulk operation,
  // zeroing words in blocks, possibly using DC ZVA to do it.  The
  // caller is responsible for zeroing the last few words.
  //
  // Inputs:
  //   r10: the HeapWord-aligned base address of an array to zero.
  //   r11: the count in HeapWords, r11 > 0.
  //
  // Returns r10 and r11, adjusted for the caller to clear.
  //   r10: the base address of the tail of words left to clear.
  //   r11: the number of words in the tail.
  //        r11 < MacroAssembler::zero_words_block_size.

  address generate_zero_blocks() {
    // NOTE(review): store_pair / loop_store_pair appear unused below — confirm.
    Label store_pair, loop_store_pair, done;
    Label base_aligned;

    Register base = r10, cnt = r11;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "zero_blocks");
    address start = __ pc();

    if (UseBlockZeroing) {
      int zva_length = VM_Version::zva_length();

      // Ensure ZVA length can be divided by 16. This is required by
      // the subsequent operations.
      assert (zva_length % 16 == 0, "Unexpected ZVA Length");

      // if base is only 8-byte aligned, emit one word store to reach
      // 16-byte alignment before using DC ZVA
      __ tbz(base, 3, base_aligned);
      __ str(zr, Address(__ post(base, 8)));
      __ sub(cnt, cnt, 1);
      __ bind(base_aligned);

      // Ensure count >= zva_length * 2 so that it still deserves a zva after
      // alignment.
      Label small;
      int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
      __ cmp(cnt, low_limit >> 3);
      __ br(Assembler::LT, small);
      __ zero_dcache_blocks(base, cnt);
      __ bind(small);
    }

    {
      // Number of stp instructions we'll unroll
      const int unroll =
        MacroAssembler::zero_words_block_size / 2;
      // Clear the remaining blocks.
      Label loop;
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::LT, done);
      __ bind(loop);
      for (int i = 0; i < unroll; i++)
        __ stp(zr, zr, __ post(base, 16));
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::GE, loop);
      __ bind(done);
      // undo the last subtraction so cnt holds the true remaining count
      __ add(cnt, cnt, unroll * 2);
    }

    __ ret(lr);

    return start;
  }


  // Direction multiplier for the block-copy routines: the per-step
  // address adjustment is (units * direction).
  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
804 // 805 // Precondition: count >= 8 806 // 807 // Postconditions: 808 // 809 // The least significant bit of count contains the remaining count 810 // of words to copy. The rest of count is trash. 811 // 812 // s and d are adjusted to point to the remaining words to copy 813 // 814 void generate_copy_longs(Label &start, Register s, Register d, Register count, 815 copy_direction direction) { 816 int unit = wordSize * direction; 817 int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize; 818 819 int offset; 820 const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6, 821 t4 = r7, t5 = r10, t6 = r11, t7 = r12; 822 const Register stride = r13; 823 824 assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7); 825 assert_different_registers(s, d, count, rscratch1); 826 827 Label again, drain; 828 const char *stub_name; 829 if (direction == copy_forwards) 830 stub_name = "forward_copy_longs"; 831 else 832 stub_name = "backward_copy_longs"; 833 StubCodeMark mark(this, "StubRoutines", stub_name); 834 __ align(CodeEntryAlignment); 835 __ bind(start); 836 837 Label unaligned_copy_long; 838 if (AvoidUnalignedAccesses) { 839 __ tbnz(d, 3, unaligned_copy_long); 840 } 841 842 if (direction == copy_forwards) { 843 __ sub(s, s, bias); 844 __ sub(d, d, bias); 845 } 846 847 #ifdef ASSERT 848 // Make sure we are never given < 8 words 849 { 850 Label L; 851 __ cmp(count, 8); 852 __ br(Assembler::GE, L); 853 __ stop("genrate_copy_longs called with < 8 words"); 854 __ bind(L); 855 } 856 #endif 857 858 // Fill 8 registers 859 if (UseSIMDForMemoryOps) { 860 __ ldpq(v0, v1, Address(s, 4 * unit)); 861 __ ldpq(v2, v3, Address(__ pre(s, 8 * unit))); 862 } else { 863 __ ldp(t0, t1, Address(s, 2 * unit)); 864 __ ldp(t2, t3, Address(s, 4 * unit)); 865 __ ldp(t4, t5, Address(s, 6 * unit)); 866 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 867 } 868 869 __ subs(count, count, 16); 870 __ br(Assembler::LO, drain); 871 872 int prefetch = PrefetchCopyIntervalInBytes; 873 bool use_stride = false; 
874 if (direction == copy_backwards) { 875 use_stride = prefetch > 256; 876 prefetch = -prefetch; 877 if (use_stride) __ mov(stride, prefetch); 878 } 879 880 __ bind(again); 881 882 if (PrefetchCopyIntervalInBytes > 0) 883 __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP); 884 885 if (UseSIMDForMemoryOps) { 886 __ stpq(v0, v1, Address(d, 4 * unit)); 887 __ ldpq(v0, v1, Address(s, 4 * unit)); 888 __ stpq(v2, v3, Address(__ pre(d, 8 * unit))); 889 __ ldpq(v2, v3, Address(__ pre(s, 8 * unit))); 890 } else { 891 __ stp(t0, t1, Address(d, 2 * unit)); 892 __ ldp(t0, t1, Address(s, 2 * unit)); 893 __ stp(t2, t3, Address(d, 4 * unit)); 894 __ ldp(t2, t3, Address(s, 4 * unit)); 895 __ stp(t4, t5, Address(d, 6 * unit)); 896 __ ldp(t4, t5, Address(s, 6 * unit)); 897 __ stp(t6, t7, Address(__ pre(d, 8 * unit))); 898 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 899 } 900 901 __ subs(count, count, 8); 902 __ br(Assembler::HS, again); 903 904 // Drain 905 __ bind(drain); 906 if (UseSIMDForMemoryOps) { 907 __ stpq(v0, v1, Address(d, 4 * unit)); 908 __ stpq(v2, v3, Address(__ pre(d, 8 * unit))); 909 } else { 910 __ stp(t0, t1, Address(d, 2 * unit)); 911 __ stp(t2, t3, Address(d, 4 * unit)); 912 __ stp(t4, t5, Address(d, 6 * unit)); 913 __ stp(t6, t7, Address(__ pre(d, 8 * unit))); 914 } 915 916 { 917 Label L1, L2; 918 __ tbz(count, exact_log2(4), L1); 919 if (UseSIMDForMemoryOps) { 920 __ ldpq(v0, v1, Address(__ pre(s, 4 * unit))); 921 __ stpq(v0, v1, Address(__ pre(d, 4 * unit))); 922 } else { 923 __ ldp(t0, t1, Address(s, 2 * unit)); 924 __ ldp(t2, t3, Address(__ pre(s, 4 * unit))); 925 __ stp(t0, t1, Address(d, 2 * unit)); 926 __ stp(t2, t3, Address(__ pre(d, 4 * unit))); 927 } 928 __ bind(L1); 929 930 if (direction == copy_forwards) { 931 __ add(s, s, bias); 932 __ add(d, d, bias); 933 } 934 935 __ tbz(count, 1, L2); 936 __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards))); 937 __ stp(t0, t1, Address(__ adjust(d, 2 * unit, 
direction == copy_backwards))); 938 __ bind(L2); 939 } 940 941 __ ret(lr); 942 943 if (AvoidUnalignedAccesses) { 944 Label drain, again; 945 // Register order for storing. Order is different for backward copy. 946 947 __ bind(unaligned_copy_long); 948 949 // source address is even aligned, target odd aligned 950 // 951 // when forward copying word pairs we read long pairs at offsets 952 // {0, 2, 4, 6} (in long words). when backwards copying we read 953 // long pairs at offsets {-2, -4, -6, -8}. We adjust the source 954 // address by -2 in the forwards case so we can compute the 955 // source offsets for both as {2, 4, 6, 8} * unit where unit = 1 956 // or -1. 957 // 958 // when forward copying we need to store 1 word, 3 pairs and 959 // then 1 word at offsets {0, 1, 3, 5, 7}. Rather thna use a 960 // zero offset We adjust the destination by -1 which means we 961 // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores. 962 // 963 // When backwards copyng we need to store 1 word, 3 pairs and 964 // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use 965 // offsets {1, 3, 5, 7, 8} * unit. 966 967 if (direction == copy_forwards) { 968 __ sub(s, s, 16); 969 __ sub(d, d, 8); 970 } 971 972 // Fill 8 registers 973 // 974 // for forwards copy s was offset by -16 from the original input 975 // value of s so the register contents are at these offsets 976 // relative to the 64 bit block addressed by that original input 977 // and so on for each successive 64 byte block when s is updated 978 // 979 // t0 at offset 0, t1 at offset 8 980 // t2 at offset 16, t3 at offset 24 981 // t4 at offset 32, t5 at offset 40 982 // t6 at offset 48, t7 at offset 56 983 984 // for backwards copy s was not offset so the register contents 985 // are at these offsets into the preceding 64 byte block 986 // relative to that original input and so on for each successive 987 // preceding 64 byte block when s is updated. 
this explains the 988 // slightly counter-intuitive looking pattern of register usage 989 // in the stp instructions for backwards copy. 990 // 991 // t0 at offset -16, t1 at offset -8 992 // t2 at offset -32, t3 at offset -24 993 // t4 at offset -48, t5 at offset -40 994 // t6 at offset -64, t7 at offset -56 995 996 __ ldp(t0, t1, Address(s, 2 * unit)); 997 __ ldp(t2, t3, Address(s, 4 * unit)); 998 __ ldp(t4, t5, Address(s, 6 * unit)); 999 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 1000 1001 __ subs(count, count, 16); 1002 __ br(Assembler::LO, drain); 1003 1004 int prefetch = PrefetchCopyIntervalInBytes; 1005 bool use_stride = false; 1006 if (direction == copy_backwards) { 1007 use_stride = prefetch > 256; 1008 prefetch = -prefetch; 1009 if (use_stride) __ mov(stride, prefetch); 1010 } 1011 1012 __ bind(again); 1013 1014 if (PrefetchCopyIntervalInBytes > 0) 1015 __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP); 1016 1017 if (direction == copy_forwards) { 1018 // allowing for the offset of -8 the store instructions place 1019 // registers into the target 64 bit block at the following 1020 // offsets 1021 // 1022 // t0 at offset 0 1023 // t1 at offset 8, t2 at offset 16 1024 // t3 at offset 24, t4 at offset 32 1025 // t5 at offset 40, t6 at offset 48 1026 // t7 at offset 56 1027 1028 __ str(t0, Address(d, 1 * unit)); 1029 __ stp(t1, t2, Address(d, 2 * unit)); 1030 __ ldp(t0, t1, Address(s, 2 * unit)); 1031 __ stp(t3, t4, Address(d, 4 * unit)); 1032 __ ldp(t2, t3, Address(s, 4 * unit)); 1033 __ stp(t5, t6, Address(d, 6 * unit)); 1034 __ ldp(t4, t5, Address(s, 6 * unit)); 1035 __ str(t7, Address(__ pre(d, 8 * unit))); 1036 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 1037 } else { 1038 // d was not offset when we started so the registers are 1039 // written into the 64 bit block preceding d with the following 1040 // offsets 1041 // 1042 // t1 at offset -8 1043 // t3 at offset -24, t0 at offset -16 1044 // t5 at offset -48, t2 at offset 
-32 1045 // t7 at offset -56, t4 at offset -48 1046 // t6 at offset -64 1047 // 1048 // note that this matches the offsets previously noted for the 1049 // loads 1050 1051 __ str(t1, Address(d, 1 * unit)); 1052 __ stp(t3, t0, Address(d, 3 * unit)); 1053 __ ldp(t0, t1, Address(s, 2 * unit)); 1054 __ stp(t5, t2, Address(d, 5 * unit)); 1055 __ ldp(t2, t3, Address(s, 4 * unit)); 1056 __ stp(t7, t4, Address(d, 7 * unit)); 1057 __ ldp(t4, t5, Address(s, 6 * unit)); 1058 __ str(t6, Address(__ pre(d, 8 * unit))); 1059 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 1060 } 1061 1062 __ subs(count, count, 8); 1063 __ br(Assembler::HS, again); 1064 1065 // Drain 1066 // 1067 // this uses the same pattern of offsets and register arguments 1068 // as above 1069 __ bind(drain); 1070 if (direction == copy_forwards) { 1071 __ str(t0, Address(d, 1 * unit)); 1072 __ stp(t1, t2, Address(d, 2 * unit)); 1073 __ stp(t3, t4, Address(d, 4 * unit)); 1074 __ stp(t5, t6, Address(d, 6 * unit)); 1075 __ str(t7, Address(__ pre(d, 8 * unit))); 1076 } else { 1077 __ str(t1, Address(d, 1 * unit)); 1078 __ stp(t3, t0, Address(d, 3 * unit)); 1079 __ stp(t5, t2, Address(d, 5 * unit)); 1080 __ stp(t7, t4, Address(d, 7 * unit)); 1081 __ str(t6, Address(__ pre(d, 8 * unit))); 1082 } 1083 // now we need to copy any remaining part block which may 1084 // include a 4 word block subblock and/or a 2 word subblock. 
1085 // bits 2 and 1 in the count are the tell-tale for whetehr we 1086 // have each such subblock 1087 { 1088 Label L1, L2; 1089 __ tbz(count, exact_log2(4), L1); 1090 // this is the same as above but copying only 4 longs hence 1091 // with ony one intervening stp between the str instructions 1092 // but note that the offsets and registers still follow the 1093 // same pattern 1094 __ ldp(t0, t1, Address(s, 2 * unit)); 1095 __ ldp(t2, t3, Address(__ pre(s, 4 * unit))); 1096 if (direction == copy_forwards) { 1097 __ str(t0, Address(d, 1 * unit)); 1098 __ stp(t1, t2, Address(d, 2 * unit)); 1099 __ str(t3, Address(__ pre(d, 4 * unit))); 1100 } else { 1101 __ str(t1, Address(d, 1 * unit)); 1102 __ stp(t3, t0, Address(d, 3 * unit)); 1103 __ str(t2, Address(__ pre(d, 4 * unit))); 1104 } 1105 __ bind(L1); 1106 1107 __ tbz(count, 1, L2); 1108 // this is the same as above but copying only 2 longs hence 1109 // there is no intervening stp between the str instructions 1110 // but note that the offset and register patterns are still 1111 // the same 1112 __ ldp(t0, t1, Address(__ pre(s, 2 * unit))); 1113 if (direction == copy_forwards) { 1114 __ str(t0, Address(d, 1 * unit)); 1115 __ str(t1, Address(__ pre(d, 2 * unit))); 1116 } else { 1117 __ str(t1, Address(d, 1 * unit)); 1118 __ str(t0, Address(__ pre(d, 2 * unit))); 1119 } 1120 __ bind(L2); 1121 1122 // for forwards copy we need to re-adjust the offsets we 1123 // applied so that s and d are follow the last words written 1124 1125 if (direction == copy_forwards) { 1126 __ add(s, s, 16); 1127 __ add(d, d, 8); 1128 } 1129 1130 } 1131 1132 __ ret(lr); 1133 } 1134 } 1135 1136 // Small copy: less than 16 bytes. 1137 // 1138 // NB: Ignores all of the bits of count which represent more than 15 1139 // bytes, so a caller doesn't have to mask them. 
// Emit the tail copy for at most 15 remaining bytes: one conditional move
// per power-of-two size.  Bit (3 - log2(granularity)) of count selects an
// 8-byte move, the next lower bit a 4-byte move, and so on down to a single
// byte; higher bits of count are ignored (see the header comment above).
//
//   s, d  - source/destination addresses, advanced in the copy direction
//   count - element count; only the low "remainder" bits are examined
//   tmp   - scratch register carrying the data being moved
//   step  - element size in bytes, negative for a backwards copy
void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
  bool is_backwards = step < 0;
  size_t granularity = uabs(step);
  int direction = is_backwards ? -1 : 1;
  int unit = wordSize * direction;   // signed byte stride for one full word

  // NOTE(review): Lpair is declared but never bound or branched to.
  Label Lpair, Lword, Lint, Lshort, Lbyte;

  assert(granularity
         && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

  // NOTE(review): t0..t3 appear unused in this routine (tmp does the work).
  const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;

  // ??? I don't know if this bit-test-and-branch is the right thing
  // to do.  It does a lot of jumping, resulting in several
  // mispredicted branches.  It might make more sense to do this
  // with something like Duff's device with a single computed branch.

  // One 8-byte chunk, if the corresponding count bit is set.
  __ tbz(count, 3 - exact_log2(granularity), Lword);
  __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
  __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
  __ bind(Lword);

  // One 4-byte chunk (only reachable when elements are <= 4 bytes).
  if (granularity <= sizeof (jint)) {
    __ tbz(count, 2 - exact_log2(granularity), Lint);
    __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
    __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
    __ bind(Lint);
  }

  // One 2-byte chunk (only reachable when elements are <= 2 bytes).
  if (granularity <= sizeof (jshort)) {
    __ tbz(count, 1 - exact_log2(granularity), Lshort);
    __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
    __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
    __ bind(Lshort);
  }

  // Final odd byte (byte arrays only).
  if (granularity <= sizeof (jbyte)) {
    __ tbz(count, 0, Lbyte);
    __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
    __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
    __ bind(Lbyte);
  }
}

// Entry labels of the pre-generated bulk word-copy stubs (forwards and
// backwards), called from copy_memory below.
Label copy_f, copy_b;

// All-singing all-dancing memory copy.
//
// Copy count units of memory from s to d.
// The size of a unit is step, which can be positive or negative depending
// on the direction of copy.  If is_aligned is false, we align the source
// address.
//
// Small totals (<= 80 bytes, or 96 with SIMD) are copied inline with
// straight-line loads/stores; larger blocks branch to copy_big, which
// aligns s on a 2-word boundary and calls the bulk word-copy stub
// (copy_f/copy_b), then copies the tail with copy_memory_small.
void copy_memory(bool is_aligned, Register s, Register d,
                 Register count, Register tmp, int step) {
  copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
  bool is_backwards = step < 0;
  int granularity = uabs(step);  // element size in bytes
  const Register t0 = r3, t1 = r4;

  // <= 96 bytes do inline. Direction doesn't matter because we always
  // load all the data before writing anything
  // NOTE(review): copy128 is declared but never bound or branched to.
  Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
  const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
  const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
  // send/dend are one-past-the-end pointers of source and destination.
  const Register send = r17, dend = r18;

  if (PrefetchCopyIntervalInBytes > 0)
    __ prfm(Address(s, 0), PLDL1KEEP);
  __ cmp(count, (UseSIMDForMemoryOps ? 96:80)/granularity);
  __ br(Assembler::HI, copy_big);

  __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
  __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

  __ cmp(count, 16/granularity);
  __ br(Assembler::LS, copy16);

  __ cmp(count, 64/granularity);
  __ br(Assembler::HI, copy80);

  __ cmp(count, 32/granularity);
  __ br(Assembler::LS, copy32);

  // 33..64 bytes: copy the first and last 32 bytes; the two ranges may
  // overlap in the middle, which is harmless since all loads precede all
  // stores.
  if (UseSIMDForMemoryOps) {
    __ ldpq(v0, v1, Address(s, 0));
    __ ldpq(v2, v3, Address(send, -32));
    __ stpq(v0, v1, Address(d, 0));
    __ stpq(v2, v3, Address(dend, -32));
  } else {
    __ ldp(t0, t1, Address(s, 0));
    __ ldp(t2, t3, Address(s, 16));
    __ ldp(t4, t5, Address(send, -32));
    __ ldp(t6, t7, Address(send, -16));

    __ stp(t0, t1, Address(d, 0));
    __ stp(t2, t3, Address(d, 16));
    __ stp(t4, t5, Address(dend, -32));
    __ stp(t6, t7, Address(dend, -16));
  }
  __ b(finish);

  // 17..32 bytes: first 16 plus last 16, possibly overlapping.
  __ bind(copy32);
  __ ldp(t0, t1, Address(s, 0));
  __ ldp(t2, t3, Address(send, -16));
  __ stp(t0, t1, Address(d, 0));
  __ stp(t2, t3, Address(dend, -16));
  __ b(finish);

  // 65..80/96 bytes
  // (96 bytes if SIMD because we do 32 bytes per instruction)
  __ bind(copy80);
  if (UseSIMDForMemoryOps) {
    __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
    __ ldpq(v4, v5, Address(send, -32));
    __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
    __ stpq(v4, v5, Address(dend, -32));
  } else {
    __ ldp(t0, t1, Address(s, 0));
    __ ldp(t2, t3, Address(s, 16));
    __ ldp(t4, t5, Address(s, 32));
    __ ldp(t6, t7, Address(s, 48));
    __ ldp(t8, t9, Address(send, -16));

    __ stp(t0, t1, Address(d, 0));
    __ stp(t2, t3, Address(d, 16));
    __ stp(t4, t5, Address(d, 32));
    __ stp(t6, t7, Address(d, 48));
    __ stp(t8, t9, Address(dend, -16));
  }
  __ b(finish);

  // 0..16 bytes
  __ bind(copy16);
  __ cmp(count, 8/granularity);
  __ br(Assembler::LO, copy8);

  // 8..16 bytes: first 8 plus last 8, possibly overlapping.
  __ ldr(t0, Address(s, 0));
  __ ldr(t1, Address(send, -8));
  __ str(t0, Address(d, 0));
  __ str(t1, Address(dend, -8));
  __ b(finish);

  if (granularity < 8) {
    // 4..7 bytes
    __ bind(copy8);
    __ tbz(count, 2 - exact_log2(granularity), copy4);
    __ ldrw(t0, Address(s, 0));
    __ ldrw(t1, Address(send, -4));
    __ strw(t0, Address(d, 0));
    __ strw(t1, Address(dend, -4));
    __ b(finish);
    if (granularity < 4) {
      // 0..3 bytes
      __ bind(copy4);
      __ cbz(count, finish); // get rid of 0 case
      if (granularity == 2) {
        __ ldrh(t0, Address(s, 0));
        __ strh(t0, Address(d, 0));
      } else { // granularity == 1
        // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
        // the first and last byte.
        // Handle the 3 byte case by loading and storing base + count/2
        // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
        // This does mean in the 1 byte case we load/store the same
        // byte 3 times.
        __ lsr(count, count, 1);
        __ ldrb(t0, Address(s, 0));
        __ ldrb(t1, Address(send, -1));
        __ ldrb(t2, Address(s, count));
        __ strb(t0, Address(d, 0));
        __ strb(t1, Address(dend, -1));
        __ strb(t2, Address(d, count));
      }
      __ b(finish);
    }
  }

  // More than 80 (or 96) bytes: align, bulk-copy, then handle the tail.
  __ bind(copy_big);
  if (is_backwards) {
    // Backwards copies run from the high end, so point s/d past the end.
    __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
    __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
  }

  // Now we've got the small case out of the way we can align the
  // source address on a 2-word boundary.

  Label aligned;

  if (is_aligned) {
    // We may have to adjust by 1 word to get s 2-word-aligned.
    __ tbz(s, exact_log2(wordSize), aligned);
    __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
    __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
    __ sub(count, count, wordSize/granularity);
  } else {
    if (is_backwards) {
      __ andr(rscratch2, s, 2 * wordSize - 1);
    } else {
      __ neg(rscratch2, s);
      __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
    }
    // rscratch2 is the byte adjustment needed to align s.
    __ cbz(rscratch2, aligned);
    int shift = exact_log2(granularity);
    // Convert the byte adjustment into an element count.
    if (shift)  __ lsr(rscratch2, rscratch2, shift);
    __ sub(count, count, rscratch2);

#if 0
    // ?? This code is only correct for a disjoint copy.  It may or
    // may not make sense to use it in that case.

    // Copy the first pair; s and d may not be aligned.
    __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
    __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

    // Align s and d, adjust count
    if (is_backwards) {
      __ sub(s, s, rscratch2);
      __ sub(d, d, rscratch2);
    } else {
      __ add(s, s, rscratch2);
      __ add(d, d, rscratch2);
    }
#else
    // Copy the misaligned head element-by-element (overlap-safe).
    copy_memory_small(s, d, rscratch2, rscratch1, step);
#endif
  }

  __ bind(aligned);

  // s is now 2-word-aligned.

  // We have a count of units and some trailing bytes.  Adjust the
  // count and do a bulk copy of words.
  // NOTE(review): presumably copy_f/copy_b take the word count in
  // rscratch2 -- confirm against the generator that binds those labels.
  __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
  if (direction == copy_forwards)
    __ bl(copy_f);
  else
    __ bl(copy_b);

  // And the tail.
  copy_memory_small(s, d, count, tmp, step);

  // Bind any labels not consumed above so the assembler sees no dangling
  // (unbound) labels for the large-granularity cases -- TODO confirm.
  if (granularity >= 8) __ bind(copy8);
  if (granularity >= 4) __ bind(copy4);
  __ bind(finish);
}


// Fill r3..r18 (except rscratch1) with 0xdeadbeefdeadbeef in debug builds
// so stale register contents are easy to spot; no-op in product builds.
void clobber_registers() {
#ifdef ASSERT
  __ mov(rscratch1, (uint64_t)0xdeadbeef);
  __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
  for (Register r = r3; r <= r18; r++)
    if (r != rscratch1) __ mov(r, rscratch1);
#endif
}

// Scan over array at a for count oops, verifying each one.
// Preserves a and count, clobbers rscratch1 and rscratch2.
void verify_oop_array (size_t size, Register a, Register count, Register temp) {
  Label loop, end;
  __ mov(rscratch1, a);   // NOTE(review): rscratch1 is written but not read below
  __ mov(rscratch2, zr);  // rscratch2 is the element index
  __ bind(loop);
  __ cmp(rscratch2, count);
  __ br(Assembler::HS, end);
  if (size == (size_t)wordSize) {
    // Uncompressed oops: load the element and verify it directly.
    __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
    __ verify_oop(temp);
  } else {
    // Compressed oops: the narrow oop is loaded into r16.  Callers pass
    // temp == r16 (see generate_disjoint_copy/generate_conjoint_copy),
    // so decode_heap_oop(temp) decodes the value just loaded -- a
    // fragile register coupling worth keeping in mind.
    __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
    __ decode_heap_oop(temp); // calls verify_oop
  }
  __ add(rscratch2, rscratch2, size);
  __ b(loop);
  __ bind(end);
}

// Arguments:
//   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
//             ignored
//   is_oop  - true => oop array, so generate store check code
//   name    - stub name string
//
// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as ssize_t, can be zero
//
// If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
// the hardware handle it.  The two dwords within qwords that span
// cache line boundaries will still be loaded and stored atomically.
//
// Side Effects:
//   disjoint_int_copy_entry is set to the no-overlap entry point
//   used by generate_conjoint_int_oop_copy().
//
address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
                               const char *name, bool dest_uninitialized = false) {
  Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
  RegSet saved_reg = RegSet::of(s, d, count);
  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", name);
  address start = __ pc();
  __ enter();

  if (entry != NULL) {
    *entry = __ pc();
    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  if (is_oop) {
    gen_write_ref_array_pre_barrier(d, count, dest_uninitialized, saved_reg);
    // save regs before copy_memory
    __ push(RegSet::of(d, count), sp);
  }
  // Positive step => forwards copy; disjoint arrays never need backwards.
  copy_memory(aligned, s, d, count, rscratch1, size);
  if (is_oop) {
    __ pop(RegSet::of(d, count), sp);
    if (VerifyOops)
      verify_oop_array(size, d, count, r16);
    __ sub(count, count, 1); // make an inclusive end pointer
    __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
    gen_write_ref_array_post_barrier(d, count, rscratch1, RegSet());
  }
  __ leave();
  __ mov(r0, zr); // return 0
  __ ret(lr);
#ifdef BUILTIN_SIM
  {
    // Tell the simulator about the freshly generated stub.
    AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
    sim->notifyCompile(const_cast<char*>(name), start);
  }
#endif
  return start;
}

// Arguments:
//   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
//             ignored
//   is_oop  - true => oop array, so generate store check code
//   name    - stub name string
//
// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as ssize_t, can be zero
//
// If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
// the hardware handle it.
// The two dwords within qwords that span
// cache line boundaries will still be loaded and stored atomically.
//
address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
                               address *entry, const char *name,
                               bool dest_uninitialized = false) {
  Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
  RegSet saved_regs = RegSet::of(s, d, count);
  StubCodeMark mark(this, "StubRoutines", name);
  address start = __ pc();
  __ enter();

  if (entry != NULL) {
    *entry = __ pc();
    // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
    BLOCK_COMMENT("Entry:");
  }

  // use fwd copy when (d-s) above_equal (count*size)
  __ sub(rscratch1, d, s);
  __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
  __ br(Assembler::HS, nooverlap_target);

  if (is_oop) {
    gen_write_ref_array_pre_barrier(d, count, dest_uninitialized, saved_regs);
    // save regs before copy_memory
    __ push(RegSet::of(d, count), sp);
  }
  // We only fall through here when d - s < count*size (unsigned), i.e. the
  // destination overlaps the source from above; the negative step selects
  // a backwards copy, which is the overlap-safe direction for that case.
  copy_memory(aligned, s, d, count, rscratch1, -size);
  if (is_oop) {
    __ pop(RegSet::of(d, count), sp);
    if (VerifyOops)
      verify_oop_array(size, d, count, r16);
    __ sub(count, count, 1); // make an inclusive end pointer
    __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
    gen_write_ref_array_post_barrier(d, count, rscratch1, RegSet());
  }
  __ leave();
  __ mov(r0, zr); // return 0
  __ ret(lr);
#ifdef BUILTIN_SIM
  {
    // Tell the simulator about the freshly generated stub.
    AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
    sim->notifyCompile(const_cast<char*>(name), start);
  }
#endif
  return start;
}

// Arguments:
//   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
//             ignored
//   name    - stub name string
//
// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as ssize_t, can be zero
//
// If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
// we let the hardware handle it.  The one to eight bytes within words,
// dwords or qwords that span cache line boundaries will still be loaded
// and stored atomically.
//
// Side Effects:
//   disjoint_byte_copy_entry is set to the no-overlap entry point
//   used by generate_conjoint_byte_copy().
//
address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
  const bool not_oop = false;
  return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
}

// Arguments:
//   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
//             ignored
//   name    - stub name string
//
// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as ssize_t, can be zero
//
// If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
// we let the hardware handle it.  The one to eight bytes within words,
// dwords or qwords that span cache line boundaries will still be loaded
// and stored atomically.
1591 // 1592 address generate_conjoint_byte_copy(bool aligned, address nooverlap_target, 1593 address* entry, const char *name) { 1594 const bool not_oop = false; 1595 return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name); 1596 } 1597 1598 // Arguments: 1599 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1600 // ignored 1601 // name - stub name string 1602 // 1603 // Inputs: 1604 // c_rarg0 - source array address 1605 // c_rarg1 - destination array address 1606 // c_rarg2 - element count, treated as ssize_t, can be zero 1607 // 1608 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1609 // let the hardware handle it. The two or four words within dwords 1610 // or qwords that span cache line boundaries will still be loaded 1611 // and stored atomically. 1612 // 1613 // Side Effects: 1614 // disjoint_short_copy_entry is set to the no-overlap entry point 1615 // used by generate_conjoint_short_copy(). 1616 // 1617 address generate_disjoint_short_copy(bool aligned, 1618 address* entry, const char *name) { 1619 const bool not_oop = false; 1620 return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name); 1621 } 1622 1623 // Arguments: 1624 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1625 // ignored 1626 // name - stub name string 1627 // 1628 // Inputs: 1629 // c_rarg0 - source array address 1630 // c_rarg1 - destination array address 1631 // c_rarg2 - element count, treated as ssize_t, can be zero 1632 // 1633 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1634 // let the hardware handle it. The two or four words within dwords 1635 // or qwords that span cache line boundaries will still be loaded 1636 // and stored atomically. 
1637 // 1638 address generate_conjoint_short_copy(bool aligned, address nooverlap_target, 1639 address *entry, const char *name) { 1640 const bool not_oop = false; 1641 return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name); 1642 1643 } 1644 // Arguments: 1645 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1646 // ignored 1647 // name - stub name string 1648 // 1649 // Inputs: 1650 // c_rarg0 - source array address 1651 // c_rarg1 - destination array address 1652 // c_rarg2 - element count, treated as ssize_t, can be zero 1653 // 1654 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1655 // the hardware handle it. The two dwords within qwords that span 1656 // cache line boundaries will still be loaded and stored atomicly. 1657 // 1658 // Side Effects: 1659 // disjoint_int_copy_entry is set to the no-overlap entry point 1660 // used by generate_conjoint_int_oop_copy(). 1661 // 1662 address generate_disjoint_int_copy(bool aligned, address *entry, 1663 const char *name, bool dest_uninitialized = false) { 1664 const bool not_oop = false; 1665 return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name); 1666 } 1667 1668 // Arguments: 1669 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1670 // ignored 1671 // name - stub name string 1672 // 1673 // Inputs: 1674 // c_rarg0 - source array address 1675 // c_rarg1 - destination array address 1676 // c_rarg2 - element count, treated as ssize_t, can be zero 1677 // 1678 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1679 // the hardware handle it. The two dwords within qwords that span 1680 // cache line boundaries will still be loaded and stored atomicly. 
1681 // 1682 address generate_conjoint_int_copy(bool aligned, address nooverlap_target, 1683 address *entry, const char *name, 1684 bool dest_uninitialized = false) { 1685 const bool not_oop = false; 1686 return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name); 1687 } 1688 1689 1690 // Arguments: 1691 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1692 // ignored 1693 // name - stub name string 1694 // 1695 // Inputs: 1696 // c_rarg0 - source array address 1697 // c_rarg1 - destination array address 1698 // c_rarg2 - element count, treated as size_t, can be zero 1699 // 1700 // Side Effects: 1701 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1702 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1703 // 1704 address generate_disjoint_long_copy(bool aligned, address *entry, 1705 const char *name, bool dest_uninitialized = false) { 1706 const bool not_oop = false; 1707 return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name); 1708 } 1709 1710 // Arguments: 1711 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1712 // ignored 1713 // name - stub name string 1714 // 1715 // Inputs: 1716 // c_rarg0 - source array address 1717 // c_rarg1 - destination array address 1718 // c_rarg2 - element count, treated as size_t, can be zero 1719 // 1720 address generate_conjoint_long_copy(bool aligned, 1721 address nooverlap_target, address *entry, 1722 const char *name, bool dest_uninitialized = false) { 1723 const bool not_oop = false; 1724 return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name); 1725 } 1726 1727 // Arguments: 1728 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1729 // ignored 1730 // name - stub name string 1731 // 1732 // Inputs: 1733 // c_rarg0 - source array address 1734 // c_rarg1 - destination array address 1735 // c_rarg2 - 
//   c_rarg2   - element count, treated as size_t, can be zero
//
// Side Effects:
//   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
//   no-overlap entry point used by generate_conjoint_long_oop_copy().
//
address generate_disjoint_oop_copy(bool aligned, address *entry,
                                   const char *name, bool dest_uninitialized) {
  const bool is_oop = true;
  // Element size depends on the oop encoding in use.
  const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
  return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
}

// Arguments:
//   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
//             ignored
//   name    - stub name string
//
// Inputs:
//   c_rarg0   - source array address
//   c_rarg1   - destination array address
//   c_rarg2   - element count, treated as size_t, can be zero
//
address generate_conjoint_oop_copy(bool aligned,
                                   address nooverlap_target, address *entry,
                                   const char *name, bool dest_uninitialized) {
  const bool is_oop = true;
  // Element size depends on the oop encoding in use.
  const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
  return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
                                name, dest_uninitialized);
}


// Helper for generating a dynamic type check.
// Smashes rscratch1.
// Branches to L_success if sub_klass is a subtype of super_klass;
// falls through (past L_miss) on failure.
void generate_type_check(Register sub_klass,
                         Register super_check_offset,
                         Register super_klass,
                         Label& L_success) {
  assert_different_registers(sub_klass, super_check_offset, super_klass);

  BLOCK_COMMENT("type_check:");

  Label L_miss;

  __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
                                   super_check_offset);
  __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);

  // Fall through on failure!
  __ BIND(L_miss);
}

//
//  Generate checkcasting array copy stub
//
//  Input:
//    c_rarg0   - source array address
//    c_rarg1   - destination array address
//    c_rarg2   - element count, treated as ssize_t, can be zero
//    c_rarg3   - size_t ckoff (super_check_offset)
//    c_rarg4   - oop ckval (super_klass)
//
//  Output:
//    r0 ==  0  -  success
//    r0 == -1^K - failure, where K is partial transfer count
//
address generate_checkcast_copy(const char *name, address *entry,
                                bool dest_uninitialized = false) {

  Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;

  // Input registers (after setup_arg_regs)
  const Register from        = c_rarg0;   // source array address
  const Register to          = c_rarg1;   // destination array address
  const Register count       = c_rarg2;   // elements count
  const Register ckoff       = c_rarg3;   // super_check_offset
  const Register ckval       = c_rarg4;   // super_klass

  RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
  RegSet wb_post_saved_regs = RegSet::of(count);

  // Registers used as temps (r18, r19, r20 are save-on-entry)
  const Register count_save  = r21;       // orig elements count
  const Register start_to    = r20;       // destination array start address
  const Register copied_oop  = r18;       // actual oop copied
  const Register r19_klass   = r19;       // oop._klass

  //---------------------------------------------------------------
  // Assembler stub will be used for this call to arraycopy
  // if the two arrays are subtypes of Object[] but the
  // destination array type is not equal to or a supertype
  // of the source type.  Each element must be separately
  // checked.

  assert_different_registers(from, to, count, ckoff, ckval, start_to,
                             copied_oop, r19_klass, count_save);

  __ align(CodeEntryAlignment);
  StubCodeMark mark(this, "StubRoutines", name);
  address start = __ pc();

  __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef ASSERT
  // caller guarantees that the arrays really are different
  // otherwise, we would have to make conjoint checks
  { Label L;
    array_overlap_test(L, TIMES_OOP);
    __ stop("checkcast_copy within a single array");
    __ bind(L);
  }
#endif //ASSERT

  // Caller of this entry point must set up the argument registers.
  if (entry != NULL) {
    *entry = __ pc();
    BLOCK_COMMENT("Entry:");
  }

  // Empty array:  Nothing to do.
  __ cbz(count, L_done);

  __ push(RegSet::of(r18, r19, r20, r21), sp);

#ifdef ASSERT
  BLOCK_COMMENT("assert consistent ckoff/ckval");
  // The ckoff and ckval must be mutually consistent,
  // even though caller generates both.
  { Label L;
    int sco_offset = in_bytes(Klass::super_check_offset_offset());
    __ ldrw(start_to, Address(ckval, sco_offset));
    __ cmpw(ckoff, start_to);
    __ br(Assembler::EQ, L);
    __ stop("super_check_offset inconsistent");
    __ bind(L);
  }
#endif //ASSERT

  gen_write_ref_array_pre_barrier(to, count, dest_uninitialized, wb_pre_saved_regs);

  // save the original count
  __ mov(count_save, count);

  // Copy from low to high addresses
  __ mov(start_to, to);              // Save destination array start address
  __ b(L_load_element);

  // ======== begin loop ========
  // (Loop is rotated; its entry is L_load_element.)
  // Loop control:
  //   for (; count != 0; count--) {
  //     copied_oop = load_heap_oop(from++);
  //     ... generate_type_check ...;
  //     store_heap_oop(to++, copied_oop);
  //   }
  __ align(OptoLoopAlignment);

  __ BIND(L_store_element);
  __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop);  // store the oop
  __ sub(count, count, 1);
  __ cbz(count, L_do_card_marks);

  // ======== loop entry is here ========
  __ BIND(L_load_element);
  __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop
  // A null element needs no type check; store it directly.
  __ cbz(copied_oop, L_store_element);

  __ load_klass(r19_klass, copied_oop);// query the object klass
  generate_type_check(r19_klass, ckoff, ckval, L_store_element);
  // ======== end loop ========

  // It was a real error; we must depend on the caller to finish the job.
  // Register count = remaining oops, count_orig = total oops.
  // Emit GC store barriers for the oops we have copied and report
  // their number to the caller.

  __ subs(count, count_save, count);     // K = partially copied oop count
  __ eon(count, count, zr);                   // report (-1^K) to caller
  // Flags are still those of the subs above: EQ means K == 0, i.e. no
  // oops were copied, so the card marks can be skipped entirely.
  __ br(Assembler::EQ, L_done_pop);

  __ BIND(L_do_card_marks);
  __ add(to, to, -heapOopSize);         // make an inclusive end pointer
  gen_write_ref_array_post_barrier(start_to, to, rscratch1, wb_post_saved_regs);

  __ bind(L_done_pop);
  __ pop(RegSet::of(r18, r19, r20, r21), sp);
  inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);

  __ bind(L_done);
  // r0 is 0 on full success, or -1^K after a partial transfer (see above).
  __ mov(r0, count);
  __ leave();
  __ ret(lr);

  return start;
}

// Perform range checks on the proposed arraycopy.
// Kills temp, but nothing else.
// Also, clean the sign bits of src_pos and dst_pos.
// Bounds checks shared by the arraycopy stubs; fails to L_failed on overrun.
  void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
                              Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
                              Register dst_pos, // destination position (c_rarg3)
                              Register length,
                              Register temp,
                              Label& L_failed) {
    BLOCK_COMMENT("arraycopy_range_checks:");

    assert_different_registers(rscratch1, temp);

    //  if (src_pos + length > arrayOop(src)->length())  FAIL;
    __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, src_pos);
    __ cmpw(temp, rscratch1);
    // Unsigned compare (HI) also catches a negative src_pos + length sum.
    __ br(Assembler::HI, L_failed);

    //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
    __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, dst_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
    // A 32-bit register write zero-extends, clearing the upper 32 bits.
    __ movw(src_pos, src_pos);
    __ movw(dst_pos, dst_pos);

    BLOCK_COMMENT("arraycopy_range_checks done");
  }

  // These stubs get called from some dumb test routine.
  // I'll write them properly when they're called from
  // something that's actually doing something.
  static void fake_arraycopy_stub(address src, address dst, int count) {
    assert(count == 0, "huh?");
  }


  //
  //  Generate 'unsafe' array copy stub
  //  Though just as safe as the other stubs, it takes an unscaled
  //  size_t argument instead of an element count.
  //
  //  Input:
  //    c_rarg0   - source array address
  //    c_rarg1   - destination array address
  //    c_rarg2   - byte count, treated as ssize_t, can be zero
  //
  //  Examines the alignment of the operands and dispatches
  //  to a long, int, short, or byte copy loop.
//
  address generate_unsafe_copy(const char *name,
                               address byte_copy_entry,
                               address short_copy_entry,
                               address int_copy_entry,
                               address long_copy_entry) {
    Label L_long_aligned, L_int_aligned, L_short_aligned;
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // bump this on entry, not on exit:
    inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr);

    // Combine src, dst and count: any misalignment bit set in any of the
    // three shows up in the OR, so one test covers all operands.
    __ orr(rscratch1, s, d);
    __ orr(rscratch1, rscratch1, count);

    __ andr(rscratch1, rscratch1, BytesPerLong-1);
    __ cbz(rscratch1, L_long_aligned);
    __ andr(rscratch1, rscratch1, BytesPerInt-1);
    __ cbz(rscratch1, L_int_aligned);
    __ tbz(rscratch1, 0, L_short_aligned);
    // Odd byte count/address: fall back to the byte copy stub.
    __ b(RuntimeAddress(byte_copy_entry));

    // Tail-call the element copy stubs; each first scales the byte count
    // down to an element count.
    __ BIND(L_short_aligned);
    __ lsr(count, count, LogBytesPerShort);  // size => short_count
    __ b(RuntimeAddress(short_copy_entry));
    __ BIND(L_int_aligned);
    __ lsr(count, count, LogBytesPerInt);    // size => int_count
    __ b(RuntimeAddress(int_copy_entry));
    __ BIND(L_long_aligned);
    __ lsr(count, count, LogBytesPerLong);   // size => long_count
    __ b(RuntimeAddress(long_copy_entry));

    return start;
  }

  //
  //  Generate generic array copy stubs
  //
  //  Input:
  //    c_rarg0    -  src oop
  //    c_rarg1    -  src_pos (32-bits)
  //    c_rarg2    -  dst oop
  //    c_rarg3    -  dst_pos (32-bits)
  //    c_rarg4    -  element count (32-bits)
  //
  //  Output:
  //    r0 ==  0  -  success
  //    r0 == -1^K - failure, where K is partial transfer count
  //
  address generate_generic_copy(const char *name,
                                address byte_copy_entry, address short_copy_entry,
                                address int_copy_entry, address oop_copy_entry,
                                address long_copy_entry, address checkcast_copy_entry) {
// (body of generate_generic_copy)

    Label L_failed, L_failed_0, L_objArray;
    Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;

    // Input registers
    const Register src        = c_rarg0;  // source array oop
    const Register src_pos    = c_rarg1;  // source position
    const Register dst        = c_rarg2;  // destination array oop
    const Register dst_pos    = c_rarg3;  // destination position
    const Register length     = c_rarg4;

    StubCodeMark mark(this, "StubRoutines", name);

    __ align(CodeEntryAlignment);
    address start = __ pc();

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // bump this on entry, not on exit:
    inc_counter_np(SharedRuntime::_generic_array_copy_ctr);

    //-----------------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the following conditions are met:
    //
    // (1) src and dst must not be null.
    // (2) src_pos must not be negative.
    // (3) dst_pos must not be negative.
    // (4) length  must not be negative.
    // (5) src klass and dst klass should be the same and not NULL.
    // (6) src and dst should be arrays.
    // (7) src_pos + length must not exceed length of src.
    // (8) dst_pos + length must not exceed length of dst.
    //

    //  if (src == NULL) return -1;
    __ cbz(src, L_failed);

    //  if (src_pos < 0) return -1;
    __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set

    //  if (dst == NULL) return -1;
    __ cbz(dst, L_failed);

    //  if (dst_pos < 0) return -1;
    __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set

    // registers used as temp
    const Register scratch_length    = r16; // elements count to copy
    const Register scratch_src_klass = r17; // array klass
    const Register lh                = r18; // layout helper

    //  if (length < 0) return -1;
    __ movw(scratch_length, length);        // length (elements count, 32-bits value)
    __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set

    __ load_klass(scratch_src_klass, src);
#ifdef ASSERT
    //  assert(src->klass() != NULL);
    {
      BLOCK_COMMENT("assert klasses not null {");
      Label L1, L2;
      __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
      __ bind(L1);
      __ stop("broken null klass");
      __ bind(L2);
      __ load_klass(rscratch1, dst);
      __ cbz(rscratch1, L1);     // this would be broken also
      BLOCK_COMMENT("} assert klasses not null done");
    }
#endif

    // Load layout helper (32-bits)
    //
    //  |array_tag|     | header_size | element_type |     |log2_element_size|
    // 32        30    24            16              8     2                 0
    //
    //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
    //

    const int lh_offset = in_bytes(Klass::layout_helper_offset());

    // Handle objArrays completely differently...
    const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
    __ ldrw(lh, Address(scratch_src_klass, lh_offset));
    __ movw(rscratch1, objArray_lh);
    // XOR-compare against the objArray layout helper; zero means equal.
    __ eorw(rscratch2, lh, rscratch1);
    __ cbzw(rscratch2, L_objArray);

    //  if (src->klass() != dst->klass()) return -1;
    __ load_klass(rscratch2, dst);
    __ eor(rscratch2, rscratch2, scratch_src_klass);
    __ cbnz(rscratch2, L_failed);

    //  if (!src->is_Array()) return -1;
    // Arrays have the layout-helper sign bit set.
    __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)

    // At this point, it is known to be a typeArray (array_tag 0x3).
#ifdef ASSERT
    {
      BLOCK_COMMENT("assert primitive array {");
      Label L;
      __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
      __ cmpw(lh, rscratch2);
      __ br(Assembler::GE, L);
      __ stop("must be a primitive array");
      __ bind(L);
      BLOCK_COMMENT("} assert primitive array done");
    }
#endif

    arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
                           rscratch2, L_failed);

    // TypeArrayKlass
    //
    // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
    // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
    //

    const Register rscratch1_offset = rscratch1;    // array offset
    const Register r18_elsize = lh;                 // element size (aliases lh)

    // Extract the header-size field of the layout helper = base offset of
    // the first element, then bias both array oops by it.
    __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
           exact_log2(Klass::_lh_header_size_mask+1));   // array_offset
    __ add(src, src, rscratch1_offset);           // src array offset
    __ add(dst, dst, rscratch1_offset);           // dst array offset
    BLOCK_COMMENT("choose copy loop based on element size");

    // next registers should be set before the jump to corresponding stub
    const Register from     = c_rarg0;  // source array address
    const Register to       = c_rarg1;  // destination array address
    const Register count    = c_rarg2;  // elements count

    // 'from', 'to', 'count' registers should be set in such order
    // since they are the same as 'src', 'src_pos', 'dst'.

    assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");

    // The possible values of elsize are 0-3, i.e. exact_log2(element
    // size in bytes).  We do a simple bitwise binary search.
  __ BIND(L_copy_bytes);
    __ tbnz(r18_elsize, 1, L_copy_ints);
    __ tbnz(r18_elsize, 0, L_copy_shorts);
    __ lea(from, Address(src, src_pos));// src_addr
    __ lea(to,   Address(dst, dst_pos));// dst_addr
    __ movw(count, scratch_length); // length
    __ b(RuntimeAddress(byte_copy_entry));

  __ BIND(L_copy_shorts);
    __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr
    __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));// dst_addr
    __ movw(count, scratch_length); // length
    __ b(RuntimeAddress(short_copy_entry));

  __ BIND(L_copy_ints);
    __ tbnz(r18_elsize, 0, L_copy_longs);
    __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr
    __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));// dst_addr
    __ movw(count, scratch_length); // length
    __ b(RuntimeAddress(int_copy_entry));

  __ BIND(L_copy_longs);
#ifdef ASSERT
    {
      BLOCK_COMMENT("assert long copy {");
      Label L;
      __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
      __ cmpw(r18_elsize, LogBytesPerLong);
      __ br(Assembler::EQ, L);
      __ stop("must be long copy, but elsize is wrong");
      __ bind(L);
      BLOCK_COMMENT("} assert long copy done");
    }
#endif
    __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr
    __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));// dst_addr
    __ movw(count, scratch_length); // length
    __ b(RuntimeAddress(long_copy_entry));

    // ObjArrayKlass
  __ BIND(L_objArray);
    // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]

    Label L_plain_copy, L_checkcast_copy;
    // test array classes for subtyping
    __ load_klass(r18, dst);
    __ cmp(scratch_src_klass, r18); // usual case is exact equality
    __ br(Assembler::NE, L_checkcast_copy);

    // Identically typed arrays can be copied without element-wise checks.
    arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
                           rscratch2, L_failed);

    __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
    __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
    __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
    __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
    __ movw(count, scratch_length); // length
  __ BIND(L_plain_copy);
    __ b(RuntimeAddress(oop_copy_entry));

  __ BIND(L_checkcast_copy);
    // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
    {
      // Before looking at dst.length, make sure dst is also an objArray.
      __ ldrw(rscratch1, Address(r18, lh_offset));
      __ movw(rscratch2, objArray_lh);
      __ eorw(rscratch1, rscratch1, rscratch2);
      __ cbnzw(rscratch1, L_failed);

      // It is safe to examine both src.length and dst.length.
      arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
                             r18, L_failed);

      const Register rscratch2_dst_klass = rscratch2;
      __ load_klass(rscratch2_dst_klass, dst); // reload

      // Marshal the base address arguments now, freeing registers.
      __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
      __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
      __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
      __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
      __ movw(count, length);           // length (reloaded)
      Register sco_temp = c_rarg3;      // this register is free now
      assert_different_registers(from, to, count, sco_temp,
                                 rscratch2_dst_klass, scratch_src_klass);
      // assert_clean_int(count, sco_temp);

      // Generate the type check.
      const int sco_offset = in_bytes(Klass::super_check_offset_offset());
      __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
      // assert_clean_int(sco_temp, r18);
      generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy);

      // Fetch destination element klass from the ObjArrayKlass header.
      int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
      __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset));
      __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));

      // the checkcast_copy loop needs two extra arguments:
      assert(c_rarg3 == sco_temp, "#3 already in place");
      // Set up arguments for checkcast_copy_entry.
      __ mov(c_rarg4, rscratch2_dst_klass);  // dst.klass.element_klass
      __ b(RuntimeAddress(checkcast_copy_entry));
    }

  __ BIND(L_failed);
    __ mov(r0, -1);
    __ leave();   // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }

  //
  //  Generate stub for array fill. If "aligned" is true, the
  //  "to" address is assumed to be heapword aligned.
2301 // 2302 // Arguments for generated stub: 2303 // to: c_rarg0 2304 // value: c_rarg1 2305 // count: c_rarg2 treated as signed 2306 // 2307 address generate_fill(BasicType t, bool aligned, const char *name) { 2308 __ align(CodeEntryAlignment); 2309 StubCodeMark mark(this, "StubRoutines", name); 2310 address start = __ pc(); 2311 2312 BLOCK_COMMENT("Entry:"); 2313 2314 const Register to = c_rarg0; // source array address 2315 const Register value = c_rarg1; // value 2316 const Register count = c_rarg2; // elements count 2317 2318 const Register bz_base = r10; // base for block_zero routine 2319 const Register cnt_words = r11; // temp register 2320 2321 __ enter(); 2322 2323 Label L_fill_elements, L_exit1; 2324 2325 int shift = -1; 2326 switch (t) { 2327 case T_BYTE: 2328 shift = 0; 2329 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2330 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2331 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2332 __ br(Assembler::LO, L_fill_elements); 2333 break; 2334 case T_SHORT: 2335 shift = 1; 2336 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2337 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2338 __ br(Assembler::LO, L_fill_elements); 2339 break; 2340 case T_INT: 2341 shift = 2; 2342 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2343 __ br(Assembler::LO, L_fill_elements); 2344 break; 2345 default: ShouldNotReachHere(); 2346 } 2347 2348 // Align source address at 8 bytes address boundary. 2349 Label L_skip_align1, L_skip_align2, L_skip_align4; 2350 if (!aligned) { 2351 switch (t) { 2352 case T_BYTE: 2353 // One byte misalignment happens only for byte arrays. 2354 __ tbz(to, 0, L_skip_align1); 2355 __ strb(value, Address(__ post(to, 1))); 2356 __ subw(count, count, 1); 2357 __ bind(L_skip_align1); 2358 // Fallthrough 2359 case T_SHORT: 2360 // Two bytes misalignment happens only for byte and short (char) arrays. 
2361 __ tbz(to, 1, L_skip_align2); 2362 __ strh(value, Address(__ post(to, 2))); 2363 __ subw(count, count, 2 >> shift); 2364 __ bind(L_skip_align2); 2365 // Fallthrough 2366 case T_INT: 2367 // Align to 8 bytes, we know we are 4 byte aligned to start. 2368 __ tbz(to, 2, L_skip_align4); 2369 __ strw(value, Address(__ post(to, 4))); 2370 __ subw(count, count, 4 >> shift); 2371 __ bind(L_skip_align4); 2372 break; 2373 default: ShouldNotReachHere(); 2374 } 2375 } 2376 2377 // 2378 // Fill large chunks 2379 // 2380 __ lsrw(cnt_words, count, 3 - shift); // number of words 2381 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2382 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2383 if (UseBlockZeroing) { 2384 Label non_block_zeroing, rest; 2385 // If the fill value is zero we can use the fast zero_words(). 2386 __ cbnz(value, non_block_zeroing); 2387 __ mov(bz_base, to); 2388 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2389 __ zero_words(bz_base, cnt_words); 2390 __ b(rest); 2391 __ bind(non_block_zeroing); 2392 __ fill_words(to, cnt_words, value); 2393 __ bind(rest); 2394 } else { 2395 __ fill_words(to, cnt_words, value); 2396 } 2397 2398 // Remaining count is less than 8 bytes. Fill it by a single store. 2399 // Note that the total length is no less than 8 bytes. 2400 if (t == T_BYTE || t == T_SHORT) { 2401 Label L_exit1; 2402 __ cbzw(count, L_exit1); 2403 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2404 __ str(value, Address(to, -8)); // overwrite some elements 2405 __ bind(L_exit1); 2406 __ leave(); 2407 __ ret(lr); 2408 } 2409 2410 // Handle copies less than 8 bytes. 
2411 Label L_fill_2, L_fill_4, L_exit2; 2412 __ bind(L_fill_elements); 2413 switch (t) { 2414 case T_BYTE: 2415 __ tbz(count, 0, L_fill_2); 2416 __ strb(value, Address(__ post(to, 1))); 2417 __ bind(L_fill_2); 2418 __ tbz(count, 1, L_fill_4); 2419 __ strh(value, Address(__ post(to, 2))); 2420 __ bind(L_fill_4); 2421 __ tbz(count, 2, L_exit2); 2422 __ strw(value, Address(to)); 2423 break; 2424 case T_SHORT: 2425 __ tbz(count, 0, L_fill_4); 2426 __ strh(value, Address(__ post(to, 2))); 2427 __ bind(L_fill_4); 2428 __ tbz(count, 1, L_exit2); 2429 __ strw(value, Address(to)); 2430 break; 2431 case T_INT: 2432 __ cbzw(count, L_exit2); 2433 __ strw(value, Address(to)); 2434 break; 2435 default: ShouldNotReachHere(); 2436 } 2437 __ bind(L_exit2); 2438 __ leave(); 2439 __ ret(lr); 2440 return start; 2441 } 2442 2443 void generate_arraycopy_stubs() { 2444 address entry; 2445 address entry_jbyte_arraycopy; 2446 address entry_jshort_arraycopy; 2447 address entry_jint_arraycopy; 2448 address entry_oop_arraycopy; 2449 address entry_jlong_arraycopy; 2450 address entry_checkcast_arraycopy; 2451 2452 generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards); 2453 generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards); 2454 2455 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2456 2457 //*** jbyte 2458 // Always need aligned and unaligned versions 2459 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2460 "jbyte_disjoint_arraycopy"); 2461 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2462 &entry_jbyte_arraycopy, 2463 "jbyte_arraycopy"); 2464 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2465 "arrayof_jbyte_disjoint_arraycopy"); 2466 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL, 2467 "arrayof_jbyte_arraycopy"); 2468 2469 //*** jshort 2470 // Always need aligned and unaligned versions 2471 
// (continuation of generate_arraycopy_stubs)
    StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
                                                                                    "jshort_disjoint_arraycopy");
    StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
                                                                                    &entry_jshort_arraycopy,
                                                                                    "jshort_arraycopy");
    StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
                                                                                    "arrayof_jshort_disjoint_arraycopy");
    StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
                                                                                    "arrayof_jshort_arraycopy");

    //*** jint
    // Aligned versions
    StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
                                                                                "arrayof_jint_disjoint_arraycopy");
    StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
                                                                                "arrayof_jint_arraycopy");
    // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
    // entry_jint_arraycopy always points to the unaligned version
    StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
                                                                                "jint_disjoint_arraycopy");
    StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
                                                                                &entry_jint_arraycopy,
                                                                                "jint_arraycopy");

    //*** jlong
    // It is always aligned
    StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
                                                                                  "arrayof_jlong_disjoint_arraycopy");
    StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
                                                                                  "arrayof_jlong_arraycopy");
    StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
    StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;

    //*** oops
    {
      // With compressed oops we need unaligned versions; notice that
      // we overwrite entry_oop_arraycopy.
      bool aligned = !UseCompressedOops;

      StubRoutines::_arrayof_oop_disjoint_arraycopy
        = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
                                     /*dest_uninitialized*/false);
      StubRoutines::_arrayof_oop_arraycopy
        = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
                                     /*dest_uninitialized*/false);
      // Aligned versions without pre-barriers
      StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
        = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
                                     /*dest_uninitialized*/true);
      StubRoutines::_arrayof_oop_arraycopy_uninit
        = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
                                     /*dest_uninitialized*/true);
    }

    StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
    StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
    StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
    StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;

    StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
    StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
                                                                        /*dest_uninitialized*/true);

    StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
                                                              entry_jbyte_arraycopy,
                                                              entry_jshort_arraycopy,
                                                              entry_jint_arraycopy,
                                                              entry_jlong_arraycopy);

    StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
                                                               entry_jbyte_arraycopy,
                                                               entry_jshort_arraycopy,
                                                               entry_jint_arraycopy,
                                                               entry_oop_arraycopy,
                                                               entry_jlong_arraycopy,
                                                               entry_checkcast_arraycopy);

    StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill");
    StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill");
    StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill");
    StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill");
    StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill");
    StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill");
  }

  void generate_math_stubs() { Unimplemented(); }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //
  address generate_aescrypt_encryptBlock() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");

    Label L_doLast;

    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register keylen      = rscratch1;

    address start = __ pc();
    __ enter();

    // keylen = number of ints in the expanded key schedule (read from the
    // key array's length header).
    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ ld1(v0, __ T16B, from); // get 16 bytes of input

    // First 8 round keys; rev32 converts each from little-endian int order.
    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);
    __ rev32(v3, __ T16B, v3);
    __ rev32(v4, __ T16B, v4);
    __ aese(v0, v1);
    __ aesmc(v0, v0);
    __ aese(v0, v2);
    __ aesmc(v0, v0);
    __ aese(v0, v3);
    __ aesmc(v0, v0);
    __ aese(v0, v4);
    __ aesmc(v0, v0);

    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);
    __ rev32(v3, __ T16B, v3);
    __ rev32(v4, __ T16B, v4);
    __ aese(v0, v1);
    __ aesmc(v0, v0);
    __ aese(v0, v2);
    __ aesmc(v0, v0);
    __ aese(v0, v3);
    __ aesmc(v0,
// (continuation of generate_aescrypt_encryptBlock: completes the split
//  "__ aesmc(v0, v0);" from the previous chunk)
             v0);
    __ aese(v0, v4);
    __ aesmc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    // keylen == 44 ints: shortest key schedule (presumably AES-128 —
    // verify against the Java-side key expansion); skip the extra rounds.
    __ cmpw(keylen, 44);
    __ br(Assembler::EQ, L_doLast);

    __ aese(v0, v1);
    __ aesmc(v0, v0);
    __ aese(v0, v2);
    __ aesmc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    __ cmpw(keylen, 52);
    __ br(Assembler::EQ, L_doLast);

    __ aese(v0, v1);
    __ aesmc(v0, v0);
    __ aese(v0, v2);
    __ aesmc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    __ BIND(L_doLast);

    // Last two rounds: final aese has no MixColumns step.
    __ aese(v0, v1);
    __ aesmc(v0, v0);
    __ aese(v0, v2);

    // XOR in the final round key.
    __ ld1(v1, __ T16B, key);
    __ rev32(v1, __ T16B, v1);
    __ eor(v0, __ T16B, v0, v1);

    __ st1(v0, __ T16B, to);

    __ mov(r0, 0);

    __ leave();
    __ ret(lr);

    return start;
  }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //
  address generate_aescrypt_decryptBlock() {
    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
    Label L_doLast;

    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register keylen      = rscratch1;

    address start = __ pc();
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ ld1(v0, __ T16B,
// (continuation of generate_aescrypt_decryptBlock: completes the split
//  "__ ld1(v0, __ T16B, from);" from the previous chunk)
           from); // get 16 bytes of input

    // v5 = first round key, applied last in decryption order.
    __ ld1(v5, __ T16B, __ post(key, 16));
    __ rev32(v5, __ T16B, v5);

    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);
    __ rev32(v3, __ T16B, v3);
    __ rev32(v4, __ T16B, v4);
    __ aesd(v0, v1);
    __ aesimc(v0, v0);
    __ aesd(v0, v2);
    __ aesimc(v0, v0);
    __ aesd(v0, v3);
    __ aesimc(v0, v0);
    __ aesd(v0, v4);
    __ aesimc(v0, v0);

    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);
    __ rev32(v3, __ T16B, v3);
    __ rev32(v4, __ T16B, v4);
    __ aesd(v0, v1);
    __ aesimc(v0, v0);
    __ aesd(v0, v2);
    __ aesimc(v0, v0);
    __ aesd(v0, v3);
    __ aesimc(v0, v0);
    __ aesd(v0, v4);
    __ aesimc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    // Shortest key schedule (44 ints): no extra rounds needed.
    __ cmpw(keylen, 44);
    __ br(Assembler::EQ, L_doLast);

    __ aesd(v0, v1);
    __ aesimc(v0, v0);
    __ aesd(v0, v2);
    __ aesimc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    __ cmpw(keylen, 52);
    __ br(Assembler::EQ, L_doLast);

    __ aesd(v0, v1);
    __ aesimc(v0, v0);
    __ aesd(v0, v2);
    __ aesimc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    __ BIND(L_doLast);

    // Final two rounds; last aesd has no InvMixColumns.
    __ aesd(v0, v1);
    __ aesimc(v0, v0);
    __ aesd(v0, v2);

    // XOR in the first round key (saved in v5).
    __ eor(v0, __ T16B, v0, v5);

    __ st1(v0, __ T16B, to);

    __ mov(r0, 0);

    __ leave();
    __ ret(lr);

    return start;
  }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int
//                          array
  //   c_rarg3   - r vector byte array address
  //   c_rarg4   - input length
  //
  // Output:
  //   x0        - input length
  //
  address generate_cipherBlockChaining_encryptAESCrypt() {
    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");

    Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;

    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
                                           // and left with the results of the last encryption block
    const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
    const Register keylen      = rscratch1;

    address start = __ pc();

    __ enter();

    // Preserve the input length for the return value.
    __ movw(rscratch2, len_reg);

    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ ld1(v0, __ T16B, rvec);

    // NOTE: the condition flags set here are reused by the br CC/EQ pair
    // inside L_aes_loop below; none of the intervening ld1/rev32/eor/aese/
    // subw/cbnzw instructions modify NZCV.
    __ cmpw(keylen, 52);
    __ br(Assembler::CC, L_loadkeys_44);
    __ br(Assembler::EQ, L_loadkeys_52);

    // Load the key schedule into v17..v31; shorter keys enter lower down.
    __ ld1(v17, v18, __ T16B, __ post(key, 32));
    __ rev32(v17, __ T16B, v17);
    __ rev32(v18, __ T16B, v18);
    __ BIND(L_loadkeys_52);
    __ ld1(v19, v20, __ T16B, __ post(key, 32));
    __ rev32(v19, __ T16B, v19);
    __ rev32(v20, __ T16B, v20);
    __ BIND(L_loadkeys_44);
    __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
    __ rev32(v21, __ T16B, v21);
    __ rev32(v22, __ T16B, v22);
    __ rev32(v23, __ T16B, v23);
    __ rev32(v24, __ T16B, v24);
    __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
    __ rev32(v25, __ T16B, v25);
    __ rev32(v26, __ T16B, v26);
    __ rev32(v27, __ T16B, v27);
    __ rev32(v28, __ T16B, v28);
    __ ld1(v29, v30, v31, __ T16B, key);
    __ rev32(v29, __ T16B, v29);
    __ rev32(v30, __ T16B, v30);
    __ rev32(v31, __ T16B, v31);

    __ BIND(L_aes_loop);
    // CBC chaining: XOR the plaintext block with the previous ciphertext
    // (or the IV on the first iteration), then encrypt.
    __ ld1(v1, __ T16B, __ post(from, 16));
    __ eor(v0, __ T16B, v0, v1);

    // Dispatch on the key length via the flags from cmpw(keylen, 52) above.
    __ br(Assembler::CC, L_rounds_44);
    __ br(Assembler::EQ, L_rounds_52);

    __ aese(v0, v17); __ aesmc(v0, v0);
    __ aese(v0, v18); __ aesmc(v0, v0);
    __ BIND(L_rounds_52);
    __ aese(v0, v19); __ aesmc(v0, v0);
    __ aese(v0, v20); __ aesmc(v0, v0);
    __ BIND(L_rounds_44);
    __ aese(v0, v21); __ aesmc(v0, v0);
    __ aese(v0, v22); __ aesmc(v0, v0);
    __ aese(v0, v23); __ aesmc(v0, v0);
    __ aese(v0, v24); __ aesmc(v0, v0);
    __ aese(v0, v25); __ aesmc(v0, v0);
    __ aese(v0, v26); __ aesmc(v0, v0);
    __ aese(v0, v27); __ aesmc(v0, v0);
    __ aese(v0, v28); __ aesmc(v0, v0);
    __ aese(v0, v29); __ aesmc(v0, v0);
    __ aese(v0, v30);
    __ eor(v0, __ T16B, v0, v31);

    __ st1(v0, __ T16B, __ post(to, 16));

    // subw does not set flags, so the loop-top br CC/EQ still see the
    // cmpw(keylen, 52) result.
    __ subw(len_reg, len_reg, 16);
    __ cbnzw(len_reg, L_aes_loop);

    // Write back the last ciphertext block as the new r vector.
    __ st1(v0, __ T16B, rvec);

    __ mov(r0, rscratch2);

    __ leave();
    __ ret(lr);

    return start;
  }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //   c_rarg3   - r vector byte array address
  //   c_rarg4   - input length
  //
  // Output:
  //   r0        - input length
  //
  address generate_cipherBlockChaining_decryptAESCrypt() {
    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");

    Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;

    const Register from        = c_rarg0;  // source array address
2885 const Register to = c_rarg1; // destination array address 2886 const Register key = c_rarg2; // key array address 2887 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2888 // and left with the results of the last encryption block 2889 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2890 const Register keylen = rscratch1; 2891 2892 address start = __ pc(); 2893 2894 __ enter(); 2895 2896 __ movw(rscratch2, len_reg); 2897 2898 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2899 2900 __ ld1(v2, __ T16B, rvec); 2901 2902 __ ld1(v31, __ T16B, __ post(key, 16)); 2903 __ rev32(v31, __ T16B, v31); 2904 2905 __ cmpw(keylen, 52); 2906 __ br(Assembler::CC, L_loadkeys_44); 2907 __ br(Assembler::EQ, L_loadkeys_52); 2908 2909 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2910 __ rev32(v17, __ T16B, v17); 2911 __ rev32(v18, __ T16B, v18); 2912 __ BIND(L_loadkeys_52); 2913 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2914 __ rev32(v19, __ T16B, v19); 2915 __ rev32(v20, __ T16B, v20); 2916 __ BIND(L_loadkeys_44); 2917 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2918 __ rev32(v21, __ T16B, v21); 2919 __ rev32(v22, __ T16B, v22); 2920 __ rev32(v23, __ T16B, v23); 2921 __ rev32(v24, __ T16B, v24); 2922 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2923 __ rev32(v25, __ T16B, v25); 2924 __ rev32(v26, __ T16B, v26); 2925 __ rev32(v27, __ T16B, v27); 2926 __ rev32(v28, __ T16B, v28); 2927 __ ld1(v29, v30, __ T16B, key); 2928 __ rev32(v29, __ T16B, v29); 2929 __ rev32(v30, __ T16B, v30); 2930 2931 __ BIND(L_aes_loop); 2932 __ ld1(v0, __ T16B, __ post(from, 16)); 2933 __ orr(v1, __ T16B, v0, v0); 2934 2935 __ br(Assembler::CC, L_rounds_44); 2936 __ br(Assembler::EQ, L_rounds_52); 2937 2938 __ aesd(v0, v17); __ aesimc(v0, v0); 2939 __ aesd(v0, v18); __ aesimc(v0, v0); 2940 __ BIND(L_rounds_52); 2941 __ aesd(v0, v19); __ 
aesimc(v0, v0); 2942 __ aesd(v0, v20); __ aesimc(v0, v0); 2943 __ BIND(L_rounds_44); 2944 __ aesd(v0, v21); __ aesimc(v0, v0); 2945 __ aesd(v0, v22); __ aesimc(v0, v0); 2946 __ aesd(v0, v23); __ aesimc(v0, v0); 2947 __ aesd(v0, v24); __ aesimc(v0, v0); 2948 __ aesd(v0, v25); __ aesimc(v0, v0); 2949 __ aesd(v0, v26); __ aesimc(v0, v0); 2950 __ aesd(v0, v27); __ aesimc(v0, v0); 2951 __ aesd(v0, v28); __ aesimc(v0, v0); 2952 __ aesd(v0, v29); __ aesimc(v0, v0); 2953 __ aesd(v0, v30); 2954 __ eor(v0, __ T16B, v0, v31); 2955 __ eor(v0, __ T16B, v0, v2); 2956 2957 __ st1(v0, __ T16B, __ post(to, 16)); 2958 __ orr(v2, __ T16B, v1, v1); 2959 2960 __ subw(len_reg, len_reg, 16); 2961 __ cbnzw(len_reg, L_aes_loop); 2962 2963 __ st1(v2, __ T16B, rvec); 2964 2965 __ mov(r0, rscratch2); 2966 2967 __ leave(); 2968 __ ret(lr); 2969 2970 return start; 2971 } 2972 2973 // Arguments: 2974 // 2975 // Inputs: 2976 // c_rarg0 - byte[] source+offset 2977 // c_rarg1 - int[] SHA.state 2978 // c_rarg2 - int offset 2979 // c_rarg3 - int limit 2980 // 2981 address generate_sha1_implCompress(bool multi_block, const char *name) { 2982 __ align(CodeEntryAlignment); 2983 StubCodeMark mark(this, "StubRoutines", name); 2984 address start = __ pc(); 2985 2986 Register buf = c_rarg0; 2987 Register state = c_rarg1; 2988 Register ofs = c_rarg2; 2989 Register limit = c_rarg3; 2990 2991 Label keys; 2992 Label sha1_loop; 2993 2994 // load the keys into v0..v3 2995 __ adr(rscratch1, keys); 2996 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 2997 // load 5 words state into v6, v7 2998 __ ldrq(v6, Address(state, 0)); 2999 __ ldrs(v7, Address(state, 16)); 3000 3001 3002 __ BIND(sha1_loop); 3003 // load 64 bytes of data into v16..v19 3004 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 3005 __ rev32(v16, __ T16B, v16); 3006 __ rev32(v17, __ T16B, v17); 3007 __ rev32(v18, __ T16B, v18); 3008 __ rev32(v19, __ T16B, v19); 3009 3010 // do the sha1 3011 __ addv(v4, __ T4S, v16, v0); 3012 __ orr(v20, __ T16B, v6, v6); 3013 3014 FloatRegister d0 = v16; 3015 FloatRegister d1 = v17; 3016 FloatRegister d2 = v18; 3017 FloatRegister d3 = v19; 3018 3019 for (int round = 0; round < 20; round++) { 3020 FloatRegister tmp1 = (round & 1) ? v4 : v5; 3021 FloatRegister tmp2 = (round & 1) ? v21 : v22; 3022 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 3023 FloatRegister tmp4 = (round & 1) ? v5 : v4; 3024 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3)); 3025 3026 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 3027 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 3028 __ sha1h(tmp2, __ T4S, v20); 3029 if (round < 5) 3030 __ sha1c(v20, __ T4S, tmp3, tmp4); 3031 else if (round < 10 || round >= 15) 3032 __ sha1p(v20, __ T4S, tmp3, tmp4); 3033 else 3034 __ sha1m(v20, __ T4S, tmp3, tmp4); 3035 if (round < 16) __ sha1su1(d0, __ T4S, d3); 3036 3037 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3038 } 3039 3040 __ addv(v7, __ T2S, v7, v21); 3041 __ addv(v6, __ T4S, v6, v20); 3042 3043 if (multi_block) { 3044 __ add(ofs, ofs, 64); 3045 __ cmp(ofs, limit); 3046 __ br(Assembler::LE, sha1_loop); 3047 __ mov(c_rarg0, ofs); // return ofs 3048 } 3049 3050 __ strq(v6, Address(state, 0)); 3051 __ strs(v7, Address(state, 16)); 3052 3053 __ ret(lr); 3054 3055 __ bind(keys); 3056 __ emit_int32(0x5a827999); 3057 __ emit_int32(0x6ed9eba1); 3058 __ emit_int32(0x8f1bbcdc); 3059 __ emit_int32(0xca62c1d6); 3060 3061 return start; 3062 } 3063 3064 3065 // Arguments: 3066 // 3067 // Inputs: 3068 // c_rarg0 - byte[] source+offset 3069 // c_rarg1 - int[] SHA.state 3070 // c_rarg2 - int offset 3071 // c_rarg3 - int limit 3072 // 3073 address generate_sha256_implCompress(bool multi_block, const char *name) { 3074 static 
const uint32_t round_consts[64] = { 3075 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3076 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3077 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3078 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3079 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3080 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3081 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3082 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3083 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3084 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3085 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3086 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3087 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3088 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3089 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3090 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3091 }; 3092 __ align(CodeEntryAlignment); 3093 StubCodeMark mark(this, "StubRoutines", name); 3094 address start = __ pc(); 3095 3096 Register buf = c_rarg0; 3097 Register state = c_rarg1; 3098 Register ofs = c_rarg2; 3099 Register limit = c_rarg3; 3100 3101 Label sha1_loop; 3102 3103 __ stpd(v8, v9, __ pre(sp, -32)); 3104 __ stpd(v10, v11, Address(sp, 16)); 3105 3106 // dga == v0 3107 // dgb == v1 3108 // dg0 == v2 3109 // dg1 == v3 3110 // dg2 == v4 3111 // t0 == v6 3112 // t1 == v7 3113 3114 // load 16 keys to v16..v31 3115 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3116 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3117 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3118 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3119 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3120 3121 // load 8 words (256 bits) state 3122 __ ldpq(v0, v1, state); 3123 3124 __ BIND(sha1_loop); 3125 // load 64 bytes of data into v8..v11 3126 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 3127 __ rev32(v8, __ T16B, v8); 3128 __ rev32(v9, __ T16B, v9); 3129 __ rev32(v10, __ T16B, v10); 3130 __ rev32(v11, __ T16B, v11); 3131 3132 __ addv(v6, __ T4S, v8, v16); 3133 __ orr(v2, __ T16B, v0, v0); 3134 __ orr(v3, __ T16B, v1, v1); 3135 3136 FloatRegister d0 = v8; 3137 FloatRegister d1 = v9; 3138 FloatRegister d2 = v10; 3139 FloatRegister d3 = v11; 3140 3141 3142 for (int round = 0; round < 16; round++) { 3143 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3144 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3145 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3146 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3147 3148 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3149 __ orr(v4, __ T16B, v2, v2); 3150 if (round < 15) 3151 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3152 __ sha256h(v2, __ T4S, v3, tmp2); 3153 __ sha256h2(v3, __ T4S, v4, tmp2); 3154 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3155 3156 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3157 } 3158 3159 __ addv(v0, __ T4S, v0, v2); 3160 __ addv(v1, __ T4S, v1, v3); 3161 3162 if (multi_block) { 3163 __ add(ofs, ofs, 64); 3164 __ cmp(ofs, limit); 3165 __ br(Assembler::LE, sha1_loop); 3166 __ mov(c_rarg0, ofs); // return ofs 3167 } 3168 3169 __ ldpd(v10, v11, Address(sp, 16)); 3170 __ ldpd(v8, v9, __ post(sp, 32)); 3171 3172 __ stpq(v0, v1, state); 3173 3174 __ ret(lr); 3175 3176 return start; 3177 } 3178 3179 #ifndef BUILTIN_SIM 3180 // Safefetch stubs. 3181 void generate_safefetch(const char* name, int size, address* entry, 3182 address* fault_pc, address* continuation_pc) { 3183 // safefetch signatures: 3184 // int SafeFetch32(int* adr, int errValue); 3185 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue); 3186 // 3187 // arguments: 3188 // c_rarg0 = adr 3189 // c_rarg1 = errValue 3190 // 3191 // result: 3192 // PPC_RET = *adr or errValue 3193 3194 StubCodeMark mark(this, "StubRoutines", name); 3195 3196 // Entry point, pc or function descriptor. 
3197 *entry = __ pc(); 3198 3199 // Load *adr into c_rarg1, may fault. 3200 *fault_pc = __ pc(); 3201 switch (size) { 3202 case 4: 3203 // int32_t 3204 __ ldrw(c_rarg1, Address(c_rarg0, 0)); 3205 break; 3206 case 8: 3207 // int64_t 3208 __ ldr(c_rarg1, Address(c_rarg0, 0)); 3209 break; 3210 default: 3211 ShouldNotReachHere(); 3212 } 3213 3214 // return errValue or *adr 3215 *continuation_pc = __ pc(); 3216 __ mov(r0, c_rarg1); 3217 __ ret(lr); 3218 } 3219 #endif 3220 3221 /** 3222 * Arguments: 3223 * 3224 * Inputs: 3225 * c_rarg0 - int crc 3226 * c_rarg1 - byte* buf 3227 * c_rarg2 - int length 3228 * 3229 * Ouput: 3230 * rax - int crc result 3231 */ 3232 address generate_updateBytesCRC32() { 3233 assert(UseCRC32Intrinsics, "what are we doing here?"); 3234 3235 __ align(CodeEntryAlignment); 3236 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32"); 3237 3238 address start = __ pc(); 3239 3240 const Register crc = c_rarg0; // crc 3241 const Register buf = c_rarg1; // source java byte array address 3242 const Register len = c_rarg2; // length 3243 const Register table0 = c_rarg3; // crc_table address 3244 const Register table1 = c_rarg4; 3245 const Register table2 = c_rarg5; 3246 const Register table3 = c_rarg6; 3247 const Register tmp3 = c_rarg7; 3248 3249 BLOCK_COMMENT("Entry:"); 3250 __ enter(); // required for proper stackwalking of RuntimeStub frame 3251 3252 __ kernel_crc32(crc, buf, len, 3253 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 3254 3255 __ leave(); // required for proper stackwalking of RuntimeStub frame 3256 __ ret(lr); 3257 3258 return start; 3259 } 3260 3261 /** 3262 * Arguments: 3263 * 3264 * Inputs: 3265 * c_rarg0 - int crc 3266 * c_rarg1 - byte* buf 3267 * c_rarg2 - int length 3268 * c_rarg3 - int* table 3269 * 3270 * Ouput: 3271 * r0 - int crc result 3272 */ 3273 address generate_updateBytesCRC32C() { 3274 assert(UseCRC32CIntrinsics, "what are we doing here?"); 3275 3276 __ align(CodeEntryAlignment); 3277 StubCodeMark 
mark(this, "StubRoutines", "updateBytesCRC32C"); 3278 3279 address start = __ pc(); 3280 3281 const Register crc = c_rarg0; // crc 3282 const Register buf = c_rarg1; // source java byte array address 3283 const Register len = c_rarg2; // length 3284 const Register table0 = c_rarg3; // crc_table address 3285 const Register table1 = c_rarg4; 3286 const Register table2 = c_rarg5; 3287 const Register table3 = c_rarg6; 3288 const Register tmp3 = c_rarg7; 3289 3290 BLOCK_COMMENT("Entry:"); 3291 __ enter(); // required for proper stackwalking of RuntimeStub frame 3292 3293 __ kernel_crc32c(crc, buf, len, 3294 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 3295 3296 __ leave(); // required for proper stackwalking of RuntimeStub frame 3297 __ ret(lr); 3298 3299 return start; 3300 } 3301 3302 /*** 3303 * Arguments: 3304 * 3305 * Inputs: 3306 * c_rarg0 - int adler 3307 * c_rarg1 - byte* buff 3308 * c_rarg2 - int len 3309 * 3310 * Output: 3311 * c_rarg0 - int adler result 3312 */ 3313 address generate_updateBytesAdler32() { 3314 __ align(CodeEntryAlignment); 3315 StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32"); 3316 address start = __ pc(); 3317 3318 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 3319 3320 // Aliases 3321 Register adler = c_rarg0; 3322 Register s1 = c_rarg0; 3323 Register s2 = c_rarg3; 3324 Register buff = c_rarg1; 3325 Register len = c_rarg2; 3326 Register nmax = r4; 3327 Register base = r5; 3328 Register count = r6; 3329 Register temp0 = rscratch1; 3330 Register temp1 = rscratch2; 3331 Register temp2 = r7; 3332 3333 // Max number of bytes we can process before having to take the mod 3334 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 3335 unsigned long BASE = 0xfff1; 3336 unsigned long NMAX = 0x15B0; 3337 3338 __ mov(base, BASE); 3339 __ mov(nmax, NMAX); 3340 3341 // s1 is initialized to the lower 16 bits of adler 3342 // s2 is 
initialized to the upper 16 bits of adler 3343 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 3344 __ uxth(s1, adler); // s1 = (adler & 0xffff) 3345 3346 // The pipelined loop needs at least 16 elements for 1 iteration 3347 // It does check this, but it is more effective to skip to the cleanup loop 3348 __ cmp(len, 16); 3349 __ br(Assembler::HS, L_nmax); 3350 __ cbz(len, L_combine); 3351 3352 __ bind(L_simple_by1_loop); 3353 __ ldrb(temp0, Address(__ post(buff, 1))); 3354 __ add(s1, s1, temp0); 3355 __ add(s2, s2, s1); 3356 __ subs(len, len, 1); 3357 __ br(Assembler::HI, L_simple_by1_loop); 3358 3359 // s1 = s1 % BASE 3360 __ subs(temp0, s1, base); 3361 __ csel(s1, temp0, s1, Assembler::HS); 3362 3363 // s2 = s2 % BASE 3364 __ lsr(temp0, s2, 16); 3365 __ lsl(temp1, temp0, 4); 3366 __ sub(temp1, temp1, temp0); 3367 __ add(s2, temp1, s2, ext::uxth); 3368 3369 __ subs(temp0, s2, base); 3370 __ csel(s2, temp0, s2, Assembler::HS); 3371 3372 __ b(L_combine); 3373 3374 __ bind(L_nmax); 3375 __ subs(len, len, nmax); 3376 __ sub(count, nmax, 16); 3377 __ br(Assembler::LO, L_by16); 3378 3379 __ bind(L_nmax_loop); 3380 3381 __ ldp(temp0, temp1, Address(__ post(buff, 16))); 3382 3383 __ add(s1, s1, temp0, ext::uxtb); 3384 __ ubfx(temp2, temp0, 8, 8); 3385 __ add(s2, s2, s1); 3386 __ add(s1, s1, temp2); 3387 __ ubfx(temp2, temp0, 16, 8); 3388 __ add(s2, s2, s1); 3389 __ add(s1, s1, temp2); 3390 __ ubfx(temp2, temp0, 24, 8); 3391 __ add(s2, s2, s1); 3392 __ add(s1, s1, temp2); 3393 __ ubfx(temp2, temp0, 32, 8); 3394 __ add(s2, s2, s1); 3395 __ add(s1, s1, temp2); 3396 __ ubfx(temp2, temp0, 40, 8); 3397 __ add(s2, s2, s1); 3398 __ add(s1, s1, temp2); 3399 __ ubfx(temp2, temp0, 48, 8); 3400 __ add(s2, s2, s1); 3401 __ add(s1, s1, temp2); 3402 __ add(s2, s2, s1); 3403 __ add(s1, s1, temp0, Assembler::LSR, 56); 3404 __ add(s2, s2, s1); 3405 3406 __ add(s1, s1, temp1, ext::uxtb); 3407 __ ubfx(temp2, temp1, 8, 8); 3408 __ add(s2, s2, s1); 3409 __ add(s1, s1, temp2); 3410 
__ ubfx(temp2, temp1, 16, 8); 3411 __ add(s2, s2, s1); 3412 __ add(s1, s1, temp2); 3413 __ ubfx(temp2, temp1, 24, 8); 3414 __ add(s2, s2, s1); 3415 __ add(s1, s1, temp2); 3416 __ ubfx(temp2, temp1, 32, 8); 3417 __ add(s2, s2, s1); 3418 __ add(s1, s1, temp2); 3419 __ ubfx(temp2, temp1, 40, 8); 3420 __ add(s2, s2, s1); 3421 __ add(s1, s1, temp2); 3422 __ ubfx(temp2, temp1, 48, 8); 3423 __ add(s2, s2, s1); 3424 __ add(s1, s1, temp2); 3425 __ add(s2, s2, s1); 3426 __ add(s1, s1, temp1, Assembler::LSR, 56); 3427 __ add(s2, s2, s1); 3428 3429 __ subs(count, count, 16); 3430 __ br(Assembler::HS, L_nmax_loop); 3431 3432 // s1 = s1 % BASE 3433 __ lsr(temp0, s1, 16); 3434 __ lsl(temp1, temp0, 4); 3435 __ sub(temp1, temp1, temp0); 3436 __ add(temp1, temp1, s1, ext::uxth); 3437 3438 __ lsr(temp0, temp1, 16); 3439 __ lsl(s1, temp0, 4); 3440 __ sub(s1, s1, temp0); 3441 __ add(s1, s1, temp1, ext:: uxth); 3442 3443 __ subs(temp0, s1, base); 3444 __ csel(s1, temp0, s1, Assembler::HS); 3445 3446 // s2 = s2 % BASE 3447 __ lsr(temp0, s2, 16); 3448 __ lsl(temp1, temp0, 4); 3449 __ sub(temp1, temp1, temp0); 3450 __ add(temp1, temp1, s2, ext::uxth); 3451 3452 __ lsr(temp0, temp1, 16); 3453 __ lsl(s2, temp0, 4); 3454 __ sub(s2, s2, temp0); 3455 __ add(s2, s2, temp1, ext:: uxth); 3456 3457 __ subs(temp0, s2, base); 3458 __ csel(s2, temp0, s2, Assembler::HS); 3459 3460 __ subs(len, len, nmax); 3461 __ sub(count, nmax, 16); 3462 __ br(Assembler::HS, L_nmax_loop); 3463 3464 __ bind(L_by16); 3465 __ adds(len, len, count); 3466 __ br(Assembler::LO, L_by1); 3467 3468 __ bind(L_by16_loop); 3469 3470 __ ldp(temp0, temp1, Address(__ post(buff, 16))); 3471 3472 __ add(s1, s1, temp0, ext::uxtb); 3473 __ ubfx(temp2, temp0, 8, 8); 3474 __ add(s2, s2, s1); 3475 __ add(s1, s1, temp2); 3476 __ ubfx(temp2, temp0, 16, 8); 3477 __ add(s2, s2, s1); 3478 __ add(s1, s1, temp2); 3479 __ ubfx(temp2, temp0, 24, 8); 3480 __ add(s2, s2, s1); 3481 __ add(s1, s1, temp2); 3482 __ ubfx(temp2, temp0, 32, 8); 3483 __ 
add(s2, s2, s1); 3484 __ add(s1, s1, temp2); 3485 __ ubfx(temp2, temp0, 40, 8); 3486 __ add(s2, s2, s1); 3487 __ add(s1, s1, temp2); 3488 __ ubfx(temp2, temp0, 48, 8); 3489 __ add(s2, s2, s1); 3490 __ add(s1, s1, temp2); 3491 __ add(s2, s2, s1); 3492 __ add(s1, s1, temp0, Assembler::LSR, 56); 3493 __ add(s2, s2, s1); 3494 3495 __ add(s1, s1, temp1, ext::uxtb); 3496 __ ubfx(temp2, temp1, 8, 8); 3497 __ add(s2, s2, s1); 3498 __ add(s1, s1, temp2); 3499 __ ubfx(temp2, temp1, 16, 8); 3500 __ add(s2, s2, s1); 3501 __ add(s1, s1, temp2); 3502 __ ubfx(temp2, temp1, 24, 8); 3503 __ add(s2, s2, s1); 3504 __ add(s1, s1, temp2); 3505 __ ubfx(temp2, temp1, 32, 8); 3506 __ add(s2, s2, s1); 3507 __ add(s1, s1, temp2); 3508 __ ubfx(temp2, temp1, 40, 8); 3509 __ add(s2, s2, s1); 3510 __ add(s1, s1, temp2); 3511 __ ubfx(temp2, temp1, 48, 8); 3512 __ add(s2, s2, s1); 3513 __ add(s1, s1, temp2); 3514 __ add(s2, s2, s1); 3515 __ add(s1, s1, temp1, Assembler::LSR, 56); 3516 __ add(s2, s2, s1); 3517 3518 __ subs(len, len, 16); 3519 __ br(Assembler::HS, L_by16_loop); 3520 3521 __ bind(L_by1); 3522 __ adds(len, len, 15); 3523 __ br(Assembler::LO, L_do_mod); 3524 3525 __ bind(L_by1_loop); 3526 __ ldrb(temp0, Address(__ post(buff, 1))); 3527 __ add(s1, temp0, s1); 3528 __ add(s2, s2, s1); 3529 __ subs(len, len, 1); 3530 __ br(Assembler::HS, L_by1_loop); 3531 3532 __ bind(L_do_mod); 3533 // s1 = s1 % BASE 3534 __ lsr(temp0, s1, 16); 3535 __ lsl(temp1, temp0, 4); 3536 __ sub(temp1, temp1, temp0); 3537 __ add(temp1, temp1, s1, ext::uxth); 3538 3539 __ lsr(temp0, temp1, 16); 3540 __ lsl(s1, temp0, 4); 3541 __ sub(s1, s1, temp0); 3542 __ add(s1, s1, temp1, ext:: uxth); 3543 3544 __ subs(temp0, s1, base); 3545 __ csel(s1, temp0, s1, Assembler::HS); 3546 3547 // s2 = s2 % BASE 3548 __ lsr(temp0, s2, 16); 3549 __ lsl(temp1, temp0, 4); 3550 __ sub(temp1, temp1, temp0); 3551 __ add(temp1, temp1, s2, ext::uxth); 3552 3553 __ lsr(temp0, temp1, 16); 3554 __ lsl(s2, temp0, 4); 3555 __ sub(s2, s2, temp0); 
3556 __ add(s2, s2, temp1, ext:: uxth); 3557 3558 __ subs(temp0, s2, base); 3559 __ csel(s2, temp0, s2, Assembler::HS); 3560 3561 // Combine lower bits and higher bits 3562 __ bind(L_combine); 3563 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) 3564 3565 __ ret(lr); 3566 3567 return start; 3568 } 3569 3570 /** 3571 * Arguments: 3572 * 3573 * Input: 3574 * c_rarg0 - x address 3575 * c_rarg1 - x length 3576 * c_rarg2 - y address 3577 * c_rarg3 - y lenth 3578 * c_rarg4 - z address 3579 * c_rarg5 - z length 3580 */ 3581 address generate_multiplyToLen() { 3582 __ align(CodeEntryAlignment); 3583 StubCodeMark mark(this, "StubRoutines", "multiplyToLen"); 3584 3585 address start = __ pc(); 3586 const Register x = r0; 3587 const Register xlen = r1; 3588 const Register y = r2; 3589 const Register ylen = r3; 3590 const Register z = r4; 3591 const Register zlen = r5; 3592 3593 const Register tmp1 = r10; 3594 const Register tmp2 = r11; 3595 const Register tmp3 = r12; 3596 const Register tmp4 = r13; 3597 const Register tmp5 = r14; 3598 const Register tmp6 = r15; 3599 const Register tmp7 = r16; 3600 3601 BLOCK_COMMENT("Entry:"); 3602 __ enter(); // required for proper stackwalking of RuntimeStub frame 3603 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 3604 __ leave(); // required for proper stackwalking of RuntimeStub frame 3605 __ ret(lr); 3606 3607 return start; 3608 } 3609 3610 void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi, 3611 FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0, 3612 FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) { 3613 // Karatsuba multiplication performs a 128*128 -> 256-bit 3614 // multiplication in three 128-bit multiplications and a few 3615 // additions. 
3616 // 3617 // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1) 3618 // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0 3619 // 3620 // Inputs: 3621 // 3622 // A0 in a.d[0] (subkey) 3623 // A1 in a.d[1] 3624 // (A1+A0) in a1_xor_a0.d[0] 3625 // 3626 // B0 in b.d[0] (state) 3627 // B1 in b.d[1] 3628 3629 __ ext(tmp1, __ T16B, b, b, 0x08); 3630 __ pmull2(result_hi, __ T1Q, b, a, __ T2D); // A1*B1 3631 __ eor(tmp1, __ T16B, tmp1, b); // (B1+B0) 3632 __ pmull(result_lo, __ T1Q, b, a, __ T1D); // A0*B0 3633 __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0) 3634 3635 __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08); 3636 __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0 3637 __ eor(tmp2, __ T16B, tmp2, tmp4); 3638 __ eor(tmp2, __ T16B, tmp2, tmp3); 3639 3640 // Register pair <result_hi:result_lo> holds the result of carry-less multiplication 3641 __ ins(result_hi, __ D, tmp2, 0, 1); 3642 __ ins(result_lo, __ D, tmp2, 1, 0); 3643 } 3644 3645 void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi, 3646 FloatRegister p, FloatRegister z, FloatRegister t1) { 3647 const FloatRegister t0 = result; 3648 3649 // The GCM field polynomial f is z^128 + p(z), where p = 3650 // z^7+z^2+z+1. 3651 // 3652 // z^128 === -p(z) (mod (z^128 + p(z))) 3653 // 3654 // so, given that the product we're reducing is 3655 // a == lo + hi * z^128 3656 // substituting, 3657 // === lo - hi * p(z) (mod (z^128 + p(z))) 3658 // 3659 // we reduce by multiplying hi by p(z) and subtracting the result 3660 // from (i.e. XORing it with) lo. Because p has no nonzero high 3661 // bits we can do this with two 64-bit multiplications, lo*p and 3662 // hi*p. 
3663 3664 __ pmull2(t0, __ T1Q, hi, p, __ T2D); 3665 __ ext(t1, __ T16B, t0, z, 8); 3666 __ eor(hi, __ T16B, hi, t1); 3667 __ ext(t1, __ T16B, z, t0, 8); 3668 __ eor(lo, __ T16B, lo, t1); 3669 __ pmull(t0, __ T1Q, hi, p, __ T1D); 3670 __ eor(result, __ T16B, lo, t0); 3671 } 3672 3673 address generate_has_negatives(address &has_negatives_long) { 3674 StubCodeMark mark(this, "StubRoutines", "has_negatives"); 3675 const int large_loop_size = 64; 3676 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 3677 int dcache_line = VM_Version::dcache_line_size(); 3678 3679 Register ary1 = r1, len = r2, result = r0; 3680 3681 __ align(CodeEntryAlignment); 3682 address entry = __ pc(); 3683 3684 __ enter(); 3685 3686 Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE, 3687 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 3688 3689 __ cmp(len, 15); 3690 __ br(Assembler::GT, LEN_OVER_15); 3691 // The only case when execution falls into this code is when pointer is near 3692 // the end of memory page and we have to avoid reading next page 3693 __ add(ary1, ary1, len); 3694 __ subs(len, len, 8); 3695 __ br(Assembler::GT, LEN_OVER_8); 3696 __ ldr(rscratch2, Address(ary1, -8)); 3697 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 
3698 __ lsrv(rscratch2, rscratch2, rscratch1); 3699 __ tst(rscratch2, UPPER_BIT_MASK); 3700 __ cset(result, Assembler::NE); 3701 __ leave(); 3702 __ ret(lr); 3703 __ bind(LEN_OVER_8); 3704 __ ldp(rscratch1, rscratch2, Address(ary1, -16)); 3705 __ sub(len, len, 8); // no data dep., then sub can be executed while loading 3706 __ tst(rscratch2, UPPER_BIT_MASK); 3707 __ br(Assembler::NE, RET_TRUE_NO_POP); 3708 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes 3709 __ lsrv(rscratch1, rscratch1, rscratch2); 3710 __ tst(rscratch1, UPPER_BIT_MASK); 3711 __ cset(result, Assembler::NE); 3712 __ leave(); 3713 __ ret(lr); 3714 3715 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10; 3716 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6; 3717 3718 has_negatives_long = __ pc(); // 2nd entry point 3719 3720 __ enter(); 3721 3722 __ bind(LEN_OVER_15); 3723 __ push(spilled_regs, sp); 3724 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment 3725 __ cbz(rscratch2, ALIGNED); 3726 __ ldp(tmp6, tmp1, Address(ary1)); 3727 __ mov(tmp5, 16); 3728 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address 3729 __ add(ary1, ary1, rscratch1); 3730 __ sub(len, len, rscratch1); 3731 __ orr(tmp6, tmp6, tmp1); 3732 __ tst(tmp6, UPPER_BIT_MASK); 3733 __ br(Assembler::NE, RET_TRUE); 3734 3735 __ bind(ALIGNED); 3736 __ cmp(len, large_loop_size); 3737 __ br(Assembler::LT, CHECK_16); 3738 // Perform 16-byte load as early return in pre-loop to handle situation 3739 // when initially aligned large array has negative values at starting bytes, 3740 // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is 3741 // slower. Cases with negative bytes further ahead won't be affected that 3742 // much. In fact, it'll be faster due to early loads, less instructions and 3743 // less branches in LARGE_LOOP. 
3744 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16))); 3745 __ sub(len, len, 16); 3746 __ orr(tmp6, tmp6, tmp1); 3747 __ tst(tmp6, UPPER_BIT_MASK); 3748 __ br(Assembler::NE, RET_TRUE); 3749 __ cmp(len, large_loop_size); 3750 __ br(Assembler::LT, CHECK_16); 3751 3752 if (SoftwarePrefetchHintDistance >= 0 3753 && SoftwarePrefetchHintDistance >= dcache_line) { 3754 // initial prefetch 3755 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line)); 3756 } 3757 __ bind(LARGE_LOOP); 3758 if (SoftwarePrefetchHintDistance >= 0) { 3759 __ prfm(Address(ary1, SoftwarePrefetchHintDistance)); 3760 } 3761 // Issue load instructions first, since it can save few CPU/MEM cycles, also 3762 // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp) 3763 // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3 3764 // instructions per cycle and have less branches, but this approach disables 3765 // early return, thus, all 64 bytes are loaded and checked every time. 3766 __ ldp(tmp2, tmp3, Address(ary1)); 3767 __ ldp(tmp4, tmp5, Address(ary1, 16)); 3768 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 3769 __ ldp(tmp6, tmp1, Address(ary1, 48)); 3770 __ add(ary1, ary1, large_loop_size); 3771 __ sub(len, len, large_loop_size); 3772 __ orr(tmp2, tmp2, tmp3); 3773 __ orr(tmp4, tmp4, tmp5); 3774 __ orr(rscratch1, rscratch1, rscratch2); 3775 __ orr(tmp6, tmp6, tmp1); 3776 __ orr(tmp2, tmp2, tmp4); 3777 __ orr(rscratch1, rscratch1, tmp6); 3778 __ orr(tmp2, tmp2, rscratch1); 3779 __ tst(tmp2, UPPER_BIT_MASK); 3780 __ br(Assembler::NE, RET_TRUE); 3781 __ cmp(len, large_loop_size); 3782 __ br(Assembler::GE, LARGE_LOOP); 3783 3784 __ bind(CHECK_16); // small 16-byte load pre-loop 3785 __ cmp(len, 16); 3786 __ br(Assembler::LT, POST_LOOP16); 3787 3788 __ bind(LOOP16); // small 16-byte load loop 3789 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 3790 __ sub(len, len, 16); 3791 __ orr(tmp2, tmp2, tmp3); 3792 __ tst(tmp2, UPPER_BIT_MASK); 3793 __ 
br(Assembler::NE, RET_TRUE); 3794 __ cmp(len, 16); 3795 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 3796 3797 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 3798 __ cmp(len, 8); 3799 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 3800 __ ldr(tmp3, Address(__ post(ary1, 8))); 3801 __ sub(len, len, 8); 3802 __ tst(tmp3, UPPER_BIT_MASK); 3803 __ br(Assembler::NE, RET_TRUE); 3804 3805 __ bind(POST_LOOP16_LOAD_TAIL); 3806 __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0 3807 __ ldr(tmp1, Address(ary1)); 3808 __ mov(tmp2, 64); 3809 __ sub(tmp4, tmp2, len, __ LSL, 3); 3810 __ lslv(tmp1, tmp1, tmp4); 3811 __ tst(tmp1, UPPER_BIT_MASK); 3812 __ br(Assembler::NE, RET_TRUE); 3813 // Fallthrough 3814 3815 __ bind(RET_FALSE); 3816 __ pop(spilled_regs, sp); 3817 __ leave(); 3818 __ mov(result, zr); 3819 __ ret(lr); 3820 3821 __ bind(RET_TRUE); 3822 __ pop(spilled_regs, sp); 3823 __ bind(RET_TRUE_NO_POP); 3824 __ leave(); 3825 __ mov(result, 1); 3826 __ ret(lr); 3827 3828 __ bind(DONE); 3829 __ pop(spilled_regs, sp); 3830 __ leave(); 3831 __ ret(lr); 3832 return entry; 3833 } 3834 /** 3835 * Arguments: 3836 * 3837 * Input: 3838 * c_rarg0 - current state address 3839 * c_rarg1 - H key address 3840 * c_rarg2 - data address 3841 * c_rarg3 - number of blocks 3842 * 3843 * Output: 3844 * Updated state at c_rarg0 3845 */ 3846 address generate_ghash_processBlocks() { 3847 // Bafflingly, GCM uses little-endian for the byte order, but 3848 // big-endian for the bit order. For example, the polynomial 1 is 3849 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 3850 // 3851 // So, we must either reverse the bytes in each word and do 3852 // everything big-endian or reverse the bits in each byte and do 3853 // it little-endian. 
  // On AArch64 it's more idiomatic to reverse
  // the bits in each byte (we have an instruction, RBIT, to do
  // that) and keep the data in little-endian bit order throughout the
  // calculation, bit-reversing the inputs and outputs.

  StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
  __ align(wordSize * 2);
  address p = __ pc();
  __ emit_int64(0x87);  // The low-order bits of the field
                        // polynomial (i.e. p = z^7+z^2+z+1)
                        // repeated in the low and high parts of a
                        // 128-bit vector
  __ emit_int64(0x87);

  __ align(CodeEntryAlignment);
  address start = __ pc();

  // Incoming arguments (see the block comment above this stub).
  Register state   = c_rarg0;  // current 128-bit GHASH state
  Register subkeyH = c_rarg1;  // hash subkey H
  Register data    = c_rarg2;  // input blocks
  Register blocks  = c_rarg3;  // number of 16-byte blocks to process

  FloatRegister vzr = v30;
  __ eor(vzr, __ T16B, vzr, vzr); // zero register

  __ ldrq(v0, Address(state));
  __ ldrq(v1, Address(subkeyH));

  // Convert state and subkeyH into the little-endian bit order used
  // throughout the computation (see comment above).
  __ rev64(v0, __ T16B, v0);          // Bit-reverse words in state and subkeyH
  __ rbit(v0, __ T16B, v0);
  __ rev64(v1, __ T16B, v1);
  __ rbit(v1, __ T16B, v1);

  // v26 <- the field-polynomial constant emitted at `p` above.
  __ ldrq(v26, p);

  __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
  __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))

  {
    Label L_ghash_loop;
    __ bind(L_ghash_loop);

    __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
                                               // reversing each byte
    __ rbit(v2, __ T16B, v2);
    __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state

    // Multiply state in v2 by subkey in v1.
    // ghash_multiply/ghash_reduce are generator helpers defined
    // elsewhere in this file.
    ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
                   /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
                   /*temps*/v6, v20, v18, v21);
    // Reduce v7:v5 by the field polynomial
    ghash_reduce(v0, v5, v7, v26, vzr, v20);

    __ sub(blocks, blocks, 1);
    __ cbnz(blocks, L_ghash_loop);
  }

  // The bit-reversed result is at this point in v0; undo the
  // bit/byte reversal before storing it back.
  __ rev64(v1, __
T16B, v0); 3913 __ rbit(v1, __ T16B, v1); 3914 3915 __ st1(v1, __ T16B, state); 3916 __ ret(lr); 3917 3918 return start; 3919 } 3920 3921 // Continuation point for throwing of implicit exceptions that are 3922 // not handled in the current activation. Fabricates an exception 3923 // oop and initiates normal exception dispatching in this 3924 // frame. Since we need to preserve callee-saved values (currently 3925 // only for C2, but done for C1 as well) we need a callee-saved oop 3926 // map and therefore have to make these stubs into RuntimeStubs 3927 // rather than BufferBlobs. If the compiler needs all registers to 3928 // be preserved between the fault point and the exception handler 3929 // then it must assume responsibility for that in 3930 // AbstractCompiler::continuation_for_implicit_null_exception or 3931 // continuation_for_implicit_division_by_zero_exception. All other 3932 // implicit exceptions (e.g., NullPointerException or 3933 // AbstractMethodError on entry) are either at call sites or 3934 // otherwise assume that stack unwinding will be initiated, so 3935 // caller saved registers were assumed volatile in the compiler. 3936 3937 #undef __ 3938 #define __ masm-> 3939 3940 address generate_throw_exception(const char* name, 3941 address runtime_entry, 3942 Register arg1 = noreg, 3943 Register arg2 = noreg) { 3944 // Information about frame layout at time of blocking runtime call. 3945 // Note that we only have to preserve callee-saved registers since 3946 // the compilers are responsible for supplying a continuation point 3947 // if they expect all registers to be preserved. 3948 // n.b. 
  // aarch64 asserts that frame::arg_reg_save_area_bytes == 0
  enum layout {
    rfp_off = 0,
    rfp_off2,
    return_off,
    return_off2,
    framesize // inclusive of return address
  };

  int insts_size = 512;
  int locs_size = 64;

  CodeBuffer code(name, insts_size, locs_size);
  OopMapSet* oop_maps = new OopMapSet();
  MacroAssembler* masm = new MacroAssembler(&code);

  address start = __ pc();

  // This is an inlined and slightly modified version of call_VM
  // which has the ability to fetch the return PC out of
  // thread-local storage and also sets up last_Java_sp slightly
  // differently than the real call_VM

  __ enter(); // Save FP and LR before call

  assert(is_even(framesize/2), "sp not 16-byte aligned");

  // lr and fp are already in place
  __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog

  int frame_complete = __ pc() - start;

  // Set up last_Java_sp and last_Java_fp so the runtime can walk
  // this frame during exception dispatch.
  address the_pc = __ pc();
  __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1);

  // Call runtime: c_rarg0 is always the current thread; the optional
  // arguments go in c_rarg1/c_rarg2.
  if (arg1 != noreg) {
    assert(arg2 != c_rarg1, "clobbered");
    __ mov(c_rarg1, arg1);
  }
  if (arg2 != noreg) {
    __ mov(c_rarg2, arg2);
  }
  __ mov(c_rarg0, rthread);
  BLOCK_COMMENT("call runtime_entry");
  __ mov(rscratch1, runtime_entry);
  __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);

  // Generate oop map. No registers are added to the map: nothing
  // holds an oop across this call.
  OopMap* map = new OopMap(framesize, 0);

  oop_maps->add_gc_map(the_pc - start, map);

  __ reset_last_Java_frame(true);
  __ maybe_isb();

  __ leave();

  // The runtime entry is expected to have installed a pending
  // exception; verify that in debug builds.
#ifdef ASSERT
  Label L;
  __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
  __ cbnz(rscratch1, L);
  __ should_not_reach_here();
  __ bind(L);
#endif // ASSERT
  __
far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));


  // codeBlob framesize is in words (not VMRegImpl::slot_size)
  RuntimeStub* stub =
    RuntimeStub::new_runtime_stub(name,
                                  &code,
                                  frame_complete,
                                  (framesize >> (LogBytesPerWord - LogBytesPerInt)),
                                  oop_maps, false);
  return stub->entry_point();
}

// Generator for the Montgomery multiply/square intrinsic stubs.
// Derives from MacroAssembler so the emission methods below can call
// instruction emitters unqualified.
class MontgomeryMultiplyGenerator : public MacroAssembler {

  Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
    Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;

  RegSet _toSave;   // callee-saved registers we must preserve
  bool _squaring;   // true: generate square (a == b), false: multiply

  public:
    MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
      : MacroAssembler(as->code()), _squaring(squaring) {

      // Register allocation
      //
      // All working registers are assigned consecutively starting at
      // c_rarg0, so argument registers come first and everything from
      // r19 on is callee-saved (see _toSave below).

      Register reg = c_rarg0;
      Pa_base = reg;       // Argument registers
      if (squaring)
        Pb_base = Pa_base; // squaring: b is the same array as a
      else
        Pb_base = ++reg;
      Pn_base = ++reg;
      Rlen= ++reg;
      inv = ++reg;
      Pm_base = ++reg;

      // Working registers:
      Ra = ++reg;   // The current digit of a, b, n, and m.
      Rb = ++reg;
      Rm = ++reg;
      Rn = ++reg;

      Pa = ++reg;   // Pointers to the current/next digit of a, b, n, and m.
      Pb = ++reg;
      Pm = ++reg;
      Pn = ++reg;

      t0 = ++reg;   // Three registers which form a
      t1 = ++reg;   // triple-precision accumulator.
      t2 = ++reg;

      Ri = ++reg;   // Inner and outer loop indexes.
      Rj = ++reg;

      Rhi_ab = ++reg;   // Product registers: low and high parts
      Rlo_ab = ++reg;   // of a*b and m*n.
      Rhi_mn = ++reg;
      Rlo_mn = ++reg;

      // r19 and up are callee-saved.
      _toSave = RegSet::range(r19, reg) + Pm_base;
    }

  private:
    // Spill/restore the callee-saved working registers (plus Pm_base,
    // which is needed again after restore_regs — see generate_multiply).
    void save_regs() {
      push(_toSave, sp);
    }

    void restore_regs() {
      pop(_toSave, sp);
    }

    // Emit `block` in a loop unrolled by two so that it executes
    // exactly `count` times: an odd count enters at the second copy,
    // a zero count skips the loop entirely. Clobbers `count`.
    template <typename T>
    void unroll_2(Register count, T block) {
      Label loop, end, odd;
      tbnz(count, 0, odd);
      cbz(count, end);
      align(16);
      bind(loop);
      (this->*block)();
      bind(odd);
      (this->*block)();
      subs(count, count, 2);
      br(Assembler::GT, loop);
      bind(end);
    }

    // Same unrolling scheme for a member function taking three
    // register arguments (used by reverse/reverse1).
    template <typename T>
    void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
      Label loop, end, odd;
      tbnz(count, 0, odd);
      cbz(count, end);
      align(16);
      bind(loop);
      (this->*block)(d, s, tmp);
      bind(odd);
      (this->*block)(d, s, tmp);
      subs(count, count, 2);
      br(Assembler::GT, loop);
      bind(end);
    }

    // Set up pointers and load the first digits for a first-half
    // (i < len) outer-loop iteration, and clear the pending m*n product.
    void pre1(RegisterOrConstant i) {
      block_comment("pre1");
      // Pa = Pa_base;
      // Pb = Pb_base + i;
      // Pm = Pm_base;
      // Pn = Pn_base + i;
      // Ra = *Pa;
      // Rb = *Pb;
      // Rm = *Pm;
      // Rn = *Pn;
      ldr(Ra, Address(Pa_base));
      ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
      ldr(Rm, Address(Pm_base));
      ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
      lea(Pa, Address(Pa_base));
      lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
      lea(Pm, Address(Pm_base));
      lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));

      // Zero the m*n result.
      mov(Rhi_mn, zr);
      mov(Rlo_mn, zr);
    }

    // The core multiply-accumulate step of a Montgomery
    // multiplication. The idea is to schedule operations as a
    // pipeline so that instructions with long latencies (loads and
    // multiplies) have time to complete before their results are
This most benefits in-order implementations of the 4148 // architecture but out-of-order ones also benefit. 4149 void step() { 4150 block_comment("step"); 4151 // MACC(Ra, Rb, t0, t1, t2); 4152 // Ra = *++Pa; 4153 // Rb = *--Pb; 4154 umulh(Rhi_ab, Ra, Rb); 4155 mul(Rlo_ab, Ra, Rb); 4156 ldr(Ra, pre(Pa, wordSize)); 4157 ldr(Rb, pre(Pb, -wordSize)); 4158 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 4159 // previous iteration. 4160 // MACC(Rm, Rn, t0, t1, t2); 4161 // Rm = *++Pm; 4162 // Rn = *--Pn; 4163 umulh(Rhi_mn, Rm, Rn); 4164 mul(Rlo_mn, Rm, Rn); 4165 ldr(Rm, pre(Pm, wordSize)); 4166 ldr(Rn, pre(Pn, -wordSize)); 4167 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4168 } 4169 4170 void post1() { 4171 block_comment("post1"); 4172 4173 // MACC(Ra, Rb, t0, t1, t2); 4174 // Ra = *++Pa; 4175 // Rb = *--Pb; 4176 umulh(Rhi_ab, Ra, Rb); 4177 mul(Rlo_ab, Ra, Rb); 4178 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 4179 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4180 4181 // *Pm = Rm = t0 * inv; 4182 mul(Rm, t0, inv); 4183 str(Rm, Address(Pm)); 4184 4185 // MACC(Rm, Rn, t0, t1, t2); 4186 // t0 = t1; t1 = t2; t2 = 0; 4187 umulh(Rhi_mn, Rm, Rn); 4188 4189 #ifndef PRODUCT 4190 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 4191 { 4192 mul(Rlo_mn, Rm, Rn); 4193 add(Rlo_mn, t0, Rlo_mn); 4194 Label ok; 4195 cbz(Rlo_mn, ok); { 4196 stop("broken Montgomery multiply"); 4197 } bind(ok); 4198 } 4199 #endif 4200 // We have very carefully set things up so that 4201 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 4202 // the lower half of Rm * Rn because we know the result already: 4203 // it must be -t0. t0 + (-t0) must generate a carry iff 4204 // t0 != 0. So, rather than do a mul and an adds we just set 4205 // the carry flag iff t0 is nonzero. 
4206 // 4207 // mul(Rlo_mn, Rm, Rn); 4208 // adds(zr, t0, Rlo_mn); 4209 subs(zr, t0, 1); // Set carry iff t0 is nonzero 4210 adcs(t0, t1, Rhi_mn); 4211 adc(t1, t2, zr); 4212 mov(t2, zr); 4213 } 4214 4215 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 4216 block_comment("pre2"); 4217 // Pa = Pa_base + i-len; 4218 // Pb = Pb_base + len; 4219 // Pm = Pm_base + i-len; 4220 // Pn = Pn_base + len; 4221 4222 if (i.is_register()) { 4223 sub(Rj, i.as_register(), len); 4224 } else { 4225 mov(Rj, i.as_constant()); 4226 sub(Rj, Rj, len); 4227 } 4228 // Rj == i-len 4229 4230 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 4231 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 4232 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 4233 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 4234 4235 // Ra = *++Pa; 4236 // Rb = *--Pb; 4237 // Rm = *++Pm; 4238 // Rn = *--Pn; 4239 ldr(Ra, pre(Pa, wordSize)); 4240 ldr(Rb, pre(Pb, -wordSize)); 4241 ldr(Rm, pre(Pm, wordSize)); 4242 ldr(Rn, pre(Pn, -wordSize)); 4243 4244 mov(Rhi_mn, zr); 4245 mov(Rlo_mn, zr); 4246 } 4247 4248 void post2(RegisterOrConstant i, RegisterOrConstant len) { 4249 block_comment("post2"); 4250 if (i.is_constant()) { 4251 mov(Rj, i.as_constant()-len.as_constant()); 4252 } else { 4253 sub(Rj, i.as_register(), len); 4254 } 4255 4256 adds(t0, t0, Rlo_mn); // The pending m*n, low part 4257 4258 // As soon as we know the least significant digit of our result, 4259 // store it. 4260 // Pm_base[i-len] = t0; 4261 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 4262 4263 // t0 = t1; t1 = t2; t2 = 0; 4264 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 4265 adc(t1, t2, zr); 4266 mov(t2, zr); 4267 } 4268 4269 // A carry in t0 after Montgomery multiplication means that we 4270 // should subtract multiples of n from our result in m. We'll 4271 // keep doing that until there is no carry. 
    void normalize(RegisterOrConstant len) {
      block_comment("normalize");
      // while (t0)
      //   t0 = sub(Pm_base, Pn_base, t0, len);
      Label loop, post, again;
      Register cnt = t1, i = t2; // Re-use registers; we're done with them now
      cbz(t0, post); {
        bind(again); {
          mov(i, zr);
          mov(cnt, len);
          ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
          ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
          subs(zr, zr, zr); // set carry flag, i.e. no borrow
          align(16);
          bind(loop); {
            // m[i] -= n[i] with borrow propagated through the flags.
            sbcs(Rm, Rm, Rn);
            str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
            add(i, i, 1);
            // Pre-load the next digits. NOTE(review): on the final
            // iteration this reads index `len`, one word past the
            // arrays; the values are discarded — confirm the buffers
            // tolerate this read-ahead.
            ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
            ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
            sub(cnt, cnt, 1);
          } cbnz(cnt, loop);
          // Propagate the final borrow into the carry word t0.
          sbc(t0, t0, zr);
        } cbnz(t0, again);
      } bind(post);
    }

    // Move memory at s to d, reversing words.
    //    Increments d to end of copied memory
    //    Destroys tmp1, tmp2
    //    Preserves len
    //    Leaves s pointing to the address which was in d at start
    void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
      // tmp1/tmp2 must be caller-saved: r19+ hold our working state.
      assert(tmp1 < r19 && tmp2 < r19, "register corruption");

      lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
      mov(tmp1, len);
      unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
      sub(s, d, len, ext::uxtw, LogBytesPerWord);
    }
    // where
    // Copy one 64-bit word from *--s to *d++, swapping its two 32-bit
    // halves (ror by 32).
    void reverse1(Register d, Register s, Register tmp) {
      ldr(tmp, pre(s, -wordSize));
      ror(tmp, tmp, 32);
      str(tmp, post(d, wordSize));
    }

    // As step(), but accumulate the a*b product twice (squaring uses
    // each cross product a[i]*a[j], i != j, for both (i,j) and (j,i)).
    void step_squaring() {
      // An extra ACC
      step();
      acc(Rhi_ab, Rlo_ab, t0, t1, t2);
    }

    // For even i, accumulate the one diagonal product a[i/2]^2 that
    // step_squaring's doubled cross products do not cover.
    void last_squaring(RegisterOrConstant i) {
      Label dont;
      // if ((i & 1) == 0) {
      tbnz(i.as_register(), 0, dont); {
        // MACC(Ra, Rb, t0, t1, t2);
        // Ra = *++Pa;
        // Rb = *--Pb;
umulh(Rhi_ab, Ra, Rb); 4333 mul(Rlo_ab, Ra, Rb); 4334 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4335 } bind(dont); 4336 } 4337 4338 void extra_step_squaring() { 4339 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 4340 4341 // MACC(Rm, Rn, t0, t1, t2); 4342 // Rm = *++Pm; 4343 // Rn = *--Pn; 4344 umulh(Rhi_mn, Rm, Rn); 4345 mul(Rlo_mn, Rm, Rn); 4346 ldr(Rm, pre(Pm, wordSize)); 4347 ldr(Rn, pre(Pn, -wordSize)); 4348 } 4349 4350 void post1_squaring() { 4351 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 4352 4353 // *Pm = Rm = t0 * inv; 4354 mul(Rm, t0, inv); 4355 str(Rm, Address(Pm)); 4356 4357 // MACC(Rm, Rn, t0, t1, t2); 4358 // t0 = t1; t1 = t2; t2 = 0; 4359 umulh(Rhi_mn, Rm, Rn); 4360 4361 #ifndef PRODUCT 4362 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 4363 { 4364 mul(Rlo_mn, Rm, Rn); 4365 add(Rlo_mn, t0, Rlo_mn); 4366 Label ok; 4367 cbz(Rlo_mn, ok); { 4368 stop("broken Montgomery multiply"); 4369 } bind(ok); 4370 } 4371 #endif 4372 // We have very carefully set things up so that 4373 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 4374 // the lower half of Rm * Rn because we know the result already: 4375 // it must be -t0. t0 + (-t0) must generate a carry iff 4376 // t0 != 0. So, rather than do a mul and an adds we just set 4377 // the carry flag iff t0 is nonzero. 4378 // 4379 // mul(Rlo_mn, Rm, Rn); 4380 // adds(zr, t0, Rlo_mn); 4381 subs(zr, t0, 1); // Set carry iff t0 is nonzero 4382 adcs(t0, t1, Rhi_mn); 4383 adc(t1, t2, zr); 4384 mov(t2, zr); 4385 } 4386 4387 void acc(Register Rhi, Register Rlo, 4388 Register t0, Register t1, Register t2) { 4389 adds(t0, t0, Rlo); 4390 adcs(t1, t1, Rhi); 4391 adc(t2, t2, zr); 4392 } 4393 4394 public: 4395 /** 4396 * Fast Montgomery multiplication. The derivation of the 4397 * algorithm is in A Cryptographic Library for the Motorola 4398 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 
4399 * 4400 * Arguments: 4401 * 4402 * Inputs for multiplication: 4403 * c_rarg0 - int array elements a 4404 * c_rarg1 - int array elements b 4405 * c_rarg2 - int array elements n (the modulus) 4406 * c_rarg3 - int length 4407 * c_rarg4 - int inv 4408 * c_rarg5 - int array elements m (the result) 4409 * 4410 * Inputs for squaring: 4411 * c_rarg0 - int array elements a 4412 * c_rarg1 - int array elements n (the modulus) 4413 * c_rarg2 - int length 4414 * c_rarg3 - int inv 4415 * c_rarg4 - int array elements m (the result) 4416 * 4417 */ 4418 address generate_multiply() { 4419 Label argh, nothing; 4420 bind(argh); 4421 stop("MontgomeryMultiply total_allocation must be <= 8192"); 4422 4423 align(CodeEntryAlignment); 4424 address entry = pc(); 4425 4426 cbzw(Rlen, nothing); 4427 4428 enter(); 4429 4430 // Make room. 4431 cmpw(Rlen, 512); 4432 br(Assembler::HI, argh); 4433 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 4434 andr(sp, Ra, -2 * wordSize); 4435 4436 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 4437 4438 { 4439 // Copy input args, reversing as we go. We use Ra as a 4440 // temporary variable. 4441 reverse(Ra, Pa_base, Rlen, t0, t1); 4442 if (!_squaring) 4443 reverse(Ra, Pb_base, Rlen, t0, t1); 4444 reverse(Ra, Pn_base, Rlen, t0, t1); 4445 } 4446 4447 // Push all call-saved registers and also Pm_base which we'll need 4448 // at the end. 
4449 save_regs(); 4450 4451 #ifndef PRODUCT 4452 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 4453 { 4454 ldr(Rn, Address(Pn_base, 0)); 4455 mul(Rlo_mn, Rn, inv); 4456 cmp(Rlo_mn, -1); 4457 Label ok; 4458 br(EQ, ok); { 4459 stop("broken inverse in Montgomery multiply"); 4460 } bind(ok); 4461 } 4462 #endif 4463 4464 mov(Pm_base, Ra); 4465 4466 mov(t0, zr); 4467 mov(t1, zr); 4468 mov(t2, zr); 4469 4470 block_comment("for (int i = 0; i < len; i++) {"); 4471 mov(Ri, zr); { 4472 Label loop, end; 4473 cmpw(Ri, Rlen); 4474 br(Assembler::GE, end); 4475 4476 bind(loop); 4477 pre1(Ri); 4478 4479 block_comment(" for (j = i; j; j--) {"); { 4480 movw(Rj, Ri); 4481 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 4482 } block_comment(" } // j"); 4483 4484 post1(); 4485 addw(Ri, Ri, 1); 4486 cmpw(Ri, Rlen); 4487 br(Assembler::LT, loop); 4488 bind(end); 4489 block_comment("} // i"); 4490 } 4491 4492 block_comment("for (int i = len; i < 2*len; i++) {"); 4493 mov(Ri, Rlen); { 4494 Label loop, end; 4495 cmpw(Ri, Rlen, Assembler::LSL, 1); 4496 br(Assembler::GE, end); 4497 4498 bind(loop); 4499 pre2(Ri, Rlen); 4500 4501 block_comment(" for (j = len*2-i-1; j; j--) {"); { 4502 lslw(Rj, Rlen, 1); 4503 subw(Rj, Rj, Ri); 4504 subw(Rj, Rj, 1); 4505 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 4506 } block_comment(" } // j"); 4507 4508 post2(Ri, Rlen); 4509 addw(Ri, Ri, 1); 4510 cmpw(Ri, Rlen, Assembler::LSL, 1); 4511 br(Assembler::LT, loop); 4512 bind(end); 4513 } 4514 block_comment("} // i"); 4515 4516 normalize(Rlen); 4517 4518 mov(Ra, Pm_base); // Save Pm_base in Ra 4519 restore_regs(); // Restore caller's Pm_base 4520 4521 // Copy our result into caller's Pm_base 4522 reverse(Pm_base, Ra, Rlen, t0, t1); 4523 4524 leave(); 4525 bind(nothing); 4526 ret(lr); 4527 4528 return entry; 4529 } 4530 // In C, approximately: 4531 4532 // void 4533 // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[], 4534 // unsigned long Pn_base[], unsigned long 
Pm_base[], 4535 // unsigned long inv, int len) { 4536 // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 4537 // unsigned long *Pa, *Pb, *Pn, *Pm; 4538 // unsigned long Ra, Rb, Rn, Rm; 4539 4540 // int i; 4541 4542 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 4543 4544 // for (i = 0; i < len; i++) { 4545 // int j; 4546 4547 // Pa = Pa_base; 4548 // Pb = Pb_base + i; 4549 // Pm = Pm_base; 4550 // Pn = Pn_base + i; 4551 4552 // Ra = *Pa; 4553 // Rb = *Pb; 4554 // Rm = *Pm; 4555 // Rn = *Pn; 4556 4557 // int iters = i; 4558 // for (j = 0; iters--; j++) { 4559 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 4560 // MACC(Ra, Rb, t0, t1, t2); 4561 // Ra = *++Pa; 4562 // Rb = *--Pb; 4563 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 4564 // MACC(Rm, Rn, t0, t1, t2); 4565 // Rm = *++Pm; 4566 // Rn = *--Pn; 4567 // } 4568 4569 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 4570 // MACC(Ra, Rb, t0, t1, t2); 4571 // *Pm = Rm = t0 * inv; 4572 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 4573 // MACC(Rm, Rn, t0, t1, t2); 4574 4575 // assert(t0 == 0, "broken Montgomery multiply"); 4576 4577 // t0 = t1; t1 = t2; t2 = 0; 4578 // } 4579 4580 // for (i = len; i < 2*len; i++) { 4581 // int j; 4582 4583 // Pa = Pa_base + i-len; 4584 // Pb = Pb_base + len; 4585 // Pm = Pm_base + i-len; 4586 // Pn = Pn_base + len; 4587 4588 // Ra = *++Pa; 4589 // Rb = *--Pb; 4590 // Rm = *++Pm; 4591 // Rn = *--Pn; 4592 4593 // int iters = len*2-i-1; 4594 // for (j = i-len+1; iters--; j++) { 4595 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 4596 // MACC(Ra, Rb, t0, t1, t2); 4597 // Ra = *++Pa; 4598 // Rb = *--Pb; 4599 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 4600 // MACC(Rm, Rn, t0, t1, t2); 4601 // Rm = *++Pm; 4602 // Rn = *--Pn; 4603 // } 4604 4605 // Pm_base[i-len] = t0; 4606 // t0 = t1; t1 = t2; t2 = 0; 4607 // } 4608 4609 // while (t0) 4610 // t0 = 
sub(Pm_base, Pn_base, t0, len); 4611 // } 4612 4613 /** 4614 * Fast Montgomery squaring. This uses asymptotically 25% fewer 4615 * multiplies than Montgomery multiplication so it should be up to 4616 * 25% faster. However, its loop control is more complex and it 4617 * may actually run slower on some machines. 4618 * 4619 * Arguments: 4620 * 4621 * Inputs: 4622 * c_rarg0 - int array elements a 4623 * c_rarg1 - int array elements n (the modulus) 4624 * c_rarg2 - int length 4625 * c_rarg3 - int inv 4626 * c_rarg4 - int array elements m (the result) 4627 * 4628 */ 4629 address generate_square() { 4630 Label argh; 4631 bind(argh); 4632 stop("MontgomeryMultiply total_allocation must be <= 8192"); 4633 4634 align(CodeEntryAlignment); 4635 address entry = pc(); 4636 4637 enter(); 4638 4639 // Make room. 4640 cmpw(Rlen, 512); 4641 br(Assembler::HI, argh); 4642 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 4643 andr(sp, Ra, -2 * wordSize); 4644 4645 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 4646 4647 { 4648 // Copy input args, reversing as we go. We use Ra as a 4649 // temporary variable. 4650 reverse(Ra, Pa_base, Rlen, t0, t1); 4651 reverse(Ra, Pn_base, Rlen, t0, t1); 4652 } 4653 4654 // Push all call-saved registers and also Pm_base which we'll need 4655 // at the end. 
4656 save_regs(); 4657 4658 mov(Pm_base, Ra); 4659 4660 mov(t0, zr); 4661 mov(t1, zr); 4662 mov(t2, zr); 4663 4664 block_comment("for (int i = 0; i < len; i++) {"); 4665 mov(Ri, zr); { 4666 Label loop, end; 4667 bind(loop); 4668 cmp(Ri, Rlen); 4669 br(Assembler::GE, end); 4670 4671 pre1(Ri); 4672 4673 block_comment("for (j = (i+1)/2; j; j--) {"); { 4674 add(Rj, Ri, 1); 4675 lsr(Rj, Rj, 1); 4676 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 4677 } block_comment(" } // j"); 4678 4679 last_squaring(Ri); 4680 4681 block_comment(" for (j = i/2; j; j--) {"); { 4682 lsr(Rj, Ri, 1); 4683 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 4684 } block_comment(" } // j"); 4685 4686 post1_squaring(); 4687 add(Ri, Ri, 1); 4688 cmp(Ri, Rlen); 4689 br(Assembler::LT, loop); 4690 4691 bind(end); 4692 block_comment("} // i"); 4693 } 4694 4695 block_comment("for (int i = len; i < 2*len; i++) {"); 4696 mov(Ri, Rlen); { 4697 Label loop, end; 4698 bind(loop); 4699 cmp(Ri, Rlen, Assembler::LSL, 1); 4700 br(Assembler::GE, end); 4701 4702 pre2(Ri, Rlen); 4703 4704 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 4705 lsl(Rj, Rlen, 1); 4706 sub(Rj, Rj, Ri); 4707 sub(Rj, Rj, 1); 4708 lsr(Rj, Rj, 1); 4709 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 4710 } block_comment(" } // j"); 4711 4712 last_squaring(Ri); 4713 4714 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 4715 lsl(Rj, Rlen, 1); 4716 sub(Rj, Rj, Ri); 4717 lsr(Rj, Rj, 1); 4718 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 4719 } block_comment(" } // j"); 4720 4721 post2(Ri, Rlen); 4722 add(Ri, Ri, 1); 4723 cmp(Ri, Rlen, Assembler::LSL, 1); 4724 4725 br(Assembler::LT, loop); 4726 bind(end); 4727 block_comment("} // i"); 4728 } 4729 4730 normalize(Rlen); 4731 4732 mov(Ra, Pm_base); // Save Pm_base in Ra 4733 restore_regs(); // Restore caller's Pm_base 4734 4735 // Copy our result into caller's Pm_base 4736 reverse(Pm_base, Ra, Rlen, t0, t1); 4737 4738 leave(); 
4739 ret(lr); 4740 4741 return entry; 4742 } 4743 // In C, approximately: 4744 4745 // void 4746 // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[], 4747 // unsigned long Pm_base[], unsigned long inv, int len) { 4748 // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 4749 // unsigned long *Pa, *Pb, *Pn, *Pm; 4750 // unsigned long Ra, Rb, Rn, Rm; 4751 4752 // int i; 4753 4754 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 4755 4756 // for (i = 0; i < len; i++) { 4757 // int j; 4758 4759 // Pa = Pa_base; 4760 // Pb = Pa_base + i; 4761 // Pm = Pm_base; 4762 // Pn = Pn_base + i; 4763 4764 // Ra = *Pa; 4765 // Rb = *Pb; 4766 // Rm = *Pm; 4767 // Rn = *Pn; 4768 4769 // int iters = (i+1)/2; 4770 // for (j = 0; iters--; j++) { 4771 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 4772 // MACC2(Ra, Rb, t0, t1, t2); 4773 // Ra = *++Pa; 4774 // Rb = *--Pb; 4775 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 4776 // MACC(Rm, Rn, t0, t1, t2); 4777 // Rm = *++Pm; 4778 // Rn = *--Pn; 4779 // } 4780 // if ((i & 1) == 0) { 4781 // assert(Ra == Pa_base[j], "must be"); 4782 // MACC(Ra, Ra, t0, t1, t2); 4783 // } 4784 // iters = i/2; 4785 // assert(iters == i-j, "must be"); 4786 // for (; iters--; j++) { 4787 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 4788 // MACC(Rm, Rn, t0, t1, t2); 4789 // Rm = *++Pm; 4790 // Rn = *--Pn; 4791 // } 4792 4793 // *Pm = Rm = t0 * inv; 4794 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 4795 // MACC(Rm, Rn, t0, t1, t2); 4796 4797 // assert(t0 == 0, "broken Montgomery multiply"); 4798 4799 // t0 = t1; t1 = t2; t2 = 0; 4800 // } 4801 4802 // for (i = len; i < 2*len; i++) { 4803 // int start = i-len+1; 4804 // int end = start + (len - start)/2; 4805 // int j; 4806 4807 // Pa = Pa_base + i-len; 4808 // Pb = Pa_base + len; 4809 // Pm = Pm_base + i-len; 4810 // Pn = Pn_base + len; 4811 4812 // Ra = *++Pa; 4813 // Rb = *--Pb; 4814 
  //       Rm = *++Pm;
  //       Rn = *--Pn;

  //       int iters = (2*len-i-1)/2;
  //       assert(iters == end-start, "must be");
  //       for (j = start; iters--; j++) {
  //         assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
  //         MACC2(Ra, Rb, t0, t1, t2);
  //         Ra = *++Pa;
  //         Rb = *--Pb;
  //         assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
  //         MACC(Rm, Rn, t0, t1, t2);
  //         Rm = *++Pm;
  //         Rn = *--Pn;
  //       }
  //       if ((i & 1) == 0) {
  //         assert(Ra == Pa_base[j], "must be");
  //         MACC(Ra, Ra, t0, t1, t2);
  //       }
  //       iters = (2*len-i)/2;
  //       assert(iters == len-j, "must be");
  //       for (; iters--; j++) {
  //         assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
  //         MACC(Rm, Rn, t0, t1, t2);
  //         Rm = *++Pm;
  //         Rn = *--Pn;
  //       }
  //       Pm_base[i-len] = t0;
  //       t0 = t1; t1 = t2; t2 = 0;
  //     }

  //     while (t0)
  //       t0 = sub(Pm_base, Pn_base, t0, len);
  //   }
}; // end class MontgomeryMultiplyGenerator

// Initialization
//
// Generates the small set of stubs needed earliest in VM startup and
// records their entry points in StubRoutines.  The comment below
// explains why this is not shared across platforms.
void generate_initial() {
  // Generate initial stubs and initializes the entry points

  // entry points that exist in all platforms Note: This is code
  // that could be shared among different platforms - however the
  // benefit seems to be smaller than the disadvantage of having a
  // much more complicated generator structure. See also comment in
  // stubRoutines.hpp.

  StubRoutines::_forward_exception_entry = generate_forward_exception();

  StubRoutines::_call_stub_entry =
    generate_call_stub(StubRoutines::_call_stub_return_address);

  // is referenced by megamorphic call
  StubRoutines::_catch_exception_entry = generate_catch_exception();

  // Build this early so it's available for the interpreter.
  StubRoutines::_throw_StackOverflowError_entry =
    generate_throw_exception("StackOverflowError throw_exception",
                             CAST_FROM_FN_PTR(address,
                                              SharedRuntime::throw_StackOverflowError));
  StubRoutines::_throw_delayed_StackOverflowError_entry =
    generate_throw_exception("delayed StackOverflowError throw_exception",
                             CAST_FROM_FN_PTR(address,
                                              SharedRuntime::throw_delayed_StackOverflowError));
  if (UseCRC32Intrinsics) {
    // set table address before stub generation which use it
    StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
    StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
  }
}

// Generates the remaining stubs (exception throwers, arraycopy, and
// the flag-gated intrinsic stubs) and records their entry points in
// StubRoutines, then marks aarch64 stub generation complete.  The
// first stub here must come after universe_init (see comment below).
// NOTE(review): stubs are emitted sequentially into the shared code
// buffer, so statement order here is significant.
void generate_all() {
  // support for verify_oop (must happen after universe_init)
  StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
  StubRoutines::_throw_AbstractMethodError_entry =
    generate_throw_exception("AbstractMethodError throw_exception",
                             CAST_FROM_FN_PTR(address,
                                              SharedRuntime::
                                              throw_AbstractMethodError));

  StubRoutines::_throw_IncompatibleClassChangeError_entry =
    generate_throw_exception("IncompatibleClassChangeError throw_exception",
                             CAST_FROM_FN_PTR(address,
                                              SharedRuntime::
                                              throw_IncompatibleClassChangeError));

  StubRoutines::_throw_NullPointerException_at_call_entry =
    generate_throw_exception("NullPointerException at call throw_exception",
                             CAST_FROM_FN_PTR(address,
                                              SharedRuntime::
                                              throw_NullPointerException_at_call));

  // arraycopy stubs used by compilers
  generate_arraycopy_stubs();

  // has negatives stub for large arrays.
  StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);

  if (UseMultiplyToLenIntrinsic) {
    StubRoutines::_multiplyToLen = generate_multiplyToLen();
  }

  if (UseMontgomeryMultiplyIntrinsic) {
    StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
    MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
    StubRoutines::_montgomeryMultiply = g.generate_multiply();
  }

  if (UseMontgomerySquareIntrinsic) {
    StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
    MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
    // We use generate_multiply() rather than generate_square()
    // because it's faster for the sizes of modulus we care about.
    StubRoutines::_montgomerySquare = g.generate_multiply();
  }

  // Crypto, CRC32C, Adler32, and SafeFetch stubs are skipped when
  // building for the AArch64 instruction simulator (BUILTIN_SIM).
#ifndef BUILTIN_SIM
  // generate GHASH intrinsics code
  if (UseGHASHIntrinsics) {
    StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
  }

  if (UseAESIntrinsics) {
    StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
    StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
    StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
    StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
  }

  if (UseSHA1Intrinsics) {
    StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress");
    StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
  }
  if (UseSHA256Intrinsics) {
    StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
    StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
  }

  if (UseCRC32CIntrinsics) {
    StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
  }

  // generate Adler32 intrinsics code
  if (UseAdler32Intrinsics) {
    StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
  }

  // Safefetch stubs.
  generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
                     &StubRoutines::_safefetch32_fault_pc,
                     &StubRoutines::_safefetch32_continuation_pc);
  generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
                     &StubRoutines::_safefetchN_fault_pc,
                     &StubRoutines::_safefetchN_continuation_pc);
#endif
  StubRoutines::aarch64::set_completed();
}

public:
// Generates either the full stub set (all == true) or only the
// initial startup subset (all == false) into the given code buffer.
StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
  if (all) {
    generate_all();
  } else {
    generate_initial();
  }
}
}; // end class declaration

// External entry point used by the VM to run stub generation; all
// work happens in the StubGenerator constructor above.
void StubGenerator_generate(CodeBuffer* code, bool all) {
  StubGenerator g(code, all);
}