/*
 * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "utilities/top.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif

#ifdef BUILTIN_SIM
#include "../../../../../../simulator/simulator.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->
#define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(int& counter) {
    __ lea(rscratch2, ExternalAddress((address)&counter));
    __ ldrw(rscratch1, Address(rscratch2));
    __ addw(rscratch1, rscratch1, 1);
    __ strw(rscratch1, Address(rscratch2));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    c_rarg6:   parameter size (in words)              int
  //    c_rarg7:   thread                                 Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the c arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -27 [ argument word 1      ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper    (r0) ]
  //  -7 [ result          (r1) ]
  //  -6 [ result type     (r2) ]
  //  -5 [ method          (r3) ]
  //  -4 [ entry point     (r4) ]
  //  -3 [ parameters      (r5) ]
  //  -2 [ parameter size  (r6) ]
  //  -1 [ thread          (r7) ]
  //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  //   1 [ saved lr       (r30) ]

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off  = -26,

    d15_off            = -26,
    d13_off            = -24,
    d11_off            = -22,
    d9_off             = -20,

    r28_off            = -18,
    r26_off            = -16,
    r24_off            = -14,
    r22_off            = -12,
    r20_off            = -10,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0,
    retaddr_off        =   1,
  };

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    const Address sp_after_call(rfp, sp_after_call_off * wordSize);

    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off  * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);

    // stub code

    // we need a C prolog to bootstrap the x86 caller into the sim
    __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);

    address aarch64_entry = __ pc();

#ifdef BUILTIN_SIM
    // Save sender's SP for stack traces.
    __ mov(rscratch1, sp);
    __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
#endif
    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);
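    // Illustrative note (not in the original): with wordSize == 8 the
    // enum slots above resolve to byte offsets from rfp, e.g.
    //   thread        -> Address(rfp,  -1 * 8) == [rfp, #-8]
    //   call_wrapper  -> Address(rfp,  -8 * 8) == [rfp, #-64]
    //   sp_after_call -> Address(rfp, -26 * 8) == [rfp, #-208]
    // so the sub above drops sp 208 bytes below the saved fp, covering
    // the whole save area pictured in the layout comment.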
    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5,  entry_point);
    __ stp(c_rarg2, c_rarg3,  result_type);
    __ stp(c_rarg0, c_rarg1,  call_wrapper);

    __ stp(r20, r19,   r20_save);
    __ stp(r22, r21,   r22_save);
    __ stp(r24, r23,   r24_save);
    __ stp(r26, r25,   r26_save);
    __ stp(r28, r27,   r28_save);

    __ stpd(v9,  v8,   d9_save);
    __ stpd(v11, v10,  d11_save);
    __ stpd(v13, v12,  d13_save);
    __ stpd(v15, v14,  d15_save);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (unsigned)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing Method*, and current sp
    //      rmethod: Method*
    //      r13: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r13, sp);
    __ blr(c_rarg4);

    // tell the simulator we have returned to the stub

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    if (NotifySimulator) {
      __ notify(Assembler::method_reentry);
    }
    // save current address for use by exception handling code

    return_address = __ pc();
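    // The type dispatch below behaves like this C sketch (illustrative
    // only, not part of the original code):
    //
    //   switch (result_type) {
    //   case T_OBJECT: case T_LONG:    *(jlong*)result   = r0;      break;
    //   case T_FLOAT:                  *(jfloat*)result  = j_farg0; break;
    //   case T_DOUBLE:                 *(jdouble*)result = j_farg0; break;
    //   default: /* treated as T_INT */ *(jint*)result   = (jint)r0;
    //   }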
    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    __ ldr(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ldr(j_rarg1, result_type);
    __ cmp(j_rarg1, T_OBJECT);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(j_rarg1, T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    // restore callee-save registers
    __ ldpd(v15, v14,   d15_save);
    __ ldpd(v13, v12,   d13_save);
    __ ldpd(v11, v10,   d11_save);
    __ ldpd(v9,  v8,    d9_save);

    __ ldp(r28, r27,   r28_save);
    __ ldp(r26, r25,   r26_save);
    __ ldp(r24, r23,   r24_save);
    __ ldp(r22, r21,   r22_save);
    __ ldp(r20, r19,   r20_save);

    __ ldp(c_rarg0, c_rarg1,  call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3,  method);
    __ ldp(c_rarg4, c_rarg5,  entry_point);
    __ ldp(c_rarg6, c_rarg7,  parameter_size);

#ifndef PRODUCT
    // tell the simulator we are about to end Java execution
    if (NotifySimulator) {
      __ notify(Assembler::method_exit);
    }
#endif
    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT

    __ BIND(is_long);
    __ str(r0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // rsp.
  //
  // r0: exception oop

  // NOTE: this is used as a target from the signal handler so it
  // needs an x86 prolog which returns into the current simulator
  // executing the generated catch_exception code. so the prolog
  // needs to install rax in a sim register and adjust the sim's
  // restart pc to enter the generated code at the start position
  // then return from native to simulated execution.
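  // In effect (illustrative summary, not in the original): the stub
  // stores r0 into thread->_pending_exception, records __FILE__ and
  // __LINE__ for debugging, and resumes at the call stub's saved
  // return address as if the Java call had returned normally.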
  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread       (rfp, thread_off        * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
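    // Register state after the call (illustrative summary, not in the
    // original): r0 holds the handler entry point returned by the VM,
    // r19 still holds the throwing pc saved from lr above.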
    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then calls into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);
    __ mov(r19, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {

    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is NULL it is OK

    // Check if the oop is in the right area of memory
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
    __ andr(c_rarg2, r0, c_rarg3);
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());

    // Compare c_rarg2 and c_rarg3.  We don't use a compare
    // instruction here because the flags register is live.
    __ eor(c_rarg2, c_rarg2, c_rarg3);
    __ cbnz(c_rarg2, error);
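    // i.e. (illustrative restatement, not in the original) take the
    // error path unless
    //   (obj & Universe::verify_oop_mask()) == Universe::verify_oop_bits()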
    // make sure klass is 'reasonable', which is not zero.
    __ load_klass(r0, r0); // get klass
    __ cbz(r0, error);     // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blrt(rscratch1, 3, 0, 1);

    return start;
  }

  void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }

  // Generate code for an array write pre barrier
  //
  //     addr    - starting address
  //     count   - element count
  //     tmp     - scratch register
  //
  //     Destroy no registers except rscratch1 and rscratch2
  //
  void gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
    case BarrierSet::G1SATBCTLogging:
      // With G1, don't generate the call if we statically know that the target is uninitialized
      if (!dest_uninitialized) {
        __ push_call_clobbered_registers();
        if (count == c_rarg0) {
          if (addr == c_rarg1) {
            // exactly backwards!!
            __ mov(rscratch1, c_rarg0);
            __ mov(c_rarg0, c_rarg1);
            __ mov(c_rarg1, rscratch1);
          } else {
            __ mov(c_rarg1, count);
            __ mov(c_rarg0, addr);
          }
        } else {
          __ mov(c_rarg0, addr);
          __ mov(c_rarg1, count);
        }
        __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2);
        __ pop_call_clobbered_registers();
      }
      break;
    case BarrierSet::CardTableForRS:
    case BarrierSet::CardTableExtension:
    case BarrierSet::ModRef:
      break;
    default:
      ShouldNotReachHere();
    }
  }

  //
  // Generate code for an array write post barrier
  //
  //  Input:
  //     start    - register containing starting address of destination array
  //     end      - register containing ending address of destination array
  //     scratch  - scratch register
  //
  //  The input registers are overwritten.
  //  The ending address is inclusive.
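  //
  //  Illustrative C sketch of the card table case below, using the
  //  same byte_map_base and card_shift the code uses (the variable
  //  names here are hypothetical, not in the original):
  //
  //    jbyte *first = byte_map_base + (start >> card_shift);
  //    jbyte *last  = byte_map_base + (end   >> card_shift);
  //    do { *last-- = 0; } while (last >= first);  // dirty cards, inclusive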
  void gen_write_ref_array_post_barrier(Register start, Register end, Register scratch) {
    assert_different_registers(start, end, scratch);
    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
    case BarrierSet::G1SATBCTLogging:
      {
        __ push_call_clobbered_registers();
        // must compute element count unless barrier set interface is changed (other platforms supply count)
        assert_different_registers(start, end, scratch);
        __ lea(scratch, Address(end, BytesPerHeapOop));
        __ sub(scratch, scratch, start);               // subtract start to get #bytes
        __ lsr(scratch, scratch, LogBytesPerHeapOop);  // convert to element count
        __ mov(c_rarg0, start);
        __ mov(c_rarg1, scratch);
        __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
        __ pop_call_clobbered_registers();
      }
      break;
    case BarrierSet::CardTableForRS:
    case BarrierSet::CardTableExtension:
      {
        CardTableModRefBS* ct = (CardTableModRefBS*)bs;
        assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");

        Label L_loop;

        __ lsr(start, start, CardTableModRefBS::card_shift);
        __ lsr(end, end, CardTableModRefBS::card_shift);
        __ sub(end, end, start); // number of bytes to copy

        const Register count = end; // 'end' register contains bytes count now
        __ load_byte_map_base(scratch);
        __ add(start, start, scratch);
        if (UseConcMarkSweepGC) {
          __ membar(__ StoreStore);
        }
        __ BIND(L_loop);
        __ strb(zr, Address(start, count));
        __ subs(count, count, 1);
        __ br(Assembler::HS, L_loop);
      }
      break;
    default:
      ShouldNotReachHere();
    }
  }

  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(Label &start, Register s, Register d, Register count,
                           copy_direction direction) {
    int unit = wordSize * direction;
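    // Note (illustrative, not in the original): the loop below loads
    // and stores at fixed offsets of 2, 4 and 6 units (4 with SIMD)
    // and pre-increments by 8 units on the final pair, so for a
    // forwards copy s and d are pre-biased down by one pair to make
    // those fixed offsets address the original start of each block.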
    int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;

    int offset;
    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r10, t6 = r11, t7 = r12;
    const Register stride = r13;

    assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1);

    Label again, drain;
    const char *stub_name;
    if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
    else
      stub_name = "backward_copy_longs";
    StubCodeMark mark(this, "StubRoutines", stub_name);
    __ align(CodeEntryAlignment);
    __ bind(start);
    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, 8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
        __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
      } else {
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        __ stp(t0, t1, Address(d, 2 * unit));
        __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L2);
    }

    __ ret(lr);
  }
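  // The main loop above is software-pipelined; in pseudocode
  // (illustrative only, forwards case, not in the original):
  //
  //   t[0..7] = s[0..7]; s += 8;      // prime the 8 registers
  //   count -= 16;
  //   while (count >= 0) {            // >= 8 words still unloaded
  //     d[0..7] = t[0..7]; d += 8;    // store the previous block ...
  //     t[0..7] = s[0..7]; s += 8;    // ... while loading the next
  //     count -= 8;
  //   }
  //   d[0..7] = t[0..7]; d += 8;      // drain the final block
  //
  // Bits 2 and 1 of count then select the 4- and 2-word tail copies;
  // bit 0 is left for the caller, as the postcondition above states.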
  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;
    int unit = wordSize * direction;

    Label Lpair, Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.

    __ tbz(count, 3 - exact_log2(granularity), Lword);
    __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
    __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }

  Label copy_f, copy_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //

  void copy_memory(bool is_aligned, Register s, Register d,
                   Register count, Register tmp, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    // <= 96 bytes do inline. Direction doesn't matter because we always
    // load all the data before writing anything
    Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
    const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
    const Register send = r17, dend = r18;

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
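    // Dispatch on total byte size (illustrative map of the branches
    // below, not in the original): 0..16 -> copy16, 17..32 -> copy32,
    // 33..64 -> the inline block just below, 65..80 (96 with SIMD)
    // -> copy80, anything larger -> copy_big.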
    __ cmp(count, (UseSIMDForMemoryOps ? 96:80)/granularity);
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, 16/granularity);
    __ br(Assembler::LS, copy16);

    __ cmp(count, 64/granularity);
    __ br(Assembler::HI, copy80);

    __ cmp(count, 32/granularity);
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 0));
      __ ldpq(v2, v3, Address(send, -32));
      __ stpq(v0, v1, Address(d, 0));
      __ stpq(v2, v3, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(send, -32));
      __ ldp(t6, t7, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(dend, -32));
      __ stp(t6, t7, Address(dend, -16));
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    __ ldp(t0, t1, Address(s, 0));
    __ ldp(t2, t3, Address(send, -16));
    __ stp(t0, t1, Address(d, 0));
    __ stp(t2, t3, Address(dend, -16));
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 0));
      __ ldpq(v2, v3, Address(s, 32));
      __ ldpq(v4, v5, Address(send, -32));
      __ stpq(v0, v1, Address(d, 0));
      __ stpq(v2, v3, Address(d, 32));
      __ stpq(v4, v5, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(s, 32));
      __ ldp(t6, t7, Address(s, 48));
      __ ldp(t8, t9, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(d, 32));
      __ stp(t6, t7, Address(d, 48));
      __ stp(t8, t9, Address(dend, -16));
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, 8/granularity);
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    __ ldr(t0, Address(s, 0));
    __ ldr(t1, Address(send, -8));
    __ str(t0, Address(d, 0));
    __ str(t1, Address(dend, -8));
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean in the 1 byte case we load/store the same
          // byte 3 times.
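          // Worked cases (illustrative, not in the original):
          // count == 1 copies s[0] three times; count == 2 copies s[0]
          // and s[1] (the middle byte coincides with the last);
          // count == 3 copies s[0], s[2], and the middle byte s[1]
          // via base + count/2.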
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }

    __ bind(copy_big);
    if (is_backwards) {
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now we've got the small case out of the way we can align the
    // source address on a 2-word boundary.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
      __ tbz(s, exact_log2(wordSize), aligned);
      __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
      __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
      __ sub(count, count, wordSize/granularity);
    } else {
      if (is_backwards) {
        __ andr(rscratch2, s, 2 * wordSize - 1);
      } else {
        __ neg(rscratch2, s);
        __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
      }
      // rscratch2 is the byte adjustment needed to align s.
      __ cbz(rscratch2, aligned);
      int shift = exact_log2(granularity);
      if (shift)  __ lsr(rscratch2, rscratch2, shift);
      __ sub(count, count, rscratch2);

#if 0
      // ?? This code is only correct for a disjoint copy.  It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, rscratch2);
        __ sub(d, d, rscratch2);
      } else {
        __ add(s, s, rscratch2);
        __ add(d, d, rscratch2);
      }
#else
      copy_memory_small(s, d, rscratch2, rscratch1, step);
#endif
    }

    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes.  Adjust the
    // count and do a bulk copy of words.
    __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
    if (direction == copy_forwards)
      __ bl(copy_f);
    else
      __ bl(copy_b);

    // And the tail.
    copy_memory_small(s, d, count, tmp, step);

    if (granularity >= 8) __ bind(copy8);
    if (granularity >= 4) __ bind(copy4);
    __ bind(finish);
  }


  void clobber_registers() {
#ifdef ASSERT
    __ mov(rscratch1, (uint64_t)0xdeadbeef);
    __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
    for (Register r = r3; r <= r18; r++)
      if (r != rscratch1) __ mov(r, rscratch1);
#endif
  }

  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers rscratch1 and rscratch2.
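  // Roughly (illustrative, not in the original):
  //   for (i = 0; i < count; i++) verify_oop(a[i]);
  // decoding each narrow oop first when the element size is 4 bytes.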
  void verify_oop_array (size_t size, Register a, Register count, Register temp) {
    Label loop, end;
    __ mov(rscratch1, a);
    __ mov(rscratch2, zr);
    __ bind(loop);
    __ cmp(rscratch2, count);
    __ br(Assembler::HS, end);
    if (size == (size_t)wordSize) {
      __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ verify_oop(temp);
    } else {
      __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ decode_heap_oop(temp); // calls verify_oop
    }
    __ add(rscratch2, rscratch2, size);
    __ b(loop);
    __ bind(end);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
                                 const char *name, bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    if (is_oop) {
      __ push(RegSet::of(d, count), sp);
      // no registers are destroyed by this call
      gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
    }
    copy_memory(aligned, s, d, count, rscratch1, size);
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
      gen_write_ref_array_post_barrier(d, count, rscratch1);
    }
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
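  //
  // The overlap test in this stub relies on unsigned arithmetic
  // (illustrative restatement, not in the original): the forwards
  // (no-overlap) entry is taken exactly when
  //   (uintptr_t)(d - s) >= count * size
  // which also covers d < s, since the subtraction wraps to a large
  // unsigned value.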
  //
  address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
                                 address *entry, const char *name,
                                 bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;

    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // use fwd copy when (d-s) above_equal (count*size)
    __ sub(rscratch1, d, s);
    __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
    __ br(Assembler::HS, nooverlap_target);

    if (is_oop) {
      __ push(RegSet::of(d, count), sp);
      // no registers are destroyed by this call
      gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
    }
    copy_memory(aligned, s, d, count, rscratch1, -size);
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, Address::uxtw(exact_log2(size))));
      gen_write_ref_array_post_barrier(d, count, rscratch1);
    }
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
  //
  address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                      address* entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_short_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_short_copy().
  //
  address generate_disjoint_short_copy(bool aligned,
                                       address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
                                       address *entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_int_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.
  // The two dwords within qwords that span cache line boundaries
  // will still be loaded and stored atomically.
  //
  address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
                                     address *entry, const char *name,
                                     bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
  }


  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_long_copy(bool aligned, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_long_copy(bool aligned,
                                      address nooverlap_target, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_oop_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_oop_copy(bool aligned,
                                     address nooverlap_target, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
                                  name, dest_uninitialized);
  }


  // Helper for generating a dynamic type check.
  // Smashes rscratch1.
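  //
  // In outline (illustrative, not in the original): the fast path
  // probes the super type slot at sub_klass + super_check_offset and
  // branches to L_success on a hit; the slow path scans the secondary
  // supers list; a definite failure falls through to L_miss.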
  void generate_type_check(Register sub_klass,
                           Register super_check_offset,
                           Register super_klass,
                           Label& L_success) {
    assert_different_registers(sub_klass, super_check_offset, super_klass);

    BLOCK_COMMENT("type_check:");

    Label L_miss;

    __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
                                     super_check_offset);
    __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);

    // Fall through on failure!
    __ BIND(L_miss);
  }

  //
  //  Generate checkcasting array copy stub
  //
  //  Input:
  //    c_rarg0   - source array address
  //    c_rarg1   - destination array address
  //    c_rarg2   - element count, treated as ssize_t, can be zero
  //    c_rarg3   - size_t ckoff (super_check_offset)
  //    c_rarg4   - oop ckval (super_klass)
  //
  //  Output:
  //    r0 ==  0  -  success
  //    r0 == -1^K - failure, where K is partial transfer count
  //
  address generate_checkcast_copy(const char *name, address *entry,
                                  bool dest_uninitialized = false) {

    Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;

    // Input registers (after setup_arg_regs)
    const Register from        = c_rarg0;   // source array address
    const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // elements count
    const Register ckoff       = c_rarg3;   // super_check_offset
    const Register ckval       = c_rarg4;   // super_klass

    // Registers used as temps (r18, r19, r20 are save-on-entry)
    const Register count_save  = r21;       // orig elements count
    const Register start_to    = r20;       // destination array start address
    const Register copied_oop  = r18;       // actual oop copied
    const Register r19_klass   = r19;       // oop._klass

    //---------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the two arrays are subtypes of Object[] but the
    // destination array type is not equal to or a supertype
    // of the source type.  Each element must be separately
    // checked.

    assert_different_registers(from, to, count, ckoff, ckval, start_to,
                               copied_oop, r19_klass, count_save);

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef ASSERT
    // caller guarantees that the arrays really are different
    // otherwise, we would have to make conjoint checks
    { Label L;
      array_overlap_test(L, TIMES_OOP);
      __ stop("checkcast_copy within a single array");
      __ bind(L);
    }
#endif //ASSERT

    // Caller of this entry point must set up the argument registers.
    if (entry != NULL) {
      *entry = __ pc();
      BLOCK_COMMENT("Entry:");
    }

    // Empty array:  Nothing to do.
    __ cbz(count, L_done);

    __ push(RegSet::of(r18, r19, r20, r21), sp);
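    // (Illustrative note, not in the original) on the failure path the
    // stub returns -1 ^ K == ~K in r0: e.g. if 3 elements were copied
    // before a type check failed, the caller sees ~3 == -4.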
#ifdef ASSERT
    BLOCK_COMMENT("assert consistent ckoff/ckval");
    // The ckoff and ckval must be mutually consistent,
    // even though caller generates both.
    { Label L;
      int sco_offset = in_bytes(Klass::super_check_offset_offset());
      __ ldrw(start_to, Address(ckval, sco_offset));
      __ cmpw(ckoff, start_to);
      __ br(Assembler::EQ, L);
      __ stop("super_check_offset inconsistent");
      __ bind(L);
    }
#endif //ASSERT

    gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);

    // save the original count
    __ mov(count_save, count);

    // Copy from low to high addresses
    __ mov(start_to, to);              // Save destination array start address
    __ b(L_load_element);

    // ======== begin loop ========
    // (Loop is rotated; its entry is L_load_element.)
    // Loop control:
    //   for (; count != 0; count--) {
    //     copied_oop = load_heap_oop(from++);
    //     ... generate_type_check ...;
    //     store_heap_oop(to++, copied_oop);
    //   }
    __ align(OptoLoopAlignment);

    __ BIND(L_store_element);
    __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop);  // store the oop
    __ sub(count, count, 1);
    __ cbz(count, L_do_card_marks);

    // ======== loop entry is here ========
    __ BIND(L_load_element);
    __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop
    __ cbz(copied_oop, L_store_element);

    __ load_klass(r19_klass, copied_oop);// query the object klass
    generate_type_check(r19_klass, ckoff, ckval, L_store_element);
    // ======== end loop ========

    // It was a real error; we must depend on the caller to finish the job.
    // Register count = remaining oops, count_orig = total oops.
    // Emit GC store barriers for the oops we have copied and report
    // their number to the caller.

    __ subs(count, count_save, count);  // K = partially copied oop count
    __ eon(count, count, zr);           // report (-1^K) to caller
    __ br(Assembler::EQ, L_done_pop);

    __ BIND(L_do_card_marks);
    __ add(to, to, -heapOopSize);       // make an inclusive end pointer
    gen_write_ref_array_post_barrier(start_to, to, rscratch1);

    __ bind(L_done_pop);
    __ pop(RegSet::of(r18, r19, r20, r21), sp);
    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);

    __ bind(L_done);
    __ mov(r0, count);
    __ leave();
    __ ret(lr);

    return start;
  }

  // Perform range checks on the proposed arraycopy.
  // Kills temp, but nothing else.
  // Also, clean the sign bits of src_pos and dst_pos.
  void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
                              Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
                              Register dst_pos, // destination position (c_rarg3)
                              Register length,
                              Register temp,
                              Label& L_failed) {
    BLOCK_COMMENT("arraycopy_range_checks:");

    assert_different_registers(rscratch1, temp);

    //  if (src_pos + length > arrayOop(src)->length())  FAIL;
    __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, src_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
    __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, dst_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
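    // A 32-bit register write zero-extends on AArch64, so the movw
    // self-moves below clear bits 63:32 (illustrative note, not in
    // the original).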
1686 __ movw(src_pos, src_pos); 1687 __ movw(dst_pos, dst_pos); 1688 1689 BLOCK_COMMENT("arraycopy_range_checks done"); 1690 } 1691 1692 // These stubs get called from some dumb test routine. 1693 // I'll write them properly when they're called from 1694 // something that's actually doing something. 1695 static void fake_arraycopy_stub(address src, address dst, int count) { 1696 assert(count == 0, "huh?"); 1697 } 1698 1699 1700 // 1701 // Generate 'unsafe' array copy stub 1702 // Though just as safe as the other stubs, it takes an unscaled 1703 // size_t argument instead of an element count. 1704 // 1705 // Input: 1706 // c_rarg0 - source array address 1707 // c_rarg1 - destination array address 1708 // c_rarg2 - byte count, treated as ssize_t, can be zero 1709 // 1710 // Examines the alignment of the operands and dispatches 1711 // to a long, int, short, or byte copy loop. 1712 // 1713 address generate_unsafe_copy(const char *name, 1714 address byte_copy_entry, 1715 address short_copy_entry, 1716 address int_copy_entry, 1717 address long_copy_entry) { 1718 Label L_long_aligned, L_int_aligned, L_short_aligned; 1719 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1720 1721 __ align(CodeEntryAlignment); 1722 StubCodeMark mark(this, "StubRoutines", name); 1723 address start = __ pc(); 1724 __ enter(); // required for proper stackwalking of RuntimeStub frame 1725 1726 // bump this on entry, not on exit: 1727 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 1728 1729 __ orr(rscratch1, s, d); 1730 __ orr(rscratch1, rscratch1, count); 1731 1732 __ andr(rscratch1, rscratch1, BytesPerLong-1); 1733 __ cbz(rscratch1, L_long_aligned); 1734 __ andr(rscratch1, rscratch1, BytesPerInt-1); 1735 __ cbz(rscratch1, L_int_aligned); 1736 __ tbz(rscratch1, 0, L_short_aligned); 1737 __ b(RuntimeAddress(byte_copy_entry)); 1738 1739 __ BIND(L_short_aligned); 1740 __ lsr(count, count, LogBytesPerShort); // size => short_count 1741 __ b(RuntimeAddress(short_copy_entry)); 1742 __ BIND(L_int_aligned); 1743 __ lsr(count, count, LogBytesPerInt); // size => int_count 1744 __ b(RuntimeAddress(int_copy_entry)); 1745 __ BIND(L_long_aligned); 1746 __ lsr(count, count, LogBytesPerLong); // size => long_count 1747 __ b(RuntimeAddress(long_copy_entry)); 1748 1749 return start; 1750 } 1751 1752 // 1753 // Generate generic array copy stubs 1754 // 1755 // Input: 1756 // c_rarg0 - src oop 1757 // c_rarg1 - src_pos (32-bits) 1758 // c_rarg2 - dst oop 1759 // c_rarg3 - dst_pos (32-bits) 1760 // c_rarg4 - element count (32-bits) 1761 // 1762 // Output: 1763 // r0 == 0 - success 1764 // r0 == -1^K - failure, where K is partial transfer count 1765 // 1766 address generate_generic_copy(const char *name, 1767 address byte_copy_entry, address short_copy_entry, 1768 address int_copy_entry, address oop_copy_entry, 1769 address long_copy_entry, address checkcast_copy_entry) { 1770 1771 Label L_failed, L_failed_0, L_objArray; 1772 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 1773 1774 // Input registers 1775 const Register src = c_rarg0; // source array oop 1776 const Register src_pos = c_rarg1; // source position 1777 const Register dst = c_rarg2; // destination array oop 1778 const Register dst_pos = c_rarg3; // destination position 1779 const Register length = c_rarg4; 1780 1781 StubCodeMark mark(this, "StubRoutines", name); 1782 1783 __ align(CodeEntryAlignment); 1784 address start = __ pc(); 1785 1786 __ enter(); // required for proper stackwalking of RuntimeStub frame 1787 1788 // bump this on entry, not on 
exit: 1789 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 1790 1791 //----------------------------------------------------------------------- 1792 // Assembler stub will be used for this call to arraycopy 1793 // if the following conditions are met: 1794 // 1795 // (1) src and dst must not be null. 1796 // (2) src_pos must not be negative. 1797 // (3) dst_pos must not be negative. 1798 // (4) length must not be negative. 1799 // (5) src klass and dst klass should be the same and not NULL. 1800 // (6) src and dst should be arrays. 1801 // (7) src_pos + length must not exceed length of src. 1802 // (8) dst_pos + length must not exceed length of dst. 1803 // 1804 1805 // if (src == NULL) return -1; 1806 __ cbz(src, L_failed); 1807 1808 // if (src_pos < 0) return -1; 1809 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 1810 1811 // if (dst == NULL) return -1; 1812 __ cbz(dst, L_failed); 1813 1814 // if (dst_pos < 0) return -1; 1815 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 1816 1817 // registers used as temp 1818 const Register scratch_length = r16; // elements count to copy 1819 const Register scratch_src_klass = r17; // array klass 1820 const Register lh = r18; // layout helper 1821 1822 // if (length < 0) return -1; 1823 __ movw(scratch_length, length); // length (elements count, 32-bits value) 1824 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 1825 1826 __ load_klass(scratch_src_klass, src); 1827 #ifdef ASSERT 1828 // assert(src->klass() != NULL); 1829 { 1830 BLOCK_COMMENT("assert klasses not null {"); 1831 Label L1, L2; 1832 __ cbnz(scratch_src_klass, L2); // it is broken if klass is NULL 1833 __ bind(L1); 1834 __ stop("broken null klass"); 1835 __ bind(L2); 1836 __ load_klass(rscratch1, dst); 1837 __ cbz(rscratch1, L1); // this would be broken also 1838 BLOCK_COMMENT("} assert klasses not null done"); 1839 } 1840 #endif 1841 1842 // Load layout helper (32-bits) 1843 // 1844 // |array_tag| | header_size | element_type | |log2_element_size| 1845 // 32 30 24 16 8 2 0 1846 // 1847 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 1848 // 1849 1850 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 1851 1852 // Handle objArrays completely differently... 1853 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 1854 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 1855 __ movw(rscratch1, objArray_lh); 1856 __ eorw(rscratch2, lh, rscratch1); 1857 __ cbzw(rscratch2, L_objArray); 1858 1859 // if (src->klass() != dst->klass()) return -1; 1860 __ load_klass(rscratch2, dst); 1861 __ eor(rscratch2, rscratch2, scratch_src_klass); 1862 __ cbnz(rscratch2, L_failed); 1863 1864 // if (!src->is_Array()) return -1; 1865 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 1866 1867 // At this point, it is known to be a typeArray (array_tag 0x3). 
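// For reference, a C sketch of how the layout helper decomposes, following
// the field diagram above (the mask widths here are assumptions read off
// that diagram; the authoritative constants live in Klass):
//
//   int log2_esize = lh & 0x3f;           // log2 of element size in bytes
//   int etype      = (lh >> 8)  & 0xff;   // BasicType of the elements
//   int hsize      = (lh >> 16) & 0xff;   // array header size in bytes
//   unsigned tag   = (unsigned)lh >> 30;  // 0x3 = typeArray, 0x2 = objArray
//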
1868 #ifdef ASSERT 1869 { 1870 BLOCK_COMMENT("assert primitive array {"); 1871 Label L; 1872 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 1873 __ cmpw(lh, rscratch2); 1874 __ br(Assembler::GE, L); 1875 __ stop("must be a primitive array"); 1876 __ bind(L); 1877 BLOCK_COMMENT("} assert primitive array done"); 1878 } 1879 #endif 1880 1881 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 1882 rscratch2, L_failed); 1883 1884 // TypeArrayKlass 1885 // 1886 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 1887 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 1888 // 1889 1890 const Register rscratch1_offset = rscratch1; // array offset 1891 const Register r18_elsize = lh; // element size 1892 1893 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 1894 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 1895 __ add(src, src, rscratch1_offset); // src array offset 1896 __ add(dst, dst, rscratch1_offset); // dst array offset 1897 BLOCK_COMMENT("choose copy loop based on element size"); 1898 1899 // The following registers must be set before the jump to the corresponding stub. 1900 const Register from = c_rarg0; // source array address 1901 const Register to = c_rarg1; // destination array address 1902 const Register count = c_rarg2; // elements count 1903 1904 // 'from', 'to' and 'count' must be set in exactly this order, since they 1905 // alias 'src', 'src_pos' and 'dst': each write clobbers an input that the later writes no longer need. 1906 1907 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 1908 1909 // The possible values of elsize are 0-3, i.e. exact_log2(element 1910 // size in bytes). We do a simple bitwise binary search. 1911 __ BIND(L_copy_bytes); 1912 __ tbnz(r18_elsize, 1, L_copy_ints); 1913 __ tbnz(r18_elsize, 0, L_copy_shorts); 1914 __ lea(from, Address(src, src_pos));// src_addr 1915 __ lea(to, Address(dst, dst_pos));// dst_addr 1916 __ movw(count, scratch_length); // length 1917 __ b(RuntimeAddress(byte_copy_entry)); 1918 1919 __ BIND(L_copy_shorts); 1920 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 1921 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 1922 __ movw(count, scratch_length); // length 1923 __ b(RuntimeAddress(short_copy_entry)); 1924 1925 __ BIND(L_copy_ints); 1926 __ tbnz(r18_elsize, 0, L_copy_longs); 1927 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 1928 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 1929 __ movw(count, scratch_length); // length 1930 __ b(RuntimeAddress(int_copy_entry)); 1931 1932 __ BIND(L_copy_longs); 1933 #ifdef ASSERT 1934 { 1935 BLOCK_COMMENT("assert long copy {"); 1936 Label L; 1937 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize 1938 __ cmpw(r18_elsize, LogBytesPerLong); 1939 __ br(Assembler::EQ, L); 1940 __ stop("must be long copy, but elsize is wrong"); 1941 __ bind(L); 1942 BLOCK_COMMENT("} assert long copy done"); 1943 } 1944 #endif 1945 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 1946 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 1947 __ movw(count, scratch_length); // length 1948 __ b(RuntimeAddress(long_copy_entry)); 1949 1950 // ObjArrayKlass 1951 __ BIND(L_objArray); 1952 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 1953 1954 Label L_plain_copy, L_checkcast_copy; 1955 // test array classes for subtyping 1956 __ load_klass(r18, dst); 1957 __ cmp(scratch_src_klass, r18); // usual case is exact
equality 1958 __ br(Assembler::NE, L_checkcast_copy); 1959 1960 // Identically typed arrays can be copied without element-wise checks. 1961 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 1962 rscratch2, L_failed); 1963 1964 __ lea(from, Address(src, src_pos, Address::lsl(3))); 1965 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 1966 __ lea(to, Address(dst, dst_pos, Address::lsl(3))); 1967 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 1968 __ movw(count, scratch_length); // length 1969 __ BIND(L_plain_copy); 1970 __ b(RuntimeAddress(oop_copy_entry)); 1971 1972 __ BIND(L_checkcast_copy); 1973 // live at this point: scratch_src_klass, scratch_length, r18 (dst_klass) 1974 { 1975 // Before looking at dst.length, make sure dst is also an objArray. 1976 __ ldrw(rscratch1, Address(r18, lh_offset)); 1977 __ movw(rscratch2, objArray_lh); 1978 __ eorw(rscratch1, rscratch1, rscratch2); 1979 __ cbnzw(rscratch1, L_failed); 1980 1981 // It is safe to examine both src.length and dst.length. 1982 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 1983 r18, L_failed); 1984 1985 const Register rscratch2_dst_klass = rscratch2; 1986 __ load_klass(rscratch2_dst_klass, dst); // reload 1987 1988 // Marshal the base address arguments now, freeing registers. 1989 __ lea(from, Address(src, src_pos, Address::lsl(3))); 1990 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 1991 __ lea(to, Address(dst, dst_pos, Address::lsl(3))); 1992 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 1993 __ movw(count, length); // length (reloaded) 1994 Register sco_temp = c_rarg3; // this register is free now 1995 assert_different_registers(from, to, count, sco_temp, 1996 rscratch2_dst_klass, scratch_src_klass); 1997 // assert_clean_int(count, sco_temp); 1998 1999 // Generate the type check. 2000 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2001 __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset)); 2002 // assert_clean_int(sco_temp, r18); 2003 generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy); 2004 2005 // Fetch destination element klass from the ObjArrayKlass header. 2006 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2007 __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset)); 2008 __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset)); 2009 2010 // the checkcast_copy loop needs two extra arguments: 2011 assert(c_rarg3 == sco_temp, "#3 already in place"); 2012 // Set up arguments for checkcast_copy_entry. 
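// The checkcast stub (generate_checkcast_copy above) expects the
// destination element klass's super_check_offset in c_rarg3 (already
// there in sco_temp) and the element klass itself in c_rarg4.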
2013 __ mov(c_rarg4, rscratch2_dst_klass); // dst.klass.element_klass 2014 __ b(RuntimeAddress(checkcast_copy_entry)); 2015 } 2016 2017 __ BIND(L_failed); 2018 __ mov(r0, -1); 2019 __ leave(); // required for proper stackwalking of RuntimeStub frame 2020 __ ret(lr); 2021 2022 return start; 2023 } 2024 2025 void generate_arraycopy_stubs() { 2026 address entry; 2027 address entry_jbyte_arraycopy; 2028 address entry_jshort_arraycopy; 2029 address entry_jint_arraycopy; 2030 address entry_oop_arraycopy; 2031 address entry_jlong_arraycopy; 2032 address entry_checkcast_arraycopy; 2033 2034 generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards); 2035 generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards); 2036 2037 //*** jbyte 2038 // Always need aligned and unaligned versions 2039 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2040 "jbyte_disjoint_arraycopy"); 2041 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2042 &entry_jbyte_arraycopy, 2043 "jbyte_arraycopy"); 2044 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2045 "arrayof_jbyte_disjoint_arraycopy"); 2046 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL, 2047 "arrayof_jbyte_arraycopy"); 2048 2049 //*** jshort 2050 // Always need aligned and unaligned versions 2051 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2052 "jshort_disjoint_arraycopy"); 2053 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2054 &entry_jshort_arraycopy, 2055 "jshort_arraycopy"); 2056 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, 2057 "arrayof_jshort_disjoint_arraycopy"); 2058 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL, 2059 "arrayof_jshort_arraycopy"); 2060 2061 //*** jint 2062 // Aligned versions 2063 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2064 "arrayof_jint_disjoint_arraycopy"); 2065 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2066 "arrayof_jint_arraycopy"); 2067 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 2068 // entry_jint_arraycopy always points to the unaligned version 2069 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2070 "jint_disjoint_arraycopy"); 2071 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2072 &entry_jint_arraycopy, 2073 "jint_arraycopy"); 2074 2075 //*** jlong 2076 // It is always aligned 2077 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2078 "arrayof_jlong_disjoint_arraycopy"); 2079 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2080 "arrayof_jlong_arraycopy"); 2081 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2082 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2083 2084 //*** oops 2085 { 2086 // With compressed oops we need unaligned versions; notice that 2087 // we overwrite entry_oop_arraycopy. 
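// The reason, as a sketch: a narrowOop element is only 4 bytes, so an
// 8-byte-aligned ("arrayof") entry point cannot be assumed safe when
// compressed oops are enabled:
//
//   size_t element_size = UseCompressedOops ? sizeof(narrowOop)  // 4
//                                           : sizeof(oop);       // 8
//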
2088 bool aligned = !UseCompressedOops; 2089 2090 StubRoutines::_arrayof_oop_disjoint_arraycopy 2091 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2092 /*dest_uninitialized*/false); 2093 StubRoutines::_arrayof_oop_arraycopy 2094 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2095 /*dest_uninitialized*/false); 2096 // Aligned versions without pre-barriers 2097 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2098 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2099 /*dest_uninitialized*/true); 2100 StubRoutines::_arrayof_oop_arraycopy_uninit 2101 = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit", 2102 /*dest_uninitialized*/true); 2103 } 2104 2105 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2106 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2107 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2108 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2109 2110 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2111 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, 2112 /*dest_uninitialized*/true); 2113 2114 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2115 entry_jbyte_arraycopy, 2116 entry_jshort_arraycopy, 2117 entry_jint_arraycopy, 2118 entry_jlong_arraycopy); 2119 2120 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2121 entry_jbyte_arraycopy, 2122 entry_jshort_arraycopy, 2123 entry_jint_arraycopy, 2124 entry_oop_arraycopy, 2125 entry_jlong_arraycopy, 2126 entry_checkcast_arraycopy); 2127 2128 } 2129 2130 void generate_math_stubs() { Unimplemented(); } 2131 2132 // Arguments: 2133 // 2134 // Inputs: 2135 // c_rarg0 - source byte array address 2136 // c_rarg1 - destination byte array address 2137 // c_rarg2 - K (key) in little endian int array 2138 // 2139 address generate_aescrypt_encryptBlock() { 2140 __ align(CodeEntryAlignment); 2141 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 2142 2143 Label L_doLast; 2144 2145 const Register from = c_rarg0; // source array address 2146 const Register to = c_rarg1; // destination array address 2147 const Register key = c_rarg2; // key array address 2148 const Register keylen = rscratch1; 2149 2150 address start = __ pc(); 2151 __ enter(); 2152 2153 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2154 2155 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2156 2157 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2158 __ rev32(v1, __ T16B, v1); 2159 __ rev32(v2, __ T16B, v2); 2160 __ rev32(v3, __ T16B, v3); 2161 __ rev32(v4, __ T16B, v4); 2162 __ aese(v0, v1); 2163 __ aesmc(v0, v0); 2164 __ aese(v0, v2); 2165 __ aesmc(v0, v0); 2166 __ aese(v0, v3); 2167 __ aesmc(v0, v0); 2168 __ aese(v0, v4); 2169 __ aesmc(v0, v0); 2170 2171 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2172 __ rev32(v1, __ T16B, v1); 2173 __ rev32(v2, __ T16B, v2); 2174 __ rev32(v3, __ T16B, v3); 2175 __ rev32(v4, __ T16B, v4); 2176 __ aese(v0, v1); 2177 __ aesmc(v0, v0); 2178 __ aese(v0, v2); 2179 __ aesmc(v0, v0); 2180 __ aese(v0, v3); 2181 __ aesmc(v0, v0); 2182 __ aese(v0, v4); 2183 __ aesmc(v0, v0); 2184 2185 __ 
ld1(v1, v2, __ T16B, __ post(key, 32)); 2186 __ rev32(v1, __ T16B, v1); 2187 __ rev32(v2, __ T16B, v2); 2188 2189 __ cmpw(keylen, 44); 2190 __ br(Assembler::EQ, L_doLast); 2191 2192 __ aese(v0, v1); 2193 __ aesmc(v0, v0); 2194 __ aese(v0, v2); 2195 __ aesmc(v0, v0); 2196 2197 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2198 __ rev32(v1, __ T16B, v1); 2199 __ rev32(v2, __ T16B, v2); 2200 2201 __ cmpw(keylen, 52); 2202 __ br(Assembler::EQ, L_doLast); 2203 2204 __ aese(v0, v1); 2205 __ aesmc(v0, v0); 2206 __ aese(v0, v2); 2207 __ aesmc(v0, v0); 2208 2209 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2210 __ rev32(v1, __ T16B, v1); 2211 __ rev32(v2, __ T16B, v2); 2212 2213 __ BIND(L_doLast); 2214 2215 __ aese(v0, v1); 2216 __ aesmc(v0, v0); 2217 __ aese(v0, v2); 2218 2219 __ ld1(v1, __ T16B, key); 2220 __ rev32(v1, __ T16B, v1); 2221 __ eor(v0, __ T16B, v0, v1); 2222 2223 __ st1(v0, __ T16B, to); 2224 2225 __ mov(r0, 0); 2226 2227 __ leave(); 2228 __ ret(lr); 2229 2230 return start; 2231 } 2232 2233 // Arguments: 2234 // 2235 // Inputs: 2236 // c_rarg0 - source byte array address 2237 // c_rarg1 - destination byte array address 2238 // c_rarg2 - K (key) in little endian int array 2239 // 2240 address generate_aescrypt_decryptBlock() { 2241 assert(UseAES, "need AES instructions and misaligned SSE support"); 2242 __ align(CodeEntryAlignment); 2243 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 2244 Label L_doLast; 2245 2246 const Register from = c_rarg0; // source array address 2247 const Register to = c_rarg1; // destination array address 2248 const Register key = c_rarg2; // key array address 2249 const Register keylen = rscratch1; 2250 2251 address start = __ pc(); 2252 __ enter(); // required for proper stackwalking of RuntimeStub frame 2253 2254 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2255 2256 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2257 2258 __ ld1(v5, __ T16B, __ post(key, 16)); 2259 __ rev32(v5, __ T16B, v5); 2260 2261 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2262 __ rev32(v1, __ T16B, v1); 2263 __ rev32(v2, __ T16B, v2); 2264 __ rev32(v3, __ T16B, v3); 2265 __ rev32(v4, __ T16B, v4); 2266 __ aesd(v0, v1); 2267 __ aesimc(v0, v0); 2268 __ aesd(v0, v2); 2269 __ aesimc(v0, v0); 2270 __ aesd(v0, v3); 2271 __ aesimc(v0, v0); 2272 __ aesd(v0, v4); 2273 __ aesimc(v0, v0); 2274 2275 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2276 __ rev32(v1, __ T16B, v1); 2277 __ rev32(v2, __ T16B, v2); 2278 __ rev32(v3, __ T16B, v3); 2279 __ rev32(v4, __ T16B, v4); 2280 __ aesd(v0, v1); 2281 __ aesimc(v0, v0); 2282 __ aesd(v0, v2); 2283 __ aesimc(v0, v0); 2284 __ aesd(v0, v3); 2285 __ aesimc(v0, v0); 2286 __ aesd(v0, v4); 2287 __ aesimc(v0, v0); 2288 2289 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2290 __ rev32(v1, __ T16B, v1); 2291 __ rev32(v2, __ T16B, v2); 2292 2293 __ cmpw(keylen, 44); 2294 __ br(Assembler::EQ, L_doLast); 2295 2296 __ aesd(v0, v1); 2297 __ aesimc(v0, v0); 2298 __ aesd(v0, v2); 2299 __ aesimc(v0, v0); 2300 2301 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2302 __ rev32(v1, __ T16B, v1); 2303 __ rev32(v2, __ T16B, v2); 2304 2305 __ cmpw(keylen, 52); 2306 __ br(Assembler::EQ, L_doLast); 2307 2308 __ aesd(v0, v1); 2309 __ aesimc(v0, v0); 2310 __ aesd(v0, v2); 2311 __ aesimc(v0, v0); 2312 2313 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2314 __ rev32(v1, __ T16B, v1); 2315 __ rev32(v2, __ T16B, v2); 2316 2317 __ BIND(L_doLast); 2318 2319 __ aesd(v0, v1); 2320 __ aesimc(v0, v0); 
2321 __ aesd(v0, v2); 2322 2323 __ eor(v0, __ T16B, v0, v5); 2324 2325 __ st1(v0, __ T16B, to); 2326 2327 __ mov(r0, 0); 2328 2329 __ leave(); 2330 __ ret(lr); 2331 2332 return start; 2333 } 2334 2335 // Arguments: 2336 // 2337 // Inputs: 2338 // c_rarg0 - source byte array address 2339 // c_rarg1 - destination byte array address 2340 // c_rarg2 - K (key) in little endian int array 2341 // c_rarg3 - r vector byte array address 2342 // c_rarg4 - input length 2343 // 2344 // Output: 2345 // x0 - input length 2346 // 2347 address generate_cipherBlockChaining_encryptAESCrypt() { 2348 assert(UseAES, "need AES instructions and misaligned SSE support"); 2349 __ align(CodeEntryAlignment); 2350 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 2351 2352 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2353 2354 const Register from = c_rarg0; // source array address 2355 const Register to = c_rarg1; // destination array address 2356 const Register key = c_rarg2; // key array address 2357 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2358 // and left with the results of the last encryption block 2359 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2360 const Register keylen = rscratch1; 2361 2362 address start = __ pc(); 2363 __ enter(); 2364 2365 __ mov(rscratch2, len_reg); 2366 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2367 2368 __ ld1(v0, __ T16B, rvec); 2369 2370 __ cmpw(keylen, 52); 2371 __ br(Assembler::CC, L_loadkeys_44); 2372 __ br(Assembler::EQ, L_loadkeys_52); 2373 2374 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2375 __ rev32(v17, __ T16B, v17); 2376 __ rev32(v18, __ T16B, v18); 2377 __ BIND(L_loadkeys_52); 2378 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2379 __ rev32(v19, __ T16B, v19); 2380 __ rev32(v20, __ T16B, v20); 2381 __ BIND(L_loadkeys_44); 2382 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2383 __ rev32(v21, __ T16B, v21); 2384 __ rev32(v22, __ T16B, v22); 2385 __ rev32(v23, __ T16B, v23); 2386 __ rev32(v24, __ T16B, v24); 2387 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2388 __ rev32(v25, __ T16B, v25); 2389 __ rev32(v26, __ T16B, v26); 2390 __ rev32(v27, __ T16B, v27); 2391 __ rev32(v28, __ T16B, v28); 2392 __ ld1(v29, v30, v31, __ T16B, key); 2393 __ rev32(v29, __ T16B, v29); 2394 __ rev32(v30, __ T16B, v30); 2395 __ rev32(v31, __ T16B, v31); 2396 2397 __ BIND(L_aes_loop); 2398 __ ld1(v1, __ T16B, __ post(from, 16)); 2399 __ eor(v0, __ T16B, v0, v1); 2400 2401 __ br(Assembler::CC, L_rounds_44); 2402 __ br(Assembler::EQ, L_rounds_52); 2403 2404 __ aese(v0, v17); __ aesmc(v0, v0); 2405 __ aese(v0, v18); __ aesmc(v0, v0); 2406 __ BIND(L_rounds_52); 2407 __ aese(v0, v19); __ aesmc(v0, v0); 2408 __ aese(v0, v20); __ aesmc(v0, v0); 2409 __ BIND(L_rounds_44); 2410 __ aese(v0, v21); __ aesmc(v0, v0); 2411 __ aese(v0, v22); __ aesmc(v0, v0); 2412 __ aese(v0, v23); __ aesmc(v0, v0); 2413 __ aese(v0, v24); __ aesmc(v0, v0); 2414 __ aese(v0, v25); __ aesmc(v0, v0); 2415 __ aese(v0, v26); __ aesmc(v0, v0); 2416 __ aese(v0, v27); __ aesmc(v0, v0); 2417 __ aese(v0, v28); __ aesmc(v0, v0); 2418 __ aese(v0, v29); __ aesmc(v0, v0); 2419 __ aese(v0, v30); 2420 __ eor(v0, __ T16B, v0, v31); 2421 2422 __ st1(v0, __ T16B, __ post(to, 16)); 2423 __ sub(len_reg, len_reg, 16); 2424 __ cbnz(len_reg, L_aes_loop); 2425 2426 __ st1(v0, __ T16B, rvec); 2427 2428 __ mov(r0, rscratch2); 
2429 2430 __ leave(); 2431 __ ret(lr); 2432 2433 return start; 2434 } 2435 2436 // Arguments: 2437 // 2438 // Inputs: 2439 // c_rarg0 - source byte array address 2440 // c_rarg1 - destination byte array address 2441 // c_rarg2 - K (key) in little endian int array 2442 // c_rarg3 - r vector byte array address 2443 // c_rarg4 - input length 2444 // 2445 // Output: 2446 // r0 - input length 2447 // 2448 address generate_cipherBlockChaining_decryptAESCrypt() { 2449 assert(UseAES, "need AES instructions and misaligned SSE support"); 2450 __ align(CodeEntryAlignment); 2451 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 2452 2453 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2454 2455 const Register from = c_rarg0; // source array address 2456 const Register to = c_rarg1; // destination array address 2457 const Register key = c_rarg2; // key array address 2458 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2459 // and left with the results of the last encryption block 2460 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2461 const Register keylen = rscratch1; 2462 2463 address start = __ pc(); 2464 __ enter(); 2465 2466 __ mov(rscratch2, len_reg); 2467 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2468 2469 __ ld1(v2, __ T16B, rvec); 2470 2471 __ ld1(v31, __ T16B, __ post(key, 16)); 2472 __ rev32(v31, __ T16B, v31); 2473 2474 __ cmpw(keylen, 52); 2475 __ br(Assembler::CC, L_loadkeys_44); 2476 __ br(Assembler::EQ, L_loadkeys_52); 2477 2478 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2479 __ rev32(v17, __ T16B, v17); 2480 __ rev32(v18, __ T16B, v18); 2481 __ BIND(L_loadkeys_52); 2482 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2483 __ rev32(v19, __ T16B, v19); 2484 __ rev32(v20, __ T16B, v20); 2485 __ BIND(L_loadkeys_44); 2486 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2487 __ rev32(v21, __ T16B, v21); 2488 __ rev32(v22, __ T16B, v22); 2489 __ rev32(v23, __ T16B, v23); 2490 __ rev32(v24, __ T16B, v24); 2491 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2492 __ rev32(v25, __ T16B, v25); 2493 __ rev32(v26, __ T16B, v26); 2494 __ rev32(v27, __ T16B, v27); 2495 __ rev32(v28, __ T16B, v28); 2496 __ ld1(v29, v30, __ T16B, key); 2497 __ rev32(v29, __ T16B, v29); 2498 __ rev32(v30, __ T16B, v30); 2499 2500 __ BIND(L_aes_loop); 2501 __ ld1(v0, __ T16B, __ post(from, 16)); 2502 __ orr(v1, __ T16B, v0, v0); 2503 2504 __ br(Assembler::CC, L_rounds_44); 2505 __ br(Assembler::EQ, L_rounds_52); 2506 2507 __ aesd(v0, v17); __ aesimc(v0, v0); 2508 __ aesd(v0, v18); __ aesimc(v0, v0); 2509 __ BIND(L_rounds_52); 2510 __ aesd(v0, v19); __ aesimc(v0, v0); 2511 __ aesd(v0, v20); __ aesimc(v0, v0); 2512 __ BIND(L_rounds_44); 2513 __ aesd(v0, v21); __ aesimc(v0, v0); 2514 __ aesd(v0, v22); __ aesimc(v0, v0); 2515 __ aesd(v0, v23); __ aesimc(v0, v0); 2516 __ aesd(v0, v24); __ aesimc(v0, v0); 2517 __ aesd(v0, v25); __ aesimc(v0, v0); 2518 __ aesd(v0, v26); __ aesimc(v0, v0); 2519 __ aesd(v0, v27); __ aesimc(v0, v0); 2520 __ aesd(v0, v28); __ aesimc(v0, v0); 2521 __ aesd(v0, v29); __ aesimc(v0, v0); 2522 __ aesd(v0, v30); 2523 __ eor(v0, __ T16B, v0, v31); 2524 __ eor(v0, __ T16B, v0, v2); 2525 2526 __ st1(v0, __ T16B, __ post(to, 16)); 2527 __ orr(v2, __ T16B, v1, v1); 2528 2529 __ sub(len_reg, len_reg, 16); 2530 __ cbnz(len_reg, L_aes_loop); 2531 2532 __ st1(v2, __ T16B, rvec); 2533 2534 __ mov(r0, 
rscratch2); 2535 2536 __ leave(); 2537 __ ret(lr); 2538 2539 return start; 2540 } 2541 2542 // Arguments: 2543 // 2544 // Inputs: 2545 // c_rarg0 - byte[] source+offset 2546 // c_rarg1 - int[] SHA.state 2547 // c_rarg2 - int offset 2548 // c_rarg3 - int limit 2549 // 2550 address generate_sha1_implCompress(bool multi_block, const char *name) { 2551 __ align(CodeEntryAlignment); 2552 StubCodeMark mark(this, "StubRoutines", name); 2553 address start = __ pc(); 2554 2555 Register buf = c_rarg0; 2556 Register state = c_rarg1; 2557 Register ofs = c_rarg2; 2558 Register limit = c_rarg3; 2559 2560 Label keys; 2561 Label sha1_loop; 2562 2563 // load the keys into v0..v3 2564 __ adr(rscratch1, keys); 2565 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 2566 // load 5 words state into v6, v7 2567 __ ldrq(v6, Address(state, 0)); 2568 __ ldrs(v7, Address(state, 16)); 2569 2570 2571 __ BIND(sha1_loop); 2572 // load 64 bytes of data into v16..v19 2573 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 2574 __ rev32(v16, __ T16B, v16); 2575 __ rev32(v17, __ T16B, v17); 2576 __ rev32(v18, __ T16B, v18); 2577 __ rev32(v19, __ T16B, v19); 2578 2579 // do the sha1 2580 __ addv(v4, __ T4S, v16, v0); 2581 __ orr(v20, __ T16B, v6, v6); 2582 2583 FloatRegister d0 = v16; 2584 FloatRegister d1 = v17; 2585 FloatRegister d2 = v18; 2586 FloatRegister d3 = v19; 2587 2588 for (int round = 0; round < 20; round++) { 2589 FloatRegister tmp1 = (round & 1) ? v4 : v5; 2590 FloatRegister tmp2 = (round & 1) ? v21 : v22; 2591 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 2592 FloatRegister tmp4 = (round & 1) ? v5 : v4; 2593 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3)); 2594 2595 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 2596 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 2597 __ sha1h(tmp2, __ T4S, v20); 2598 if (round < 5) 2599 __ sha1c(v20, __ T4S, tmp3, tmp4); 2600 else if (round < 10 || round >= 15) 2601 __ sha1p(v20, __ T4S, tmp3, tmp4); 2602 else 2603 __ sha1m(v20, __ T4S, tmp3, tmp4); 2604 if (round < 16) __ sha1su1(d0, __ T4S, d3); 2605 2606 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 2607 } 2608 2609 __ addv(v7, __ T2S, v7, v21); 2610 __ addv(v6, __ T4S, v6, v20); 2611 2612 if (multi_block) { 2613 __ add(ofs, ofs, 64); 2614 __ cmp(ofs, limit); 2615 __ br(Assembler::LE, sha1_loop); 2616 __ mov(c_rarg0, ofs); // return ofs 2617 } 2618 2619 __ strq(v6, Address(state, 0)); 2620 __ strs(v7, Address(state, 16)); 2621 2622 __ ret(lr); 2623 2624 __ bind(keys); 2625 __ emit_int32(0x5a827999); 2626 __ emit_int32(0x6ed9eba1); 2627 __ emit_int32(0x8f1bbcdc); 2628 __ emit_int32(0xca62c1d6); 2629 2630 return start; 2631 } 2632 2633 2634 // Arguments: 2635 // 2636 // Inputs: 2637 // c_rarg0 - byte[] source+offset 2638 // c_rarg1 - int[] SHA.state 2639 // c_rarg2 - int offset 2640 // c_rarg3 - int limit 2641 // 2642 address generate_sha256_implCompress(bool multi_block, const char *name) { 2643 static const uint32_t round_consts[64] = { 2644 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 2645 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 2646 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 2647 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 2648 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 2649 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 2650 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 2651 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 2652 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 2653 0x650a7354, 0x766a0abb, 
0x81c2c92e, 0x92722c85, 2654 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 2655 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 2656 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 2657 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 2658 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 2659 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 2660 }; 2661 __ align(CodeEntryAlignment); 2662 StubCodeMark mark(this, "StubRoutines", name); 2663 address start = __ pc(); 2664 2665 Register buf = c_rarg0; 2666 Register state = c_rarg1; 2667 Register ofs = c_rarg2; 2668 Register limit = c_rarg3; 2669 2670 Label sha1_loop; 2671 2672 __ stpd(v8, v9, __ pre(sp, -32)); 2673 __ stpd(v10, v11, Address(sp, 16)); 2674 2675 // dga == v0 2676 // dgb == v1 2677 // dg0 == v2 2678 // dg1 == v3 2679 // dg2 == v4 2680 // t0 == v6 2681 // t1 == v7 2682 2683 // load 16 keys to v16..v31 2684 __ lea(rscratch1, ExternalAddress((address)round_consts)); 2685 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 2686 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 2687 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 2688 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 2689 2690 // load 8 words (256 bits) state 2691 __ ldpq(v0, v1, state); 2692 2693 __ BIND(sha1_loop); 2694 // load 64 bytes of data into v8..v11 2695 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf); 2696 __ rev32(v8, __ T16B, v8); 2697 __ rev32(v9, __ T16B, v9); 2698 __ rev32(v10, __ T16B, v10); 2699 __ rev32(v11, __ T16B, v11); 2700 2701 __ addv(v6, __ T4S, v8, v16); 2702 __ orr(v2, __ T16B, v0, v0); 2703 __ orr(v3, __ T16B, v1, v1); 2704 2705 FloatRegister d0 = v8; 2706 FloatRegister d1 = v9; 2707 FloatRegister d2 = v10; 2708 FloatRegister d3 = v11; 2709 2710 2711 for (int round = 0; round < 16; round++) { 2712 FloatRegister tmp1 = (round & 1) ? v6 : v7; 2713 FloatRegister tmp2 = (round & 1) ? v7 : v6; 2714 FloatRegister tmp3 = (round & 1) ? v2 : v4; 2715 FloatRegister tmp4 = (round & 1) ? v4 : v2; 2716 2717 if (round < 12) __ sha256su0(d0, __ T4S, d1); 2718 __ orr(v4, __ T16B, v2, v2); 2719 if (round < 15) 2720 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 2721 __ sha256h(v2, __ T4S, v3, tmp2); 2722 __ sha256h2(v3, __ T4S, v4, tmp2); 2723 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 2724 2725 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 2726 } 2727 2728 __ addv(v0, __ T4S, v0, v2); 2729 __ addv(v1, __ T4S, v1, v3); 2730 2731 if (multi_block) { 2732 __ add(ofs, ofs, 64); 2733 __ cmp(ofs, limit); 2734 __ br(Assembler::LE, sha1_loop); 2735 __ mov(c_rarg0, ofs); // return ofs 2736 } 2737 2738 __ ldpd(v10, v11, Address(sp, 16)); 2739 __ ldpd(v8, v9, __ post(sp, 32)); 2740 2741 __ stpq(v0, v1, state); 2742 2743 __ ret(lr); 2744 2745 return start; 2746 } 2747 2748 #ifndef BUILTIN_SIM 2749 // Safefetch stubs. 2750 void generate_safefetch(const char* name, int size, address* entry, 2751 address* fault_pc, address* continuation_pc) { 2752 // safefetch signatures: 2753 // int SafeFetch32(int* adr, int errValue); 2754 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue); 2755 // 2756 // arguments: 2757 // c_rarg0 = adr 2758 // c_rarg1 = errValue 2759 // 2760 // result: 2761 // r0 = *adr or errValue 2762 2763 StubCodeMark mark(this, "StubRoutines", name); 2764 2765 // Entry point, pc or function descriptor. 2766 *entry = __ pc(); 2767 2768 // Load *adr into c_rarg1, may fault.
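// If it does fault, the VM's signal handler is expected to recognize
// fault_pc and resume execution at continuation_pc with errValue still
// in c_rarg1; as a C sketch, the net effect is:
//
//   int SafeFetch32(int* adr, int errValue) {
//     return readable(adr) ? *adr : errValue;   // 'readable' is informal
//   }
//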
2769 *fault_pc = __ pc(); 2770 switch (size) { 2771 case 4: 2772 // int32_t 2773 __ ldrw(c_rarg1, Address(c_rarg0, 0)); 2774 break; 2775 case 8: 2776 // int64_t 2777 __ ldr(c_rarg1, Address(c_rarg0, 0)); 2778 break; 2779 default: 2780 ShouldNotReachHere(); 2781 } 2782 2783 // return errValue or *adr 2784 *continuation_pc = __ pc(); 2785 __ mov(r0, c_rarg1); 2786 __ ret(lr); 2787 } 2788 #endif 2789 2790 /** 2791 * Arguments: 2792 * 2793 * Inputs: 2794 * c_rarg0 - int crc 2795 * c_rarg1 - byte* buf 2796 * c_rarg2 - int length 2797 * 2798 * Output: 2799 * r0 - int crc result 2800 */ 2801 address generate_updateBytesCRC32() { 2802 assert(UseCRC32Intrinsics, "what are we doing here?"); 2803 2804 __ align(CodeEntryAlignment); 2805 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32"); 2806 2807 address start = __ pc(); 2808 2809 const Register crc = c_rarg0; // crc 2810 const Register buf = c_rarg1; // source java byte array address 2811 const Register len = c_rarg2; // length 2812 const Register table0 = c_rarg3; // crc_table address 2813 const Register table1 = c_rarg4; 2814 const Register table2 = c_rarg5; 2815 const Register table3 = c_rarg6; 2816 const Register tmp3 = c_rarg7; 2817 2818 BLOCK_COMMENT("Entry:"); 2819 __ enter(); // required for proper stackwalking of RuntimeStub frame 2820 2821 __ kernel_crc32(crc, buf, len, 2822 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 2823 2824 __ leave(); // required for proper stackwalking of RuntimeStub frame 2825 __ ret(lr); 2826 2827 return start; 2828 } 2829 2830 /** 2831 * Arguments: 2832 * 2833 * Inputs: 2834 * c_rarg0 - int crc 2835 * c_rarg1 - byte* buf 2836 * c_rarg2 - int length 2837 * c_rarg3 - int* table 2838 * 2839 * Output: 2840 * r0 - int crc result 2841 */ 2842 address generate_updateBytesCRC32C() { 2843 assert(UseCRC32CIntrinsics, "what are we doing here?"); 2844 2845 __ align(CodeEntryAlignment); 2846 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C"); 2847 2848 address start = __ pc(); 2849 2850 const Register crc = c_rarg0; // crc 2851 const Register buf = c_rarg1; // source java byte array address 2852 const Register len = c_rarg2; // length 2853 const Register table0 = c_rarg3; // crc_table address 2854 const Register table1 = c_rarg4; 2855 const Register table2 = c_rarg5; 2856 const Register table3 = c_rarg6; 2857 const Register tmp3 = c_rarg7; 2858 2859 BLOCK_COMMENT("Entry:"); 2860 __ enter(); // required for proper stackwalking of RuntimeStub frame 2861 2862 __ kernel_crc32c(crc, buf, len, 2863 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 2864 2865 __ leave(); // required for proper stackwalking of RuntimeStub frame 2866 __ ret(lr); 2867 2868 return start; 2869 } 2870 2871 /** 2872 * Arguments: 2873 * 2874 * Inputs: 2875 * c_rarg0 - int adler 2876 * c_rarg1 - byte* buff 2877 * c_rarg2 - int len 2878 * 2879 * Output: 2880 * c_rarg0 - int adler result 2881 */ 2882 address generate_updateBytesAdler32() { 2883 __ align(CodeEntryAlignment); 2884 StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32"); 2885 address start = __ pc(); 2886 2887 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 2888 2889 // Aliases 2890 Register adler = c_rarg0; 2891 Register s1 = c_rarg0; 2892 Register s2 = c_rarg3; 2893 Register buff = c_rarg1; 2894 Register len = c_rarg2; 2895 Register nmax = r4; 2896 Register base = r5; 2897 Register count = r6; 2898 Register temp0 = rscratch1; 2899 Register temp1 = rscratch2; 2900 Register temp2 = r7;
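// The code below is an unrolled and software-pipelined version of the
// scalar Adler-32 loop; as a plain C sketch of what it computes
// (reference only, not the emitted code):
//
//   uint32_t s1 = adler & 0xffff, s2 = adler >> 16;
//   while (len > 0) {
//     int n = len < NMAX ? len : NMAX;  // NMAX keeps s2 below 2^32
//     len -= n;
//     do { s1 += *buff++; s2 += s1; } while (--n);
//     s1 %= BASE; s2 %= BASE;           // BASE == 65521
//   }
//   return (s2 << 16) | s1;
//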
2901 2902 // Max number of bytes we can process before having to take the mod 2903 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 2904 unsigned long BASE = 0xfff1; 2905 unsigned long NMAX = 0x15B0; 2906 2907 __ mov(base, BASE); 2908 __ mov(nmax, NMAX); 2909 2910 // s1 is initialized to the lower 16 bits of adler 2911 // s2 is initialized to the upper 16 bits of adler 2912 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 2913 __ uxth(s1, adler); // s1 = (adler & 0xffff) 2914 2915 // The pipelined loop needs at least 16 elements for 1 iteration 2916 // It does check this, but it is more effective to skip to the cleanup loop 2917 __ cmp(len, 16); 2918 __ br(Assembler::HS, L_nmax); 2919 __ cbz(len, L_combine); 2920 2921 __ bind(L_simple_by1_loop); 2922 __ ldrb(temp0, Address(__ post(buff, 1))); 2923 __ add(s1, s1, temp0); 2924 __ add(s2, s2, s1); 2925 __ subs(len, len, 1); 2926 __ br(Assembler::HI, L_simple_by1_loop); 2927 2928 // s1 = s1 % BASE 2929 __ subs(temp0, s1, base); 2930 __ csel(s1, temp0, s1, Assembler::HS); 2931 2932 // s2 = s2 % BASE 2933 __ lsr(temp0, s2, 16); 2934 __ lsl(temp1, temp0, 4); 2935 __ sub(temp1, temp1, temp0); 2936 __ add(s2, temp1, s2, ext::uxth); 2937 2938 __ subs(temp0, s2, base); 2939 __ csel(s2, temp0, s2, Assembler::HS); 2940 2941 __ b(L_combine); 2942 2943 __ bind(L_nmax); 2944 __ subs(len, len, nmax); 2945 __ sub(count, nmax, 16); 2946 __ br(Assembler::LO, L_by16); 2947 2948 __ bind(L_nmax_loop); 2949 2950 __ ldp(temp0, temp1, Address(__ post(buff, 16))); 2951 2952 __ add(s1, s1, temp0, ext::uxtb); 2953 __ ubfx(temp2, temp0, 8, 8); 2954 __ add(s2, s2, s1); 2955 __ add(s1, s1, temp2); 2956 __ ubfx(temp2, temp0, 16, 8); 2957 __ add(s2, s2, s1); 2958 __ add(s1, s1, temp2); 2959 __ ubfx(temp2, temp0, 24, 8); 2960 __ add(s2, s2, s1); 2961 __ add(s1, s1, temp2); 2962 __ ubfx(temp2, temp0, 32, 8); 2963 __ add(s2, s2, s1); 2964 __ add(s1, s1, temp2); 2965 __ ubfx(temp2, temp0, 40, 8); 2966 __ add(s2, s2, s1); 2967 __ add(s1, s1, temp2); 2968 __ ubfx(temp2, temp0, 48, 8); 2969 __ add(s2, s2, s1); 2970 __ add(s1, s1, temp2); 2971 __ add(s2, s2, s1); 2972 __ add(s1, s1, temp0, Assembler::LSR, 56); 2973 __ add(s2, s2, s1); 2974 2975 __ add(s1, s1, temp1, ext::uxtb); 2976 __ ubfx(temp2, temp1, 8, 8); 2977 __ add(s2, s2, s1); 2978 __ add(s1, s1, temp2); 2979 __ ubfx(temp2, temp1, 16, 8); 2980 __ add(s2, s2, s1); 2981 __ add(s1, s1, temp2); 2982 __ ubfx(temp2, temp1, 24, 8); 2983 __ add(s2, s2, s1); 2984 __ add(s1, s1, temp2); 2985 __ ubfx(temp2, temp1, 32, 8); 2986 __ add(s2, s2, s1); 2987 __ add(s1, s1, temp2); 2988 __ ubfx(temp2, temp1, 40, 8); 2989 __ add(s2, s2, s1); 2990 __ add(s1, s1, temp2); 2991 __ ubfx(temp2, temp1, 48, 8); 2992 __ add(s2, s2, s1); 2993 __ add(s1, s1, temp2); 2994 __ add(s2, s2, s1); 2995 __ add(s1, s1, temp1, Assembler::LSR, 56); 2996 __ add(s2, s2, s1); 2997 2998 __ subs(count, count, 16); 2999 __ br(Assembler::HS, L_nmax_loop); 3000 3001 // s1 = s1 % BASE 3002 __ lsr(temp0, s1, 16); 3003 __ lsl(temp1, temp0, 4); 3004 __ sub(temp1, temp1, temp0); 3005 __ add(temp1, temp1, s1, ext::uxth); 3006 3007 __ lsr(temp0, temp1, 16); 3008 __ lsl(s1, temp0, 4); 3009 __ sub(s1, s1, temp0); 3010 __ add(s1, s1, temp1, ext:: uxth); 3011 3012 __ subs(temp0, s1, base); 3013 __ csel(s1, temp0, s1, Assembler::HS); 3014 3015 // s2 = s2 % BASE 3016 __ lsr(temp0, s2, 16); 3017 __ lsl(temp1, temp0, 4); 3018 __ sub(temp1, temp1, temp0); 3019 __ add(temp1, temp1, s2, ext::uxth); 3020 3021 __ lsr(temp0, temp1, 16); 
3022 __ lsl(s2, temp0, 4); 3023 __ sub(s2, s2, temp0); 3024 __ add(s2, s2, temp1, ext::uxth); 3025 3026 __ subs(temp0, s2, base); 3027 __ csel(s2, temp0, s2, Assembler::HS); 3028 3029 __ subs(len, len, nmax); 3030 __ sub(count, nmax, 16); 3031 __ br(Assembler::HS, L_nmax_loop); 3032 3033 __ bind(L_by16); 3034 __ adds(len, len, count); 3035 __ br(Assembler::LO, L_by1); 3036 3037 __ bind(L_by16_loop); 3038 3039 __ ldp(temp0, temp1, Address(__ post(buff, 16))); 3040 3041 __ add(s1, s1, temp0, ext::uxtb); 3042 __ ubfx(temp2, temp0, 8, 8); 3043 __ add(s2, s2, s1); 3044 __ add(s1, s1, temp2); 3045 __ ubfx(temp2, temp0, 16, 8); 3046 __ add(s2, s2, s1); 3047 __ add(s1, s1, temp2); 3048 __ ubfx(temp2, temp0, 24, 8); 3049 __ add(s2, s2, s1); 3050 __ add(s1, s1, temp2); 3051 __ ubfx(temp2, temp0, 32, 8); 3052 __ add(s2, s2, s1); 3053 __ add(s1, s1, temp2); 3054 __ ubfx(temp2, temp0, 40, 8); 3055 __ add(s2, s2, s1); 3056 __ add(s1, s1, temp2); 3057 __ ubfx(temp2, temp0, 48, 8); 3058 __ add(s2, s2, s1); 3059 __ add(s1, s1, temp2); 3060 __ add(s2, s2, s1); 3061 __ add(s1, s1, temp0, Assembler::LSR, 56); 3062 __ add(s2, s2, s1); 3063 3064 __ add(s1, s1, temp1, ext::uxtb); 3065 __ ubfx(temp2, temp1, 8, 8); 3066 __ add(s2, s2, s1); 3067 __ add(s1, s1, temp2); 3068 __ ubfx(temp2, temp1, 16, 8); 3069 __ add(s2, s2, s1); 3070 __ add(s1, s1, temp2); 3071 __ ubfx(temp2, temp1, 24, 8); 3072 __ add(s2, s2, s1); 3073 __ add(s1, s1, temp2); 3074 __ ubfx(temp2, temp1, 32, 8); 3075 __ add(s2, s2, s1); 3076 __ add(s1, s1, temp2); 3077 __ ubfx(temp2, temp1, 40, 8); 3078 __ add(s2, s2, s1); 3079 __ add(s1, s1, temp2); 3080 __ ubfx(temp2, temp1, 48, 8); 3081 __ add(s2, s2, s1); 3082 __ add(s1, s1, temp2); 3083 __ add(s2, s2, s1); 3084 __ add(s1, s1, temp1, Assembler::LSR, 56); 3085 __ add(s2, s2, s1); 3086 3087 __ subs(len, len, 16); 3088 __ br(Assembler::HS, L_by16_loop); 3089 3090 __ bind(L_by1); 3091 __ adds(len, len, 15); 3092 __ br(Assembler::LO, L_do_mod); 3093 3094 __ bind(L_by1_loop); 3095 __ ldrb(temp0, Address(__ post(buff, 1))); 3096 __ add(s1, temp0, s1); 3097 __ add(s2, s2, s1); 3098 __ subs(len, len, 1); 3099 __ br(Assembler::HS, L_by1_loop); 3100 3101 __ bind(L_do_mod); 3102 // s1 = s1 % BASE 3103 __ lsr(temp0, s1, 16); 3104 __ lsl(temp1, temp0, 4); 3105 __ sub(temp1, temp1, temp0); 3106 __ add(temp1, temp1, s1, ext::uxth); 3107 3108 __ lsr(temp0, temp1, 16); 3109 __ lsl(s1, temp0, 4); 3110 __ sub(s1, s1, temp0); 3111 __ add(s1, s1, temp1, ext::uxth); 3112 3113 __ subs(temp0, s1, base); 3114 __ csel(s1, temp0, s1, Assembler::HS); 3115 3116 // s2 = s2 % BASE 3117 __ lsr(temp0, s2, 16); 3118 __ lsl(temp1, temp0, 4); 3119 __ sub(temp1, temp1, temp0); 3120 __ add(temp1, temp1, s2, ext::uxth); 3121 3122 __ lsr(temp0, temp1, 16); 3123 __ lsl(s2, temp0, 4); 3124 __ sub(s2, s2, temp0); 3125 __ add(s2, s2, temp1, ext::uxth); 3126 3127 __ subs(temp0, s2, base); 3128 __ csel(s2, temp0, s2, Assembler::HS); 3129 3130 // Combine lower bits and higher bits 3131 __ bind(L_combine); 3132 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) 3133 3134 __ ret(lr); 3135 3136 return start; 3137 } 3138 3139 /** 3140 * Arguments: 3141 * 3142 * Input: 3143 * c_rarg0 - x address 3144 * c_rarg1 - x length 3145 * c_rarg2 - y address 3146 * c_rarg3 - y length 3147 * c_rarg4 - z address 3148 * c_rarg5 - z length 3149 */ 3150 address generate_multiplyToLen() { 3151 __ align(CodeEntryAlignment); 3152 StubCodeMark mark(this, "StubRoutines", "multiplyToLen"); 3153 3154 address start = __ pc(); 3155 const Register x = r0;
3156 const Register xlen = r1; 3157 const Register y = r2; 3158 const Register ylen = r3; 3159 const Register z = r4; 3160 const Register zlen = r5; 3161 3162 const Register tmp1 = r10; 3163 const Register tmp2 = r11; 3164 const Register tmp3 = r12; 3165 const Register tmp4 = r13; 3166 const Register tmp5 = r14; 3167 const Register tmp6 = r15; 3168 const Register tmp7 = r16; 3169 3170 BLOCK_COMMENT("Entry:"); 3171 __ enter(); // required for proper stackwalking of RuntimeStub frame 3172 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 3173 __ leave(); // required for proper stackwalking of RuntimeStub frame 3174 __ ret(lr); 3175 3176 return start; 3177 } 3178 3179 void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi, 3180 FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0, 3181 FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) { 3182 // Karatsuba multiplication performs a 128*128 -> 256-bit 3183 // multiplication in three 128-bit multiplications and a few 3184 // additions. 3185 // 3186 // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1) 3187 // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0 3188 // 3189 // Inputs: 3190 // 3191 // A0 in a.d[0] (subkey) 3192 // A1 in a.d[1] 3193 // (A1+A0) in a1_xor_a0.d[0] 3194 // 3195 // B0 in b.d[0] (state) 3196 // B1 in b.d[1] 3197 3198 __ ext(tmp1, __ T16B, b, b, 0x08); 3199 __ pmull2(result_hi, __ T1Q, b, a, __ T2D); // A1*B1 3200 __ eor(tmp1, __ T16B, tmp1, b); // (B1+B0) 3201 __ pmull(result_lo, __ T1Q, b, a, __ T1D); // A0*B0 3202 __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0) 3203 3204 __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08); 3205 __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0 3206 __ eor(tmp2, __ T16B, tmp2, tmp4); 3207 __ eor(tmp2, __ T16B, tmp2, tmp3); 3208 3209 // Register pair <result_hi:result_lo> holds the result of carry-less multiplication 3210 __ ins(result_hi, __ D, tmp2, 0, 1); 3211 __ ins(result_lo, __ D, tmp2, 1, 0); 3212 } 3213 3214 void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi, 3215 FloatRegister p, FloatRegister z, FloatRegister t1) { 3216 const FloatRegister t0 = result; 3217 3218 // The GCM field polynomial f is z^128 + p(z), where p = 3219 // z^7+z^2+z+1. 3220 // 3221 // z^128 === -p(z) (mod (z^128 + p(z))) 3222 // 3223 // so, given that the product we're reducing is 3224 // a == lo + hi * z^128 3225 // substituting, 3226 // === lo - hi * p(z) (mod (z^128 + p(z))) 3227 // 3228 // we reduce by multiplying hi by p(z) and subtracting the result 3229 // from (i.e. XORing it with) lo. Because p has no nonzero high 3230 // bits we can do this with two 64-bit multiplications, lo*p and 3231 // hi*p. 3232 3233 __ pmull2(t0, __ T1Q, hi, p, __ T2D); 3234 __ ext(t1, __ T16B, t0, z, 8); 3235 __ eor(hi, __ T16B, hi, t1); 3236 __ ext(t1, __ T16B, z, t0, 8); 3237 __ eor(lo, __ T16B, lo, t1); 3238 __ pmull(t0, __ T1Q, hi, p, __ T1D); 3239 __ eor(result, __ T16B, lo, t0); 3240 } 3241 3242 /** 3243 * Arguments: 3244 * 3245 * Input: 3246 * c_rarg0 - current state address 3247 * c_rarg1 - H key address 3248 * c_rarg2 - data address 3249 * c_rarg3 - number of blocks 3250 * 3251 * Output: 3252 * Updated state at c_rarg0 3253 */ 3254 address generate_ghash_processBlocks() { 3255 // Bafflingly, GCM uses little-endian for the byte order, but 3256 // big-endian for the bit order. 
For example, the polynomial 1 is 3257 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 3258 // 3259 // So, we must either reverse the bytes in each word and do 3260 // everything big-endian or reverse the bits in each byte and do 3261 // it little-endian. On AArch64 it's more idiomatic to reverse 3262 // the bits in each byte (we have an instruction, RBIT, to do 3263 // that) and keep the data in little-endian bit order throughout the 3264 // calculation, bit-reversing the inputs and outputs. 3265 3266 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks"); 3267 __ align(wordSize * 2); 3268 address p = __ pc(); 3269 __ emit_int64(0x87); // The low-order bits of the field 3270 // polynomial (i.e. p = z^7+z^2+z+1) 3271 // repeated in the low and high parts of a 3272 // 128-bit vector 3273 __ emit_int64(0x87); 3274 3275 __ align(CodeEntryAlignment); 3276 address start = __ pc(); 3277 3278 Register state = c_rarg0; 3279 Register subkeyH = c_rarg1; 3280 Register data = c_rarg2; 3281 Register blocks = c_rarg3; 3282 3283 FloatRegister vzr = v30; 3284 __ eor(vzr, __ T16B, vzr, vzr); // zero register 3285 3286 __ ldrq(v0, Address(state)); 3287 __ ldrq(v1, Address(subkeyH)); 3288 3289 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 3290 __ rbit(v0, __ T16B, v0); 3291 __ rev64(v1, __ T16B, v1); 3292 __ rbit(v1, __ T16B, v1); 3293 3294 __ ldrq(v26, p); 3295 3296 __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v16 3297 __ eor(v16, __ T16B, v16, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 3298 3299 { 3300 Label L_ghash_loop; 3301 __ bind(L_ghash_loop); 3302 3303 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 3304 // reversing each byte 3305 __ rbit(v2, __ T16B, v2); 3306 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 3307 3308 // Multiply state in v2 by subkey in v1 3309 ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 3310 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16, 3311 /*temps*/v6, v20, v18, v21); 3312 // Reduce v7:v5 by the field polynomial 3313 ghash_reduce(v0, v5, v7, v26, vzr, v20); 3314 3315 __ sub(blocks, blocks, 1); 3316 __ cbnz(blocks, L_ghash_loop); 3317 } 3318 3319 // The bit-reversed result is at this point in v0 3320 __ rev64(v1, __ T16B, v0); 3321 __ rbit(v1, __ T16B, v1); 3322 3323 __ st1(v1, __ T16B, state); 3324 __ ret(lr); 3325 3326 return start; 3327 } 3328 3329 // Continuation point for throwing of implicit exceptions that are 3330 // not handled in the current activation. Fabricates an exception 3331 // oop and initiates normal exception dispatching in this 3332 // frame. Since we need to preserve callee-saved values (currently 3333 // only for C2, but done for C1 as well) we need a callee-saved oop 3334 // map and therefore have to make these stubs into RuntimeStubs 3335 // rather than BufferBlobs. If the compiler needs all registers to 3336 // be preserved between the fault point and the exception handler 3337 // then it must assume responsibility for that in 3338 // AbstractCompiler::continuation_for_implicit_null_exception or 3339 // continuation_for_implicit_division_by_zero_exception. All other 3340 // implicit exceptions (e.g., NullPointerException or 3341 // AbstractMethodError on entry) are either at call sites or 3342 // otherwise assume that stack unwinding will be initiated, so 3343 // caller saved registers were assumed volatile in the compiler.
3344 3345 #undef __ 3346 #define __ masm-> 3347 3348 address generate_throw_exception(const char* name, 3349 address runtime_entry, 3350 Register arg1 = noreg, 3351 Register arg2 = noreg) { 3352 // Information about frame layout at time of blocking runtime call. 3353 // Note that we only have to preserve callee-saved registers since 3354 // the compilers are responsible for supplying a continuation point 3355 // if they expect all registers to be preserved. 3356 // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0 3357 enum layout { 3358 rfp_off = 0, 3359 rfp_off2, 3360 return_off, 3361 return_off2, 3362 framesize // inclusive of return address 3363 }; 3364 3365 int insts_size = 512; 3366 int locs_size = 64; 3367 3368 CodeBuffer code(name, insts_size, locs_size); 3369 OopMapSet* oop_maps = new OopMapSet(); 3370 MacroAssembler* masm = new MacroAssembler(&code); 3371 3372 address start = __ pc(); 3373 3374 // This is an inlined and slightly modified version of call_VM 3375 // which has the ability to fetch the return PC out of 3376 // thread-local storage and also sets up last_Java_sp slightly 3377 // differently than the real call_VM 3378 3379 __ enter(); // Save FP and LR before call 3380 3381 assert(is_even(framesize/2), "sp not 16-byte aligned"); 3382 3383 // lr and fp are already in place 3384 __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog 3385 3386 int frame_complete = __ pc() - start; 3387 3388 // Set up last_Java_sp and last_Java_fp 3389 address the_pc = __ pc(); 3390 __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1); 3391 3392 // Call runtime 3393 if (arg1 != noreg) { 3394 assert(arg2 != c_rarg1, "clobbered"); 3395 __ mov(c_rarg1, arg1); 3396 } 3397 if (arg2 != noreg) { 3398 __ mov(c_rarg2, arg2); 3399 } 3400 __ mov(c_rarg0, rthread); 3401 BLOCK_COMMENT("call runtime_entry"); 3402 __ mov(rscratch1, runtime_entry); 3403 __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1); 3404 3405 // Generate oop map 3406 OopMap* map = new OopMap(framesize, 0); 3407 3408 oop_maps->add_gc_map(the_pc - start, map); 3409 3410 __ reset_last_Java_frame(true, true); 3411 __ maybe_isb(); 3412 3413 __ leave(); 3414 3415 // check for pending exceptions 3416 #ifdef ASSERT 3417 Label L; 3418 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 3419 __ cbnz(rscratch1, L); 3420 __ should_not_reach_here(); 3421 __ bind(L); 3422 #endif // ASSERT 3423 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 3424 3425 3426 // codeBlob framesize is in words (not VMRegImpl::slot_size) 3427 RuntimeStub* stub = 3428 RuntimeStub::new_runtime_stub(name, 3429 &code, 3430 frame_complete, 3431 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 3432 oop_maps, false); 3433 return stub->entry_point(); 3434 } 3435 3436 class MontgomeryMultiplyGenerator : public MacroAssembler { 3437 3438 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 3439 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 3440 3441 RegSet _toSave; 3442 bool _squaring; 3443 3444 public: 3445 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 3446 : MacroAssembler(as->code()), _squaring(squaring) { 3447 3448 // Register allocation 3449 3450 Register reg = c_rarg0; 3451 Pa_base = reg; // Argument registers 3452 if (squaring) 3453 Pb_base = Pa_base; 3454 else 3455 Pb_base = ++reg; 3456 Pn_base = ++reg; 3457 Rlen= ++reg; 3458 inv = ++reg; 3459 Pm_base = ++reg; 3460 3461 // Working registers: 3462 Ra = ++reg; // The current digit of a, b, n, 
and m.
3463     Rb = ++reg;
3464     Rm = ++reg;
3465     Rn = ++reg;
3466
3467     Pa = ++reg;     // Pointers to the current/next digit of a, b, n, and m.
3468     Pb = ++reg;
3469     Pm = ++reg;
3470     Pn = ++reg;
3471
3472     t0 = ++reg;     // Three registers which form a
3473     t1 = ++reg;     // triple-precision accumulator.
3474     t2 = ++reg;
3475
3476     Ri = ++reg;     // Inner and outer loop indexes.
3477     Rj = ++reg;
3478
3479     Rhi_ab = ++reg; // Product registers: low and high parts
3480     Rlo_ab = ++reg; // of a*b and m*n.
3481     Rhi_mn = ++reg;
3482     Rlo_mn = ++reg;
3483
3484     // r19 and up are callee-saved.
3485     _toSave = RegSet::range(r19, reg) + Pm_base;
3486   }
3487
3488 private:
3489   void save_regs() {
3490     push(_toSave, sp);
3491   }
3492
3493   void restore_regs() {
3494     pop(_toSave, sp);
3495   }
3496
3497   template <typename T>
3498   void unroll_2(Register count, T block) {
3499     Label loop, end, odd;
3500     tbnz(count, 0, odd);
3501     cbz(count, end);
3502     align(16);
3503     bind(loop);
3504     (this->*block)();
3505     bind(odd);
3506     (this->*block)();
3507     subs(count, count, 2);
3508     br(Assembler::GT, loop);
3509     bind(end);
3510   }
3511
3512   template <typename T>
3513   void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
3514     Label loop, end, odd;
3515     tbnz(count, 0, odd);
3516     cbz(count, end);
3517     align(16);
3518     bind(loop);
3519     (this->*block)(d, s, tmp);
3520     bind(odd);
3521     (this->*block)(d, s, tmp);
3522     subs(count, count, 2);
3523     br(Assembler::GT, loop);
3524     bind(end);
3525   }
3526
3527   void pre1(RegisterOrConstant i) {
3528     block_comment("pre1");
3529     // Pa = Pa_base;
3530     // Pb = Pb_base + i;
3531     // Pm = Pm_base;
3532     // Pn = Pn_base + i;
3533     // Ra = *Pa;
3534     // Rb = *Pb;
3535     // Rm = *Pm;
3536     // Rn = *Pn;
3537     ldr(Ra, Address(Pa_base));
3538     ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
3539     ldr(Rm, Address(Pm_base));
3540     ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3541     lea(Pa, Address(Pa_base));
3542     lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
3543     lea(Pm, Address(Pm_base));
3544     lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3545
3546     // Zero the m*n result.
3547     mov(Rhi_mn, zr);
3548     mov(Rlo_mn, zr);
3549   }
3550
3551   // The core multiply-accumulate step of a Montgomery
3552   // multiplication.  The idea is to schedule operations as a
3553   // pipeline so that instructions with long latencies (loads and
3554   // multiplies) have time to complete before their results are
3555   // used.  This mostly benefits in-order implementations of the
3556   // architecture, but out-of-order ones also benefit.
3557   void step() {
3558     block_comment("step");
3559     // MACC(Ra, Rb, t0, t1, t2);
3560     // Ra = *++Pa;
3561     // Rb = *--Pb;
3562     umulh(Rhi_ab, Ra, Rb);
3563     mul(Rlo_ab, Ra, Rb);
3564     ldr(Ra, pre(Pa, wordSize));
3565     ldr(Rb, pre(Pb, -wordSize));
3566     acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
3567                                      // previous iteration.
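    // n.b. the acc() above folds in the m*n product started by the
    // previous step() (or zeroed by pre1/pre2), while the umulh/mul
    // results for the current a*b are still in flight; each half of
    // the loop body thereby hides the multiply and load latencies of
    // the other.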
3568 // MACC(Rm, Rn, t0, t1, t2); 3569 // Rm = *++Pm; 3570 // Rn = *--Pn; 3571 umulh(Rhi_mn, Rm, Rn); 3572 mul(Rlo_mn, Rm, Rn); 3573 ldr(Rm, pre(Pm, wordSize)); 3574 ldr(Rn, pre(Pn, -wordSize)); 3575 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 3576 } 3577 3578 void post1() { 3579 block_comment("post1"); 3580 3581 // MACC(Ra, Rb, t0, t1, t2); 3582 // Ra = *++Pa; 3583 // Rb = *--Pb; 3584 umulh(Rhi_ab, Ra, Rb); 3585 mul(Rlo_ab, Ra, Rb); 3586 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 3587 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 3588 3589 // *Pm = Rm = t0 * inv; 3590 mul(Rm, t0, inv); 3591 str(Rm, Address(Pm)); 3592 3593 // MACC(Rm, Rn, t0, t1, t2); 3594 // t0 = t1; t1 = t2; t2 = 0; 3595 umulh(Rhi_mn, Rm, Rn); 3596 3597 #ifndef PRODUCT 3598 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 3599 { 3600 mul(Rlo_mn, Rm, Rn); 3601 add(Rlo_mn, t0, Rlo_mn); 3602 Label ok; 3603 cbz(Rlo_mn, ok); { 3604 stop("broken Montgomery multiply"); 3605 } bind(ok); 3606 } 3607 #endif 3608 // We have very carefully set things up so that 3609 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 3610 // the lower half of Rm * Rn because we know the result already: 3611 // it must be -t0. t0 + (-t0) must generate a carry iff 3612 // t0 != 0. So, rather than do a mul and an adds we just set 3613 // the carry flag iff t0 is nonzero. 3614 // 3615 // mul(Rlo_mn, Rm, Rn); 3616 // adds(zr, t0, Rlo_mn); 3617 subs(zr, t0, 1); // Set carry iff t0 is nonzero 3618 adcs(t0, t1, Rhi_mn); 3619 adc(t1, t2, zr); 3620 mov(t2, zr); 3621 } 3622 3623 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 3624 block_comment("pre2"); 3625 // Pa = Pa_base + i-len; 3626 // Pb = Pb_base + len; 3627 // Pm = Pm_base + i-len; 3628 // Pn = Pn_base + len; 3629 3630 if (i.is_register()) { 3631 sub(Rj, i.as_register(), len); 3632 } else { 3633 mov(Rj, i.as_constant()); 3634 sub(Rj, Rj, len); 3635 } 3636 // Rj == i-len 3637 3638 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 3639 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 3640 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 3641 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 3642 3643 // Ra = *++Pa; 3644 // Rb = *--Pb; 3645 // Rm = *++Pm; 3646 // Rn = *--Pn; 3647 ldr(Ra, pre(Pa, wordSize)); 3648 ldr(Rb, pre(Pb, -wordSize)); 3649 ldr(Rm, pre(Pm, wordSize)); 3650 ldr(Rn, pre(Pn, -wordSize)); 3651 3652 mov(Rhi_mn, zr); 3653 mov(Rlo_mn, zr); 3654 } 3655 3656 void post2(RegisterOrConstant i, RegisterOrConstant len) { 3657 block_comment("post2"); 3658 if (i.is_constant()) { 3659 mov(Rj, i.as_constant()-len.as_constant()); 3660 } else { 3661 sub(Rj, i.as_register(), len); 3662 } 3663 3664 adds(t0, t0, Rlo_mn); // The pending m*n, low part 3665 3666 // As soon as we know the least significant digit of our result, 3667 // store it. 3668 // Pm_base[i-len] = t0; 3669 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 3670 3671 // t0 = t1; t1 = t2; t2 = 0; 3672 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 3673 adc(t1, t2, zr); 3674 mov(t2, zr); 3675 } 3676 3677 // A carry in t0 after Montgomery multiplication means that we 3678 // should subtract multiples of n from our result in m. We'll 3679 // keep doing that until there is no carry. 
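  // In C, one pass of the loop below is approximately the following
  // sketch; the helper name sub() is taken from the pseudocode
  // comments and is not defined anywhere in this file:
  //
  //   unsigned long sub(unsigned long Pm_base[], unsigned long Pn_base[],
  //                     unsigned long t0, int len) {
  //     unsigned long borrow = 0;
  //     for (int i = 0; i < len; i++) {
  //       unsigned long rm = Pm_base[i], rn = Pn_base[i];
  //       unsigned long diff = rm - rn;
  //       unsigned long out = (rm < rn) | (diff < borrow); // borrow out
  //       Pm_base[i] = diff - borrow;                      // the sbcs
  //       borrow = out;
  //     }
  //     return t0 - borrow;  // the final sbc: borrow out of the top digit
  //   }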
3680 void normalize(RegisterOrConstant len) { 3681 block_comment("normalize"); 3682 // while (t0) 3683 // t0 = sub(Pm_base, Pn_base, t0, len); 3684 Label loop, post, again; 3685 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 3686 cbz(t0, post); { 3687 bind(again); { 3688 mov(i, zr); 3689 mov(cnt, len); 3690 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 3691 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 3692 subs(zr, zr, zr); // set carry flag, i.e. no borrow 3693 align(16); 3694 bind(loop); { 3695 sbcs(Rm, Rm, Rn); 3696 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 3697 add(i, i, 1); 3698 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 3699 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 3700 sub(cnt, cnt, 1); 3701 } cbnz(cnt, loop); 3702 sbc(t0, t0, zr); 3703 } cbnz(t0, again); 3704 } bind(post); 3705 } 3706 3707 // Move memory at s to d, reversing words. 3708 // Increments d to end of copied memory 3709 // Destroys tmp1, tmp2 3710 // Preserves len 3711 // Leaves s pointing to the address which was in d at start 3712 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 3713 assert(tmp1 < r19 && tmp2 < r19, "register corruption"); 3714 3715 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 3716 mov(tmp1, len); 3717 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 3718 sub(s, d, len, ext::uxtw, LogBytesPerWord); 3719 } 3720 // where 3721 void reverse1(Register d, Register s, Register tmp) { 3722 ldr(tmp, pre(s, -wordSize)); 3723 ror(tmp, tmp, 32); 3724 str(tmp, post(d, wordSize)); 3725 } 3726 3727 void step_squaring() { 3728 // An extra ACC 3729 step(); 3730 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 3731 } 3732 3733 void last_squaring(RegisterOrConstant i) { 3734 Label dont; 3735 // if ((i & 1) == 0) { 3736 tbnz(i.as_register(), 0, dont); { 3737 // MACC(Ra, Rb, t0, t1, t2); 3738 // Ra = *++Pa; 3739 // Rb = *--Pb; 3740 umulh(Rhi_ab, Ra, Rb); 3741 mul(Rlo_ab, Ra, Rb); 3742 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 3743 } bind(dont); 3744 } 3745 3746 void extra_step_squaring() { 3747 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 3748 3749 // MACC(Rm, Rn, t0, t1, t2); 3750 // Rm = *++Pm; 3751 // Rn = *--Pn; 3752 umulh(Rhi_mn, Rm, Rn); 3753 mul(Rlo_mn, Rm, Rn); 3754 ldr(Rm, pre(Pm, wordSize)); 3755 ldr(Rn, pre(Pn, -wordSize)); 3756 } 3757 3758 void post1_squaring() { 3759 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 3760 3761 // *Pm = Rm = t0 * inv; 3762 mul(Rm, t0, inv); 3763 str(Rm, Address(Pm)); 3764 3765 // MACC(Rm, Rn, t0, t1, t2); 3766 // t0 = t1; t1 = t2; t2 = 0; 3767 umulh(Rhi_mn, Rm, Rn); 3768 3769 #ifndef PRODUCT 3770 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 3771 { 3772 mul(Rlo_mn, Rm, Rn); 3773 add(Rlo_mn, t0, Rlo_mn); 3774 Label ok; 3775 cbz(Rlo_mn, ok); { 3776 stop("broken Montgomery multiply"); 3777 } bind(ok); 3778 } 3779 #endif 3780 // We have very carefully set things up so that 3781 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 3782 // the lower half of Rm * Rn because we know the result already: 3783 // it must be -t0. t0 + (-t0) must generate a carry iff 3784 // t0 != 0. So, rather than do a mul and an adds we just set 3785 // the carry flag iff t0 is nonzero. 
3786 // 3787 // mul(Rlo_mn, Rm, Rn); 3788 // adds(zr, t0, Rlo_mn); 3789 subs(zr, t0, 1); // Set carry iff t0 is nonzero 3790 adcs(t0, t1, Rhi_mn); 3791 adc(t1, t2, zr); 3792 mov(t2, zr); 3793 } 3794 3795 void acc(Register Rhi, Register Rlo, 3796 Register t0, Register t1, Register t2) { 3797 adds(t0, t0, Rlo); 3798 adcs(t1, t1, Rhi); 3799 adc(t2, t2, zr); 3800 } 3801 3802 public: 3803 /** 3804 * Fast Montgomery multiplication. The derivation of the 3805 * algorithm is in A Cryptographic Library for the Motorola 3806 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 3807 * 3808 * Arguments: 3809 * 3810 * Inputs for multiplication: 3811 * c_rarg0 - int array elements a 3812 * c_rarg1 - int array elements b 3813 * c_rarg2 - int array elements n (the modulus) 3814 * c_rarg3 - int length 3815 * c_rarg4 - int inv 3816 * c_rarg5 - int array elements m (the result) 3817 * 3818 * Inputs for squaring: 3819 * c_rarg0 - int array elements a 3820 * c_rarg1 - int array elements n (the modulus) 3821 * c_rarg2 - int length 3822 * c_rarg3 - int inv 3823 * c_rarg4 - int array elements m (the result) 3824 * 3825 */ 3826 address generate_multiply() { 3827 Label argh, nothing; 3828 bind(argh); 3829 stop("MontgomeryMultiply total_allocation must be <= 8192"); 3830 3831 align(CodeEntryAlignment); 3832 address entry = pc(); 3833 3834 cbzw(Rlen, nothing); 3835 3836 enter(); 3837 3838 // Make room. 3839 cmpw(Rlen, 512); 3840 br(Assembler::HI, argh); 3841 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 3842 andr(sp, Ra, -2 * wordSize); 3843 3844 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 3845 3846 { 3847 // Copy input args, reversing as we go. We use Ra as a 3848 // temporary variable. 3849 reverse(Ra, Pa_base, Rlen, t0, t1); 3850 if (!_squaring) 3851 reverse(Ra, Pb_base, Rlen, t0, t1); 3852 reverse(Ra, Pn_base, Rlen, t0, t1); 3853 } 3854 3855 // Push all call-saved registers and also Pm_base which we'll need 3856 // at the end. 
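    // (Pm_base lives in an argument register and would otherwise be
    // lost; it is included in _toSave so that restore_regs() at the
    // end of this routine can recover the caller's Pm_base after we
    // repoint it at our scratch copy.)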
3857 save_regs(); 3858 3859 #ifndef PRODUCT 3860 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 3861 { 3862 ldr(Rn, Address(Pn_base, 0)); 3863 mul(Rlo_mn, Rn, inv); 3864 cmp(Rlo_mn, -1); 3865 Label ok; 3866 br(EQ, ok); { 3867 stop("broken inverse in Montgomery multiply"); 3868 } bind(ok); 3869 } 3870 #endif 3871 3872 mov(Pm_base, Ra); 3873 3874 mov(t0, zr); 3875 mov(t1, zr); 3876 mov(t2, zr); 3877 3878 block_comment("for (int i = 0; i < len; i++) {"); 3879 mov(Ri, zr); { 3880 Label loop, end; 3881 cmpw(Ri, Rlen); 3882 br(Assembler::GE, end); 3883 3884 bind(loop); 3885 pre1(Ri); 3886 3887 block_comment(" for (j = i; j; j--) {"); { 3888 movw(Rj, Ri); 3889 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 3890 } block_comment(" } // j"); 3891 3892 post1(); 3893 addw(Ri, Ri, 1); 3894 cmpw(Ri, Rlen); 3895 br(Assembler::LT, loop); 3896 bind(end); 3897 block_comment("} // i"); 3898 } 3899 3900 block_comment("for (int i = len; i < 2*len; i++) {"); 3901 mov(Ri, Rlen); { 3902 Label loop, end; 3903 cmpw(Ri, Rlen, Assembler::LSL, 1); 3904 br(Assembler::GE, end); 3905 3906 bind(loop); 3907 pre2(Ri, Rlen); 3908 3909 block_comment(" for (j = len*2-i-1; j; j--) {"); { 3910 lslw(Rj, Rlen, 1); 3911 subw(Rj, Rj, Ri); 3912 subw(Rj, Rj, 1); 3913 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 3914 } block_comment(" } // j"); 3915 3916 post2(Ri, Rlen); 3917 addw(Ri, Ri, 1); 3918 cmpw(Ri, Rlen, Assembler::LSL, 1); 3919 br(Assembler::LT, loop); 3920 bind(end); 3921 } 3922 block_comment("} // i"); 3923 3924 normalize(Rlen); 3925 3926 mov(Ra, Pm_base); // Save Pm_base in Ra 3927 restore_regs(); // Restore caller's Pm_base 3928 3929 // Copy our result into caller's Pm_base 3930 reverse(Pm_base, Ra, Rlen, t0, t1); 3931 3932 leave(); 3933 bind(nothing); 3934 ret(lr); 3935 3936 return entry; 3937 } 3938 // In C, approximately: 3939 3940 // void 3941 // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[], 3942 // unsigned long Pn_base[], unsigned long Pm_base[], 3943 // unsigned long inv, int len) { 3944 // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 3945 // unsigned long *Pa, *Pb, *Pn, *Pm; 3946 // unsigned long Ra, Rb, Rn, Rm; 3947 3948 // int i; 3949 3950 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 3951 3952 // for (i = 0; i < len; i++) { 3953 // int j; 3954 3955 // Pa = Pa_base; 3956 // Pb = Pb_base + i; 3957 // Pm = Pm_base; 3958 // Pn = Pn_base + i; 3959 3960 // Ra = *Pa; 3961 // Rb = *Pb; 3962 // Rm = *Pm; 3963 // Rn = *Pn; 3964 3965 // int iters = i; 3966 // for (j = 0; iters--; j++) { 3967 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 3968 // MACC(Ra, Rb, t0, t1, t2); 3969 // Ra = *++Pa; 3970 // Rb = *--Pb; 3971 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 3972 // MACC(Rm, Rn, t0, t1, t2); 3973 // Rm = *++Pm; 3974 // Rn = *--Pn; 3975 // } 3976 3977 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 3978 // MACC(Ra, Rb, t0, t1, t2); 3979 // *Pm = Rm = t0 * inv; 3980 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 3981 // MACC(Rm, Rn, t0, t1, t2); 3982 3983 // assert(t0 == 0, "broken Montgomery multiply"); 3984 3985 // t0 = t1; t1 = t2; t2 = 0; 3986 // } 3987 3988 // for (i = len; i < 2*len; i++) { 3989 // int j; 3990 3991 // Pa = Pa_base + i-len; 3992 // Pb = Pb_base + len; 3993 // Pm = Pm_base + i-len; 3994 // Pn = Pn_base + len; 3995 3996 // Ra = *++Pa; 3997 // Rb = *--Pb; 3998 // Rm = *++Pm; 3999 // Rn = *--Pn; 4000 4001 // int iters = len*2-i-1; 4002 // 
for (j = i-len+1; iters--; j++) { 4003 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 4004 // MACC(Ra, Rb, t0, t1, t2); 4005 // Ra = *++Pa; 4006 // Rb = *--Pb; 4007 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 4008 // MACC(Rm, Rn, t0, t1, t2); 4009 // Rm = *++Pm; 4010 // Rn = *--Pn; 4011 // } 4012 4013 // Pm_base[i-len] = t0; 4014 // t0 = t1; t1 = t2; t2 = 0; 4015 // } 4016 4017 // while (t0) 4018 // t0 = sub(Pm_base, Pn_base, t0, len); 4019 // } 4020 4021 /** 4022 * Fast Montgomery squaring. This uses asymptotically 25% fewer 4023 * multiplies than Montgomery multiplication so it should be up to 4024 * 25% faster. However, its loop control is more complex and it 4025 * may actually run slower on some machines. 4026 * 4027 * Arguments: 4028 * 4029 * Inputs: 4030 * c_rarg0 - int array elements a 4031 * c_rarg1 - int array elements n (the modulus) 4032 * c_rarg2 - int length 4033 * c_rarg3 - int inv 4034 * c_rarg4 - int array elements m (the result) 4035 * 4036 */ 4037 address generate_square() { 4038 Label argh; 4039 bind(argh); 4040 stop("MontgomeryMultiply total_allocation must be <= 8192"); 4041 4042 align(CodeEntryAlignment); 4043 address entry = pc(); 4044 4045 enter(); 4046 4047 // Make room. 4048 cmpw(Rlen, 512); 4049 br(Assembler::HI, argh); 4050 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 4051 andr(sp, Ra, -2 * wordSize); 4052 4053 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 4054 4055 { 4056 // Copy input args, reversing as we go. We use Ra as a 4057 // temporary variable. 4058 reverse(Ra, Pa_base, Rlen, t0, t1); 4059 reverse(Ra, Pn_base, Rlen, t0, t1); 4060 } 4061 4062 // Push all call-saved registers and also Pm_base which we'll need 4063 // at the end. 4064 save_regs(); 4065 4066 mov(Pm_base, Ra); 4067 4068 mov(t0, zr); 4069 mov(t1, zr); 4070 mov(t2, zr); 4071 4072 block_comment("for (int i = 0; i < len; i++) {"); 4073 mov(Ri, zr); { 4074 Label loop, end; 4075 bind(loop); 4076 cmp(Ri, Rlen); 4077 br(Assembler::GE, end); 4078 4079 pre1(Ri); 4080 4081 block_comment("for (j = (i+1)/2; j; j--) {"); { 4082 add(Rj, Ri, 1); 4083 lsr(Rj, Rj, 1); 4084 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 4085 } block_comment(" } // j"); 4086 4087 last_squaring(Ri); 4088 4089 block_comment(" for (j = i/2; j; j--) {"); { 4090 lsr(Rj, Ri, 1); 4091 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 4092 } block_comment(" } // j"); 4093 4094 post1_squaring(); 4095 add(Ri, Ri, 1); 4096 cmp(Ri, Rlen); 4097 br(Assembler::LT, loop); 4098 4099 bind(end); 4100 block_comment("} // i"); 4101 } 4102 4103 block_comment("for (int i = len; i < 2*len; i++) {"); 4104 mov(Ri, Rlen); { 4105 Label loop, end; 4106 bind(loop); 4107 cmp(Ri, Rlen, Assembler::LSL, 1); 4108 br(Assembler::GE, end); 4109 4110 pre2(Ri, Rlen); 4111 4112 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 4113 lsl(Rj, Rlen, 1); 4114 sub(Rj, Rj, Ri); 4115 sub(Rj, Rj, 1); 4116 lsr(Rj, Rj, 1); 4117 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 4118 } block_comment(" } // j"); 4119 4120 last_squaring(Ri); 4121 4122 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 4123 lsl(Rj, Rlen, 1); 4124 sub(Rj, Rj, Ri); 4125 lsr(Rj, Rj, 1); 4126 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 4127 } block_comment(" } // j"); 4128 4129 post2(Ri, Rlen); 4130 add(Ri, Ri, 1); 4131 cmp(Ri, Rlen, Assembler::LSL, 1); 4132 4133 br(Assembler::LT, loop); 4134 bind(end); 4135 block_comment("} // i"); 4136 } 4137 4138 normalize(Rlen); 4139 4140 mov(Ra, 
Pm_base);            // Save Pm_base in Ra
4141     restore_regs();      // Restore caller's Pm_base
4142
4143     // Copy our result into caller's Pm_base
4144     reverse(Pm_base, Ra, Rlen, t0, t1);
4145
4146     leave();
4147     ret(lr);
4148
4149     return entry;
4150   }
4151   // In C, approximately:
4152
4153   // void
4154   // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
4155   //                   unsigned long Pm_base[], unsigned long inv, int len) {
4156   //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4157   //   unsigned long *Pa, *Pb, *Pn, *Pm;
4158   //   unsigned long Ra, Rb, Rn, Rm;
4159
4160   //   int i;
4161
4162   //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
4163
4164   //   for (i = 0; i < len; i++) {
4165   //     int j;
4166
4167   //     Pa = Pa_base;
4168   //     Pb = Pa_base + i;
4169   //     Pm = Pm_base;
4170   //     Pn = Pn_base + i;
4171
4172   //     Ra = *Pa;
4173   //     Rb = *Pb;
4174   //     Rm = *Pm;
4175   //     Rn = *Pn;
4176
4177   //     int iters = (i+1)/2;
4178   //     for (j = 0; iters--; j++) {
4179   //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4180   //       MACC2(Ra, Rb, t0, t1, t2);
4181   //       Ra = *++Pa;
4182   //       Rb = *--Pb;
4183   //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4184   //       MACC(Rm, Rn, t0, t1, t2);
4185   //       Rm = *++Pm;
4186   //       Rn = *--Pn;
4187   //     }
4188   //     if ((i & 1) == 0) {
4189   //       assert(Ra == Pa_base[j], "must be");
4190   //       MACC(Ra, Ra, t0, t1, t2);
4191   //     }
4192   //     iters = i/2;
4193   //     assert(iters == i-j, "must be");
4194   //     for (; iters--; j++) {
4195   //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4196   //       MACC(Rm, Rn, t0, t1, t2);
4197   //       Rm = *++Pm;
4198   //       Rn = *--Pn;
4199   //     }
4200
4201   //     *Pm = Rm = t0 * inv;
4202   //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
4203   //     MACC(Rm, Rn, t0, t1, t2);
4204
4205   //     assert(t0 == 0, "broken Montgomery multiply");
4206
4207   //     t0 = t1; t1 = t2; t2 = 0;
4208   //   }
4209
4210   //   for (i = len; i < 2*len; i++) {
4211   //     int start = i-len+1;
4212   //     int end = start + (len - start)/2;
4213   //     int j;
4214
4215   //     Pa = Pa_base + i-len;
4216   //     Pb = Pa_base + len;
4217   //     Pm = Pm_base + i-len;
4218   //     Pn = Pn_base + len;
4219
4220   //     Ra = *++Pa;
4221   //     Rb = *--Pb;
4222   //     Rm = *++Pm;
4223   //     Rn = *--Pn;
4224
4225   //     int iters = (2*len-i-1)/2;
4226   //     assert(iters == end-start, "must be");
4227   //     for (j = start; iters--; j++) {
4228   //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4229   //       MACC2(Ra, Rb, t0, t1, t2);
4230   //       Ra = *++Pa;
4231   //       Rb = *--Pb;
4232   //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4233   //       MACC(Rm, Rn, t0, t1, t2);
4234   //       Rm = *++Pm;
4235   //       Rn = *--Pn;
4236   //     }
4237   //     if ((i & 1) == 0) {
4238   //       assert(Ra == Pa_base[j], "must be");
4239   //       MACC(Ra, Ra, t0, t1, t2);
4240   //     }
4241   //     iters = (2*len-i)/2;
4242   //     assert(iters == len-j, "must be");
4243   //     for (; iters--; j++) {
4244   //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4245   //       MACC(Rm, Rn, t0, t1, t2);
4246   //       Rm = *++Pm;
4247   //       Rn = *--Pn;
4248   //     }
4249   //     Pm_base[i-len] = t0;
4250   //     t0 = t1; t1 = t2; t2 = 0;
4251   //   }
4252
4253   //   while (t0)
4254   //     t0 = sub(Pm_base, Pn_base, t0, len);
4255   // }
4256 };
4257
4258 // Initialization
4259 void generate_initial() {
4260   // Generate initial stubs and initialize the entry points
4261
4262   // Entry points that exist on all platforms.  Note: this is code
4263   // that could be shared among different platforms; however, the
4264   // benefit seems to be smaller than the disadvantage of having a
4265   // much more complicated generator structure.  See also the
4266   // comment in stubRoutines.hpp.
4267
4268   StubRoutines::_forward_exception_entry = generate_forward_exception();
4269
4270   StubRoutines::_call_stub_entry =
4271     generate_call_stub(StubRoutines::_call_stub_return_address);
4272
4273   // is referenced by megamorphic calls
4274   StubRoutines::_catch_exception_entry = generate_catch_exception();
4275
4276   // Build this early so it's available for the interpreter.
4277   StubRoutines::_throw_StackOverflowError_entry =
4278     generate_throw_exception("StackOverflowError throw_exception",
4279                              CAST_FROM_FN_PTR(address,
4280                                               SharedRuntime::
4281                                               throw_StackOverflowError));
4282   if (UseCRC32Intrinsics) {
4283     // Set the table address before generating the stubs that use it.
4284     StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
4285     StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
4286   }
4287 }
4288
4289 void generate_all() {
4290   // support for verify_oop (must happen after universe_init)
4291   StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
4292   StubRoutines::_throw_AbstractMethodError_entry =
4293     generate_throw_exception("AbstractMethodError throw_exception",
4294                              CAST_FROM_FN_PTR(address,
4295                                               SharedRuntime::
4296                                               throw_AbstractMethodError));
4297
4298   StubRoutines::_throw_IncompatibleClassChangeError_entry =
4299     generate_throw_exception("IncompatibleClassChangeError throw_exception",
4300                              CAST_FROM_FN_PTR(address,
4301                                               SharedRuntime::
4302                                               throw_IncompatibleClassChangeError));
4303
4304   StubRoutines::_throw_NullPointerException_at_call_entry =
4305     generate_throw_exception("NullPointerException at call throw_exception",
4306                              CAST_FROM_FN_PTR(address,
4307                                               SharedRuntime::
4308                                               throw_NullPointerException_at_call));
4309
4310   // arraycopy stubs used by compilers
4311   generate_arraycopy_stubs();
4312
4313   if (UseMultiplyToLenIntrinsic) {
4314     StubRoutines::_multiplyToLen = generate_multiplyToLen();
4315   }
4316
4317   if (UseMontgomeryMultiplyIntrinsic) {
4318     StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
4319     MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
4320     StubRoutines::_montgomeryMultiply = g.generate_multiply();
4321   }
4322
4323   if (UseMontgomerySquareIntrinsic) {
4324     StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
4325     MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
4326     // We use generate_multiply() rather than generate_square()
4327     // because it's faster for the sizes of modulus we care about.
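    // (This is safe: with squaring == true the generator aliases
    // Pb_base to Pa_base and uses the montgomery_square argument
    // registers, so the multiply code computes a*a.)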
4328 StubRoutines::_montgomerySquare = g.generate_multiply(); 4329 } 4330 4331 #ifndef BUILTIN_SIM 4332 // generate GHASH intrinsics code 4333 if (UseGHASHIntrinsics) { 4334 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); 4335 } 4336 4337 if (UseAESIntrinsics) { 4338 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 4339 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 4340 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 4341 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 4342 } 4343 4344 if (UseSHA1Intrinsics) { 4345 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress"); 4346 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB"); 4347 } 4348 if (UseSHA256Intrinsics) { 4349 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress"); 4350 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB"); 4351 } 4352 4353 if (UseCRC32CIntrinsics) { 4354 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(); 4355 } 4356 4357 // generate Adler32 intrinsics code 4358 if (UseAdler32Intrinsics) { 4359 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); 4360 } 4361 4362 // Safefetch stubs. 4363 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry, 4364 &StubRoutines::_safefetch32_fault_pc, 4365 &StubRoutines::_safefetch32_continuation_pc); 4366 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry, 4367 &StubRoutines::_safefetchN_fault_pc, 4368 &StubRoutines::_safefetchN_continuation_pc); 4369 #endif 4370 } 4371 4372 public: 4373 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) { 4374 if (all) { 4375 generate_all(); 4376 } else { 4377 generate_initial(); 4378 } 4379 } 4380 }; // end class declaration 4381 4382 void StubGenerator_generate(CodeBuffer* code, bool all) { 4383 StubGenerator g(code, all); 4384 }
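
// For reference, a plausible C++ rendering of the MACC and MACC2
// primitives used in the pseudocode comments above (a sketch only,
// assuming a compiler that supports unsigned __int128; these helpers
// are not defined anywhere in this file):
//
//   // t2:t1:t0 += a * b
//   static inline void MACC(unsigned long a, unsigned long b,
//                           unsigned long &t0, unsigned long &t1,
//                           unsigned long &t2) {
//     unsigned __int128 prod = (unsigned __int128)a * b;
//     unsigned long lo = (unsigned long)prod;
//     unsigned long hi = (unsigned long)(prod >> 64);
//     unsigned long carry;
//     t0 += lo;    carry  = (t0 < lo);    // carry out of the low word
//     t1 += carry; carry  = (t1 < carry);
//     t1 += hi;    carry += (t1 < hi);    // carry out of the middle word
//     t2 += carry;
//   }
//
//   // t2:t1:t0 += 2 * a * b -- the doubled cross product accumulated
//   // when squaring (cf. step_squaring() and last_squaring() above)
//   static inline void MACC2(unsigned long a, unsigned long b,
//                            unsigned long &t0, unsigned long &t1,
//                            unsigned long &t2) {
//     MACC(a, b, t0, t1, t2);
//     MACC(a, b, t0, t1, t2);
//   }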