/*
 * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "utilities/top.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif

#ifdef BUILTIN_SIM
#include "../../../../../../simulator/simulator.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->
#define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
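// TIMES_OOP scales a 32-bit index register (sign-extended) by the size
// of an in-heap oop: 4 bytes with compressed oops, 8 bytes otherwise.
// It is used below wherever an oop array element is addressed by index,
// e.g. array_overlap_test(L, TIMES_OOP).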

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(int& counter) {
    __ lea(rscratch2, ExternalAddress((address)&counter));
    __ ldrw(rscratch1, Address(rscratch2));
    __ addw(rscratch1, rscratch1, 1);
    __ strw(rscratch1, Address(rscratch2));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    c_rarg6:   parameter size (in words)              int
  //    c_rarg7:   thread                                 Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the c arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-r18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -27 [ argument word 1      ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper    (r0) ]
  //  -7 [ result          (r1) ]
  //  -6 [ result type     (r2) ]
  //  -5 [ method          (r3) ]
  //  -4 [ entry point     (r4) ]
  //  -3 [ parameters      (r5) ]
  //  -2 [ parameter size  (r6) ]
  //  -1 [ thread          (r7) ]
  //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  //   1 [ saved lr       (r30) ]
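  // For reference: the matching C-side type is StubRoutines' CallStub
  // (see stubRoutines.hpp), roughly
  //
  //   typedef void (*CallStub)(address   link,          // call wrapper
  //                            intptr_t* result,
  //                            BasicType result_type,
  //                            Method*   method,
  //                            address   entry_point,
  //                            intptr_t* parameters,
  //                            int       size_of_parameters,
  //                            TRAPS);
  //
  // i.e. the eight C arguments land in c_rarg0..c_rarg7 as listed above.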

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off = -26,

    d15_off            = -26,
    d13_off            = -24,
    d11_off            = -22,
    d9_off             = -20,

    r28_off            = -18,
    r26_off            = -16,
    r24_off            = -14,
    r22_off            = -12,
    r20_off            = -10,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0,
    retaddr_off        =   1,
  };

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    const Address sp_after_call(rfp, sp_after_call_off * wordSize);

    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off  * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);

    // stub code

    // we need a C prolog to bootstrap the x86 caller into the sim
    __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);

    address aarch64_entry = __ pc();

#ifdef BUILTIN_SIM
    // Save sender's SP for stack traces.
    __ mov(rscratch1, sp);
    __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
#endif
    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5, entry_point);
    __ stp(c_rarg2, c_rarg3, result_type);
    __ stp(c_rarg0, c_rarg1, call_wrapper);

    __ stp(r20, r19,  r20_save);
    __ stp(r22, r21,  r22_save);
    __ stp(r24, r23,  r24_save);
    __ stp(r26, r25,  r26_save);
    __ stp(r28, r27,  r28_save);

    __ stpd(v9,  v8,  d9_save);
    __ stpd(v11, v10, d11_save);
    __ stpd(v13, v12, d13_save);
    __ stpd(v15, v14, d15_save);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (unsigned)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing Method* and current sp
    //      rmethod: Method*
    //      r13: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r13, sp);
    __ blr(c_rarg4);

    // tell the simulator we have returned to the stub

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    if (NotifySimulator) {
      __ notify(Assembler::method_reentry);
    }
    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
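    // For example, a Java method declared to return boolean, byte, char,
    // short or int comes back in (the low 32 bits of) r0 and is stored
    // with strw; T_LONG and T_OBJECT results use all 64 bits of r0;
    // T_FLOAT/T_DOUBLE results come back in j_farg0 (v0).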
    __ ldr(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ldr(j_rarg1, result_type);
    __ cmp(j_rarg1, T_OBJECT);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(j_rarg1, T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    // restore callee-save registers
    __ ldpd(v15, v14, d15_save);
    __ ldpd(v13, v12, d13_save);
    __ ldpd(v11, v10, d11_save);
    __ ldpd(v9,  v8,  d9_save);

    __ ldp(r28, r27, r28_save);
    __ ldp(r26, r25, r26_save);
    __ ldp(r24, r23, r24_save);
    __ ldp(r22, r21, r22_save);
    __ ldp(r20, r19, r20_save);

    __ ldp(c_rarg0, c_rarg1, call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3,  method);
    __ ldp(c_rarg4, c_rarg5, entry_point);
    __ ldp(c_rarg6, c_rarg7, parameter_size);

#ifndef PRODUCT
    // tell the simulator we are about to end Java execution
    if (NotifySimulator) {
      __ notify(Assembler::method_exit);
    }
#endif
    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT

    __ BIND(is_long);
    __ str(r0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to set up the
  // rsp.
  //
  // r0: exception oop

  // NOTE: this is used as a target from the signal handler so it
  // needs an x86 prolog which returns into the current simulator
  // executing the generated catch_exception code. so the prolog
  // needs to install rax in a sim register and adjust the sim's
  // restart pc to enter the generated code at the start position
  // then return from native to simulated execution.
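  // Illustrative flow, assuming a call from C via the call stub above:
  // if the Java code throws, the unwinding machinery arrives here with
  // the exception oop in r0; the stub records it as
  // Thread::_pending_exception (plus file/line), then branches back to
  // _call_stub_return_address so the call stub's epilogue runs normally
  // and the C caller finds the pending exception on the thread.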

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread       (rfp, thread_off        * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);
    __ mov(r19, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {

    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is NULL it is OK

    // Check if the oop is in the right area of memory
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
    __ andr(c_rarg2, r0, c_rarg3);
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());

    // Compare c_rarg2 and c_rarg3.  We don't use a compare
    // instruction here because the flags register is live.
    __ eor(c_rarg2, c_rarg2, c_rarg3);
    __ cbnz(c_rarg2, error);
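    // In C terms the check above is roughly:
    //
    //   if (((uintptr_t)obj & Universe::verify_oop_mask()) !=
    //       Universe::verify_oop_bits())  goto error;
    //
    // done with eor/cbnz rather than cmp/br so the condition flags,
    // which are live across this stub, are left untouched.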

    // make sure klass is 'reasonable', i.e. non-zero.
    __ load_klass(r0, r0); // get klass
    __ cbz(r0, error);     // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blrt(rscratch1, 3, 0, 1);

    return start;
  }

  void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }

  // Generate code for an array write pre barrier
  //
  //     addr  - starting address
  //     count - element count
  //     tmp   - scratch register
  //
  //     Destroys no registers except rscratch1 and rscratch2
  //
  void gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
    case BarrierSet::G1SATBCTLogging:
      // With G1, don't generate the call if we statically know that the target is uninitialized
      if (!dest_uninitialized) {
        __ push_call_clobbered_registers();
        if (count == c_rarg0) {
          if (addr == c_rarg1) {
            // exactly backwards!!
            __ mov(rscratch1, c_rarg0);
            __ mov(c_rarg0, c_rarg1);
            __ mov(c_rarg1, rscratch1);
          } else {
            __ mov(c_rarg1, count);
            __ mov(c_rarg0, addr);
          }
        } else {
          __ mov(c_rarg0, addr);
          __ mov(c_rarg1, count);
        }
        __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2);
        __ pop_call_clobbered_registers();
      }
      break;
    case BarrierSet::CardTableForRS:
    case BarrierSet::CardTableExtension:
    case BarrierSet::ModRef:
      break;
    default:
      ShouldNotReachHere();
    }
  }
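  // n.b. for G1 (BarrierSet::G1SATBCTLogging) the pre barrier logs the
  // oops about to be overwritten so the concurrent marker can snapshot
  // them (SATB); when the destination is known to be uninitialized there
  // are no old values to log, which is why the call is skipped above for
  // dest_uninitialized.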

  //
  // Generate code for an array write post barrier
  //
  //  Input:
  //     start   - register containing starting address of destination array
  //     end     - register containing ending address of destination array
  //     scratch - scratch register
  //
  //  The input registers are overwritten.
  //  The ending address is inclusive.
  void gen_write_ref_array_post_barrier(Register start, Register end, Register scratch) {
    assert_different_registers(start, end, scratch);
    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
    case BarrierSet::G1SATBCTLogging:
      {
        __ push_call_clobbered_registers();
        // must compute element count unless barrier set interface is changed (other platforms supply count)
        assert_different_registers(start, end, scratch);
        __ lea(scratch, Address(end, BytesPerHeapOop));
        __ sub(scratch, scratch, start);               // subtract start to get #bytes
        __ lsr(scratch, scratch, LogBytesPerHeapOop);  // convert to element count
        __ mov(c_rarg0, start);
        __ mov(c_rarg1, scratch);
        __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
        __ pop_call_clobbered_registers();
      }
      break;
    case BarrierSet::CardTableForRS:
    case BarrierSet::CardTableExtension:
      {
        CardTableModRefBS* ct = (CardTableModRefBS*)bs;
        assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");

        Label L_loop;

        __ lsr(start, start, CardTableModRefBS::card_shift);
        __ lsr(end, end, CardTableModRefBS::card_shift);
        __ sub(end, end, start); // number of bytes to copy

        const Register count = end; // 'end' register contains bytes count now
        __ load_byte_map_base(scratch);
        __ add(start, start, scratch);
        if (UseConcMarkSweepGC) {
          __ membar(__ StoreStore);
        }
        __ BIND(L_loop);
        __ strb(zr, Address(start, count));
        __ subs(count, count, 1);
        __ br(Assembler::HS, L_loop);
      }
      break;
    default:
      ShouldNotReachHere();
    }
  }
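  // For the card table case the loop above is, in effect:
  //
  //   jbyte* base = byte_map_base;
  //   for (uintptr_t card = start >> card_shift; card <= end >> card_shift; card++)
  //     base[card] = 0;   // dirty card value
  //
  // i.e. every card spanned by [start, end] (end inclusive) is dirtied.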

  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(Label &start, Register s, Register d, Register count,
                           copy_direction direction) {
    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4 : 2) * wordSize;

    int offset;
    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r10, t6 = r11, t7 = r12;
    const Register stride = r13;

    assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1);

    Label again, drain;
    const char *stub_name;
    if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
    else
      stub_name = "backward_copy_longs";
    StubCodeMark mark(this, "StubRoutines", stub_name);
    __ align(CodeEntryAlignment);
    __ bind(start);
    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, 8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
        __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
      } else {
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        __ stp(t0, t1, Address(d, 2 * unit));
        __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L2);
    }

    __ ret(lr);
  }
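  // n.b. the main loop above is software-pipelined: each iteration
  // stores the eight words loaded by the previous iteration while
  // loading the next eight, so the loads stay ahead of the stores; the
  // drain step then flushes the final batch, and the trailing code
  // handles the leftover 4- and 2-word chunks flagged in count.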

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;
    int unit = wordSize * direction;

    Label Lpair, Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.

    __ tbz(count, 3 - exact_log2(granularity), Lword);
    __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
    __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }
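  // Worked example, byte copy (step == 1) with count == 7 (0b111):
  // bit 3 is clear so no 8-byte move happens; bit 2 copies 4 bytes,
  // bit 1 copies 2 and bit 0 copies 1 -- i.e. each set bit k of count
  // moves 2^k bytes, and the pieces together cover all 7 bytes.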

  Label copy_f, copy_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //

  void copy_memory(bool is_aligned, Register s, Register d,
                   Register count, Register tmp, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    // <= 96 bytes do inline.  Direction doesn't matter because we always
    // load all the data before writing anything
    Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
    const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
    const Register send = r17, dend = r18;

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
    __ cmp(count, (UseSIMDForMemoryOps ? 96 : 80) / granularity);
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, 16/granularity);
    __ br(Assembler::LS, copy16);

    __ cmp(count, 64/granularity);
    __ br(Assembler::HI, copy80);

    __ cmp(count, 32/granularity);
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 0));
      __ ldpq(v2, v3, Address(send, -32));
      __ stpq(v0, v1, Address(d, 0));
      __ stpq(v2, v3, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(send, -32));
      __ ldp(t6, t7, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(dend, -32));
      __ stp(t6, t7, Address(dend, -16));
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    __ ldp(t0, t1, Address(s, 0));
    __ ldp(t2, t3, Address(send, -16));
    __ stp(t0, t1, Address(d, 0));
    __ stp(t2, t3, Address(dend, -16));
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 0));
      __ ldpq(v2, v3, Address(s, 32));
      __ ldpq(v4, v5, Address(send, -32));
      __ stpq(v0, v1, Address(d, 0));
      __ stpq(v2, v3, Address(d, 32));
      __ stpq(v4, v5, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(s, 32));
      __ ldp(t6, t7, Address(s, 48));
      __ ldp(t8, t9, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(d, 32));
      __ stp(t6, t7, Address(d, 48));
      __ stp(t8, t9, Address(dend, -16));
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, 8/granularity);
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    __ ldr(t0, Address(s, 0));
    __ ldr(t1, Address(send, -8));
    __ str(t0, Address(d, 0));
    __ str(t1, Address(dend, -8));
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean in the 1 byte case we load/store the same
          // byte 3 times.
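          // Worked example: for count == 3 the three stores below hit
          // d[0], d[2] (dend - 1) and d[1] (count >> 1 == 1); for
          // count == 1 all three hit d[0], which is harmless.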
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }

    __ bind(copy_big);
    if (is_backwards) {
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now we've got the small case out of the way, we can align the
    // source address on a 2-word boundary.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
      __ tbz(s, exact_log2(wordSize), aligned);
      __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
      __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
      __ sub(count, count, wordSize/granularity);
    } else {
      if (is_backwards) {
        __ andr(rscratch2, s, 2 * wordSize - 1);
      } else {
        __ neg(rscratch2, s);
        __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
      }
      // rscratch2 is the byte adjustment needed to align s.
      __ cbz(rscratch2, aligned);
      int shift = exact_log2(granularity);
      if (shift)  __ lsr(rscratch2, rscratch2, shift);
      __ sub(count, count, rscratch2);

#if 0
      // ?? This code is only correct for a disjoint copy.  It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, rscratch2);
        __ sub(d, d, rscratch2);
      } else {
        __ add(s, s, rscratch2);
        __ add(d, d, rscratch2);
      }
#else
      copy_memory_small(s, d, rscratch2, rscratch1, step);
#endif
    }

    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes.  Adjust the
    // count and do a bulk copy of words.
    __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
    if (direction == copy_forwards)
      __ bl(copy_f);
    else
      __ bl(copy_b);

    // And the tail.
    copy_memory_small(s, d, count, tmp, step);

    if (granularity >= 8) __ bind(copy8);
    if (granularity >= 4) __ bind(copy4);
    __ bind(finish);
  }


  void clobber_registers() {
#ifdef ASSERT
    __ mov(rscratch1, (uint64_t)0xdeadbeef);
    __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
    for (Register r = r3; r <= r18; r++)
      if (r != rscratch1) __ mov(r, rscratch1);
#endif
  }

  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers rscratch1 and rscratch2.
  void verify_oop_array (size_t size, Register a, Register count, Register temp) {
    Label loop, end;
    __ mov(rscratch1, a);
    __ mov(rscratch2, zr);
    __ bind(loop);
    __ cmp(rscratch2, count);
    __ br(Assembler::HS, end);
    if (size == (size_t)wordSize) {
      __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ verify_oop(temp);
    } else {
      __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ decode_heap_oop(temp); // calls verify_oop
    }
    __ add(rscratch2, rscratch2, size);
    __ b(loop);
    __ bind(end);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
                                 const char *name, bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    if (is_oop) {
      __ push(RegSet::of(d, count), sp);
      // no registers are destroyed by this call
      gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
    }
    copy_memory(aligned, s, d, count, rscratch1, size);
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
      gen_write_ref_array_post_barrier(d, count, rscratch1);
    }
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }
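  // The conjoint variant below reuses the entry published here: it
  // compares (d - s) against the byte count and, when the regions
  // cannot overlap in a forward copy, simply branches to this
  // no-overlap entry point.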

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
                                 address *entry, const char *name,
                                 bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;

    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // use fwd copy when (d-s) above_equal (count*size)
    __ sub(rscratch1, d, s);
    __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
    __ br(Assembler::HS, nooverlap_target);

    if (is_oop) {
      __ push(RegSet::of(d, count), sp);
      // no registers are destroyed by this call
      gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
    }
    copy_memory(aligned, s, d, count, rscratch1, -size);
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, Address::uxtw(exact_log2(size))));
      gen_write_ref_array_post_barrier(d, count, rscratch1);
    }
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
  //
  address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                      address* entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_short_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_short_copy().
  //
  address generate_disjoint_short_copy(bool aligned,
                                       address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
                                       address *entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_int_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
                                     address *entry, const char *name,
                                     bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
  }


  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_long_copy(bool aligned, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_long_copy(bool aligned,
                                      address nooverlap_target, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_oop_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_oop_copy(bool aligned,
                                     address nooverlap_target, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
                                  name, dest_uninitialized);
  }
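  // n.b. with UseCompressedOops the element size used above is 4 bytes
  // (a narrowOop), so the oop copy stubs move raw 32-bit compressed
  // references with the int-width machinery; no decode/encode is needed
  // for a bitwise copy between heap arrays.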


  // Helper for generating a dynamic type check.
  // Smashes rscratch1.
  void generate_type_check(Register sub_klass,
                           Register super_check_offset,
                           Register super_klass,
                           Label& L_success) {
    assert_different_registers(sub_klass, super_check_offset, super_klass);

    BLOCK_COMMENT("type_check:");

    Label L_miss;

    __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL,
                                     super_check_offset);
    __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);

    // Fall through on failure!
    __ BIND(L_miss);
  }

  //
  //  Generate checkcasting array copy stub
  //
  //  Input:
  //    c_rarg0   - source array address
  //    c_rarg1   - destination array address
  //    c_rarg2   - element count, treated as ssize_t, can be zero
  //    c_rarg3   - size_t ckoff (super_check_offset)
  //    c_rarg4   - oop ckval (super_klass)
  //
  //  Output:
  //    r0 ==  0  -  success
  //    r0 == -1^K - failure, where K is partial transfer count
  //
  address generate_checkcast_copy(const char *name, address *entry,
                                  bool dest_uninitialized = false) {

    Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;

    // Input registers (after setup_arg_regs)
    const Register from        = c_rarg0;   // source array address
    const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // element count
    const Register ckoff       = c_rarg3;   // super_check_offset
    const Register ckval       = c_rarg4;   // super_klass

    // Registers used as temps (r18, r19, r20 are save-on-entry)
    const Register count_save  = r21;       // original element count
    const Register start_to    = r20;       // destination array start address
    const Register copied_oop  = r18;       // actual oop copied
    const Register r19_klass   = r19;       // oop._klass

    //---------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the two arrays are subtypes of Object[] but the
    // destination array type is not equal to or a supertype
    // of the source type.  Each element must be separately
    // checked.

    assert_different_registers(from, to, count, ckoff, ckval, start_to,
                               copied_oop, r19_klass, count_save);

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef ASSERT
    // caller guarantees that the arrays really are different
    // otherwise, we would have to make conjoint checks
    { Label L;
      array_overlap_test(L, TIMES_OOP);
      __ stop("checkcast_copy within a single array");
      __ bind(L);
    }
#endif //ASSERT

    // Caller of this entry point must set up the argument registers.
    if (entry != NULL) {
      *entry = __ pc();
      BLOCK_COMMENT("Entry:");
    }

    // Empty array:  Nothing to do.
    __ cbz(count, L_done);

    __ push(RegSet::of(r18, r19, r20, r21), sp);

#ifdef ASSERT
    BLOCK_COMMENT("assert consistent ckoff/ckval");
    // The ckoff and ckval must be mutually consistent,
    // even though caller generates both.
    { Label L;
      int sco_offset = in_bytes(Klass::super_check_offset_offset());
      __ ldrw(start_to, Address(ckval, sco_offset));
      __ cmpw(ckoff, start_to);
      __ br(Assembler::EQ, L);
      __ stop("super_check_offset inconsistent");
      __ bind(L);
    }
#endif //ASSERT

    gen_write_ref_array_pre_barrier(to, count, dest_uninitialized);

    // save the original count
    __ mov(count_save, count);

    // Copy from low to high addresses
    __ mov(start_to, to);              // Save destination array start address
    __ b(L_load_element);

    // ======== begin loop ========
    // (Loop is rotated; its entry is L_load_element.)
    // Loop control:
    //   for (; count != 0; count--) {
    //     copied_oop = load_heap_oop(from++);
    //     ... generate_type_check ...;
    //     store_heap_oop(to++, copied_oop);
    //   }
    __ align(OptoLoopAlignment);

    __ BIND(L_store_element);
    __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop); // store the oop
    __ sub(count, count, 1);
    __ cbz(count, L_do_card_marks);

    // ======== loop entry is here ========
    __ BIND(L_load_element);
    __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop
    __ cbz(copied_oop, L_store_element);

    __ load_klass(r19_klass, copied_oop); // query the object klass
    generate_type_check(r19_klass, ckoff, ckval, L_store_element);
    // ======== end loop ========

    // It was a real error; we must depend on the caller to finish the job.
    // Register count = remaining oops, count_orig = total oops.
    // Emit GC store barriers for the oops we have copied and report
    // their number to the caller.

    __ subs(count, count_save, count); // K = partially copied oop count
    __ eon(count, count, zr);          // report (-1^K) to caller
    __ br(Assembler::EQ, L_done_pop);

    __ BIND(L_do_card_marks);
    __ add(to, to, -heapOopSize);      // make an inclusive end pointer
    gen_write_ref_array_post_barrier(start_to, to, rscratch1);

    __ bind(L_done_pop);
    __ pop(RegSet::of(r18, r19, r20, r21), sp);
    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);

    __ bind(L_done);
    __ mov(r0, count);
    __ leave();
    __ ret(lr);

    return start;
  }

  // Perform range checks on the proposed arraycopy.
  // Kills temp, but nothing else.
  // Also, clean the sign bits of src_pos and dst_pos.
  void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
                              Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
                              Register dst_pos, // destination position (c_rarg3)
                              Register length,
                              Register temp,
                              Label& L_failed) {
    BLOCK_COMMENT("arraycopy_range_checks:");

    assert_different_registers(rscratch1, temp);

    // if (src_pos + length > arrayOop(src)->length())  FAIL;
    __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, src_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    // if (dst_pos + length > arrayOop(dst)->length())  FAIL;
    __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, dst_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
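    // n.b. on AArch64 a 32-bit register write zeroes the upper 32 bits
    // of the destination, so each movw below acts as a zero-extending
    // self-move.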
  // Perform range checks on the proposed arraycopy.
  // Kills temp, but nothing else.
  // Also, clean the sign bits of src_pos and dst_pos.
  void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
                              Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
                              Register dst_pos, // destination position (c_rarg3)
                              Register length,
                              Register temp,
                              Label& L_failed) {
    BLOCK_COMMENT("arraycopy_range_checks:");

    assert_different_registers(rscratch1, temp);

    //  if (src_pos + length > arrayOop(src)->length())  FAIL;
    __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, src_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
    __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, dst_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
    __ movw(src_pos, src_pos);
    __ movw(dst_pos, dst_pos);

    BLOCK_COMMENT("arraycopy_range_checks done");
  }

  // These stubs get called from some dumb test routine.
  // I'll write them properly when they're called from
  // something that's actually doing something.
  static void fake_arraycopy_stub(address src, address dst, int count) {
    assert(count == 0, "huh?");
  }


  //
  //  Generate 'unsafe' array copy stub
  //  Though just as safe as the other stubs, it takes an unscaled
  //  size_t argument instead of an element count.
  //
  //  Input:
  //    c_rarg0   - source array address
  //    c_rarg1   - destination array address
  //    c_rarg2   - byte count, treated as ssize_t, can be zero
  //
  //  Examines the alignment of the operands and dispatches
  //  to a long, int, short, or byte copy loop.
  //
  address generate_unsafe_copy(const char *name,
                               address byte_copy_entry) {
#ifdef PRODUCT
    return StubRoutines::_jbyte_arraycopy;
#else
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    // bump this on entry, not on exit:
    __ lea(rscratch2, ExternalAddress((address)&SharedRuntime::_unsafe_array_copy_ctr));
    __ incrementw(Address(rscratch2));
    __ b(RuntimeAddress(byte_copy_entry));
    return start;
#endif
  }

  //
  //  Generate generic array copy stubs
  //
  //  Input:
  //    c_rarg0    -  src oop
  //    c_rarg1    -  src_pos (32-bits)
  //    c_rarg2    -  dst oop
  //    c_rarg3    -  dst_pos (32-bits)
  //    c_rarg4    -  element count (32-bits)
  //
  //  Output:
  //    r0 ==  0   - success
  //    r0 == -1^K - failure, where K is partial transfer count
  //
  address generate_generic_copy(const char *name,
                                address byte_copy_entry, address short_copy_entry,
                                address int_copy_entry, address oop_copy_entry,
                                address long_copy_entry, address checkcast_copy_entry) {

    Label L_failed, L_failed_0, L_objArray;
    Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;

    // Input registers
    const Register src        = c_rarg0;  // source array oop
    const Register src_pos    = c_rarg1;  // source position
    const Register dst        = c_rarg2;  // destination array oop
    const Register dst_pos    = c_rarg3;  // destination position
    const Register length     = c_rarg4;

    StubCodeMark mark(this, "StubRoutines", name);

    __ align(CodeEntryAlignment);
    address start = __ pc();

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // bump this on entry, not on exit:
    inc_counter_np(SharedRuntime::_generic_array_copy_ctr);

    //-----------------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the following conditions are met:
    //
    // (1) src and dst must not be null.
    // (2) src_pos must not be negative.
    // (3) dst_pos must not be negative.
    // (4) length  must not be negative.
    // (5) src klass and dst klass should be the same and not NULL.
    // (6) src and dst should be arrays.
    // (7) src_pos + length must not exceed length of src.
    // (8) dst_pos + length must not exceed length of dst.
    //
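    // Restating conditions (1)-(8) in rough C (illustration only; for
    // objArrays the klass-equality requirement in (5) is relaxed by
    // the checkcast path further down):
    //
    //   if (src == NULL || src_pos < 0) return -1;
    //   if (dst == NULL || dst_pos < 0) return -1;
    //   if (length < 0) return -1;
    //   if (src->klass() == NULL || src->klass() != dst->klass()) return -1;
    //   if (!src->is_array()) return -1;
    //   if ((size_t)src_pos + length > src->length()) return -1;
    //   if ((size_t)dst_pos + length > dst->length()) return -1;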
    //  if (src == NULL) return -1;
    __ cbz(src, L_failed);

    //  if (src_pos < 0) return -1;
    __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set

    //  if (dst == NULL) return -1;
    __ cbz(dst, L_failed);

    //  if (dst_pos < 0) return -1;
    __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set

    // registers used as temp
    const Register scratch_length    = r16; // elements count to copy
    const Register scratch_src_klass = r17; // array klass
    const Register lh                = r18; // layout helper

    //  if (length < 0) return -1;
    __ movw(scratch_length, length);        // length (elements count, 32-bits value)
    __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set

    __ load_klass(scratch_src_klass, src);
#ifdef ASSERT
    //  assert(src->klass() != NULL);
    {
      BLOCK_COMMENT("assert klasses not null {");
      Label L1, L2;
      __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
      __ bind(L1);
      __ stop("broken null klass");
      __ bind(L2);
      __ load_klass(rscratch1, dst);
      __ cbz(rscratch1, L1);     // this would be broken also
      BLOCK_COMMENT("} assert klasses not null done");
    }
#endif

    // Load layout helper (32-bits)
    //
    //  |array_tag|     | header_size | element_type |     |log2_element_size|
    //  32        30    24            16              8    2                 0
    //
    //  array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
    //

    const int lh_offset = in_bytes(Klass::layout_helper_offset());

    // Handle objArrays completely differently...
    const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
    __ ldrw(lh, Address(scratch_src_klass, lh_offset));
    __ movw(rscratch1, objArray_lh);
    __ eorw(rscratch2, lh, rscratch1);
    __ cbzw(rscratch2, L_objArray);

    //  if (src->klass() != dst->klass()) return -1;
    __ load_klass(rscratch2, dst);
    __ eor(rscratch2, rscratch2, scratch_src_klass);
    __ cbnz(rscratch2, L_failed);

    //  if (!src->is_Array()) return -1;
    __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)

    // At this point, it is known to be a typeArray (array_tag 0x3).
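    // For reference, decoding a layout helper value lh in C
    // (illustration only, using the Klass::_lh_* constants that the
    // code below also relies on):
    //
    //   int tag   = ((juint)lh) >> Klass::_lh_array_tag_shift;  // 0x3 or 0x2
    //   int hsize = (lh >> Klass::_lh_header_size_shift) & Klass::_lh_header_size_mask;
    //   int l2es  = lh & Klass::_lh_log2_element_size_mask;     // log2(element size)
    //
    // Array layout helpers are negative as 32-bit ints (the tag sets
    // the sign bit), which is why the tbz test on bit 31 above
    // rejects non-arrays.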
#ifdef ASSERT
    {
      BLOCK_COMMENT("assert primitive array {");
      Label L;
      __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
      __ cmpw(lh, rscratch2);
      __ br(Assembler::GE, L);
      __ stop("must be a primitive array");
      __ bind(L);
      BLOCK_COMMENT("} assert primitive array done");
    }
#endif

    arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
                           rscratch2, L_failed);

    // TypeArrayKlass
    //
    // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
    // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
    //

    const Register rscratch1_offset = rscratch1;    // array offset
    const Register r18_elsize       = lh;           // element size

    __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
            exact_log2(Klass::_lh_header_size_mask+1));  // array_offset
    __ add(src, src, rscratch1_offset);  // src array offset
    __ add(dst, dst, rscratch1_offset);  // dst array offset
    BLOCK_COMMENT("choose copy loop based on element size");

    // next registers should be set before the jump to corresponding stub
    const Register from  = c_rarg0;  // source array address
    const Register to    = c_rarg1;  // destination array address
    const Register count = c_rarg2;  // elements count

    // 'from', 'to', and 'count' must be set in this order, since they
    // are the same registers as 'src', 'src_pos', and 'dst'.

    assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");

    // The possible values of elsize are 0-3, i.e. exact_log2(element
    // size in bytes).  We do a simple bitwise binary search.
  __ BIND(L_copy_bytes);
    __ tbnz(r18_elsize, 1, L_copy_ints);
    __ tbnz(r18_elsize, 0, L_copy_shorts);
    __ lea(from, Address(src, src_pos));  // src_addr
    __ lea(to,   Address(dst, dst_pos));  // dst_addr
    __ movw(count, scratch_length);       // length
    __ b(RuntimeAddress(byte_copy_entry));

  __ BIND(L_copy_shorts);
    __ lea(from, Address(src, src_pos, Address::lsl(1)));  // src_addr
    __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));  // dst_addr
    __ movw(count, scratch_length);                        // length
    __ b(RuntimeAddress(short_copy_entry));

  __ BIND(L_copy_ints);
    __ tbnz(r18_elsize, 0, L_copy_longs);
    __ lea(from, Address(src, src_pos, Address::lsl(2)));  // src_addr
    __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));  // dst_addr
    __ movw(count, scratch_length);                        // length
    __ b(RuntimeAddress(int_copy_entry));

  __ BIND(L_copy_longs);
#ifdef ASSERT
    {
      BLOCK_COMMENT("assert long copy {");
      Label L;
      __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
      __ cmpw(r18_elsize, LogBytesPerLong);
      __ br(Assembler::EQ, L);
      __ stop("must be long copy, but elsize is wrong");
      __ bind(L);
      BLOCK_COMMENT("} assert long copy done");
    }
#endif
    __ lea(from, Address(src, src_pos, Address::lsl(3)));  // src_addr
    __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));  // dst_addr
    __ movw(count, scratch_length);                        // length
    __ b(RuntimeAddress(long_copy_entry));

    // ObjArrayKlass
  __ BIND(L_objArray);
    // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]

    Label L_plain_copy, L_checkcast_copy;
    // test array classes for subtyping
    __ load_klass(r18, dst);
    __ cmp(scratch_src_klass, r18); // usual case is exact equality
    __ br(Assembler::NE, L_checkcast_copy);

    // Identically typed arrays can be copied without element-wise checks.
    arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
                           rscratch2, L_failed);

    __ lea(from, Address(src, src_pos, Address::lsl(3)));
    __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
    __ lea(to, Address(dst, dst_pos, Address::lsl(3)));
    __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
    __ movw(count, scratch_length); // length
  __ BIND(L_plain_copy);
    __ b(RuntimeAddress(oop_copy_entry));

  __ BIND(L_checkcast_copy);
    // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
    {
      // Before looking at dst.length, make sure dst is also an objArray.
      __ ldrw(rscratch1, Address(r18, lh_offset));
      __ movw(rscratch2, objArray_lh);
      __ eorw(rscratch1, rscratch1, rscratch2);
      __ cbnzw(rscratch1, L_failed);

      // It is safe to examine both src.length and dst.length.
      arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
                             r18, L_failed);

      const Register rscratch2_dst_klass = rscratch2;
      __ load_klass(rscratch2_dst_klass, dst); // reload

      // Marshal the base address arguments now, freeing registers.
      __ lea(from, Address(src, src_pos, Address::lsl(3)));
      __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
      __ lea(to, Address(dst, dst_pos, Address::lsl(3)));
      __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
      __ movw(count, length);       // length (reloaded)
      Register sco_temp = c_rarg3;  // this register is free now
      assert_different_registers(from, to, count, sco_temp,
                                 rscratch2_dst_klass, scratch_src_klass);
      // assert_clean_int(count, sco_temp);

      // Generate the type check.
      const int sco_offset = in_bytes(Klass::super_check_offset_offset());
      __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
      // assert_clean_int(sco_temp, r18);
      generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy);

      // Fetch destination element klass from the ObjArrayKlass header.
      int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
      __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset));
      __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));

      // the checkcast_copy loop needs two extra arguments:
      assert(c_rarg3 == sco_temp, "#3 already in place");
      // Set up arguments for checkcast_copy_entry.
      __ mov(c_rarg4, rscratch2_dst_klass);  // dst.klass.element_klass
      __ b(RuntimeAddress(checkcast_copy_entry));
    }

  __ BIND(L_failed);
    __ mov(r0, -1);
    __ leave();   // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }

  void generate_arraycopy_stubs() {
    address entry;
    address entry_jbyte_arraycopy;
    address entry_jshort_arraycopy;
    address entry_jint_arraycopy;
    address entry_oop_arraycopy;
    address entry_jlong_arraycopy;
    address entry_checkcast_arraycopy;

    generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
    generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);

    //*** jbyte
    // Always need aligned and unaligned versions
    StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
                                                                                  "jbyte_disjoint_arraycopy");
    StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
                                                                                  &entry_jbyte_arraycopy,
                                                                                  "jbyte_arraycopy");
    StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
                                                                                  "arrayof_jbyte_disjoint_arraycopy");
    StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
                                                                                  "arrayof_jbyte_arraycopy");

    //*** jshort
    // Always need aligned and unaligned versions
    StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
                                                                                    "jshort_disjoint_arraycopy");
    StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
                                                                                    &entry_jshort_arraycopy,
                                                                                    "jshort_arraycopy");
    StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
                                                                                    "arrayof_jshort_disjoint_arraycopy");
    StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
                                                                                    "arrayof_jshort_arraycopy");

    //*** jint
    // Aligned versions
    StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
                                                                                "arrayof_jint_disjoint_arraycopy");
    StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
                                                                                "arrayof_jint_arraycopy");
    // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
    // entry_jint_arraycopy always points to the unaligned version
    StubRoutines::_jint_disjoint_arraycopy         = generate_disjoint_int_copy(false, &entry,
                                                                                "jint_disjoint_arraycopy");
    StubRoutines::_jint_arraycopy                  = generate_conjoint_int_copy(false, entry,
                                                                                &entry_jint_arraycopy,
                                                                                "jint_arraycopy");

    //*** jlong
    // It is always aligned
    StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
                                                                                  "arrayof_jlong_disjoint_arraycopy");
    StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
                                                                                  "arrayof_jlong_arraycopy");
    StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
    StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;

    //*** oops
    {
      // With compressed oops we need unaligned versions; notice that
      // we overwrite entry_oop_arraycopy.
      bool aligned = !UseCompressedOops;

      StubRoutines::_arrayof_oop_disjoint_arraycopy
        = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy",
                                     /*dest_uninitialized*/false);
      StubRoutines::_arrayof_oop_arraycopy
        = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy",
                                     /*dest_uninitialized*/false);
      // Aligned versions without pre-barriers
      StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
        = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
                                     /*dest_uninitialized*/true);
      StubRoutines::_arrayof_oop_arraycopy_uninit
        = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
                                     /*dest_uninitialized*/true);
    }

    StubRoutines::_oop_disjoint_arraycopy        = StubRoutines::_arrayof_oop_disjoint_arraycopy;
    StubRoutines::_oop_arraycopy                 = StubRoutines::_arrayof_oop_arraycopy;
    StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
    StubRoutines::_oop_arraycopy_uninit          = StubRoutines::_arrayof_oop_arraycopy_uninit;

    StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
    StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
                                                                        /*dest_uninitialized*/true);

    StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy",
                                                           entry_jbyte_arraycopy);

    StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy",
                                                             entry_jbyte_arraycopy,
                                                             entry_jshort_arraycopy,
                                                             entry_jint_arraycopy,
                                                             entry_oop_arraycopy,
                                                             entry_jlong_arraycopy,
                                                             entry_checkcast_arraycopy);

  }

  void generate_math_stubs() { Unimplemented(); }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //
  address generate_aescrypt_encryptBlock() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");

    Label L_doLast;

    const Register from   = c_rarg0;  // source array address
    const Register to     = c_rarg1;  // destination array address
    const Register key    = c_rarg2;  // key array address
    const Register keylen = rscratch1;

    address start = __ pc();
    __ enter();

    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ ld1(v0, __ T16B, from); // get 16 bytes of input

    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);
    __ rev32(v3, __ T16B, v3);
    __ rev32(v4, __ T16B, v4);
    __ aese(v0, v1);
    __ aesmc(v0, v0);
    __ aese(v0, v2);
    __ aesmc(v0, v0);
    __ aese(v0, v3);
    __ aesmc(v0, v0);
    __ aese(v0, v4);
    __ aesmc(v0, v0);

    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);
    __ rev32(v3, __ T16B, v3);
    __ rev32(v4, __ T16B, v4);
    __ aese(v0, v1);
    __ aesmc(v0, v0);
    __ aese(v0, v2);
    __ aesmc(v0, v0);
    __ aese(v0, v3);
    __ aesmc(v0, v0);
    __ aese(v0, v4);
    __ aesmc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    __ cmpw(keylen, 44);
    __ br(Assembler::EQ, L_doLast);

    __ aese(v0, v1);
    __ aesmc(v0, v0);
    __ aese(v0, v2);
    __ aesmc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    __ cmpw(keylen, 52);
    __ br(Assembler::EQ, L_doLast);

    __ aese(v0, v1);
    __ aesmc(v0, v0);
    __ aese(v0, v2);
    __ aesmc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

  __ BIND(L_doLast);

    __ aese(v0, v1);
    __ aesmc(v0, v0);
    __ aese(v0, v2);

    __ ld1(v1, __ T16B, key);
    __ rev32(v1, __ T16B, v1);
    __ eor(v0, __ T16B, v0, v1);

    __ st1(v0, __ T16B, to);

    __ mov(r0, 0);

    __ leave();
    __ ret(lr);

    return start;
  }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //
  address generate_aescrypt_decryptBlock() {
    assert(UseAES, "need AES instructions");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
    Label L_doLast;

    const Register from   = c_rarg0;  // source array address
    const Register to     = c_rarg1;  // destination array address
    const Register key    = c_rarg2;  // key array address
    const Register keylen = rscratch1;

    address start = __ pc();
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ ld1(v0, __ T16B, from); // get 16 bytes of input

    __ ld1(v5, __ T16B, __ post(key, 16));
    __ rev32(v5, __ T16B, v5);

    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);
    __ rev32(v3, __ T16B, v3);
    __ rev32(v4, __ T16B, v4);
    __ aesd(v0, v1);
    __ aesimc(v0, v0);
    __ aesd(v0, v2);
    __ aesimc(v0, v0);
    __ aesd(v0, v3);
    __ aesimc(v0, v0);
    __ aesd(v0, v4);
    __ aesimc(v0, v0);

    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);
    __ rev32(v3, __ T16B, v3);
    __ rev32(v4, __ T16B, v4);
    __ aesd(v0, v1);
    __ aesimc(v0, v0);
    __ aesd(v0, v2);
    __ aesimc(v0, v0);
    __ aesd(v0, v3);
    __ aesimc(v0, v0);
    __ aesd(v0, v4);
    __ aesimc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    __ cmpw(keylen, 44);
    __ br(Assembler::EQ, L_doLast);

    __ aesd(v0, v1);
    __ aesimc(v0, v0);
    __ aesd(v0, v2);
    __ aesimc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    __ cmpw(keylen, 52);
    __ br(Assembler::EQ, L_doLast);

    __ aesd(v0, v1);
    __ aesimc(v0, v0);
    __ aesd(v0, v2);
    __ aesimc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

  __ BIND(L_doLast);

    __ aesd(v0, v1);
    __ aesimc(v0, v0);
    __ aesd(v0, v2);

    __ eor(v0, __ T16B, v0, v5);

    __ st1(v0, __ T16B, to);

    __ mov(r0, 0);

    __ leave();
    __ ret(lr);

    return start;
  }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //   c_rarg3   - r vector byte array address
  //   c_rarg4   - input length
  //
  // Output:
  //   r0        - input length
  //
  address generate_cipherBlockChaining_encryptAESCrypt() {
    assert(UseAES, "need AES instructions");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");

    Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;

    const Register from    = c_rarg0;  // source array address
    const Register to      = c_rarg1;  // destination array address
    const Register key     = c_rarg2;  // key array address
    const Register rvec    = c_rarg3;  // r byte array initialized from initvector array address
                                       // and left with the results of the last encryption block
    const Register len_reg = c_rarg4;  // src len (must be multiple of blocksize 16)
    const Register keylen  = rscratch1;

    address start = __ pc();
    __ enter();

    __ mov(rscratch2, len_reg);
    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ ld1(v0, __ T16B, rvec);

    __ cmpw(keylen, 52);
    __ br(Assembler::CC, L_loadkeys_44);
    __ br(Assembler::EQ, L_loadkeys_52);

    __ ld1(v17, v18, __ T16B, __ post(key, 32));
    __ rev32(v17, __ T16B, v17);
    __ rev32(v18, __ T16B, v18);
  __ BIND(L_loadkeys_52);
    __ ld1(v19, v20, __ T16B, __ post(key, 32));
    __ rev32(v19, __ T16B, v19);
    __ rev32(v20, __ T16B, v20);
  __ BIND(L_loadkeys_44);
    __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
    __ rev32(v21, __ T16B, v21);
    __ rev32(v22, __ T16B, v22);
    __ rev32(v23, __ T16B, v23);
    __ rev32(v24, __ T16B, v24);
    __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
    __ rev32(v25, __ T16B, v25);
    __ rev32(v26, __ T16B, v26);
    __ rev32(v27, __ T16B, v27);
    __ rev32(v28, __ T16B, v28);
    __ ld1(v29, v30, v31, __ T16B, key);
    __ rev32(v29, __ T16B, v29);
    __ rev32(v30, __ T16B, v30);
    __ rev32(v31, __ T16B, v31);

  __ BIND(L_aes_loop);
    __ ld1(v1, __ T16B, __ post(from, 16));
    __ eor(v0, __ T16B, v0, v1);

    __ br(Assembler::CC, L_rounds_44);
    __ br(Assembler::EQ, L_rounds_52);

    __ aese(v0, v17); __ aesmc(v0, v0);
    __ aese(v0, v18); __ aesmc(v0, v0);
  __ BIND(L_rounds_52);
    __ aese(v0, v19); __ aesmc(v0, v0);
    __ aese(v0, v20); __ aesmc(v0, v0);
  __ BIND(L_rounds_44);
    __ aese(v0, v21); __ aesmc(v0, v0);
    __ aese(v0, v22); __ aesmc(v0, v0);
    __ aese(v0, v23); __ aesmc(v0, v0);
    __ aese(v0, v24); __ aesmc(v0, v0);
    __ aese(v0, v25); __ aesmc(v0, v0);
    __ aese(v0, v26); __ aesmc(v0, v0);
    __ aese(v0, v27); __ aesmc(v0, v0);
    __ aese(v0, v28); __ aesmc(v0, v0);
    __ aese(v0, v29); __ aesmc(v0, v0);
    __ aese(v0, v30);
    __ eor(v0, __ T16B, v0, v31);

    __ st1(v0, __ T16B, __ post(to, 16));
    __ sub(len_reg, len_reg, 16);
    __ cbnz(len_reg, L_aes_loop);

    __ st1(v0, __ T16B, rvec);

    __ mov(r0, rscratch2);

    __ leave();
    __ ret(lr);

    return start;
  }
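  // For reference, the CBC recurrences implemented by the stub above
  // and the decrypt stub below (illustration; E/D are the raw block
  // cipher, C_0 the initial rvec contents):
  //
  //   encrypt:  C_i = E_K(P_i ^ C_{i-1})   -- inherently serial
  //   decrypt:  P_i = D_K(C_i) ^ C_{i-1}   -- each block independent
  //
  // This is why the decrypt loop below keeps an unmodified copy of
  // each input block (the orr into v1, later copied to v2) to chain
  // into the next iteration while v0 is decrypted in place.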
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //   c_rarg3   - r vector byte array address
  //   c_rarg4   - input length
  //
  // Output:
  //   r0        - input length
  //
  address generate_cipherBlockChaining_decryptAESCrypt() {
    assert(UseAES, "need AES instructions");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");

    Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;

    const Register from    = c_rarg0;  // source array address
    const Register to      = c_rarg1;  // destination array address
    const Register key     = c_rarg2;  // key array address
    const Register rvec    = c_rarg3;  // r byte array initialized from initvector array address
                                       // and left with the results of the last encryption block
    const Register len_reg = c_rarg4;  // src len (must be multiple of blocksize 16)
    const Register keylen  = rscratch1;

    address start = __ pc();
    __ enter();

    __ mov(rscratch2, len_reg);
    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ ld1(v2, __ T16B, rvec);

    __ ld1(v31, __ T16B, __ post(key, 16));
    __ rev32(v31, __ T16B, v31);

    __ cmpw(keylen, 52);
    __ br(Assembler::CC, L_loadkeys_44);
    __ br(Assembler::EQ, L_loadkeys_52);

    __ ld1(v17, v18, __ T16B, __ post(key, 32));
    __ rev32(v17, __ T16B, v17);
    __ rev32(v18, __ T16B, v18);
  __ BIND(L_loadkeys_52);
    __ ld1(v19, v20, __ T16B, __ post(key, 32));
    __ rev32(v19, __ T16B, v19);
    __ rev32(v20, __ T16B, v20);
  __ BIND(L_loadkeys_44);
    __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
    __ rev32(v21, __ T16B, v21);
    __ rev32(v22, __ T16B, v22);
    __ rev32(v23, __ T16B, v23);
    __ rev32(v24, __ T16B, v24);
    __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
    __ rev32(v25, __ T16B, v25);
    __ rev32(v26, __ T16B, v26);
    __ rev32(v27, __ T16B, v27);
    __ rev32(v28, __ T16B, v28);
    __ ld1(v29, v30, __ T16B, key);
    __ rev32(v29, __ T16B, v29);
    __ rev32(v30, __ T16B, v30);

  __ BIND(L_aes_loop);
    __ ld1(v0, __ T16B, __ post(from, 16));
    __ orr(v1, __ T16B, v0, v0);

    __ br(Assembler::CC, L_rounds_44);
    __ br(Assembler::EQ, L_rounds_52);

    __ aesd(v0, v17); __ aesimc(v0, v0);
    __ aesd(v0, v18); __ aesimc(v0, v0);
  __ BIND(L_rounds_52);
    __ aesd(v0, v19); __ aesimc(v0, v0);
    __ aesd(v0, v20); __ aesimc(v0, v0);
  __ BIND(L_rounds_44);
    __ aesd(v0, v21); __ aesimc(v0, v0);
    __ aesd(v0, v22); __ aesimc(v0, v0);
    __ aesd(v0, v23); __ aesimc(v0, v0);
    __ aesd(v0, v24); __ aesimc(v0, v0);
    __ aesd(v0, v25); __ aesimc(v0, v0);
    __ aesd(v0, v26); __ aesimc(v0, v0);
    __ aesd(v0, v27); __ aesimc(v0, v0);
    __ aesd(v0, v28); __ aesimc(v0, v0);
    __ aesd(v0, v29); __ aesimc(v0, v0);
    __ aesd(v0, v30);
    __ eor(v0, __ T16B, v0, v31);
    __ eor(v0, __ T16B, v0, v2);

    __ st1(v0, __ T16B, __ post(to, 16));
    __ orr(v2, __ T16B, v1, v1);

    __ sub(len_reg, len_reg, 16);
    __ cbnz(len_reg, L_aes_loop);

    __ st1(v2, __ T16B, rvec);

    __ mov(r0, rscratch2);

    __ leave();
    __ ret(lr);

    return start;
  }

  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - byte[]  source+offset
  //   c_rarg1   - int[]   SHA.state
  //   c_rarg2   - int     offset
  //   c_rarg3   - int     limit
  //
  address generate_sha1_implCompress(bool multi_block, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Register buf   = c_rarg0;
    Register state = c_rarg1;
    Register ofs   = c_rarg2;
    Register limit = c_rarg3;

    Label keys;
    Label sha1_loop;

    // load the keys into v0..v3
    __ adr(rscratch1, keys);
    __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
    // load 5 words state into v6, v7
    __ ldrq(v6, Address(state, 0));
    __ ldrs(v7, Address(state, 16));


  __ BIND(sha1_loop);
    // load 64 bytes of data into v16..v19
    __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
    __ rev32(v16, __ T16B, v16);
    __ rev32(v17, __ T16B, v17);
    __ rev32(v18, __ T16B, v18);
    __ rev32(v19, __ T16B, v19);

    // do the sha1
    __ addv(v4, __ T4S, v16, v0);
    __ orr(v20, __ T16B, v6, v6);

    FloatRegister d0 = v16;
    FloatRegister d1 = v17;
    FloatRegister d2 = v18;
    FloatRegister d3 = v19;

    for (int round = 0; round < 20; round++) {
      FloatRegister tmp1 = (round & 1) ? v4 : v5;
      FloatRegister tmp2 = (round & 1) ? v21 : v22;
      FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
      FloatRegister tmp4 = (round & 1) ? v5 : v4;
      FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));

      if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
      if (round < 19) __ addv(tmp1, __ T4S, d1, key);
      __ sha1h(tmp2, __ T4S, v20);
      if (round < 5)
        __ sha1c(v20, __ T4S, tmp3, tmp4);
      else if (round < 10 || round >= 15)
        __ sha1p(v20, __ T4S, tmp3, tmp4);
      else
        __ sha1m(v20, __ T4S, tmp3, tmp4);
      if (round < 16) __ sha1su1(d0, __ T4S, d3);

      tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
    }

    __ addv(v7, __ T2S, v7, v21);
    __ addv(v6, __ T4S, v6, v20);

    if (multi_block) {
      __ add(ofs, ofs, 64);
      __ cmp(ofs, limit);
      __ br(Assembler::LE, sha1_loop);
      __ mov(c_rarg0, ofs); // return ofs
    }

    __ strq(v6, Address(state, 0));
    __ strs(v7, Address(state, 16));

    __ ret(lr);

    __ bind(keys);
    __ emit_int32(0x5a827999);
    __ emit_int32(0x6ed9eba1);
    __ emit_int32(0x8f1bbcdc);
    __ emit_int32(0xca62c1d6);

    return start;
  }
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - byte[]  source+offset
  //   c_rarg1   - int[]   SHA.state
  //   c_rarg2   - int     offset
  //   c_rarg3   - int     limit
  //
  address generate_sha256_implCompress(bool multi_block, const char *name) {
    static const uint32_t round_consts[64] = {
      0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
      0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
      0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
      0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
      0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
      0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
      0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
      0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
      0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
      0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
      0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
      0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
      0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
      0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
      0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
      0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
    };
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Register buf   = c_rarg0;
    Register state = c_rarg1;
    Register ofs   = c_rarg2;
    Register limit = c_rarg3;

    Label sha1_loop;

    __ stpd(v8, v9, __ pre(sp, -32));
    __ stpd(v10, v11, Address(sp, 16));

// dga == v0
// dgb == v1
// dg0 == v2
// dg1 == v3
// dg2 == v4
// t0 == v6
// t1 == v7

    // load 16 keys to v16..v31
    __ lea(rscratch1, ExternalAddress((address)round_consts));
    __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
    __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
    __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
    __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);

    // load 8 words (256 bits) state
    __ ldpq(v0, v1, state);

  __ BIND(sha1_loop);
    // load 64 bytes of data into v8..v11
    __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
    __ rev32(v8, __ T16B, v8);
    __ rev32(v9, __ T16B, v9);
    __ rev32(v10, __ T16B, v10);
    __ rev32(v11, __ T16B, v11);

    __ addv(v6, __ T4S, v8, v16);
    __ orr(v2, __ T16B, v0, v0);
    __ orr(v3, __ T16B, v1, v1);

    FloatRegister d0 = v8;
    FloatRegister d1 = v9;
    FloatRegister d2 = v10;
    FloatRegister d3 = v11;


    for (int round = 0; round < 16; round++) {
      FloatRegister tmp1 = (round & 1) ? v6 : v7;
      FloatRegister tmp2 = (round & 1) ? v7 : v6;
      FloatRegister tmp3 = (round & 1) ? v2 : v4;
      FloatRegister tmp4 = (round & 1) ? v4 : v2;

      if (round < 12) __ sha256su0(d0, __ T4S, d1);
      __ orr(v4, __ T16B, v2, v2);
      if (round < 15)
        __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
      __ sha256h(v2, __ T4S, v3, tmp2);
      __ sha256h2(v3, __ T4S, v4, tmp2);
      if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);

      tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
    }

    __ addv(v0, __ T4S, v0, v2);
    __ addv(v1, __ T4S, v1, v3);

    if (multi_block) {
      __ add(ofs, ofs, 64);
      __ cmp(ofs, limit);
      __ br(Assembler::LE, sha1_loop);
      __ mov(c_rarg0, ofs); // return ofs
    }

    __ ldpd(v10, v11, Address(sp, 16));
    __ ldpd(v8, v9, __ post(sp, 32));

    __ stpq(v0, v1, state);

    __ ret(lr);

    return start;
  }
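  // A note on the SHA-256 loop above (for reference): each of the 16
  // iterations feeds sha256h/sha256h2 one 4-word message-schedule
  // chunk plus its round constants (v16..v31 hold all 64 words of K),
  // covering SHA-256's 64 rounds in total; sha256su0/sha256su1 extend
  // the message schedule in place for the later rounds.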
#ifndef BUILTIN_SIM
  // Safefetch stubs.
  void generate_safefetch(const char* name, int size, address* entry,
                          address* fault_pc, address* continuation_pc) {
    // safefetch signatures:
    //   int      SafeFetch32(int*      adr, int      errValue);
    //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
    //
    // arguments:
    //   c_rarg0 = adr
    //   c_rarg1 = errValue
    //
    // result:
    //   r0 = *adr or errValue

    StubCodeMark mark(this, "StubRoutines", name);

    // Entry point, pc or function descriptor.
    *entry = __ pc();

    // Load *adr into c_rarg1, may fault.
    *fault_pc = __ pc();
    switch (size) {
      case 4:
        // int32_t
        __ ldrw(c_rarg1, Address(c_rarg0, 0));
        break;
      case 8:
        // int64_t
        __ ldr(c_rarg1, Address(c_rarg0, 0));
        break;
      default:
        ShouldNotReachHere();
    }

    // return errValue or *adr
    *continuation_pc = __ pc();
    __ mov(r0, c_rarg1);
    __ ret(lr);
  }
#endif
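  // Typical use (illustration): SafeFetch32(adr, -1) simply returns
  // *adr unless the load at *fault_pc takes a SIGSEGV/SIGBUS, in
  // which case the signal handler resumes execution at
  // *continuation_pc, where the errValue still sitting in c_rarg1 is
  // returned instead.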
  /**
   *  Arguments:
   *
   *  Inputs:
   *   c_rarg0   - int crc
   *   c_rarg1   - byte* buf
   *   c_rarg2   - int length
   *
   *  Output:
   *       r0    - int crc result
   */
  address generate_updateBytesCRC32() {
    assert(UseCRC32Intrinsics, "what are we doing here?");

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");

    address start = __ pc();

    const Register crc    = c_rarg0;  // crc
    const Register buf    = c_rarg1;  // source java byte array address
    const Register len    = c_rarg2;  // length
    const Register table0 = c_rarg3;  // crc_table address
    const Register table1 = c_rarg4;
    const Register table2 = c_rarg5;
    const Register table3 = c_rarg6;
    const Register tmp3   = c_rarg7;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    __ kernel_crc32(crc, buf, len,
                    table0, table1, table2, table3, rscratch1, rscratch2, tmp3);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }

  /**
   *  Arguments:
   *
   *  Inputs:
   *   c_rarg0   - int crc
   *   c_rarg1   - byte* buf
   *   c_rarg2   - int length
   *   c_rarg3   - int* table
   *
   *  Output:
   *       r0    - int crc result
   */
  address generate_updateBytesCRC32C() {
    assert(UseCRC32CIntrinsics, "what are we doing here?");

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");

    address start = __ pc();

    const Register crc    = c_rarg0;  // crc
    const Register buf    = c_rarg1;  // source java byte array address
    const Register len    = c_rarg2;  // length
    const Register table0 = c_rarg3;  // crc_table address
    const Register table1 = c_rarg4;
    const Register table2 = c_rarg5;
    const Register table3 = c_rarg6;
    const Register tmp3   = c_rarg7;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    __ kernel_crc32c(crc, buf, len,
                     table0, table1, table2, table3, rscratch1, rscratch2, tmp3);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }

  /***
   *  Arguments:
   *
   *  Inputs:
   *   c_rarg0   - int   adler
   *   c_rarg1   - byte* buff
   *   c_rarg2   - int   len
   *
   *  Output:
   *   c_rarg0   - int adler result
   */
  address generate_updateBytesAdler32() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
    address start = __ pc();

    Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;

    // Aliases
    Register adler = c_rarg0;
    Register s1    = c_rarg0;
    Register s2    = c_rarg3;
    Register buff  = c_rarg1;
    Register len   = c_rarg2;
    Register nmax  = r4;
    Register base  = r5;
    Register count = r6;
    Register temp0 = rscratch1;
    Register temp1 = rscratch2;
    Register temp2 = r7;

    // Max number of bytes we can process before having to take the mod
    // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
    unsigned long BASE = 0xfff1;
    unsigned long NMAX = 0x15B0;

    __ mov(base, BASE);
    __ mov(nmax, NMAX);

    // s1 is initialized to the lower 16 bits of adler
    // s2 is initialized to the upper 16 bits of adler
    __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
    __ uxth(s1, adler);          // s1 = (adler & 0xffff)

    // The pipelined loop needs at least 16 elements for 1 iteration
    // It does check this, but it is more effective to skip to the cleanup loop
    __ cmp(len, 16);
    __ br(Assembler::HS, L_nmax);
    __ cbz(len, L_combine);

    __ bind(L_simple_by1_loop);
    __ ldrb(temp0, Address(__ post(buff, 1)));
    __ add(s1, s1, temp0);
    __ add(s2, s2, s1);
    __ subs(len, len, 1);
    __ br(Assembler::HI, L_simple_by1_loop);

    // s1 = s1 % BASE
    __ subs(temp0, s1, base);
    __ csel(s1, temp0, s1, Assembler::HS);

    // s2 = s2 % BASE
    __ lsr(temp0, s2, 16);
    __ lsl(temp1, temp0, 4);
    __ sub(temp1, temp1, temp0);
    __ add(s2, temp1, s2, ext::uxth);

    __ subs(temp0, s2, base);
    __ csel(s2, temp0, s2, Assembler::HS);

    __ b(L_combine);
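    // The reductions here (and the longer ones below) exploit
    // 2^16 == 15 (mod BASE): writing x = 65536*h + l gives
    // x mod 65521 == (15*h + l) mod 65521, and 15*h is (h << 4) - h.
    // For the 32-bit-bounded sums produced by this stub, at most two
    // folding steps leave a value below 2*BASE, so a conditional
    // subtract (csel) finishes the job.  Rough C equivalent
    // (illustration only):
    //
    //   uint32_t mod_base(uint32_t x) {   // x % 65521
    //     x = (x >> 16) * 15 + (x & 0xffff);
    //     x = (x >> 16) * 15 + (x & 0xffff);
    //     return x >= 65521 ? x - 65521 : x;
    //   }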
    __ bind(L_nmax);
    __ subs(len, len, nmax);
    __ sub(count, nmax, 16);
    __ br(Assembler::LO, L_by16);

    __ bind(L_nmax_loop);

    __ ldp(temp0, temp1, Address(__ post(buff, 16)));

    __ add(s1, s1, temp0, ext::uxtb);
    __ ubfx(temp2, temp0, 8, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp0, 16, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp0, 24, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp0, 32, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp0, 40, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp0, 48, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp0, Assembler::LSR, 56);
    __ add(s2, s2, s1);

    __ add(s1, s1, temp1, ext::uxtb);
    __ ubfx(temp2, temp1, 8, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp1, 16, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp1, 24, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp1, 32, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp1, 40, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp1, 48, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp1, Assembler::LSR, 56);
    __ add(s2, s2, s1);

    __ subs(count, count, 16);
    __ br(Assembler::HS, L_nmax_loop);

    // s1 = s1 % BASE
    __ lsr(temp0, s1, 16);
    __ lsl(temp1, temp0, 4);
    __ sub(temp1, temp1, temp0);
    __ add(temp1, temp1, s1, ext::uxth);

    __ lsr(temp0, temp1, 16);
    __ lsl(s1, temp0, 4);
    __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext::uxth);

    __ subs(temp0, s1, base);
    __ csel(s1, temp0, s1, Assembler::HS);

    // s2 = s2 % BASE
    __ lsr(temp0, s2, 16);
    __ lsl(temp1, temp0, 4);
    __ sub(temp1, temp1, temp0);
    __ add(temp1, temp1, s2, ext::uxth);

    __ lsr(temp0, temp1, 16);
    __ lsl(s2, temp0, 4);
    __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext::uxth);

    __ subs(temp0, s2, base);
    __ csel(s2, temp0, s2, Assembler::HS);

    __ subs(len, len, nmax);
    __ sub(count, nmax, 16);
    __ br(Assembler::HS, L_nmax_loop);

    __ bind(L_by16);
    __ adds(len, len, count);
    __ br(Assembler::LO, L_by1);

    __ bind(L_by16_loop);

    __ ldp(temp0, temp1, Address(__ post(buff, 16)));

    __ add(s1, s1, temp0, ext::uxtb);
    __ ubfx(temp2, temp0, 8, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp0, 16, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp0, 24, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp0, 32, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp0, 40, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp0, 48, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp0, Assembler::LSR, 56);
    __ add(s2, s2, s1);

    __ add(s1, s1, temp1, ext::uxtb);
    __ ubfx(temp2, temp1, 8, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp1, 16, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp1, 24, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp1, 32, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp1, 40, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp1, 48, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp1, Assembler::LSR, 56);
    __ add(s2, s2, s1);

    __ subs(len, len, 16);
    __ br(Assembler::HS, L_by16_loop);

    __ bind(L_by1);
    __ adds(len, len, 15);
    __ br(Assembler::LO, L_do_mod);

    __ bind(L_by1_loop);
    __ ldrb(temp0, Address(__ post(buff, 1)));
    __ add(s1, temp0, s1);
    __ add(s2, s2, s1);
    __ subs(len, len, 1);
    __ br(Assembler::HS, L_by1_loop);

    __ bind(L_do_mod);
    // s1 = s1 % BASE
    __ lsr(temp0, s1, 16);
    __ lsl(temp1, temp0, 4);
    __ sub(temp1, temp1, temp0);
    __ add(temp1, temp1, s1, ext::uxth);

    __ lsr(temp0, temp1, 16);
    __ lsl(s1, temp0, 4);
    __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext::uxth);

    __ subs(temp0, s1, base);
    __ csel(s1, temp0, s1, Assembler::HS);

    // s2 = s2 % BASE
    __ lsr(temp0, s2, 16);
    __ lsl(temp1, temp0, 4);
    __ sub(temp1, temp1, temp0);
    __ add(temp1, temp1, s2, ext::uxth);

    __ lsr(temp0, temp1, 16);
    __ lsl(s2, temp0, 4);
    __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext::uxth);

    __ subs(temp0, s2, base);
    __ csel(s2, temp0, s2, Assembler::HS);

    // Combine lower bits and higher bits
    __ bind(L_combine);
    __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)

    __ ret(lr);

    return start;
  }

  /**
   *  Arguments:
   *
   *  Input:
   *    c_rarg0   - x address
   *    c_rarg1   - x length
   *    c_rarg2   - y address
   *    c_rarg3   - y length
   *    c_rarg4   - z address
   *    c_rarg5   - z length
   */
  address generate_multiplyToLen() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "multiplyToLen");

    address start = __ pc();
    const Register x    = r0;
    const Register xlen = r1;
    const Register y    = r2;
    const Register ylen = r3;
    const Register z    = r4;
    const Register zlen = r5;

    const Register tmp1 = r10;
    const Register tmp2 = r11;
    const Register tmp3 = r12;
    const Register tmp4 = r13;
    const Register tmp5 = r14;
    const Register tmp6 = r15;
    const Register tmp7 = r16;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }

  void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
                      FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
                      FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
    // Karatsuba multiplication performs a 128*128 -> 256-bit
    // multiplication in three 128-bit multiplications and a few
    // additions.
    //
    // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
    // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
    //
    // Inputs:
    //
    // A0 in a.d[0]     (subkey)
    // A1 in a.d[1]
    // (A1+A0) in a1_xor_a0.d[0]
    //
    // B0 in b.d[0]     (state)
    // B1 in b.d[1]

    __ ext(tmp1, __ T16B, b, b, 0x08);
    __ pmull2(result_hi, __ T1Q, b, a, __ T2D);      // A1*B1
    __ eor(tmp1, __ T16B, tmp1, b);                  // (B1+B0)
    __ pmull(result_lo,  __ T1Q, b, a, __ T1D);      // A0*B0
    __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)

    __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
    __ eor(tmp3, __ T16B, result_hi, result_lo);     // A1*B1+A0*B0
    __ eor(tmp2, __ T16B, tmp2, tmp4);
    __ eor(tmp2, __ T16B, tmp2, tmp3);

    // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
    __ ins(result_hi, __ D, tmp2, 0, 1);
    __ ins(result_lo, __ D, tmp2, 1, 0);
  }

  void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
                    FloatRegister p, FloatRegister z, FloatRegister t1) {
    const FloatRegister t0 = result;

    // The GCM field polynomial f is z^128 + p(z), where p =
    // z^7+z^2+z+1.
    //
    //    z^128 === -p(z)  (mod (z^128 + p(z)))
    //
    // so, given that the product we're reducing is
    //    a == lo + hi * z^128
    // substituting,
    //      === lo - hi * p(z)  (mod (z^128 + p(z)))
    //
    // we reduce by multiplying hi by p(z) and subtracting the result
    // from (i.e. XORing it with) lo.  Because p has no nonzero high
    // bits we can do this with two 64-bit multiplications, lo*p and
    // hi*p.

    __ pmull2(t0, __ T1Q, hi, p, __ T2D);
    __ ext(t1, __ T16B, t0, z, 8);
    __ eor(hi, __ T16B, hi, t1);
    __ ext(t1, __ T16B, z, t0, 8);
    __ eor(lo, __ T16B, lo, t1);
    __ pmull(t0, __ T1Q, hi, p, __ T1D);
    __ eor(result, __ T16B, lo, t0);
  }
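  // A note on ghash_reduce (for reference): in GF(2)[z] subtraction
  // is XOR, so z^128 == p(z) (mod z^128 + p(z)) and the 256-bit
  // product hi:lo reduces to lo ^ hi*p(z).  Since deg p(z) == 7, each
  // 64-bit half of hi can be folded with a single carry-less multiply
  // by the constant 0x87 (the caller passes it in p, with z holding
  // zero): pmull2 folds the top half, pmull folds what remains, and
  // the ext instructions shift the partial products into place.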
  /**
   *  Arguments:
   *
   *  Input:
   *  c_rarg0   - current state address
   *  c_rarg1   - H key address
   *  c_rarg2   - data address
   *  c_rarg3   - number of blocks
   *
   *  Output:
   *  Updated state at c_rarg0
   */
  address generate_ghash_processBlocks() {
    // Bafflingly, GCM uses little-endian for the byte order, but
    // big-endian for the bit order.  For example, the polynomial 1 is
    // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
    //
    // So, we must either reverse the bytes in each word and do
    // everything big-endian or reverse the bits in each byte and do
    // it little-endian.  On AArch64 it's more idiomatic to reverse
    // the bits in each byte (we have an instruction, RBIT, to do
    // that) and keep the data in little-endian bit order throughout the
    // calculation, bit-reversing the inputs and outputs.

    StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
    __ align(wordSize * 2);
    address p = __ pc();
    __ emit_int64(0x87);  // The low-order bits of the field
                          // polynomial (i.e. p = z^7+z^2+z+1)
                          // repeated in the low and high parts of a
                          // 128-bit vector
    __ emit_int64(0x87);

    __ align(CodeEntryAlignment);
    address start = __ pc();

    Register state   = c_rarg0;
    Register subkeyH = c_rarg1;
    Register data    = c_rarg2;
    Register blocks  = c_rarg3;

    FloatRegister vzr = v30;
    __ eor(vzr, __ T16B, vzr, vzr); // zero register

    __ ldrq(v0, Address(state));
    __ ldrq(v1, Address(subkeyH));

    __ rev64(v0, __ T16B, v0);  // Bit-reverse words in state and subkeyH
    __ rbit(v0, __ T16B, v0);
    __ rev64(v1, __ T16B, v1);
    __ rbit(v1, __ T16B, v1);

    __ ldrq(v26, p);

    __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
    __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))

    {
      Label L_ghash_loop;
      __ bind(L_ghash_loop);

      __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
                                                 // reversing each byte
      __ rbit(v2, __ T16B, v2);
      __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state

      // Multiply state in v2 by subkey in v1
      ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
                     /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
                     /*temps*/v6, v20, v18, v21);
      // Reduce v7:v5 by the field polynomial
      ghash_reduce(v0, v5, v7, v26, vzr, v20);

      __ sub(blocks, blocks, 1);
      __ cbnz(blocks, L_ghash_loop);
    }

    // The bit-reversed result is at this point in v0
    __ rev64(v1, __ T16B, v0);
    __ rbit(v1, __ T16B, v1);

    __ st1(v1, __ T16B, state);
    __ ret(lr);

    return start;
  }

  // Continuation point for throwing of implicit exceptions that are
  // not handled in the current activation. Fabricates an exception
  // oop and initiates normal exception dispatching in this
  // frame. Since we need to preserve callee-saved values (currently
  // only for C2, but done for C1 as well) we need a callee-saved oop
  // map and therefore have to make these stubs into RuntimeStubs
  // rather than BufferBlobs.  If the compiler needs all registers to
  // be preserved between the fault point and the exception handler
  // then it must assume responsibility for that in
  // AbstractCompiler::continuation_for_implicit_null_exception or
  // continuation_for_implicit_division_by_zero_exception.  All other
  // implicit exceptions (e.g., NullPointerException or
  // AbstractMethodError on entry) are either at call sites or
  // otherwise assume that stack unwinding will be initiated, so
  // caller saved registers were assumed volatile in the compiler.
3319 3320 #undef __ 3321 #define __ masm-> 3322 3323 address generate_throw_exception(const char* name, 3324 address runtime_entry, 3325 Register arg1 = noreg, 3326 Register arg2 = noreg) { 3327 // Information about frame layout at time of blocking runtime call. 3328 // Note that we only have to preserve callee-saved registers since 3329 // the compilers are responsible for supplying a continuation point 3330 // if they expect all registers to be preserved. 3331 // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0 3332 enum layout { 3333 rfp_off = 0, 3334 rfp_off2, 3335 return_off, 3336 return_off2, 3337 framesize // inclusive of return address 3338 }; 3339 3340 int insts_size = 512; 3341 int locs_size = 64; 3342 3343 CodeBuffer code(name, insts_size, locs_size); 3344 OopMapSet* oop_maps = new OopMapSet(); 3345 MacroAssembler* masm = new MacroAssembler(&code); 3346 3347 address start = __ pc(); 3348 3349 // This is an inlined and slightly modified version of call_VM 3350 // which has the ability to fetch the return PC out of 3351 // thread-local storage and also sets up last_Java_sp slightly 3352 // differently than the real call_VM 3353 3354 __ enter(); // Save FP and LR before call 3355 3356 assert(is_even(framesize/2), "sp not 16-byte aligned"); 3357 3358 // lr and fp are already in place 3359 __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog 3360 3361 int frame_complete = __ pc() - start; 3362 3363 // Set up last_Java_sp and last_Java_fp 3364 address the_pc = __ pc(); 3365 __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1); 3366 3367 // Call runtime 3368 if (arg1 != noreg) { 3369 assert(arg2 != c_rarg1, "clobbered"); 3370 __ mov(c_rarg1, arg1); 3371 } 3372 if (arg2 != noreg) { 3373 __ mov(c_rarg2, arg2); 3374 } 3375 __ mov(c_rarg0, rthread); 3376 BLOCK_COMMENT("call runtime_entry"); 3377 __ mov(rscratch1, runtime_entry); 3378 __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1); 3379 3380 // Generate oop map 3381 OopMap* map = new OopMap(framesize, 0); 3382 3383 oop_maps->add_gc_map(the_pc - start, map); 3384 3385 __ reset_last_Java_frame(true, true); 3386 __ maybe_isb(); 3387 3388 __ leave(); 3389 3390 // check for pending exceptions 3391 #ifdef ASSERT 3392 Label L; 3393 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 3394 __ cbnz(rscratch1, L); 3395 __ should_not_reach_here(); 3396 __ bind(L); 3397 #endif // ASSERT 3398 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 3399 3400 3401 // codeBlob framesize is in words (not VMRegImpl::slot_size) 3402 RuntimeStub* stub = 3403 RuntimeStub::new_runtime_stub(name, 3404 &code, 3405 frame_complete, 3406 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 3407 oop_maps, false); 3408 return stub->entry_point(); 3409 } 3410 3411 class MontgomeryMultiplyGenerator : public MacroAssembler { 3412 3413 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 3414 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 3415 3416 RegSet _toSave; 3417 bool _squaring; 3418 3419 public: 3420 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 3421 : MacroAssembler(as->code()), _squaring(squaring) { 3422 3423 // Register allocation 3424 3425 Register reg = c_rarg0; 3426 Pa_base = reg; // Argument registers 3427 if (squaring) 3428 Pb_base = Pa_base; 3429 else 3430 Pb_base = ++reg; 3431 Pn_base = ++reg; 3432 Rlen= ++reg; 3433 inv = ++reg; 3434 Pm_base = ++reg; 3435 3436 // Working registers: 3437 Ra = ++reg; // The current digit of a, b, n, 

  class MontgomeryMultiplyGenerator : public MacroAssembler {

    Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
      Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;

    RegSet _toSave;
    bool _squaring;

  public:
    MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
      : MacroAssembler(as->code()), _squaring(squaring) {

      // Register allocation

      Register reg = c_rarg0;
      Pa_base = reg;       // Argument registers
      if (squaring)
        Pb_base = Pa_base;
      else
        Pb_base = ++reg;
      Pn_base = ++reg;
      Rlen = ++reg;
      inv = ++reg;
      Pm_base = ++reg;

      // Working registers:
      Ra = ++reg;     // The current digit of a, b, n, and m.
      Rb = ++reg;
      Rm = ++reg;
      Rn = ++reg;

      Pa = ++reg;     // Pointers to the current/next digit of a, b, n, and m.
      Pb = ++reg;
      Pm = ++reg;
      Pn = ++reg;

      t0 = ++reg;     // Three registers which form a
      t1 = ++reg;     // triple-precision accumulator.
      t2 = ++reg;

      Ri = ++reg;     // Inner and outer loop indexes.
      Rj = ++reg;

      Rhi_ab = ++reg; // Product registers: low and high parts
      Rlo_ab = ++reg; // of a*b and m*n.
      Rhi_mn = ++reg;
      Rlo_mn = ++reg;

      // r19 and up are callee-saved.
      _toSave = RegSet::range(r19, reg) + Pm_base;
    }

  private:
    void save_regs() {
      push(_toSave, sp);
    }

    void restore_regs() {
      pop(_toSave, sp);
    }

    template <typename T>
    void unroll_2(Register count, T block) {
      Label loop, end, odd;
      tbnz(count, 0, odd);
      cbz(count, end);
      align(16);
      bind(loop);
      (this->*block)();
      bind(odd);
      (this->*block)();
      subs(count, count, 2);
      br(Assembler::GT, loop);
      bind(end);
    }

    template <typename T>
    void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
      Label loop, end, odd;
      tbnz(count, 0, odd);
      cbz(count, end);
      align(16);
      bind(loop);
      (this->*block)(d, s, tmp);
      bind(odd);
      (this->*block)(d, s, tmp);
      subs(count, count, 2);
      br(Assembler::GT, loop);
      bind(end);
    }

    void pre1(RegisterOrConstant i) {
      block_comment("pre1");
      // Pa = Pa_base;
      // Pb = Pb_base + i;
      // Pm = Pm_base;
      // Pn = Pn_base + i;
      // Ra = *Pa;
      // Rb = *Pb;
      // Rm = *Pm;
      // Rn = *Pn;
      ldr(Ra, Address(Pa_base));
      ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
      ldr(Rm, Address(Pm_base));
      ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
      lea(Pa, Address(Pa_base));
      lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
      lea(Pm, Address(Pm_base));
      lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));

      // Zero the m*n result.
      mov(Rhi_mn, zr);
      mov(Rlo_mn, zr);
    }
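
    // The MACC used in the pseudocode comments below is a 64x64->128-bit
    // multiply-accumulate into the t0:t1:t2 triple-precision accumulator.
    // In C, approximately (an illustrative helper, not defined in this
    // file; assumes a compiler with __int128 support):
    //
    // static inline void MACC(unsigned long A, unsigned long B,
    //                         unsigned long &t0, unsigned long &t1,
    //                         unsigned long &t2) {
    //   unsigned __int128 prod = (unsigned __int128)A * B;
    //   unsigned long lo = (unsigned long)prod;
    //   unsigned long hi = (unsigned long)(prod >> 64);
    //   unsigned long c = ((t0 += lo) < lo); // carry out of t0, as in adds
    //   unsigned long s = hi + c;            // hi <= 2^64-2, so no overflow
    //   t2 += ((t1 += s) < s);               // propagate, as in adcs/adc
    // }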

    // The core multiply-accumulate step of a Montgomery
    // multiplication.  The idea is to schedule operations as a
    // pipeline so that instructions with long latencies (loads and
    // multiplies) have time to complete before their results are
    // used.  This most benefits in-order implementations of the
    // architecture but out-of-order ones also benefit.
    void step() {
      block_comment("step");
      // MACC(Ra, Rb, t0, t1, t2);
      // Ra = *++Pa;
      // Rb = *--Pb;
      umulh(Rhi_ab, Ra, Rb);
      mul(Rlo_ab, Ra, Rb);
      ldr(Ra, pre(Pa, wordSize));
      ldr(Rb, pre(Pb, -wordSize));
      acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
                                       // previous iteration.
      // MACC(Rm, Rn, t0, t1, t2);
      // Rm = *++Pm;
      // Rn = *--Pn;
      umulh(Rhi_mn, Rm, Rn);
      mul(Rlo_mn, Rm, Rn);
      ldr(Rm, pre(Pm, wordSize));
      ldr(Rn, pre(Pn, -wordSize));
      acc(Rhi_ab, Rlo_ab, t0, t1, t2);
    }

    void post1() {
      block_comment("post1");

      // MACC(Ra, Rb, t0, t1, t2);
      // Ra = *++Pa;
      // Rb = *--Pb;
      umulh(Rhi_ab, Ra, Rb);
      mul(Rlo_ab, Ra, Rb);
      acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
      acc(Rhi_ab, Rlo_ab, t0, t1, t2);

      // *Pm = Rm = t0 * inv;
      mul(Rm, t0, inv);
      str(Rm, Address(Pm));

      // MACC(Rm, Rn, t0, t1, t2);
      // t0 = t1; t1 = t2; t2 = 0;
      umulh(Rhi_mn, Rm, Rn);

#ifndef PRODUCT
      // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
      {
        mul(Rlo_mn, Rm, Rn);
        add(Rlo_mn, t0, Rlo_mn);
        Label ok;
        cbz(Rlo_mn, ok); {
          stop("broken Montgomery multiply");
        } bind(ok);
      }
#endif
      // We have very carefully set things up so that
      // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
      // the lower half of Rm * Rn because we know the result already:
      // it must be -t0.  t0 + (-t0) must generate a carry iff
      // t0 != 0.  So, rather than do a mul and an adds we just set
      // the carry flag iff t0 is nonzero.
      //
      // mul(Rlo_mn, Rm, Rn);
      // adds(zr, t0, Rlo_mn);
      subs(zr, t0, 1); // Set carry iff t0 is nonzero
      adcs(t0, t1, Rhi_mn);
      adc(t1, t2, zr);
      mov(t2, zr);
    }

    void pre2(RegisterOrConstant i, RegisterOrConstant len) {
      block_comment("pre2");
      // Pa = Pa_base + i-len;
      // Pb = Pb_base + len;
      // Pm = Pm_base + i-len;
      // Pn = Pn_base + len;

      if (i.is_register()) {
        sub(Rj, i.as_register(), len);
      } else {
        mov(Rj, i.as_constant());
        sub(Rj, Rj, len);
      }
      // Rj == i-len

      lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
      lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
      lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
      lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));

      // Ra = *++Pa;
      // Rb = *--Pb;
      // Rm = *++Pm;
      // Rn = *--Pn;
      ldr(Ra, pre(Pa, wordSize));
      ldr(Rb, pre(Pb, -wordSize));
      ldr(Rm, pre(Pm, wordSize));
      ldr(Rn, pre(Pn, -wordSize));

      mov(Rhi_mn, zr);
      mov(Rlo_mn, zr);
    }

    void post2(RegisterOrConstant i, RegisterOrConstant len) {
      block_comment("post2");
      if (i.is_constant()) {
        mov(Rj, i.as_constant()-len.as_constant());
      } else {
        sub(Rj, i.as_register(), len);
      }

      adds(t0, t0, Rlo_mn); // The pending m*n, low part

      // As soon as we know the least significant digit of our result,
      // store it.
      // Pm_base[i-len] = t0;
      str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));

      // t0 = t1; t1 = t2; t2 = 0;
      adcs(t0, t1, Rhi_mn); // The pending m*n, high part
      adc(t1, t2, zr);
      mov(t2, zr);
    }

    // A carry in t0 after Montgomery multiplication means that we
    // should subtract multiples of n from our result in m.  We'll
    // keep doing that until there is no carry.
    void normalize(RegisterOrConstant len) {
      block_comment("normalize");
      // while (t0)
      //   t0 = sub(Pm_base, Pn_base, t0, len);
      Label loop, post, again;
      Register cnt = t1, i = t2; // Re-use registers; we're done with them now
      cbz(t0, post); {
        bind(again); {
          mov(i, zr);
          mov(cnt, len);
          ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
          ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
          subs(zr, zr, zr); // set carry flag, i.e. no borrow
          align(16);
          bind(loop); {
            sbcs(Rm, Rm, Rn);
            str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
            add(i, i, 1);
            ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
            ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
            sub(cnt, cnt, 1);
          } cbnz(cnt, loop);
          sbc(t0, t0, zr);
        } cbnz(t0, again);
      } bind(post);
    }
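
    // The sub() referenced in the comment above subtracts the
    // multi-precision n from m and returns t0 minus the final borrow.
    // In C, approximately (an illustrative helper, not defined in this
    // file):
    //
    // unsigned long sub(unsigned long Pm_base[], unsigned long Pn_base[],
    //                   unsigned long t0, int len) {
    //   unsigned long borrow = 0;
    //   for (int i = 0; i < len; i++) {
    //     unsigned long s = Pn_base[i] + borrow;
    //     borrow = (s < borrow) | (Pm_base[i] < s); // borrow out of digit i
    //     Pm_base[i] -= s;
    //   }
    //   return t0 - borrow;
    // }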

    // Move memory at s to d, reversing words.
    //    Increments d to end of copied memory
    //    Destroys tmp1, tmp2
    //    Preserves len
    //    Leaves s pointing to the address which was in d at start
    void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
      assert(tmp1 < r19 && tmp2 < r19, "register corruption");

      lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
      mov(tmp1, len);
      unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
      sub(s, d, len, ext::uxtw, LogBytesPerWord);
    }
    // where
    void reverse1(Register d, Register s, Register tmp) {
      ldr(tmp, pre(s, -wordSize));
      ror(tmp, tmp, 32);
      str(tmp, post(d, wordSize));
    }
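
    // The net effect of reverse() is to reverse the order of the 32-bit
    // digits: the 64-bit words are copied in reverse order and the ror
    // by 32 swaps the two halves of each word.  This converts between
    // the most-significant-digit-first int order of the incoming Java
    // arrays and the least-significant-first 64-bit digit order used by
    // the multiply loops below.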

    void step_squaring() {
      // An extra ACC
      step();
      acc(Rhi_ab, Rlo_ab, t0, t1, t2);
    }

    void last_squaring(RegisterOrConstant i) {
      Label dont;
      // if ((i & 1) == 0) {
      tbnz(i.as_register(), 0, dont); {
        // MACC(Ra, Rb, t0, t1, t2);
        // Ra = *++Pa;
        // Rb = *--Pb;
        umulh(Rhi_ab, Ra, Rb);
        mul(Rlo_ab, Ra, Rb);
        acc(Rhi_ab, Rlo_ab, t0, t1, t2);
      } bind(dont);
    }

    void extra_step_squaring() {
      acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n

      // MACC(Rm, Rn, t0, t1, t2);
      // Rm = *++Pm;
      // Rn = *--Pn;
      umulh(Rhi_mn, Rm, Rn);
      mul(Rlo_mn, Rm, Rn);
      ldr(Rm, pre(Pm, wordSize));
      ldr(Rn, pre(Pn, -wordSize));
    }

    void post1_squaring() {
      acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n

      // *Pm = Rm = t0 * inv;
      mul(Rm, t0, inv);
      str(Rm, Address(Pm));

      // MACC(Rm, Rn, t0, t1, t2);
      // t0 = t1; t1 = t2; t2 = 0;
      umulh(Rhi_mn, Rm, Rn);

#ifndef PRODUCT
      // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
      {
        mul(Rlo_mn, Rm, Rn);
        add(Rlo_mn, t0, Rlo_mn);
        Label ok;
        cbz(Rlo_mn, ok); {
          stop("broken Montgomery multiply");
        } bind(ok);
      }
#endif
      // We have very carefully set things up so that
      // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
      // the lower half of Rm * Rn because we know the result already:
      // it must be -t0.  t0 + (-t0) must generate a carry iff
      // t0 != 0.  So, rather than do a mul and an adds we just set
      // the carry flag iff t0 is nonzero.
      //
      // mul(Rlo_mn, Rm, Rn);
      // adds(zr, t0, Rlo_mn);
      subs(zr, t0, 1); // Set carry iff t0 is nonzero
      adcs(t0, t1, Rhi_mn);
      adc(t1, t2, zr);
      mov(t2, zr);
    }

    void acc(Register Rhi, Register Rlo,
             Register t0, Register t1, Register t2) {
      adds(t0, t0, Rlo);
      adcs(t1, t1, Rhi);
      adc(t2, t2, zr);
    }

  public:
    /**
     * Fast Montgomery multiplication.  The derivation of the
     * algorithm is in "A Cryptographic Library for the Motorola
     * DSP56000", Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
     *
     * Arguments:
     *
     * Inputs for multiplication:
     *   c_rarg0   - int array elements a
     *   c_rarg1   - int array elements b
     *   c_rarg2   - int array elements n (the modulus)
     *   c_rarg3   - int length
     *   c_rarg4   - int inv
     *   c_rarg5   - int array elements m (the result)
     *
     * Inputs for squaring:
     *   c_rarg0   - int array elements a
     *   c_rarg1   - int array elements n (the modulus)
     *   c_rarg2   - int length
     *   c_rarg3   - int inv
     *   c_rarg4   - int array elements m (the result)
     *
     */
    address generate_multiply() {
      Label argh, nothing;
      bind(argh);
      stop("MontgomeryMultiply total_allocation must be <= 8192");

      align(CodeEntryAlignment);
      address entry = pc();

      cbzw(Rlen, nothing);

      enter();

      // Make room.
      cmpw(Rlen, 512);
      br(Assembler::HI, argh);
      sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
      andr(sp, Ra, -2 * wordSize);

      lsrw(Rlen, Rlen, 1); // length in longwords = len/2
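
      // Note on the "Make room" arithmetic above: Rlen arrived as the
      // length in ints, and the scratch area must hold the reversed
      // copies of a, b and n plus the result m -- four arrays of Rlen
      // ints, i.e. 16*Rlen bytes.  With Rlen <= 512 that is at most
      // 8192 bytes, which is what the total_allocation message in the
      // argh path refers to.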

      {
        // Copy input args, reversing as we go.  We use Ra as a
        // temporary variable.
        reverse(Ra, Pa_base, Rlen, t0, t1);
        if (!_squaring)
          reverse(Ra, Pb_base, Rlen, t0, t1);
        reverse(Ra, Pn_base, Rlen, t0, t1);
      }

      // Push all callee-saved registers and also Pm_base which we'll need
      // at the end.
      save_regs();

#ifndef PRODUCT
      // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
      {
        ldr(Rn, Address(Pn_base, 0));
        mul(Rlo_mn, Rn, inv);
        cmp(Rlo_mn, -1);
        Label ok;
        br(EQ, ok); {
          stop("broken inverse in Montgomery multiply");
        } bind(ok);
      }
#endif

      mov(Pm_base, Ra);

      mov(t0, zr);
      mov(t1, zr);
      mov(t2, zr);

      block_comment("for (int i = 0; i < len; i++) {");
      mov(Ri, zr); {
        Label loop, end;
        cmpw(Ri, Rlen);
        br(Assembler::GE, end);

        bind(loop);
        pre1(Ri);

        block_comment("  for (j = i; j; j--) {"); {
          movw(Rj, Ri);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
        } block_comment("  } // j");

        post1();
        addw(Ri, Ri, 1);
        cmpw(Ri, Rlen);
        br(Assembler::LT, loop);
        bind(end);
        block_comment("} // i");
      }

      block_comment("for (int i = len; i < 2*len; i++) {");
      mov(Ri, Rlen); {
        Label loop, end;
        cmpw(Ri, Rlen, Assembler::LSL, 1);
        br(Assembler::GE, end);

        bind(loop);
        pre2(Ri, Rlen);

        block_comment("  for (j = len*2-i-1; j; j--) {"); {
          lslw(Rj, Rlen, 1);
          subw(Rj, Rj, Ri);
          subw(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
        } block_comment("  } // j");

        post2(Ri, Rlen);
        addw(Ri, Ri, 1);
        cmpw(Ri, Rlen, Assembler::LSL, 1);
        br(Assembler::LT, loop);
        bind(end);
      }
      block_comment("} // i");

      normalize(Rlen);

      mov(Ra, Pm_base);  // Save Pm_base in Ra
      restore_regs();    // Restore caller's Pm_base

      // Copy our result into caller's Pm_base
      reverse(Pm_base, Ra, Rlen, t0, t1);

      leave();
      bind(nothing);
      ret(lr);

      return entry;
    }
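
    // On return, the caller's m[] holds the Montgomery product
    // a * b * 2^(-64*len) (mod n), with len counted in 64-bit words, as
    // is usual for Montgomery multiplication; converting operands into
    // and out of Montgomery form is the caller's responsibility.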
    // In C, approximately:

    // void
    // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
    //                     unsigned long Pn_base[], unsigned long Pm_base[],
    //                     unsigned long inv, int len) {
    //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
    //   unsigned long *Pa, *Pb, *Pn, *Pm;
    //   unsigned long Ra, Rb, Rn, Rm;

    //   int i;

    //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");

    //   for (i = 0; i < len; i++) {
    //     int j;

    //     Pa = Pa_base;
    //     Pb = Pb_base + i;
    //     Pm = Pm_base;
    //     Pn = Pn_base + i;

    //     Ra = *Pa;
    //     Rb = *Pb;
    //     Rm = *Pm;
    //     Rn = *Pn;

    //     int iters = i;
    //     for (j = 0; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
    //       MACC(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }

    //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
    //     MACC(Ra, Rb, t0, t1, t2);
    //     *Pm = Rm = t0 * inv;
    //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
    //     MACC(Rm, Rn, t0, t1, t2);

    //     assert(t0 == 0, "broken Montgomery multiply");

    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   for (i = len; i < 2*len; i++) {
    //     int j;

    //     Pa = Pa_base + i-len;
    //     Pb = Pb_base + len;
    //     Pm = Pm_base + i-len;
    //     Pn = Pn_base + len;

    //     Ra = *++Pa;
    //     Rb = *--Pb;
    //     Rm = *++Pm;
    //     Rn = *--Pn;

    //     int iters = len*2-i-1;
    //     for (j = i-len+1; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
    //       MACC(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }

    //     Pm_base[i-len] = t0;
    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   while (t0)
    //     t0 = sub(Pm_base, Pn_base, t0, len);
    // }
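
    // The montgomery_square pseudocode further below also uses MACC2,
    // which accumulates the doubled product 2*A*B (each off-diagonal
    // term of a square occurs twice).  In C, approximately (illustrative
    // only, in terms of the MACC sketch given earlier):
    //
    // #define MACC2(A, B, t0, t1, t2) \
    //   { MACC(A, B, t0, t1, t2); MACC(A, B, t0, t1, t2); }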

    /**
     * Fast Montgomery squaring.  This uses asymptotically 25% fewer
     * multiplies than Montgomery multiplication so it should be up to
     * 25% faster.  However, its loop control is more complex and it
     * may actually run slower on some machines.
     *
     * Arguments:
     *
     * Inputs:
     *   c_rarg0   - int array elements a
     *   c_rarg1   - int array elements n (the modulus)
     *   c_rarg2   - int length
     *   c_rarg3   - int inv
     *   c_rarg4   - int array elements m (the result)
     *
     */
    address generate_square() {
      Label argh;
      bind(argh);
      stop("MontgomeryMultiply total_allocation must be <= 8192");

      align(CodeEntryAlignment);
      address entry = pc();

      enter();

      // Make room.
      cmpw(Rlen, 512);
      br(Assembler::HI, argh);
      sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
      andr(sp, Ra, -2 * wordSize);

      lsrw(Rlen, Rlen, 1); // length in longwords = len/2

      {
        // Copy input args, reversing as we go.  We use Ra as a
        // temporary variable.
        reverse(Ra, Pa_base, Rlen, t0, t1);
        reverse(Ra, Pn_base, Rlen, t0, t1);
      }

      // Push all callee-saved registers and also Pm_base which we'll need
      // at the end.
      save_regs();

      mov(Pm_base, Ra);

      mov(t0, zr);
      mov(t1, zr);
      mov(t2, zr);

      block_comment("for (int i = 0; i < len; i++) {");
      mov(Ri, zr); {
        Label loop, end;
        bind(loop);
        cmp(Ri, Rlen);
        br(Assembler::GE, end);

        pre1(Ri);

        block_comment("  for (j = (i+1)/2; j; j--) {"); {
          add(Rj, Ri, 1);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
        } block_comment("  } // j");

        last_squaring(Ri);

        block_comment("  for (j = i/2; j; j--) {"); {
          lsr(Rj, Ri, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
        } block_comment("  } // j");

        post1_squaring();
        add(Ri, Ri, 1);
        cmp(Ri, Rlen);
        br(Assembler::LT, loop);

        bind(end);
        block_comment("} // i");
      }

      block_comment("for (int i = len; i < 2*len; i++) {");
      mov(Ri, Rlen); {
        Label loop, end;
        bind(loop);
        cmp(Ri, Rlen, Assembler::LSL, 1);
        br(Assembler::GE, end);

        pre2(Ri, Rlen);

        block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
          lsl(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          sub(Rj, Rj, 1);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
        } block_comment("  } // j");

        last_squaring(Ri);

        block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
          lsl(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
        } block_comment("  } // j");

        post2(Ri, Rlen);
        add(Ri, Ri, 1);
        cmp(Ri, Rlen, Assembler::LSL, 1);

        br(Assembler::LT, loop);
        bind(end);
        block_comment("} // i");
      }

      normalize(Rlen);

      mov(Ra, Pm_base);  // Save Pm_base in Ra
      restore_regs();    // Restore caller's Pm_base

      // Copy our result into caller's Pm_base
      reverse(Pm_base, Ra, Rlen, t0, t1);

      leave();
      ret(lr);

      return entry;
    }
    // In C, approximately:

    // void
    // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
    //                   unsigned long Pm_base[], unsigned long inv, int len) {
    //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
    //   unsigned long *Pa, *Pb, *Pn, *Pm;
    //   unsigned long Ra, Rb, Rn, Rm;

    //   int i;

    //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");

    //   for (i = 0; i < len; i++) {
    //     int j;

    //     Pa = Pa_base;
    //     Pb = Pa_base + i;
    //     Pm = Pm_base;
    //     Pn = Pn_base + i;

    //     Ra = *Pa;
    //     Rb = *Pb;
    //     Rm = *Pm;
    //     Rn = *Pn;

    //     int iters = (i+1)/2;
    //     for (j = 0; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
    //       MACC2(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     if ((i & 1) == 0) {
    //       assert(Ra == Pa_base[j], "must be");
    //       MACC(Ra, Ra, t0, t1, t2);
    //     }
    //     iters = i/2;
    //     assert(iters == i-j, "must be");
    //     for (; iters--; j++) {
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }

    //     *Pm = Rm = t0 * inv;
    //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
    //     MACC(Rm, Rn, t0, t1, t2);

    //     assert(t0 == 0, "broken Montgomery multiply");

    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   for (i = len; i < 2*len; i++) {
    //     int start = i-len+1;
    //     int end = start + (len - start)/2;
    //     int j;

    //     Pa = Pa_base + i-len;
    //     Pb = Pa_base + len;
    //     Pm = Pm_base + i-len;
    //     Pn = Pn_base + len;

    //     Ra = *++Pa;
    //     Rb = *--Pb;
    //     Rm = *++Pm;
    //     Rn = *--Pn;

    //     int iters = (2*len-i-1)/2;
    //     assert(iters == end-start, "must be");
    //     for (j = start; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
    //       MACC2(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     if ((i & 1) == 0) {
    //       assert(Ra == Pa_base[j], "must be");
    //       MACC(Ra, Ra, t0, t1, t2);
    //     }
    //     iters = (2*len-i)/2;
    //     assert(iters == len-j, "must be");
    //     for (; iters--; j++) {
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     Pm_base[i-len] = t0;
    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   while (t0)
    //     t0 = sub(Pm_base, Pn_base, t0, len);
    // }
  };
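
  // Typical use of the generator above (a sketch; generate_all() below
  // shows the actual wiring):
  //
  //   MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
  //   StubRoutines::_montgomeryMultiply = g.generate_multiply();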

  // Initialization
  void generate_initial() {
    // Generate the initial stubs and initialize the entry points.

    // Entry points that exist on all platforms.  Note: this is code
    // that could be shared among different platforms; however, the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure.  See also the comment
    // in stubRoutines.hpp.

    StubRoutines::_forward_exception_entry = generate_forward_exception();

    StubRoutines::_call_stub_entry =
      generate_call_stub(StubRoutines::_call_stub_return_address);

    // is referenced by megamorphic call
    StubRoutines::_catch_exception_entry = generate_catch_exception();

    // Build this early so it's available for the interpreter.
    StubRoutines::_throw_StackOverflowError_entry =
      generate_throw_exception("StackOverflowError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_StackOverflowError));
    if (UseCRC32Intrinsics) {
      // Set the table address before generating the stub that uses it.
      StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
    }
  }
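
  // The split between generate_initial() and generate_all() reflects the
  // two-phase stub initialization: the stubs above are needed very early
  // (e.g. by the interpreter), while everything in generate_all() can
  // only be generated later -- verify_oop, for instance, must wait until
  // after universe_init (see the comment below).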

  void generate_all() {
    // support for verify_oop (must happen after universe_init)
    StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();

    StubRoutines::_throw_AbstractMethodError_entry =
      generate_throw_exception("AbstractMethodError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_AbstractMethodError));

    StubRoutines::_throw_IncompatibleClassChangeError_entry =
      generate_throw_exception("IncompatibleClassChangeError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_IncompatibleClassChangeError));

    StubRoutines::_throw_NullPointerException_at_call_entry =
      generate_throw_exception("NullPointerException at call throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_NullPointerException_at_call));

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }

    if (UseMontgomeryMultiplyIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }

    if (UseMontgomerySquareIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      // We use generate_multiply() rather than generate_square()
      // because it's faster for the sizes of modulus we care about.
      StubRoutines::_montgomerySquare = g.generate_multiply();
    }

#ifndef BUILTIN_SIM
    // generate GHASH intrinsics code
    if (UseGHASHIntrinsics) {
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
    }

    if (UseAESIntrinsics) {
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
    }

    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_implCompress   = generate_sha1_implCompress(false, "sha1_implCompress");
      StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true,  "sha1_implCompressMB");
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
    }

    if (UseCRC32CIntrinsics) {
      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
    }

    // generate Adler32 intrinsics code
    if (UseAdler32Intrinsics) {
      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
    }

    // Safefetch stubs.
    generate_safefetch("SafeFetch32", sizeof(int),      &StubRoutines::_safefetch32_entry,
                                                        &StubRoutines::_safefetch32_fault_pc,
                                                        &StubRoutines::_safefetch32_continuation_pc);
    generate_safefetch("SafeFetchN",  sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
                                                        &StubRoutines::_safefetchN_fault_pc,
                                                        &StubRoutines::_safefetchN_continuation_pc);
#endif
  }

 public:
  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
    if (all) {
      generate_all();
    } else {
      generate_initial();
    }
  }
}; // end class declaration

void StubGenerator_generate(CodeBuffer* code, bool all) {
  StubGenerator g(code, all);
}
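
// Note: StubGenerator_generate is expected to be invoked twice during VM
// startup -- once early with all == false for the initial stubs, and again
// with all == true for the remainder; see the stub initialization code in
// stubRoutines.cpp.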