/*
 * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "utilities/top.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif

#ifdef BUILTIN_SIM
#include "../../../../../../simulator/simulator.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->
#define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
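
// n.b. TIMES_OOP scales a 32-bit array index by the in-heap oop size:
// 4 bytes when UseCompressedOops is set, 8 bytes otherwise. The sxtw
// extend is needed because element indices are held as 32-bit values.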

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(int& counter) {
    __ lea(rscratch2, ExternalAddress((address)&counter));
    __ ldrw(rscratch1, Address(rscratch2));
    __ addw(rscratch1, rscratch1, 1);
    __ strw(rscratch1, Address(rscratch2));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    c_rarg6:   parameter size (in words)              int
  //    c_rarg7:   thread                                 Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the c arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -27 [ argument word 1      ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper    (r0) ]
  //  -7 [ result          (r1) ]
  //  -6 [ result type     (r2) ]
  //  -5 [ method          (r3) ]
  //  -4 [ entry point     (r4) ]
  //  -3 [ parameters      (r5) ]
  //  -2 [ parameter size  (r6) ]
  //  -1 [ thread          (r7) ]
  //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  //   1 [ saved lr       (r30) ]

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off  = -26,

    d15_off            = -26,
    d14_off            = -25,
    d13_off            = -24,
    d12_off            = -23,
    d11_off            = -22,
    d10_off            = -21,
    d9_off             = -20,
    d8_off             = -19,

    r28_off            = -18,
    r27_off            = -17,
    r26_off            = -16,
    r25_off            = -15,
    r24_off            = -14,
    r23_off            = -13,
    r22_off            = -12,
    r21_off            = -11,
    r20_off            = -10,
    r19_off            =  -9,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameters_off     =  -3,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0,
    retaddr_off        =   1,
  };

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    const Address sp_after_call(rfp, sp_after_call_off * wordSize);

    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameters    (rfp, parameters_off     * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d14_save      (rfp, d14_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d12_save      (rfp, d12_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d10_save      (rfp, d10_off * wordSize);
    const Address d9_save       (rfp, d9_off * wordSize);
    const Address d8_save       (rfp, d8_off * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r27_save      (rfp, r27_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r25_save      (rfp, r25_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r23_save      (rfp, r23_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r21_save      (rfp, r21_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);
    const Address r19_save      (rfp, r19_off * wordSize);

    // stub code

    // we need a C prolog to bootstrap the x86 caller into the sim
    __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);
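    // n.b. the prolog above only matters when running on the builtin
    // x86 simulator; for a normal hardware build it should expand to
    // nothing.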

    address aarch64_entry = __ pc();

#ifdef BUILTIN_SIM
    // Save sender's SP for stack traces.
    __ mov(rscratch1, sp);
    __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
#endif
    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ str(c_rarg5,  parameters);
    __ str(c_rarg4,  entry_point);
    __ str(c_rarg3,  method);
    __ str(c_rarg2,  result_type);
    __ str(c_rarg1,  result);
    __ str(c_rarg0,  call_wrapper);
    __ str(r19,      r19_save);
    __ str(r20,      r20_save);
    __ str(r21,      r21_save);
    __ str(r22,      r22_save);
    __ str(r23,      r23_save);
    __ str(r24,      r24_save);
    __ str(r25,      r25_save);
    __ str(r26,      r26_save);
    __ str(r27,      r27_save);
    __ str(r28,      r28_save);

    __ strd(v8,      d8_save);
    __ strd(v9,      d9_save);
    __ strd(v10,     d10_save);
    __ strd(v11,     d11_save);
    __ strd(v12,     d12_save);
    __ strd(v13,     d13_save);
    __ strd(v14,     d14_save);
    __ strd(v15,     d15_save);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (unsigned)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing Method*, and current sp
    //      rmethod: Method*
    //      r13: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r13, sp);
    __ blr(c_rarg4);

    // tell the simulator we have returned to the stub

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    if (NotifySimulator) {
      __ notify(Assembler::method_reentry);
    }
    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
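    // n.b. T_OBJECT shares the is_long path below: an oop result
    // comes back in r0 and is stored to the result slot as a full
    // 64-bit word.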
    __ ldr(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ldr(j_rarg1, result_type);
    __ cmp(j_rarg1, T_OBJECT);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(j_rarg1, T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    // restore callee-save registers
    __ ldrd(v15,     d15_save);
    __ ldrd(v14,     d14_save);
    __ ldrd(v13,     d13_save);
    __ ldrd(v12,     d12_save);
    __ ldrd(v11,     d11_save);
    __ ldrd(v10,     d10_save);
    __ ldrd(v9,      d9_save);
    __ ldrd(v8,      d8_save);

    __ ldr(r28,      r28_save);
    __ ldr(r27,      r27_save);
    __ ldr(r26,      r26_save);
    __ ldr(r25,      r25_save);
    __ ldr(r24,      r24_save);
    __ ldr(r23,      r23_save);
    __ ldr(r22,      r22_save);
    __ ldr(r21,      r21_save);
    __ ldr(r20,      r20_save);
    __ ldr(r19,      r19_save);
    __ ldr(c_rarg0,  call_wrapper);
    __ ldr(c_rarg1,  result);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3,  method);
    __ ldr(c_rarg4,  entry_point);
    __ ldr(c_rarg5,  parameters);
    __ ldr(c_rarg6,  parameter_size);
    __ ldr(c_rarg7,  thread);

#ifndef PRODUCT
    // tell the simulator we are about to end Java execution
    if (NotifySimulator) {
      __ notify(Assembler::method_exit);
    }
#endif
    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT

    __ BIND(is_long);
    __ str(r0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to set up
  // the sp.
  //
  // r0: exception oop

  // NOTE: this is used as a target from the signal handler so it
  // needs an x86 prolog which returns into the current simulator
  // executing the generated catch_exception code. so the prolog
  // needs to install rax in a sim register and adjust the sim's
  // restart pc to enter the generated code at the start position
  // then return from native to simulated execution.

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread       (rfp, thread_off        * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM, and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);
    __ mov(r19, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {

    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is NULL it is OK

    // Check if the oop is in the right area of memory
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
    __ andr(c_rarg2, r0, c_rarg3);
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());

    // Compare c_rarg2 and c_rarg3.  We don't use a compare
    // instruction here because the flags register is live.
    __ eor(c_rarg2, c_rarg2, c_rarg3);
    __ cbnz(c_rarg2, error);
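
    // (verify_oop can be inserted between any two generated
    // instructions, so the caller's condition flags must survive;
    // hence the eor/cbnz sequence above rather than cmp/br.)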

    // make sure klass is 'reasonable', i.e. not zero.
    __ load_klass(r0, r0); // get klass
    __ cbz(r0, error);     // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blrt(rscratch1, 3, 0, 1);

    return start;
  }

  void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }

  // Generate code for an array write pre barrier
  //
  //     addr    - starting address
  //     count   - element count
  //     tmp     - scratch register
  //
  //     Destroy no registers!
  //
  void gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) {
    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
    case BarrierSet::G1SATBCTLogging:
      // With G1, don't generate the call if we statically know that the target is uninitialized
      if (!dest_uninitialized) {
        __ push(RegSet::range(r0, r29), sp);        // integer registers except lr & sp
        if (count == c_rarg0) {
          if (addr == c_rarg1) {
            // exactly backwards!!
            __ stp(c_rarg0, c_rarg1, __ pre(sp, -2 * wordSize));
            __ ldp(c_rarg1, c_rarg0, __ post(sp, -2 * wordSize));
          } else {
            __ mov(c_rarg1, count);
            __ mov(c_rarg0, addr);
          }
        } else {
          __ mov(c_rarg0, addr);
          __ mov(c_rarg1, count);
        }
        __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2);
        __ pop(RegSet::range(r0, r29), sp);         // integer registers except lr & sp
      }
      break;
    case BarrierSet::CardTableForRS:
    case BarrierSet::CardTableExtension:
    case BarrierSet::ModRef:
      break;
    default:
      ShouldNotReachHere();

    }
  }
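
  // n.b. the G1 pre barrier above implements SATB marking: before an
  // already-initialized oop array is overwritten the old values must
  // be enqueued, which is why the call can be skipped entirely when
  // the destination is known to be uninitialized.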

  //
  // Generate code for an array write post barrier
  //
  //  Input:
  //     start    - register containing starting address of destination array
  //     end      - register containing ending address of destination array
  //     scratch  - scratch register
  //
  //  The input registers are overwritten.
  //  The ending address is inclusive.
  void gen_write_ref_array_post_barrier(Register start, Register end, Register scratch) {
    assert_different_registers(start, end, scratch);
    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
    case BarrierSet::G1SATBCTLogging:

      {
        __ push(RegSet::range(r0, r29), sp);        // integer registers except lr & sp
        // must compute element count unless barrier set interface is changed (other platforms supply count)
        assert_different_registers(start, end, scratch);
        __ lea(scratch, Address(end, BytesPerHeapOop));
        __ sub(scratch, scratch, start);              // subtract start to get #bytes
        __ lsr(scratch, scratch, LogBytesPerHeapOop); // convert to element count
        __ mov(c_rarg0, start);
        __ mov(c_rarg1, scratch);
        __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
        __ pop(RegSet::range(r0, r29), sp);         // integer registers except lr & sp
      }
      break;
    case BarrierSet::CardTableForRS:
    case BarrierSet::CardTableExtension:
      {
        CardTableModRefBS* ct = (CardTableModRefBS*)bs;
        assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");

        Label L_loop;

        __ lsr(start, start, CardTableModRefBS::card_shift);
        __ lsr(end, end, CardTableModRefBS::card_shift);
        __ sub(end, end, start); // number of bytes to copy

        const Register count = end; // 'end' register contains bytes count now
        __ load_byte_map_base(scratch);
        __ add(start, start, scratch);
        if (UseConcMarkSweepGC) {
          __ membar(__ StoreStore);
        }
        __ BIND(L_loop);
        __ strb(zr, Address(start, count));
        __ subs(count, count, 1);
        __ br(Assembler::HS, L_loop);
      }
      break;
    default:
      ShouldNotReachHere();

    }
  }

  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 2
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(Label &start, Register s, Register d, Register count,
                           copy_direction direction) {
    int unit = wordSize * direction;

    int offset;
    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r10, t6 = r11, t7 = r12;

    assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1);

    Label again, large, small;
    const char *stub_name;
    if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
    else
      stub_name = "backward_copy_longs";
    StubCodeMark mark(this, "StubRoutines", stub_name);
    __ align(6);
    __ bind(start);
    __ cmp(count, 8);
    __ br(Assembler::LO, small);
    if (direction == copy_forwards) {
      __ sub(s, s, 2 * wordSize);
      __ sub(d, d, 2 * wordSize);
    }
    __ subs(count, count, 16);
    __ br(Assembler::GE, large);
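
    // n.b. for a forward copy s and d were biased back by two words
    // above so that the same pre-indexed addressing pattern serves
    // both directions: each ldp/stp below addresses relative to the
    // biased pointer, and the final access of each block advances the
    // pointer past the block with a pre-index of 8 * unit.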

    // 8 <= count < 16 words.  Copy 8.
    __ ldp(t0, t1, Address(s, 2 * unit));
    __ ldp(t2, t3, Address(s, 4 * unit));
    __ ldp(t4, t5, Address(s, 6 * unit));
    __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));

    __ stp(t0, t1, Address(d, 2 * unit));
    __ stp(t2, t3, Address(d, 4 * unit));
    __ stp(t4, t5, Address(d, 6 * unit));
    __ stp(t6, t7, Address(__ pre(d, 8 * unit)));

    if (direction == copy_forwards) {
      __ add(s, s, 2 * wordSize);
      __ add(d, d, 2 * wordSize);
    }

    {
      Label L1, L2;
      __ bind(small);
      __ tbz(count, exact_log2(4), L1);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ ldp(t2, t3, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ stp(t2, t3, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L1);

      __ tbz(count, 1, L2);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L2);
    }

    __ ret(lr);

    __ align(6);
    __ bind(large);

    // Fill 8 registers
    __ ldp(t0, t1, Address(s, 2 * unit));
    __ ldp(t2, t3, Address(s, 4 * unit));
    __ ldp(t4, t5, Address(s, 6 * unit));
    __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));

    __ bind(again);

    if (direction == copy_forwards && PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, PrefetchCopyIntervalInBytes), PLDL1KEEP);

    // software-pipelined main loop: store the batch loaded on the
    // previous iteration while loading the next one
    __ stp(t0, t1, Address(d, 2 * unit));
    __ ldp(t0, t1, Address(s, 2 * unit));
    __ stp(t2, t3, Address(d, 4 * unit));
    __ ldp(t2, t3, Address(s, 4 * unit));
    __ stp(t4, t5, Address(d, 6 * unit));
    __ ldp(t4, t5, Address(s, 6 * unit));
    __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
    __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ stp(t0, t1, Address(d, 2 * unit));
    __ stp(t2, t3, Address(d, 4 * unit));
    __ stp(t4, t5, Address(d, 6 * unit));
    __ stp(t6, t7, Address(__ pre(d, 8 * unit)));

    if (direction == copy_forwards) {
      __ add(s, s, 2 * wordSize);
      __ add(d, d, 2 * wordSize);
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ ldp(t2, t3, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ stp(t2, t3, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L1);

      __ tbz(count, 1, L2);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L2);
    }

    __ ret(lr);
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;
    int unit = wordSize * direction;

    Label Lpair, Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.

    __ tbz(count, 3 - exact_log2(granularity), Lword);
    __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
    __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }

  Label copy_f, copy_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //

  void copy_memory(bool is_aligned, Register s, Register d,
                   Register count, Register tmp, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    if (is_backwards) {
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    Label tail;

    __ cmp(count, 16/granularity);
    __ br(Assembler::LO, tail);

    // Now we've got the small case out of the way we can align the
    // source address on a 2-word boundary.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
      __ tbz(s, exact_log2(wordSize), aligned);
      __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
      __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
      __ sub(count, count, wordSize/granularity);
    } else {
      if (is_backwards) {
        __ andr(rscratch2, s, 2 * wordSize - 1);
      } else {
        __ neg(rscratch2, s);
        __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
      }
      // rscratch2 is the byte adjustment needed to align s.
      __ cbz(rscratch2, aligned);
      int shift = exact_log2(granularity);
      if (shift)  __ lsr(rscratch2, rscratch2, shift);
      __ sub(count, count, rscratch2);
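
      // n.b. the leading rscratch2 elements are copied one at a time
      // by copy_memory_small below; only the source is guaranteed to
      // end up 2-word aligned, d is aligned too only when s and d
      // were equally misaligned.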

#if 0
      // ?? This code is only correct for a disjoint copy.  It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, rscratch2);
        __ sub(d, d, rscratch2);
      } else {
        __ add(s, s, rscratch2);
        __ add(d, d, rscratch2);
      }
#else
      copy_memory_small(s, d, rscratch2, rscratch1, step);
#endif
    }

    __ cmp(count, 16/granularity);
    __ br(Assembler::LT, tail);
    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes.  Adjust the
    // count and do a bulk copy of words.
    __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
    if (direction == copy_forwards)
      __ bl(copy_f);
    else
      __ bl(copy_b);

    // And the tail.

    __ bind(tail);
    copy_memory_small(s, d, count, tmp, step);
  }


  void clobber_registers() {
#ifdef ASSERT
    __ mov(rscratch1, (uint64_t)0xdeadbeef);
    __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
    for (Register r = r3; r <= r18; r++)
      if (r != rscratch1)  __ mov(r, rscratch1);
#endif
  }

  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers rscratch1 and rscratch2.
  void verify_oop_array (size_t size, Register a, Register count, Register temp) {
    Label loop, end;
    __ mov(rscratch1, a);
    __ mov(rscratch2, zr);
    __ bind(loop);
    __ cmp(rscratch2, count);
    __ br(Assembler::HS, end);
    if (size == (size_t)wordSize) {
      __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ verify_oop(temp);
    } else {
      // n.b. loads a narrow oop into r16 and decodes temp; callers
      // pass temp == r16 so both name the same register
      __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ decode_heap_oop(temp); // calls verify_oop
    }
    __ add(rscratch2, rscratch2, size);
    __ b(loop);
    __ bind(end);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
                                 const char *name, bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    if (is_oop) {
      __ push(RegSet::of(d, count), sp);
      // no registers are destroyed by this call
      gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
    }
    copy_memory(aligned, s, d, count, rscratch1, size);
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
      gen_write_ref_array_post_barrier(d, count, rscratch1);
    }
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
                                 address *entry, const char *name,
                                 bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;

    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // no overlap when (d-s) above_equal (count*size)
    __ sub(rscratch1, d, s);
    __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
    __ br(Assembler::HS, nooverlap_target);

    if (is_oop) {
      __ push(RegSet::of(d, count), sp);
      // no registers are destroyed by this call
      gen_write_ref_array_pre_barrier(d, count, dest_uninitialized);
    }
    copy_memory(aligned, s, d, count, rscratch1, -size);
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, Address::uxtw(exact_log2(size))));
      gen_write_ref_array_post_barrier(d, count, rscratch1);
    }
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
  //
  address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
  }
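
  // n.b. each conjoint stub is generated with the matching disjoint
  // stub's entry as its nooverlap_target: when (d - s) >= count * size
  // a forward copy cannot overwrite unread source words, so the
  // conjoint stub simply branches to the disjoint code.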

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                      address* entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_short_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_short_copy().
  //
  address generate_disjoint_short_copy(bool aligned,
                                       address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
                                       address *entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_int_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
                                     address *entry, const char *name,
                                     bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
  }


  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_long_copy(bool aligned, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_long_copy(bool aligned,
                                      address nooverlap_target, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_oop_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized = false) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_disjoint_copy(size, aligned, is_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_oop_copy(bool aligned,
                                     address nooverlap_target, address *entry,
                                     const char *name, bool dest_uninitialized = false) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry, name);
  }


  // Helper for generating a dynamic type check.
  // Smashes rscratch1.
  void generate_type_check(Register sub_klass,
                           Register super_check_offset,
                           Register super_klass,
                           Label& L_success) {
    assert_different_registers(sub_klass, super_check_offset, super_klass);

    BLOCK_COMMENT("type_check:");

    Label L_miss;

    __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL,
                                     super_check_offset);
    __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);

    // Fall through on failure!
    __ BIND(L_miss);
  }

  //
  //  Generate checkcasting array copy stub
  //
  //  Input:
  //    c_rarg0   - source array address
  //    c_rarg1   - destination array address
  //    c_rarg2   - element count, treated as ssize_t, can be zero
  //    c_rarg3   - size_t ckoff (super_check_offset)
  //    c_rarg4   - oop ckval (super_klass)
  //
  //  Output:
  //    r0 ==  0  -  success
  //    r0 == -1^K - failure, where K is partial transfer count
  //
  address generate_checkcast_copy(const char *name, address *entry,
                                  bool dest_uninitialized = false) {

    Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;

    // Input registers (after setup_arg_regs)
    const Register from        = c_rarg0;   // source array address
    const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // elements count
    const Register ckoff       = c_rarg3;   // super_check_offset
    const Register ckval       = c_rarg4;   // super_klass

    // Registers used as temps (r18, r19, r20 are save-on-entry)
    const Register count_save  = r21;       // orig elements count
    const Register start_to    = r20;       // destination array start address
    const Register copied_oop  = r18;       // actual oop copied
    const Register r19_klass   = r19;       // oop._klass

    //---------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the two arrays are subtypes of Object[] but the
    // destination array type is not equal to or a supertype
    // of the source type.  Each element must be separately
    // checked.

    assert_different_registers(from, to, count, ckoff, ckval, start_to,
                               copied_oop, r19_klass, count_save);

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef ASSERT
    // caller guarantees that the arrays really are different
    // otherwise, we would have to make conjoint checks
    { Label L;
      array_overlap_test(L, TIMES_OOP);
      __ stop("checkcast_copy within a single array");
      __ bind(L);
    }
#endif //ASSERT

    // Caller of this entry point must set up the argument registers.
    if (entry != NULL) {
      *entry = __ pc();
      BLOCK_COMMENT("Entry:");
    }

    // Empty array:  Nothing to do.
    __ cbz(count, L_done);

    __ push(RegSet::of(r18, r19, r20, r21), sp);

#ifdef ASSERT
    BLOCK_COMMENT("assert consistent ckoff/ckval");
    // The ckoff and ckval must be mutually consistent,
    // even though caller generates both.
    { Label L;
      int sco_offset = in_bytes(Klass::super_check_offset_offset());
      __ ldrw(start_to, Address(ckval, sco_offset));
      __ cmpw(ckoff, start_to);
      __ br(Assembler::EQ, L);
      __ stop("super_check_offset inconsistent");
      __ bind(L);
    }
#endif //ASSERT

    // save the original count
    __ mov(count_save, count);

    // Copy from low to high addresses
    __ mov(start_to, to);              // Save destination array start address
    __ b(L_load_element);

    // ======== begin loop ========
    // (Loop is rotated; its entry is L_load_element.)
    // Loop control:
    //   for (; count != 0; count--) {
    //     copied_oop = load_heap_oop(from++);
    //     ... generate_type_check ...;
    //     store_heap_oop(to++, copied_oop);
    //   }
    __ align(OptoLoopAlignment);

    __ BIND(L_store_element);
    __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop);  // store the oop
    __ sub(count, count, 1);
    __ cbz(count, L_do_card_marks);

    // ======== loop entry is here ========
    __ BIND(L_load_element);
    __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop
    __ cbz(copied_oop, L_store_element);

    __ load_klass(r19_klass, copied_oop);// query the object klass
    generate_type_check(r19_klass, ckoff, ckval, L_store_element);
    // ======== end loop ========

    // It was a real error; we must depend on the caller to finish the job.
    // Register count = remaining oops, count_orig = total oops.
    // Emit GC store barriers for the oops we have copied and report
    // their number to the caller.

    __ subs(count, count_save, count);     // K = partially copied oop count
    __ eon(count, count, zr);              // report (-1^K) to caller; eon with zr is bitwise NOT
    __ br(Assembler::EQ, L_done_pop);

    __ BIND(L_do_card_marks);
    __ add(to, to, -heapOopSize);          // make an inclusive end pointer
    gen_write_ref_array_post_barrier(start_to, to, rscratch1);

    __ bind(L_done_pop);
    __ pop(RegSet::of(r18, r19, r20, r21), sp);
    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);

    __ bind(L_done);
    __ mov(r0, count);
    __ leave();
    __ ret(lr);

    return start;
  }

  // Perform range checks on the proposed arraycopy.
  // Kills temp, but nothing else.
  // Also, clean the sign bits of src_pos and dst_pos.
  void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
                              Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
                              Register dst_pos, // destination position (c_rarg3)
                              Register length,
                              Register temp,
                              Label& L_failed) {
    BLOCK_COMMENT("arraycopy_range_checks:");

    assert_different_registers(rscratch1, temp);

    //  if (src_pos + length > arrayOop(src)->length())  FAIL;
    __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, src_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
    __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, dst_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);
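
    // n.b. a 32-bit register write zeroes bits 63:32 on AArch64, so
    // the movw instructions below are idiomatic zero-extends of
    // src_pos and dst_pos.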
    // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
    __ movw(src_pos, src_pos);
    __ movw(dst_pos, dst_pos);

    BLOCK_COMMENT("arraycopy_range_checks done");
  }

  // These stubs get called from some dumb test routine.
  // I'll write them properly when they're called from
  // something that's actually doing something.
  static void fake_arraycopy_stub(address src, address dst, int count) {
    assert(count == 0, "huh?");
  }


  //
  //  Generate 'unsafe' array copy stub
  //  Though just as safe as the other stubs, it takes an unscaled
  //  size_t argument instead of an element count.
  //
  //  Input:
  //    c_rarg0   - source array address
  //    c_rarg1   - destination array address
  //    c_rarg2   - byte count, treated as ssize_t, can be zero
  //
  // Examines the alignment of the operands and dispatches
  // to a long, int, short, or byte copy loop.
  //
  address generate_unsafe_copy(const char *name,
                               address byte_copy_entry) {
#ifdef PRODUCT
    return StubRoutines::_jbyte_arraycopy;
#else
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    // bump this on entry, not on exit:
    __ lea(rscratch2, ExternalAddress((address)&SharedRuntime::_unsafe_array_copy_ctr));
    __ incrementw(Address(rscratch2));
    __ b(RuntimeAddress(byte_copy_entry));
    return start;
#endif
  }

  //
  //  Generate generic array copy stubs
  //
  //  Input:
  //    c_rarg0    -  src oop
  //    c_rarg1    -  src_pos (32-bits)
  //    c_rarg2    -  dst oop
  //    c_rarg3    -  dst_pos (32-bits)
  //    c_rarg4    -  element count (32-bits)
  //
  //  Output:
  //    r0 ==  0  -  success
  //    r0 == -1^K - failure, where K is partial transfer count
  //
  address generate_generic_copy(const char *name,
                                address byte_copy_entry, address short_copy_entry,
                                address int_copy_entry, address oop_copy_entry,
                                address long_copy_entry, address checkcast_copy_entry) {

    Label L_failed, L_failed_0, L_objArray;
    Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs;

    // Input registers
    const Register src        = c_rarg0;  // source array oop
    const Register src_pos    = c_rarg1;  // source position
    const Register dst        = c_rarg2;  // destination array oop
    const Register dst_pos    = c_rarg3;  // destination position
    const Register length     = c_rarg4;

    StubCodeMark mark(this, "StubRoutines", name);

    __ align(CodeEntryAlignment);
    address start = __ pc();

    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // bump this on entry, not on exit:
    inc_counter_np(SharedRuntime::_generic_array_copy_ctr);

    //-----------------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the following conditions are met:
    //
    // (1) src and dst must not be null.
    // (2) src_pos must not be negative.
    // (3) dst_pos must not be negative.
    // (4) length  must not be negative.
    // (5) src klass and dst klass should be the same and not NULL.
    // (6) src and dst should be arrays.
    // (7) src_pos + length must not exceed length of src.
    // (8) dst_pos + length must not exceed length of dst.
    //
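    // n.b. when any of these checks fails the stub does not raise an
    // exception; it branches to L_failed, which reports failure to
    // the caller and leaves it to the runtime to throw.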
1694 // 1695 1696 // if (src == NULL) return -1; 1697 __ cbz(src, L_failed); 1698 1699 // if (src_pos < 0) return -1; 1700 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 1701 1702 // if (dst == NULL) return -1; 1703 __ cbz(dst, L_failed); 1704 1705 // if (dst_pos < 0) return -1; 1706 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 1707 1708 // registers used as temp 1709 const Register scratch_length = r16; // elements count to copy 1710 const Register scratch_src_klass = r17; // array klass 1711 const Register lh = r18; // layout helper 1712 1713 // if (length < 0) return -1; 1714 __ movw(scratch_length, length); // length (elements count, 32-bits value) 1715 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 1716 1717 __ load_klass(scratch_src_klass, src); 1718 #ifdef ASSERT 1719 // assert(src->klass() != NULL); 1720 { 1721 BLOCK_COMMENT("assert klasses not null {"); 1722 Label L1, L2; 1723 __ cbnz(scratch_src_klass, L2); // it is broken if klass is NULL 1724 __ bind(L1); 1725 __ stop("broken null klass"); 1726 __ bind(L2); 1727 __ load_klass(rscratch1, dst); 1728 __ cbz(rscratch1, L1); // this would be broken also 1729 BLOCK_COMMENT("} assert klasses not null done"); 1730 } 1731 #endif 1732 1733 // Load layout helper (32-bits) 1734 // 1735 // |array_tag| | header_size | element_type | |log2_element_size| 1736 // 32 30 24 16 8 2 0 1737 // 1738 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 1739 // 1740 1741 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 1742 1743 // Handle objArrays completely differently... 1744 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 1745 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 1746 __ movw(rscratch1, objArray_lh); 1747 __ eorw(rscratch2, lh, rscratch1); 1748 __ cbzw(rscratch2, L_objArray); 1749 1750 // if (src->klass() != dst->klass()) return -1; 1751 __ load_klass(rscratch2, dst); 1752 __ eor(rscratch2, rscratch2, scratch_src_klass); 1753 __ cbnz(rscratch2, L_failed); 1754 1755 // if (!src->is_Array()) return -1; 1756 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 1757 1758 // At this point, it is known to be a typeArray (array_tag 0x3). 
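    // For reference, the two layout-helper fields consumed below can be
    // decoded in C roughly as (masks and shifts as defined in klass.hpp):
    //
    //   int hsize      = (lh >> _lh_header_size_shift) & _lh_header_size_mask; // header size in bytes
    //   int log2_esize = lh & _lh_log2_element_size_mask;                      // 0..3 for byte..long
    //
    // The ubfx below extracts the header size; the element-size bits are
    // then tested directly with tbnz.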
1759 #ifdef ASSERT 1760 { 1761 BLOCK_COMMENT("assert primitive array {"); 1762 Label L; 1763 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 1764 __ cmpw(lh, rscratch2); 1765 __ br(Assembler::GE, L); 1766 __ stop("must be a primitive array"); 1767 __ bind(L); 1768 BLOCK_COMMENT("} assert primitive array done"); 1769 } 1770 #endif 1771 1772 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 1773 rscratch2, L_failed); 1774 1775 // TypeArrayKlass 1776 // 1777 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 1778 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 1779 // 1780 1781 const Register rscratch1_offset = rscratch1; // array offset 1782 const Register r18_elsize = lh; // element size 1783 1784 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 1785 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 1786 __ add(src, src, rscratch1_offset); // src array offset 1787 __ add(dst, dst, rscratch1_offset); // dst array offset 1788 BLOCK_COMMENT("choose copy loop based on element size"); 1789 1790 // next registers should be set before the jump to corresponding stub 1791 const Register from = c_rarg0; // source array address 1792 const Register to = c_rarg1; // destination array address 1793 const Register count = c_rarg2; // elements count 1794 1795 // 'from', 'to', 'count' registers should be set in such order 1796 // since they are the same as 'src', 'src_pos', 'dst'. 1797 1798 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 1799 1800 // The possible values of elsize are 0-3, i.e. exact_log2(element 1801 // size in bytes). We do a simple bitwise binary search. 1802 __ BIND(L_copy_bytes); 1803 __ tbnz(r18_elsize, 1, L_copy_ints); 1804 __ tbnz(r18_elsize, 0, L_copy_shorts); 1805 __ lea(from, Address(src, src_pos));// src_addr 1806 __ lea(to, Address(dst, dst_pos));// dst_addr 1807 __ movw(count, scratch_length); // length 1808 __ b(RuntimeAddress(byte_copy_entry)); 1809 1810 __ BIND(L_copy_shorts); 1811 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 1812 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 1813 __ movw(count, scratch_length); // length 1814 __ b(RuntimeAddress(short_copy_entry)); 1815 1816 __ BIND(L_copy_ints); 1817 __ tbnz(r18_elsize, 0, L_copy_longs); 1818 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 1819 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 1820 __ movw(count, scratch_length); // length 1821 __ b(RuntimeAddress(int_copy_entry)); 1822 1823 __ BIND(L_copy_longs); 1824 #ifdef ASSERT 1825 { 1826 BLOCK_COMMENT("assert long copy {"); 1827 Label L; 1828 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize 1829 __ cmpw(r18_elsize, LogBytesPerLong); 1830 __ br(Assembler::EQ, L); 1831 __ stop("must be long copy, but elsize is wrong"); 1832 __ bind(L); 1833 BLOCK_COMMENT("} assert long copy done"); 1834 } 1835 #endif 1836 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 1837 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 1838 __ movw(count, scratch_length); // length 1839 __ b(RuntimeAddress(long_copy_entry)); 1840 1841 // ObjArrayKlass 1842 __ BIND(L_objArray); 1843 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 1844 1845 Label L_plain_copy, L_checkcast_copy; 1846 // test array classes for subtyping 1847 __ load_klass(r18, dst); 1848 __ cmp(scratch_src_klass, r18); // usual case is exact 
equality 1849 __ br(Assembler::NE, L_checkcast_copy); 1850 1851 // Identically typed arrays can be copied without element-wise checks. 1852 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 1853 rscratch2, L_failed); 1854 1855 __ lea(from, Address(src, src_pos, Address::lsl(3))); 1856 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 1857 __ lea(to, Address(dst, dst_pos, Address::lsl(3))); 1858 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 1859 __ movw(count, scratch_length); // length 1860 __ BIND(L_plain_copy); 1861 __ b(RuntimeAddress(oop_copy_entry)); 1862 1863 __ BIND(L_checkcast_copy); 1864 // live at this point: scratch_src_klass, scratch_length, r18 (dst_klass) 1865 { 1866 // Before looking at dst.length, make sure dst is also an objArray. 1867 __ ldrw(rscratch1, Address(r18, lh_offset)); 1868 __ movw(rscratch2, objArray_lh); 1869 __ eorw(rscratch1, rscratch1, rscratch2); 1870 __ cbnzw(rscratch1, L_failed); 1871 1872 // It is safe to examine both src.length and dst.length. 1873 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 1874 r18, L_failed); 1875 1876 const Register rscratch2_dst_klass = rscratch2; 1877 __ load_klass(rscratch2_dst_klass, dst); // reload 1878 1879 // Marshal the base address arguments now, freeing registers. 1880 __ lea(from, Address(src, src_pos, Address::lsl(3))); 1881 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 1882 __ lea(to, Address(dst, dst_pos, Address::lsl(3))); 1883 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 1884 __ movw(count, length); // length (reloaded) 1885 Register sco_temp = c_rarg3; // this register is free now 1886 assert_different_registers(from, to, count, sco_temp, 1887 rscratch2_dst_klass, scratch_src_klass); 1888 // assert_clean_int(count, sco_temp); 1889 1890 // Generate the type check. 1891 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1892 __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset)); 1893 // assert_clean_int(sco_temp, r18); 1894 generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy); 1895 1896 // Fetch destination element klass from the ObjArrayKlass header. 1897 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 1898 __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset)); 1899 __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset)); 1900 1901 // the checkcast_copy loop needs two extra arguments: 1902 assert(c_rarg3 == sco_temp, "#3 already in place"); 1903 // Set up arguments for checkcast_copy_entry. 
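    // (For reference: generate_checkcast_copy expects c_rarg3 to hold the
    // super_check_offset of the destination element klass and c_rarg4 the
    // element klass itself, and, like this stub, it reports ~K -- that is,
    // -1^K -- when only K elements could be copied.)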
1904 __ mov(c_rarg4, rscratch2_dst_klass); // dst.klass.element_klass 1905 __ b(RuntimeAddress(checkcast_copy_entry)); 1906 } 1907 1908 __ BIND(L_failed); 1909 __ mov(r0, -1); 1910 __ leave(); // required for proper stackwalking of RuntimeStub frame 1911 __ ret(lr); 1912 1913 return start; 1914 } 1915 1916 void generate_arraycopy_stubs() { 1917 address entry; 1918 address entry_jbyte_arraycopy; 1919 address entry_jshort_arraycopy; 1920 address entry_jint_arraycopy; 1921 address entry_oop_arraycopy; 1922 address entry_jlong_arraycopy; 1923 address entry_checkcast_arraycopy; 1924 1925 generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards); 1926 generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards); 1927 1928 //*** jbyte 1929 // Always need aligned and unaligned versions 1930 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 1931 "jbyte_disjoint_arraycopy"); 1932 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 1933 &entry_jbyte_arraycopy, 1934 "jbyte_arraycopy"); 1935 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 1936 "arrayof_jbyte_disjoint_arraycopy"); 1937 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL, 1938 "arrayof_jbyte_arraycopy"); 1939 1940 //*** jshort 1941 // Always need aligned and unaligned versions 1942 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 1943 "jshort_disjoint_arraycopy"); 1944 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 1945 &entry_jshort_arraycopy, 1946 "jshort_arraycopy"); 1947 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, 1948 "arrayof_jshort_disjoint_arraycopy"); 1949 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL, 1950 "arrayof_jshort_arraycopy"); 1951 1952 //*** jint 1953 // Aligned versions 1954 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 1955 "arrayof_jint_disjoint_arraycopy"); 1956 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 1957 "arrayof_jint_arraycopy"); 1958 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 1959 // entry_jint_arraycopy always points to the unaligned version 1960 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 1961 "jint_disjoint_arraycopy"); 1962 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 1963 &entry_jint_arraycopy, 1964 "jint_arraycopy"); 1965 1966 //*** jlong 1967 // It is always aligned 1968 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 1969 "arrayof_jlong_disjoint_arraycopy"); 1970 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 1971 "arrayof_jlong_arraycopy"); 1972 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 1973 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 1974 1975 //*** oops 1976 { 1977 // With compressed oops we need unaligned versions; notice that 1978 // we overwrite entry_oop_arraycopy. 
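    // (Rationale, roughly: with compressed oops each element is a 4-byte
    // narrowOop, so an arbitrary src/dst element address is in general
    // only 4-byte aligned and the 8-byte-aligned fast entries cannot be
    // assumed; hence aligned == !UseCompressedOops below.)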
1979 bool aligned = !UseCompressedOops; 1980 1981 StubRoutines::_arrayof_oop_disjoint_arraycopy 1982 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy"); 1983 StubRoutines::_arrayof_oop_arraycopy 1984 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy"); 1985 // Aligned versions without pre-barriers 1986 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 1987 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 1988 /*dest_uninitialized*/true); 1989 StubRoutines::_arrayof_oop_arraycopy_uninit 1990 = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit", 1991 /*dest_uninitialized*/true); 1992 } 1993 1994 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 1995 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 1996 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 1997 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 1998 1999 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2000 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, 2001 /*dest_uninitialized*/true); 2002 2003 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2004 entry_jbyte_arraycopy); 2005 2006 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2007 entry_jbyte_arraycopy, 2008 entry_jshort_arraycopy, 2009 entry_jint_arraycopy, 2010 entry_oop_arraycopy, 2011 entry_jlong_arraycopy, 2012 entry_checkcast_arraycopy); 2013 2014 } 2015 2016 void generate_math_stubs() { Unimplemented(); } 2017 2018 // Arguments: 2019 // 2020 // Inputs: 2021 // c_rarg0 - source byte array address 2022 // c_rarg1 - destination byte array address 2023 // c_rarg2 - K (key) in little endian int array 2024 // 2025 address generate_aescrypt_encryptBlock() { 2026 __ align(CodeEntryAlignment); 2027 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 2028 2029 Label L_doLast; 2030 2031 const Register from = c_rarg0; // source array address 2032 const Register to = c_rarg1; // destination array address 2033 const Register key = c_rarg2; // key array address 2034 const Register keylen = rscratch1; 2035 2036 address start = __ pc(); 2037 __ enter(); 2038 2039 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2040 2041 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2042 2043 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2044 __ rev32(v1, __ T16B, v1); 2045 __ rev32(v2, __ T16B, v2); 2046 __ rev32(v3, __ T16B, v3); 2047 __ rev32(v4, __ T16B, v4); 2048 __ aese(v0, v1); 2049 __ aesmc(v0, v0); 2050 __ aese(v0, v2); 2051 __ aesmc(v0, v0); 2052 __ aese(v0, v3); 2053 __ aesmc(v0, v0); 2054 __ aese(v0, v4); 2055 __ aesmc(v0, v0); 2056 2057 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2058 __ rev32(v1, __ T16B, v1); 2059 __ rev32(v2, __ T16B, v2); 2060 __ rev32(v3, __ T16B, v3); 2061 __ rev32(v4, __ T16B, v4); 2062 __ aese(v0, v1); 2063 __ aesmc(v0, v0); 2064 __ aese(v0, v2); 2065 __ aesmc(v0, v0); 2066 __ aese(v0, v3); 2067 __ aesmc(v0, v0); 2068 __ aese(v0, v4); 2069 __ aesmc(v0, v0); 2070 2071 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2072 __ rev32(v1, __ T16B, v1); 2073 __ rev32(v2, __ T16B, v2); 2074 2075 __ cmpw(keylen, 44); 2076 __ 
br(Assembler::EQ, L_doLast); 2077 2078 __ aese(v0, v1); 2079 __ aesmc(v0, v0); 2080 __ aese(v0, v2); 2081 __ aesmc(v0, v0); 2082 2083 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2084 __ rev32(v1, __ T16B, v1); 2085 __ rev32(v2, __ T16B, v2); 2086 2087 __ cmpw(keylen, 52); 2088 __ br(Assembler::EQ, L_doLast); 2089 2090 __ aese(v0, v1); 2091 __ aesmc(v0, v0); 2092 __ aese(v0, v2); 2093 __ aesmc(v0, v0); 2094 2095 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2096 __ rev32(v1, __ T16B, v1); 2097 __ rev32(v2, __ T16B, v2); 2098 2099 __ BIND(L_doLast); 2100 2101 __ aese(v0, v1); 2102 __ aesmc(v0, v0); 2103 __ aese(v0, v2); 2104 2105 __ ld1(v1, __ T16B, key); 2106 __ rev32(v1, __ T16B, v1); 2107 __ eor(v0, __ T16B, v0, v1); 2108 2109 __ st1(v0, __ T16B, to); 2110 2111 __ mov(r0, 0); 2112 2113 __ leave(); 2114 __ ret(lr); 2115 2116 return start; 2117 } 2118 2119 // Arguments: 2120 // 2121 // Inputs: 2122 // c_rarg0 - source byte array address 2123 // c_rarg1 - destination byte array address 2124 // c_rarg2 - K (key) in little endian int array 2125 // 2126 address generate_aescrypt_decryptBlock() { 2127 assert(UseAES, "need AES instructions and misaligned SSE support"); 2128 __ align(CodeEntryAlignment); 2129 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 2130 Label L_doLast; 2131 2132 const Register from = c_rarg0; // source array address 2133 const Register to = c_rarg1; // destination array address 2134 const Register key = c_rarg2; // key array address 2135 const Register keylen = rscratch1; 2136 2137 address start = __ pc(); 2138 __ enter(); // required for proper stackwalking of RuntimeStub frame 2139 2140 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2141 2142 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2143 2144 __ ld1(v5, __ T16B, __ post(key, 16)); 2145 __ rev32(v5, __ T16B, v5); 2146 2147 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2148 __ rev32(v1, __ T16B, v1); 2149 __ rev32(v2, __ T16B, v2); 2150 __ rev32(v3, __ T16B, v3); 2151 __ rev32(v4, __ T16B, v4); 2152 __ aesd(v0, v1); 2153 __ aesimc(v0, v0); 2154 __ aesd(v0, v2); 2155 __ aesimc(v0, v0); 2156 __ aesd(v0, v3); 2157 __ aesimc(v0, v0); 2158 __ aesd(v0, v4); 2159 __ aesimc(v0, v0); 2160 2161 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2162 __ rev32(v1, __ T16B, v1); 2163 __ rev32(v2, __ T16B, v2); 2164 __ rev32(v3, __ T16B, v3); 2165 __ rev32(v4, __ T16B, v4); 2166 __ aesd(v0, v1); 2167 __ aesimc(v0, v0); 2168 __ aesd(v0, v2); 2169 __ aesimc(v0, v0); 2170 __ aesd(v0, v3); 2171 __ aesimc(v0, v0); 2172 __ aesd(v0, v4); 2173 __ aesimc(v0, v0); 2174 2175 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2176 __ rev32(v1, __ T16B, v1); 2177 __ rev32(v2, __ T16B, v2); 2178 2179 __ cmpw(keylen, 44); 2180 __ br(Assembler::EQ, L_doLast); 2181 2182 __ aesd(v0, v1); 2183 __ aesimc(v0, v0); 2184 __ aesd(v0, v2); 2185 __ aesimc(v0, v0); 2186 2187 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2188 __ rev32(v1, __ T16B, v1); 2189 __ rev32(v2, __ T16B, v2); 2190 2191 __ cmpw(keylen, 52); 2192 __ br(Assembler::EQ, L_doLast); 2193 2194 __ aesd(v0, v1); 2195 __ aesimc(v0, v0); 2196 __ aesd(v0, v2); 2197 __ aesimc(v0, v0); 2198 2199 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2200 __ rev32(v1, __ T16B, v1); 2201 __ rev32(v2, __ T16B, v2); 2202 2203 __ BIND(L_doLast); 2204 2205 __ aesd(v0, v1); 2206 __ aesimc(v0, v0); 2207 __ aesd(v0, v2); 2208 2209 __ eor(v0, __ T16B, v0, v5); 2210 2211 __ st1(v0, __ T16B, to); 2212 2213 __ mov(r0, 0); 2214 2215 __ leave(); 
2216 __ ret(lr); 2217 2218 return start; 2219 } 2220 2221 // Arguments: 2222 // 2223 // Inputs: 2224 // c_rarg0 - source byte array address 2225 // c_rarg1 - destination byte array address 2226 // c_rarg2 - K (key) in little endian int array 2227 // c_rarg3 - r vector byte array address 2228 // c_rarg4 - input length 2229 // 2230 // Output: 2231 // x0 - input length 2232 // 2233 address generate_cipherBlockChaining_encryptAESCrypt() { 2234 assert(UseAES, "need AES instructions and misaligned SSE support"); 2235 __ align(CodeEntryAlignment); 2236 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 2237 2238 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2239 2240 const Register from = c_rarg0; // source array address 2241 const Register to = c_rarg1; // destination array address 2242 const Register key = c_rarg2; // key array address 2243 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2244 // and left with the results of the last encryption block 2245 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2246 const Register keylen = rscratch1; 2247 2248 address start = __ pc(); 2249 __ enter(); 2250 2251 __ mov(rscratch2, len_reg); 2252 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2253 2254 __ ld1(v0, __ T16B, rvec); 2255 2256 __ cmpw(keylen, 52); 2257 __ br(Assembler::CC, L_loadkeys_44); 2258 __ br(Assembler::EQ, L_loadkeys_52); 2259 2260 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2261 __ rev32(v17, __ T16B, v17); 2262 __ rev32(v18, __ T16B, v18); 2263 __ BIND(L_loadkeys_52); 2264 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2265 __ rev32(v19, __ T16B, v19); 2266 __ rev32(v20, __ T16B, v20); 2267 __ BIND(L_loadkeys_44); 2268 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2269 __ rev32(v21, __ T16B, v21); 2270 __ rev32(v22, __ T16B, v22); 2271 __ rev32(v23, __ T16B, v23); 2272 __ rev32(v24, __ T16B, v24); 2273 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2274 __ rev32(v25, __ T16B, v25); 2275 __ rev32(v26, __ T16B, v26); 2276 __ rev32(v27, __ T16B, v27); 2277 __ rev32(v28, __ T16B, v28); 2278 __ ld1(v29, v30, v31, __ T16B, key); 2279 __ rev32(v29, __ T16B, v29); 2280 __ rev32(v30, __ T16B, v30); 2281 __ rev32(v31, __ T16B, v31); 2282 2283 __ BIND(L_aes_loop); 2284 __ ld1(v1, __ T16B, __ post(from, 16)); 2285 __ eor(v0, __ T16B, v0, v1); 2286 2287 __ br(Assembler::CC, L_rounds_44); 2288 __ br(Assembler::EQ, L_rounds_52); 2289 2290 __ aese(v0, v17); __ aesmc(v0, v0); 2291 __ aese(v0, v18); __ aesmc(v0, v0); 2292 __ BIND(L_rounds_52); 2293 __ aese(v0, v19); __ aesmc(v0, v0); 2294 __ aese(v0, v20); __ aesmc(v0, v0); 2295 __ BIND(L_rounds_44); 2296 __ aese(v0, v21); __ aesmc(v0, v0); 2297 __ aese(v0, v22); __ aesmc(v0, v0); 2298 __ aese(v0, v23); __ aesmc(v0, v0); 2299 __ aese(v0, v24); __ aesmc(v0, v0); 2300 __ aese(v0, v25); __ aesmc(v0, v0); 2301 __ aese(v0, v26); __ aesmc(v0, v0); 2302 __ aese(v0, v27); __ aesmc(v0, v0); 2303 __ aese(v0, v28); __ aesmc(v0, v0); 2304 __ aese(v0, v29); __ aesmc(v0, v0); 2305 __ aese(v0, v30); 2306 __ eor(v0, __ T16B, v0, v31); 2307 2308 __ st1(v0, __ T16B, __ post(to, 16)); 2309 __ sub(len_reg, len_reg, 16); 2310 __ cbnz(len_reg, L_aes_loop); 2311 2312 __ st1(v0, __ T16B, rvec); 2313 2314 __ mov(r0, rscratch2); 2315 2316 __ leave(); 2317 __ ret(lr); 2318 2319 return start; 2320 } 2321 2322 // Arguments: 2323 // 2324 // Inputs: 2325 // c_rarg0 - source 
byte array address 2326 // c_rarg1 - destination byte array address 2327 // c_rarg2 - K (key) in little endian int array 2328 // c_rarg3 - r vector byte array address 2329 // c_rarg4 - input length 2330 // 2331 // Output: 2332 // r0 - input length 2333 // 2334 address generate_cipherBlockChaining_decryptAESCrypt() { 2335 assert(UseAES, "need AES instructions and misaligned SSE support"); 2336 __ align(CodeEntryAlignment); 2337 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 2338 2339 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2340 2341 const Register from = c_rarg0; // source array address 2342 const Register to = c_rarg1; // destination array address 2343 const Register key = c_rarg2; // key array address 2344 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2345 // and left with the results of the last encryption block 2346 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2347 const Register keylen = rscratch1; 2348 2349 address start = __ pc(); 2350 __ enter(); 2351 2352 __ mov(rscratch2, len_reg); 2353 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2354 2355 __ ld1(v2, __ T16B, rvec); 2356 2357 __ ld1(v31, __ T16B, __ post(key, 16)); 2358 __ rev32(v31, __ T16B, v31); 2359 2360 __ cmpw(keylen, 52); 2361 __ br(Assembler::CC, L_loadkeys_44); 2362 __ br(Assembler::EQ, L_loadkeys_52); 2363 2364 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2365 __ rev32(v17, __ T16B, v17); 2366 __ rev32(v18, __ T16B, v18); 2367 __ BIND(L_loadkeys_52); 2368 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2369 __ rev32(v19, __ T16B, v19); 2370 __ rev32(v20, __ T16B, v20); 2371 __ BIND(L_loadkeys_44); 2372 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2373 __ rev32(v21, __ T16B, v21); 2374 __ rev32(v22, __ T16B, v22); 2375 __ rev32(v23, __ T16B, v23); 2376 __ rev32(v24, __ T16B, v24); 2377 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2378 __ rev32(v25, __ T16B, v25); 2379 __ rev32(v26, __ T16B, v26); 2380 __ rev32(v27, __ T16B, v27); 2381 __ rev32(v28, __ T16B, v28); 2382 __ ld1(v29, v30, __ T16B, key); 2383 __ rev32(v29, __ T16B, v29); 2384 __ rev32(v30, __ T16B, v30); 2385 2386 __ BIND(L_aes_loop); 2387 __ ld1(v0, __ T16B, __ post(from, 16)); 2388 __ orr(v1, __ T16B, v0, v0); 2389 2390 __ br(Assembler::CC, L_rounds_44); 2391 __ br(Assembler::EQ, L_rounds_52); 2392 2393 __ aesd(v0, v17); __ aesimc(v0, v0); 2394 __ aesd(v0, v18); __ aesimc(v0, v0); 2395 __ BIND(L_rounds_52); 2396 __ aesd(v0, v19); __ aesimc(v0, v0); 2397 __ aesd(v0, v20); __ aesimc(v0, v0); 2398 __ BIND(L_rounds_44); 2399 __ aesd(v0, v21); __ aesimc(v0, v0); 2400 __ aesd(v0, v22); __ aesimc(v0, v0); 2401 __ aesd(v0, v23); __ aesimc(v0, v0); 2402 __ aesd(v0, v24); __ aesimc(v0, v0); 2403 __ aesd(v0, v25); __ aesimc(v0, v0); 2404 __ aesd(v0, v26); __ aesimc(v0, v0); 2405 __ aesd(v0, v27); __ aesimc(v0, v0); 2406 __ aesd(v0, v28); __ aesimc(v0, v0); 2407 __ aesd(v0, v29); __ aesimc(v0, v0); 2408 __ aesd(v0, v30); 2409 __ eor(v0, __ T16B, v0, v31); 2410 __ eor(v0, __ T16B, v0, v2); 2411 2412 __ st1(v0, __ T16B, __ post(to, 16)); 2413 __ orr(v2, __ T16B, v1, v1); 2414 2415 __ sub(len_reg, len_reg, 16); 2416 __ cbnz(len_reg, L_aes_loop); 2417 2418 __ st1(v2, __ T16B, rvec); 2419 2420 __ mov(r0, rscratch2); 2421 2422 __ leave(); 2423 __ ret(lr); 2424 2425 return start; 2426 } 2427 2428 // Arguments: 2429 // 2430 // Inputs: 2431 // 
c_rarg0 - byte[] source+offset 2432 // c_rarg1 - int[] SHA.state 2433 // c_rarg2 - int offset 2434 // c_rarg3 - int limit 2435 // 2436 address generate_sha1_implCompress(bool multi_block, const char *name) { 2437 __ align(CodeEntryAlignment); 2438 StubCodeMark mark(this, "StubRoutines", name); 2439 address start = __ pc(); 2440 2441 Register buf = c_rarg0; 2442 Register state = c_rarg1; 2443 Register ofs = c_rarg2; 2444 Register limit = c_rarg3; 2445 2446 Label keys; 2447 Label sha1_loop; 2448 2449 // load the keys into v0..v3 2450 __ adr(rscratch1, keys); 2451 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 2452 // load 5 words state into v6, v7 2453 __ ldrq(v6, Address(state, 0)); 2454 __ ldrs(v7, Address(state, 16)); 2455 2456 2457 __ BIND(sha1_loop); 2458 // load 64 bytes of data into v16..v19 2459 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 2460 __ rev32(v16, __ T16B, v16); 2461 __ rev32(v17, __ T16B, v17); 2462 __ rev32(v18, __ T16B, v18); 2463 __ rev32(v19, __ T16B, v19); 2464 2465 // do the sha1 2466 __ addv(v4, __ T4S, v16, v0); 2467 __ orr(v20, __ T16B, v6, v6); 2468 2469 FloatRegister d0 = v16; 2470 FloatRegister d1 = v17; 2471 FloatRegister d2 = v18; 2472 FloatRegister d3 = v19; 2473 2474 for (int round = 0; round < 20; round++) { 2475 FloatRegister tmp1 = (round & 1) ? v4 : v5; 2476 FloatRegister tmp2 = (round & 1) ? v21 : v22; 2477 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 2478 FloatRegister tmp4 = (round & 1) ? v5 : v4; 2479 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3)); 2480 2481 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 2482 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 2483 __ sha1h(tmp2, __ T4S, v20); 2484 if (round < 5) 2485 __ sha1c(v20, __ T4S, tmp3, tmp4); 2486 else if (round < 10 || round >= 15) 2487 __ sha1p(v20, __ T4S, tmp3, tmp4); 2488 else 2489 __ sha1m(v20, __ T4S, tmp3, tmp4); 2490 if (round < 16) __ sha1su1(d0, __ T4S, d3); 2491 2492 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 2493 } 2494 2495 __ addv(v7, __ T2S, v7, v21); 2496 __ addv(v6, __ T4S, v6, v20); 2497 2498 if (multi_block) { 2499 __ add(ofs, ofs, 64); 2500 __ cmp(ofs, limit); 2501 __ br(Assembler::LE, sha1_loop); 2502 __ mov(c_rarg0, ofs); // return ofs 2503 } 2504 2505 __ strq(v6, Address(state, 0)); 2506 __ strs(v7, Address(state, 16)); 2507 2508 __ ret(lr); 2509 2510 __ bind(keys); 2511 __ emit_int32(0x5a827999); 2512 __ emit_int32(0x6ed9eba1); 2513 __ emit_int32(0x8f1bbcdc); 2514 __ emit_int32(0xca62c1d6); 2515 2516 return start; 2517 } 2518 2519 2520 // Arguments: 2521 // 2522 // Inputs: 2523 // c_rarg0 - byte[] source+offset 2524 // c_rarg1 - int[] SHA.state 2525 // c_rarg2 - int offset 2526 // c_rarg3 - int limit 2527 // 2528 address generate_sha256_implCompress(bool multi_block, const char *name) { 2529 static const uint32_t round_consts[64] = { 2530 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 2531 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 2532 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 2533 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 2534 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 2535 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 2536 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 2537 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 2538 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 2539 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 2540 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 2541 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 2542 0x19a4c116, 
0x1e376c08, 0x2748774c, 0x34b0bcb5,
2543       0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
2544       0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
2545       0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
2546     };
2547     __ align(CodeEntryAlignment);
2548     StubCodeMark mark(this, "StubRoutines", name);
2549     address start = __ pc();
2550
2551     Register buf   = c_rarg0;
2552     Register state = c_rarg1;
2553     Register ofs   = c_rarg2;
2554     Register limit = c_rarg3;
2555
2556     Label sha1_loop;
2557
2558     __ stpd(v8, v9, __ pre(sp, -32));
2559     __ stpd(v10, v11, Address(sp, 16));
2560
2561     // dga == v0
2562     // dgb == v1
2563     // dg0 == v2
2564     // dg1 == v3
2565     // dg2 == v4
2566     // t0 == v6
2567     // t1 == v7
2568
2569     // load 16 keys to v16..v31
2570     __ lea(rscratch1, ExternalAddress((address)round_consts));
2571     __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
2572     __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
2573     __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
2574     __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);
2575
2576     // load 8 words (256 bits) state
2577     __ ldpq(v0, v1, state);
2578
2579     __ BIND(sha1_loop);
2580     // load 64 bytes of data into v8..v11
2581     __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
2582     __ rev32(v8, __ T16B, v8);
2583     __ rev32(v9, __ T16B, v9);
2584     __ rev32(v10, __ T16B, v10);
2585     __ rev32(v11, __ T16B, v11);
2586
2587     __ addv(v6, __ T4S, v8, v16);
2588     __ orr(v2, __ T16B, v0, v0);
2589     __ orr(v3, __ T16B, v1, v1);
2590
2591     FloatRegister d0 = v8;
2592     FloatRegister d1 = v9;
2593     FloatRegister d2 = v10;
2594     FloatRegister d3 = v11;
2595
2596
2597     for (int round = 0; round < 16; round++) {
2598       FloatRegister tmp1 = (round & 1) ? v6 : v7;
2599       FloatRegister tmp2 = (round & 1) ? v7 : v6;
2600       FloatRegister tmp3 = (round & 1) ? v2 : v4;
2601       FloatRegister tmp4 = (round & 1) ? v4 : v2;
2602
2603       if (round < 12) __ sha256su0(d0, __ T4S, d1);
2604       __ orr(v4, __ T16B, v2, v2);
2605       if (round < 15)
2606         __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
2607       __ sha256h(v2, __ T4S, v3, tmp2);
2608       __ sha256h2(v3, __ T4S, v4, tmp2);
2609       if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
2610
2611       tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
2612     }
2613
2614     __ addv(v0, __ T4S, v0, v2);
2615     __ addv(v1, __ T4S, v1, v3);
2616
2617     if (multi_block) {
2618       __ add(ofs, ofs, 64);
2619       __ cmp(ofs, limit);
2620       __ br(Assembler::LE, sha1_loop);
2621       __ mov(c_rarg0, ofs); // return ofs
2622     }
2623
2624     __ ldpd(v10, v11, Address(sp, 16));
2625     __ ldpd(v8, v9, __ post(sp, 32));
2626
2627     __ stpq(v0, v1, state);
2628
2629     __ ret(lr);
2630
2631     return start;
2632   }
2633
2634 #ifndef BUILTIN_SIM
2635   // Safefetch stubs.
2636   void generate_safefetch(const char* name, int size, address* entry,
2637                           address* fault_pc, address* continuation_pc) {
2638     // safefetch signatures:
2639     //   int      SafeFetch32(int*      adr, int      errValue);
2640     //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
2641     //
2642     // arguments:
2643     //   c_rarg0 = adr
2644     //   c_rarg1 = errValue
2645     //
2646     // result:
2647     //   r0 = *adr or errValue
2648
2649     StubCodeMark mark(this, "StubRoutines", name);
2650
2651     // Entry point, pc or function descriptor.
2652     *entry = __ pc();
2653
2654     // Load *adr into c_rarg1, may fault.
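    // (How this works, in outline: the VM's signal handler recognizes
    // fault_pc and resumes execution at continuation_pc with errValue
    // still in c_rarg1, so the net effect is roughly
    //   return can_read(adr) ? *adr : errValue;
    // where can_read is hypothetical shorthand for "the load below did
    // not fault".)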
2655     *fault_pc = __ pc();
2656     switch (size) {
2657       case 4:
2658         // int32_t
2659         __ ldrw(c_rarg1, Address(c_rarg0, 0));
2660         break;
2661       case 8:
2662         // int64_t
2663         __ ldr(c_rarg1, Address(c_rarg0, 0));
2664         break;
2665       default:
2666         ShouldNotReachHere();
2667     }
2668
2669     // return errValue or *adr
2670     *continuation_pc = __ pc();
2671     __ mov(r0, c_rarg1);
2672     __ ret(lr);
2673   }
2674 #endif
2675
2676   /**
2677    * Arguments:
2678    *
2679    * Inputs:
2680    *   c_rarg0 - int crc
2681    *   c_rarg1 - byte* buf
2682    *   c_rarg2 - int length
2683    *
2684    * Output:
2685    *   r0 - int crc result
2686    */
2687   address generate_updateBytesCRC32() {
2688     assert(UseCRC32Intrinsics, "what are we doing here?");
2689
2690     __ align(CodeEntryAlignment);
2691     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
2692
2693     address start = __ pc();
2694
2695     const Register crc    = c_rarg0;  // crc
2696     const Register buf    = c_rarg1;  // source java byte array address
2697     const Register len    = c_rarg2;  // length
2698     const Register table0 = c_rarg3;  // crc_table address
2699     const Register table1 = c_rarg4;
2700     const Register table2 = c_rarg5;
2701     const Register table3 = c_rarg6;
2702     const Register tmp3   = c_rarg7;
2703
2704     BLOCK_COMMENT("Entry:");
2705     __ enter(); // required for proper stackwalking of RuntimeStub frame
2706
2707     __ kernel_crc32(crc, buf, len,
2708               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
2709
2710     __ leave(); // required for proper stackwalking of RuntimeStub frame
2711     __ ret(lr);
2712
2713     return start;
2714   }
2715
2716   /**
2717    * Arguments:
2718    *
2719    * Inputs:
2720    *   c_rarg0 - int crc
2721    *   c_rarg1 - byte* buf
2722    *   c_rarg2 - int length
2723    *   c_rarg3 - int* table
2724    *
2725    * Output:
2726    *   r0 - int crc result
2727    */
2728   address generate_updateBytesCRC32C() {
2729     assert(UseCRC32CIntrinsics, "what are we doing here?");
2730
2731     __ align(CodeEntryAlignment);
2732     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
2733
2734     address start = __ pc();
2735
2736     const Register crc    = c_rarg0;  // crc
2737     const Register buf    = c_rarg1;  // source java byte array address
2738     const Register len    = c_rarg2;  // length
2739     const Register table0 = c_rarg3;  // crc_table address
2740     const Register table1 = c_rarg4;
2741     const Register table2 = c_rarg5;
2742     const Register table3 = c_rarg6;
2743     const Register tmp3   = c_rarg7;
2744
2745     BLOCK_COMMENT("Entry:");
2746     __ enter(); // required for proper stackwalking of RuntimeStub frame
2747
2748     __ kernel_crc32c(crc, buf, len,
2749               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
2750
2751     __ leave(); // required for proper stackwalking of RuntimeStub frame
2752     __ ret(lr);
2753
2754     return start;
2755   }
2756
2757   /**
2758    * Arguments:
2759    *
2760    * Inputs:
2761    *   c_rarg0 - int adler
2762    *   c_rarg1 - byte* buff
2763    *   c_rarg2 - int len
2764    *
2765    * Output:
2766    *   c_rarg0 - int adler result
2767    */
2768   address generate_updateBytesAdler32() {
2769     __ align(CodeEntryAlignment);
2770     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
2771     address start = __ pc();
2772
2773     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
2774
2775     // Aliases
2776     Register adler = c_rarg0;
2777     Register s1    = c_rarg0;
2778     Register s2    = c_rarg3;
2779     Register buff  = c_rarg1;
2780     Register len   = c_rarg2;
2781     Register nmax  = r4;
2782     Register base  = r5;
2783     Register count = r6;
2784     Register temp0 = rscratch1;
2785     Register temp1 = rscratch2;
2786     Register temp2 = r7;
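    // (Background, for reading the code below: Adler-32 maintains
    //   s1 = 1 + b[0] + b[1] + ...             (mod 65521)
    //   s2 = s1 values summed after each byte  (mod 65521)
    // and returns (s2 << 16) | s1. Since 2^16 mod 65521 == 15, a sum x
    // can be partially reduced without division as
    //   x = (x & 0xffff) + 15 * (x >> 16);
    // the code applies this folding twice and finishes with a single
    // conditional subtract of 65521.)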
2787
2788     // Max number of bytes we can process before having to take the mod
2789     // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
2790     unsigned long BASE = 0xfff1;
2791     unsigned long NMAX = 0x15B0;
2792
2793     __ mov(base, BASE);
2794     __ mov(nmax, NMAX);
2795
2796     // s1 is initialized to the lower 16 bits of adler
2797     // s2 is initialized to the upper 16 bits of adler
2798     __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff)
2799     __ uxth(s1, adler);         // s1 = (adler & 0xffff)
2800
2801     // The pipelined loop needs at least 16 elements for one iteration;
2802     // it checks this itself, but branching straight to the cleanup loop is cheaper for short inputs
2803     __ cmp(len, 16);
2804     __ br(Assembler::HS, L_nmax);
2805     __ cbz(len, L_combine);
2806
2807     __ bind(L_simple_by1_loop);
2808     __ ldrb(temp0, Address(__ post(buff, 1)));
2809     __ add(s1, s1, temp0);
2810     __ add(s2, s2, s1);
2811     __ subs(len, len, 1);
2812     __ br(Assembler::HI, L_simple_by1_loop);
2813
2814     // s1 = s1 % BASE
2815     __ subs(temp0, s1, base);
2816     __ csel(s1, temp0, s1, Assembler::HS);
2817
2818     // s2 = s2 % BASE
2819     __ lsr(temp0, s2, 16);
2820     __ lsl(temp1, temp0, 4);
2821     __ sub(temp1, temp1, temp0);
2822     __ add(s2, temp1, s2, ext::uxth);
2823
2824     __ subs(temp0, s2, base);
2825     __ csel(s2, temp0, s2, Assembler::HS);
2826
2827     __ b(L_combine);
2828
2829     __ bind(L_nmax);
2830     __ subs(len, len, nmax);
2831     __ sub(count, nmax, 16);
2832     __ br(Assembler::LO, L_by16);
2833
2834     __ bind(L_nmax_loop);
2835
2836     __ ldp(temp0, temp1, Address(__ post(buff, 16)));
2837
2838     __ add(s1, s1, temp0, ext::uxtb);
2839     __ ubfx(temp2, temp0, 8, 8);
2840     __ add(s2, s2, s1);
2841     __ add(s1, s1, temp2);
2842     __ ubfx(temp2, temp0, 16, 8);
2843     __ add(s2, s2, s1);
2844     __ add(s1, s1, temp2);
2845     __ ubfx(temp2, temp0, 24, 8);
2846     __ add(s2, s2, s1);
2847     __ add(s1, s1, temp2);
2848     __ ubfx(temp2, temp0, 32, 8);
2849     __ add(s2, s2, s1);
2850     __ add(s1, s1, temp2);
2851     __ ubfx(temp2, temp0, 40, 8);
2852     __ add(s2, s2, s1);
2853     __ add(s1, s1, temp2);
2854     __ ubfx(temp2, temp0, 48, 8);
2855     __ add(s2, s2, s1);
2856     __ add(s1, s1, temp2);
2857     __ add(s2, s2, s1);
2858     __ add(s1, s1, temp0, Assembler::LSR, 56);
2859     __ add(s2, s2, s1);
2860
2861     __ add(s1, s1, temp1, ext::uxtb);
2862     __ ubfx(temp2, temp1, 8, 8);
2863     __ add(s2, s2, s1);
2864     __ add(s1, s1, temp2);
2865     __ ubfx(temp2, temp1, 16, 8);
2866     __ add(s2, s2, s1);
2867     __ add(s1, s1, temp2);
2868     __ ubfx(temp2, temp1, 24, 8);
2869     __ add(s2, s2, s1);
2870     __ add(s1, s1, temp2);
2871     __ ubfx(temp2, temp1, 32, 8);
2872     __ add(s2, s2, s1);
2873     __ add(s1, s1, temp2);
2874     __ ubfx(temp2, temp1, 40, 8);
2875     __ add(s2, s2, s1);
2876     __ add(s1, s1, temp2);
2877     __ ubfx(temp2, temp1, 48, 8);
2878     __ add(s2, s2, s1);
2879     __ add(s1, s1, temp2);
2880     __ add(s2, s2, s1);
2881     __ add(s1, s1, temp1, Assembler::LSR, 56);
2882     __ add(s2, s2, s1);
2883
2884     __ subs(count, count, 16);
2885     __ br(Assembler::HS, L_nmax_loop);
2886
2887     // s1 = s1 % BASE
2888     __ lsr(temp0, s1, 16);
2889     __ lsl(temp1, temp0, 4);
2890     __ sub(temp1, temp1, temp0);
2891     __ add(temp1, temp1, s1, ext::uxth);
2892
2893     __ lsr(temp0, temp1, 16);
2894     __ lsl(s1, temp0, 4);
2895     __ sub(s1, s1, temp0);
2896     __ add(s1, s1, temp1, ext::uxth);
2897
2898     __ subs(temp0, s1, base);
2899     __ csel(s1, temp0, s1, Assembler::HS);
2900
2901     // s2 = s2 % BASE
2902     __ lsr(temp0, s2, 16);
2903     __ lsl(temp1, temp0, 4);
2904     __ sub(temp1, temp1, temp0);
2905     __ add(temp1, temp1, s2, ext::uxth);
2906
2907     __ lsr(temp0, temp1, 16);
2908     __ lsl(s2, temp0, 4);
2909     __ sub(s2, s2, temp0);
2910     __ add(s2, s2, temp1, ext::uxth);
2911
2912     __ subs(temp0, s2, base);
2913     __ csel(s2, temp0, s2, Assembler::HS);
2914
2915     __ subs(len, len, nmax);
2916     __ sub(count, nmax, 16);
2917     __ br(Assembler::HS, L_nmax_loop);
2918
2919     __ bind(L_by16);
2920     __ adds(len, len, count);
2921     __ br(Assembler::LO, L_by1);
2922
2923     __ bind(L_by16_loop);
2924
2925     __ ldp(temp0, temp1, Address(__ post(buff, 16)));
2926
2927     __ add(s1, s1, temp0, ext::uxtb);
2928     __ ubfx(temp2, temp0, 8, 8);
2929     __ add(s2, s2, s1);
2930     __ add(s1, s1, temp2);
2931     __ ubfx(temp2, temp0, 16, 8);
2932     __ add(s2, s2, s1);
2933     __ add(s1, s1, temp2);
2934     __ ubfx(temp2, temp0, 24, 8);
2935     __ add(s2, s2, s1);
2936     __ add(s1, s1, temp2);
2937     __ ubfx(temp2, temp0, 32, 8);
2938     __ add(s2, s2, s1);
2939     __ add(s1, s1, temp2);
2940     __ ubfx(temp2, temp0, 40, 8);
2941     __ add(s2, s2, s1);
2942     __ add(s1, s1, temp2);
2943     __ ubfx(temp2, temp0, 48, 8);
2944     __ add(s2, s2, s1);
2945     __ add(s1, s1, temp2);
2946     __ add(s2, s2, s1);
2947     __ add(s1, s1, temp0, Assembler::LSR, 56);
2948     __ add(s2, s2, s1);
2949
2950     __ add(s1, s1, temp1, ext::uxtb);
2951     __ ubfx(temp2, temp1, 8, 8);
2952     __ add(s2, s2, s1);
2953     __ add(s1, s1, temp2);
2954     __ ubfx(temp2, temp1, 16, 8);
2955     __ add(s2, s2, s1);
2956     __ add(s1, s1, temp2);
2957     __ ubfx(temp2, temp1, 24, 8);
2958     __ add(s2, s2, s1);
2959     __ add(s1, s1, temp2);
2960     __ ubfx(temp2, temp1, 32, 8);
2961     __ add(s2, s2, s1);
2962     __ add(s1, s1, temp2);
2963     __ ubfx(temp2, temp1, 40, 8);
2964     __ add(s2, s2, s1);
2965     __ add(s1, s1, temp2);
2966     __ ubfx(temp2, temp1, 48, 8);
2967     __ add(s2, s2, s1);
2968     __ add(s1, s1, temp2);
2969     __ add(s2, s2, s1);
2970     __ add(s1, s1, temp1, Assembler::LSR, 56);
2971     __ add(s2, s2, s1);
2972
2973     __ subs(len, len, 16);
2974     __ br(Assembler::HS, L_by16_loop);
2975
2976     __ bind(L_by1);
2977     __ adds(len, len, 15);
2978     __ br(Assembler::LO, L_do_mod);
2979
2980     __ bind(L_by1_loop);
2981     __ ldrb(temp0, Address(__ post(buff, 1)));
2982     __ add(s1, temp0, s1);
2983     __ add(s2, s2, s1);
2984     __ subs(len, len, 1);
2985     __ br(Assembler::HS, L_by1_loop);
2986
2987     __ bind(L_do_mod);
2988     // s1 = s1 % BASE
2989     __ lsr(temp0, s1, 16);
2990     __ lsl(temp1, temp0, 4);
2991     __ sub(temp1, temp1, temp0);
2992     __ add(temp1, temp1, s1, ext::uxth);
2993
2994     __ lsr(temp0, temp1, 16);
2995     __ lsl(s1, temp0, 4);
2996     __ sub(s1, s1, temp0);
2997     __ add(s1, s1, temp1, ext::uxth);
2998
2999     __ subs(temp0, s1, base);
3000     __ csel(s1, temp0, s1, Assembler::HS);
3001
3002     // s2 = s2 % BASE
3003     __ lsr(temp0, s2, 16);
3004     __ lsl(temp1, temp0, 4);
3005     __ sub(temp1, temp1, temp0);
3006     __ add(temp1, temp1, s2, ext::uxth);
3007
3008     __ lsr(temp0, temp1, 16);
3009     __ lsl(s2, temp0, 4);
3010     __ sub(s2, s2, temp0);
3011     __ add(s2, s2, temp1, ext::uxth);
3012
3013     __ subs(temp0, s2, base);
3014     __ csel(s2, temp0, s2, Assembler::HS);
3015
3016     // Combine lower bits and higher bits
3017     __ bind(L_combine);
3018     __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3019
3020     __ ret(lr);
3021
3022     return start;
3023   }
3024
3025   /**
3026    * Arguments:
3027    *
3028    * Input:
3029    *   c_rarg0 - x address
3030    *   c_rarg1 - x length
3031    *   c_rarg2 - y address
3032    *   c_rarg3 - y length
3033    *   c_rarg4 - z address
3034    *   c_rarg5 - z length
3035    */
3036   address generate_multiplyToLen() {
3037     __ align(CodeEntryAlignment);
3038     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3039
3040     address start = __ pc();
3041     const Register x = r0;
3042 const Register xlen = r1; 3043 const Register y = r2; 3044 const Register ylen = r3; 3045 const Register z = r4; 3046 const Register zlen = r5; 3047 3048 const Register tmp1 = r10; 3049 const Register tmp2 = r11; 3050 const Register tmp3 = r12; 3051 const Register tmp4 = r13; 3052 const Register tmp5 = r14; 3053 const Register tmp6 = r15; 3054 const Register tmp7 = r16; 3055 3056 BLOCK_COMMENT("Entry:"); 3057 __ enter(); // required for proper stackwalking of RuntimeStub frame 3058 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 3059 __ leave(); // required for proper stackwalking of RuntimeStub frame 3060 __ ret(lr); 3061 3062 return start; 3063 } 3064 3065 void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi, 3066 FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0, 3067 FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) { 3068 // Karatsuba multiplication performs a 128*128 -> 256-bit 3069 // multiplication in three 128-bit multiplications and a few 3070 // additions. 3071 // 3072 // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1) 3073 // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0 3074 // 3075 // Inputs: 3076 // 3077 // A0 in a.d[0] (subkey) 3078 // A1 in a.d[1] 3079 // (A1+A0) in a1_xor_a0.d[0] 3080 // 3081 // B0 in b.d[0] (state) 3082 // B1 in b.d[1] 3083 3084 __ ext(tmp1, __ T16B, b, b, 0x08); 3085 __ pmull2(result_hi, __ T1Q, b, a, __ T2D); // A1*B1 3086 __ eor(tmp1, __ T16B, tmp1, b); // (B1+B0) 3087 __ pmull(result_lo, __ T1Q, b, a, __ T1D); // A0*B0 3088 __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0) 3089 3090 __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08); 3091 __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0 3092 __ eor(tmp2, __ T16B, tmp2, tmp4); 3093 __ eor(tmp2, __ T16B, tmp2, tmp3); 3094 3095 // Register pair <result_hi:result_lo> holds the result of carry-less multiplication 3096 __ ins(result_hi, __ D, tmp2, 0, 1); 3097 __ ins(result_lo, __ D, tmp2, 1, 0); 3098 } 3099 3100 void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi, 3101 FloatRegister p, FloatRegister z, FloatRegister t1) { 3102 const FloatRegister t0 = result; 3103 3104 // The GCM field polynomial f is z^128 + p(z), where p = 3105 // z^7+z^2+z+1. 3106 // 3107 // z^128 === -p(z) (mod (z^128 + p(z))) 3108 // 3109 // so, given that the product we're reducing is 3110 // a == lo + hi * z^128 3111 // substituting, 3112 // === lo - hi * p(z) (mod (z^128 + p(z))) 3113 // 3114 // we reduce by multiplying hi by p(z) and subtracting the result 3115 // from (i.e. XORing it with) lo. Because p has no nonzero high 3116 // bits we can do this with two 64-bit multiplications, lo*p and 3117 // hi*p. 3118 3119 __ pmull2(t0, __ T1Q, hi, p, __ T2D); 3120 __ ext(t1, __ T16B, t0, z, 8); 3121 __ eor(hi, __ T16B, hi, t1); 3122 __ ext(t1, __ T16B, z, t0, 8); 3123 __ eor(lo, __ T16B, lo, t1); 3124 __ pmull(t0, __ T1Q, hi, p, __ T1D); 3125 __ eor(result, __ T16B, lo, t0); 3126 } 3127 3128 /** 3129 * Arguments: 3130 * 3131 * Input: 3132 * c_rarg0 - current state address 3133 * c_rarg1 - H key address 3134 * c_rarg2 - data address 3135 * c_rarg3 - number of blocks 3136 * 3137 * Output: 3138 * Updated state at c_rarg0 3139 */ 3140 address generate_ghash_processBlocks() { 3141 // Bafflingly, GCM uses little-endian for the byte order, but 3142 // big-endian for the bit order. 
For example, the polynomial 1 is
3143     // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
3144     //
3145     // So, we must either reverse the bytes in each word and do
3146     // everything big-endian or reverse the bits in each byte and do
3147     // it little-endian.  On AArch64 it's more idiomatic to reverse
3148     // the bits in each byte (we have an instruction, RBIT, to do
3149     // that) and keep the data in little-endian bit order throughout the
3150     // calculation, bit-reversing the inputs and outputs.
3151
3152     StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
3153     __ align(wordSize * 2);
3154     address p = __ pc();
3155     __ emit_int64(0x87);  // The low-order bits of the field
3156                           // polynomial (i.e. p = z^7+z^2+z+1)
3157                           // repeated in the low and high parts of a
3158                           // 128-bit vector
3159     __ emit_int64(0x87);
3160
3161     __ align(CodeEntryAlignment);
3162     address start = __ pc();
3163
3164     Register state   = c_rarg0;
3165     Register subkeyH = c_rarg1;
3166     Register data    = c_rarg2;
3167     Register blocks  = c_rarg3;
3168
3169     FloatRegister vzr = v30;
3170     __ eor(vzr, __ T16B, vzr, vzr); // zero register
3171
3172     __ ldrq(v0, Address(state));
3173     __ ldrq(v1, Address(subkeyH));
3174
3175     __ rev64(v0, __ T16B, v0);      // Bit-reverse words in state and subkeyH
3176     __ rbit(v0, __ T16B, v0);
3177     __ rev64(v1, __ T16B, v1);
3178     __ rbit(v1, __ T16B, v1);
3179
3180     __ ldrq(v26, p);
3181
3182     __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
3183     __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
3184
3185     {
3186       Label L_ghash_loop;
3187       __ bind(L_ghash_loop);
3188
3189       __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
3190                                                  // reversing each byte
3191       __ rbit(v2, __ T16B, v2);
3192       __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state
3193
3194       // Multiply state in v2 by subkey in v1
3195       ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
3196                      /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
3197                      /*temps*/v6, v20, v18, v21);
3198       // Reduce v7:v5 by the field polynomial
3199       ghash_reduce(v0, v5, v7, v26, vzr, v20);
3200
3201       __ sub(blocks, blocks, 1);
3202       __ cbnz(blocks, L_ghash_loop);
3203     }
3204
3205     // The bit-reversed result is at this point in v0
3206     __ rev64(v1, __ T16B, v0);
3207     __ rbit(v1, __ T16B, v1);
3208
3209     __ st1(v1, __ T16B, state);
3210     __ ret(lr);
3211
3212     return start;
3213   }
3214
3215   // Continuation point for throwing of implicit exceptions that are
3216   // not handled in the current activation. Fabricates an exception
3217   // oop and initiates normal exception dispatching in this
3218   // frame. Since we need to preserve callee-saved values (currently
3219   // only for C2, but done for C1 as well) we need a callee-saved oop
3220   // map and therefore have to make these stubs into RuntimeStubs
3221   // rather than BufferBlobs.  If the compiler needs all registers to
3222   // be preserved between the fault point and the exception handler
3223   // then it must assume responsibility for that in
3224   // AbstractCompiler::continuation_for_implicit_null_exception or
3225   // continuation_for_implicit_division_by_zero_exception.  All other
3226   // implicit exceptions (e.g., NullPointerException or
3227   // AbstractMethodError on entry) are either at call sites or
3228   // otherwise assume that stack unwinding will be initiated, so
3229   // caller saved registers were assumed volatile in the compiler.
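  // (In outline -- an illustrative reading of the stub generated below:
  // it builds a minimal frame, records last_Java_sp/fp, calls
  // runtime_entry(thread[, arg1[, arg2]]) -- which is expected to post a
  // pending exception rather than return a value -- and then jumps to
  // StubRoutines::forward_exception_entry() to dispatch it.)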
3230 3231 #undef __ 3232 #define __ masm-> 3233 3234 address generate_throw_exception(const char* name, 3235 address runtime_entry, 3236 Register arg1 = noreg, 3237 Register arg2 = noreg) { 3238 // Information about frame layout at time of blocking runtime call. 3239 // Note that we only have to preserve callee-saved registers since 3240 // the compilers are responsible for supplying a continuation point 3241 // if they expect all registers to be preserved. 3242 // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0 3243 enum layout { 3244 rfp_off = 0, 3245 rfp_off2, 3246 return_off, 3247 return_off2, 3248 framesize // inclusive of return address 3249 }; 3250 3251 int insts_size = 512; 3252 int locs_size = 64; 3253 3254 CodeBuffer code(name, insts_size, locs_size); 3255 OopMapSet* oop_maps = new OopMapSet(); 3256 MacroAssembler* masm = new MacroAssembler(&code); 3257 3258 address start = __ pc(); 3259 3260 // This is an inlined and slightly modified version of call_VM 3261 // which has the ability to fetch the return PC out of 3262 // thread-local storage and also sets up last_Java_sp slightly 3263 // differently than the real call_VM 3264 3265 __ enter(); // Save FP and LR before call 3266 3267 assert(is_even(framesize/2), "sp not 16-byte aligned"); 3268 3269 // lr and fp are already in place 3270 __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog 3271 3272 int frame_complete = __ pc() - start; 3273 3274 // Set up last_Java_sp and last_Java_fp 3275 address the_pc = __ pc(); 3276 __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1); 3277 3278 // Call runtime 3279 if (arg1 != noreg) { 3280 assert(arg2 != c_rarg1, "clobbered"); 3281 __ mov(c_rarg1, arg1); 3282 } 3283 if (arg2 != noreg) { 3284 __ mov(c_rarg2, arg2); 3285 } 3286 __ mov(c_rarg0, rthread); 3287 BLOCK_COMMENT("call runtime_entry"); 3288 __ mov(rscratch1, runtime_entry); 3289 __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1); 3290 3291 // Generate oop map 3292 OopMap* map = new OopMap(framesize, 0); 3293 3294 oop_maps->add_gc_map(the_pc - start, map); 3295 3296 __ reset_last_Java_frame(true, true); 3297 __ maybe_isb(); 3298 3299 __ leave(); 3300 3301 // check for pending exceptions 3302 #ifdef ASSERT 3303 Label L; 3304 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 3305 __ cbnz(rscratch1, L); 3306 __ should_not_reach_here(); 3307 __ bind(L); 3308 #endif // ASSERT 3309 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 3310 3311 3312 // codeBlob framesize is in words (not VMRegImpl::slot_size) 3313 RuntimeStub* stub = 3314 RuntimeStub::new_runtime_stub(name, 3315 &code, 3316 frame_complete, 3317 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 3318 oop_maps, false); 3319 return stub->entry_point(); 3320 } 3321 3322 class MontgomeryMultiplyGenerator : public MacroAssembler { 3323 3324 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 3325 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 3326 3327 RegSet _toSave; 3328 bool _squaring; 3329 3330 public: 3331 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 3332 : MacroAssembler(as->code()), _squaring(squaring) { 3333 3334 // Register allocation 3335 3336 Register reg = c_rarg0; 3337 Pa_base = reg; // Argument registers 3338 if (squaring) 3339 Pb_base = Pa_base; 3340 else 3341 Pb_base = ++reg; 3342 Pn_base = ++reg; 3343 Rlen= ++reg; 3344 inv = ++reg; 3345 Pm_base = ++reg; 3346 3347 // Working registers: 3348 Ra = ++reg; // The current digit of a, b, n, 
and m.
3349     Rb = ++reg;
3350     Rm = ++reg;
3351     Rn = ++reg;
3352
3353     Pa = ++reg;     // Pointers to the current/next digit of a, b, n, and m.
3354     Pb = ++reg;
3355     Pm = ++reg;
3356     Pn = ++reg;
3357
3358     t0 = ++reg;     // Three registers which form a
3359     t1 = ++reg;     // triple-precision accumulator.
3360     t2 = ++reg;
3361
3362     Ri = ++reg;     // Inner and outer loop indexes.
3363     Rj = ++reg;
3364
3365     Rhi_ab = ++reg; // Product registers: low and high parts
3366     Rlo_ab = ++reg; // of a*b and m*n.
3367     Rhi_mn = ++reg;
3368     Rlo_mn = ++reg;
3369
3370     // r19 and up are callee-saved.
3371     _toSave = RegSet::range(r19, reg) + Pm_base;
3372   }
3373
3374  private:
3375   void save_regs() {
3376     push(_toSave, sp);
3377   }
3378
3379   void restore_regs() {
3380     pop(_toSave, sp);
3381   }
3382
3383   template <typename T>
3384   void unroll_2(Register count, T block) {
3385     Label loop, end, odd;
3386     tbnz(count, 0, odd);
3387     cbz(count, end);
3388     align(16);
3389     bind(loop);
3390     (this->*block)();
3391     bind(odd);
3392     (this->*block)();
3393     subs(count, count, 2);
3394     br(Assembler::GT, loop);
3395     bind(end);
3396   }
3397
3398   template <typename T>
3399   void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
3400     Label loop, end, odd;
3401     tbnz(count, 0, odd);
3402     cbz(count, end);
3403     align(16);
3404     bind(loop);
3405     (this->*block)(d, s, tmp);
3406     bind(odd);
3407     (this->*block)(d, s, tmp);
3408     subs(count, count, 2);
3409     br(Assembler::GT, loop);
3410     bind(end);
3411   }
3412
3413   void pre1(RegisterOrConstant i) {
3414     block_comment("pre1");
3415     // Pa = Pa_base;
3416     // Pb = Pb_base + i;
3417     // Pm = Pm_base;
3418     // Pn = Pn_base + i;
3419     // Ra = *Pa;
3420     // Rb = *Pb;
3421     // Rm = *Pm;
3422     // Rn = *Pn;
3423     ldr(Ra, Address(Pa_base));
3424     ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
3425     ldr(Rm, Address(Pm_base));
3426     ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3427     lea(Pa, Address(Pa_base));
3428     lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
3429     lea(Pm, Address(Pm_base));
3430     lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
3431
3432     // Zero the m*n result.
3433     mov(Rhi_mn, zr);
3434     mov(Rlo_mn, zr);
3435   }
3436
3437   // The core multiply-accumulate step of a Montgomery
3438   // multiplication.  The idea is to schedule operations as a
3439   // pipeline so that instructions with long latencies (loads and
3440   // multiplies) have time to complete before their results are
3441   // used.  This benefits in-order implementations of the
3442   // architecture the most, but out-of-order cores gain as well.
3443   void step() {
3444     block_comment("step");
3445     // MACC(Ra, Rb, t0, t1, t2);
3446     // Ra = *++Pa;
3447     // Rb = *--Pb;
3448     umulh(Rhi_ab, Ra, Rb);
3449     mul(Rlo_ab, Ra, Rb);
3450     ldr(Ra, pre(Pa, wordSize));
3451     ldr(Rb, pre(Pb, -wordSize));
3452     acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
3453                                      // previous iteration.
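    // (In the pseudocode above and below, MACC(a, b, t0, t1, t2)
    // accumulates the 128-bit product a*b into the triple-precision
    // total t2:t1:t0; as a rough C sketch:
    //   unsigned __int128 p = (unsigned __int128)a * b;
    //   t0 += (uint64_t)p;               // adds
    //   t1 += (uint64_t)(p >> 64) + c;   // adcs, c = carry out of the add above
    //   t2 += c;                         // adc
    // which is exactly the umulh/mul pair plus the acc() helper defined
    // at the end of this class.)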
3454 // MACC(Rm, Rn, t0, t1, t2); 3455 // Rm = *++Pm; 3456 // Rn = *--Pn; 3457 umulh(Rhi_mn, Rm, Rn); 3458 mul(Rlo_mn, Rm, Rn); 3459 ldr(Rm, pre(Pm, wordSize)); 3460 ldr(Rn, pre(Pn, -wordSize)); 3461 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 3462 } 3463 3464 void post1() { 3465 block_comment("post1"); 3466 3467 // MACC(Ra, Rb, t0, t1, t2); 3468 // Ra = *++Pa; 3469 // Rb = *--Pb; 3470 umulh(Rhi_ab, Ra, Rb); 3471 mul(Rlo_ab, Ra, Rb); 3472 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 3473 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 3474 3475 // *Pm = Rm = t0 * inv; 3476 mul(Rm, t0, inv); 3477 str(Rm, Address(Pm)); 3478 3479 // MACC(Rm, Rn, t0, t1, t2); 3480 // t0 = t1; t1 = t2; t2 = 0; 3481 umulh(Rhi_mn, Rm, Rn); 3482 3483 #ifndef PRODUCT 3484 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 3485 { 3486 mul(Rlo_mn, Rm, Rn); 3487 add(Rlo_mn, t0, Rlo_mn); 3488 Label ok; 3489 cbz(Rlo_mn, ok); { 3490 stop("broken Montgomery multiply"); 3491 } bind(ok); 3492 } 3493 #endif 3494 // We have very carefully set things up so that 3495 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 3496 // the lower half of Rm * Rn because we know the result already: 3497 // it must be -t0. t0 + (-t0) must generate a carry iff 3498 // t0 != 0. So, rather than do a mul and an adds we just set 3499 // the carry flag iff t0 is nonzero. 3500 // 3501 // mul(Rlo_mn, Rm, Rn); 3502 // adds(zr, t0, Rlo_mn); 3503 subs(zr, t0, 1); // Set carry iff t0 is nonzero 3504 adcs(t0, t1, Rhi_mn); 3505 adc(t1, t2, zr); 3506 mov(t2, zr); 3507 } 3508 3509 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 3510 block_comment("pre2"); 3511 // Pa = Pa_base + i-len; 3512 // Pb = Pb_base + len; 3513 // Pm = Pm_base + i-len; 3514 // Pn = Pn_base + len; 3515 3516 if (i.is_register()) { 3517 sub(Rj, i.as_register(), len); 3518 } else { 3519 mov(Rj, i.as_constant()); 3520 sub(Rj, Rj, len); 3521 } 3522 // Rj == i-len 3523 3524 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 3525 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 3526 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 3527 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 3528 3529 // Ra = *++Pa; 3530 // Rb = *--Pb; 3531 // Rm = *++Pm; 3532 // Rn = *--Pn; 3533 ldr(Ra, pre(Pa, wordSize)); 3534 ldr(Rb, pre(Pb, -wordSize)); 3535 ldr(Rm, pre(Pm, wordSize)); 3536 ldr(Rn, pre(Pn, -wordSize)); 3537 3538 mov(Rhi_mn, zr); 3539 mov(Rlo_mn, zr); 3540 } 3541 3542 void post2(RegisterOrConstant i, RegisterOrConstant len) { 3543 block_comment("post2"); 3544 if (i.is_constant()) { 3545 mov(Rj, i.as_constant()-len.as_constant()); 3546 } else { 3547 sub(Rj, i.as_register(), len); 3548 } 3549 3550 adds(t0, t0, Rlo_mn); // The pending m*n, low part 3551 3552 // As soon as we know the least significant digit of our result, 3553 // store it. 3554 // Pm_base[i-len] = t0; 3555 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 3556 3557 // t0 = t1; t1 = t2; t2 = 0; 3558 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 3559 adc(t1, t2, zr); 3560 mov(t2, zr); 3561 } 3562 3563 // A carry in t0 after Montgomery multiplication means that we 3564 // should subtract multiples of n from our result in m. We'll 3565 // keep doing that until there is no carry. 
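  // (Context, roughly: Montgomery reduction only guarantees a result
  // less than 2n, not less than n, and here any excess shows up as a
  // carry left in t0; normalize() below therefore subtracts n and
  // repeats while a borrow remains.)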
    void normalize(RegisterOrConstant len) {
      block_comment("normalize");
      // while (t0)
      //   t0 = sub(Pm_base, Pn_base, t0, len);
      Label loop, post, again;
      Register cnt = t1, i = t2; // Re-use registers; we're done with them now
      cbz(t0, post); {
        bind(again); {
          mov(i, zr);
          mov(cnt, len);
          ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
          ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
          subs(zr, zr, zr); // set carry flag, i.e. no borrow
          align(16);
          bind(loop); {
            sbcs(Rm, Rm, Rn);
            str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
            add(i, i, 1);
            ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
            ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
            sub(cnt, cnt, 1);
          } cbnz(cnt, loop);
          sbc(t0, t0, zr);
        } cbnz(t0, again);
      } bind(post);
    }

    // Move memory at s to d, reversing words.
    //    Increments d to end of copied memory
    //    Destroys tmp1, tmp2
    //    Preserves len
    //    Leaves s pointing to the address which was in d at start
    void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
      assert(tmp1 < r19 && tmp2 < r19, "register corruption");

      lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
      mov(tmp1, len);
      unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
      sub(s, d, len, ext::uxtw, LogBytesPerWord);
    }
    // where
    void reverse1(Register d, Register s, Register tmp) {
      ldr(tmp, pre(s, -wordSize));
      ror(tmp, tmp, 32);
      str(tmp, post(d, wordSize));
    }

    void step_squaring() {
      // An extra ACC
      step();
      acc(Rhi_ab, Rlo_ab, t0, t1, t2);
    }

    void last_squaring(RegisterOrConstant i) {
      Label dont;
      // if ((i & 1) == 0) {
      tbnz(i.as_register(), 0, dont); {
        // MACC(Ra, Rb, t0, t1, t2);
        // Ra = *++Pa;
        // Rb = *--Pb;
        umulh(Rhi_ab, Ra, Rb);
        mul(Rlo_ab, Ra, Rb);
        acc(Rhi_ab, Rlo_ab, t0, t1, t2);
      } bind(dont);
    }

    void extra_step_squaring() {
      acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n

      // MACC(Rm, Rn, t0, t1, t2);
      // Rm = *++Pm;
      // Rn = *--Pn;
      umulh(Rhi_mn, Rm, Rn);
      mul(Rlo_mn, Rm, Rn);
      ldr(Rm, pre(Pm, wordSize));
      ldr(Rn, pre(Pn, -wordSize));
    }

    void post1_squaring() {
      acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n

      // *Pm = Rm = t0 * inv;
      mul(Rm, t0, inv);
      str(Rm, Address(Pm));

      // MACC(Rm, Rn, t0, t1, t2);
      // t0 = t1; t1 = t2; t2 = 0;
      umulh(Rhi_mn, Rm, Rn);

#ifndef PRODUCT
      // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
      {
        mul(Rlo_mn, Rm, Rn);
        add(Rlo_mn, t0, Rlo_mn);
        Label ok;
        cbz(Rlo_mn, ok); {
          stop("broken Montgomery multiply");
        } bind(ok);
      }
#endif
      // We have very carefully set things up so that
      // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
      // the lower half of Rm * Rn because we know the result already:
      // it must be -t0.  t0 + (-t0) must generate a carry iff
      // t0 != 0.  So, rather than do a mul and an adds we just set
      // the carry flag iff t0 is nonzero.
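      //
      // (On AArch64 a subtract sets the C flag to mean "no borrow", so
      // the subs(zr, t0, 1) below leaves C set exactly when t0 >= 1,
      // i.e. when t0 is nonzero: the same carry out that
      // adds(zr, t0, -t0) would produce.)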
      //
      // mul(Rlo_mn, Rm, Rn);
      // adds(zr, t0, Rlo_mn);
      subs(zr, t0, 1); // Set carry iff t0 is nonzero
      adcs(t0, t1, Rhi_mn);
      adc(t1, t2, zr);
      mov(t2, zr);
    }

    void acc(Register Rhi, Register Rlo,
             Register t0, Register t1, Register t2) {
      adds(t0, t0, Rlo);
      adcs(t1, t1, Rhi);
      adc(t2, t2, zr);
    }

  public:
    /**
     * Fast Montgomery multiplication.  The derivation of the
     * algorithm is in "A Cryptographic Library for the Motorola
     * DSP56000", Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
     *
     * Arguments:
     *
     * Inputs for multiplication:
     *   c_rarg0   - int array elements a
     *   c_rarg1   - int array elements b
     *   c_rarg2   - int array elements n (the modulus)
     *   c_rarg3   - int length
     *   c_rarg4   - int inv
     *   c_rarg5   - int array elements m (the result)
     *
     * Inputs for squaring:
     *   c_rarg0   - int array elements a
     *   c_rarg1   - int array elements n (the modulus)
     *   c_rarg2   - int length
     *   c_rarg3   - int inv
     *   c_rarg4   - int array elements m (the result)
     *
     */
    address generate_multiply() {
      Label argh, nothing;
      bind(argh);
      stop("MontgomeryMultiply total_allocation must be <= 8192");

      align(CodeEntryAlignment);
      address entry = pc();

      cbzw(Rlen, nothing);

      enter();

      // Make room.  We allocate four len-int arrays on the stack
      // (4 * len * sizeof(jint) bytes); with len capped at 512 the
      // total allocation is at most 8192 bytes, hence the guard above.
      cmpw(Rlen, 512);
      br(Assembler::HI, argh);
      sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
      andr(sp, Ra, -2 * wordSize);

      lsrw(Rlen, Rlen, 1); // length in longwords = len/2

      {
        // Copy input args, reversing as we go.  We use Ra as a
        // temporary variable.
        reverse(Ra, Pa_base, Rlen, t0, t1);
        if (!_squaring)
          reverse(Ra, Pb_base, Rlen, t0, t1);
        reverse(Ra, Pn_base, Rlen, t0, t1);
      }

      // Push all call-saved registers and also Pm_base which we'll need
      // at the end.
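      // (Pm_base itself is about to be repointed at the scratch space
      // reserved on the stack above; the caller's value, restored by
      // restore_regs(), is where the result is copied back at the end.)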
      save_regs();

#ifndef PRODUCT
      // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
      {
        ldr(Rn, Address(Pn_base, 0));
        mul(Rlo_mn, Rn, inv);
        cmp(Rlo_mn, -1);
        Label ok;
        br(EQ, ok); {
          stop("broken inverse in Montgomery multiply");
        } bind(ok);
      }
#endif

      mov(Pm_base, Ra);

      mov(t0, zr);
      mov(t1, zr);
      mov(t2, zr);

      block_comment("for (int i = 0; i < len; i++) {");
      mov(Ri, zr); {
        Label loop, end;
        cmpw(Ri, Rlen);
        br(Assembler::GE, end);

        bind(loop);
        pre1(Ri);

        block_comment(" for (j = i; j; j--) {"); {
          movw(Rj, Ri);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
        } block_comment(" } // j");

        post1();
        addw(Ri, Ri, 1);
        cmpw(Ri, Rlen);
        br(Assembler::LT, loop);
        bind(end);
        block_comment("} // i");
      }

      block_comment("for (int i = len; i < 2*len; i++) {");
      mov(Ri, Rlen); {
        Label loop, end;
        cmpw(Ri, Rlen, Assembler::LSL, 1);
        br(Assembler::GE, end);

        bind(loop);
        pre2(Ri, Rlen);

        block_comment(" for (j = len*2-i-1; j; j--) {"); {
          lslw(Rj, Rlen, 1);
          subw(Rj, Rj, Ri);
          subw(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
        } block_comment(" } // j");

        post2(Ri, Rlen);
        addw(Ri, Ri, 1);
        cmpw(Ri, Rlen, Assembler::LSL, 1);
        br(Assembler::LT, loop);
        bind(end);
      }
      block_comment("} // i");

      normalize(Rlen);

      mov(Ra, Pm_base);  // Save Pm_base in Ra
      restore_regs();    // Restore caller's Pm_base

      // Copy our result into caller's Pm_base
      reverse(Pm_base, Ra, Rlen, t0, t1);

      leave();
      bind(nothing);
      ret(lr);

      return entry;
    }
    // In C, approximately:

    // void
    // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
    //                     unsigned long Pn_base[], unsigned long Pm_base[],
    //                     unsigned long inv, int len) {
    //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
    //   unsigned long *Pa, *Pb, *Pn, *Pm;
    //   unsigned long Ra, Rb, Rn, Rm;

    //   int i;

    //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");

    //   for (i = 0; i < len; i++) {
    //     int j;

    //     Pa = Pa_base;
    //     Pb = Pb_base + i;
    //     Pm = Pm_base;
    //     Pn = Pn_base + i;

    //     Ra = *Pa;
    //     Rb = *Pb;
    //     Rm = *Pm;
    //     Rn = *Pn;

    //     int iters = i;
    //     for (j = 0; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
    //       MACC(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }

    //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
    //     MACC(Ra, Rb, t0, t1, t2);
    //     *Pm = Rm = t0 * inv;
    //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
    //     MACC(Rm, Rn, t0, t1, t2);

    //     assert(t0 == 0, "broken Montgomery multiply");

    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   for (i = len; i < 2*len; i++) {
    //     int j;

    //     Pa = Pa_base + i-len;
    //     Pb = Pb_base + len;
    //     Pm = Pm_base + i-len;
    //     Pn = Pn_base + len;

    //     Ra = *++Pa;
    //     Rb = *--Pb;
    //     Rm = *++Pm;
    //     Rn = *--Pn;

    //     int iters = len*2-i-1;
    //     for (j = i-len+1; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
    //       MACC(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }

    //     Pm_base[i-len] = t0;
    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   while (t0)
    //     t0 = sub(Pm_base, Pn_base, t0, len);
    // }

    /**
     * Fast Montgomery squaring.  This uses asymptotically 25% fewer
     * multiplies than Montgomery multiplication so it should be up to
     * 25% faster.  However, its loop control is more complex and it
     * may actually run slower on some machines.
     *
     * Arguments:
     *
     * Inputs:
     *   c_rarg0   - int array elements a
     *   c_rarg1   - int array elements n (the modulus)
     *   c_rarg2   - int length
     *   c_rarg3   - int inv
     *   c_rarg4   - int array elements m (the result)
     *
     */
    address generate_square() {
      Label argh;
      bind(argh);
      stop("MontgomeryMultiply total_allocation must be <= 8192");

      align(CodeEntryAlignment);
      address entry = pc();

      enter();

      // Make room.  As in generate_multiply(): at most
      // 512 * 4 * sizeof(jint) == 8192 bytes.
      cmpw(Rlen, 512);
      br(Assembler::HI, argh);
      sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
      andr(sp, Ra, -2 * wordSize);

      lsrw(Rlen, Rlen, 1); // length in longwords = len/2

      {
        // Copy input args, reversing as we go.  We use Ra as a
        // temporary variable.
        reverse(Ra, Pa_base, Rlen, t0, t1);
        reverse(Ra, Pn_base, Rlen, t0, t1);
      }

      // Push all call-saved registers and also Pm_base which we'll need
      // at the end.
      save_regs();

      mov(Pm_base, Ra);

      mov(t0, zr);
      mov(t1, zr);
      mov(t2, zr);

      block_comment("for (int i = 0; i < len; i++) {");
      mov(Ri, zr); {
        Label loop, end;
        bind(loop);
        cmp(Ri, Rlen);
        br(Assembler::GE, end);

        pre1(Ri);

        block_comment("for (j = (i+1)/2; j; j--) {"); {
          add(Rj, Ri, 1);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
        } block_comment(" } // j");

        last_squaring(Ri);

        block_comment(" for (j = i/2; j; j--) {"); {
          lsr(Rj, Ri, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
        } block_comment(" } // j");

        post1_squaring();
        add(Ri, Ri, 1);
        cmp(Ri, Rlen);
        br(Assembler::LT, loop);

        bind(end);
        block_comment("} // i");
      }

      block_comment("for (int i = len; i < 2*len; i++) {");
      mov(Ri, Rlen); {
        Label loop, end;
        bind(loop);
        cmp(Ri, Rlen, Assembler::LSL, 1);
        br(Assembler::GE, end);

        pre2(Ri, Rlen);

        block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); {
          lsl(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          sub(Rj, Rj, 1);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
        } block_comment(" } // j");

        last_squaring(Ri);

        block_comment(" for (j = (2*len-i)/2; j; j--) {"); {
          lsl(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
        } block_comment(" } // j");

        post2(Ri, Rlen);
        add(Ri, Ri, 1);
        cmp(Ri, Rlen, Assembler::LSL, 1);

        br(Assembler::LT, loop);
        bind(end);
        block_comment("} // i");
      }

      normalize(Rlen);

      mov(Ra, Pm_base);  // Save Pm_base in Ra
      restore_regs();    // Restore caller's Pm_base

      // Copy our result into caller's Pm_base
      reverse(Pm_base, Ra, Rlen, t0, t1);

      leave();
      ret(lr);

      return entry;
    }
    // In C, approximately:

    // void
    // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
    //                   unsigned long Pm_base[], unsigned long inv, int len) {
    //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
    //   unsigned long *Pa, *Pb, *Pn, *Pm;
    //   unsigned long Ra, Rb, Rn, Rm;

    //   int i;

    //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");

    //   for (i = 0; i < len; i++) {
    //     int j;

    //     Pa = Pa_base;
    //     Pb = Pa_base + i;
    //     Pm = Pm_base;
    //     Pn = Pn_base + i;

    //     Ra = *Pa;
    //     Rb = *Pb;
    //     Rm = *Pm;
    //     Rn = *Pn;

    //     int iters = (i+1)/2;
    //     for (j = 0; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
    //       MACC2(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     if ((i & 1) == 0) {
    //       assert(Ra == Pa_base[j], "must be");
    //       MACC(Ra, Ra, t0, t1, t2);
    //     }
    //     iters = i/2;
    //     assert(iters == i-j, "must be");
    //     for (; iters--; j++) {
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }

    //     *Pm = Rm = t0 * inv;
    //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
    //     MACC(Rm, Rn, t0, t1, t2);

    //     assert(t0 == 0, "broken Montgomery multiply");

    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   for (i = len; i < 2*len; i++) {
    //     int start = i-len+1;
    //     int end = start + (len - start)/2;
    //     int j;

    //     Pa = Pa_base + i-len;
    //     Pb = Pa_base + len;
    //     Pm = Pm_base + i-len;
    //     Pn = Pn_base + len;

    //     Ra = *++Pa;
    //     Rb = *--Pb;
    //     Rm = *++Pm;
    //     Rn = *--Pn;

    //     int iters = (2*len-i-1)/2;
    //     assert(iters == end-start, "must be");
    //     for (j = start; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
    //       MACC2(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     if ((i & 1) == 0) {
    //       assert(Ra == Pa_base[j], "must be");
    //       MACC(Ra, Ra, t0, t1, t2);
    //     }
    //     iters = (2*len-i)/2;
    //     assert(iters == len-j, "must be");
    //     for (; iters--; j++) {
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     Pm_base[i-len] = t0;
    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   while (t0)
    //     t0 = sub(Pm_base, Pn_base, t0, len);
    // }
  };

  // Initialization
  void generate_initial() {
    // Generates the initial stubs and initializes the entry points.

    // Entry points that exist in all platforms.  Note: this is code
    // that could be shared among different platforms; however, the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator
    // structure.  See also the comment in stubRoutines.hpp.

    StubRoutines::_forward_exception_entry = generate_forward_exception();

    StubRoutines::_call_stub_entry =
      generate_call_stub(StubRoutines::_call_stub_return_address);

    // is referenced by megamorphic call
    StubRoutines::_catch_exception_entry = generate_catch_exception();

    // Build this early so it's available for the interpreter.
    StubRoutines::_throw_StackOverflowError_entry =
      generate_throw_exception("StackOverflowError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_StackOverflowError));
    if (UseCRC32Intrinsics) {
      // Set the table address before generating the stub, which uses it.
      StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
    }
  }

  void generate_all() {
    // support for verify_oop (must happen after universe_init)
    StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
    StubRoutines::_throw_AbstractMethodError_entry =
      generate_throw_exception("AbstractMethodError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_AbstractMethodError));

    StubRoutines::_throw_IncompatibleClassChangeError_entry =
      generate_throw_exception("IncompatibleClassChangeError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_IncompatibleClassChangeError));

    StubRoutines::_throw_NullPointerException_at_call_entry =
      generate_throw_exception("NullPointerException at call throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_NullPointerException_at_call));

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }

    if (UseMontgomeryMultiplyIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }

    if (UseMontgomerySquareIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      // We use generate_multiply() rather than generate_square()
      // because it's faster for the sizes of modulus we care about.
      StubRoutines::_montgomerySquare = g.generate_multiply();
    }

#ifndef BUILTIN_SIM
    // generate GHASH intrinsics code
    if (UseGHASHIntrinsics) {
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
    }

    if (UseAESIntrinsics) {
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
    }

    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress");
      StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
    }

    if (UseCRC32CIntrinsics) {
      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
    }

    // generate Adler32 intrinsics code
    if (UseAdler32Intrinsics) {
      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
    }

    // Safefetch stubs.
    generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
                       &StubRoutines::_safefetch32_fault_pc,
                       &StubRoutines::_safefetch32_continuation_pc);
    generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
                       &StubRoutines::_safefetchN_fault_pc,
                       &StubRoutines::_safefetchN_continuation_pc);
#endif
  }

 public:
  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
    if (all) {
      generate_all();
    } else {
      generate_initial();
    }
  }
}; // end class declaration

void StubGenerator_generate(CodeBuffer* code, bool all) {
  StubGenerator g(code, all);
}