1 /* 2 * Copyright (c) 2003, 2015, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "utilities/top.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif

#ifdef BUILTIN_SIM
#include "../../../../../../simulator/simulator.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->

// Scaled-index addressing mode for indexing an oop array: a
// sign-extended 32-bit index scaled by the heap-oop size (4 bytes
// with compressed oops, 8 bytes otherwise).
#define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  // Emit code that bumps the 32-bit counter held in the given C++
  // variable.  The load/add/store sequence is not atomic, so counts
  // are approximate under concurrency.  Clobbers rscratch1/rscratch2.
  void inc_counter_np_(int& counter) {
    __ lea(rscratch2, ExternalAddress((address)&counter));
    __ ldrw(rscratch1, Address(rscratch2));
    __ addw(rscratch1, rscratch1, 1);
    __ strw(rscratch1, Address(rscratch2));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    c_rarg6:   parameter size (in words)              int
  //    c_rarg7:   thread                                 Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the c arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -27 [ argument word 1      ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper    (r0) ]
  //  -7 [ result          (r1) ]
  //  -6 [ result type     (r2) ]
  //  -5 [ method          (r3) ]
  //  -4 [ entry point     (r4) ]
  //  -3 [ parameters      (r5) ]
  //  -2 [ parameter size  (r6) ]
  //  -1 [ thread          (r7) ]
  //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  //   1 [ saved lr       (r30) ]

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off = -26,

    d15_off            = -26,
    d14_off            = -25,
    d13_off            = -24,
    d12_off            = -23,
    d11_off            = -22,
    d10_off            = -21,
    d9_off             = -20,
    d8_off             = -19,

    r28_off            = -18,
    r27_off            = -17,
    r26_off            = -16,
    r25_off            = -15,
    r24_off            = -14,
    r23_off            = -13,
    r22_off            = -12,
    r21_off            = -11,
    r20_off            = -10,
    r19_off            =  -9,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameters_off     =  -3,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0,
    retaddr_off        =   1,
  };

  // Generate the call stub used to transfer control from C++ into
  // Java.  Saves every register the Java side may clobber, pushes the
  // outgoing Java arguments, calls the (interpreter) entry point, then
  // stores the Java result through the result pointer and restores the
  // saved state.  return_address receives the pc Java calls return to;
  // it is used by stack walking to recognize the entry frame.
  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    const Address sp_after_call(rfp, sp_after_call_off * wordSize);

    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameters    (rfp, parameters_off     * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d14_save      (rfp, d14_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d12_save      (rfp, d12_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d10_save      (rfp, d10_off * wordSize);
    const Address d9_save       (rfp, d9_off * wordSize);
    const Address d8_save       (rfp, d8_off * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r27_save      (rfp, r27_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r25_save      (rfp, r25_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r23_save      (rfp, r23_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r21_save      (rfp, r21_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);
    const Address r19_save      (rfp, r19_off * wordSize);

    // stub code

    // we need a C prolog to bootstrap the x86 caller into the sim
    __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);

    address aarch64_entry = __ pc();

#ifdef BUILTIN_SIM
    // Save sender's SP for stack traces.
    __ mov(rscratch1, sp);
    __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
#endif
    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ str(c_rarg5,  parameters);
    __ str(c_rarg4,  entry_point);
    __ str(c_rarg3,  method);
    __ str(c_rarg2,  result_type);
    __ str(c_rarg1,  result);
    __ str(c_rarg0,  call_wrapper);
    __ str(r19,      r19_save);
    __ str(r20,      r20_save);
    __ str(r21,      r21_save);
    __ str(r22,      r22_save);
    __ str(r23,      r23_save);
    __ str(r24,      r24_save);
    __ str(r25,      r25_save);
    __ str(r26,      r26_save);
    __ str(r27,      r27_save);
    __ str(r28,      r28_save);

    __ strd(v8,      d8_save);
    __ strd(v9,      d9_save);
    __ strd(v10,     d10_save);
    __ strd(v11,     d11_save);
    __ strd(v12,     d12_save);
    __ strd(v13,     d13_save);
    __ strd(v14,     d14_save);
    __ strd(v15,     d15_save);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (unsigned)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);  // keep sp 16-byte aligned

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    // copy the parameter words, first to last, onto the Java stack
    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing methdoOop, and current sp
    //      rmethod: Method*
    //      r13: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r13, sp);
    __ blr(c_rarg4);

    // tell the simulator we have returned to the stub

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    if (NotifySimulator) {
      __ notify(Assembler::method_reentry);
    }
    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    __ ldr(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ldr(j_rarg1, result_type);
    __ cmp(j_rarg1, T_OBJECT);      // T_OBJECT stored with the same
    __ br(Assembler::EQ, is_long);  // 64-bit store as T_LONG
    __ cmp(j_rarg1, T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(j_rarg1, T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    // restore callee-save registers
    __ ldrd(v15,      d15_save);
    __ ldrd(v14,      d14_save);
    __ ldrd(v13,      d13_save);
    __ ldrd(v12,      d12_save);
    __ ldrd(v11,      d11_save);
    __ ldrd(v10,      d10_save);
    __ ldrd(v9,       d9_save);
    __ ldrd(v8,       d8_save);

    __ ldr(r28,       r28_save);
    __ ldr(r27,       r27_save);
    __ ldr(r26,       r26_save);
    __ ldr(r25,       r25_save);
    __ ldr(r24,       r24_save);
    __ ldr(r23,       r23_save);
    __ ldr(r22,       r22_save);
    __ ldr(r21,       r21_save);
    __ ldr(r20,       r20_save);
    __ ldr(r19,       r19_save);
    __ ldr(c_rarg0,   call_wrapper);
    __ ldr(c_rarg1,   result);
    __ ldrw(c_rarg2,  result_type);
    __ ldr(c_rarg3,   method);
    __ ldr(c_rarg4,   entry_point);
    __ ldr(c_rarg5,   parameters);
    __ ldr(c_rarg6,   parameter_size);
    __ ldr(c_rarg7,   thread);

#ifndef PRODUCT
    // tell the simulator we are about to end Java execution
    if (NotifySimulator) {
      __ notify(Assembler::method_exit);
    }
#endif
    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT

    __ BIND(is_long);
    __ str(r0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // rsp.
  //
  // r0: exception oop

  // NOTE: this is used as a target from the signal handler so it
  // needs an x86 prolog which returns into the current simulator
  // executing the generated catch_exception code. so the prolog
  // needs to install rax in a sim register and adjust the sim's
  // restart pc to enter the generated code at the start position
  // then return from native to simulated execution.

  // Generate the landing pad used when an exception propagates out of
  // the Java code called by the call stub: records the exception (and
  // the VM source location) as the thread's pending exception, then
  // jumps to the call stub's return address to unwind the entry frame.
  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread        (rfp, thread_off         * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog

  // Convert the thread's pending exception into a Java-level exception
  // and dispatch to the appropriate exception handler for the caller's
  // return address (which must be in LR on entry).
  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then calls into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);           // r3 <- throwing pc
    __ mov(r19, r0);           // r19 <- handler address from the VM call
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {

    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // bump the verify_oop invocation counter (non-atomic)
    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is NULL it is OK

    // Check if the oop is in the right area of memory
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
    __ andr(c_rarg2, r0, c_rarg3);
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());

    // Compare c_rarg2 and c_rarg3.  We don't use a compare
    // instruction here because the flags register is live.
    __ eor(c_rarg2, c_rarg2, c_rarg3);
    __ cbnz(c_rarg2, error);

    // make sure klass is 'reasonable', which is not zero.
    __ load_klass(r0, r0);  // get klass
    __ cbz(r0, error);      // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blrt(rscratch1, 3, 0, 1);

    return start;
  }

  // NOTE(review): this branches to the no-overlap (disjoint) path
  // unconditionally, so the conjoint fall-through path is never taken
  // from here — confirm that callers either guarantee disjointness or
  // perform their own overlap test; otherwise a real src/dst distance
  // test scaled by sf appears to be intended.
  void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }

  // Generate code for an array write pre barrier
  //
  //     addr    -  starting address
  //     count   -  element count
  //     tmp     - scratch register
  //
  //     Destroy no registers!
670 // 671 void gen_write_ref_array_pre_barrier(Register addr, Register count, bool dest_uninitialized) { 672 BarrierSet* bs = Universe::heap()->barrier_set(); 673 switch (bs->kind()) { 674 case BarrierSet::G1SATBCTLogging: 675 // With G1, don't generate the call if we statically know that the target in uninitialized 676 if (!dest_uninitialized) { 677 __ push(RegSet::range(r0, r29), sp); // integer registers except lr & sp 678 if (count == c_rarg0) { 679 if (addr == c_rarg1) { 680 // exactly backwards!! 681 __ stp(c_rarg0, c_rarg1, __ pre(sp, -2 * wordSize)); 682 __ ldp(c_rarg1, c_rarg0, __ post(sp, -2 * wordSize)); 683 } else { 684 __ mov(c_rarg1, count); 685 __ mov(c_rarg0, addr); 686 } 687 } else { 688 __ mov(c_rarg0, addr); 689 __ mov(c_rarg1, count); 690 } 691 __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_pre), 2); 692 __ pop(RegSet::range(r0, r29), sp); // integer registers except lr & sp } 693 break; 694 case BarrierSet::CardTableForRS: 695 case BarrierSet::CardTableExtension: 696 case BarrierSet::ModRef: 697 break; 698 default: 699 ShouldNotReachHere(); 700 701 } 702 } 703 } 704 705 // 706 // Generate code for an array write post barrier 707 // 708 // Input: 709 // start - register containing starting address of destination array 710 // end - register containing ending address of destination array 711 // scratch - scratch register 712 // 713 // The input registers are overwritten. 714 // The ending address is inclusive. 
  // Emit the post-barrier for a reference-array store.  For G1 this
  // calls into the runtime with (start, element count); for card-table
  // collectors it dirties the covered card bytes inline.
  void gen_write_ref_array_post_barrier(Register start, Register end, Register scratch) {
    assert_different_registers(start, end, scratch);
    BarrierSet* bs = Universe::heap()->barrier_set();
    switch (bs->kind()) {
    case BarrierSet::G1SATBCTLogging:

      {
        __ push(RegSet::range(r0, r29), sp);         // integer registers except lr & sp
        // must compute element count unless barrier set interface is changed (other platforms supply count)
        assert_different_registers(start, end, scratch);
        __ lea(scratch, Address(end, BytesPerHeapOop));
        __ sub(scratch, scratch, start);               // subtract start to get #bytes
        __ lsr(scratch, scratch, LogBytesPerHeapOop);  // convert to element count
        __ mov(c_rarg0, start);
        __ mov(c_rarg1, scratch);
        __ call_VM_leaf(CAST_FROM_FN_PTR(address, BarrierSet::static_write_ref_array_post), 2);
        __ pop(RegSet::range(r0, r29), sp);          // integer registers except lr & sp
      }
      break;
    case BarrierSet::CardTableForRS:
    case BarrierSet::CardTableExtension:
      {
        CardTableModRefBS* ct = (CardTableModRefBS*)bs;
        assert(sizeof(*ct->byte_map_base) == sizeof(jbyte), "adjust this code");

        Label L_loop;

        // convert [start, end] addresses to card indexes
        __ lsr(start, start, CardTableModRefBS::card_shift);
        __ lsr(end, end, CardTableModRefBS::card_shift);
        __ sub(end, end, start); // number of bytes to copy

        const Register count = end; // 'end' register contains bytes count now
        __ load_byte_map_base(scratch);
        __ add(start, start, scratch);
        if (UseConcMarkSweepGC) {
          __ membar(__ StoreStore);
        }
        // dirty each card from start+count down to start (inclusive)
        __ BIND(L_loop);
        __ strb(zr, Address(start, count));
        __ subs(count, count, 1);
        __ br(Assembler::HS, L_loop);  // exits when count wraps below zero
      }
      break;
    default:
      ShouldNotReachHere();

    }
  }

  // Sign of the per-word step taken by the bulk copy routines.
  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 2
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(Label &start, Register s, Register d, Register count,
                           copy_direction direction) {
    int unit = wordSize * direction;

    int offset;
    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r10, t6 = r11, t7 = r12;
    const Register stride = r13;

    assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1);

    Label again, large, small;
    const char *stub_name;
    if (direction == copy_forwards)
      stub_name = "foward_copy_longs";
    else
      stub_name = "backward_copy_longs";
    StubCodeMark mark(this, "StubRoutines", stub_name);
    __ align(CodeEntryAlignment);
    __ bind(start);
    __ cmp(count, 8);
    __ br(Assembler::LO, small);
    if (direction == copy_forwards) {
      // bias s and d so the pre-indexed ldp/stp below land correctly
      __ sub(s, s, 2 * wordSize);
      __ sub(d, d, 2 * wordSize);
    }
    __ subs(count, count, 16);
    __ br(Assembler::GE, large);

    // 8 <= count < 16 words.  Copy 8.
    __ ldp(t0, t1, Address(s, 2 * unit));
    __ ldp(t2, t3, Address(s, 4 * unit));
    __ ldp(t4, t5, Address(s, 6 * unit));
    __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));

    __ stp(t0, t1, Address(d, 2 * unit));
    __ stp(t2, t3, Address(d, 4 * unit));
    __ stp(t4, t5, Address(d, 6 * unit));
    __ stp(t6, t7, Address(__ pre(d, 8 * unit)));

    if (direction == copy_forwards) {
      // undo the bias applied above
      __ add(s, s, 2 * wordSize);
      __ add(d, d, 2 * wordSize);
    }

    {
      // copy the remaining 4- and 2-word chunks indicated by the low
      // bits of count
      Label L1, L2;
      __ bind(small);
      __ tbz(count, exact_log2(4), L1);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ ldp(t2, t3, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ stp(t2, t3, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L1);

      __ tbz(count, 1, L2);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L2);
    }

    __ ret(lr);

    __ align(CodeEntryAlignment);
    __ bind(large);

    // Fill 8 registers
    __ ldp(t0, t1, Address(s, 2 * unit));
    __ ldp(t2, t3, Address(s, 4 * unit));
    __ ldp(t4, t5, Address(s, 6 * unit));
    __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      // a negative prefetch offset may be out of immediate range; use a
      // register stride in that case
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }

    // software-pipelined main loop: store the previous 8 words while
    // loading the next 8
    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    __ stp(t0, t1, Address(d, 2 * unit));
    __ ldp(t0, t1, Address(s, 2 * unit));
    __ stp(t2, t3, Address(d, 4 * unit));
    __ ldp(t2, t3, Address(s, 4 * unit));
    __ stp(t4, t5, Address(d, 6 * unit));
    __ ldp(t4, t5, Address(s, 6 * unit));
    __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
    __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ stp(t0, t1, Address(d, 2 * unit));
    __ stp(t2, t3, Address(d, 4 * unit));
    __ stp(t4, t5, Address(d, 6 * unit));
    __ stp(t6, t7, Address(__ pre(d, 8 * unit)));

    if (direction == copy_forwards) {
      __ add(s, s, 2 * wordSize);
      __ add(d, d, 2 * wordSize);
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ ldp(t2, t3, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ stp(t2, t3, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L1);

      __ tbz(count, 1, L2);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L2);
    }

    __ ret(lr);
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;
    int unit = wordSize * direction;

    Label Lpair, Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.

    // copy one chunk of each power-of-two size whose bit is set in
    // count, largest first
    __ tbz(count, 3 - exact_log2(granularity), Lword);
    __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
    __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }

  // Entry labels for the bulk word-copy routines emitted by
  // generate_copy_longs (forward and backward variants).
  Label copy_f, copy_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //
  //   is_aligned - true => s is already word-aligned (it may still need a
  //                one-word adjustment to reach 2-word alignment)
  //   s, d       - source / destination address registers (clobbered)
  //   count      - number of step-sized units to copy (clobbered)
  //   tmp        - scratch register
  //   step       - signed unit size in bytes; negative => backwards copy

  void copy_memory(bool is_aligned, Register s, Register d,
                   Register count, Register tmp, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    if (is_backwards) {
      // Point s and d one-past-the-end so pre-decrement addressing works.
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    Label tail;

    __ cmp(count, 16/granularity);
    __ br(Assembler::LO, tail);

    // Now we've got the small case out of the way we can align the
    // source address on a 2-word boundary.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
      __ tbz(s, exact_log2(wordSize), aligned);
      __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
      __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
      __ sub(count, count, wordSize/granularity);
    } else {
      if (is_backwards) {
        __ andr(rscratch2, s, 2 * wordSize - 1);
      } else {
        __ neg(rscratch2, s);
        __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
      }
      // rscratch2 is the byte adjustment needed to align s.
      __ cbz(rscratch2, aligned);
      int shift = exact_log2(granularity);
      if (shift)  __ lsr(rscratch2, rscratch2, shift);
      // rscratch2 is now the adjustment in units of granularity.
      __ sub(count, count, rscratch2);

#if 0
      // ?? This code is only correct for a disjoint copy.  It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, rscratch2);
        __ sub(d, d, rscratch2);
      } else {
        __ add(s, s, rscratch2);
        __ add(d, d, rscratch2);
      }
#else
      copy_memory_small(s, d, rscratch2, rscratch1, step);
#endif
    }

    __ cmp(count, 16/granularity);
    __ br(Assembler::LT, tail);
    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes.  Adjust the
    // count and do a bulk copy of words.
    __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
    // Bulk-copy rscratch2 words through the shared copy_f/copy_b code.
    if (direction == copy_forwards)
      __ bl(copy_f);
    else
      __ bl(copy_b);

    // And the tail.

    __ bind(tail);
    copy_memory_small(s, d, count, tmp, step);
  }


  // Debug builds only: fill the temp registers with a recognizable
  // poison value so stale contents are easy to spot.
  void clobber_registers() {
#ifdef ASSERT
    __ mov(rscratch1, (uint64_t)0xdeadbeef);
    __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
    for (Register r = r3; r <= r18; r++)
      if (r != rscratch1) __ mov(r, rscratch1);
#endif
  }

  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers rscratch1 and rscratch2.
1062 void verify_oop_array (size_t size, Register a, Register count, Register temp) { 1063 Label loop, end; 1064 __ mov(rscratch1, a); 1065 __ mov(rscratch2, zr); 1066 __ bind(loop); 1067 __ cmp(rscratch2, count); 1068 __ br(Assembler::HS, end); 1069 if (size == (size_t)wordSize) { 1070 __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1071 __ verify_oop(temp); 1072 } else { 1073 __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1074 __ decode_heap_oop(temp); // calls verify_oop 1075 } 1076 __ add(rscratch2, rscratch2, size); 1077 __ b(loop); 1078 __ bind(end); 1079 } 1080 1081 // Arguments: 1082 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1083 // ignored 1084 // is_oop - true => oop array, so generate store check code 1085 // name - stub name string 1086 // 1087 // Inputs: 1088 // c_rarg0 - source array address 1089 // c_rarg1 - destination array address 1090 // c_rarg2 - element count, treated as ssize_t, can be zero 1091 // 1092 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1093 // the hardware handle it. The two dwords within qwords that span 1094 // cache line boundaries will still be loaded and stored atomicly. 1095 // 1096 // Side Effects: 1097 // disjoint_int_copy_entry is set to the no-overlap entry point 1098 // used by generate_conjoint_int_oop_copy(). 
1099 // 1100 address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry, 1101 const char *name, bool dest_uninitialized = false) { 1102 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1103 __ align(CodeEntryAlignment); 1104 StubCodeMark mark(this, "StubRoutines", name); 1105 address start = __ pc(); 1106 __ enter(); 1107 1108 if (entry != NULL) { 1109 *entry = __ pc(); 1110 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1111 BLOCK_COMMENT("Entry:"); 1112 } 1113 1114 if (is_oop) { 1115 __ push(RegSet::of(d, count), sp); 1116 // no registers are destroyed by this call 1117 gen_write_ref_array_pre_barrier(d, count, dest_uninitialized); 1118 } 1119 copy_memory(aligned, s, d, count, rscratch1, size); 1120 if (is_oop) { 1121 __ pop(RegSet::of(d, count), sp); 1122 if (VerifyOops) 1123 verify_oop_array(size, d, count, r16); 1124 __ sub(count, count, 1); // make an inclusive end pointer 1125 __ lea(count, Address(d, count, Address::lsl(exact_log2(size)))); 1126 gen_write_ref_array_post_barrier(d, count, rscratch1); 1127 } 1128 __ leave(); 1129 __ mov(r0, zr); // return 0 1130 __ ret(lr); 1131 #ifdef BUILTIN_SIM 1132 { 1133 AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck); 1134 sim->notifyCompile(const_cast<char*>(name), start); 1135 } 1136 #endif 1137 return start; 1138 } 1139 1140 // Arguments: 1141 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1142 // ignored 1143 // is_oop - true => oop array, so generate store check code 1144 // name - stub name string 1145 // 1146 // Inputs: 1147 // c_rarg0 - source array address 1148 // c_rarg1 - destination array address 1149 // c_rarg2 - element count, treated as ssize_t, can be zero 1150 // 1151 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1152 // the hardware handle it. The two dwords within qwords that span 1153 // cache line boundaries will still be loaded and stored atomicly. 
1154 // 1155 address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target, 1156 address *entry, const char *name, 1157 bool dest_uninitialized = false) { 1158 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1159 1160 StubCodeMark mark(this, "StubRoutines", name); 1161 address start = __ pc(); 1162 __ enter(); 1163 1164 if (entry != NULL) { 1165 *entry = __ pc(); 1166 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1167 BLOCK_COMMENT("Entry:"); 1168 } 1169 1170 // use fwd copy when (d-s) above_equal (count*size) 1171 __ sub(rscratch1, d, s); 1172 __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size)); 1173 __ br(Assembler::HS, nooverlap_target); 1174 1175 if (is_oop) { 1176 __ push(RegSet::of(d, count), sp); 1177 // no registers are destroyed by this call 1178 gen_write_ref_array_pre_barrier(d, count, dest_uninitialized); 1179 } 1180 copy_memory(aligned, s, d, count, rscratch1, -size); 1181 if (is_oop) { 1182 __ pop(RegSet::of(d, count), sp); 1183 if (VerifyOops) 1184 verify_oop_array(size, d, count, r16); 1185 __ sub(count, count, 1); // make an inclusive end pointer 1186 __ lea(count, Address(d, count, Address::uxtw(exact_log2(size)))); 1187 gen_write_ref_array_post_barrier(d, count, rscratch1); 1188 } 1189 __ leave(); 1190 __ mov(r0, zr); // return 0 1191 __ ret(lr); 1192 #ifdef BUILTIN_SIM 1193 { 1194 AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck); 1195 sim->notifyCompile(const_cast<char*>(name), start); 1196 } 1197 #endif 1198 return start; 1199 } 1200 1201 // Arguments: 1202 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1203 // ignored 1204 // name - stub name string 1205 // 1206 // Inputs: 1207 // c_rarg0 - source array address 1208 // c_rarg1 - destination array address 1209 // c_rarg2 - element count, treated as ssize_t, can be zero 1210 // 1211 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1212 
// we let the hardware handle it. The one to eight bytes within words, 1213 // dwords or qwords that span cache line boundaries will still be loaded 1214 // and stored atomically. 1215 // 1216 // Side Effects: 1217 // disjoint_byte_copy_entry is set to the no-overlap entry point // 1218 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1219 // we let the hardware handle it. The one to eight bytes within words, 1220 // dwords or qwords that span cache line boundaries will still be loaded 1221 // and stored atomically. 1222 // 1223 // Side Effects: 1224 // disjoint_byte_copy_entry is set to the no-overlap entry point 1225 // used by generate_conjoint_byte_copy(). 1226 // 1227 address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) { 1228 const bool not_oop = false; 1229 return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name); 1230 } 1231 1232 // Arguments: 1233 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1234 // ignored 1235 // name - stub name string 1236 // 1237 // Inputs: 1238 // c_rarg0 - source array address 1239 // c_rarg1 - destination array address 1240 // c_rarg2 - element count, treated as ssize_t, can be zero 1241 // 1242 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1243 // we let the hardware handle it. The one to eight bytes within words, 1244 // dwords or qwords that span cache line boundaries will still be loaded 1245 // and stored atomically. 
1246 // 1247 address generate_conjoint_byte_copy(bool aligned, address nooverlap_target, 1248 address* entry, const char *name) { 1249 const bool not_oop = false; 1250 return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name); 1251 } 1252 1253 // Arguments: 1254 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1255 // ignored 1256 // name - stub name string 1257 // 1258 // Inputs: 1259 // c_rarg0 - source array address 1260 // c_rarg1 - destination array address 1261 // c_rarg2 - element count, treated as ssize_t, can be zero 1262 // 1263 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1264 // let the hardware handle it. The two or four words within dwords 1265 // or qwords that span cache line boundaries will still be loaded 1266 // and stored atomically. 1267 // 1268 // Side Effects: 1269 // disjoint_short_copy_entry is set to the no-overlap entry point 1270 // used by generate_conjoint_short_copy(). 1271 // 1272 address generate_disjoint_short_copy(bool aligned, 1273 address* entry, const char *name) { 1274 const bool not_oop = false; 1275 return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name); 1276 } 1277 1278 // Arguments: 1279 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1280 // ignored 1281 // name - stub name string 1282 // 1283 // Inputs: 1284 // c_rarg0 - source array address 1285 // c_rarg1 - destination array address 1286 // c_rarg2 - element count, treated as ssize_t, can be zero 1287 // 1288 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1289 // let the hardware handle it. The two or four words within dwords 1290 // or qwords that span cache line boundaries will still be loaded 1291 // and stored atomically. 
1292 // 1293 address generate_conjoint_short_copy(bool aligned, address nooverlap_target, 1294 address *entry, const char *name) { 1295 const bool not_oop = false; 1296 return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name); 1297 1298 } 1299 // Arguments: 1300 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1301 // ignored 1302 // name - stub name string 1303 // 1304 // Inputs: 1305 // c_rarg0 - source array address 1306 // c_rarg1 - destination array address 1307 // c_rarg2 - element count, treated as ssize_t, can be zero 1308 // 1309 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1310 // the hardware handle it. The two dwords within qwords that span 1311 // cache line boundaries will still be loaded and stored atomicly. 1312 // 1313 // Side Effects: 1314 // disjoint_int_copy_entry is set to the no-overlap entry point 1315 // used by generate_conjoint_int_oop_copy(). 1316 // 1317 address generate_disjoint_int_copy(bool aligned, address *entry, 1318 const char *name, bool dest_uninitialized = false) { 1319 const bool not_oop = false; 1320 return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name); 1321 } 1322 1323 // Arguments: 1324 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1325 // ignored 1326 // name - stub name string 1327 // 1328 // Inputs: 1329 // c_rarg0 - source array address 1330 // c_rarg1 - destination array address 1331 // c_rarg2 - element count, treated as ssize_t, can be zero 1332 // 1333 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1334 // the hardware handle it. The two dwords within qwords that span 1335 // cache line boundaries will still be loaded and stored atomicly. 
  //
  address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
                                     address *entry, const char *name,
                                     bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
  }


  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_long_copy(bool aligned, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_long_copy(bool aligned,
                                      address nooverlap_target, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_oop_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized = false) {
    const bool is_oop = true;
    // Element width depends on whether oops are compressed in this VM.
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_disjoint_copy(size, aligned, is_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_oop_copy(bool aligned,
                                     address nooverlap_target, address *entry,
                                     const char *name, bool dest_uninitialized = false) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry, name);
  }


  // Helper for generating a dynamic type check.
  // Smashes rscratch1.
  // Branches to L_success on success; falls through (to L_miss) on failure.
  void generate_type_check(Register sub_klass,
                           Register super_check_offset,
                           Register super_klass,
                           Label& L_success) {
    assert_different_registers(sub_klass, super_check_offset, super_klass);

    BLOCK_COMMENT("type_check:");

    Label L_miss;

    // Fast path first; the slow path is only reached when the fast path
    // cannot decide.
    __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL,
                                     super_check_offset);
    __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);

    // Fall through on failure!
    __ BIND(L_miss);
  }

  //
  //  Generate checkcasting array copy stub
  //
  //  Input:
  //    c_rarg0   - source array address
  //    c_rarg1   - destination array address
  //    c_rarg2   - element count, treated as ssize_t, can be zero
  //    c_rarg3   - size_t ckoff (super_check_offset)
  //    c_rarg4   - oop ckval (super_klass)
  //
  //  Output:
  //    r0 ==  0  -  success
  //    r0 == -1^K - failure, where K is partial transfer count
  //
  address generate_checkcast_copy(const char *name, address *entry,
                                  bool dest_uninitialized = false) {

    Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;

    // Input registers (after setup_arg_regs)
    const Register from        = c_rarg0;   // source array address
    const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // elementscount
    const Register ckoff       = c_rarg3;   // super_check_offset
    const Register ckval       = c_rarg4;   // super_klass

    // Registers used as temps (r18, r19, r20 are save-on-entry)
    const Register count_save  = r21;       // orig elementscount
    const Register start_to    = r20;       // destination array start address
    const Register copied_oop  = r18;       // actual oop copied
    const Register r19_klass   = r19;       // oop._klass

    //---------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the two arrays are subtypes of Object[] but the
    // destination array type is not equal to or a supertype
    // of the source type.  Each element must be separately
    // checked.

    assert_different_registers(from, to, count, ckoff, ckval, start_to,
                               copied_oop, r19_klass, count_save);

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    __ enter(); // required for proper stackwalking of RuntimeStub frame

#ifdef ASSERT
    // caller guarantees that the arrays really are different
    // otherwise, we would have to make conjoint checks
    { Label L;
      array_overlap_test(L, TIMES_OOP);
      __ stop("checkcast_copy within a single array");
      __ bind(L);
    }
#endif //ASSERT

    // Caller of this entry point must set up the argument registers.
    if (entry != NULL) {
      *entry = __ pc();
      BLOCK_COMMENT("Entry:");
    }

    // Empty array:  Nothing to do.
    __ cbz(count, L_done);

    __ push(RegSet::of(r18, r19, r20, r21), sp);

#ifdef ASSERT
    BLOCK_COMMENT("assert consistent ckoff/ckval");
    // The ckoff and ckval must be mutually consistent,
    // even though caller generates both.
    { Label L;
      int sco_offset = in_bytes(Klass::super_check_offset_offset());
      __ ldrw(start_to, Address(ckval, sco_offset));
      __ cmpw(ckoff, start_to);
      __ br(Assembler::EQ, L);
      __ stop("super_check_offset inconsistent");
      __ bind(L);
    }
#endif //ASSERT

    // save the original count
    __ mov(count_save, count);

    // Copy from low to high addresses
    __ mov(start_to, to);              // Save destination array start address
    __ b(L_load_element);

    // ======== begin loop ========
    // (Loop is rotated; its entry is L_load_element.)
    // Loop control:
    //   for (; count != 0; count--) {
    //     copied_oop = load_heap_oop(from++);
    //     ... generate_type_check ...;
    //     store_heap_oop(to++, copied_oop);
    //   }
    __ align(OptoLoopAlignment);

    __ BIND(L_store_element);
    __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop);  // store the oop
    __ sub(count, count, 1);
    __ cbz(count, L_do_card_marks);

    // ======== loop entry is here ========
    __ BIND(L_load_element);
    __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop
    __ cbz(copied_oop, L_store_element);

    __ load_klass(r19_klass, copied_oop);// query the object klass
    generate_type_check(r19_klass, ckoff, ckval, L_store_element);
    // ======== end loop ========

    // It was a real error; we must depend on the caller to finish the job.
    // Register count = remaining oops, count_orig = total oops.
    // Emit GC store barriers for the oops we have copied and report
    // their number to the caller.

    __ subs(count, count_save, count);     // K = partially copied oop count
    // eon with zr is bitwise NOT, producing -1 ^ K as documented above.
    __ eon(count, count, zr);                   // report (-1^K) to caller
    __ br(Assembler::EQ, L_done_pop);

    __ BIND(L_do_card_marks);
    __ add(to, to, -heapOopSize);         // make an inclusive end pointer
    gen_write_ref_array_post_barrier(start_to, to, rscratch1);

    __ bind(L_done_pop);
    __ pop(RegSet::of(r18, r19, r20, r21), sp);
    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);

    __ bind(L_done);
    __ mov(r0, count);
    __ leave();
    __ ret(lr);

    return start;
  }

  // Perform range checks on the proposed arraycopy.
  // Kills temp, but nothing else.
  // Also, clean the sign bits of src_pos and dst_pos.
1584 void arraycopy_range_checks(Register src, // source array oop (c_rarg0) 1585 Register src_pos, // source position (c_rarg1) 1586 Register dst, // destination array oo (c_rarg2) 1587 Register dst_pos, // destination position (c_rarg3) 1588 Register length, 1589 Register temp, 1590 Label& L_failed) { 1591 BLOCK_COMMENT("arraycopy_range_checks:"); 1592 1593 assert_different_registers(rscratch1, temp); 1594 1595 // if (src_pos + length > arrayOop(src)->length()) FAIL; 1596 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes())); 1597 __ addw(temp, length, src_pos); 1598 __ cmpw(temp, rscratch1); 1599 __ br(Assembler::HI, L_failed); 1600 1601 // if (dst_pos + length > arrayOop(dst)->length()) FAIL; 1602 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes())); 1603 __ addw(temp, length, dst_pos); 1604 __ cmpw(temp, rscratch1); 1605 __ br(Assembler::HI, L_failed); 1606 1607 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'. 1608 __ movw(src_pos, src_pos); 1609 __ movw(dst_pos, dst_pos); 1610 1611 BLOCK_COMMENT("arraycopy_range_checks done"); 1612 } 1613 1614 // These stubs get called from some dumb test routine. 1615 // I'll write them properly when they're called from 1616 // something that's actually doing something. 1617 static void fake_arraycopy_stub(address src, address dst, int count) { 1618 assert(count == 0, "huh?"); 1619 } 1620 1621 1622 // 1623 // Generate 'unsafe' array copy stub 1624 // Though just as safe as the other stubs, it takes an unscaled 1625 // size_t argument instead of an element count. 1626 // 1627 // Input: 1628 // c_rarg0 - source array address 1629 // c_rarg1 - destination array address 1630 // c_rarg2 - byte count, treated as ssize_t, can be zero 1631 // 1632 // Examines the alignment of the operands and dispatches 1633 // to a long, int, short, or byte copy loop. 
1634 // 1635 address generate_unsafe_copy(const char *name, 1636 address byte_copy_entry) { 1637 #ifdef PRODUCT 1638 return StubRoutines::_jbyte_arraycopy; 1639 #else 1640 __ align(CodeEntryAlignment); 1641 StubCodeMark mark(this, "StubRoutines", name); 1642 address start = __ pc(); 1643 __ enter(); // required for proper stackwalking of RuntimeStub frame 1644 // bump this on entry, not on exit: 1645 __ lea(rscratch2, ExternalAddress((address)&SharedRuntime::_unsafe_array_copy_ctr)); 1646 __ incrementw(Address(rscratch2)); 1647 __ b(RuntimeAddress(byte_copy_entry)); 1648 return start; 1649 #endif 1650 } 1651 1652 // 1653 // Generate generic array copy stubs 1654 // 1655 // Input: 1656 // c_rarg0 - src oop 1657 // c_rarg1 - src_pos (32-bits) 1658 // c_rarg2 - dst oop 1659 // c_rarg3 - dst_pos (32-bits) 1660 // c_rarg4 - element count (32-bits) 1661 // 1662 // Output: 1663 // r0 == 0 - success 1664 // r0 == -1^K - failure, where K is partial transfer count 1665 // 1666 address generate_generic_copy(const char *name, 1667 address byte_copy_entry, address short_copy_entry, 1668 address int_copy_entry, address oop_copy_entry, 1669 address long_copy_entry, address checkcast_copy_entry) { 1670 1671 Label L_failed, L_failed_0, L_objArray; 1672 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 1673 1674 // Input registers 1675 const Register src = c_rarg0; // source array oop 1676 const Register src_pos = c_rarg1; // source position 1677 const Register dst = c_rarg2; // destination array oop 1678 const Register dst_pos = c_rarg3; // destination position 1679 const Register length = c_rarg4; 1680 1681 StubCodeMark mark(this, "StubRoutines", name); 1682 1683 __ align(CodeEntryAlignment); 1684 address start = __ pc(); 1685 1686 __ enter(); // required for proper stackwalking of RuntimeStub frame 1687 1688 // bump this on entry, not on exit: 1689 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 1690 1691 
//----------------------------------------------------------------------- 1692 // Assembler stub will be used for this call to arraycopy 1693 // if the following conditions are met: 1694 // 1695 // (1) src and dst must not be null. 1696 // (2) src_pos must not be negative. 1697 // (3) dst_pos must not be negative. 1698 // (4) length must not be negative. 1699 // (5) src klass and dst klass should be the same and not NULL. 1700 // (6) src and dst should be arrays. 1701 // (7) src_pos + length must not exceed length of src. 1702 // (8) dst_pos + length must not exceed length of dst. 1703 // 1704 1705 // if (src == NULL) return -1; 1706 __ cbz(src, L_failed); 1707 1708 // if (src_pos < 0) return -1; 1709 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 1710 1711 // if (dst == NULL) return -1; 1712 __ cbz(dst, L_failed); 1713 1714 // if (dst_pos < 0) return -1; 1715 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 1716 1717 // registers used as temp 1718 const Register scratch_length = r16; // elements count to copy 1719 const Register scratch_src_klass = r17; // array klass 1720 const Register lh = r18; // layout helper 1721 1722 // if (length < 0) return -1; 1723 __ movw(scratch_length, length); // length (elements count, 32-bits value) 1724 __ tbnz(scratch_length, 31, L_failed); // i.e. 
sign bit set 1725 1726 __ load_klass(scratch_src_klass, src); 1727 #ifdef ASSERT 1728 // assert(src->klass() != NULL); 1729 { 1730 BLOCK_COMMENT("assert klasses not null {"); 1731 Label L1, L2; 1732 __ cbnz(scratch_src_klass, L2); // it is broken if klass is NULL 1733 __ bind(L1); 1734 __ stop("broken null klass"); 1735 __ bind(L2); 1736 __ load_klass(rscratch1, dst); 1737 __ cbz(rscratch1, L1); // this would be broken also 1738 BLOCK_COMMENT("} assert klasses not null done"); 1739 } 1740 #endif 1741 1742 // Load layout helper (32-bits) 1743 // 1744 // |array_tag| | header_size | element_type | |log2_element_size| 1745 // 32 30 24 16 8 2 0 1746 // 1747 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 1748 // 1749 1750 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 1751 1752 // Handle objArrays completely differently... 1753 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 1754 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 1755 __ movw(rscratch1, objArray_lh); 1756 __ eorw(rscratch2, lh, rscratch1); 1757 __ cbzw(rscratch2, L_objArray); 1758 1759 // if (src->klass() != dst->klass()) return -1; 1760 __ load_klass(rscratch2, dst); 1761 __ eor(rscratch2, rscratch2, scratch_src_klass); 1762 __ cbnz(rscratch2, L_failed); 1763 1764 // if (!src->is_Array()) return -1; 1765 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 1766 1767 // At this point, it is known to be a typeArray (array_tag 0x3). 
// Tail of the generic arraycopy stub (the entry and argument checks are
// above this point).  At this point lh holds the source klass' layout
// helper: dispatch on element size for primitive arrays, then handle
// object arrays (plain and checkcast copies).
#ifdef ASSERT
    {
      BLOCK_COMMENT("assert primitive array {");
      Label L;
      // Check the array-tag bits of the layout helper.
      __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift);
      __ cmpw(lh, rscratch2);
      __ br(Assembler::GE, L);
      __ stop("must be a primitive array");
      __ bind(L);
      BLOCK_COMMENT("} assert primitive array done");
    }
#endif

    arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
                           rscratch2, L_failed);

    // TypeArrayKlass
    //
    // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize);
    // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize);
    //

    const Register rscratch1_offset = rscratch1;    // array offset
    const Register r18_elsize = lh;                 // element size

    // Extract the header size from the layout helper, then advance both
    // base pointers past the array header.
    __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift,
            exact_log2(Klass::_lh_header_size_mask+1)); // array_offset
    __ add(src, src, rscratch1_offset);           // src array offset
    __ add(dst, dst, rscratch1_offset);           // dst array offset
    BLOCK_COMMENT("choose copy loop based on element size");

    // next registers should be set before the jump to corresponding stub
    const Register from     = c_rarg0;  // source array address
    const Register to       = c_rarg1;  // destination array address
    const Register count    = c_rarg2;  // elements count

    // 'from', 'to', 'count' registers should be set in such order
    // since they are the same as 'src', 'src_pos', 'dst'.

    assert(Klass::_lh_log2_element_size_shift == 0, "fix this code");

    // The possible values of elsize are 0-3, i.e. exact_log2(element
    // size in bytes).  We do a simple bitwise binary search.
  __ BIND(L_copy_bytes);
    __ tbnz(r18_elsize, 1, L_copy_ints);
    __ tbnz(r18_elsize, 0, L_copy_shorts);
    __ lea(from, Address(src, src_pos));          // src_addr
    __ lea(to,   Address(dst, dst_pos));          // dst_addr
    __ movw(count, scratch_length);               // length
    __ b(RuntimeAddress(byte_copy_entry));

  __ BIND(L_copy_shorts);
    __ lea(from, Address(src, src_pos, Address::lsl(1)));  // src_addr
    __ lea(to,   Address(dst, dst_pos, Address::lsl(1)));  // dst_addr
    __ movw(count, scratch_length);               // length
    __ b(RuntimeAddress(short_copy_entry));

  __ BIND(L_copy_ints);
    __ tbnz(r18_elsize, 0, L_copy_longs);
    __ lea(from, Address(src, src_pos, Address::lsl(2)));  // src_addr
    __ lea(to,   Address(dst, dst_pos, Address::lsl(2)));  // dst_addr
    __ movw(count, scratch_length);               // length
    __ b(RuntimeAddress(int_copy_entry));

  __ BIND(L_copy_longs);
#ifdef ASSERT
    {
      BLOCK_COMMENT("assert long copy {");
      Label L;
      __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize
      __ cmpw(r18_elsize, LogBytesPerLong);
      __ br(Assembler::EQ, L);
      __ stop("must be long copy, but elsize is wrong");
      __ bind(L);
      BLOCK_COMMENT("} assert long copy done");
    }
#endif
    __ lea(from, Address(src, src_pos, Address::lsl(3)));  // src_addr
    __ lea(to,   Address(dst, dst_pos, Address::lsl(3)));  // dst_addr
    __ movw(count, scratch_length);               // length
    __ b(RuntimeAddress(long_copy_entry));

    // ObjArrayKlass
  __ BIND(L_objArray);
    // live at this point:  scratch_src_klass, scratch_length, src[_pos], dst[_pos]

    Label L_plain_copy, L_checkcast_copy;
    // test array classes for subtyping
    __ load_klass(r18, dst);
    __ cmp(scratch_src_klass, r18); // usual case is exact equality
    __ br(Assembler::NE, L_checkcast_copy);

    // Identically typed arrays can be copied without element-wise checks.
    arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
                           rscratch2, L_failed);

    __ lea(from, Address(src, src_pos, Address::lsl(3)));
    __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
    __ lea(to, Address(dst, dst_pos, Address::lsl(3)));
    __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
    __ movw(count, scratch_length); // length
  __ BIND(L_plain_copy);
    __ b(RuntimeAddress(oop_copy_entry));

  __ BIND(L_checkcast_copy);
    // live at this point:  scratch_src_klass, scratch_length, r18 (dst_klass)
    {
      // Before looking at dst.length, make sure dst is also an objArray.
      __ ldrw(rscratch1, Address(r18, lh_offset));
      __ movw(rscratch2, objArray_lh);
      __ eorw(rscratch1, rscratch1, rscratch2);
      __ cbnzw(rscratch1, L_failed);

      // It is safe to examine both src.length and dst.length.
      arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length,
                             r18, L_failed);

      const Register rscratch2_dst_klass = rscratch2;
      __ load_klass(rscratch2_dst_klass, dst); // reload

      // Marshal the base address arguments now, freeing registers.
      __ lea(from, Address(src, src_pos, Address::lsl(3)));
      __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
      __ lea(to, Address(dst, dst_pos, Address::lsl(3)));
      __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
      __ movw(count, length);           // length (reloaded)
      Register sco_temp = c_rarg3;      // this register is free now
      assert_different_registers(from, to, count, sco_temp,
                                 rscratch2_dst_klass, scratch_src_klass);
      // assert_clean_int(count, sco_temp);

      // Generate the type check.
// (Continuation of the checkcast path of the generic arraycopy stub.)
      const int sco_offset = in_bytes(Klass::super_check_offset_offset());
      __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
      // assert_clean_int(sco_temp, r18);
      generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy);

      // Fetch destination element klass from the ObjArrayKlass header.
      int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
      __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset));
      __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));

      // the checkcast_copy loop needs two extra arguments:
      assert(c_rarg3 == sco_temp, "#3 already in place");
      // Set up arguments for checkcast_copy_entry.
      __ mov(c_rarg4, rscratch2_dst_klass);  // dst.klass.element_klass
      __ b(RuntimeAddress(checkcast_copy_entry));
    }

  __ BIND(L_failed);
    // Signal failure to the caller with a -1 return value.
    __ mov(r0, -1);
    __ leave();   // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }

  // Generates all of the arraycopy stub variants and publishes their
  // entry points through StubRoutines.  Byte/short/int get both aligned
  // and unaligned versions; jlong and (with uncompressed oops) oop copies
  // reuse the aligned versions.
  void generate_arraycopy_stubs() {
    address entry;
    address entry_jbyte_arraycopy;
    address entry_jshort_arraycopy;
    address entry_jint_arraycopy;
    address entry_oop_arraycopy;
    address entry_jlong_arraycopy;
    address entry_checkcast_arraycopy;

    // Shared forward/backward bulk-copy loops used by the stubs below.
    generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards);
    generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards);

    //*** jbyte
    // Always need aligned and unaligned versions
    StubRoutines::_jbyte_disjoint_arraycopy         = generate_disjoint_byte_copy(false, &entry,
                                                                                  "jbyte_disjoint_arraycopy");
    StubRoutines::_jbyte_arraycopy                  = generate_conjoint_byte_copy(false, entry,
                                                                                  &entry_jbyte_arraycopy,
                                                                                  "jbyte_arraycopy");
    StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry,
                                                                                  "arrayof_jbyte_disjoint_arraycopy");
    StubRoutines::_arrayof_jbyte_arraycopy          = generate_conjoint_byte_copy(true, entry, NULL,
                                                                                  "arrayof_jbyte_arraycopy");

    //*** jshort
    // Always need aligned and unaligned versions
    StubRoutines::_jshort_disjoint_arraycopy         = generate_disjoint_short_copy(false, &entry,
                                                                                    "jshort_disjoint_arraycopy");
    StubRoutines::_jshort_arraycopy                  = generate_conjoint_short_copy(false, entry,
                                                                                    &entry_jshort_arraycopy,
                                                                                    "jshort_arraycopy");
    StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry,
                                                                                    "arrayof_jshort_disjoint_arraycopy");
    StubRoutines::_arrayof_jshort_arraycopy          = generate_conjoint_short_copy(true, entry, NULL,
                                                                                    "arrayof_jshort_arraycopy");

    //*** jint
    // Aligned versions
    StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry,
                                                                                "arrayof_jint_disjoint_arraycopy");
    StubRoutines::_arrayof_jint_arraycopy          = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy,
                                                                                "arrayof_jint_arraycopy");
    // In 64 bit we need both aligned and unaligned versions of jint arraycopy.
    // entry_jint_arraycopy always points to the unaligned version
    StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry,
                                                                        "jint_disjoint_arraycopy");
    StubRoutines::_jint_arraycopy          = generate_conjoint_int_copy(false, entry,
                                                                        &entry_jint_arraycopy,
                                                                        "jint_arraycopy");

    //*** jlong
    // It is always aligned
    StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry,
                                                                                  "arrayof_jlong_disjoint_arraycopy");
    StubRoutines::_arrayof_jlong_arraycopy          = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy,
                                                                                  "arrayof_jlong_arraycopy");
    StubRoutines::_jlong_disjoint_arraycopy         = StubRoutines::_arrayof_jlong_disjoint_arraycopy;
    StubRoutines::_jlong_arraycopy                  = StubRoutines::_arrayof_jlong_arraycopy;

    //*** oops
    {
      // With compressed oops we need unaligned versions; notice that
      // we overwrite entry_oop_arraycopy.
      bool aligned = !UseCompressedOops;

      StubRoutines::_arrayof_oop_disjoint_arraycopy
        = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy");
      StubRoutines::_arrayof_oop_arraycopy
        = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy");
      // Aligned versions without pre-barriers
      StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit
        = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit",
                                     /*dest_uninitialized*/true);
      StubRoutines::_arrayof_oop_arraycopy_uninit
        = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit",
                                     /*dest_uninitialized*/true);
    }

    StubRoutines::_oop_disjoint_arraycopy            = StubRoutines::_arrayof_oop_disjoint_arraycopy;
    StubRoutines::_oop_arraycopy                     = StubRoutines::_arrayof_oop_arraycopy;
    StubRoutines::_oop_disjoint_arraycopy_uninit     = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit;
    StubRoutines::_oop_arraycopy_uninit              = StubRoutines::_arrayof_oop_arraycopy_uninit;

    StubRoutines::_checkcast_arraycopy        = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy);
    StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL,
                                                                        /*dest_uninitialized*/true);

    StubRoutines::_unsafe_arraycopy    = generate_unsafe_copy("unsafe_arraycopy",
                                                              entry_jbyte_arraycopy);

    StubRoutines::_generic_arraycopy   = generate_generic_copy("generic_arraycopy",
                                                               entry_jbyte_arraycopy,
                                                               entry_jshort_arraycopy,
                                                               entry_jint_arraycopy,
                                                               entry_oop_arraycopy,
                                                               entry_jlong_arraycopy,
                                                               entry_checkcast_arraycopy);

  }

  // Math intrinsic stubs are not implemented on this port.
  void generate_math_stubs() { Unimplemented(); }

  // Single-block AES encryption.
  //
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //
  address generate_aescrypt_encryptBlock() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock");

    Label L_doLast;

    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register keylen      = rscratch1;

    address start = __ pc();
    __ enter();

    // Key length in ints (44/52/60) selects the number of rounds below.
    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ ld1(v0, __ T16B, from); // get 16 bytes of input

    // First 4 round keys; rev32 converts the little-endian int key
    // schedule into the byte order the AES instructions expect.
    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);
    __ rev32(v3, __ T16B, v3);
    __ rev32(v4, __ T16B, v4);
    __ aese(v0, v1);
    __ aesmc(v0, v0);
    __ aese(v0, v2);
    __ aesmc(v0, v0);
    __ aese(v0, v3);
    __ aesmc(v0, v0);
    __ aese(v0, v4);
    __
aesmc(v0, v0);

    // Next 4 round keys.
    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);
    __ rev32(v3, __ T16B, v3);
    __ rev32(v4, __ T16B, v4);
    __ aese(v0, v1);
    __ aesmc(v0, v0);
    __ aese(v0, v2);
    __ aesmc(v0, v0);
    __ aese(v0, v3);
    __ aesmc(v0, v0);
    __ aese(v0, v4);
    __ aesmc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    // keylen == 44 ints: 128-bit key, these two are the final rounds.
    __ cmpw(keylen, 44);
    __ br(Assembler::EQ, L_doLast);

    __ aese(v0, v1);
    __ aesmc(v0, v0);
    __ aese(v0, v2);
    __ aesmc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    // keylen == 52 ints: 192-bit key.
    __ cmpw(keylen, 52);
    __ br(Assembler::EQ, L_doLast);

    __ aese(v0, v1);
    __ aesmc(v0, v0);
    __ aese(v0, v2);
    __ aesmc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

  __ BIND(L_doLast);

    // Final rounds: no MixColumns after the last AESE.
    __ aese(v0, v1);
    __ aesmc(v0, v0);
    __ aese(v0, v2);

    // XOR with the last round key and store the ciphertext.
    __ ld1(v1, __ T16B, key);
    __ rev32(v1, __ T16B, v1);
    __ eor(v0, __ T16B, v0, v1);

    __ st1(v0, __ T16B, to);

    __ mov(r0, 0);

    __ leave();
    __ ret(lr);

    return start;
  }

  // Single-block AES decryption.
  //
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //
  address generate_aescrypt_decryptBlock() {
    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock");
    Label L_doLast;

    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register keylen      = rscratch1;

    address start = __ pc();
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ ld1(v0, __ T16B, from); // get 16 bytes of input

    // v5 keeps the first round key; it is XORed in at the very end.
    __ ld1(v5, __ T16B, __ post(key, 16));
    __ rev32(v5, __ T16B, v5);

    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);
    __ rev32(v3, __ T16B, v3);
    __ rev32(v4, __ T16B, v4);
    __ aesd(v0, v1);
    __ aesimc(v0, v0);
    __ aesd(v0, v2);
    __ aesimc(v0, v0);
    __ aesd(v0, v3);
    __ aesimc(v0, v0);
    __ aesd(v0, v4);
    __ aesimc(v0, v0);

    __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);
    __ rev32(v3, __ T16B, v3);
    __ rev32(v4, __ T16B, v4);
    __ aesd(v0, v1);
    __ aesimc(v0, v0);
    __ aesd(v0, v2);
    __ aesimc(v0, v0);
    __ aesd(v0, v3);
    __ aesimc(v0, v0);
    __ aesd(v0, v4);
    __ aesimc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    // 128-bit key: these are the final rounds.
    __ cmpw(keylen, 44);
    __ br(Assembler::EQ, L_doLast);

    __ aesd(v0, v1);
    __ aesimc(v0, v0);
    __ aesd(v0, v2);
    __ aesimc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

    // 192-bit key.
    __ cmpw(keylen, 52);
    __ br(Assembler::EQ, L_doLast);

    __ aesd(v0, v1);
    __ aesimc(v0, v0);
    __ aesd(v0, v2);
    __ aesimc(v0, v0);

    __ ld1(v1, v2, __ T16B, __ post(key, 32));
    __ rev32(v1, __ T16B, v1);
    __ rev32(v2, __ T16B, v2);

  __ BIND(L_doLast);

    __ aesd(v0, v1);
    __ aesimc(v0, v0);
    __ aesd(v0, v2);

    // XOR with the saved first round key (v5).
    __ eor(v0, __
T16B, v0, v5);

    __ st1(v0, __ T16B, to);

    __ mov(r0, 0);

    __ leave();
    __ ret(lr);

    return start;
  }

  // AES encryption in cipher block chaining (CBC) mode.
  //
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //   c_rarg3   - r vector byte array address
  //   c_rarg4   - input length
  //
  // Output:
  //   r0        - input length
  //
  address generate_cipherBlockChaining_encryptAESCrypt() {
    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt");

    Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;

    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
                                           // and left with the results of the last encryption block
    const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
    const Register keylen      = rscratch1;

    address start = __ pc();
    __ enter();

    // Preserve the length for the return value; len_reg is consumed below.
    __ mov(rscratch2, len_reg);
    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    __ ld1(v0, __ T16B, rvec);

    // Load the whole key schedule once into v17..v31.  NOTE: the flags
    // from this cmpw are still relied upon by the CC/EQ branches inside
    // L_aes_loop below.
    __ cmpw(keylen, 52);
    __ br(Assembler::CC, L_loadkeys_44);
    __ br(Assembler::EQ, L_loadkeys_52);

    __ ld1(v17, v18, __ T16B, __ post(key, 32));
    __ rev32(v17, __ T16B, v17);
    __ rev32(v18, __ T16B, v18);
  __ BIND(L_loadkeys_52);
    __ ld1(v19, v20, __ T16B, __ post(key, 32));
    __ rev32(v19, __ T16B, v19);
    __ rev32(v20, __ T16B, v20);
  __ BIND(L_loadkeys_44);
    __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
    __ rev32(v21, __ T16B, v21);
    __ rev32(v22, __ T16B, v22);
    __ rev32(v23, __ T16B, v23);
    __ rev32(v24, __ T16B, v24);
    __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
    __ rev32(v25, __ T16B, v25);
    __ rev32(v26, __ T16B, v26);
    __ rev32(v27, __ T16B, v27);
    __ rev32(v28, __ T16B, v28);
    __ ld1(v29, v30, v31, __ T16B, key);
    __ rev32(v29, __ T16B, v29);
    __ rev32(v30, __ T16B, v30);
    __ rev32(v31, __ T16B, v31);

  __ BIND(L_aes_loop);
    // CBC: XOR the plaintext block with the previous ciphertext (v0).
    __ ld1(v1, __ T16B, __ post(from, 16));
    __ eor(v0, __ T16B, v0, v1);

    __ br(Assembler::CC, L_rounds_44);
    __ br(Assembler::EQ, L_rounds_52);

    __ aese(v0, v17); __ aesmc(v0, v0);
    __ aese(v0, v18); __ aesmc(v0, v0);
  __ BIND(L_rounds_52);
    __ aese(v0, v19); __ aesmc(v0, v0);
    __ aese(v0, v20); __ aesmc(v0, v0);
  __ BIND(L_rounds_44);
    __ aese(v0, v21); __ aesmc(v0, v0);
    __ aese(v0, v22); __ aesmc(v0, v0);
    __ aese(v0, v23); __ aesmc(v0, v0);
    __ aese(v0, v24); __ aesmc(v0, v0);
    __ aese(v0, v25); __ aesmc(v0, v0);
    __ aese(v0, v26); __ aesmc(v0, v0);
    __ aese(v0, v27); __ aesmc(v0, v0);
    __ aese(v0, v28); __ aesmc(v0, v0);
    __ aese(v0, v29); __ aesmc(v0, v0);
    __ aese(v0, v30);
    __ eor(v0, __ T16B, v0, v31);

    __ st1(v0, __ T16B, __ post(to, 16));
    __ sub(len_reg, len_reg, 16);
    __ cbnz(len_reg, L_aes_loop);

    // Save the last ciphertext block back into rvec for chaining.
    __ st1(v0, __ T16B, rvec);

    __ mov(r0, rscratch2);

    __ leave();
    __ ret(lr);

    return start;
  }

  // AES decryption in cipher block chaining (CBC) mode.
  //
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - source byte array address
  //   c_rarg1   - destination byte array address
  //   c_rarg2   - K (key) in little endian int array
  //   c_rarg3   - r vector byte array address
  //   c_rarg4   - input length
  //
  // Output:
  //   r0        - input length
  //
  address generate_cipherBlockChaining_decryptAESCrypt() {
    assert(UseAES, "need AES instructions and misaligned SSE support");
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt");

    Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52;

    const Register from        = c_rarg0;  // source array address
    const Register to          = c_rarg1;  // destination array address
    const Register key         = c_rarg2;  // key array address
    const Register rvec        = c_rarg3;  // r byte array initialized from initvector array address
                                           // and left with the results of the last encryption block
    const Register len_reg     = c_rarg4;  // src len (must be multiple of blocksize 16)
    const Register keylen      = rscratch1;

    address start = __ pc();
    __ enter();

    // Preserve the length for the return value.
    __ mov(rscratch2, len_reg);
    __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT)));

    // v2 carries the previous ciphertext block (initially the IV).
    __ ld1(v2, __ T16B, rvec);

    // First round key is kept in v31 and XORed in at the end of each block.
    __ ld1(v31, __ T16B, __ post(key, 16));
    __ rev32(v31, __ T16B, v31);

    // Flags from this cmpw are relied upon inside L_aes_loop below.
    __ cmpw(keylen, 52);
    __ br(Assembler::CC, L_loadkeys_44);
    __ br(Assembler::EQ, L_loadkeys_52);

    __ ld1(v17, v18, __ T16B, __ post(key, 32));
    __ rev32(v17, __ T16B, v17);
    __ rev32(v18, __ T16B, v18);
  __ BIND(L_loadkeys_52);
    __ ld1(v19, v20, __ T16B, __ post(key, 32));
    __ rev32(v19, __ T16B, v19);
    __ rev32(v20, __ T16B, v20);
  __ BIND(L_loadkeys_44);
    __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64));
    __ rev32(v21, __ T16B, v21);
    __ rev32(v22, __ T16B, v22);
    __ rev32(v23, __ T16B, v23);
    __ rev32(v24, __ T16B, v24);
    __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64));
    __ rev32(v25, __ T16B, v25);
    __ rev32(v26, __ T16B, v26);
    __ rev32(v27, __ T16B, v27);
    __ rev32(v28, __ T16B, v28);
    __ ld1(v29, v30, __ T16B, key);
    __ rev32(v29, __ T16B, v29);
    __ rev32(v30, __ T16B, v30);

  __ BIND(L_aes_loop);
    __
ld1(v0, __ T16B, __ post(from, 16));
    // Keep a copy of the ciphertext block: it is the next chain value.
    __ orr(v1, __ T16B, v0, v0);

    __ br(Assembler::CC, L_rounds_44);
    __ br(Assembler::EQ, L_rounds_52);

    __ aesd(v0, v17); __ aesimc(v0, v0);
    __ aesd(v0, v18); __ aesimc(v0, v0);
  __ BIND(L_rounds_52);
    __ aesd(v0, v19); __ aesimc(v0, v0);
    __ aesd(v0, v20); __ aesimc(v0, v0);
  __ BIND(L_rounds_44);
    __ aesd(v0, v21); __ aesimc(v0, v0);
    __ aesd(v0, v22); __ aesimc(v0, v0);
    __ aesd(v0, v23); __ aesimc(v0, v0);
    __ aesd(v0, v24); __ aesimc(v0, v0);
    __ aesd(v0, v25); __ aesimc(v0, v0);
    __ aesd(v0, v26); __ aesimc(v0, v0);
    __ aesd(v0, v27); __ aesimc(v0, v0);
    __ aesd(v0, v28); __ aesimc(v0, v0);
    __ aesd(v0, v29); __ aesimc(v0, v0);
    __ aesd(v0, v30);
    __ eor(v0, __ T16B, v0, v31);
    // CBC: XOR with the previous ciphertext block (or the IV).
    __ eor(v0, __ T16B, v0, v2);

    __ st1(v0, __ T16B, __ post(to, 16));
    // The saved ciphertext block becomes the new chain value.
    __ orr(v2, __ T16B, v1, v1);

    __ sub(len_reg, len_reg, 16);
    __ cbnz(len_reg, L_aes_loop);

    __ st1(v2, __ T16B, rvec);

    __ mov(r0, rscratch2);

    __ leave();
    __ ret(lr);

    return start;
  }

  // SHA-1 block compression using the ARMv8 SHA1 instructions.
  //
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - byte[]  source+offset
  //   c_rarg1   - int[]   SHA.state
  //   c_rarg2   - int     offset
  //   c_rarg3   - int     limit
  //
  address generate_sha1_implCompress(bool multi_block, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Register buf   = c_rarg0;
    Register state = c_rarg1;
    Register ofs   = c_rarg2;
    Register limit = c_rarg3;

    Label keys;
    Label sha1_loop;

    // load the keys into v0..v3
    __ adr(rscratch1, keys);
    __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1));
    // load 5 words state into v6, v7
    __ ldrq(v6, Address(state, 0));
    __ ldrs(v7, Address(state, 16));

  __ BIND(sha1_loop);
    // load 64 bytes of data into v16..v19
    __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf);
    __ rev32(v16, __ T16B, v16);
    __ rev32(v17, __ T16B, v17);
    __ rev32(v18, __ T16B, v18);
    __ rev32(v19, __ T16B, v19);

    // do the sha1
    __ addv(v4, __ T4S, v16, v0);
    __ orr(v20, __ T16B, v6, v6);

    FloatRegister d0 = v16;
    FloatRegister d1 = v17;
    FloatRegister d2 = v18;
    FloatRegister d3 = v19;

    // 80 rounds, emitted four at a time; the message registers rotate
    // through d0..d3 each iteration.
    for (int round = 0; round < 20; round++) {
      FloatRegister tmp1 = (round & 1) ? v4 : v5;
      FloatRegister tmp2 = (round & 1) ? v21 : v22;
      FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7;
      FloatRegister tmp4 = (round & 1) ? v5 : v4;
      FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3));

      if (round < 16) __ sha1su0(d0, __ T4S, d1, d2);
      if (round < 19) __ addv(tmp1, __ T4S, d1, key);
      __ sha1h(tmp2, __ T4S, v20);
      if (round < 5)
        __ sha1c(v20, __ T4S, tmp3, tmp4);
      else if (round < 10 || round >= 15)
        __ sha1p(v20, __ T4S, tmp3, tmp4);
      else
        __ sha1m(v20, __ T4S, tmp3, tmp4);
      if (round < 16) __ sha1su1(d0, __ T4S, d3);

      tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
    }

    __ addv(v7, __ T2S, v7, v21);
    __ addv(v6, __ T4S, v6, v20);

    if (multi_block) {
      __ add(ofs, ofs, 64);
      __ cmp(ofs, limit);
      __ br(Assembler::LE, sha1_loop);
      __ mov(c_rarg0, ofs); // return ofs
    }

    __ strq(v6, Address(state, 0));
    __ strs(v7, Address(state, 16));

    __ ret(lr);

  __ bind(keys);
    // SHA-1 round constants K0..K3, read via ld4r above.
    __ emit_int32(0x5a827999);
    __ emit_int32(0x6ed9eba1);
    __ emit_int32(0x8f1bbcdc);
    __ emit_int32(0xca62c1d6);

    return start;
  }


  // SHA-256 block compression using the ARMv8 SHA256 instructions.
  //
  // Arguments:
  //
  // Inputs:
  //   c_rarg0   - byte[]  source+offset
  //   c_rarg1   - int[]   SHA.state
  //   c_rarg2   - int     offset
  //   c_rarg3   - int     limit
  //
  address
generate_sha256_implCompress(bool multi_block, const char *name) {
    // SHA-256 round constants.
    static const uint32_t round_consts[64] = {
      0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5,
      0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5,
      0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3,
      0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174,
      0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc,
      0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da,
      0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7,
      0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967,
      0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13,
      0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85,
      0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3,
      0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070,
      0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5,
      0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3,
      0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208,
      0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2,
    };
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    Register buf   = c_rarg0;
    Register state = c_rarg1;
    Register ofs   = c_rarg2;
    Register limit = c_rarg3;

    Label sha1_loop;

    // Save v8..v11; this stub uses them and restores them before return.
    __ stpd(v8, v9, __ pre(sp, -32));
    __ stpd(v10, v11, Address(sp, 16));

    // dga == v0
    // dgb == v1
    // dg0 == v2
    // dg1 == v3
    // dg2 == v4
    // t0 == v6
    // t1 == v7

    // load 16 keys to v16..v31
    __ lea(rscratch1, ExternalAddress((address)round_consts));
    __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64));
    __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64));
    __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64));
    __ ld1(v28, v29, v30, v31, __ T4S, rscratch1);

    // load 8 words (256 bits) state
    __ ldpq(v0, v1, state);

  __ BIND(sha1_loop);
    // load 64 bytes of data into v8..v11
    __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf);
    __ rev32(v8, __ T16B, v8);
    __ rev32(v9, __ T16B, v9);
    __ rev32(v10, __ T16B, v10);
    __ rev32(v11, __ T16B, v11);

    __ addv(v6, __ T4S, v8, v16);
    __ orr(v2, __ T16B, v0, v0);
    __ orr(v3, __ T16B, v1, v1);

    FloatRegister d0 = v8;
    FloatRegister d1 = v9;
    FloatRegister d2 = v10;
    FloatRegister d3 = v11;


    // 64 rounds, emitted four at a time; message registers rotate
    // through d0..d3 each iteration.
    for (int round = 0; round < 16; round++) {
      FloatRegister tmp1 = (round & 1) ? v6 : v7;
      FloatRegister tmp2 = (round & 1) ? v7 : v6;
      FloatRegister tmp3 = (round & 1) ? v2 : v4;
      FloatRegister tmp4 = (round & 1) ? v4 : v2;

      if (round < 12) __ sha256su0(d0, __ T4S, d1);
      __ orr(v4, __ T16B, v2, v2);
      if (round < 15)
        __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
      __ sha256h(v2, __ T4S, v3, tmp2);
      __ sha256h2(v3, __ T4S, v4, tmp2);
      if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);

      tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
    }

    __ addv(v0, __ T4S, v0, v2);
    __ addv(v1, __ T4S, v1, v3);

    if (multi_block) {
      __ add(ofs, ofs, 64);
      __ cmp(ofs, limit);
      __ br(Assembler::LE, sha1_loop);
      __ mov(c_rarg0, ofs); // return ofs
    }

    // Restore the saved vector registers.
    __ ldpd(v10, v11, Address(sp, 16));
    __ ldpd(v8, v9, __ post(sp, 32));

    __ stpq(v0, v1, state);

    __ ret(lr);

    return start;
  }

#ifndef BUILTIN_SIM
  // Safefetch stubs.
  void generate_safefetch(const char* name, int size, address* entry,
                          address* fault_pc, address* continuation_pc) {
    // safefetch signatures:
    //   int      SafeFetch32(int*      adr, int      errValue);
    //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
    //
    // arguments:
    //   c_rarg0 = adr
    //   c_rarg1 = errValue
    //
    // result:
    //   r0  = *adr or errValue

    StubCodeMark mark(this, "StubRoutines", name);

    // Entry point, pc or function descriptor.
*entry = __ pc();

    // Load *adr into c_rarg1, may fault.
    *fault_pc = __ pc();
    switch (size) {
      case 4:
        // int32_t
        __ ldrw(c_rarg1, Address(c_rarg0, 0));
        break;
      case 8:
        // int64_t
        __ ldr(c_rarg1, Address(c_rarg0, 0));
        break;
      default:
        ShouldNotReachHere();
    }

    // return errValue or *adr
    *continuation_pc = __ pc();
    __ mov(r0, c_rarg1);
    __ ret(lr);
  }
#endif

  /**
   *  Arguments:
   *
   * Inputs:
   *   c_rarg0   - int crc
   *   c_rarg1   - byte* buf
   *   c_rarg2   - int length
   *
   * Output:
   *   r0   - int crc result
   */
  address generate_updateBytesCRC32() {
    assert(UseCRC32Intrinsics, "what are we doing here?");

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");

    address start = __ pc();

    const Register crc    = c_rarg0;  // crc
    const Register buf    = c_rarg1;  // source java byte array address
    const Register len    = c_rarg2;  // length
    const Register table0 = c_rarg3;  // crc_table address
    const Register table1 = c_rarg4;
    const Register table2 = c_rarg5;
    const Register table3 = c_rarg6;
    const Register tmp3   = c_rarg7;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    // All of the real work is done by the macro assembler kernel.
    __ kernel_crc32(crc, buf, len,
                    table0, table1, table2, table3, rscratch1, rscratch2, tmp3);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }

  /**
   *  Arguments:
   *
   * Inputs:
   *   c_rarg0   - int crc
   *   c_rarg1   - byte* buf
   *   c_rarg2   - int length
   *   c_rarg3   - int* table
   *
   * Output:
   *   r0   - int crc result
   */
  address generate_updateBytesCRC32C() {
    assert(UseCRC32CIntrinsics, "what are we doing here?");

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");

    address start = __ pc();

    const Register crc    = c_rarg0;  // crc
    const Register buf    = c_rarg1;  // source java byte array address
    const Register len    = c_rarg2;  // length
    const Register table0 = c_rarg3;  // crc_table address
    const Register table1 = c_rarg4;
    const Register table2 = c_rarg5;
    const Register table3 = c_rarg6;
    const Register tmp3   = c_rarg7;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    __ kernel_crc32c(crc, buf, len,
                     table0, table1, table2, table3, rscratch1, rscratch2, tmp3);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }

  /***
   *  Arguments:
   *
   *  Inputs:
   *   c_rarg0   - int   adler
   *   c_rarg1   - byte* buff
   *   c_rarg2   - int   len
   *
   * Output:
   *   c_rarg0   - int adler result
   */
  address generate_updateBytesAdler32() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
    address start = __ pc();

    Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;

    // Aliases
    Register adler  = c_rarg0;
    Register s1     = c_rarg0;
    Register s2     = c_rarg3;
    Register buff   = c_rarg1;
    Register len    = c_rarg2;
    Register nmax   = r4;
    Register base   = r5;
    Register count  = r6;
    Register temp0  = rscratch1;
    Register temp1  = rscratch2;
    Register temp2  = r7;

    // Max number of bytes we can process before having to take the mod
    // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
    unsigned long BASE = 0xfff1;
    unsigned long NMAX = 0x15B0;

    __ mov(base, BASE);
    __ mov(nmax, NMAX);

    // s1 is initialized to the lower 16 bits of adler
    // s2 is
// initialized to the upper 16 bits of adler.
    __ ubfx(s2, adler, 16, 16);  // s2 = ((adler >> 16) & 0xffff)
    __ uxth(s1, adler);          // s1 = (adler & 0xffff)

    // The pipelined loop needs at least 16 elements for 1 iteration
    // It does check this, but it is more effective to skip to the cleanup loop
    __ cmp(len, 16);
    __ br(Assembler::HS, L_nmax);
    __ cbz(len, L_combine);

    // Fewer than 16 bytes: simple byte-at-a-time accumulation.
    __ bind(L_simple_by1_loop);
    __ ldrb(temp0, Address(__ post(buff, 1)));
    __ add(s1, s1, temp0);
    __ add(s2, s2, s1);
    __ subs(len, len, 1);
    __ br(Assembler::HI, L_simple_by1_loop);

    // s1 = s1 % BASE
    __ subs(temp0, s1, base);
    __ csel(s1, temp0, s1, Assembler::HS);

    // s2 = s2 % BASE
    __ lsr(temp0, s2, 16);
    __ lsl(temp1, temp0, 4);
    __ sub(temp1, temp1, temp0);
    __ add(s2, temp1, s2, ext::uxth);

    __ subs(temp0, s2, base);
    __ csel(s2, temp0, s2, Assembler::HS);

    __ b(L_combine);

    __ bind(L_nmax);
    __ subs(len, len, nmax);
    __ sub(count, nmax, 16);
    __ br(Assembler::LO, L_by16);

    // Main loop: 16 bytes per iteration, mod taken only every NMAX bytes.
    __ bind(L_nmax_loop);

    // Two 64-bit loads; individual bytes are extracted with ubfx.
    __ ldp(temp0, temp1, Address(__ post(buff, 16)));

    __ add(s1, s1, temp0, ext::uxtb);
    __ ubfx(temp2, temp0, 8, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp0, 16, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp0, 24, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp0, 32, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp0, 40, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp0, 48, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp0, Assembler::LSR, 56);
    __ add(s2, s2, s1);

    __ add(s1, s1, temp1, ext::uxtb);
    __ ubfx(temp2, temp1, 8, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp1, 16, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp1, 24, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp1, 32, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp1, 40, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp1, 48, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp1, Assembler::LSR, 56);
    __ add(s2, s2, s1);

    __ subs(count, count, 16);
    __ br(Assembler::HS, L_nmax_loop);

    // s1 = s1 % BASE
    __ lsr(temp0, s1, 16);
    __ lsl(temp1, temp0, 4);
    __ sub(temp1, temp1, temp0);
    __ add(temp1, temp1, s1, ext::uxth);

    __ lsr(temp0, temp1, 16);
    __ lsl(s1, temp0, 4);
    __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext::uxth);

    __ subs(temp0, s1, base);
    __ csel(s1, temp0, s1, Assembler::HS);

    // s2 = s2 % BASE
    __ lsr(temp0, s2, 16);
    __ lsl(temp1, temp0, 4);
    __ sub(temp1, temp1, temp0);
    __ add(temp1, temp1, s2, ext::uxth);

    __ lsr(temp0, temp1, 16);
    __ lsl(s2, temp0, 4);
    __ sub(s2, s2, temp0);
    __ add(s2, s2, temp1, ext::uxth);

    __ subs(temp0, s2, base);
    __ csel(s2, temp0, s2, Assembler::HS);

    __ subs(len, len, nmax);
    __ sub(count, nmax, 16);
    __ br(Assembler::HS, L_nmax_loop);

    __ bind(L_by16);
    __ adds(len, len, count);
    __ br(Assembler::LO, L_by1);

    // Tail loop: 16 bytes per iteration, same unrolled pattern as above.
    __ bind(L_by16_loop);

    __ ldp(temp0, temp1, Address(__ post(buff, 16)));

    __ add(s1, s1, temp0, ext::uxtb);
    __ ubfx(temp2, temp0, 8, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp0, 16, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp0, 24, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp0, 32, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp0, 40, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp0, 48, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp0, Assembler::LSR, 56);
    __ add(s2, s2, s1);

    __ add(s1, s1, temp1, ext::uxtb);
    __ ubfx(temp2, temp1, 8, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp1, 16, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp1, 24, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp1, 32, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp1, 40, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ ubfx(temp2, temp1, 48, 8);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp2);
    __ add(s2, s2, s1);
    __ add(s1, s1, temp1, Assembler::LSR, 56);
    __ add(s2, s2, s1);

    __ subs(len, len, 16);
    __ br(Assembler::HS, L_by16_loop);

    __ bind(L_by1);
    __ adds(len, len, 15);
    __ br(Assembler::LO, L_do_mod);

    // Final byte-at-a-time cleanup loop.
    __ bind(L_by1_loop);
    __ ldrb(temp0, Address(__ post(buff, 1)));
    __ add(s1, temp0, s1);
    __ add(s2, s2, s1);
    __ subs(len, len, 1);
    __ br(Assembler::HS, L_by1_loop);

    __ bind(L_do_mod);
    // s1 = s1 % BASE
    __ lsr(temp0, s1, 16);
    __ lsl(temp1, temp0, 4);
    __ sub(temp1, temp1, temp0);
    __ add(temp1, temp1, s1, ext::uxth);

    __ lsr(temp0, temp1, 16);
    __ lsl(s1, temp0, 4);
    __ sub(s1, s1, temp0);
    __ add(s1, s1, temp1, ext::uxth);

    __ subs(temp0, s1, base);
    __ csel(s1, temp0, s1, Assembler::HS);

    // s2 = s2 % BASE
    __ lsr(temp0, s2, 16);
    __ lsl(temp1, temp0, 4);
    __ sub(temp1, temp1, temp0);
    __ add(temp1, temp1, s2, ext::uxth);

    __ lsr(temp0, temp1, 16);
    __ lsl(s2, temp0, 4);
    __ sub(s2, s2, temp0);
    // Fold the high halfword of s2 back in: 2^16 == 15 (mod BASE), so
    // s2 = (s2 >> 16) * 15 + (s2 & 0xffff), where (hi << 4) - hi == hi * 15
    // was computed into temp1 above.  One conditional subtract then
    // completes s2 %= BASE.
    __ add(s2, s2, temp1, ext::uxth);

    __ subs(temp0, s2, base);
    __ csel(s2, temp0, s2, Assembler::HS);

    // Combine lower bits and higher bits
    __ bind(L_combine);
    __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)

    __ ret(lr);

    return start;
  }

  /**
   *  Arguments:
   *
   *  Input:
   *    c_rarg0   - x address
   *    c_rarg1   - x length
   *    c_rarg2   - y address
   *    c_rarg3   - y length
   *    c_rarg4   - z address
   *    c_rarg5   - z length
   */
  address generate_multiplyToLen() {
    // Stub for the multiplyToLen intrinsic (see UseMultiplyToLenIntrinsic):
    // all of the real work is done by MacroAssembler::multiply_to_len;
    // this stub only sets up the frame and the scratch register set.
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "multiplyToLen");

    address start = __ pc();
    const Register x     = r0;   // incoming argument registers
    const Register xlen  = r1;
    const Register y     = r2;
    const Register ylen  = r3;
    const Register z     = r4;
    const Register zlen  = r5;

    const Register tmp1  = r10;  // scratch registers for multiply_to_len
    const Register tmp2  = r11;
    const Register tmp3  = r12;
    const Register tmp4  = r13;
    const Register tmp5  = r14;
    const Register tmp6  = r15;
    const Register tmp7  = r16;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame
    __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }

  // Emit a 128x128 -> 256-bit carry-less (polynomial) multiply of b by a
  // using three PMULL operations (Karatsuba).  The 256-bit product is
  // left in the register pair <result_hi:result_lo>.
  void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
                      FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
                      FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
    // Karatsuba multiplication performs a 128*128 -> 256-bit
    // multiplication in three 128-bit multiplications and a few
    // additions.
    //
    // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
    // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
    //
    // Inputs:
    //
    // A0 in a.d[0]     (subkey)
    // A1 in a.d[1]
    // (A1+A0) in a1_xor_a0.d[0]
    //
    // B0 in b.d[0]     (state)
    // B1 in b.d[1]

    __ ext(tmp1, __ T16B, b, b, 0x08);               // tmp1 = B0:B1 (halves swapped)
    __ pmull2(result_hi, __ T1Q, b, a, __ T2D);      // A1*B1
    __ eor(tmp1, __ T16B, tmp1, b);                  // (B1+B0)
    __ pmull(result_lo,  __ T1Q, b, a, __ T1D);      // A0*B0
    __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)

    // Fold the middle terms of the Karatsuba expansion into tmp2.
    __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
    __ eor(tmp3, __ T16B, result_hi, result_lo);     // A1*B1+A0*B0
    __ eor(tmp2, __ T16B, tmp2, tmp4);
    __ eor(tmp2, __ T16B, tmp2, tmp3);

    // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
    __ ins(result_hi, __ D, tmp2, 0, 1);
    __ ins(result_lo, __ D, tmp2, 1, 0);
  }

  // Reduce the 256-bit product <hi:lo> modulo the GCM field polynomial,
  // leaving the 128-bit result in 'result'.  'p' holds the low-order
  // polynomial bits in both 64-bit lanes, 'z' is an all-zero vector,
  // and 't1' is a temporary.  Note: t0 aliases 'result'.
  void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
                  FloatRegister p, FloatRegister z, FloatRegister t1) {
    const FloatRegister t0 = result;

    // The GCM field polynomial f is z^128 + p(z), where p =
    // z^7+z^2+z+1.
    //
    //    z^128 === -p(z)  (mod (z^128 + p(z)))
    //
    // so, given that the product we're reducing is
    //    a == lo + hi * z^128
    // substituting,
    //      === lo - hi * p(z)  (mod (z^128 + p(z)))
    //
    // we reduce by multiplying hi by p(z) and subtracting the result
    // from (i.e. XORing it with) lo.  Because p has no nonzero high
    // bits we can do this with two 64-bit multiplications, lo*p and
    // hi*p.

    __ pmull2(t0, __ T1Q, hi, p, __ T2D);   // high half of hi * p
    __ ext(t1, __ T16B, t0, z, 8);
    __ eor(hi, __ T16B, hi, t1);
    __ ext(t1, __ T16B, z, t0, 8);
    __ eor(lo, __ T16B, lo, t1);
    __ pmull(t0, __ T1Q, hi, p, __ T1D);    // low half of hi * p
    __ eor(result, __ T16B, lo, t0);
  }

  /**
   *  Arguments:
   *
   *  Input:
   *  c_rarg0   - current state address
   *  c_rarg1   - H key address
   *  c_rarg2   - data address
   *  c_rarg3   - number of blocks
   *
   *  Output:
   *  Updated state at c_rarg0
   */
  address generate_ghash_processBlocks() {
    // Bafflingly, GCM uses little-endian for the byte order, but
    // big-endian for the bit order.  For example, the polynomial 1 is
    // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
    //
    // So, we must either reverse the bytes in each word and do
    // everything big-endian or reverse the bits in each byte and do
    // it little-endian.  On AArch64 it's more idiomatic to reverse
    // the bits in each byte (we have an instruction, RBIT, to do
    // that) and keep the data in little-endian bit order throughout the
    // calculation, bit-reversing the inputs and outputs.

    StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
    // Emit an inline 16-byte constant ahead of the entry point.
    __ align(wordSize * 2);
    address p = __ pc();
    __ emit_int64(0x87);  // The low-order bits of the field
                          // polynomial (i.e. p = z^7+z^2+z+1)
                          // repeated in the low and high parts of a
                          // 128-bit vector
    __ emit_int64(0x87);

    __ align(CodeEntryAlignment);
    address start = __ pc();

    Register state   = c_rarg0;
    Register subkeyH = c_rarg1;
    Register data    = c_rarg2;
    Register blocks  = c_rarg3;

    FloatRegister vzr = v30;
    __ eor(vzr, __ T16B, vzr, vzr); // zero register

    __ ldrq(v0, Address(state));
    __ ldrq(v1, Address(subkeyH));

    __ rev64(v0, __ T16B, v0);      // Bit-reverse words in state and subkeyH
    __ rbit(v0, __ T16B, v0);
    __ rev64(v1, __ T16B, v1);
    __ rbit(v1, __ T16B, v1);

    __ ldrq(v26, p);                // load the field-polynomial constant

    __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
    __ eor(v16, __ T16B, v16, v1);      // xor subkeyH into subkeyL (Karatsuba: (A1+A0))

    {
      Label L_ghash_loop;
      __ bind(L_ghash_loop);

      __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
                                                 // reversing each byte
      __ rbit(v2, __ T16B, v2);
      __ eor(v2, __ T16B, v0, v2);   // bit-swapped data ^ bit-swapped state

      // Multiply state in v2 by subkey in v1
      ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
                     /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
                     /*temps*/v6, v20, v18, v21);
      // Reduce v7:v5 by the field polynomial
      ghash_reduce(v0, v5, v7, v26, vzr, v20);

      __ sub(blocks, blocks, 1);
      __ cbnz(blocks, L_ghash_loop);
    }

    // The bit-reversed result is at this point in v0; undo the
    // reversal before storing it back.
    __ rev64(v1, __ T16B, v0);
    __ rbit(v1, __ T16B, v1);

    __ st1(v1, __ T16B, state);
    __ ret(lr);

    return start;
  }

  // Continuation point for throwing of implicit exceptions that are
  // not handled in the current activation. Fabricates an exception
  // oop and initiates normal exception dispatching in this
  // frame.
Since we need to preserve callee-saved values (currently
  // only for C2, but done for C1 as well) we need a callee-saved oop
  // map and therefore have to make these stubs into RuntimeStubs
  // rather than BufferBlobs.  If the compiler needs all registers to
  // be preserved between the fault point and the exception handler
  // then it must assume responsibility for that in
  // AbstractCompiler::continuation_for_implicit_null_exception or
  // continuation_for_implicit_division_by_zero_exception.  All other
  // implicit exceptions (e.g., NullPointerException or
  // AbstractMethodError on entry) are either at call sites or
  // otherwise assume that stack unwinding will be initiated, so
  // caller saved registers were assumed volatile in the compiler.

#undef __
#define __ masm->

  // Generate a RuntimeStub that calls runtime_entry (with up to two
  // optional arguments, passed after the current thread) and then
  // jumps to the shared forward_exception stub.  Returns the stub's
  // entry point.
  address generate_throw_exception(const char* name,
                                   address runtime_entry,
                                   Register arg1 = noreg,
                                   Register arg2 = noreg) {
    // Information about frame layout at time of blocking runtime call.
    // Note that we only have to preserve callee-saved registers since
    // the compilers are responsible for supplying a continuation point
    // if they expect all registers to be preserved.
    // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0
    enum layout {
      rfp_off = 0,
      rfp_off2,
      return_off,
      return_off2,
      framesize // inclusive of return address
    };

    int insts_size = 512;
    int locs_size  = 64;

    CodeBuffer code(name, insts_size, locs_size);
    OopMapSet* oop_maps  = new OopMapSet();
    MacroAssembler* masm = new MacroAssembler(&code);

    address start = __ pc();

    // This is an inlined and slightly modified version of call_VM
    // which has the ability to fetch the return PC out of
    // thread-local storage and also sets up last_Java_sp slightly
    // differently than the real call_VM

    __ enter(); // Save FP and LR before call

    assert(is_even(framesize/2), "sp not 16-byte aligned");

    // lr and fp are already in place
    __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog

    int frame_complete = __ pc() - start;

    // Set up last_Java_sp and last_Java_fp
    address the_pc = __ pc();
    __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1);

    // Call runtime: c_rarg0 is always the current thread; the caller's
    // arguments (if any) are shifted into c_rarg1/c_rarg2.
    if (arg1 != noreg) {
      assert(arg2 != c_rarg1, "clobbered");
      __ mov(c_rarg1, arg1);
    }
    if (arg2 != noreg) {
      __ mov(c_rarg2, arg2);
    }
    __ mov(c_rarg0, rthread);
    BLOCK_COMMENT("call runtime_entry");
    __ mov(rscratch1, runtime_entry);
    // NOTE(review): trailing blrt arguments look like fp-arg count and
    // call type — confirm against MacroAssembler::blrt's signature.
    __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);

    // Generate oop map
    OopMap* map = new OopMap(framesize, 0);

    oop_maps->add_gc_map(the_pc - start, map);

    __ reset_last_Java_frame(true, true);
    __ maybe_isb();

    __ leave();

    // check for pending exceptions
#ifdef ASSERT
    Label L;
    // The runtime call above must have set a pending exception.
    __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
    __ cbnz(rscratch1, L);
    __ should_not_reach_here();
    __ bind(L);
#endif // ASSERT
    __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));


    // codeBlob framesize is in words (not VMRegImpl::slot_size)
    RuntimeStub* stub =
      RuntimeStub::new_runtime_stub(name,
                                    &code,
                                    frame_complete,
                                    (framesize >> (LogBytesPerWord - LogBytesPerInt)),
                                    oop_maps, false);
    return stub->entry_point();
  }

  // Generator for the Montgomery multiply and Montgomery square
  // intrinsic stubs.  All registers are allocated in the constructor;
  // generate_multiply()/generate_square() emit the code.
  class MontgomeryMultiplyGenerator : public MacroAssembler {

    Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
      Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;

    RegSet _toSave;
    bool _squaring;

  public:
    MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
      : MacroAssembler(as->code()), _squaring(squaring) {

      // Register allocation: registers are handed out sequentially
      // starting at c_rarg0.  When squaring, a and b are the same
      // array, so Pb_base aliases Pa_base and one register is saved.

      Register reg = c_rarg0;
      Pa_base = reg;       // Argument registers
      if (squaring)
        Pb_base = Pa_base;
      else
        Pb_base = ++reg;
      Pn_base = ++reg;
      Rlen= ++reg;
      inv = ++reg;
      Pm_base = ++reg;

      // Working registers:
      Ra =  ++reg;        // The current digit of a, b, n, and m.
      Rb =  ++reg;
      Rm =  ++reg;
      Rn =  ++reg;

      Pa =  ++reg;        // Pointers to the current/next digit of a, b, n, and m.
      Pb =  ++reg;
      Pm =  ++reg;
      Pn =  ++reg;

      t0 =  ++reg;        // Three registers which form a
      t1 =  ++reg;        // triple-precision accumulator.
      t2 =  ++reg;

      Ri =  ++reg;        // Inner and outer loop indexes.
      Rj =  ++reg;

      Rhi_ab = ++reg;     // Product registers: low and high parts
      Rlo_ab = ++reg;     // of a*b and m*n.
      Rhi_mn = ++reg;
      Rlo_mn = ++reg;

      // r19 and up are callee-saved.
      _toSave = RegSet::range(r19, reg) + Pm_base;
    }

  private:
    // Save/restore the callee-saved working registers (plus Pm_base)
    // around the generated stub body.
    void save_regs() {
      push(_toSave, sp);
    }

    void restore_regs() {
      pop(_toSave, sp);
    }

    // Emit 'block' so that it executes exactly 'count' times (count may
    // be odd or zero): odd counts enter at the second copy.  The loop
    // body is unrolled twice.
    template <typename T>
    void unroll_2(Register count, T block) {
      Label loop, end, odd;
      tbnz(count, 0, odd);
      cbz(count, end);
      align(16);
      bind(loop);
      (this->*block)();
      bind(odd);
      (this->*block)();
      subs(count, count, 2);
      br(Assembler::GT, loop);
      bind(end);
    }

    // As above, but for member functions taking (d, s, tmp) arguments.
    template <typename T>
    void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
      Label loop, end, odd;
      tbnz(count, 0, odd);
      cbz(count, end);
      align(16);
      bind(loop);
      (this->*block)(d, s, tmp);
      bind(odd);
      (this->*block)(d, s, tmp);
      subs(count, count, 2);
      br(Assembler::GT, loop);
      bind(end);
    }

    // Set up pointers and load first digits for outer-loop iteration i
    // of the first phase (i < len).
    void pre1(RegisterOrConstant i) {
      block_comment("pre1");
      // Pa = Pa_base;
      // Pb = Pb_base + i;
      // Pm = Pm_base;
      // Pn = Pn_base + i;
      // Ra = *Pa;
      // Rb = *Pb;
      // Rm = *Pm;
      // Rn = *Pn;
      ldr(Ra, Address(Pa_base));
      ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
      ldr(Rm, Address(Pm_base));
      ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
      lea(Pa, Address(Pa_base));
      lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
      lea(Pm, Address(Pm_base));
      lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));

      // Zero the m*n result.
      mov(Rhi_mn, zr);
      mov(Rlo_mn, zr);
    }

    // The core multiply-accumulate step of a Montgomery
    // multiplication.  The idea is to schedule operations as a
    // pipeline so that instructions with long latencies (loads and
    // multiplies) have time to complete before their results are
    // used.  This most benefits in-order implementations of the
    // architecture but out-of-order ones also benefit.
    void step() {
      block_comment("step");
      // MACC(Ra, Rb, t0, t1, t2);
      // Ra = *++Pa;
      // Rb = *--Pb;
      umulh(Rhi_ab, Ra, Rb);
      mul(Rlo_ab, Ra, Rb);
      ldr(Ra, pre(Pa, wordSize));
      ldr(Rb, pre(Pb, -wordSize));
      acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
                                       // previous iteration.
      // MACC(Rm, Rn, t0, t1, t2);
      // Rm = *++Pm;
      // Rn = *--Pn;
      umulh(Rhi_mn, Rm, Rn);
      mul(Rlo_mn, Rm, Rn);
      ldr(Rm, pre(Pm, wordSize));
      ldr(Rn, pre(Pn, -wordSize));
      acc(Rhi_ab, Rlo_ab, t0, t1, t2);
    }

    // Close an iteration of the first outer loop: accumulate the final
    // products, compute and store m[i] = t0 * inv, and shift the
    // accumulator down one digit.
    void post1() {
      block_comment("post1");

      // MACC(Ra, Rb, t0, t1, t2);
      // Ra = *++Pa;
      // Rb = *--Pb;
      umulh(Rhi_ab, Ra, Rb);
      mul(Rlo_ab, Ra, Rb);
      acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n
      acc(Rhi_ab, Rlo_ab, t0, t1, t2);

      // *Pm = Rm = t0 * inv;
      mul(Rm, t0, inv);
      str(Rm, Address(Pm));

      // MACC(Rm, Rn, t0, t1, t2);
      // t0 = t1; t1 = t2; t2 = 0;
      umulh(Rhi_mn, Rm, Rn);

#ifndef PRODUCT
      // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
      {
        mul(Rlo_mn, Rm, Rn);
        add(Rlo_mn, t0, Rlo_mn);
        Label ok;
        cbz(Rlo_mn, ok); {
          stop("broken Montgomery multiply");
        } bind(ok);
      }
#endif
      // We have very carefully set things up so that
      // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
      // the lower half of Rm * Rn because we know the result already:
      // it must be -t0.  t0 + (-t0) must generate a carry iff
      // t0 != 0.  So, rather than do a mul and an adds we just set
      // the carry flag iff t0 is nonzero.
      //
      // mul(Rlo_mn, Rm, Rn);
      // adds(zr, t0, Rlo_mn);
      subs(zr, t0, 1); // Set carry iff t0 is nonzero
      adcs(t0, t1, Rhi_mn);
      adc(t1, t2, zr);
      mov(t2, zr);
    }

    // Set up pointers and load first digits for outer-loop iteration i
    // of the second phase (len <= i < 2*len).
    void pre2(RegisterOrConstant i, RegisterOrConstant len) {
      block_comment("pre2");
      // Pa = Pa_base + i-len;
      // Pb = Pb_base + len;
      // Pm = Pm_base + i-len;
      // Pn = Pn_base + len;

      if (i.is_register()) {
        sub(Rj, i.as_register(), len);
      } else {
        mov(Rj, i.as_constant());
        sub(Rj, Rj, len);
      }
      // Rj == i-len

      lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
      lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
      lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
      lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));

      // Ra = *++Pa;
      // Rb = *--Pb;
      // Rm = *++Pm;
      // Rn = *--Pn;
      ldr(Ra, pre(Pa, wordSize));
      ldr(Rb, pre(Pb, -wordSize));
      ldr(Rm, pre(Pm, wordSize));
      ldr(Rn, pre(Pn, -wordSize));

      mov(Rhi_mn, zr);
      mov(Rlo_mn, zr);
    }

    // Close an iteration of the second outer loop: flush the pending
    // m*n product, store the next result digit, and shift the
    // accumulator down one digit.
    void post2(RegisterOrConstant i, RegisterOrConstant len) {
      block_comment("post2");
      if (i.is_constant()) {
        mov(Rj, i.as_constant()-len.as_constant());
      } else {
        sub(Rj, i.as_register(), len);
      }

      adds(t0, t0, Rlo_mn); // The pending m*n, low part

      // As soon as we know the least significant digit of our result,
      // store it.
      // Pm_base[i-len] = t0;
      str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));

      // t0 = t1; t1 = t2; t2 = 0;
      adcs(t0, t1, Rhi_mn); // The pending m*n, high part
      adc(t1, t2, zr);
      mov(t2, zr);
    }

    // A carry in t0 after Montgomery multiplication means that we
    // should subtract multiples of n from our result in m.  We'll
    // keep doing that until there is no carry.
    void normalize(RegisterOrConstant len) {
      block_comment("normalize");
      // while (t0)
      //   t0 = sub(Pm_base, Pn_base, t0, len);
      Label loop, post, again;
      Register cnt = t1, i = t2; // Re-use registers; we're done with them now
      cbz(t0, post); {
        bind(again); {
          mov(i, zr);
          mov(cnt, len);
          ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
          ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
          subs(zr, zr, zr); // set carry flag, i.e. no borrow
          align(16);
          bind(loop); {
            sbcs(Rm, Rm, Rn);
            str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
            add(i, i, 1);
            ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
            ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
            sub(cnt, cnt, 1);
          } cbnz(cnt, loop);
          sbc(t0, t0, zr);       // propagate the final borrow into t0
        } cbnz(t0, again);
      } bind(post);
    }

    // Move memory at s to d, reversing words.
    //    Increments d to end of copied memory
    //    Destroys tmp1, tmp2
    //    Preserves len
    //    Leaves s pointing to the address which was in d at start
    void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
      assert(tmp1 < r19 && tmp2 < r19, "register corruption");

      lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
      mov(tmp1, len);
      unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
      sub(s, d, len, ext::uxtw, LogBytesPerWord);
    }
    // where
    // Copy one word from *--s to *d++, swapping its 32-bit halves.
    void reverse1(Register d, Register s, Register tmp) {
      ldr(tmp, pre(s, -wordSize));
      ror(tmp, tmp, 32);
      str(tmp, post(d, wordSize));
    }

    // step() with an extra accumulate: when squaring, each off-diagonal
    // a*b product counts twice.
    void step_squaring() {
      // An extra ACC
      step();
      acc(Rhi_ab, Rlo_ab, t0, t1, t2);
    }

    // Accumulate the diagonal product once (not doubled) when i is even.
    void last_squaring(RegisterOrConstant i) {
      Label dont;
      // if ((i & 1) == 0) {
      tbnz(i.as_register(), 0, dont); {
        // MACC(Ra, Rb, t0, t1, t2);
        // Ra = *++Pa;
        // Rb = *--Pb;
        umulh(Rhi_ab, Ra, Rb);
        mul(Rlo_ab, Ra, Rb);
        acc(Rhi_ab, Rlo_ab, t0, t1, t2);
      } bind(dont);
    }

    // The m*n half of step(): used for the tail iterations where no
    // further a*b products are needed.
    void extra_step_squaring() {
      acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n

      // MACC(Rm, Rn, t0, t1, t2);
      // Rm = *++Pm;
      // Rn = *--Pn;
      umulh(Rhi_mn, Rm, Rn);
      mul(Rlo_mn, Rm, Rn);
      ldr(Rm, pre(Pm, wordSize));
      ldr(Rn, pre(Pn, -wordSize));
    }

    // post1() variant for squaring: no final a*b accumulate is needed
    // (last_squaring has already handled the diagonal term).
    void post1_squaring() {
      acc(Rhi_mn, Rlo_mn, t0, t1, t2);  // The pending m*n

      // *Pm = Rm = t0 * inv;
      mul(Rm, t0, inv);
      str(Rm, Address(Pm));

      // MACC(Rm, Rn, t0, t1, t2);
      // t0 = t1; t1 = t2; t2 = 0;
      umulh(Rhi_mn, Rm, Rn);

#ifndef PRODUCT
      // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
      {
        mul(Rlo_mn, Rm, Rn);
        add(Rlo_mn, t0, Rlo_mn);
        Label ok;
        cbz(Rlo_mn, ok); {
          stop("broken Montgomery multiply");
        } bind(ok);
      }
#endif
      // We have very carefully set things up so that
      // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
      // the lower half of Rm * Rn because we know the result already:
      // it must be -t0.  t0 + (-t0) must generate a carry iff
      // t0 != 0.  So, rather than do a mul and an adds we just set
      // the carry flag iff t0 is nonzero.
      //
      // mul(Rlo_mn, Rm, Rn);
      // adds(zr, t0, Rlo_mn);
      subs(zr, t0, 1); // Set carry iff t0 is nonzero
      adcs(t0, t1, Rhi_mn);
      adc(t1, t2, zr);
      mov(t2, zr);
    }

    // Triple-precision accumulate: t2:t1:t0 += Rhi:Rlo (with carry
    // propagation into t2).
    void acc(Register Rhi, Register Rlo,
             Register t0, Register t1, Register t2) {
      adds(t0, t0, Rlo);
      adcs(t1, t1, Rhi);
      adc(t2, t2, zr);
    }

  public:
    /**
     * Fast Montgomery multiplication.  The derivation of the
     * algorithm is in A Cryptographic Library for the Motorola
     * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
     *
     * Arguments:
     *
     * Inputs for multiplication:
     *   c_rarg0   - int array elements a
     *   c_rarg1   - int array elements b
     *   c_rarg2   - int array elements n (the modulus)
     *   c_rarg3   - int length
     *   c_rarg4   - int inv
     *   c_rarg5   - int array elements m (the result)
     *
     * Inputs for squaring:
     *   c_rarg0   - int array elements a
     *   c_rarg1   - int array elements n (the modulus)
     *   c_rarg2   - int length
     *   c_rarg3   - int inv
     *   c_rarg4   - int array elements m (the result)
     *
     */
    address generate_multiply() {
      Label argh, nothing;
      // argh is bound before the entry point so the size check below
      // can branch backwards to it.
      bind(argh);
      stop("MontgomeryMultiply total_allocation must be <= 8192");

      align(CodeEntryAlignment);
      address entry = pc();

      cbzw(Rlen, nothing);       // len == 0: nothing to do

      enter();

      // Make room on the stack for the reversed copies of the inputs:
      // Rlen * 4 * sizeof (jint) = Rlen * 16 bytes, so Rlen <= 512
      // keeps the total allocation <= 8192 bytes.
      cmpw(Rlen, 512);
      br(Assembler::HI, argh);
      sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
      andr(sp, Ra, -2 * wordSize);   // keep sp 16-byte aligned

      lsrw(Rlen, Rlen, 1);  // length in longwords = len/2

      {
        // Copy input args, reversing as we go.  We use Ra as a
        // temporary variable.
        reverse(Ra, Pa_base, Rlen, t0, t1);
        if (!_squaring)
          reverse(Ra, Pb_base, Rlen, t0, t1);
        reverse(Ra, Pn_base, Rlen, t0, t1);
      }

      // Push all call-saved registers and also Pm_base which we'll need
      // at the end.
      save_regs();

#ifndef PRODUCT
      // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
      {
        ldr(Rn, Address(Pn_base, 0));
        mul(Rlo_mn, Rn, inv);
        cmp(Rlo_mn, -1);
        Label ok;
        br(EQ, ok); {
          stop("broken inverse in Montgomery multiply");
        } bind(ok);
      }
#endif

      // Ra still points at the on-stack reversed copy; from here on it
      // is the working Pm_base.
      mov(Pm_base, Ra);

      mov(t0, zr);
      mov(t1, zr);
      mov(t2, zr);

      block_comment("for (int i = 0; i < len; i++) {");
      mov(Ri, zr); {
        Label loop, end;
        cmpw(Ri, Rlen);
        br(Assembler::GE, end);

        bind(loop);
        pre1(Ri);

        block_comment("  for (j = i; j; j--) {"); {
          movw(Rj, Ri);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
        } block_comment("  } // j");

        post1();
        addw(Ri, Ri, 1);
        cmpw(Ri, Rlen);
        br(Assembler::LT, loop);
        bind(end);
        block_comment("} // i");
      }

      block_comment("for (int i = len; i < 2*len; i++) {");
      mov(Ri, Rlen); {
        Label loop, end;
        cmpw(Ri, Rlen, Assembler::LSL, 1);
        br(Assembler::GE, end);

        bind(loop);
        pre2(Ri, Rlen);

        block_comment("  for (j = len*2-i-1; j; j--) {"); {
          lslw(Rj, Rlen, 1);
          subw(Rj, Rj, Ri);
          subw(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
        } block_comment("  } // j");

        post2(Ri, Rlen);
        addw(Ri, Ri, 1);
        cmpw(Ri, Rlen, Assembler::LSL, 1);
        br(Assembler::LT, loop);
        bind(end);
      }
      block_comment("} // i");

      normalize(Rlen);

      mov(Ra, Pm_base);  // Save Pm_base in Ra
      restore_regs();    // Restore caller's Pm_base

      // Copy our result into caller's Pm_base
      reverse(Pm_base, Ra, Rlen, t0, t1);

      leave();
      bind(nothing);
      ret(lr);

      return entry;
    }
    // In C, approximately:

    // void
    // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
    //                     unsigned long Pn_base[], unsigned long Pm_base[],
    //                     unsigned long inv, int len) {
    //   unsigned long t0 = 0, t1 = 0, t2 = 0;  // Triple-precision accumulator
    //   unsigned long *Pa, *Pb, *Pn, *Pm;
    //   unsigned long Ra, Rb, Rn, Rm;

    //   int i;

    //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");

    //   for (i = 0; i < len; i++) {
    //     int j;

    //     Pa = Pa_base;
    //     Pb = Pb_base + i;
    //     Pm = Pm_base;
    //     Pn = Pn_base + i;

    //     Ra = *Pa;
    //     Rb = *Pb;
    //     Rm = *Pm;
    //     Rn = *Pn;

    //     int iters = i;
    //     for (j = 0; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
    //       MACC(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }

    //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
    //     MACC(Ra, Rb, t0, t1, t2);
    //     *Pm = Rm = t0 * inv;
    //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
    //     MACC(Rm, Rn, t0, t1, t2);

    //     assert(t0 == 0, "broken Montgomery multiply");

    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   for (i = len; i < 2*len; i++) {
    //     int j;

    //     Pa = Pa_base + i-len;
    //     Pb = Pb_base + len;
    //     Pm = Pm_base + i-len;
    //     Pn = Pn_base + len;

    //     Ra = *++Pa;
    //     Rb = *--Pb;
    //     Rm = *++Pm;
    //     Rn = *--Pn;

    //     int iters = len*2-i-1;
    //     for (j = i-len+1; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
    //       MACC(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }

    //     Pm_base[i-len] = t0;
    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   while (t0)
    //     t0 = sub(Pm_base, Pn_base, t0, len);
    // }

    /**
     * Fast Montgomery squaring.  This uses asymptotically 25% fewer
     * multiplies than Montgomery multiplication so it should be up to
     * 25% faster.  However, its loop control is more complex and it
     * may actually run slower on some machines.
     *
     * Arguments:
     *
     * Inputs:
     *   c_rarg0   - int array elements a
     *   c_rarg1   - int array elements n (the modulus)
     *   c_rarg2   - int length
     *   c_rarg3   - int inv
     *   c_rarg4   - int array elements m (the result)
     *
     */
    address generate_square() {
      Label argh;
      // argh is bound before the entry point so the size check below
      // can branch backwards to it.
      bind(argh);
      stop("MontgomeryMultiply total_allocation must be <= 8192");

      align(CodeEntryAlignment);
      address entry = pc();

      enter();

      // Make room: Rlen * 16 bytes of stack; Rlen <= 512 keeps the
      // total allocation <= 8192 bytes.
      cmpw(Rlen, 512);
      br(Assembler::HI, argh);
      sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
      andr(sp, Ra, -2 * wordSize);   // keep sp 16-byte aligned

      lsrw(Rlen, Rlen, 1);  // length in longwords = len/2

      {
        // Copy input args, reversing as we go.  We use Ra as a
        // temporary variable.
        reverse(Ra, Pa_base, Rlen, t0, t1);
        reverse(Ra, Pn_base, Rlen, t0, t1);
      }

      // Push all call-saved registers and also Pm_base which we'll need
      // at the end.
      save_regs();

      mov(Pm_base, Ra);

      mov(t0, zr);
      mov(t1, zr);
      mov(t2, zr);

      block_comment("for (int i = 0; i < len; i++) {");
      mov(Ri, zr); {
        Label loop, end;
        bind(loop);
        cmp(Ri, Rlen);
        br(Assembler::GE, end);

        pre1(Ri);

        block_comment("for (j = (i+1)/2; j; j--) {"); {
          add(Rj, Ri, 1);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
        } block_comment("  } // j");

        last_squaring(Ri);

        block_comment("  for (j = i/2; j; j--) {"); {
          lsr(Rj, Ri, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
        } block_comment("  } // j");

        post1_squaring();
        add(Ri, Ri, 1);
        cmp(Ri, Rlen);
        br(Assembler::LT, loop);

        bind(end);
        block_comment("} // i");
      }

      block_comment("for (int i = len; i < 2*len; i++) {");
      mov(Ri, Rlen); {
        Label loop, end;
        bind(loop);
        cmp(Ri, Rlen, Assembler::LSL, 1);
        br(Assembler::GE, end);

        pre2(Ri, Rlen);

        block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
          lsl(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          sub(Rj, Rj, 1);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
        } block_comment("  } // j");

        last_squaring(Ri);

        block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
          lsl(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
        } block_comment("  } // j");

        post2(Ri, Rlen);
        add(Ri, Ri, 1);
        cmp(Ri, Rlen, Assembler::LSL, 1);

        br(Assembler::LT, loop);
        bind(end);
        block_comment("} // i");
      }

      normalize(Rlen);

      mov(Ra, Pm_base);  // Save Pm_base in Ra
      restore_regs();    // Restore caller's Pm_base

      // Copy our result into caller's Pm_base
      reverse(Pm_base, Ra, Rlen, t0, t1);

      leave();
      ret(lr);

      return entry;
    }
    // In C, approximately:

    // void
    // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
    //                   unsigned long Pm_base[], unsigned long inv, int len) {
    //   unsigned long t0 = 0, t1 = 0, t2 = 0;  // Triple-precision accumulator
    //   unsigned long *Pa, *Pb, *Pn, *Pm;
    //   unsigned long Ra, Rb, Rn, Rm;

    //   int i;

    //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");

    //   for (i = 0; i < len; i++) {
    //     int j;

    //     Pa = Pa_base;
    //     Pb = Pa_base + i;
    //     Pm = Pm_base;
    //     Pn = Pn_base + i;

    //     Ra = *Pa;
    //     Rb = *Pb;
    //     Rm = *Pm;
    //     Rn = *Pn;

    //     int iters = (i+1)/2;
    //     for (j = 0; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
    //       MACC2(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     if ((i & 1) == 0) {
    //       assert(Ra == Pa_base[j], "must be");
    //       MACC(Ra, Ra, t0, t1, t2);
    //     }
    //     iters = i/2;
    //     assert(iters == i-j, "must be");
    //     for (; iters--; j++) {
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }

    //     *Pm = Rm = t0 * inv;
    //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
    //     MACC(Rm, Rn, t0, t1, t2);

    //     assert(t0 == 0, "broken Montgomery multiply");

    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   for (i = len; i < 2*len; i++) {
    //     int start = i-len+1;
    //     int end = start + (len - start)/2;
    //     int j;

    //     Pa = Pa_base + i-len;
    //     Pb = Pa_base + len;
    //     Pm = Pm_base + i-len;
    //     Pn = Pn_base + len;

    //     Ra = *++Pa;
    //     Rb = *--Pb;
  //     Rm = *++Pm;
  //     Rn = *--Pn;

  //     int iters = (2*len-i-1)/2;
  //     assert(iters == end-start, "must be");
  //     for (j = start; iters--; j++) {
  //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
  //       MACC2(Ra, Rb, t0, t1, t2);
  //       Ra = *++Pa;
  //       Rb = *--Pb;
  //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
  //       MACC(Rm, Rn, t0, t1, t2);
  //       Rm = *++Pm;
  //       Rn = *--Pn;
  //     }
  //     if ((i & 1) == 0) {
  //       assert(Ra == Pa_base[j], "must be");
  //       MACC(Ra, Ra, t0, t1, t2);
  //     }
  //     iters = (2*len-i)/2;
  //     assert(iters == len-j, "must be");
  //     for (; iters--; j++) {
  //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
  //       MACC(Rm, Rn, t0, t1, t2);
  //       Rm = *++Pm;
  //       Rn = *--Pn;
  //     }
  //     Pm_base[i-len] = t0;
  //     t0 = t1; t1 = t2; t2 = 0;
  //   }

  //   // Subtract the modulus until the accumulated carry is gone.
  //   while (t0)
  //     t0 = sub(Pm_base, Pn_base, t0, len);
  // }
};  // closes the Montgomery multiply/square generator helper class declared above

  // Initialization
  //
  // Emits the small set of stubs needed early in VM startup: the
  // forward-exception entry, the call stub, the catch-exception entry,
  // the StackOverflowError thrower, and (optionally) the CRC32 stub.
  // The remaining stubs are produced later by generate_all().
  void generate_initial() {
    // Generate initial stubs and initializes the entry points

    // Entry points that exist on all platforms. Note: this is code
    // that could be shared among different platforms - however the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also comment in
    // stubRoutines.hpp.

    StubRoutines::_forward_exception_entry = generate_forward_exception();

    StubRoutines::_call_stub_entry =
      generate_call_stub(StubRoutines::_call_stub_return_address);

    // is referenced by megamorphic call
    StubRoutines::_catch_exception_entry = generate_catch_exception();

    // Build this early so it's available for the interpreter.
    StubRoutines::_throw_StackOverflowError_entry =
      generate_throw_exception("StackOverflowError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_StackOverflowError));
    if (UseCRC32Intrinsics) {
      // Set the table address BEFORE generating the stub, which reads it.
      StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
    }
  }

  // Emits the remaining stubs: the verify_oop subroutine, the
  // exception-throwing entries, the arraycopy stubs, and the intrinsic
  // stubs selected by the Use*Intrinsic flags. Runs on the second
  // (all == true) generation pass — see the constructor below.
  void generate_all() {
    // support for verify_oop (must happen after universe_init)
    StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
    StubRoutines::_throw_AbstractMethodError_entry =
      generate_throw_exception("AbstractMethodError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_AbstractMethodError));

    StubRoutines::_throw_IncompatibleClassChangeError_entry =
      generate_throw_exception("IncompatibleClassChangeError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_IncompatibleClassChangeError));

    StubRoutines::_throw_NullPointerException_at_call_entry =
      generate_throw_exception("NullPointerException at call throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_NullPointerException_at_call));

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }

    if (UseMontgomeryMultiplyIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }

    if (UseMontgomerySquareIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      // We use generate_multiply() rather than generate_square()
      // because it's faster for the sizes of modulus we care about.
      StubRoutines::_montgomerySquare = g.generate_multiply();
    }

    // The crypto, CRC32C, Adler32 and SafeFetch stubs below are not
    // built when targeting the builtin AArch64 simulator.
#ifndef BUILTIN_SIM
    // generate GHASH intrinsics code
    if (UseGHASHIntrinsics) {
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
    }

    if (UseAESIntrinsics) {
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
    }

    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress");
      StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
    }

    if (UseCRC32CIntrinsics) {
      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
    }

    // generate Adler32 intrinsics code
    if (UseAdler32Intrinsics) {
      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
    }

    // SafeFetch stubs. The entry, fault and continuation PCs are
    // registered in StubRoutines; presumably the fault/continuation
    // pair lets the signal handler resume a faulting SafeFetch —
    // NOTE(review): confirm against the shared signal-handling code.
    generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
                       &StubRoutines::_safefetch32_fault_pc,
                       &StubRoutines::_safefetch32_continuation_pc);
    generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
                       &StubRoutines::_safefetchN_fault_pc,
                       &StubRoutines::_safefetchN_continuation_pc);
#endif
  }

 public:
  // Two-phase generation: with all == false only the early-startup
  // stubs are emitted (generate_initial); with all == true the full
  // remaining set is emitted (generate_all).
  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
    if (all) {
      generate_all();
    } else {
      generate_initial();
    }
  }
}; // end class declaration

// VM entry point: fills `code` with the platform stubs; `all` selects
// between the initial and the complete stub set (see the constructor).
void StubGenerator_generate(CodeBuffer* code, bool all) {
  StubGenerator g(code, all);
}