/*
 * Copyright (c) 2003, 2018, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetCodeGen.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "utilities/align.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif

#ifdef BUILTIN_SIM
#include "../../../../../../simulator/simulator.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->
#define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
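// n.b. TIMES_OOP scales an array index by the in-heap oop size: 4
// bytes when UseCompressedOops is set, 8 bytes otherwise, applied as
// a sign-extended scaled-register addressing mode.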
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(int& counter) {
    __ lea(rscratch2, ExternalAddress((address)&counter));
    __ ldrw(rscratch1, Address(rscratch2));
    __ addw(rscratch1, rscratch1, 1);
    __ strw(rscratch1, Address(rscratch2));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address              address
  //    c_rarg1:   result                            address
  //    c_rarg2:   result type                       BasicType
  //    c_rarg3:   method                            Method*
  //    c_rarg4:   (interpreter) entry point         address
  //    c_rarg5:   parameters                        intptr_t*
  //    c_rarg6:   parameter size (in words)         int
  //    c_rarg7:   thread                            Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
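  //
  // For reference, a C-level sketch of the signature this stub is
  // called through (parameter names here are illustrative, not the
  // declared CallStub typedef):
  //
  //   void call_stub(address call_wrapper, address result,
  //                  BasicType result_type, Method* method,
  //                  address entry_point, intptr_t* parameters,
  //                  int parameter_size_in_words, Thread* thread)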
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the C arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-r18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -27 [ argument word 1      ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper    (r0) ]
  //  -7 [ result          (r1) ]
  //  -6 [ result type     (r2) ]
  //  -5 [ method          (r3) ]
  //  -4 [ entry point     (r4) ]
  //  -3 [ parameters      (r5) ]
  //  -2 [ parameter size  (r6) ]
  //  -1 [ thread          (r7) ]
  //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  //   1 [ saved lr       (r30) ]

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off  = -26,

    d15_off            = -26,
    d13_off            = -24,
    d11_off            = -22,
    d9_off             = -20,

    r28_off            = -18,
    r26_off            = -16,
    r24_off            = -14,
    r22_off            = -12,
    r20_off            = -10,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0,
    retaddr_off        =   1,
  };

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    const Address sp_after_call(rfp, sp_after_call_off * wordSize);

    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);

    // stub code

    // we need a C prolog to bootstrap the x86 caller into the sim
    __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);

    address aarch64_entry = __ pc();

#ifdef BUILTIN_SIM
    // Save sender's SP for stack traces.
    __ mov(rscratch1, sp);
    __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
#endif
    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);
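    // n.b. with sp_after_call_off == -26 and wordSize == 8 the sub
    // above computes sp = fp - 208, i.e. sp now addresses the saved
    // v15 slot labelled sp_after_call in the layout diagram.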
    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5,  entry_point);
    __ stp(c_rarg2, c_rarg3,  result_type);
    __ stp(c_rarg0, c_rarg1,  call_wrapper);

    __ stp(r20, r19,   r20_save);
    __ stp(r22, r21,   r22_save);
    __ stp(r24, r23,   r24_save);
    __ stp(r26, r25,   r26_save);
    __ stp(r28, r27,   r28_save);

    __ stpd(v9,  v8,   d9_save);
    __ stpd(v11, v10,  d11_save);
    __ stpd(v13, v12,  d13_save);
    __ stpd(v15, v14,  d15_save);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (unsigned)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing Method* and current sp
    //      rmethod: Method*
    //      r13: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r13, sp);
    __ blr(c_rarg4);

    // tell the simulator we have returned to the stub

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    if (NotifySimulator) {
      __ notify(Assembler::method_reentry);
    }
    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
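    //
    // in C-like pseudocode the dispatch below amounts to:
    //
    //   switch (result_type) {
    //   case T_OBJECT: case T_LONG: *(jlong*)result   = r0;      break;
    //   case T_FLOAT:               *(jfloat*)result  = j_farg0; break;
    //   case T_DOUBLE:              *(jdouble*)result = j_farg0; break;
    //   default /* T_INT */:        *(jint*)result    = r0;      break;
    //   }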
    __ ldr(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ldr(j_rarg1, result_type);
    __ cmp(j_rarg1, T_OBJECT);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(j_rarg1, T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    // restore callee-save registers
    __ ldpd(v15, v14,   d15_save);
    __ ldpd(v13, v12,   d13_save);
    __ ldpd(v11, v10,   d11_save);
    __ ldpd(v9,  v8,    d9_save);

    __ ldp(r28, r27,   r28_save);
    __ ldp(r26, r25,   r26_save);
    __ ldp(r24, r23,   r24_save);
    __ ldp(r22, r21,   r22_save);
    __ ldp(r20, r19,   r20_save);

    __ ldp(c_rarg0, c_rarg1,  call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3,  method);
    __ ldp(c_rarg4, c_rarg5,  entry_point);
    __ ldp(c_rarg6, c_rarg7,  parameter_size);

#ifndef PRODUCT
    // tell the simulator we are about to end Java execution
    if (NotifySimulator) {
      __ notify(Assembler::method_exit);
    }
#endif
    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT

    __ BIND(is_long);
    __ str(r0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // rsp.
  //
  // r0: exception oop

  // NOTE: this is used as a target from the signal handler so it
  // needs an x86 prolog which returns into the current simulator
  // executing the generated catch_exception code. so the prolog
  // needs to install rax in a sim register and adjust the sim's
  // restart pc to enter the generated code at the start position
  // then return from native to simulated execution.
  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread        (rfp, thread_off         * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
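    // for reference, the C++ runtime entry called below has this
    // shape (declared in sharedRuntime.hpp):
    //
    //   address exception_handler_for_return_address(JavaThread* thread,
    //                                                address return_address)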
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler pushes its own
    // frame and then calls into the VM, and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);
    __ mov(r19, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {

    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is NULL it is OK
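    // the mask/bits test below implements, in effect,
    //
    //   if ((obj & Universe::verify_oop_mask()) != Universe::verify_oop_bits())
    //     goto error;
    //
    // without touching the flags register, which is live at this point.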
    // Check if the oop is in the right area of memory
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
    __ andr(c_rarg2, r0, c_rarg3);
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());

    // Compare c_rarg2 and c_rarg3.  We don't use a compare
    // instruction here because the flags register is live.
    __ eor(c_rarg2, c_rarg2, c_rarg3);
    __ cbnz(c_rarg2, error);

    // make sure klass is 'reasonable', which is not zero.
    __ load_klass(r0, r0); // get klass
    __ cbz(r0, error);     // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blrt(rscratch1, 3, 0, 1);

    return start;
  }

  void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }

  // The inner part of zero_words().  This is the bulk operation,
  // zeroing words in blocks, possibly using DC ZVA to do it.  The
  // caller is responsible for zeroing the last few words.
  //
  // Inputs:
  // r10: the HeapWord-aligned base address of an array to zero.
  // r11: the count in HeapWords, r11 > 0.
  //
  // Returns r10 and r11, adjusted for the caller to clear.
  // r10: the base address of the tail of words left to clear.
  // r11: the number of words in the tail.
  //      r11 < MacroAssembler::zero_words_block_size.

  address generate_zero_blocks() {
    Label store_pair, loop_store_pair, done;
    Label base_aligned;

    Register base = r10, cnt = r11;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "zero_blocks");
    address start = __ pc();

    if (UseBlockZeroing) {
      int zva_length = VM_Version::zva_length();

      // Ensure ZVA length can be divided by 16. This is required by
      // the subsequent operations.
      assert (zva_length % 16 == 0, "Unexpected ZVA Length");

      __ tbz(base, 3, base_aligned);
      __ str(zr, Address(__ post(base, 8)));
      __ sub(cnt, cnt, 1);
      __ bind(base_aligned);

      // Ensure count >= zva_length * 2 so that it still deserves a zva after
      // alignment.
      Label small;
      int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
      __ subs(rscratch1, cnt, low_limit >> 3);
      __ br(Assembler::LT, small);
      __ zero_dcache_blocks(base, cnt);
      __ bind(small);
    }

    {
      // Number of stp instructions we'll unroll
      const int unroll =
        MacroAssembler::zero_words_block_size / 2;
      // Clear the remaining blocks.
      Label loop;
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::LT, done);
      __ bind(loop);
      for (int i = 0; i < unroll; i++)
        __ stp(zr, zr, __ post(base, 16));
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::GE, loop);
      __ bind(done);
      __ add(cnt, cnt, unroll * 2);
    }

    __ ret(lr);

    return start;
  }
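  // Worked example for the UseBlockZeroing path above, assuming a
  // 64-byte ZVA block size: low_limit is MAX2(128, BlockZeroingLowLimit)
  // bytes, and low_limit >> 3 converts that to a word count before the
  // comparison with cnt, so DC ZVA is only attempted when at least two
  // ZVA blocks' worth of words remain after base alignment.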
  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(Label &start, Register s, Register d, Register count,
                           copy_direction direction) {
    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4 : 2) * wordSize;

    int offset;
    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r10, t6 = r11, t7 = r12;
    const Register stride = r13;

    assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1);

    Label again, drain;
    const char *stub_name;
    if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
    else
      stub_name = "backward_copy_longs";
    StubCodeMark mark(this, "StubRoutines", stub_name);
    __ align(CodeEntryAlignment);
    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, 8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }
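    // The copy loop below is software-pipelined: each iteration
    // stores the eight words fetched by the previous group of loads
    // and immediately reloads the next eight, so the final stores are
    // left over for the drain block after the loop exits.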
    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
        __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
      } else {
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        __ stp(t0, t1, Address(d, 2 * unit));
        __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.

      if (direction == copy_forwards) {
        __ sub(s, s, 16);
        __ sub(d, d, 8);
      }

      // Fill 8 registers
      //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 byte block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
      //
      // t0 at offset 0,  t1 at offset 8
      // t2 at offset 16, t3 at offset 24
      // t4 at offset 32, t5 at offset 40
      // t6 at offset 48, t7 at offset 56

      // for backwards copy s was not offset so the register contents
      // are at these offsets into the preceding 64 byte block
      // relative to that original input and so on for each successive
      // preceding 64 byte block when s is updated. this explains the
      // slightly counter-intuitive looking pattern of register usage
      // in the stp instructions for backwards copy.
      //
      // t0 at offset -16, t1 at offset -8
      // t2 at offset -32, t3 at offset -24
      // t4 at offset -48, t5 at offset -40
      // t6 at offset -64, t7 at offset -56

      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));

      __ subs(count, count, 16);
      __ br(Assembler::LO, drain);

      int prefetch = PrefetchCopyIntervalInBytes;
      bool use_stride = false;
      if (direction == copy_backwards) {
        use_stride = prefetch > 256;
        prefetch = -prefetch;
        if (use_stride) __ mov(stride, prefetch);
      }
      __ bind(again);

      if (PrefetchCopyIntervalInBytes > 0)
        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
        // allowing for the offset of -8 the store instructions place
        // registers into the target 64 byte block at the following
        // offsets
        //
        // t0 at offset 0
        // t1 at offset 8,  t2 at offset 16
        // t3 at offset 24, t4 at offset 32
        // t5 at offset 40, t6 at offset 48
        // t7 at offset 56

        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
        // d was not offset when we started so the registers are
        // written into the 64 byte block preceding d with the following
        // offsets
        //
        // t1 at offset -8
        // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
        // t7 at offset -56, t4 at offset -48
        // t6 at offset -64
        //
        // note that this matches the offsets previously noted for the
        // loads

        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
      } else {
        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
      }
      // now we need to copy any remaining part block which may
      // include a 4 word block subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ stp(t1, t2, Address(d, 2 * unit));
          __ str(t3, Address(__ pre(d, 4 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ stp(t3, t0, Address(d, 3 * unit));
          __ str(t2, Address(__ pre(d, 4 * unit)));
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ str(t1, Address(__ pre(d, 2 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ str(t0, Address(__ pre(d, 2 * unit)));
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;
    int unit = wordSize * direction;

    Label Lpair, Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.
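    // e.g. a byte copy (granularity == 1) with count == 13 == 0b1101
    // takes the word, int and byte moves below and skips the short
    // move: 8 + 4 + 1 == 13 bytes.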
    __ tbz(count, 3 - exact_log2(granularity), Lword);
    __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
    __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }

  Label copy_f, copy_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //

  void copy_memory(bool is_aligned, Register s, Register d,
                   Register count, Register tmp, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    // <= 96 bytes do inline. Direction doesn't matter because we always
    // load all the data before writing anything
    Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
    const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
    const Register send = r17, dend = r18;

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
    __ cmp(count, (UseSIMDForMemoryOps ? 96 : 80)/granularity);
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, 16/granularity);
    __ br(Assembler::LS, copy16);

    __ cmp(count, 64/granularity);
    __ br(Assembler::HI, copy80);

    __ cmp(count, 32/granularity);
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 0));
      __ ldpq(v2, v3, Address(send, -32));
      __ stpq(v0, v1, Address(d, 0));
      __ stpq(v2, v3, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(send, -32));
      __ ldp(t6, t7, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(dend, -32));
      __ stp(t6, t7, Address(dend, -16));
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    __ ldp(t0, t1, Address(s, 0));
    __ ldp(t2, t3, Address(send, -16));
    __ stp(t0, t1, Address(d, 0));
    __ stp(t2, t3, Address(dend, -16));
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
      __ ldpq(v4, v5, Address(send, -32));
      __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
      __ stpq(v4, v5, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(s, 32));
      __ ldp(t6, t7, Address(s, 48));
      __ ldp(t8, t9, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(d, 32));
      __ stp(t6, t7, Address(d, 48));
      __ stp(t8, t9, Address(dend, -16));
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, 8/granularity);
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    __ ldr(t0, Address(s, 0));
    __ ldr(t1, Address(send, -8));
    __ str(t0, Address(d, 0));
    __ str(t1, Address(dend, -8));
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean in the 1 byte case we load/store the same
          // byte 3 times.
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }

    __ bind(copy_big);
    if (is_backwards) {
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now we've got the small case out of the way, we can align the
    // source address on a 2-word boundary.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
      __ tbz(s, exact_log2(wordSize), aligned);
      __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
      __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
      __ sub(count, count, wordSize/granularity);
    } else {
      if (is_backwards) {
        __ andr(rscratch2, s, 2 * wordSize - 1);
      } else {
        __ neg(rscratch2, s);
        __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
      }
      // rscratch2 is the byte adjustment needed to align s.
      __ cbz(rscratch2, aligned);
      int shift = exact_log2(granularity);
      if (shift)  __ lsr(rscratch2, rscratch2, shift);
      __ sub(count, count, rscratch2);

#if 0
      // ?? This code is only correct for a disjoint copy.  It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, rscratch2);
        __ sub(d, d, rscratch2);
      } else {
        __ add(s, s, rscratch2);
        __ add(d, d, rscratch2);
      }
#else
      copy_memory_small(s, d, rscratch2, rscratch1, step);
#endif
    }

    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes.  Adjust the
    // count and do a bulk copy of words.
    __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
    if (direction == copy_forwards)
      __ bl(copy_f);
    else
      __ bl(copy_b);

    // And the tail.
    copy_memory_small(s, d, count, tmp, step);

    if (granularity >= 8) __ bind(copy8);
    if (granularity >= 4) __ bind(copy4);
    __ bind(finish);
  }


  void clobber_registers() {
#ifdef ASSERT
    __ mov(rscratch1, (uint64_t)0xdeadbeef);
    __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
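    // rscratch1 now holds the 64-bit sentinel 0xdeadbeefdeadbeef,
    // which the loop below copies into every clobberable register.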
    for (Register r = r3; r <= r18; r++)
      if (r != rscratch1) __ mov(r, rscratch1);
#endif
  }

  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers rscratch1 and rscratch2.
  void verify_oop_array (size_t size, Register a, Register count, Register temp) {
    Label loop, end;
    __ mov(rscratch1, a);
    __ mov(rscratch2, zr);
    __ bind(loop);
    __ cmp(rscratch2, count);
    __ br(Assembler::HS, end);
    if (size == (size_t)wordSize) {
      __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ verify_oop(temp);
    } else {
      __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ decode_heap_oop(temp); // calls verify_oop
    }
    __ add(rscratch2, rscratch2, size);
    __ b(loop);
    __ bind(end);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
                                 const char *name, bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_reg = RegSet::of(s, d, count);
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    BarrierSetCodeGen *bs = Universe::heap()->barrier_set()->code_gen();
    DecoratorSet decorators = ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= AS_DEST_NOT_INITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_reg);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    copy_memory(aligned, s, d, count, rscratch1, size);

    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
    }

    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());

    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
                                 address *entry, const char *name,
                                 bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_regs = RegSet::of(s, d, count);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }
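    // A forward copy is also safe when d precedes s: in that case the
    // unsigned difference d - s wraps to a huge value, so the HS
    // branch below routes that overlapping-but-safe case to the
    // no-overlap (forward) entry as well.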
    // use fwd copy when (d-s) above_equal (count*size)
    __ sub(rscratch1, d, s);
    __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
    __ br(Assembler::HS, nooverlap_target);

    BarrierSetCodeGen *bs = Universe::heap()->barrier_set()->code_gen();
    DecoratorSet decorators = 0;
    if (dest_uninitialized) {
      decorators |= AS_DEST_NOT_INITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }
    bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_regs);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    copy_memory(aligned, s, d, count, rscratch1, -size);
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
    }
    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
  //
  address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                      address* entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_short_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_short_copy().
  //
  address generate_disjoint_short_copy(bool aligned,
                                       address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
                                       address *entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
  }
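  // Like the byte and short entry points above, the int, long and oop
  // copies below are thin wrappers around generate_disjoint_copy() and
  // generate_conjoint_copy(); only the element size and the is_oop
  // flag vary.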
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_int_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
                                     address *entry, const char *name,
                                     bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
  }


  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_long_copy(bool aligned, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_long_copy(bool aligned,
                                      address nooverlap_target, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_oop_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_oop_copy(bool aligned,
                                     address nooverlap_target, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
                                  name, dest_uninitialized);
  }


  // Helper for generating a dynamic type check.
  // Smashes rscratch1.
  void generate_type_check(Register sub_klass,
                           Register super_check_offset,
                           Register super_klass,
                           Label& L_success) {
    assert_different_registers(sub_klass, super_check_offset, super_klass);

    BLOCK_COMMENT("type_check:");

    Label L_miss;

    __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL,
                                     super_check_offset);
    __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);

    // Fall through on failure!
    __ BIND(L_miss);
  }

  //
  // Generate checkcasting array copy stub
  //
  // Input:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //   c_rarg3   - size_t ckoff (super_check_offset)
  //   c_rarg4   - oop ckval (super_klass)
  //
  // Output:
  //   r0 ==  0  -  success
  //   r0 == -1^K - failure, where K is partial transfer count
  //
  address generate_checkcast_copy(const char *name, address *entry,
                                  bool dest_uninitialized = false) {

    Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;

    // Input registers (after setup_arg_regs)
    const Register from        = c_rarg0;   // source array address
    const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // elements count
    const Register ckoff       = c_rarg3;   // super_check_offset
    const Register ckval       = c_rarg4;   // super_klass

    RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
    RegSet wb_post_saved_regs = RegSet::of(count);

    // Registers used as temps (r18, r19, r20 are save-on-entry)
    const Register count_save  = r21;       // orig elements count
    const Register start_to    = r20;       // destination array start address
    const Register copied_oop  = r18;       // actual oop copied
    const Register r19_klass   = r19;       // oop._klass

    //---------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the two arrays are subtypes of Object[] but the
    // destination array type is not equal to or a supertype
    // of the source type.  Each element must be separately
    // checked.
1746 1747 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1748 copied_oop, r19_klass, count_save); 1749 1750 __ align(CodeEntryAlignment); 1751 StubCodeMark mark(this, "StubRoutines", name); 1752 address start = __ pc(); 1753 1754 __ enter(); // required for proper stackwalking of RuntimeStub frame 1755 1756 #ifdef ASSERT 1757 // caller guarantees that the arrays really are different 1758 // otherwise, we would have to make conjoint checks 1759 { Label L; 1760 array_overlap_test(L, TIMES_OOP); 1761 __ stop("checkcast_copy within a single array"); 1762 __ bind(L); 1763 } 1764 #endif //ASSERT 1765 1766 // Caller of this entry point must set up the argument registers. 1767 if (entry != NULL) { 1768 *entry = __ pc(); 1769 BLOCK_COMMENT("Entry:"); 1770 } 1771 1772 // Empty array: Nothing to do. 1773 __ cbz(count, L_done); 1774 1775 __ push(RegSet::of(r18, r19, r20, r21), sp); 1776 1777 #ifdef ASSERT 1778 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1779 // The ckoff and ckval must be mutually consistent, 1780 // even though caller generates both. 1781 { Label L; 1782 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1783 __ ldrw(start_to, Address(ckval, sco_offset)); 1784 __ cmpw(ckoff, start_to); 1785 __ br(Assembler::EQ, L); 1786 __ stop("super_check_offset inconsistent"); 1787 __ bind(L); 1788 } 1789 #endif //ASSERT 1790 1791 BarrierSetCodeGen *bs = Universe::heap()->barrier_set()->code_gen(); 1792 DecoratorSet decorators = ARRAYCOPY_CHECKCAST; 1793 bool is_oop = true; 1794 if (dest_uninitialized) { 1795 decorators |= AS_DEST_NOT_INITIALIZED; 1796 } 1797 1798 bs->arraycopy_prologue(_masm, decorators, is_oop, to, count, wb_pre_saved_regs); 1799 1800 // save the original count 1801 __ mov(count_save, count); 1802 1803 // Copy from low to high addresses 1804 __ mov(start_to, to); // Save destination array start address 1805 __ b(L_load_element); 1806 1807 // ======== begin loop ======== 1808 // (Loop is rotated; its entry is L_load_element.) 1809 // Loop control: 1810 // for (; count != 0; count--) { 1811 // copied_oop = load_heap_oop(from++); 1812 // ... generate_type_check ...; 1813 // store_heap_oop(to++, copied_oop); 1814 // } 1815 __ align(OptoLoopAlignment); 1816 1817 __ BIND(L_store_element); 1818 __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop); // store the oop 1819 __ sub(count, count, 1); 1820 __ cbz(count, L_do_card_marks); 1821 1822 // ======== loop entry is here ======== 1823 __ BIND(L_load_element); 1824 __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop 1825 __ cbz(copied_oop, L_store_element); 1826 1827 __ load_klass(r19_klass, copied_oop);// query the object klass 1828 generate_type_check(r19_klass, ckoff, ckval, L_store_element); 1829 // ======== end loop ======== 1830 1831 // It was a real error; we must depend on the caller to finish the job. 1832 // Register count = remaining oops, count_orig = total oops. 1833 // Emit GC store barriers for the oops we have copied and report 1834 // their number to the caller. 
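// Failure path arithmetic, spelled out: K = count_save - count is the
// number of oops successfully copied, and the caller receives
// r0 = -1 ^ K == ~K. For example, failing after 3 of 10 oops gives
// r0 == ~3 == -4, and a caller recovers K as ~r0. If K == 0 the subs
// below sets EQ and the card-marking epilogue is skipped entirely.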
1835
1836 __ subs(count, count_save, count); // K = partially copied oop count
1837 __ eon(count, count, zr); // report (-1^K) to caller
1838 __ br(Assembler::EQ, L_done_pop);
1839
1840 __ BIND(L_do_card_marks);
1841 __ add(to, to, -heapOopSize); // make an inclusive end pointer
1842 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, to, rscratch1, wb_post_saved_regs);
1843
1844 __ bind(L_done_pop);
1845 __ pop(RegSet::of(r18, r19, r20, r21), sp);
1846 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1847
1848 __ bind(L_done);
1849 __ mov(r0, count);
1850 __ leave();
1851 __ ret(lr);
1852
1853 return start;
1854 }
1855
1856 // Perform range checks on the proposed arraycopy.
1857 // Kills temp, but nothing else.
1858 // Also, clean the sign bits of src_pos and dst_pos.
1859 void arraycopy_range_checks(Register src, // source array oop (c_rarg0)
1860 Register src_pos, // source position (c_rarg1)
1861 Register dst, // destination array oop (c_rarg2)
1862 Register dst_pos, // destination position (c_rarg3)
1863 Register length,
1864 Register temp,
1865 Label& L_failed) {
1866 BLOCK_COMMENT("arraycopy_range_checks:");
1867
1868 assert_different_registers(rscratch1, temp);
1869
1870 // if (src_pos + length > arrayOop(src)->length()) FAIL;
1871 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1872 __ addw(temp, length, src_pos);
1873 __ cmpw(temp, rscratch1);
1874 __ br(Assembler::HI, L_failed);
1875
1876 // if (dst_pos + length > arrayOop(dst)->length()) FAIL;
1877 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1878 __ addw(temp, length, dst_pos);
1879 __ cmpw(temp, rscratch1);
1880 __ br(Assembler::HI, L_failed);
1881
1882 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1883 __ movw(src_pos, src_pos);
1884 __ movw(dst_pos, dst_pos);
1885
1886 BLOCK_COMMENT("arraycopy_range_checks done");
1887 }
1888
1889 // These stubs get called from some dumb test routine.
1890 // I'll write them properly when they're called from
1891 // something that's actually doing something.
1892 static void fake_arraycopy_stub(address src, address dst, int count) {
1893 assert(count == 0, "huh?");
1894 }
1895
1896
1897 //
1898 // Generate 'unsafe' array copy stub
1899 // Though just as safe as the other stubs, it takes an unscaled
1900 // size_t argument instead of an element count.
1901 //
1902 // Input:
1903 // c_rarg0 - source array address
1904 // c_rarg1 - destination array address
1905 // c_rarg2 - byte count, treated as ssize_t, can be zero
1906 //
1907 // Examines the alignment of the operands and dispatches
1908 // to a long, int, short, or byte copy loop.
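// A sketch of that dispatch decision (illustrative): the stub ORs the
// source, destination and byte count together and tests the low bits:
//
//   t = s | d | count;
//   if ((t & 7) == 0) goto long_copy;   // count >>= 3
//   if ((t & 3) == 0) goto int_copy;    // count >>= 2
//   if ((t & 1) == 0) goto short_copy;  // count >>= 1
//   goto byte_copy;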
1909 // 1910 address generate_unsafe_copy(const char *name, 1911 address byte_copy_entry, 1912 address short_copy_entry, 1913 address int_copy_entry, 1914 address long_copy_entry) { 1915 Label L_long_aligned, L_int_aligned, L_short_aligned; 1916 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1917 1918 __ align(CodeEntryAlignment); 1919 StubCodeMark mark(this, "StubRoutines", name); 1920 address start = __ pc(); 1921 __ enter(); // required for proper stackwalking of RuntimeStub frame 1922 1923 // bump this on entry, not on exit: 1924 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 1925 1926 __ orr(rscratch1, s, d); 1927 __ orr(rscratch1, rscratch1, count); 1928 1929 __ andr(rscratch1, rscratch1, BytesPerLong-1); 1930 __ cbz(rscratch1, L_long_aligned); 1931 __ andr(rscratch1, rscratch1, BytesPerInt-1); 1932 __ cbz(rscratch1, L_int_aligned); 1933 __ tbz(rscratch1, 0, L_short_aligned); 1934 __ b(RuntimeAddress(byte_copy_entry)); 1935 1936 __ BIND(L_short_aligned); 1937 __ lsr(count, count, LogBytesPerShort); // size => short_count 1938 __ b(RuntimeAddress(short_copy_entry)); 1939 __ BIND(L_int_aligned); 1940 __ lsr(count, count, LogBytesPerInt); // size => int_count 1941 __ b(RuntimeAddress(int_copy_entry)); 1942 __ BIND(L_long_aligned); 1943 __ lsr(count, count, LogBytesPerLong); // size => long_count 1944 __ b(RuntimeAddress(long_copy_entry)); 1945 1946 return start; 1947 } 1948 1949 // 1950 // Generate generic array copy stubs 1951 // 1952 // Input: 1953 // c_rarg0 - src oop 1954 // c_rarg1 - src_pos (32-bits) 1955 // c_rarg2 - dst oop 1956 // c_rarg3 - dst_pos (32-bits) 1957 // c_rarg4 - element count (32-bits) 1958 // 1959 // Output: 1960 // r0 == 0 - success 1961 // r0 == -1^K - failure, where K is partial transfer count 1962 // 1963 address generate_generic_copy(const char *name, 1964 address byte_copy_entry, address short_copy_entry, 1965 address int_copy_entry, address oop_copy_entry, 1966 address long_copy_entry, address checkcast_copy_entry) { 1967 1968 Label L_failed, L_failed_0, L_objArray; 1969 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 1970 1971 // Input registers 1972 const Register src = c_rarg0; // source array oop 1973 const Register src_pos = c_rarg1; // source position 1974 const Register dst = c_rarg2; // destination array oop 1975 const Register dst_pos = c_rarg3; // destination position 1976 const Register length = c_rarg4; 1977 1978 StubCodeMark mark(this, "StubRoutines", name); 1979 1980 __ align(CodeEntryAlignment); 1981 address start = __ pc(); 1982 1983 __ enter(); // required for proper stackwalking of RuntimeStub frame 1984 1985 // bump this on entry, not on exit: 1986 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 1987 1988 //----------------------------------------------------------------------- 1989 // Assembler stub will be used for this call to arraycopy 1990 // if the following conditions are met: 1991 // 1992 // (1) src and dst must not be null. 1993 // (2) src_pos must not be negative. 1994 // (3) dst_pos must not be negative. 1995 // (4) length must not be negative. 1996 // (5) src klass and dst klass should be the same and not NULL. 1997 // (6) src and dst should be arrays. 1998 // (7) src_pos + length must not exceed length of src. 1999 // (8) dst_pos + length must not exceed length of dst. 2000 // 2001 2002 // if (src == NULL) return -1; 2003 __ cbz(src, L_failed); 2004 2005 // if (src_pos < 0) return -1; 2006 __ tbnz(src_pos, 31, L_failed); // i.e. 
sign bit set 2007 2008 // if (dst == NULL) return -1; 2009 __ cbz(dst, L_failed); 2010 2011 // if (dst_pos < 0) return -1; 2012 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2013 2014 // registers used as temp 2015 const Register scratch_length = r16; // elements count to copy 2016 const Register scratch_src_klass = r17; // array klass 2017 const Register lh = r18; // layout helper 2018 2019 // if (length < 0) return -1; 2020 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2021 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2022 2023 __ load_klass(scratch_src_klass, src); 2024 #ifdef ASSERT 2025 // assert(src->klass() != NULL); 2026 { 2027 BLOCK_COMMENT("assert klasses not null {"); 2028 Label L1, L2; 2029 __ cbnz(scratch_src_klass, L2); // it is broken if klass is NULL 2030 __ bind(L1); 2031 __ stop("broken null klass"); 2032 __ bind(L2); 2033 __ load_klass(rscratch1, dst); 2034 __ cbz(rscratch1, L1); // this would be broken also 2035 BLOCK_COMMENT("} assert klasses not null done"); 2036 } 2037 #endif 2038 2039 // Load layout helper (32-bits) 2040 // 2041 // |array_tag| | header_size | element_type | |log2_element_size| 2042 // 32 30 24 16 8 2 0 2043 // 2044 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2045 // 2046 2047 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2048 2049 // Handle objArrays completely differently... 2050 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2051 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2052 __ movw(rscratch1, objArray_lh); 2053 __ eorw(rscratch2, lh, rscratch1); 2054 __ cbzw(rscratch2, L_objArray); 2055 2056 // if (src->klass() != dst->klass()) return -1; 2057 __ load_klass(rscratch2, dst); 2058 __ eor(rscratch2, rscratch2, scratch_src_klass); 2059 __ cbnz(rscratch2, L_failed); 2060 2061 // if (!src->is_Array()) return -1; 2062 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2063 2064 // At this point, it is known to be a typeArray (array_tag 0x3). 2065 #ifdef ASSERT 2066 { 2067 BLOCK_COMMENT("assert primitive array {"); 2068 Label L; 2069 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2070 __ cmpw(lh, rscratch2); 2071 __ br(Assembler::GE, L); 2072 __ stop("must be a primitive array"); 2073 __ bind(L); 2074 BLOCK_COMMENT("} assert primitive array done"); 2075 } 2076 #endif 2077 2078 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2079 rscratch2, L_failed); 2080 2081 // TypeArrayKlass 2082 // 2083 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2084 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2085 // 2086 2087 const Register rscratch1_offset = rscratch1; // array offset 2088 const Register r18_elsize = lh; // element size 2089 2090 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2091 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2092 __ add(src, src, rscratch1_offset); // src array offset 2093 __ add(dst, dst, rscratch1_offset); // dst array offset 2094 BLOCK_COMMENT("choose copy loop based on element size"); 2095 2096 // next registers should be set before the jump to corresponding stub 2097 const Register from = c_rarg0; // source array address 2098 const Register to = c_rarg1; // destination array address 2099 const Register count = c_rarg2; // elements count 2100 2101 // 'from', 'to', 'count' registers should be set in such order 2102 // since they are the same as 'src', 'src_pos', 'dst'. 
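// Illustrative restatement of the bitwise binary search that follows,
// with elsize = exact_log2(element size) in {0, 1, 2, 3}:
//
//   switch (elsize) {
//   case 0: goto L_copy_bytes;   // bits 00
//   case 1: goto L_copy_shorts;  // bits 01
//   case 2: goto L_copy_ints;    // bits 10
//   case 3: goto L_copy_longs;   // bits 11
//   }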
2103 2104 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2105 2106 // The possible values of elsize are 0-3, i.e. exact_log2(element 2107 // size in bytes). We do a simple bitwise binary search. 2108 __ BIND(L_copy_bytes); 2109 __ tbnz(r18_elsize, 1, L_copy_ints); 2110 __ tbnz(r18_elsize, 0, L_copy_shorts); 2111 __ lea(from, Address(src, src_pos));// src_addr 2112 __ lea(to, Address(dst, dst_pos));// dst_addr 2113 __ movw(count, scratch_length); // length 2114 __ b(RuntimeAddress(byte_copy_entry)); 2115 2116 __ BIND(L_copy_shorts); 2117 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2118 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2119 __ movw(count, scratch_length); // length 2120 __ b(RuntimeAddress(short_copy_entry)); 2121 2122 __ BIND(L_copy_ints); 2123 __ tbnz(r18_elsize, 0, L_copy_longs); 2124 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2125 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2126 __ movw(count, scratch_length); // length 2127 __ b(RuntimeAddress(int_copy_entry)); 2128 2129 __ BIND(L_copy_longs); 2130 #ifdef ASSERT 2131 { 2132 BLOCK_COMMENT("assert long copy {"); 2133 Label L; 2134 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize 2135 __ cmpw(r18_elsize, LogBytesPerLong); 2136 __ br(Assembler::EQ, L); 2137 __ stop("must be long copy, but elsize is wrong"); 2138 __ bind(L); 2139 BLOCK_COMMENT("} assert long copy done"); 2140 } 2141 #endif 2142 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2143 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2144 __ movw(count, scratch_length); // length 2145 __ b(RuntimeAddress(long_copy_entry)); 2146 2147 // ObjArrayKlass 2148 __ BIND(L_objArray); 2149 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2150 2151 Label L_plain_copy, L_checkcast_copy; 2152 // test array classes for subtyping 2153 __ load_klass(r18, dst); 2154 __ cmp(scratch_src_klass, r18); // usual case is exact equality 2155 __ br(Assembler::NE, L_checkcast_copy); 2156 2157 // Identically typed arrays can be copied without element-wise checks. 2158 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2159 rscratch2, L_failed); 2160 2161 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2162 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2163 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2164 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2165 __ movw(count, scratch_length); // length 2166 __ BIND(L_plain_copy); 2167 __ b(RuntimeAddress(oop_copy_entry)); 2168 2169 __ BIND(L_checkcast_copy); 2170 // live at this point: scratch_src_klass, scratch_length, r18 (dst_klass) 2171 { 2172 // Before looking at dst.length, make sure dst is also an objArray. 2173 __ ldrw(rscratch1, Address(r18, lh_offset)); 2174 __ movw(rscratch2, objArray_lh); 2175 __ eorw(rscratch1, rscratch1, rscratch2); 2176 __ cbnzw(rscratch1, L_failed); 2177 2178 // It is safe to examine both src.length and dst.length. 2179 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2180 r18, L_failed); 2181 2182 const Register rscratch2_dst_klass = rscratch2; 2183 __ load_klass(rscratch2_dst_klass, dst); // reload 2184 2185 // Marshal the base address arguments now, freeing registers. 
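// Equivalent address arithmetic for the lea/add pairs below, as a sketch:
//   from = src + base_offset_in_bytes(T_OBJECT) + src_pos * heapOopSize;
//   to   = dst + base_offset_in_bytes(T_OBJECT) + dst_pos * heapOopSize;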
2186 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2187 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2188 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2189 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2190 __ movw(count, length); // length (reloaded)
2191 Register sco_temp = c_rarg3; // this register is free now
2192 assert_different_registers(from, to, count, sco_temp,
2193 rscratch2_dst_klass, scratch_src_klass);
2194 // assert_clean_int(count, sco_temp);
2195
2196 // Generate the type check.
2197 const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2198 __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2199 // assert_clean_int(sco_temp, r18);
2200 generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy);
2201
2202 // Fetch destination element klass from the ObjArrayKlass header.
2203 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2204 __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset));
2205 __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2206
2207 // the checkcast_copy loop needs two extra arguments:
2208 assert(c_rarg3 == sco_temp, "#3 already in place");
2209 // Set up arguments for checkcast_copy_entry.
2210 __ mov(c_rarg4, rscratch2_dst_klass); // dst.klass.element_klass
2211 __ b(RuntimeAddress(checkcast_copy_entry));
2212 }
2213
2214 __ BIND(L_failed);
2215 __ mov(r0, -1);
2216 __ leave(); // required for proper stackwalking of RuntimeStub frame
2217 __ ret(lr);
2218
2219 return start;
2220 }
2221
2222 //
2223 // Generate stub for array fill. If "aligned" is true, the
2224 // "to" address is assumed to be heapword aligned.
2225 //
2226 // Arguments for generated stub:
2227 // to: c_rarg0
2228 // value: c_rarg1
2229 // count: c_rarg2 treated as signed
2230 //
2231 address generate_fill(BasicType t, bool aligned, const char *name) {
2232 __ align(CodeEntryAlignment);
2233 StubCodeMark mark(this, "StubRoutines", name);
2234 address start = __ pc();
2235
2236 BLOCK_COMMENT("Entry:");
2237
2238 const Register to = c_rarg0; // destination array address
2239 const Register value = c_rarg1; // value
2240 const Register count = c_rarg2; // elements count
2241
2242 const Register bz_base = r10; // base for block_zero routine
2243 const Register cnt_words = r11; // temp register
2244
2245 __ enter();
2246
2247 Label L_fill_elements, L_exit1;
2248
2249 int shift = -1;
2250 switch (t) {
2251 case T_BYTE:
2252 shift = 0;
2253 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2254 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit
2255 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2256 __ br(Assembler::LO, L_fill_elements);
2257 break;
2258 case T_SHORT:
2259 shift = 1;
2260 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2261 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2262 __ br(Assembler::LO, L_fill_elements);
2263 break;
2264 case T_INT:
2265 shift = 2;
2266 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2267 __ br(Assembler::LO, L_fill_elements);
2268 break;
2269 default: ShouldNotReachHere();
2270 }
2271
2272 // Align the destination address at an 8-byte boundary.
2273 Label L_skip_align1, L_skip_align2, L_skip_align4;
2274 if (!aligned) {
2275 switch (t) {
2276 case T_BYTE:
2277 // One byte misalignment happens only for byte arrays.
2278 __ tbz(to, 0, L_skip_align1);
2279 __ strb(value, Address(__ post(to, 1)));
2280 __ subw(count, count, 1);
2281 __ bind(L_skip_align1);
2282 // Fallthrough
2283 case T_SHORT:
2284 // Two bytes misalignment happens only for byte and short (char) arrays.
2285 __ tbz(to, 1, L_skip_align2);
2286 __ strh(value, Address(__ post(to, 2)));
2287 __ subw(count, count, 2 >> shift);
2288 __ bind(L_skip_align2);
2289 // Fallthrough
2290 case T_INT:
2291 // Align to 8 bytes, we know we are 4 byte aligned to start.
2292 __ tbz(to, 2, L_skip_align4);
2293 __ strw(value, Address(__ post(to, 4)));
2294 __ subw(count, count, 4 >> shift);
2295 __ bind(L_skip_align4);
2296 break;
2297 default: ShouldNotReachHere();
2298 }
2299 }
2300
2301 //
2302 // Fill large chunks
2303 //
2304 __ lsrw(cnt_words, count, 3 - shift); // number of words
2305 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit
2306 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
2307 if (UseBlockZeroing) {
2308 Label non_block_zeroing, rest;
2309 // If the fill value is zero we can use the fast zero_words().
2310 __ cbnz(value, non_block_zeroing);
2311 __ mov(bz_base, to);
2312 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
2313 __ zero_words(bz_base, cnt_words);
2314 __ b(rest);
2315 __ bind(non_block_zeroing);
2316 __ fill_words(to, cnt_words, value);
2317 __ bind(rest);
2318 } else {
2319 __ fill_words(to, cnt_words, value);
2320 }
2321
2322 // Remaining count is less than 8 bytes. Fill it by a single store.
2323 // Note that the total length is no less than 8 bytes.
2324 if (t == T_BYTE || t == T_SHORT) {
2325 Label L_exit1;
2326 __ cbzw(count, L_exit1);
2327 __ add(to, to, count, Assembler::LSL, shift); // points to the end
2328 __ str(value, Address(to, -8)); // overwrite some elements
2329 __ bind(L_exit1);
2330 __ leave();
2331 __ ret(lr);
2332 }
2333
2334 // Handle fills of less than 8 bytes.
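// Tail sketch for the byte case below (value has already been
// replicated to 16 and 32 bits): bits 0..2 of count pick the stores,
// so count == 7 issues all three stores (1 + 2 + 4 bytes) and
// count == 5 issues the 1- and 4-byte stores:
//
//   if (count & 1) { *p++ = v; }              // strb
//   if (count & 2) { *(u2*)p = v; p += 2; }   // strh
//   if (count & 4) { *(u4*)p = v; }           // strw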
2335 Label L_fill_2, L_fill_4, L_exit2; 2336 __ bind(L_fill_elements); 2337 switch (t) { 2338 case T_BYTE: 2339 __ tbz(count, 0, L_fill_2); 2340 __ strb(value, Address(__ post(to, 1))); 2341 __ bind(L_fill_2); 2342 __ tbz(count, 1, L_fill_4); 2343 __ strh(value, Address(__ post(to, 2))); 2344 __ bind(L_fill_4); 2345 __ tbz(count, 2, L_exit2); 2346 __ strw(value, Address(to)); 2347 break; 2348 case T_SHORT: 2349 __ tbz(count, 0, L_fill_4); 2350 __ strh(value, Address(__ post(to, 2))); 2351 __ bind(L_fill_4); 2352 __ tbz(count, 1, L_exit2); 2353 __ strw(value, Address(to)); 2354 break; 2355 case T_INT: 2356 __ cbzw(count, L_exit2); 2357 __ strw(value, Address(to)); 2358 break; 2359 default: ShouldNotReachHere(); 2360 } 2361 __ bind(L_exit2); 2362 __ leave(); 2363 __ ret(lr); 2364 return start; 2365 } 2366 2367 void generate_arraycopy_stubs() { 2368 address entry; 2369 address entry_jbyte_arraycopy; 2370 address entry_jshort_arraycopy; 2371 address entry_jint_arraycopy; 2372 address entry_oop_arraycopy; 2373 address entry_jlong_arraycopy; 2374 address entry_checkcast_arraycopy; 2375 2376 generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards); 2377 generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards); 2378 2379 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2380 2381 //*** jbyte 2382 // Always need aligned and unaligned versions 2383 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2384 "jbyte_disjoint_arraycopy"); 2385 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2386 &entry_jbyte_arraycopy, 2387 "jbyte_arraycopy"); 2388 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2389 "arrayof_jbyte_disjoint_arraycopy"); 2390 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL, 2391 "arrayof_jbyte_arraycopy"); 2392 2393 //*** jshort 2394 // Always need aligned and unaligned versions 2395 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2396 "jshort_disjoint_arraycopy"); 2397 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2398 &entry_jshort_arraycopy, 2399 "jshort_arraycopy"); 2400 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, 2401 "arrayof_jshort_disjoint_arraycopy"); 2402 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL, 2403 "arrayof_jshort_arraycopy"); 2404 2405 //*** jint 2406 // Aligned versions 2407 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2408 "arrayof_jint_disjoint_arraycopy"); 2409 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2410 "arrayof_jint_arraycopy"); 2411 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 
2412 // entry_jint_arraycopy always points to the unaligned version 2413 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2414 "jint_disjoint_arraycopy"); 2415 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2416 &entry_jint_arraycopy, 2417 "jint_arraycopy"); 2418 2419 //*** jlong 2420 // It is always aligned 2421 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2422 "arrayof_jlong_disjoint_arraycopy"); 2423 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2424 "arrayof_jlong_arraycopy"); 2425 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2426 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2427 2428 //*** oops 2429 { 2430 // With compressed oops we need unaligned versions; notice that 2431 // we overwrite entry_oop_arraycopy. 2432 bool aligned = !UseCompressedOops; 2433 2434 StubRoutines::_arrayof_oop_disjoint_arraycopy 2435 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2436 /*dest_uninitialized*/false); 2437 StubRoutines::_arrayof_oop_arraycopy 2438 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2439 /*dest_uninitialized*/false); 2440 // Aligned versions without pre-barriers 2441 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2442 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2443 /*dest_uninitialized*/true); 2444 StubRoutines::_arrayof_oop_arraycopy_uninit 2445 = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit", 2446 /*dest_uninitialized*/true); 2447 } 2448 2449 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2450 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2451 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2452 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2453 2454 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2455 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, 2456 /*dest_uninitialized*/true); 2457 2458 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2459 entry_jbyte_arraycopy, 2460 entry_jshort_arraycopy, 2461 entry_jint_arraycopy, 2462 entry_jlong_arraycopy); 2463 2464 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2465 entry_jbyte_arraycopy, 2466 entry_jshort_arraycopy, 2467 entry_jint_arraycopy, 2468 entry_oop_arraycopy, 2469 entry_jlong_arraycopy, 2470 entry_checkcast_arraycopy); 2471 2472 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 2473 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2474 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2475 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2476 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 2477 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2478 } 2479 2480 void generate_math_stubs() { Unimplemented(); } 2481 2482 // Arguments: 2483 // 2484 // Inputs: 2485 // c_rarg0 - source byte array address 2486 // c_rarg1 - destination 
byte array address 2487 // c_rarg2 - K (key) in little endian int array 2488 // 2489 address generate_aescrypt_encryptBlock() { 2490 __ align(CodeEntryAlignment); 2491 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 2492 2493 Label L_doLast; 2494 2495 const Register from = c_rarg0; // source array address 2496 const Register to = c_rarg1; // destination array address 2497 const Register key = c_rarg2; // key array address 2498 const Register keylen = rscratch1; 2499 2500 address start = __ pc(); 2501 __ enter(); 2502 2503 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2504 2505 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2506 2507 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2508 __ rev32(v1, __ T16B, v1); 2509 __ rev32(v2, __ T16B, v2); 2510 __ rev32(v3, __ T16B, v3); 2511 __ rev32(v4, __ T16B, v4); 2512 __ aese(v0, v1); 2513 __ aesmc(v0, v0); 2514 __ aese(v0, v2); 2515 __ aesmc(v0, v0); 2516 __ aese(v0, v3); 2517 __ aesmc(v0, v0); 2518 __ aese(v0, v4); 2519 __ aesmc(v0, v0); 2520 2521 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2522 __ rev32(v1, __ T16B, v1); 2523 __ rev32(v2, __ T16B, v2); 2524 __ rev32(v3, __ T16B, v3); 2525 __ rev32(v4, __ T16B, v4); 2526 __ aese(v0, v1); 2527 __ aesmc(v0, v0); 2528 __ aese(v0, v2); 2529 __ aesmc(v0, v0); 2530 __ aese(v0, v3); 2531 __ aesmc(v0, v0); 2532 __ aese(v0, v4); 2533 __ aesmc(v0, v0); 2534 2535 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2536 __ rev32(v1, __ T16B, v1); 2537 __ rev32(v2, __ T16B, v2); 2538 2539 __ cmpw(keylen, 44); 2540 __ br(Assembler::EQ, L_doLast); 2541 2542 __ aese(v0, v1); 2543 __ aesmc(v0, v0); 2544 __ aese(v0, v2); 2545 __ aesmc(v0, v0); 2546 2547 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2548 __ rev32(v1, __ T16B, v1); 2549 __ rev32(v2, __ T16B, v2); 2550 2551 __ cmpw(keylen, 52); 2552 __ br(Assembler::EQ, L_doLast); 2553 2554 __ aese(v0, v1); 2555 __ aesmc(v0, v0); 2556 __ aese(v0, v2); 2557 __ aesmc(v0, v0); 2558 2559 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2560 __ rev32(v1, __ T16B, v1); 2561 __ rev32(v2, __ T16B, v2); 2562 2563 __ BIND(L_doLast); 2564 2565 __ aese(v0, v1); 2566 __ aesmc(v0, v0); 2567 __ aese(v0, v2); 2568 2569 __ ld1(v1, __ T16B, key); 2570 __ rev32(v1, __ T16B, v1); 2571 __ eor(v0, __ T16B, v0, v1); 2572 2573 __ st1(v0, __ T16B, to); 2574 2575 __ mov(r0, 0); 2576 2577 __ leave(); 2578 __ ret(lr); 2579 2580 return start; 2581 } 2582 2583 // Arguments: 2584 // 2585 // Inputs: 2586 // c_rarg0 - source byte array address 2587 // c_rarg1 - destination byte array address 2588 // c_rarg2 - K (key) in little endian int array 2589 // 2590 address generate_aescrypt_decryptBlock() { 2591 assert(UseAES, "need AES instructions and misaligned SSE support"); 2592 __ align(CodeEntryAlignment); 2593 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 2594 Label L_doLast; 2595 2596 const Register from = c_rarg0; // source array address 2597 const Register to = c_rarg1; // destination array address 2598 const Register key = c_rarg2; // key array address 2599 const Register keylen = rscratch1; 2600 2601 address start = __ pc(); 2602 __ enter(); // required for proper stackwalking of RuntimeStub frame 2603 2604 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2605 2606 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2607 2608 __ ld1(v5, __ T16B, __ post(key, 16)); 2609 __ rev32(v5, __ T16B, v5); 2610 2611 __ ld1(v1, v2, v3, v4, 
__ T16B, __ post(key, 64)); 2612 __ rev32(v1, __ T16B, v1); 2613 __ rev32(v2, __ T16B, v2); 2614 __ rev32(v3, __ T16B, v3); 2615 __ rev32(v4, __ T16B, v4); 2616 __ aesd(v0, v1); 2617 __ aesimc(v0, v0); 2618 __ aesd(v0, v2); 2619 __ aesimc(v0, v0); 2620 __ aesd(v0, v3); 2621 __ aesimc(v0, v0); 2622 __ aesd(v0, v4); 2623 __ aesimc(v0, v0); 2624 2625 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2626 __ rev32(v1, __ T16B, v1); 2627 __ rev32(v2, __ T16B, v2); 2628 __ rev32(v3, __ T16B, v3); 2629 __ rev32(v4, __ T16B, v4); 2630 __ aesd(v0, v1); 2631 __ aesimc(v0, v0); 2632 __ aesd(v0, v2); 2633 __ aesimc(v0, v0); 2634 __ aesd(v0, v3); 2635 __ aesimc(v0, v0); 2636 __ aesd(v0, v4); 2637 __ aesimc(v0, v0); 2638 2639 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2640 __ rev32(v1, __ T16B, v1); 2641 __ rev32(v2, __ T16B, v2); 2642 2643 __ cmpw(keylen, 44); 2644 __ br(Assembler::EQ, L_doLast); 2645 2646 __ aesd(v0, v1); 2647 __ aesimc(v0, v0); 2648 __ aesd(v0, v2); 2649 __ aesimc(v0, v0); 2650 2651 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2652 __ rev32(v1, __ T16B, v1); 2653 __ rev32(v2, __ T16B, v2); 2654 2655 __ cmpw(keylen, 52); 2656 __ br(Assembler::EQ, L_doLast); 2657 2658 __ aesd(v0, v1); 2659 __ aesimc(v0, v0); 2660 __ aesd(v0, v2); 2661 __ aesimc(v0, v0); 2662 2663 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2664 __ rev32(v1, __ T16B, v1); 2665 __ rev32(v2, __ T16B, v2); 2666 2667 __ BIND(L_doLast); 2668 2669 __ aesd(v0, v1); 2670 __ aesimc(v0, v0); 2671 __ aesd(v0, v2); 2672 2673 __ eor(v0, __ T16B, v0, v5); 2674 2675 __ st1(v0, __ T16B, to); 2676 2677 __ mov(r0, 0); 2678 2679 __ leave(); 2680 __ ret(lr); 2681 2682 return start; 2683 } 2684 2685 // Arguments: 2686 // 2687 // Inputs: 2688 // c_rarg0 - source byte array address 2689 // c_rarg1 - destination byte array address 2690 // c_rarg2 - K (key) in little endian int array 2691 // c_rarg3 - r vector byte array address 2692 // c_rarg4 - input length 2693 // 2694 // Output: 2695 // x0 - input length 2696 // 2697 address generate_cipherBlockChaining_encryptAESCrypt() { 2698 assert(UseAES, "need AES instructions and misaligned SSE support"); 2699 __ align(CodeEntryAlignment); 2700 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 2701 2702 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2703 2704 const Register from = c_rarg0; // source array address 2705 const Register to = c_rarg1; // destination array address 2706 const Register key = c_rarg2; // key array address 2707 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2708 // and left with the results of the last encryption block 2709 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2710 const Register keylen = rscratch1; 2711 2712 address start = __ pc(); 2713 2714 __ enter(); 2715 2716 __ movw(rscratch2, len_reg); 2717 2718 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2719 2720 __ ld1(v0, __ T16B, rvec); 2721 2722 __ cmpw(keylen, 52); 2723 __ br(Assembler::CC, L_loadkeys_44); 2724 __ br(Assembler::EQ, L_loadkeys_52); 2725 2726 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2727 __ rev32(v17, __ T16B, v17); 2728 __ rev32(v18, __ T16B, v18); 2729 __ BIND(L_loadkeys_52); 2730 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2731 __ rev32(v19, __ T16B, v19); 2732 __ rev32(v20, __ T16B, v20); 2733 __ BIND(L_loadkeys_44); 2734 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2735 __ rev32(v21, __ 
T16B, v21); 2736 __ rev32(v22, __ T16B, v22); 2737 __ rev32(v23, __ T16B, v23); 2738 __ rev32(v24, __ T16B, v24); 2739 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2740 __ rev32(v25, __ T16B, v25); 2741 __ rev32(v26, __ T16B, v26); 2742 __ rev32(v27, __ T16B, v27); 2743 __ rev32(v28, __ T16B, v28); 2744 __ ld1(v29, v30, v31, __ T16B, key); 2745 __ rev32(v29, __ T16B, v29); 2746 __ rev32(v30, __ T16B, v30); 2747 __ rev32(v31, __ T16B, v31); 2748 2749 __ BIND(L_aes_loop); 2750 __ ld1(v1, __ T16B, __ post(from, 16)); 2751 __ eor(v0, __ T16B, v0, v1); 2752 2753 __ br(Assembler::CC, L_rounds_44); 2754 __ br(Assembler::EQ, L_rounds_52); 2755 2756 __ aese(v0, v17); __ aesmc(v0, v0); 2757 __ aese(v0, v18); __ aesmc(v0, v0); 2758 __ BIND(L_rounds_52); 2759 __ aese(v0, v19); __ aesmc(v0, v0); 2760 __ aese(v0, v20); __ aesmc(v0, v0); 2761 __ BIND(L_rounds_44); 2762 __ aese(v0, v21); __ aesmc(v0, v0); 2763 __ aese(v0, v22); __ aesmc(v0, v0); 2764 __ aese(v0, v23); __ aesmc(v0, v0); 2765 __ aese(v0, v24); __ aesmc(v0, v0); 2766 __ aese(v0, v25); __ aesmc(v0, v0); 2767 __ aese(v0, v26); __ aesmc(v0, v0); 2768 __ aese(v0, v27); __ aesmc(v0, v0); 2769 __ aese(v0, v28); __ aesmc(v0, v0); 2770 __ aese(v0, v29); __ aesmc(v0, v0); 2771 __ aese(v0, v30); 2772 __ eor(v0, __ T16B, v0, v31); 2773 2774 __ st1(v0, __ T16B, __ post(to, 16)); 2775 2776 __ subw(len_reg, len_reg, 16); 2777 __ cbnzw(len_reg, L_aes_loop); 2778 2779 __ st1(v0, __ T16B, rvec); 2780 2781 __ mov(r0, rscratch2); 2782 2783 __ leave(); 2784 __ ret(lr); 2785 2786 return start; 2787 } 2788 2789 // Arguments: 2790 // 2791 // Inputs: 2792 // c_rarg0 - source byte array address 2793 // c_rarg1 - destination byte array address 2794 // c_rarg2 - K (key) in little endian int array 2795 // c_rarg3 - r vector byte array address 2796 // c_rarg4 - input length 2797 // 2798 // Output: 2799 // r0 - input length 2800 // 2801 address generate_cipherBlockChaining_decryptAESCrypt() { 2802 assert(UseAES, "need AES instructions and misaligned SSE support"); 2803 __ align(CodeEntryAlignment); 2804 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 2805 2806 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2807 2808 const Register from = c_rarg0; // source array address 2809 const Register to = c_rarg1; // destination array address 2810 const Register key = c_rarg2; // key array address 2811 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2812 // and left with the results of the last encryption block 2813 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2814 const Register keylen = rscratch1; 2815 2816 address start = __ pc(); 2817 2818 __ enter(); 2819 2820 __ movw(rscratch2, len_reg); 2821 2822 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2823 2824 __ ld1(v2, __ T16B, rvec); 2825 2826 __ ld1(v31, __ T16B, __ post(key, 16)); 2827 __ rev32(v31, __ T16B, v31); 2828 2829 __ cmpw(keylen, 52); 2830 __ br(Assembler::CC, L_loadkeys_44); 2831 __ br(Assembler::EQ, L_loadkeys_52); 2832 2833 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2834 __ rev32(v17, __ T16B, v17); 2835 __ rev32(v18, __ T16B, v18); 2836 __ BIND(L_loadkeys_52); 2837 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2838 __ rev32(v19, __ T16B, v19); 2839 __ rev32(v20, __ T16B, v20); 2840 __ BIND(L_loadkeys_44); 2841 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2842 __ rev32(v21, __ T16B, v21); 2843 
__ rev32(v22, __ T16B, v22); 2844 __ rev32(v23, __ T16B, v23); 2845 __ rev32(v24, __ T16B, v24); 2846 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2847 __ rev32(v25, __ T16B, v25); 2848 __ rev32(v26, __ T16B, v26); 2849 __ rev32(v27, __ T16B, v27); 2850 __ rev32(v28, __ T16B, v28); 2851 __ ld1(v29, v30, __ T16B, key); 2852 __ rev32(v29, __ T16B, v29); 2853 __ rev32(v30, __ T16B, v30); 2854 2855 __ BIND(L_aes_loop); 2856 __ ld1(v0, __ T16B, __ post(from, 16)); 2857 __ orr(v1, __ T16B, v0, v0); 2858 2859 __ br(Assembler::CC, L_rounds_44); 2860 __ br(Assembler::EQ, L_rounds_52); 2861 2862 __ aesd(v0, v17); __ aesimc(v0, v0); 2863 __ aesd(v0, v18); __ aesimc(v0, v0); 2864 __ BIND(L_rounds_52); 2865 __ aesd(v0, v19); __ aesimc(v0, v0); 2866 __ aesd(v0, v20); __ aesimc(v0, v0); 2867 __ BIND(L_rounds_44); 2868 __ aesd(v0, v21); __ aesimc(v0, v0); 2869 __ aesd(v0, v22); __ aesimc(v0, v0); 2870 __ aesd(v0, v23); __ aesimc(v0, v0); 2871 __ aesd(v0, v24); __ aesimc(v0, v0); 2872 __ aesd(v0, v25); __ aesimc(v0, v0); 2873 __ aesd(v0, v26); __ aesimc(v0, v0); 2874 __ aesd(v0, v27); __ aesimc(v0, v0); 2875 __ aesd(v0, v28); __ aesimc(v0, v0); 2876 __ aesd(v0, v29); __ aesimc(v0, v0); 2877 __ aesd(v0, v30); 2878 __ eor(v0, __ T16B, v0, v31); 2879 __ eor(v0, __ T16B, v0, v2); 2880 2881 __ st1(v0, __ T16B, __ post(to, 16)); 2882 __ orr(v2, __ T16B, v1, v1); 2883 2884 __ subw(len_reg, len_reg, 16); 2885 __ cbnzw(len_reg, L_aes_loop); 2886 2887 __ st1(v2, __ T16B, rvec); 2888 2889 __ mov(r0, rscratch2); 2890 2891 __ leave(); 2892 __ ret(lr); 2893 2894 return start; 2895 } 2896 2897 // Arguments: 2898 // 2899 // Inputs: 2900 // c_rarg0 - byte[] source+offset 2901 // c_rarg1 - int[] SHA.state 2902 // c_rarg2 - int offset 2903 // c_rarg3 - int limit 2904 // 2905 address generate_sha1_implCompress(bool multi_block, const char *name) { 2906 __ align(CodeEntryAlignment); 2907 StubCodeMark mark(this, "StubRoutines", name); 2908 address start = __ pc(); 2909 2910 Register buf = c_rarg0; 2911 Register state = c_rarg1; 2912 Register ofs = c_rarg2; 2913 Register limit = c_rarg3; 2914 2915 Label keys; 2916 Label sha1_loop; 2917 2918 // load the keys into v0..v3 2919 __ adr(rscratch1, keys); 2920 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 2921 // load 5 words state into v6, v7 2922 __ ldrq(v6, Address(state, 0)); 2923 __ ldrs(v7, Address(state, 16)); 2924 2925 2926 __ BIND(sha1_loop); 2927 // load 64 bytes of data into v16..v19 2928 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 2929 __ rev32(v16, __ T16B, v16); 2930 __ rev32(v17, __ T16B, v17); 2931 __ rev32(v18, __ T16B, v18); 2932 __ rev32(v19, __ T16B, v19); 2933 2934 // do the sha1 2935 __ addv(v4, __ T4S, v16, v0); 2936 __ orr(v20, __ T16B, v6, v6); 2937 2938 FloatRegister d0 = v16; 2939 FloatRegister d1 = v17; 2940 FloatRegister d2 = v18; 2941 FloatRegister d3 = v19; 2942 2943 for (int round = 0; round < 20; round++) { 2944 FloatRegister tmp1 = (round & 1) ? v4 : v5; 2945 FloatRegister tmp2 = (round & 1) ? v21 : v22; 2946 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 2947 FloatRegister tmp4 = (round & 1) ? v5 : v4; 2948 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? 
v2 : v3)); 2949 2950 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 2951 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 2952 __ sha1h(tmp2, __ T4S, v20); 2953 if (round < 5) 2954 __ sha1c(v20, __ T4S, tmp3, tmp4); 2955 else if (round < 10 || round >= 15) 2956 __ sha1p(v20, __ T4S, tmp3, tmp4); 2957 else 2958 __ sha1m(v20, __ T4S, tmp3, tmp4); 2959 if (round < 16) __ sha1su1(d0, __ T4S, d3); 2960 2961 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 2962 } 2963 2964 __ addv(v7, __ T2S, v7, v21); 2965 __ addv(v6, __ T4S, v6, v20); 2966 2967 if (multi_block) { 2968 __ add(ofs, ofs, 64); 2969 __ cmp(ofs, limit); 2970 __ br(Assembler::LE, sha1_loop); 2971 __ mov(c_rarg0, ofs); // return ofs 2972 } 2973 2974 __ strq(v6, Address(state, 0)); 2975 __ strs(v7, Address(state, 16)); 2976 2977 __ ret(lr); 2978 2979 __ bind(keys); 2980 __ emit_int32(0x5a827999); 2981 __ emit_int32(0x6ed9eba1); 2982 __ emit_int32(0x8f1bbcdc); 2983 __ emit_int32(0xca62c1d6); 2984 2985 return start; 2986 } 2987 2988 2989 // Arguments: 2990 // 2991 // Inputs: 2992 // c_rarg0 - byte[] source+offset 2993 // c_rarg1 - int[] SHA.state 2994 // c_rarg2 - int offset 2995 // c_rarg3 - int limit 2996 // 2997 address generate_sha256_implCompress(bool multi_block, const char *name) { 2998 static const uint32_t round_consts[64] = { 2999 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3000 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3001 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3002 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3003 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3004 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3005 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3006 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3007 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3008 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3009 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3010 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3011 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3012 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3013 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3014 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3015 }; 3016 __ align(CodeEntryAlignment); 3017 StubCodeMark mark(this, "StubRoutines", name); 3018 address start = __ pc(); 3019 3020 Register buf = c_rarg0; 3021 Register state = c_rarg1; 3022 Register ofs = c_rarg2; 3023 Register limit = c_rarg3; 3024 3025 Label sha1_loop; 3026 3027 __ stpd(v8, v9, __ pre(sp, -32)); 3028 __ stpd(v10, v11, Address(sp, 16)); 3029 3030 // dga == v0 3031 // dgb == v1 3032 // dg0 == v2 3033 // dg1 == v3 3034 // dg2 == v4 3035 // t0 == v6 3036 // t1 == v7 3037 3038 // load 16 keys to v16..v31 3039 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3040 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3041 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3042 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3043 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3044 3045 // load 8 words (256 bits) state 3046 __ ldpq(v0, v1, state); 3047 3048 __ BIND(sha1_loop); 3049 // load 64 bytes of data into v8..v11 3050 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? 
__ post(buf, 64) : buf);
3051 __ rev32(v8, __ T16B, v8);
3052 __ rev32(v9, __ T16B, v9);
3053 __ rev32(v10, __ T16B, v10);
3054 __ rev32(v11, __ T16B, v11);
3055
3056 __ addv(v6, __ T4S, v8, v16);
3057 __ orr(v2, __ T16B, v0, v0);
3058 __ orr(v3, __ T16B, v1, v1);
3059
3060 FloatRegister d0 = v8;
3061 FloatRegister d1 = v9;
3062 FloatRegister d2 = v10;
3063 FloatRegister d3 = v11;
3064
3065
3066 for (int round = 0; round < 16; round++) {
3067 FloatRegister tmp1 = (round & 1) ? v6 : v7;
3068 FloatRegister tmp2 = (round & 1) ? v7 : v6;
3069 FloatRegister tmp3 = (round & 1) ? v2 : v4;
3070 FloatRegister tmp4 = (round & 1) ? v4 : v2;
3071
3072 if (round < 12) __ sha256su0(d0, __ T4S, d1);
3073 __ orr(v4, __ T16B, v2, v2);
3074 if (round < 15)
3075 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
3076 __ sha256h(v2, __ T4S, v3, tmp2);
3077 __ sha256h2(v3, __ T4S, v4, tmp2);
3078 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);
3079
3080 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
3081 }
3082
3083 __ addv(v0, __ T4S, v0, v2);
3084 __ addv(v1, __ T4S, v1, v3);
3085
3086 if (multi_block) {
3087 __ add(ofs, ofs, 64);
3088 __ cmp(ofs, limit);
3089 __ br(Assembler::LE, sha1_loop);
3090 __ mov(c_rarg0, ofs); // return ofs
3091 }
3092
3093 __ ldpd(v10, v11, Address(sp, 16));
3094 __ ldpd(v8, v9, __ post(sp, 32));
3095
3096 __ stpq(v0, v1, state);
3097
3098 __ ret(lr);
3099
3100 return start;
3101 }
3102
3103 #ifndef BUILTIN_SIM
3104 // Safefetch stubs.
3105 void generate_safefetch(const char* name, int size, address* entry,
3106 address* fault_pc, address* continuation_pc) {
3107 // safefetch signatures:
3108 // int SafeFetch32(int* adr, int errValue);
3109 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
3110 //
3111 // arguments:
3112 // c_rarg0 = adr
3113 // c_rarg1 = errValue
3114 //
3115 // result:
3116 // r0 = *adr or errValue
3117
3118 StubCodeMark mark(this, "StubRoutines", name);
3119
3120 // Entry point, pc or function descriptor.
3121 *entry = __ pc();
3122
3123 // Load *adr into c_rarg1, may fault.
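// Note on recovery (an assumption about the surrounding VM, not shown
// here): if the load at *fault_pc traps, the signal handler recognizes
// the faulting PC and resumes execution at *continuation_pc, where the
// untouched errValue in c_rarg1 becomes the return value.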
3124 *fault_pc = __ pc();
3125 switch (size) {
3126 case 4:
3127 // int32_t
3128 __ ldrw(c_rarg1, Address(c_rarg0, 0));
3129 break;
3130 case 8:
3131 // int64_t
3132 __ ldr(c_rarg1, Address(c_rarg0, 0));
3133 break;
3134 default:
3135 ShouldNotReachHere();
3136 }
3137
3138 // return errValue or *adr
3139 *continuation_pc = __ pc();
3140 __ mov(r0, c_rarg1);
3141 __ ret(lr);
3142 }
3143 #endif
3144
3145 /**
3146 * Arguments:
3147 *
3148 * Inputs:
3149 * c_rarg0 - int crc
3150 * c_rarg1 - byte* buf
3151 * c_rarg2 - int length
3152 *
3153 * Output:
3154 * r0 - int crc result
3155 */
3156 address generate_updateBytesCRC32() {
3157 assert(UseCRC32Intrinsics, "what are we doing here?");
3158
3159 __ align(CodeEntryAlignment);
3160 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3161
3162 address start = __ pc();
3163
3164 const Register crc = c_rarg0; // crc
3165 const Register buf = c_rarg1; // source java byte array address
3166 const Register len = c_rarg2; // length
3167 const Register table0 = c_rarg3; // crc_table address
3168 const Register table1 = c_rarg4;
3169 const Register table2 = c_rarg5;
3170 const Register table3 = c_rarg6;
3171 const Register tmp3 = c_rarg7;
3172
3173 BLOCK_COMMENT("Entry:");
3174 __ enter(); // required for proper stackwalking of RuntimeStub frame
3175
3176 __ kernel_crc32(crc, buf, len,
3177 table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3178
3179 __ leave(); // required for proper stackwalking of RuntimeStub frame
3180 __ ret(lr);
3181
3182 return start;
3183 }
3184
3185 /**
3186 * Arguments:
3187 *
3188 * Inputs:
3189 * c_rarg0 - int crc
3190 * c_rarg1 - byte* buf
3191 * c_rarg2 - int length
3192 * c_rarg3 - int* table
3193 *
3194 * Output:
3195 * r0 - int crc result
3196 */
3197 address generate_updateBytesCRC32C() {
3198 assert(UseCRC32CIntrinsics, "what are we doing here?");
3199
3200 __ align(CodeEntryAlignment);
3201 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3202
3203 address start = __ pc();
3204
3205 const Register crc = c_rarg0; // crc
3206 const Register buf = c_rarg1; // source java byte array address
3207 const Register len = c_rarg2; // length
3208 const Register table0 = c_rarg3; // crc_table address
3209 const Register table1 = c_rarg4;
3210 const Register table2 = c_rarg5;
3211 const Register table3 = c_rarg6;
3212 const Register tmp3 = c_rarg7;
3213
3214 BLOCK_COMMENT("Entry:");
3215 __ enter(); // required for proper stackwalking of RuntimeStub frame
3216
3217 __ kernel_crc32c(crc, buf, len,
3218 table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3219
3220 __ leave(); // required for proper stackwalking of RuntimeStub frame
3221 __ ret(lr);
3222
3223 return start;
3224 }
3225
3226 /**
3227 * Arguments:
3228 *
3229 * Inputs:
3230 * c_rarg0 - int adler
3231 * c_rarg1 - byte* buff
3232 * c_rarg2 - int len
3233 *
3234 * Output:
3235 * c_rarg0 - int adler result
3236 */
3237 address generate_updateBytesAdler32() {
3238 __ align(CodeEntryAlignment);
3239 StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3240 address start = __ pc();
3241
3242 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3243
3244 // Aliases
3245 Register adler = c_rarg0;
3246 Register s1 = c_rarg0;
3247 Register s2 = c_rarg3;
3248 Register buff = c_rarg1;
3249 Register len = c_rarg2;
3250 Register nmax = r4;
3251 Register base = r5;
3252 Register count = r6;
3253 Register temp0 = rscratch1;
3254 Register temp1 = rscratch2;
3255 Register temp2 = r7;
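// For reference, Adler-32 keeps two sums modulo BASE = 65521:
//   s1 = 1 + buf[0] + ... + buf[n-1]
//   s2 = sum of the successive s1 values
// and the checksum is (s2 << 16) | s1. A naive per-byte loop
// (illustrative only) is:
//
//   for (i = 0; i < len; i++) {
//     s1 = (s1 + buff[i]) % BASE;
//     s2 = (s2 + s1) % BASE;
//   }
//
// The code below batches up to NMAX bytes between reductions (the
// largest count that cannot overflow 32 bits) and reduces using the
// identity 2^16 == 15 (mod 65521): fold x to (x >> 16) * 15 + (x & 0xffff)
// twice, then conditionally subtract BASE once.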
3256
3257 // Max number of bytes we can process before having to take the mod
3258 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1
3259 unsigned long BASE = 0xfff1;
3260 unsigned long NMAX = 0x15B0;
3261
3262 __ mov(base, BASE);
3263 __ mov(nmax, NMAX);
3264
3265 // s1 is initialized to the lower 16 bits of adler
3266 // s2 is initialized to the upper 16 bits of adler
3267 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff)
3268 __ uxth(s1, adler); // s1 = (adler & 0xffff)
3269
3270 // The pipelined loop needs at least 16 elements for one iteration.
3271 // It checks this itself, but it is more efficient to skip straight to the cleanup loop.
3272 __ cmp(len, 16);
3273 __ br(Assembler::HS, L_nmax);
3274 __ cbz(len, L_combine);
3275
3276 __ bind(L_simple_by1_loop);
3277 __ ldrb(temp0, Address(__ post(buff, 1)));
3278 __ add(s1, s1, temp0);
3279 __ add(s2, s2, s1);
3280 __ subs(len, len, 1);
3281 __ br(Assembler::HI, L_simple_by1_loop);
3282
3283 // s1 = s1 % BASE
3284 __ subs(temp0, s1, base);
3285 __ csel(s1, temp0, s1, Assembler::HS);
3286
3287 // s2 = s2 % BASE
3288 __ lsr(temp0, s2, 16);
3289 __ lsl(temp1, temp0, 4);
3290 __ sub(temp1, temp1, temp0);
3291 __ add(s2, temp1, s2, ext::uxth);
3292
3293 __ subs(temp0, s2, base);
3294 __ csel(s2, temp0, s2, Assembler::HS);
3295
3296 __ b(L_combine);
3297
3298 __ bind(L_nmax);
3299 __ subs(len, len, nmax);
3300 __ sub(count, nmax, 16);
3301 __ br(Assembler::LO, L_by16);
3302
3303 __ bind(L_nmax_loop);
3304
3305 __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3306
3307 __ add(s1, s1, temp0, ext::uxtb);
3308 __ ubfx(temp2, temp0, 8, 8);
3309 __ add(s2, s2, s1);
3310 __ add(s1, s1, temp2);
3311 __ ubfx(temp2, temp0, 16, 8);
3312 __ add(s2, s2, s1);
3313 __ add(s1, s1, temp2);
3314 __ ubfx(temp2, temp0, 24, 8);
3315 __ add(s2, s2, s1);
3316 __ add(s1, s1, temp2);
3317 __ ubfx(temp2, temp0, 32, 8);
3318 __ add(s2, s2, s1);
3319 __ add(s1, s1, temp2);
3320 __ ubfx(temp2, temp0, 40, 8);
3321 __ add(s2, s2, s1);
3322 __ add(s1, s1, temp2);
3323 __ ubfx(temp2, temp0, 48, 8);
3324 __ add(s2, s2, s1);
3325 __ add(s1, s1, temp2);
3326 __ add(s2, s2, s1);
3327 __ add(s1, s1, temp0, Assembler::LSR, 56);
3328 __ add(s2, s2, s1);
3329
3330 __ add(s1, s1, temp1, ext::uxtb);
3331 __ ubfx(temp2, temp1, 8, 8);
3332 __ add(s2, s2, s1);
3333 __ add(s1, s1, temp2);
3334 __ ubfx(temp2, temp1, 16, 8);
3335 __ add(s2, s2, s1);
3336 __ add(s1, s1, temp2);
3337 __ ubfx(temp2, temp1, 24, 8);
3338 __ add(s2, s2, s1);
3339 __ add(s1, s1, temp2);
3340 __ ubfx(temp2, temp1, 32, 8);
3341 __ add(s2, s2, s1);
3342 __ add(s1, s1, temp2);
3343 __ ubfx(temp2, temp1, 40, 8);
3344 __ add(s2, s2, s1);
3345 __ add(s1, s1, temp2);
3346 __ ubfx(temp2, temp1, 48, 8);
3347 __ add(s2, s2, s1);
3348 __ add(s1, s1, temp2);
3349 __ add(s2, s2, s1);
3350 __ add(s1, s1, temp1, Assembler::LSR, 56);
3351 __ add(s2, s2, s1);
3352
3353 __ subs(count, count, 16);
3354 __ br(Assembler::HS, L_nmax_loop);
3355
3356 // s1 = s1 % BASE
3357 __ lsr(temp0, s1, 16);
3358 __ lsl(temp1, temp0, 4);
3359 __ sub(temp1, temp1, temp0);
3360 __ add(temp1, temp1, s1, ext::uxth);
3361
3362 __ lsr(temp0, temp1, 16);
3363 __ lsl(s1, temp0, 4);
3364 __ sub(s1, s1, temp0);
3365 __ add(s1, s1, temp1, ext::uxth);
3366
3367 __ subs(temp0, s1, base);
3368 __ csel(s1, temp0, s1, Assembler::HS);
3369
3370 // s2 = s2 % BASE
3371 __ lsr(temp0, s2, 16);
3372 __ lsl(temp1, temp0, 4);
3373 __ sub(temp1, temp1, temp0);
3374 __ add(temp1, temp1, s2, ext::uxth);
3375
3376 __ lsr(temp0, temp1, 16);
3377 __ lsl(s2, temp0, 4);
3378 __ sub(s2, s2, temp0);
3379 __ add(s2, s2, temp1, ext::uxth);
3380
3381 __ subs(temp0, s2, base);
3382 __ csel(s2, temp0, s2, Assembler::HS);
3383
3384 __ subs(len, len, nmax);
3385 __ sub(count, nmax, 16);
3386 __ br(Assembler::HS, L_nmax_loop);
3387
3388 __ bind(L_by16);
3389 __ adds(len, len, count);
3390 __ br(Assembler::LO, L_by1);
3391
3392 __ bind(L_by16_loop);
3393
3394 __ ldp(temp0, temp1, Address(__ post(buff, 16)));
3395
3396 __ add(s1, s1, temp0, ext::uxtb);
3397 __ ubfx(temp2, temp0, 8, 8);
3398 __ add(s2, s2, s1);
3399 __ add(s1, s1, temp2);
3400 __ ubfx(temp2, temp0, 16, 8);
3401 __ add(s2, s2, s1);
3402 __ add(s1, s1, temp2);
3403 __ ubfx(temp2, temp0, 24, 8);
3404 __ add(s2, s2, s1);
3405 __ add(s1, s1, temp2);
3406 __ ubfx(temp2, temp0, 32, 8);
3407 __ add(s2, s2, s1);
3408 __ add(s1, s1, temp2);
3409 __ ubfx(temp2, temp0, 40, 8);
3410 __ add(s2, s2, s1);
3411 __ add(s1, s1, temp2);
3412 __ ubfx(temp2, temp0, 48, 8);
3413 __ add(s2, s2, s1);
3414 __ add(s1, s1, temp2);
3415 __ add(s2, s2, s1);
3416 __ add(s1, s1, temp0, Assembler::LSR, 56);
3417 __ add(s2, s2, s1);
3418
3419 __ add(s1, s1, temp1, ext::uxtb);
3420 __ ubfx(temp2, temp1, 8, 8);
3421 __ add(s2, s2, s1);
3422 __ add(s1, s1, temp2);
3423 __ ubfx(temp2, temp1, 16, 8);
3424 __ add(s2, s2, s1);
3425 __ add(s1, s1, temp2);
3426 __ ubfx(temp2, temp1, 24, 8);
3427 __ add(s2, s2, s1);
3428 __ add(s1, s1, temp2);
3429 __ ubfx(temp2, temp1, 32, 8);
3430 __ add(s2, s2, s1);
3431 __ add(s1, s1, temp2);
3432 __ ubfx(temp2, temp1, 40, 8);
3433 __ add(s2, s2, s1);
3434 __ add(s1, s1, temp2);
3435 __ ubfx(temp2, temp1, 48, 8);
3436 __ add(s2, s2, s1);
3437 __ add(s1, s1, temp2);
3438 __ add(s2, s2, s1);
3439 __ add(s1, s1, temp1, Assembler::LSR, 56);
3440 __ add(s2, s2, s1);
3441
3442 __ subs(len, len, 16);
3443 __ br(Assembler::HS, L_by16_loop);
3444
3445 __ bind(L_by1);
3446 __ adds(len, len, 15);
3447 __ br(Assembler::LO, L_do_mod);
3448
3449 __ bind(L_by1_loop);
3450 __ ldrb(temp0, Address(__ post(buff, 1)));
3451 __ add(s1, temp0, s1);
3452 __ add(s2, s2, s1);
3453 __ subs(len, len, 1);
3454 __ br(Assembler::HS, L_by1_loop);
3455
3456 __ bind(L_do_mod);
3457 // s1 = s1 % BASE
3458 __ lsr(temp0, s1, 16);
3459 __ lsl(temp1, temp0, 4);
3460 __ sub(temp1, temp1, temp0);
3461 __ add(temp1, temp1, s1, ext::uxth);
3462
3463 __ lsr(temp0, temp1, 16);
3464 __ lsl(s1, temp0, 4);
3465 __ sub(s1, s1, temp0);
3466 __ add(s1, s1, temp1, ext::uxth);
3467
3468 __ subs(temp0, s1, base);
3469 __ csel(s1, temp0, s1, Assembler::HS);
3470
3471 // s2 = s2 % BASE
3472 __ lsr(temp0, s2, 16);
3473 __ lsl(temp1, temp0, 4);
3474 __ sub(temp1, temp1, temp0);
3475 __ add(temp1, temp1, s2, ext::uxth);
3476
3477 __ lsr(temp0, temp1, 16);
3478 __ lsl(s2, temp0, 4);
3479 __ sub(s2, s2, temp0);
3480 __ add(s2, s2, temp1, ext::uxth);
3481
3482 __ subs(temp0, s2, base);
3483 __ csel(s2, temp0, s2, Assembler::HS);
3484
3485 // Combine lower bits and higher bits
3486 __ bind(L_combine);
3487 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3488
3489 __ ret(lr);
3490
3491 return start;
3492 }
3493
3494 /**
3495 * Arguments:
3496 *
3497 * Input:
3498 * c_rarg0 - x address
3499 * c_rarg1 - x length
3500 * c_rarg2 - y address
3501 * c_rarg3 - y length
3502 * c_rarg4 - z address
3503 * c_rarg5 - z length
3504 */
3505 address generate_multiplyToLen() {
3506 __ align(CodeEntryAlignment);
3507 StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3508
3509 address start = __ pc();
3510 const Register x = r0;
3494 /** 3495 * Arguments: 3496 * 3497 * Input: 3498 * c_rarg0 - x address 3499 * c_rarg1 - x length 3500 * c_rarg2 - y address 3501 * c_rarg3 - y length 3502 * c_rarg4 - z address 3503 * c_rarg5 - z length 3504 */ 3505 address generate_multiplyToLen() { 3506 __ align(CodeEntryAlignment); 3507 StubCodeMark mark(this, "StubRoutines", "multiplyToLen"); 3508 3509 address start = __ pc(); 3510 const Register x = r0; 3511 const Register xlen = r1; 3512 const Register y = r2; 3513 const Register ylen = r3; 3514 const Register z = r4; 3515 const Register zlen = r5; 3516 3517 const Register tmp1 = r10; 3518 const Register tmp2 = r11; 3519 const Register tmp3 = r12; 3520 const Register tmp4 = r13; 3521 const Register tmp5 = r14; 3522 const Register tmp6 = r15; 3523 const Register tmp7 = r16; 3524 3525 BLOCK_COMMENT("Entry:"); 3526 __ enter(); // required for proper stackwalking of RuntimeStub frame 3527 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 3528 __ leave(); // required for proper stackwalking of RuntimeStub frame 3529 __ ret(lr); 3530 3531 return start; 3532 } 3533 3534 address generate_squareToLen() { 3535 // The squareToLen algorithm for sizes 1..127, described in the Java code, 3536 // runs faster than multiply_to_len on some CPUs and slower on others, but 3537 // multiply_to_len shows slightly better results overall 3538 __ align(CodeEntryAlignment); 3539 StubCodeMark mark(this, "StubRoutines", "squareToLen"); 3540 address start = __ pc(); 3541 3542 const Register x = r0; 3543 const Register xlen = r1; 3544 const Register z = r2; 3545 const Register zlen = r3; 3546 const Register y = r4; // == x 3547 const Register ylen = r5; // == xlen 3548 3549 const Register tmp1 = r10; 3550 const Register tmp2 = r11; 3551 const Register tmp3 = r12; 3552 const Register tmp4 = r13; 3553 const Register tmp5 = r14; 3554 const Register tmp6 = r15; 3555 const Register tmp7 = r16; 3556 3557 RegSet spilled_regs = RegSet::of(y, ylen); 3558 BLOCK_COMMENT("Entry:"); 3559 __ enter(); 3560 __ push(spilled_regs, sp); 3561 __ mov(y, x); 3562 __ mov(ylen, xlen); 3563 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 3564 __ pop(spilled_regs, sp); 3565 __ leave(); 3566 __ ret(lr); 3567 return start; 3568 } 3569 3570 address generate_mulAdd() { 3571 __ align(CodeEntryAlignment); 3572 StubCodeMark mark(this, "StubRoutines", "mulAdd"); 3573 3574 address start = __ pc(); 3575 3576 const Register out = r0; 3577 const Register in = r1; 3578 const Register offset = r2; 3579 const Register len = r3; 3580 const Register k = r4; 3581 3582 BLOCK_COMMENT("Entry:"); 3583 __ enter(); 3584 __ mul_add(out, in, offset, len, k); 3585 __ leave(); 3586 __ ret(lr); 3587 3588 return start; 3589 } 3590
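// The three stubs above delegate the real work to MacroAssembler::multiply_to_len
// and MacroAssembler::mul_add. As a rough model of what those helpers compute
// (a little-endian sketch for exposition only -- the actual intrinsics follow
// java.math.BigInteger's word order and argument conventions):
//
// // z[0..xlen+ylen) = x[0..xlen) * y[0..ylen), schoolbook multiplication
// void multiply_to_len(unsigned long x[], int xlen,
//                      unsigned long y[], int ylen, unsigned long z[]) {
//   for (int i = 0; i < xlen + ylen; i++) z[i] = 0;
//   for (int j = 0; j < ylen; j++) {
//     unsigned long carry = 0;
//     for (int i = 0; i < xlen; i++) {
//       __uint128_t p = (__uint128_t)x[i] * y[j] + z[i + j] + carry;
//       z[i + j] = (unsigned long)p;
//       carry    = (unsigned long)(p >> 64);
//     }
//     z[j + xlen] = carry;
//   }
// }
//
// // out[0..len) += in[0..len) * k; returns the carry out of the top word
// unsigned long mul_add(unsigned long out[], unsigned long in[],
//                       int len, unsigned long k) {
//   unsigned long carry = 0;
//   for (int i = 0; i < len; i++) {
//     __uint128_t p = (__uint128_t)in[i] * k + out[i] + carry;
//     out[i] = (unsigned long)p;
//     carry  = (unsigned long)(p >> 64);
//   }
//   return carry;
// }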
3591 void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi, 3592 FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0, 3593 FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) { 3594 // Karatsuba multiplication performs a 128*128 -> 256-bit 3595 // multiplication in three 128-bit multiplications and a few 3596 // additions. 3597 // 3598 // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1) 3599 // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0 3600 // 3601 // Inputs: 3602 // 3603 // A0 in a.d[0] (subkey) 3604 // A1 in a.d[1] 3605 // (A1+A0) in a1_xor_a0.d[0] 3606 // 3607 // B0 in b.d[0] (state) 3608 // B1 in b.d[1] 3609 3610 __ ext(tmp1, __ T16B, b, b, 0x08); 3611 __ pmull2(result_hi, __ T1Q, b, a, __ T2D); // A1*B1 3612 __ eor(tmp1, __ T16B, tmp1, b); // (B1+B0) 3613 __ pmull(result_lo, __ T1Q, b, a, __ T1D); // A0*B0 3614 __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0) 3615 3616 __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08); 3617 __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0 3618 __ eor(tmp2, __ T16B, tmp2, tmp4); 3619 __ eor(tmp2, __ T16B, tmp2, tmp3); 3620 3621 // Register pair <result_hi:result_lo> holds the result of carry-less multiplication 3622 __ ins(result_hi, __ D, tmp2, 0, 1); 3623 __ ins(result_lo, __ D, tmp2, 1, 0); 3624 } 3625 3626 void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi, 3627 FloatRegister p, FloatRegister z, FloatRegister t1) { 3628 const FloatRegister t0 = result; 3629 3630 // The GCM field polynomial f is z^128 + p(z), where p = 3631 // z^7+z^2+z+1. 3632 // 3633 // z^128 === -p(z) (mod (z^128 + p(z))) 3634 // 3635 // so, given that the product we're reducing is 3636 // a == lo + hi * z^128 3637 // substituting, 3638 // === lo - hi * p(z) (mod (z^128 + p(z))) 3639 // 3640 // we reduce by multiplying hi by p(z) and subtracting the result 3641 // from (i.e. XORing it with) lo. Because p has no nonzero high 3642 // bits we can do this with two 64-bit multiplications, lo*p and 3643 // hi*p. 3644 3645 __ pmull2(t0, __ T1Q, hi, p, __ T2D); 3646 __ ext(t1, __ T16B, t0, z, 8); 3647 __ eor(hi, __ T16B, hi, t1); 3648 __ ext(t1, __ T16B, z, t0, 8); 3649 __ eor(lo, __ T16B, lo, t1); 3650 __ pmull(t0, __ T1Q, hi, p, __ T1D); 3651 __ eor(result, __ T16B, lo, t0); 3652 } 3653
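// In C-like terms, the pair of helpers above computes the following (a sketch,
// ignoring the bit-reflection that the caller performs; u128 is a 128-bit
// value held as two 64-bit halves d0/d1, ^ is GF(2) addition, and clmul()
// stands for the 64x64 -> 128-bit carry-less multiply done by PMULL):
//
// typedef struct { unsigned long d0, d1; } u128;
// u128 clmul(unsigned long a, unsigned long b);  // carry-less multiply (PMULL)
//
// // ghash_multiply: Karatsuba, three clmuls instead of four
// void ghash_multiply(u128 *lo, u128 *hi, u128 a, u128 b) {
//   u128 c = clmul(a.d1, b.d1);                           // A1*B1
//   u128 d = clmul(a.d0, b.d0);                           // A0*B0
//   u128 e = clmul(a.d0 ^ a.d1, b.d0 ^ b.d1);             // (A1+A0)(B1+B0)
//   u128 m = { c.d0 ^ d.d0 ^ e.d0, c.d1 ^ d.d1 ^ e.d1 };  // middle 128 bits
//   lo->d0 = d.d0;         lo->d1 = d.d1 ^ m.d0;
//   hi->d0 = c.d0 ^ m.d1;  hi->d1 = c.d1;
// }
//
// // ghash_reduce: fold hi into lo modulo z^128 + p(z), 64 bits at a time
// u128 ghash_reduce(u128 lo, u128 hi) {
//   u128 t = clmul(hi.d1, 0x87);  // 0x87 encodes p(z) = z^7+z^2+z+1
//   hi.d0 ^= t.d1;                // the 7-bit spill goes one word down
//   lo.d1 ^= t.d0;
//   t = clmul(hi.d0, 0x87);
//   lo.d0 ^= t.d0;
//   lo.d1 ^= t.d1;
//   return lo;                    // == (hi:lo) mod (z^128 + p(z))
// }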
3654 address generate_has_negatives(address &has_negatives_long) { 3655 StubCodeMark mark(this, "StubRoutines", "has_negatives"); 3656 const int large_loop_size = 64; 3657 const uint64_t UPPER_BIT_MASK = 0x8080808080808080; 3658 int dcache_line = VM_Version::dcache_line_size(); 3659 3660 Register ary1 = r1, len = r2, result = r0; 3661 3662 __ align(CodeEntryAlignment); 3663 address entry = __ pc(); 3664 3665 __ enter(); 3666 3667 Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE, 3668 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 3669 3670 __ cmp(len, 15); 3671 __ br(Assembler::GT, LEN_OVER_15); 3672 // The only case in which execution falls into this code is when the pointer is 3673 // near the end of a memory page and we have to avoid reading the next page 3674 __ add(ary1, ary1, len); 3675 __ subs(len, len, 8); 3676 __ br(Assembler::GT, LEN_OVER_8); 3677 __ ldr(rscratch2, Address(ary1, -8)); 3678 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 3679 __ lsrv(rscratch2, rscratch2, rscratch1); 3680 __ tst(rscratch2, UPPER_BIT_MASK); 3681 __ cset(result, Assembler::NE); 3682 __ leave(); 3683 __ ret(lr); 3684 __ bind(LEN_OVER_8); 3685 __ ldp(rscratch1, rscratch2, Address(ary1, -16)); 3686 __ sub(len, len, 8); // no data dep., then sub can be executed while loading 3687 __ tst(rscratch2, UPPER_BIT_MASK); 3688 __ br(Assembler::NE, RET_TRUE_NO_POP); 3689 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes 3690 __ lsrv(rscratch1, rscratch1, rscratch2); 3691 __ tst(rscratch1, UPPER_BIT_MASK); 3692 __ cset(result, Assembler::NE); 3693 __ leave(); 3694 __ ret(lr); 3695 3696 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10; 3697 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6; 3698 3699 has_negatives_long = __ pc(); // 2nd entry point 3700 3701 __ enter(); 3702 3703 __ bind(LEN_OVER_15); 3704 __ push(spilled_regs, sp); 3705 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment 3706 __ cbz(rscratch2, ALIGNED); 3707 __ ldp(tmp6, tmp1, Address(ary1)); 3708 __ mov(tmp5, 16); 3709 __ sub(rscratch1, tmp5, rscratch2); // number of bytes until aligned address 3710 __ add(ary1, ary1, rscratch1); 3711 __ sub(len, len, rscratch1); 3712 __ orr(tmp6, tmp6, tmp1); 3713 __ tst(tmp6, UPPER_BIT_MASK); 3714 __ br(Assembler::NE, RET_TRUE); 3715 3716 __ bind(ALIGNED); 3717 __ cmp(len, large_loop_size); 3718 __ br(Assembler::LT, CHECK_16); 3719 // Perform a 16-byte load as an early return in the pre-loop, to handle the 3720 // situation where an initially aligned large array has negative values in its 3721 // starting bytes: there LARGE_LOOP would do 4 reads instead of 1 (in the worst 3722 // case), which is slower. Cases with negative bytes further ahead won't be 3723 // affected much. In fact, they'll be faster due to the early loads, fewer 3724 // instructions and fewer branches in LARGE_LOOP. 3725 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16))); 3726 __ sub(len, len, 16); 3727 __ orr(tmp6, tmp6, tmp1); 3728 __ tst(tmp6, UPPER_BIT_MASK); 3729 __ br(Assembler::NE, RET_TRUE); 3730 __ cmp(len, large_loop_size); 3731 __ br(Assembler::LT, CHECK_16); 3732 3733 if (SoftwarePrefetchHintDistance >= 0 3734 && SoftwarePrefetchHintDistance >= dcache_line) { 3735 // initial prefetch 3736 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line)); 3737 } 3738 __ bind(LARGE_LOOP); 3739 if (SoftwarePrefetchHintDistance >= 0) { 3740 __ prfm(Address(ary1, SoftwarePrefetchHintDistance)); 3741 } 3742 // Issue the load instructions first, since that can save a few CPU/MEM cycles; 3743 // also, instead of 4 triples of "orr(...); andr(...); cbnz(...);" (one per ldp), 3744 // it is better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which saves 3 3745 // instructions and has fewer branches. The trade-off is that this approach disables 3746 // early return, thus all 64 bytes are loaded and checked every time.
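// In C, one trip around LARGE_LOOP is approximately the following (a sketch;
// the generated code adds the software prefetch and the alignment and tail
// handling around it):
//
// // returns nonzero iff any of the 64 bytes has its top bit set,
// // i.e. iff any jbyte in this block is negative
// int block_has_negatives(const unsigned long *p) {  // p is 16-byte aligned
//   unsigned long acc = 0;
//   for (int i = 0; i < 8; i++)  // 8 words x 8 bytes = 64 bytes
//     acc |= p[i];
//   return (acc & 0x8080808080808080UL) != 0;  // UPPER_BIT_MASK
// }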
3747 __ ldp(tmp2, tmp3, Address(ary1)); 3748 __ ldp(tmp4, tmp5, Address(ary1, 16)); 3749 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 3750 __ ldp(tmp6, tmp1, Address(ary1, 48)); 3751 __ add(ary1, ary1, large_loop_size); 3752 __ sub(len, len, large_loop_size); 3753 __ orr(tmp2, tmp2, tmp3); 3754 __ orr(tmp4, tmp4, tmp5); 3755 __ orr(rscratch1, rscratch1, rscratch2); 3756 __ orr(tmp6, tmp6, tmp1); 3757 __ orr(tmp2, tmp2, tmp4); 3758 __ orr(rscratch1, rscratch1, tmp6); 3759 __ orr(tmp2, tmp2, rscratch1); 3760 __ tst(tmp2, UPPER_BIT_MASK); 3761 __ br(Assembler::NE, RET_TRUE); 3762 __ cmp(len, large_loop_size); 3763 __ br(Assembler::GE, LARGE_LOOP); 3764 3765 __ bind(CHECK_16); // small 16-byte load pre-loop 3766 __ cmp(len, 16); 3767 __ br(Assembler::LT, POST_LOOP16); 3768 3769 __ bind(LOOP16); // small 16-byte load loop 3770 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 3771 __ sub(len, len, 16); 3772 __ orr(tmp2, tmp2, tmp3); 3773 __ tst(tmp2, UPPER_BIT_MASK); 3774 __ br(Assembler::NE, RET_TRUE); 3775 __ cmp(len, 16); 3776 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 3777 3778 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 3779 __ cmp(len, 8); 3780 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 3781 __ ldr(tmp3, Address(__ post(ary1, 8))); 3782 __ sub(len, len, 8); 3783 __ tst(tmp3, UPPER_BIT_MASK); 3784 __ br(Assembler::NE, RET_TRUE); 3785 3786 __ bind(POST_LOOP16_LOAD_TAIL); 3787 __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0 3788 __ ldr(tmp1, Address(ary1)); 3789 __ mov(tmp2, 64); 3790 __ sub(tmp4, tmp2, len, __ LSL, 3); 3791 __ lslv(tmp1, tmp1, tmp4); 3792 __ tst(tmp1, UPPER_BIT_MASK); 3793 __ br(Assembler::NE, RET_TRUE); 3794 // Fallthrough 3795 3796 __ bind(RET_FALSE); 3797 __ pop(spilled_regs, sp); 3798 __ leave(); 3799 __ mov(result, zr); 3800 __ ret(lr); 3801 3802 __ bind(RET_TRUE); 3803 __ pop(spilled_regs, sp); 3804 __ bind(RET_TRUE_NO_POP); 3805 __ leave(); 3806 __ mov(result, 1); 3807 __ ret(lr); 3808 3809 __ bind(DONE); 3810 __ pop(spilled_regs, sp); 3811 __ leave(); 3812 __ ret(lr); 3813 return entry; 3814 } 3815 /** 3816 * Arguments: 3817 * 3818 * Input: 3819 * c_rarg0 - current state address 3820 * c_rarg1 - H key address 3821 * c_rarg2 - data address 3822 * c_rarg3 - number of blocks 3823 * 3824 * Output: 3825 * Updated state at c_rarg0 3826 */ 3827 address generate_ghash_processBlocks() { 3828 // Bafflingly, GCM uses little-endian for the byte order, but 3829 // big-endian for the bit order. For example, the polynomial 1 is 3830 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 3831 // 3832 // So, we must either reverse the bytes in each word and do 3833 // everything big-endian or reverse the bits in each byte and do 3834 // it little-endian. On AArch64 it's more idiomatic to reverse 3835 // the bits in each byte (we have an instruction, RBIT, to do 3836 // that) and keep the data in little-endian bit order throughout the 3837 // calculation, bit-reversing the inputs and outputs. 3838 3839 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks"); 3840 __ align(wordSize * 2); 3841 address p = __ pc(); 3842 __ emit_int64(0x87); // The low-order bits of the field 3843 // polynomial (i.e.
p = z^7+z^2+z+1) 3844 // repeated in the low and high parts of a 3845 // 128-bit vector 3846 __ emit_int64(0x87); 3847 3848 __ align(CodeEntryAlignment); 3849 address start = __ pc(); 3850 3851 Register state = c_rarg0; 3852 Register subkeyH = c_rarg1; 3853 Register data = c_rarg2; 3854 Register blocks = c_rarg3; 3855 3856 FloatRegister vzr = v30; 3857 __ eor(vzr, __ T16B, vzr, vzr); // zero register 3858 3859 __ ldrq(v0, Address(state)); 3860 __ ldrq(v1, Address(subkeyH)); 3861 3862 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 3863 __ rbit(v0, __ T16B, v0); 3864 __ rev64(v1, __ T16B, v1); 3865 __ rbit(v1, __ T16B, v1); 3866 3867 __ ldrq(v26, p); 3868 3869 __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 3870 __ eor(v16, __ T16B, v16, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 3871 3872 { 3873 Label L_ghash_loop; 3874 __ bind(L_ghash_loop); 3875 3876 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 3877 // reversing each byte 3878 __ rbit(v2, __ T16B, v2); 3879 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 3880 3881 // Multiply state in v2 by subkey in v1 3882 ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 3883 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16, 3884 /*temps*/v6, v20, v18, v21); 3885 // Reduce v7:v5 by the field polynomial 3886 ghash_reduce(v0, v5, v7, v26, vzr, v20); 3887 3888 __ sub(blocks, blocks, 1); 3889 __ cbnz(blocks, L_ghash_loop); 3890 } 3891 3892 // The bit-reversed result is at this point in v0 3893 __ rev64(v1, __ T16B, v0); 3894 __ rbit(v1, __ T16B, v1); 3895 3896 __ st1(v1, __ T16B, state); 3897 __ ret(lr); 3898 3899 return start; 3900 } 3901 3902 // Continuation point for throwing of implicit exceptions that are 3903 // not handled in the current activation. Fabricates an exception 3904 // oop and initiates normal exception dispatching in this 3905 // frame. Since we need to preserve callee-saved values (currently 3906 // only for C2, but done for C1 as well) we need a callee-saved oop 3907 // map and therefore have to make these stubs into RuntimeStubs 3908 // rather than BufferBlobs. If the compiler needs all registers to 3909 // be preserved between the fault point and the exception handler 3910 // then it must assume responsibility for that in 3911 // AbstractCompiler::continuation_for_implicit_null_exception or 3912 // continuation_for_implicit_division_by_zero_exception. All other 3913 // implicit exceptions (e.g., NullPointerException or 3914 // AbstractMethodError on entry) are either at call sites or 3915 // otherwise assume that stack unwinding will be initiated, so 3916 // caller saved registers were assumed volatile in the compiler. 3917 3918 #undef __ 3919 #define __ masm-> 3920 3921 address generate_throw_exception(const char* name, 3922 address runtime_entry, 3923 Register arg1 = noreg, 3924 Register arg2 = noreg) { 3925 // Information about frame layout at time of blocking runtime call. 3926 // Note that we only have to preserve callee-saved registers since 3927 // the compilers are responsible for supplying a continuation point 3928 // if they expect all registers to be preserved. 3929 // n.b. 
aarch64 asserts that frame::arg_reg_save_area_bytes == 0 3930 enum layout { 3931 rfp_off = 0, 3932 rfp_off2, 3933 return_off, 3934 return_off2, 3935 framesize // inclusive of return address 3936 }; 3937 3938 int insts_size = 512; 3939 int locs_size = 64; 3940 3941 CodeBuffer code(name, insts_size, locs_size); 3942 OopMapSet* oop_maps = new OopMapSet(); 3943 MacroAssembler* masm = new MacroAssembler(&code); 3944 3945 address start = __ pc(); 3946 3947 // This is an inlined and slightly modified version of call_VM 3948 // which has the ability to fetch the return PC out of 3949 // thread-local storage and also sets up last_Java_sp slightly 3950 // differently than the real call_VM 3951 3952 __ enter(); // Save FP and LR before call 3953 3954 assert(is_even(framesize/2), "sp not 16-byte aligned"); 3955 3956 // lr and fp are already in place 3957 __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog 3958 3959 int frame_complete = __ pc() - start; 3960 3961 // Set up last_Java_sp and last_Java_fp 3962 address the_pc = __ pc(); 3963 __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1); 3964 3965 // Call runtime 3966 if (arg1 != noreg) { 3967 assert(arg2 != c_rarg1, "clobbered"); 3968 __ mov(c_rarg1, arg1); 3969 } 3970 if (arg2 != noreg) { 3971 __ mov(c_rarg2, arg2); 3972 } 3973 __ mov(c_rarg0, rthread); 3974 BLOCK_COMMENT("call runtime_entry"); 3975 __ mov(rscratch1, runtime_entry); 3976 __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1); 3977 3978 // Generate oop map 3979 OopMap* map = new OopMap(framesize, 0); 3980 3981 oop_maps->add_gc_map(the_pc - start, map); 3982 3983 __ reset_last_Java_frame(true); 3984 __ maybe_isb(); 3985 3986 __ leave(); 3987 3988 // check for pending exceptions 3989 #ifdef ASSERT 3990 Label L; 3991 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 3992 __ cbnz(rscratch1, L); 3993 __ should_not_reach_here(); 3994 __ bind(L); 3995 #endif // ASSERT 3996 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 3997 3998 3999 // codeBlob framesize is in words (not VMRegImpl::slot_size) 4000 RuntimeStub* stub = 4001 RuntimeStub::new_runtime_stub(name, 4002 &code, 4003 frame_complete, 4004 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 4005 oop_maps, false); 4006 return stub->entry_point(); 4007 } 4008 4009 class MontgomeryMultiplyGenerator : public MacroAssembler { 4010 4011 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 4012 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 4013 4014 RegSet _toSave; 4015 bool _squaring; 4016 4017 public: 4018 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 4019 : MacroAssembler(as->code()), _squaring(squaring) { 4020 4021 // Register allocation 4022 4023 Register reg = c_rarg0; 4024 Pa_base = reg; // Argument registers 4025 if (squaring) 4026 Pb_base = Pa_base; 4027 else 4028 Pb_base = ++reg; 4029 Pn_base = ++reg; 4030 Rlen = ++reg; 4031 inv = ++reg; 4032 Pm_base = ++reg; 4033 4034 // Working registers: 4035 Ra = ++reg; // The current digit of a, b, n, and m. 4036 Rb = ++reg; 4037 Rm = ++reg; 4038 Rn = ++reg; 4039 4040 Pa = ++reg; // Pointers to the current/next digit of a, b, n, and m. 4041 Pb = ++reg; 4042 Pm = ++reg; 4043 Pn = ++reg; 4044 4045 t0 = ++reg; // Three registers which form a 4046 t1 = ++reg; // triple-precision accumulator. 4047 t2 = ++reg; 4048 4049 Ri = ++reg; // Inner and outer loop indexes.
4050 Rj = ++reg; 4051 4052 Rhi_ab = ++reg; // Product registers: low and high parts 4053 Rlo_ab = ++reg; // of a*b and m*n. 4054 Rhi_mn = ++reg; 4055 Rlo_mn = ++reg; 4056 4057 // r19 and up are callee-saved. 4058 _toSave = RegSet::range(r19, reg) + Pm_base; 4059 } 4060 4061 private: 4062 void save_regs() { 4063 push(_toSave, sp); 4064 } 4065 4066 void restore_regs() { 4067 pop(_toSave, sp); 4068 } 4069 4070 template <typename T> 4071 void unroll_2(Register count, T block) { 4072 Label loop, end, odd; 4073 tbnz(count, 0, odd); 4074 cbz(count, end); 4075 align(16); 4076 bind(loop); 4077 (this->*block)(); 4078 bind(odd); 4079 (this->*block)(); 4080 subs(count, count, 2); 4081 br(Assembler::GT, loop); 4082 bind(end); 4083 } 4084 4085 template <typename T> 4086 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 4087 Label loop, end, odd; 4088 tbnz(count, 0, odd); 4089 cbz(count, end); 4090 align(16); 4091 bind(loop); 4092 (this->*block)(d, s, tmp); 4093 bind(odd); 4094 (this->*block)(d, s, tmp); 4095 subs(count, count, 2); 4096 br(Assembler::GT, loop); 4097 bind(end); 4098 } 4099 4100 void pre1(RegisterOrConstant i) { 4101 block_comment("pre1"); 4102 // Pa = Pa_base; 4103 // Pb = Pb_base + i; 4104 // Pm = Pm_base; 4105 // Pn = Pn_base + i; 4106 // Ra = *Pa; 4107 // Rb = *Pb; 4108 // Rm = *Pm; 4109 // Rn = *Pn; 4110 ldr(Ra, Address(Pa_base)); 4111 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 4112 ldr(Rm, Address(Pm_base)); 4113 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4114 lea(Pa, Address(Pa_base)); 4115 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 4116 lea(Pm, Address(Pm_base)); 4117 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4118 4119 // Zero the m*n result. 4120 mov(Rhi_mn, zr); 4121 mov(Rlo_mn, zr); 4122 } 4123 4124 // The core multiply-accumulate step of a Montgomery 4125 // multiplication. The idea is to schedule operations as a 4126 // pipeline so that instructions with long latencies (loads and 4127 // multiplies) have time to complete before their results are 4128 // used. This most benefits in-order implementations of the 4129 // architecture but out-of-order ones also benefit. 4130 void step() { 4131 block_comment("step"); 4132 // MACC(Ra, Rb, t0, t1, t2); 4133 // Ra = *++Pa; 4134 // Rb = *--Pb; 4135 umulh(Rhi_ab, Ra, Rb); 4136 mul(Rlo_ab, Ra, Rb); 4137 ldr(Ra, pre(Pa, wordSize)); 4138 ldr(Rb, pre(Pb, -wordSize)); 4139 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 4140 // previous iteration. 
4141 // MACC(Rm, Rn, t0, t1, t2); 4142 // Rm = *++Pm; 4143 // Rn = *--Pn; 4144 umulh(Rhi_mn, Rm, Rn); 4145 mul(Rlo_mn, Rm, Rn); 4146 ldr(Rm, pre(Pm, wordSize)); 4147 ldr(Rn, pre(Pn, -wordSize)); 4148 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4149 } 4150 4151 void post1() { 4152 block_comment("post1"); 4153 4154 // MACC(Ra, Rb, t0, t1, t2); 4155 // Ra = *++Pa; 4156 // Rb = *--Pb; 4157 umulh(Rhi_ab, Ra, Rb); 4158 mul(Rlo_ab, Ra, Rb); 4159 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 4160 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4161 4162 // *Pm = Rm = t0 * inv; 4163 mul(Rm, t0, inv); 4164 str(Rm, Address(Pm)); 4165 4166 // MACC(Rm, Rn, t0, t1, t2); 4167 // t0 = t1; t1 = t2; t2 = 0; 4168 umulh(Rhi_mn, Rm, Rn); 4169 4170 #ifndef PRODUCT 4171 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 4172 { 4173 mul(Rlo_mn, Rm, Rn); 4174 add(Rlo_mn, t0, Rlo_mn); 4175 Label ok; 4176 cbz(Rlo_mn, ok); { 4177 stop("broken Montgomery multiply"); 4178 } bind(ok); 4179 } 4180 #endif 4181 // We have very carefully set things up so that 4182 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 4183 // the lower half of Rm * Rn because we know the result already: 4184 // it must be -t0. t0 + (-t0) must generate a carry iff 4185 // t0 != 0. So, rather than do a mul and an adds we just set 4186 // the carry flag iff t0 is nonzero. 4187 // 4188 // mul(Rlo_mn, Rm, Rn); 4189 // adds(zr, t0, Rlo_mn); 4190 subs(zr, t0, 1); // Set carry iff t0 is nonzero 4191 adcs(t0, t1, Rhi_mn); 4192 adc(t1, t2, zr); 4193 mov(t2, zr); 4194 } 4195 4196 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 4197 block_comment("pre2"); 4198 // Pa = Pa_base + i-len; 4199 // Pb = Pb_base + len; 4200 // Pm = Pm_base + i-len; 4201 // Pn = Pn_base + len; 4202 4203 if (i.is_register()) { 4204 sub(Rj, i.as_register(), len); 4205 } else { 4206 mov(Rj, i.as_constant()); 4207 sub(Rj, Rj, len); 4208 } 4209 // Rj == i-len 4210 4211 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 4212 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 4213 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 4214 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 4215 4216 // Ra = *++Pa; 4217 // Rb = *--Pb; 4218 // Rm = *++Pm; 4219 // Rn = *--Pn; 4220 ldr(Ra, pre(Pa, wordSize)); 4221 ldr(Rb, pre(Pb, -wordSize)); 4222 ldr(Rm, pre(Pm, wordSize)); 4223 ldr(Rn, pre(Pn, -wordSize)); 4224 4225 mov(Rhi_mn, zr); 4226 mov(Rlo_mn, zr); 4227 } 4228 4229 void post2(RegisterOrConstant i, RegisterOrConstant len) { 4230 block_comment("post2"); 4231 if (i.is_constant()) { 4232 mov(Rj, i.as_constant()-len.as_constant()); 4233 } else { 4234 sub(Rj, i.as_register(), len); 4235 } 4236 4237 adds(t0, t0, Rlo_mn); // The pending m*n, low part 4238 4239 // As soon as we know the least significant digit of our result, 4240 // store it. 4241 // Pm_base[i-len] = t0; 4242 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 4243 4244 // t0 = t1; t1 = t2; t2 = 0; 4245 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 4246 adc(t1, t2, zr); 4247 mov(t2, zr); 4248 } 4249 4250 // A carry in t0 after Montgomery multiplication means that we 4251 // should subtract multiples of n from our result in m. We'll 4252 // keep doing that until there is no carry. 
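// The pseudocode in the comments below and after generate_multiply() uses
// MACC, MACC2 and sub() without spelling them out. Approximately, with
// t2:t1:t0 acting as the triple-precision accumulator (helpers written out
// here for exposition only):
//
// // t2:t1:t0 += a * b
// void MACC(unsigned long a, unsigned long b,
//           unsigned long *t0, unsigned long *t1, unsigned long *t2) {
//   __uint128_t p = (__uint128_t)a * b + *t0;
//   *t0 = (unsigned long)p;
//   p = (p >> 64) + *t1;
//   *t1 = (unsigned long)p;
//   *t2 += (unsigned long)(p >> 64);
// }
//
// // t2:t1:t0 += 2 * a * b (used by the squaring code)
// void MACC2(unsigned long a, unsigned long b,
//            unsigned long *t0, unsigned long *t1, unsigned long *t2) {
//   MACC(a, b, t0, t1, t2);
//   MACC(a, b, t0, t1, t2);
// }
//
// // m -= n (multiword); returns t0 less the final borrow
// unsigned long sub(unsigned long m[], unsigned long n[],
//                   unsigned long t0, int len) {
//   unsigned long borrow = 0;
//   for (int i = 0; i < len; i++) {
//     __uint128_t d = (__uint128_t)m[i] - n[i] - borrow;
//     m[i]   = (unsigned long)d;
//     borrow = (unsigned long)(d >> 64) & 1;  // 1 iff the subtraction wrapped
//   }
//   return t0 - borrow;
// }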
4253 void normalize(RegisterOrConstant len) { 4254 block_comment("normalize"); 4255 // while (t0) 4256 // t0 = sub(Pm_base, Pn_base, t0, len); 4257 Label loop, post, again; 4258 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 4259 cbz(t0, post); { 4260 bind(again); { 4261 mov(i, zr); 4262 mov(cnt, len); 4263 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 4264 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4265 subs(zr, zr, zr); // set carry flag, i.e. no borrow 4266 align(16); 4267 bind(loop); { 4268 sbcs(Rm, Rm, Rn); 4269 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 4270 add(i, i, 1); 4271 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 4272 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4273 sub(cnt, cnt, 1); 4274 } cbnz(cnt, loop); 4275 sbc(t0, t0, zr); 4276 } cbnz(t0, again); 4277 } bind(post); 4278 } 4279 4280 // Move memory at s to d, reversing words. 4281 // Increments d to end of copied memory 4282 // Destroys tmp1, tmp2 4283 // Preserves len 4284 // Leaves s pointing to the address which was in d at start 4285 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 4286 assert(tmp1 < r19 && tmp2 < r19, "register corruption"); 4287 4288 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 4289 mov(tmp1, len); 4290 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 4291 sub(s, d, len, ext::uxtw, LogBytesPerWord); 4292 } 4293 // where 4294 void reverse1(Register d, Register s, Register tmp) { 4295 ldr(tmp, pre(s, -wordSize)); 4296 ror(tmp, tmp, 32); 4297 str(tmp, post(d, wordSize)); 4298 } 4299 4300 void step_squaring() { 4301 // An extra ACC 4302 step(); 4303 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4304 } 4305 4306 void last_squaring(RegisterOrConstant i) { 4307 Label dont; 4308 // if ((i & 1) == 0) { 4309 tbnz(i.as_register(), 0, dont); { 4310 // MACC(Ra, Rb, t0, t1, t2); 4311 // Ra = *++Pa; 4312 // Rb = *--Pb; 4313 umulh(Rhi_ab, Ra, Rb); 4314 mul(Rlo_ab, Ra, Rb); 4315 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4316 } bind(dont); 4317 } 4318 4319 void extra_step_squaring() { 4320 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 4321 4322 // MACC(Rm, Rn, t0, t1, t2); 4323 // Rm = *++Pm; 4324 // Rn = *--Pn; 4325 umulh(Rhi_mn, Rm, Rn); 4326 mul(Rlo_mn, Rm, Rn); 4327 ldr(Rm, pre(Pm, wordSize)); 4328 ldr(Rn, pre(Pn, -wordSize)); 4329 } 4330 4331 void post1_squaring() { 4332 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 4333 4334 // *Pm = Rm = t0 * inv; 4335 mul(Rm, t0, inv); 4336 str(Rm, Address(Pm)); 4337 4338 // MACC(Rm, Rn, t0, t1, t2); 4339 // t0 = t1; t1 = t2; t2 = 0; 4340 umulh(Rhi_mn, Rm, Rn); 4341 4342 #ifndef PRODUCT 4343 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 4344 { 4345 mul(Rlo_mn, Rm, Rn); 4346 add(Rlo_mn, t0, Rlo_mn); 4347 Label ok; 4348 cbz(Rlo_mn, ok); { 4349 stop("broken Montgomery multiply"); 4350 } bind(ok); 4351 } 4352 #endif 4353 // We have very carefully set things up so that 4354 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 4355 // the lower half of Rm * Rn because we know the result already: 4356 // it must be -t0. t0 + (-t0) must generate a carry iff 4357 // t0 != 0. So, rather than do a mul and an adds we just set 4358 // the carry flag iff t0 is nonzero. 
4359 // 4360 // mul(Rlo_mn, Rm, Rn); 4361 // adds(zr, t0, Rlo_mn); 4362 subs(zr, t0, 1); // Set carry iff t0 is nonzero 4363 adcs(t0, t1, Rhi_mn); 4364 adc(t1, t2, zr); 4365 mov(t2, zr); 4366 } 4367 4368 void acc(Register Rhi, Register Rlo, 4369 Register t0, Register t1, Register t2) { 4370 adds(t0, t0, Rlo); 4371 adcs(t1, t1, Rhi); 4372 adc(t2, t2, zr); 4373 } 4374 4375 public: 4376 /** 4377 * Fast Montgomery multiplication. The derivation of the 4378 * algorithm is in A Cryptographic Library for the Motorola 4379 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 4380 * 4381 * Arguments: 4382 * 4383 * Inputs for multiplication: 4384 * c_rarg0 - int array elements a 4385 * c_rarg1 - int array elements b 4386 * c_rarg2 - int array elements n (the modulus) 4387 * c_rarg3 - int length 4388 * c_rarg4 - int inv 4389 * c_rarg5 - int array elements m (the result) 4390 * 4391 * Inputs for squaring: 4392 * c_rarg0 - int array elements a 4393 * c_rarg1 - int array elements n (the modulus) 4394 * c_rarg2 - int length 4395 * c_rarg3 - int inv 4396 * c_rarg4 - int array elements m (the result) 4397 * 4398 */ 4399 address generate_multiply() { 4400 Label argh, nothing; 4401 bind(argh); 4402 stop("MontgomeryMultiply total_allocation must be <= 8192"); 4403 4404 align(CodeEntryAlignment); 4405 address entry = pc(); 4406 4407 cbzw(Rlen, nothing); 4408 4409 enter(); 4410 4411 // Make room. 4412 cmpw(Rlen, 512); 4413 br(Assembler::HI, argh); 4414 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 4415 andr(sp, Ra, -2 * wordSize); 4416 4417 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 4418 4419 { 4420 // Copy input args, reversing as we go. We use Ra as a 4421 // temporary variable. 4422 reverse(Ra, Pa_base, Rlen, t0, t1); 4423 if (!_squaring) 4424 reverse(Ra, Pb_base, Rlen, t0, t1); 4425 reverse(Ra, Pn_base, Rlen, t0, t1); 4426 } 4427 4428 // Push all call-saved registers and also Pm_base which we'll need 4429 // at the end. 
4430 save_regs(); 4431 4432 #ifndef PRODUCT 4433 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 4434 { 4435 ldr(Rn, Address(Pn_base, 0)); 4436 mul(Rlo_mn, Rn, inv); 4437 cmp(Rlo_mn, -1); 4438 Label ok; 4439 br(EQ, ok); { 4440 stop("broken inverse in Montgomery multiply"); 4441 } bind(ok); 4442 } 4443 #endif 4444 4445 mov(Pm_base, Ra); 4446 4447 mov(t0, zr); 4448 mov(t1, zr); 4449 mov(t2, zr); 4450 4451 block_comment("for (int i = 0; i < len; i++) {"); 4452 mov(Ri, zr); { 4453 Label loop, end; 4454 cmpw(Ri, Rlen); 4455 br(Assembler::GE, end); 4456 4457 bind(loop); 4458 pre1(Ri); 4459 4460 block_comment(" for (j = i; j; j--) {"); { 4461 movw(Rj, Ri); 4462 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 4463 } block_comment(" } // j"); 4464 4465 post1(); 4466 addw(Ri, Ri, 1); 4467 cmpw(Ri, Rlen); 4468 br(Assembler::LT, loop); 4469 bind(end); 4470 block_comment("} // i"); 4471 } 4472 4473 block_comment("for (int i = len; i < 2*len; i++) {"); 4474 mov(Ri, Rlen); { 4475 Label loop, end; 4476 cmpw(Ri, Rlen, Assembler::LSL, 1); 4477 br(Assembler::GE, end); 4478 4479 bind(loop); 4480 pre2(Ri, Rlen); 4481 4482 block_comment(" for (j = len*2-i-1; j; j--) {"); { 4483 lslw(Rj, Rlen, 1); 4484 subw(Rj, Rj, Ri); 4485 subw(Rj, Rj, 1); 4486 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 4487 } block_comment(" } // j"); 4488 4489 post2(Ri, Rlen); 4490 addw(Ri, Ri, 1); 4491 cmpw(Ri, Rlen, Assembler::LSL, 1); 4492 br(Assembler::LT, loop); 4493 bind(end); 4494 } 4495 block_comment("} // i"); 4496 4497 normalize(Rlen); 4498 4499 mov(Ra, Pm_base); // Save Pm_base in Ra 4500 restore_regs(); // Restore caller's Pm_base 4501 4502 // Copy our result into caller's Pm_base 4503 reverse(Pm_base, Ra, Rlen, t0, t1); 4504 4505 leave(); 4506 bind(nothing); 4507 ret(lr); 4508 4509 return entry; 4510 } 4511 // In C, approximately: 4512 4513 // void 4514 // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[], 4515 // unsigned long Pn_base[], unsigned long Pm_base[], 4516 // unsigned long inv, int len) { 4517 // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 4518 // unsigned long *Pa, *Pb, *Pn, *Pm; 4519 // unsigned long Ra, Rb, Rn, Rm; 4520 4521 // int i; 4522 4523 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 4524 4525 // for (i = 0; i < len; i++) { 4526 // int j; 4527 4528 // Pa = Pa_base; 4529 // Pb = Pb_base + i; 4530 // Pm = Pm_base; 4531 // Pn = Pn_base + i; 4532 4533 // Ra = *Pa; 4534 // Rb = *Pb; 4535 // Rm = *Pm; 4536 // Rn = *Pn; 4537 4538 // int iters = i; 4539 // for (j = 0; iters--; j++) { 4540 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 4541 // MACC(Ra, Rb, t0, t1, t2); 4542 // Ra = *++Pa; 4543 // Rb = *--Pb; 4544 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 4545 // MACC(Rm, Rn, t0, t1, t2); 4546 // Rm = *++Pm; 4547 // Rn = *--Pn; 4548 // } 4549 4550 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 4551 // MACC(Ra, Rb, t0, t1, t2); 4552 // *Pm = Rm = t0 * inv; 4553 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 4554 // MACC(Rm, Rn, t0, t1, t2); 4555 4556 // assert(t0 == 0, "broken Montgomery multiply"); 4557 4558 // t0 = t1; t1 = t2; t2 = 0; 4559 // } 4560 4561 // for (i = len; i < 2*len; i++) { 4562 // int j; 4563 4564 // Pa = Pa_base + i-len; 4565 // Pb = Pb_base + len; 4566 // Pm = Pm_base + i-len; 4567 // Pn = Pn_base + len; 4568 4569 // Ra = *++Pa; 4570 // Rb = *--Pb; 4571 // Rm = *++Pm; 4572 // Rn = *--Pn; 4573 4574 // int iters = len*2-i-1; 4575 // 
for (j = i-len+1; iters--; j++) { 4576 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 4577 // MACC(Ra, Rb, t0, t1, t2); 4578 // Ra = *++Pa; 4579 // Rb = *--Pb; 4580 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 4581 // MACC(Rm, Rn, t0, t1, t2); 4582 // Rm = *++Pm; 4583 // Rn = *--Pn; 4584 // } 4585 4586 // Pm_base[i-len] = t0; 4587 // t0 = t1; t1 = t2; t2 = 0; 4588 // } 4589 4590 // while (t0) 4591 // t0 = sub(Pm_base, Pn_base, t0, len); 4592 // } 4593 4594 /** 4595 * Fast Montgomery squaring. This uses asymptotically 25% fewer 4596 * multiplies than Montgomery multiplication so it should be up to 4597 * 25% faster. However, its loop control is more complex and it 4598 * may actually run slower on some machines. 4599 * 4600 * Arguments: 4601 * 4602 * Inputs: 4603 * c_rarg0 - int array elements a 4604 * c_rarg1 - int array elements n (the modulus) 4605 * c_rarg2 - int length 4606 * c_rarg3 - int inv 4607 * c_rarg4 - int array elements m (the result) 4608 * 4609 */ 4610 address generate_square() { 4611 Label argh; 4612 bind(argh); 4613 stop("MontgomeryMultiply total_allocation must be <= 8192"); 4614 4615 align(CodeEntryAlignment); 4616 address entry = pc(); 4617 4618 enter(); 4619 4620 // Make room. 4621 cmpw(Rlen, 512); 4622 br(Assembler::HI, argh); 4623 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 4624 andr(sp, Ra, -2 * wordSize); 4625 4626 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 4627 4628 { 4629 // Copy input args, reversing as we go. We use Ra as a 4630 // temporary variable. 4631 reverse(Ra, Pa_base, Rlen, t0, t1); 4632 reverse(Ra, Pn_base, Rlen, t0, t1); 4633 } 4634 4635 // Push all call-saved registers and also Pm_base which we'll need 4636 // at the end. 4637 save_regs(); 4638 4639 mov(Pm_base, Ra); 4640 4641 mov(t0, zr); 4642 mov(t1, zr); 4643 mov(t2, zr); 4644 4645 block_comment("for (int i = 0; i < len; i++) {"); 4646 mov(Ri, zr); { 4647 Label loop, end; 4648 bind(loop); 4649 cmp(Ri, Rlen); 4650 br(Assembler::GE, end); 4651 4652 pre1(Ri); 4653 4654 block_comment("for (j = (i+1)/2; j; j--) {"); { 4655 add(Rj, Ri, 1); 4656 lsr(Rj, Rj, 1); 4657 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 4658 } block_comment(" } // j"); 4659 4660 last_squaring(Ri); 4661 4662 block_comment(" for (j = i/2; j; j--) {"); { 4663 lsr(Rj, Ri, 1); 4664 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 4665 } block_comment(" } // j"); 4666 4667 post1_squaring(); 4668 add(Ri, Ri, 1); 4669 cmp(Ri, Rlen); 4670 br(Assembler::LT, loop); 4671 4672 bind(end); 4673 block_comment("} // i"); 4674 } 4675 4676 block_comment("for (int i = len; i < 2*len; i++) {"); 4677 mov(Ri, Rlen); { 4678 Label loop, end; 4679 bind(loop); 4680 cmp(Ri, Rlen, Assembler::LSL, 1); 4681 br(Assembler::GE, end); 4682 4683 pre2(Ri, Rlen); 4684 4685 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 4686 lsl(Rj, Rlen, 1); 4687 sub(Rj, Rj, Ri); 4688 sub(Rj, Rj, 1); 4689 lsr(Rj, Rj, 1); 4690 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 4691 } block_comment(" } // j"); 4692 4693 last_squaring(Ri); 4694 4695 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 4696 lsl(Rj, Rlen, 1); 4697 sub(Rj, Rj, Ri); 4698 lsr(Rj, Rj, 1); 4699 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 4700 } block_comment(" } // j"); 4701 4702 post2(Ri, Rlen); 4703 add(Ri, Ri, 1); 4704 cmp(Ri, Rlen, Assembler::LSL, 1); 4705 4706 br(Assembler::LT, loop); 4707 bind(end); 4708 block_comment("} // i"); 4709 } 4710 4711 normalize(Rlen); 4712 4713 mov(Ra, 
Pm_base); // Save Pm_base in Ra 4714 restore_regs(); // Restore caller's Pm_base 4715 4716 // Copy our result into caller's Pm_base 4717 reverse(Pm_base, Ra, Rlen, t0, t1); 4718 4719 leave(); 4720 ret(lr); 4721 4722 return entry; 4723 } 4724 // In C, approximately: 4725 4726 // void 4727 // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[], 4728 // unsigned long Pm_base[], unsigned long inv, int len) { 4729 // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 4730 // unsigned long *Pa, *Pb, *Pn, *Pm; 4731 // unsigned long Ra, Rb, Rn, Rm; 4732 4733 // int i; 4734 4735 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 4736 4737 // for (i = 0; i < len; i++) { 4738 // int j; 4739 4740 // Pa = Pa_base; 4741 // Pb = Pa_base + i; 4742 // Pm = Pm_base; 4743 // Pn = Pn_base + i; 4744 4745 // Ra = *Pa; 4746 // Rb = *Pb; 4747 // Rm = *Pm; 4748 // Rn = *Pn; 4749 4750 // int iters = (i+1)/2; 4751 // for (j = 0; iters--; j++) { 4752 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 4753 // MACC2(Ra, Rb, t0, t1, t2); 4754 // Ra = *++Pa; 4755 // Rb = *--Pb; 4756 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 4757 // MACC(Rm, Rn, t0, t1, t2); 4758 // Rm = *++Pm; 4759 // Rn = *--Pn; 4760 // } 4761 // if ((i & 1) == 0) { 4762 // assert(Ra == Pa_base[j], "must be"); 4763 // MACC(Ra, Ra, t0, t1, t2); 4764 // } 4765 // iters = i/2; 4766 // assert(iters == i-j, "must be"); 4767 // for (; iters--; j++) { 4768 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 4769 // MACC(Rm, Rn, t0, t1, t2); 4770 // Rm = *++Pm; 4771 // Rn = *--Pn; 4772 // } 4773 4774 // *Pm = Rm = t0 * inv; 4775 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 4776 // MACC(Rm, Rn, t0, t1, t2); 4777 4778 // assert(t0 == 0, "broken Montgomery multiply"); 4779 4780 // t0 = t1; t1 = t2; t2 = 0; 4781 // } 4782 4783 // for (i = len; i < 2*len; i++) { 4784 // int start = i-len+1; 4785 // int end = start + (len - start)/2; 4786 // int j; 4787 4788 // Pa = Pa_base + i-len; 4789 // Pb = Pa_base + len; 4790 // Pm = Pm_base + i-len; 4791 // Pn = Pn_base + len; 4792 4793 // Ra = *++Pa; 4794 // Rb = *--Pb; 4795 // Rm = *++Pm; 4796 // Rn = *--Pn; 4797 4798 // int iters = (2*len-i-1)/2; 4799 // assert(iters == end-start, "must be"); 4800 // for (j = start; iters--; j++) { 4801 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 4802 // MACC2(Ra, Rb, t0, t1, t2); 4803 // Ra = *++Pa; 4804 // Rb = *--Pb; 4805 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 4806 // MACC(Rm, Rn, t0, t1, t2); 4807 // Rm = *++Pm; 4808 // Rn = *--Pn; 4809 // } 4810 // if ((i & 1) == 0) { 4811 // assert(Ra == Pa_base[j], "must be"); 4812 // MACC(Ra, Ra, t0, t1, t2); 4813 // } 4814 // iters = (2*len-i)/2; 4815 // assert(iters == len-j, "must be"); 4816 // for (; iters--; j++) { 4817 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 4818 // MACC(Rm, Rn, t0, t1, t2); 4819 // Rm = *++Pm; 4820 // Rn = *--Pn; 4821 // } 4822 // Pm_base[i-len] = t0; 4823 // t0 = t1; t1 = t2; t2 = 0; 4824 // } 4825 4826 // while (t0) 4827 // t0 = sub(Pm_base, Pn_base, t0, len); 4828 // } 4829 }; 4830 4831 4832 // Initialization 4833 void generate_initial() { 4834 // Generates the initial stubs and initializes the entry points 4835 4836 // entry points that exist on all platforms. Note: This is code 4837 // that could be shared among different platforms - however, the 4838 // benefit seems to be smaller than the disadvantage of having a 4839 // much more complicated
generator structure. See also the comment in 4840 // stubRoutines.hpp. 4841 4842 StubRoutines::_forward_exception_entry = generate_forward_exception(); 4843 4844 StubRoutines::_call_stub_entry = 4845 generate_call_stub(StubRoutines::_call_stub_return_address); 4846 4847 // is referenced by megamorphic calls 4848 StubRoutines::_catch_exception_entry = generate_catch_exception(); 4849 4850 // Build this early so it's available for the interpreter. 4851 StubRoutines::_throw_StackOverflowError_entry = 4852 generate_throw_exception("StackOverflowError throw_exception", 4853 CAST_FROM_FN_PTR(address, 4854 SharedRuntime::throw_StackOverflowError)); 4855 StubRoutines::_throw_delayed_StackOverflowError_entry = 4856 generate_throw_exception("delayed StackOverflowError throw_exception", 4857 CAST_FROM_FN_PTR(address, 4858 SharedRuntime::throw_delayed_StackOverflowError)); 4859 if (UseCRC32Intrinsics) { 4860 // set the table address before generating the stubs that use it 4861 StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table; 4862 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32(); 4863 } 4864 4865 if (UseCRC32CIntrinsics) { 4866 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(); 4867 } 4868 } 4869 4870 void generate_all() { 4871 // support for verify_oop (must happen after universe_init) 4872 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); 4873 StubRoutines::_throw_AbstractMethodError_entry = 4874 generate_throw_exception("AbstractMethodError throw_exception", 4875 CAST_FROM_FN_PTR(address, 4876 SharedRuntime:: 4877 throw_AbstractMethodError)); 4878 4879 StubRoutines::_throw_IncompatibleClassChangeError_entry = 4880 generate_throw_exception("IncompatibleClassChangeError throw_exception", 4881 CAST_FROM_FN_PTR(address, 4882 SharedRuntime:: 4883 throw_IncompatibleClassChangeError)); 4884 4885 StubRoutines::_throw_NullPointerException_at_call_entry = 4886 generate_throw_exception("NullPointerException at call throw_exception", 4887 CAST_FROM_FN_PTR(address, 4888 SharedRuntime:: 4889 throw_NullPointerException_at_call)); 4890 4891 // arraycopy stubs used by compilers 4892 generate_arraycopy_stubs(); 4893 4894 // has negatives stub for large arrays. 4895 StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long); 4896 4897 if (UseMultiplyToLenIntrinsic) { 4898 StubRoutines::_multiplyToLen = generate_multiplyToLen(); 4899 } 4900 4901 if (UseSquareToLenIntrinsic) { 4902 StubRoutines::_squareToLen = generate_squareToLen(); 4903 } 4904 4905 if (UseMulAddIntrinsic) { 4906 StubRoutines::_mulAdd = generate_mulAdd(); 4907 } 4908 4909 if (UseMontgomeryMultiplyIntrinsic) { 4910 StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply"); 4911 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false); 4912 StubRoutines::_montgomeryMultiply = g.generate_multiply(); 4913 } 4914 4915 if (UseMontgomerySquareIntrinsic) { 4916 StubCodeMark mark(this, "StubRoutines", "montgomerySquare"); 4917 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true); 4918 // We use generate_multiply() rather than generate_square() 4919 // because it's faster for the sizes of modulus we care about.
4920 StubRoutines::_montgomerySquare = g.generate_multiply(); 4921 } 4922 4923 #ifndef BUILTIN_SIM 4924 // generate GHASH intrinsics code 4925 if (UseGHASHIntrinsics) { 4926 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); 4927 } 4928 4929 if (UseAESIntrinsics) { 4930 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 4931 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 4932 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 4933 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 4934 } 4935 4936 if (UseSHA1Intrinsics) { 4937 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress"); 4938 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB"); 4939 } 4940 if (UseSHA256Intrinsics) { 4941 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress"); 4942 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB"); 4943 } 4944 4945 // generate Adler32 intrinsics code 4946 if (UseAdler32Intrinsics) { 4947 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); 4948 } 4949 4950 // Safefetch stubs. 4951 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry, 4952 &StubRoutines::_safefetch32_fault_pc, 4953 &StubRoutines::_safefetch32_continuation_pc); 4954 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry, 4955 &StubRoutines::_safefetchN_fault_pc, 4956 &StubRoutines::_safefetchN_continuation_pc); 4957 #endif 4958 StubRoutines::aarch64::set_completed(); 4959 } 4960 4961 public: 4962 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) { 4963 if (all) { 4964 generate_all(); 4965 } else { 4966 generate_initial(); 4967 } 4968 } 4969 }; // end class declaration 4970 4971 void StubGenerator_generate(CodeBuffer* code, bool all) { 4972 StubGenerator g(code, all); 4973 }