/*
 * Copyright (c) 2003, 2018, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "utilities/align.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif

#ifdef BUILTIN_SIM
#include "../../../../../../simulator/simulator.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->
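// n.b. after this point `__ foo(...)` expands to `_masm->foo(...)`: every
// such call emits AArch64 instructions into the stub being generated.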
#define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(int& counter) {
    __ lea(rscratch2, ExternalAddress((address)&counter));
    __ ldrw(rscratch1, Address(rscratch2));
    __ addw(rscratch1, rscratch1, 1);
    __ strw(rscratch1, Address(rscratch2));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    c_rarg6:   parameter size (in words)              int
  //    c_rarg7:   thread                                 Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the c arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -27 [ argument word 1      ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper   (r0)  ]
  //  -7 [ result         (r1)  ]
  //  -6 [ result type    (r2)  ]
  //  -5 [ method         (r3)  ]
  //  -4 [ entry point    (r4)  ]
  //  -3 [ parameters     (r5)  ]
  //  -2 [ parameter size (r6)  ]
  //  -1 [ thread         (r7)  ]
  //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  //   1 [ saved lr       (r30) ]

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off = -26,

    d15_off            = -26,
    d13_off            = -24,
    d11_off            = -22,
    d9_off             = -20,

    r28_off            = -18,
    r26_off            = -16,
    r24_off            = -14,
    r22_off            = -12,
    r20_off            = -10,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0,
    retaddr_off        =   1,
  };
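  // As a worked example of the offsets above, with wordSize == 8 the
  // sp_after_call address below is rfp - 208 (-26 * 8) and the saved lr
  // sits at rfp + 8; the 27 words from offset -26 up to and including the
  // saved fp at offset 0 are what frame::entry_frame_after_call_words in
  // the following assert counts.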
  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    const Address sp_after_call(rfp, sp_after_call_off * wordSize);

    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);

    // stub code

    // we need a C prolog to bootstrap the x86 caller into the sim
    __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);

    address aarch64_entry = __ pc();

#ifdef BUILTIN_SIM
    // Save sender's SP for stack traces.
    __ mov(rscratch1, sp);
    __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
#endif
    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5,  entry_point);
    __ stp(c_rarg2, c_rarg3,  result_type);
    __ stp(c_rarg0, c_rarg1,  call_wrapper);

    __ stp(r20, r19,   r20_save);
    __ stp(r22, r21,   r22_save);
    __ stp(r24, r23,   r24_save);
    __ stp(r26, r25,   r26_save);
    __ stp(r28, r27,   r28_save);

    __ stpd(v9,  v8,   d9_save);
    __ stpd(v11, v10,  d11_save);
    __ stpd(v13, v12,  d13_save);
    __ stpd(v15, v14,  d15_save);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (unsigned)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);
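    // n.b. -2 * wordSize is -16, so the andr above rounds sp down to the
    // 16-byte alignment the AArch64 ABI requires.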

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);
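    // n.b. the loop above pushes the parameters first-to-last, leaving the
    // first Java argument at the highest address and the last at the new
    // top of stack, which matches the interpreter's argument layout
    // (first argument furthest from sp).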

    // call Java entry -- passing methodOop, and current sp
    //      rmethod: Method*
    //      r13: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r13, sp);
    __ blr(c_rarg4);

    // tell the simulator we have returned to the stub

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    if (NotifySimulator) {
      __ notify(Assembler::method_reentry);
    }
    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    __ ldr(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ldr(j_rarg1, result_type);
    __ cmp(j_rarg1, T_OBJECT);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(j_rarg1, T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    // restore callee-save registers
    __ ldpd(v15, v14,   d15_save);
    __ ldpd(v13, v12,   d13_save);
    __ ldpd(v11, v10,   d11_save);
    __ ldpd(v9,  v8,    d9_save);

    __ ldp(r28, r27,   r28_save);
    __ ldp(r26, r25,   r26_save);
    __ ldp(r24, r23,   r24_save);
    __ ldp(r22, r21,   r22_save);
    __ ldp(r20, r19,   r20_save);

    __ ldp(c_rarg0, c_rarg1,  call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3,  method);
    __ ldp(c_rarg4, c_rarg5,  entry_point);
    __ ldp(c_rarg6, c_rarg7,  parameter_size);

#ifndef PRODUCT
    // tell the simulator we are about to end Java execution
    if (NotifySimulator) {
      __ notify(Assembler::method_exit);
    }
#endif
    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT

    __ BIND(is_long);
    __ str(r0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // rsp.
  //
  // r0: exception oop

  // NOTE: this is used as a target from the signal handler so it
  // needs an x86 prolog which returns into the current simulator
  // executing the generated catch_exception code. so the prolog
  // needs to install rax in a sim register and adjust the sim's
  // restart pc to enter the generated code at the start position
  // then return from native to simulated execution.

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread        (rfp, thread_off         * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then calls into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);
    __ mov(r19, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {

    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is NULL it is OK

    // Check if the oop is in the right area of memory
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
    __ andr(c_rarg2, r0, c_rarg3);
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());

    // Compare c_rarg2 and c_rarg3.  We don't use a compare
    // instruction here because the flags register is live.
    __ eor(c_rarg2, c_rarg2, c_rarg3);
    __ cbnz(c_rarg2, error);

    // make sure klass is 'reasonable', i.e. non-zero.
    __ load_klass(r0, r0); // get klass
    __ cbz(r0, error);     // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blrt(rscratch1, 3, 0, 1);

    return start;
  }

  void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
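  // n.b. as generated here array_overlap_test always takes the no-overlap
  // branch; the real overlap check for conjoint copies is emitted inline
  // in generate_conjoint_copy() below, and the one use in this section is
  // the ASSERT overlap check in generate_checkcast_copy().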

  // The inner part of zero_words().  This is the bulk operation,
  // zeroing words in blocks, possibly using DC ZVA to do it.  The
  // caller is responsible for zeroing the last few words.
  //
  // Inputs:
  // r10: the HeapWord-aligned base address of an array to zero.
  // r11: the count in HeapWords, r11 > 0.
  //
  // Returns r10 and r11, adjusted for the caller to clear.
  // r10: the base address of the tail of words left to clear.
  // r11: the number of words in the tail.
  //      r11 < MacroAssembler::zero_words_block_size.

  address generate_zero_blocks() {
    Label store_pair, loop_store_pair, done;
    Label base_aligned;

    Register base = r10, cnt = r11;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "zero_blocks");
    address start = __ pc();

    if (UseBlockZeroing) {
      int zva_length = VM_Version::zva_length();

      // Ensure ZVA length can be divided by 16. This is required by
      // the subsequent operations.
      assert (zva_length % 16 == 0, "Unexpected ZVA Length");

      __ tbz(base, 3, base_aligned);
      __ str(zr, Address(__ post(base, 8)));
      __ sub(cnt, cnt, 1);
      __ bind(base_aligned);

      // Ensure count >= zva_length * 2 so that it still deserves a zva after
      // alignment.
      Label small;
      int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
      __ subs(rscratch1, cnt, low_limit >> 3);
      __ br(Assembler::LT, small);
      __ zero_dcache_blocks(base, cnt);
      __ bind(small);
    }

    {
      // Number of stp instructions we'll unroll
      const int unroll =
        MacroAssembler::zero_words_block_size / 2;
      // Clear the remaining blocks.
      Label loop;
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::LT, done);
      __ bind(loop);
      for (int i = 0; i < unroll; i++)
        __ stp(zr, zr, __ post(base, 16));
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::GE, loop);
      __ bind(done);
      __ add(cnt, cnt, unroll * 2);
    }

    __ ret(lr);

    return start;
  }

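  // n.b. copy_forwards/copy_backwards below double as the sign of the
  // address step: generate_copy_longs computes unit = wordSize * direction,
  // so the same pre-indexed ldp/stp sequences walk memory in either
  // direction depending on which constant is passed in.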
  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(Label &start, Register s, Register d, Register count,
                           copy_direction direction) {
    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4 : 2) * wordSize;

    int offset;
    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r10, t6 = r11, t7 = r12;
    const Register stride = r13;

    assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1);

    Label again, drain;
    const char *stub_name;
    if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
    else
      stub_name = "backward_copy_longs";
    StubCodeMark mark(this, "StubRoutines", stub_name);
    __ align(CodeEntryAlignment);
    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, 8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
        __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
      } else {
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        __ stp(t0, t1, Address(d, 2 * unit));
        __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.

      if (direction == copy_forwards) {
        __ sub(s, s, 16);
        __ sub(d, d, 8);
      }

      // Fill 8 registers
      //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 bit block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
      //
      // t0 at offset 0,  t1 at offset 8
      // t2 at offset 16, t3 at offset 24
      // t4 at offset 32, t5 at offset 40
      // t6 at offset 48, t7 at offset 56

      // for backwards copy s was not offset so the register contents
      // are at these offsets into the preceding 64 byte block
      // relative to that original input and so on for each successive
      // preceding 64 byte block when s is updated. this explains the
      // slightly counter-intuitive looking pattern of register usage
      // in the stp instructions for backwards copy.
      //
      // t0 at offset -16, t1 at offset -8
      // t2 at offset -32, t3 at offset -24
      // t4 at offset -48, t5 at offset -40
      // t6 at offset -64, t7 at offset -56

      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));

      __ subs(count, count, 16);
      __ br(Assembler::LO, drain);

      int prefetch = PrefetchCopyIntervalInBytes;
      bool use_stride = false;
      if (direction == copy_backwards) {
        use_stride = prefetch > 256;
        prefetch = -prefetch;
        if (use_stride) __ mov(stride, prefetch);
      }

      __ bind(again);
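      // Prefetch ahead of the copy; the stride register form is used for
      // large backward strides, presumably because a negative offset of
      // that size no longer fits the immediate form of prfm.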
      if (PrefetchCopyIntervalInBytes > 0)
        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
        // allowing for the offset of -8 the store instructions place
        // registers into the target 64 bit block at the following
        // offsets
        //
        // t0 at offset 0
        // t1 at offset 8,  t2 at offset 16
        // t3 at offset 24, t4 at offset 32
        // t5 at offset 40, t6 at offset 48
        // t7 at offset 56

        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
        // d was not offset when we started so the registers are
        // written into the 64 bit block preceding d with the following
        // offsets
        //
        // t1 at offset -8
        // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
        // t7 at offset -56, t4 at offset -48
        // t6 at offset -64
        //
        // note that this matches the offsets previously noted for the
        // loads

        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
      } else {
        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
      }
      // now we need to copy any remaining part block which may
      // include a 4 word block subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ stp(t1, t2, Address(d, 2 * unit));
          __ str(t3, Address(__ pre(d, 4 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ stp(t3, t0, Address(d, 3 * unit));
          __ str(t2, Address(__ pre(d, 4 * unit)));
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ str(t1, Address(__ pre(d, 2 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ str(t0, Address(__ pre(d, 2 * unit)));
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;
    int unit = wordSize * direction;

    Label Lpair, Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.
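    // As a concrete example: for a byte copy (granularity == 1) bit 3 of
    // count selects an 8-byte move, bit 2 a 4-byte move, bit 1 a 2-byte
    // move and bit 0 the final byte, so the four tests below cover any
    // tail of 0..15 bytes.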

    __ tbz(count, 3 - exact_log2(granularity), Lword);
    __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
    __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }

  Label copy_f, copy_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //

  void copy_memory(bool is_aligned, Register s, Register d,
                   Register count, Register tmp, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    // <= 96 bytes do inline. Direction doesn't matter because we always
    // load all the data before writing anything
    Label copy4, copy8, copy16, copy32, copy80, copy128, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
    const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
    const Register send = r17, dend = r18;

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
    __ cmp(count, (UseSIMDForMemoryOps ? 96 : 80) / granularity);
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, 16/granularity);
    __ br(Assembler::LS, copy16);

    __ cmp(count, 64/granularity);
    __ br(Assembler::HI, copy80);

    __ cmp(count, 32/granularity);
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 0));
      __ ldpq(v2, v3, Address(send, -32));
      __ stpq(v0, v1, Address(d, 0));
      __ stpq(v2, v3, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(send, -32));
      __ ldp(t6, t7, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(dend, -32));
      __ stp(t6, t7, Address(dend, -16));
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    __ ldp(t0, t1, Address(s, 0));
    __ ldp(t2, t3, Address(send, -16));
    __ stp(t0, t1, Address(d, 0));
    __ stp(t2, t3, Address(dend, -16));
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
      __ ldpq(v4, v5, Address(send, -32));
      __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
      __ stpq(v4, v5, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(s, 32));
      __ ldp(t6, t7, Address(s, 48));
      __ ldp(t8, t9, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(d, 32));
      __ stp(t6, t7, Address(d, 48));
      __ stp(t8, t9, Address(dend, -16));
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, 8/granularity);
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    __ ldr(t0, Address(s, 0));
    __ ldr(t1, Address(send, -8));
    __ str(t0, Address(d, 0));
    __ str(t1, Address(dend, -8));
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean in the 1 byte case we load/store the same
          // byte 3 times.
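          // Worked example: for count == 3, count >> 1 == 1, so the three
          // loads below fetch s[0], s[2] and s[1] and the stores write
          // d[0], d[2] and d[1]; for count == 1 all three transfers hit
          // the single byte at offset 0.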
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }

    __ bind(copy_big);
    if (is_backwards) {
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now we've got the small case out of the way we can align the
    // source address on a 2-word boundary.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
      __ tbz(s, exact_log2(wordSize), aligned);
      __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
      __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
      __ sub(count, count, wordSize/granularity);
    } else {
      if (is_backwards) {
        __ andr(rscratch2, s, 2 * wordSize - 1);
      } else {
        __ neg(rscratch2, s);
        __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
      }
      // rscratch2 is the byte adjustment needed to align s.
      __ cbz(rscratch2, aligned);
      int shift = exact_log2(granularity);
      if (shift)  __ lsr(rscratch2, rscratch2, shift);
      __ sub(count, count, rscratch2);

#if 0
      // ?? This code is only correct for a disjoint copy.  It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, rscratch2);
        __ sub(d, d, rscratch2);
      } else {
        __ add(s, s, rscratch2);
        __ add(d, d, rscratch2);
      }
#else
      copy_memory_small(s, d, rscratch2, rscratch1, step);
#endif
    }

    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes.  Adjust the
    // count and do a bulk copy of words.
    __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
    if (direction == copy_forwards)
      __ bl(copy_f);
    else
      __ bl(copy_b);

    // And the tail.
    copy_memory_small(s, d, count, tmp, step);

    if (granularity >= 8) __ bind(copy8);
    if (granularity >= 4) __ bind(copy4);
    __ bind(finish);
  }


  void clobber_registers() {
#ifdef ASSERT
    __ mov(rscratch1, (uint64_t)0xdeadbeef);
    __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
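    // rscratch1 now holds the 64-bit poison pattern 0xdeadbeefdeadbeef;
    // each of r3..r18 (other than rscratch1 itself) is filled with it so
    // that stale values are easy to spot in a debugger.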
    for (Register r = r3; r <= r18; r++)
      if (r != rscratch1) __ mov(r, rscratch1);
#endif
  }

  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers rscratch1 and rscratch2.
  void verify_oop_array (size_t size, Register a, Register count, Register temp) {
    Label loop, end;
    __ mov(rscratch1, a);
    __ mov(rscratch2, zr);
    __ bind(loop);
    __ cmp(rscratch2, count);
    __ br(Assembler::HS, end);
    if (size == (size_t)wordSize) {
      __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ verify_oop(temp);
    } else {
      __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ decode_heap_oop(temp); // calls verify_oop
    }
    __ add(rscratch2, rscratch2, size);
    __ b(loop);
    __ bind(end);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
                                 const char *name, bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_reg = RegSet::of(s, d, count);
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    DecoratorSet decorators = ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= AS_DEST_NOT_INITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_reg);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    copy_memory(aligned, s, d, count, rscratch1, size);

    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
    }

    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());

    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
                                 address *entry, const char *name,
                                 bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_regs = RegSet::of(s, d, count);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }
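    // n.b. the unsigned comparison below also handles d < s: the
    // subtraction wraps to a large unsigned value, so the branch to the
    // forward (no-overlap) copy is taken, which is safe whenever the
    // destination starts below the source.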
    // use fwd copy when (d-s) above_equal (count*size)
    __ sub(rscratch1, d, s);
    __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
    __ br(Assembler::HS, nooverlap_target);

    DecoratorSet decorators = 0;
    if (dest_uninitialized) {
      decorators |= AS_DEST_NOT_INITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_regs);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    copy_memory(aligned, s, d, count, rscratch1, -size);
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
    }
    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
  //
  address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                      address* entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_short_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_short_copy().
  //
  address generate_disjoint_short_copy(bool aligned,
                                       address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
                                       address *entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_int_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
                                     address *entry, const char *name,
                                     bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
  }


  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_long_copy(bool aligned, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_long_copy(bool aligned,
                                      address nooverlap_target, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_oop_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_oop_copy(bool aligned,
                                     address nooverlap_target, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
                                  name, dest_uninitialized);
  }


  // Helper for generating a dynamic type check.
  // Smashes rscratch1.
  void generate_type_check(Register sub_klass,
                           Register super_check_offset,
                           Register super_klass,
                           Label& L_success) {
    assert_different_registers(sub_klass, super_check_offset, super_klass);

    BLOCK_COMMENT("type_check:");

    Label L_miss;

    __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL,
                                     super_check_offset);
    __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);

    // Fall through on failure!
    __ BIND(L_miss);
  }

  //
  //  Generate checkcasting array copy stub
  //
  //  Input:
  //    c_rarg0   - source array address
  //    c_rarg1   - destination array address
  //    c_rarg2   - element count, treated as ssize_t, can be zero
  //    c_rarg3   - size_t ckoff (super_check_offset)
  //    c_rarg4   - oop ckval (super_klass)
  //
  //  Output:
  //    r0 ==  0  -  success
  //    r0 == -1^K - failure, where K is partial transfer count
  //
  address generate_checkcast_copy(const char *name, address *entry,
                                  bool dest_uninitialized = false) {

    Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;

    // Input registers (after setup_arg_regs)
    const Register from        = c_rarg0;   // source array address
    const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // elements count
    const Register ckoff       = c_rarg3;   // super_check_offset
    const Register ckval       = c_rarg4;   // super_klass

    RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
    RegSet wb_post_saved_regs = RegSet::of(count);

    // Registers used as temps (r18, r19, r20 are save-on-entry)
    const Register count_save  = r21;       // orig elements count
    const Register start_to    = r20;       // destination array start address
    const Register copied_oop  = r18;       // actual oop copied
    const Register r19_klass   = r19;       // oop._klass

    //---------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the two arrays are subtypes of Object[] but the
    // destination array type is not equal to or a supertype
    // of the source type.  Each element must be separately
    // checked.
1747 1748 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1749 copied_oop, r19_klass, count_save); 1750 1751 __ align(CodeEntryAlignment); 1752 StubCodeMark mark(this, "StubRoutines", name); 1753 address start = __ pc(); 1754 1755 __ enter(); // required for proper stackwalking of RuntimeStub frame 1756 1757 #ifdef ASSERT 1758 // caller guarantees that the arrays really are different 1759 // otherwise, we would have to make conjoint checks 1760 { Label L; 1761 array_overlap_test(L, TIMES_OOP); 1762 __ stop("checkcast_copy within a single array"); 1763 __ bind(L); 1764 } 1765 #endif //ASSERT 1766 1767 // Caller of this entry point must set up the argument registers. 1768 if (entry != NULL) { 1769 *entry = __ pc(); 1770 BLOCK_COMMENT("Entry:"); 1771 } 1772 1773 // Empty array: Nothing to do. 1774 __ cbz(count, L_done); 1775 1776 __ push(RegSet::of(r18, r19, r20, r21), sp); 1777 1778 #ifdef ASSERT 1779 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1780 // The ckoff and ckval must be mutually consistent, 1781 // even though caller generates both. 1782 { Label L; 1783 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1784 __ ldrw(start_to, Address(ckval, sco_offset)); 1785 __ cmpw(ckoff, start_to); 1786 __ br(Assembler::EQ, L); 1787 __ stop("super_check_offset inconsistent"); 1788 __ bind(L); 1789 } 1790 #endif //ASSERT 1791 1792 DecoratorSet decorators = ARRAYCOPY_CHECKCAST; 1793 bool is_oop = true; 1794 if (dest_uninitialized) { 1795 decorators |= AS_DEST_NOT_INITIALIZED; 1796 } 1797 1798 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1799 bs->arraycopy_prologue(_masm, decorators, is_oop, to, count, wb_pre_saved_regs); 1800 1801 // save the original count 1802 __ mov(count_save, count); 1803 1804 // Copy from low to high addresses 1805 __ mov(start_to, to); // Save destination array start address 1806 __ b(L_load_element); 1807 1808 // ======== begin loop ======== 1809 // (Loop is rotated; its entry is L_load_element.) 1810 // Loop control: 1811 // for (; count != 0; count--) { 1812 // copied_oop = load_heap_oop(from++); 1813 // ... generate_type_check ...; 1814 // store_heap_oop(to++, copied_oop); 1815 // } 1816 __ align(OptoLoopAlignment); 1817 1818 __ BIND(L_store_element); 1819 __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop); // store the oop 1820 __ sub(count, count, 1); 1821 __ cbz(count, L_do_card_marks); 1822 1823 // ======== loop entry is here ======== 1824 __ BIND(L_load_element); 1825 __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8)); // load the oop 1826 __ cbz(copied_oop, L_store_element); 1827 1828 __ load_klass(r19_klass, copied_oop);// query the object klass 1829 generate_type_check(r19_klass, ckoff, ckval, L_store_element); 1830 // ======== end loop ======== 1831 1832 // It was a real error; we must depend on the caller to finish the job. 1833 // Register count = remaining oops, count_orig = total oops. 1834 // Emit GC store barriers for the oops we have copied and report 1835 // their number to the caller. 
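// In C terms, the failure encoding computed below is (illustrative):
//
//   size_t K = count_save - count;  // oops copied before the type-check miss
//   return ~K;                      // i.e. -1^K; the caller recovers K as ~r0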
1836 1837 __ subs(count, count_save, count); // K = partially copied oop count 1838 __ eon(count, count, zr); // report (-1^K) to caller 1839 __ br(Assembler::EQ, L_done_pop); 1840 1841 __ BIND(L_do_card_marks); 1842 __ add(to, to, -heapOopSize); // make an inclusive end pointer 1843 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, to, rscratch1, wb_post_saved_regs); 1844 1845 __ bind(L_done_pop); 1846 __ pop(RegSet::of(r18, r19, r20, r21), sp); 1847 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 1848 1849 __ bind(L_done); 1850 __ mov(r0, count); 1851 __ leave(); 1852 __ ret(lr); 1853 1854 return start; 1855 } 1856 1857 // Perform range checks on the proposed arraycopy. 1858 // Kills temp, but nothing else. 1859 // Also, clean the sign bits of src_pos and dst_pos. 1860 void arraycopy_range_checks(Register src, // source array oop (c_rarg0) 1861 Register src_pos, // source position (c_rarg1) 1862 Register dst, // destination array oop (c_rarg2) 1863 Register dst_pos, // destination position (c_rarg3) 1864 Register length, 1865 Register temp, 1866 Label& L_failed) { 1867 BLOCK_COMMENT("arraycopy_range_checks:"); 1868 1869 assert_different_registers(rscratch1, temp); 1870 1871 // if (src_pos + length > arrayOop(src)->length()) FAIL; 1872 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes())); 1873 __ addw(temp, length, src_pos); 1874 __ cmpw(temp, rscratch1); 1875 __ br(Assembler::HI, L_failed); 1876 1877 // if (dst_pos + length > arrayOop(dst)->length()) FAIL; 1878 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes())); 1879 __ addw(temp, length, dst_pos); 1880 __ cmpw(temp, rscratch1); 1881 __ br(Assembler::HI, L_failed); 1882 1883 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'. 1884 __ movw(src_pos, src_pos); 1885 __ movw(dst_pos, dst_pos); 1886 1887 BLOCK_COMMENT("arraycopy_range_checks done"); 1888 } 1889 1890 // These stubs get called from some dumb test routine. 1891 // I'll write them properly when they're called from 1892 // something that's actually doing something. 1893 static void fake_arraycopy_stub(address src, address dst, int count) { 1894 assert(count == 0, "huh?"); 1895 } 1896 1897 1898 // 1899 // Generate 'unsafe' array copy stub 1900 // Though just as safe as the other stubs, it takes an unscaled 1901 // size_t argument instead of an element count. 1902 // 1903 // Input: 1904 // c_rarg0 - source array address 1905 // c_rarg1 - destination array address 1906 // c_rarg2 - byte count, treated as ssize_t, can be zero 1907 // 1908 // Examines the alignment of the operands and dispatches 1909 // to a long, int, short, or byte copy loop. 
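// In C terms the dispatch is roughly (illustrative):
//
//   uintptr_t bits = (uintptr_t)s | (uintptr_t)d | (uintptr_t)count;
//   if ((bits & 7) == 0) goto long_copy;   // all 8-byte aligned
//   if ((bits & 3) == 0) goto int_copy;    // all 4-byte aligned
//   if ((bits & 1) == 0) goto short_copy;  // all 2-byte aligned
//   goto byte_copy;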
1910 // 1911 address generate_unsafe_copy(const char *name, 1912 address byte_copy_entry, 1913 address short_copy_entry, 1914 address int_copy_entry, 1915 address long_copy_entry) { 1916 Label L_long_aligned, L_int_aligned, L_short_aligned; 1917 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1918 1919 __ align(CodeEntryAlignment); 1920 StubCodeMark mark(this, "StubRoutines", name); 1921 address start = __ pc(); 1922 __ enter(); // required for proper stackwalking of RuntimeStub frame 1923 1924 // bump this on entry, not on exit: 1925 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 1926 1927 __ orr(rscratch1, s, d); 1928 __ orr(rscratch1, rscratch1, count); 1929 1930 __ andr(rscratch1, rscratch1, BytesPerLong-1); 1931 __ cbz(rscratch1, L_long_aligned); 1932 __ andr(rscratch1, rscratch1, BytesPerInt-1); 1933 __ cbz(rscratch1, L_int_aligned); 1934 __ tbz(rscratch1, 0, L_short_aligned); 1935 __ b(RuntimeAddress(byte_copy_entry)); 1936 1937 __ BIND(L_short_aligned); 1938 __ lsr(count, count, LogBytesPerShort); // size => short_count 1939 __ b(RuntimeAddress(short_copy_entry)); 1940 __ BIND(L_int_aligned); 1941 __ lsr(count, count, LogBytesPerInt); // size => int_count 1942 __ b(RuntimeAddress(int_copy_entry)); 1943 __ BIND(L_long_aligned); 1944 __ lsr(count, count, LogBytesPerLong); // size => long_count 1945 __ b(RuntimeAddress(long_copy_entry)); 1946 1947 return start; 1948 } 1949 1950 // 1951 // Generate generic array copy stubs 1952 // 1953 // Input: 1954 // c_rarg0 - src oop 1955 // c_rarg1 - src_pos (32-bits) 1956 // c_rarg2 - dst oop 1957 // c_rarg3 - dst_pos (32-bits) 1958 // c_rarg4 - element count (32-bits) 1959 // 1960 // Output: 1961 // r0 == 0 - success 1962 // r0 == -1^K - failure, where K is partial transfer count 1963 // 1964 address generate_generic_copy(const char *name, 1965 address byte_copy_entry, address short_copy_entry, 1966 address int_copy_entry, address oop_copy_entry, 1967 address long_copy_entry, address checkcast_copy_entry) { 1968 1969 Label L_failed, L_failed_0, L_objArray; 1970 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 1971 1972 // Input registers 1973 const Register src = c_rarg0; // source array oop 1974 const Register src_pos = c_rarg1; // source position 1975 const Register dst = c_rarg2; // destination array oop 1976 const Register dst_pos = c_rarg3; // destination position 1977 const Register length = c_rarg4; 1978 1979 StubCodeMark mark(this, "StubRoutines", name); 1980 1981 __ align(CodeEntryAlignment); 1982 address start = __ pc(); 1983 1984 __ enter(); // required for proper stackwalking of RuntimeStub frame 1985 1986 // bump this on entry, not on exit: 1987 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 1988 1989 //----------------------------------------------------------------------- 1990 // Assembler stub will be used for this call to arraycopy 1991 // if the following conditions are met: 1992 // 1993 // (1) src and dst must not be null. 1994 // (2) src_pos must not be negative. 1995 // (3) dst_pos must not be negative. 1996 // (4) length must not be negative. 1997 // (5) src klass and dst klass should be the same and not NULL. 1998 // (6) src and dst should be arrays. 1999 // (7) src_pos + length must not exceed length of src. 2000 // (8) dst_pos + length must not exceed length of dst. 2001 // 2002 2003 // if (src == NULL) return -1; 2004 __ cbz(src, L_failed); 2005 2006 // if (src_pos < 0) return -1; 2007 __ tbnz(src_pos, 31, L_failed); // i.e. 
sign bit set 2008 2009 // if (dst == NULL) return -1; 2010 __ cbz(dst, L_failed); 2011 2012 // if (dst_pos < 0) return -1; 2013 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2014 2015 // registers used as temp 2016 const Register scratch_length = r16; // elements count to copy 2017 const Register scratch_src_klass = r17; // array klass 2018 const Register lh = r18; // layout helper 2019 2020 // if (length < 0) return -1; 2021 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2022 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2023 2024 __ load_klass(scratch_src_klass, src); 2025 #ifdef ASSERT 2026 // assert(src->klass() != NULL); 2027 { 2028 BLOCK_COMMENT("assert klasses not null {"); 2029 Label L1, L2; 2030 __ cbnz(scratch_src_klass, L2); // it is broken if klass is NULL 2031 __ bind(L1); 2032 __ stop("broken null klass"); 2033 __ bind(L2); 2034 __ load_klass(rscratch1, dst); 2035 __ cbz(rscratch1, L1); // this would be broken also 2036 BLOCK_COMMENT("} assert klasses not null done"); 2037 } 2038 #endif 2039 2040 // Load layout helper (32-bits) 2041 // 2042 // |array_tag| | header_size | element_type | |log2_element_size| 2043 // 32 30 24 16 8 2 0 2044 // 2045 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2046 // 2047 2048 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2049 2050 // Handle objArrays completely differently... 2051 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2052 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2053 __ movw(rscratch1, objArray_lh); 2054 __ eorw(rscratch2, lh, rscratch1); 2055 __ cbzw(rscratch2, L_objArray); 2056 2057 // if (src->klass() != dst->klass()) return -1; 2058 __ load_klass(rscratch2, dst); 2059 __ eor(rscratch2, rscratch2, scratch_src_klass); 2060 __ cbnz(rscratch2, L_failed); 2061 2062 // if (!src->is_Array()) return -1; 2063 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2064 2065 // At this point, it is known to be a typeArray (array_tag 0x3). 2066 #ifdef ASSERT 2067 { 2068 BLOCK_COMMENT("assert primitive array {"); 2069 Label L; 2070 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2071 __ cmpw(lh, rscratch2); 2072 __ br(Assembler::GE, L); 2073 __ stop("must be a primitive array"); 2074 __ bind(L); 2075 BLOCK_COMMENT("} assert primitive array done"); 2076 } 2077 #endif 2078 2079 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2080 rscratch2, L_failed); 2081 2082 // TypeArrayKlass 2083 // 2084 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2085 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2086 // 2087 2088 const Register rscratch1_offset = rscratch1; // array offset 2089 const Register r18_elsize = lh; // element size 2090 2091 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2092 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2093 __ add(src, src, rscratch1_offset); // src array offset 2094 __ add(dst, dst, rscratch1_offset); // dst array offset 2095 BLOCK_COMMENT("choose copy loop based on element size"); 2096 2097 // next registers should be set before the jump to corresponding stub 2098 const Register from = c_rarg0; // source array address 2099 const Register to = c_rarg1; // destination array address 2100 const Register count = c_rarg2; // elements count 2101 2102 // 'from', 'to', 'count' registers should be set in such order 2103 // since they are the same as 'src', 'src_pos', 'dst'. 
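// For reference, the element-size dispatch that follows is equivalent to
// (illustrative):
//
//   switch (lh & Klass::_lh_log2_element_size_mask) {
//     case 0: goto copy_bytes;   // 1-byte elements
//     case 1: goto copy_shorts;  // 2-byte elements
//     case 2: goto copy_ints;    // 4-byte elements
//     case 3: goto copy_longs;   // 8-byte elements
//   }
//
// realized below by testing bit 1 of the log2 element size first, then bit 0.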
2104 2105 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2106 2107 // The possible values of elsize are 0-3, i.e. exact_log2(element 2108 // size in bytes). We do a simple bitwise binary search. 2109 __ BIND(L_copy_bytes); 2110 __ tbnz(r18_elsize, 1, L_copy_ints); 2111 __ tbnz(r18_elsize, 0, L_copy_shorts); 2112 __ lea(from, Address(src, src_pos));// src_addr 2113 __ lea(to, Address(dst, dst_pos));// dst_addr 2114 __ movw(count, scratch_length); // length 2115 __ b(RuntimeAddress(byte_copy_entry)); 2116 2117 __ BIND(L_copy_shorts); 2118 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2119 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2120 __ movw(count, scratch_length); // length 2121 __ b(RuntimeAddress(short_copy_entry)); 2122 2123 __ BIND(L_copy_ints); 2124 __ tbnz(r18_elsize, 0, L_copy_longs); 2125 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2126 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2127 __ movw(count, scratch_length); // length 2128 __ b(RuntimeAddress(int_copy_entry)); 2129 2130 __ BIND(L_copy_longs); 2131 #ifdef ASSERT 2132 { 2133 BLOCK_COMMENT("assert long copy {"); 2134 Label L; 2135 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize 2136 __ cmpw(r18_elsize, LogBytesPerLong); 2137 __ br(Assembler::EQ, L); 2138 __ stop("must be long copy, but elsize is wrong"); 2139 __ bind(L); 2140 BLOCK_COMMENT("} assert long copy done"); 2141 } 2142 #endif 2143 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2144 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2145 __ movw(count, scratch_length); // length 2146 __ b(RuntimeAddress(long_copy_entry)); 2147 2148 // ObjArrayKlass 2149 __ BIND(L_objArray); 2150 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2151 2152 Label L_plain_copy, L_checkcast_copy; 2153 // test array classes for subtyping 2154 __ load_klass(r18, dst); 2155 __ cmp(scratch_src_klass, r18); // usual case is exact equality 2156 __ br(Assembler::NE, L_checkcast_copy); 2157 2158 // Identically typed arrays can be copied without element-wise checks. 2159 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2160 rscratch2, L_failed); 2161 2162 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2163 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2164 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2165 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2166 __ movw(count, scratch_length); // length 2167 __ BIND(L_plain_copy); 2168 __ b(RuntimeAddress(oop_copy_entry)); 2169 2170 __ BIND(L_checkcast_copy); 2171 // live at this point: scratch_src_klass, scratch_length, r18 (dst_klass) 2172 { 2173 // Before looking at dst.length, make sure dst is also an objArray. 2174 __ ldrw(rscratch1, Address(r18, lh_offset)); 2175 __ movw(rscratch2, objArray_lh); 2176 __ eorw(rscratch1, rscratch1, rscratch2); 2177 __ cbnzw(rscratch1, L_failed); 2178 2179 // It is safe to examine both src.length and dst.length. 2180 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2181 r18, L_failed); 2182 2183 const Register rscratch2_dst_klass = rscratch2; 2184 __ load_klass(rscratch2_dst_klass, dst); // reload 2185 2186 // Marshal the base address arguments now, freeing registers. 
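// In C terms the marshalling is (illustrative):
//
//   from = (address)src + ((size_t)src_pos << LogBytesPerHeapOop)
//        + arrayOopDesc::base_offset_in_bytes(T_OBJECT);
//   to   = (address)dst + ((size_t)dst_pos << LogBytesPerHeapOop)
//        + arrayOopDesc::base_offset_in_bytes(T_OBJECT);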
2187 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2188 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2189 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2190 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2191 __ movw(count, length); // length (reloaded) 2192 Register sco_temp = c_rarg3; // this register is free now 2193 assert_different_registers(from, to, count, sco_temp, 2194 rscratch2_dst_klass, scratch_src_klass); 2195 // assert_clean_int(count, sco_temp); 2196 2197 // Generate the type check. 2198 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2199 __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset)); 2200 // assert_clean_int(sco_temp, r18); 2201 generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy); 2202 2203 // Fetch destination element klass from the ObjArrayKlass header. 2204 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2205 __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset)); 2206 __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset)); 2207 2208 // the checkcast_copy loop needs two extra arguments: 2209 assert(c_rarg3 == sco_temp, "#3 already in place"); 2210 // Set up arguments for checkcast_copy_entry. 2211 __ mov(c_rarg4, rscratch2_dst_klass); // dst.klass.element_klass 2212 __ b(RuntimeAddress(checkcast_copy_entry)); 2213 } 2214 2215 __ BIND(L_failed); 2216 __ mov(r0, -1); 2217 __ leave(); // required for proper stackwalking of RuntimeStub frame 2218 __ ret(lr); 2219 2220 return start; 2221 } 2222 2223 // 2224 // Generate stub for array fill. If "aligned" is true, the 2225 // "to" address is assumed to be heapword aligned. 2226 // 2227 // Arguments for generated stub: 2228 // to: c_rarg0 2229 // value: c_rarg1 2230 // count: c_rarg2 treated as signed 2231 // 2232 address generate_fill(BasicType t, bool aligned, const char *name) { 2233 __ align(CodeEntryAlignment); 2234 StubCodeMark mark(this, "StubRoutines", name); 2235 address start = __ pc(); 2236 2237 BLOCK_COMMENT("Entry:"); 2238 2239 const Register to = c_rarg0; // destination array address 2240 const Register value = c_rarg1; // value 2241 const Register count = c_rarg2; // elements count 2242 2243 const Register bz_base = r10; // base for block_zero routine 2244 const Register cnt_words = r11; // temp register 2245 2246 __ enter(); 2247 2248 Label L_fill_elements, L_exit1; 2249 2250 int shift = -1; 2251 switch (t) { 2252 case T_BYTE: 2253 shift = 0; 2254 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2255 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2256 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2257 __ br(Assembler::LO, L_fill_elements); 2258 break; 2259 case T_SHORT: 2260 shift = 1; 2261 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2262 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2263 __ br(Assembler::LO, L_fill_elements); 2264 break; 2265 case T_INT: 2266 shift = 2; 2267 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2268 __ br(Assembler::LO, L_fill_elements); 2269 break; 2270 default: ShouldNotReachHere(); 2271 } 2272 2273 // Align destination address at an 8-byte address boundary. 2274 Label L_skip_align1, L_skip_align2, L_skip_align4; 2275 if (!aligned) { 2276 switch (t) { 2277 case T_BYTE: 2278 // One byte misalignment happens only for byte arrays. 
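// The cases below fall through deliberately; in C terms the alignment
// cascade is roughly (illustrative, with shift = log2 element size):
//
//   if (to & 1) { *(u1*)to = value; to += 1; count -= 1; }
//   if (to & 2) { *(u2*)to = value; to += 2; count -= 2 >> shift; }
//   if (to & 4) { *(u4*)to = value; to += 4; count -= 4 >> shift; }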
2279 __ tbz(to, 0, L_skip_align1); 2280 __ strb(value, Address(__ post(to, 1))); 2281 __ subw(count, count, 1); 2282 __ bind(L_skip_align1); 2283 // Fallthrough 2284 case T_SHORT: 2285 // Two bytes misalignment happens only for byte and short (char) arrays. 2286 __ tbz(to, 1, L_skip_align2); 2287 __ strh(value, Address(__ post(to, 2))); 2288 __ subw(count, count, 2 >> shift); 2289 __ bind(L_skip_align2); 2290 // Fallthrough 2291 case T_INT: 2292 // Align to 8 bytes, we know we are 4 byte aligned to start. 2293 __ tbz(to, 2, L_skip_align4); 2294 __ strw(value, Address(__ post(to, 4))); 2295 __ subw(count, count, 4 >> shift); 2296 __ bind(L_skip_align4); 2297 break; 2298 default: ShouldNotReachHere(); 2299 } 2300 } 2301 2302 // 2303 // Fill large chunks 2304 // 2305 __ lsrw(cnt_words, count, 3 - shift); // number of words 2306 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2307 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2308 if (UseBlockZeroing) { 2309 Label non_block_zeroing, rest; 2310 // If the fill value is zero we can use the fast zero_words(). 2311 __ cbnz(value, non_block_zeroing); 2312 __ mov(bz_base, to); 2313 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2314 __ zero_words(bz_base, cnt_words); 2315 __ b(rest); 2316 __ bind(non_block_zeroing); 2317 __ fill_words(to, cnt_words, value); 2318 __ bind(rest); 2319 } else { 2320 __ fill_words(to, cnt_words, value); 2321 } 2322 2323 // Remaining count is less than 8 bytes. Fill it by a single store. 2324 // Note that the total length is no less than 8 bytes. 2325 if (t == T_BYTE || t == T_SHORT) { 2326 Label L_exit1; 2327 __ cbzw(count, L_exit1); 2328 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2329 __ str(value, Address(to, -8)); // overwrite some elements 2330 __ bind(L_exit1); 2331 __ leave(); 2332 __ ret(lr); 2333 } 2334 2335 // Handle copies less than 8 bytes. 
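// In C terms, e.g. for T_BYTE (illustrative):
//
//   if (count & 1) { *(u1*)to = value; to += 1; }
//   if (count & 2) { *(u2*)to = value; to += 2; }
//   if (count & 4) { *(u4*)to = value; }
//
// T_SHORT and T_INT use the same scheme with fewer steps.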
2336 Label L_fill_2, L_fill_4, L_exit2; 2337 __ bind(L_fill_elements); 2338 switch (t) { 2339 case T_BYTE: 2340 __ tbz(count, 0, L_fill_2); 2341 __ strb(value, Address(__ post(to, 1))); 2342 __ bind(L_fill_2); 2343 __ tbz(count, 1, L_fill_4); 2344 __ strh(value, Address(__ post(to, 2))); 2345 __ bind(L_fill_4); 2346 __ tbz(count, 2, L_exit2); 2347 __ strw(value, Address(to)); 2348 break; 2349 case T_SHORT: 2350 __ tbz(count, 0, L_fill_4); 2351 __ strh(value, Address(__ post(to, 2))); 2352 __ bind(L_fill_4); 2353 __ tbz(count, 1, L_exit2); 2354 __ strw(value, Address(to)); 2355 break; 2356 case T_INT: 2357 __ cbzw(count, L_exit2); 2358 __ strw(value, Address(to)); 2359 break; 2360 default: ShouldNotReachHere(); 2361 } 2362 __ bind(L_exit2); 2363 __ leave(); 2364 __ ret(lr); 2365 return start; 2366 } 2367 2368 void generate_arraycopy_stubs() { 2369 address entry; 2370 address entry_jbyte_arraycopy; 2371 address entry_jshort_arraycopy; 2372 address entry_jint_arraycopy; 2373 address entry_oop_arraycopy; 2374 address entry_jlong_arraycopy; 2375 address entry_checkcast_arraycopy; 2376 2377 generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards); 2378 generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards); 2379 2380 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2381 2382 //*** jbyte 2383 // Always need aligned and unaligned versions 2384 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2385 "jbyte_disjoint_arraycopy"); 2386 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2387 &entry_jbyte_arraycopy, 2388 "jbyte_arraycopy"); 2389 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2390 "arrayof_jbyte_disjoint_arraycopy"); 2391 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL, 2392 "arrayof_jbyte_arraycopy"); 2393 2394 //*** jshort 2395 // Always need aligned and unaligned versions 2396 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2397 "jshort_disjoint_arraycopy"); 2398 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2399 &entry_jshort_arraycopy, 2400 "jshort_arraycopy"); 2401 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, 2402 "arrayof_jshort_disjoint_arraycopy"); 2403 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL, 2404 "arrayof_jshort_arraycopy"); 2405 2406 //*** jint 2407 // Aligned versions 2408 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2409 "arrayof_jint_disjoint_arraycopy"); 2410 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2411 "arrayof_jint_arraycopy"); 2412 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 
2413 // entry_jint_arraycopy always points to the unaligned version 2414 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2415 "jint_disjoint_arraycopy"); 2416 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2417 &entry_jint_arraycopy, 2418 "jint_arraycopy"); 2419 2420 //*** jlong 2421 // It is always aligned 2422 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2423 "arrayof_jlong_disjoint_arraycopy"); 2424 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2425 "arrayof_jlong_arraycopy"); 2426 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2427 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2428 2429 //*** oops 2430 { 2431 // With compressed oops we need unaligned versions; notice that 2432 // we overwrite entry_oop_arraycopy. 2433 bool aligned = !UseCompressedOops; 2434 2435 StubRoutines::_arrayof_oop_disjoint_arraycopy 2436 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2437 /*dest_uninitialized*/false); 2438 StubRoutines::_arrayof_oop_arraycopy 2439 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2440 /*dest_uninitialized*/false); 2441 // Aligned versions without pre-barriers 2442 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2443 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2444 /*dest_uninitialized*/true); 2445 StubRoutines::_arrayof_oop_arraycopy_uninit 2446 = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit", 2447 /*dest_uninitialized*/true); 2448 } 2449 2450 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2451 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2452 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2453 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2454 2455 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2456 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, 2457 /*dest_uninitialized*/true); 2458 2459 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2460 entry_jbyte_arraycopy, 2461 entry_jshort_arraycopy, 2462 entry_jint_arraycopy, 2463 entry_jlong_arraycopy); 2464 2465 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2466 entry_jbyte_arraycopy, 2467 entry_jshort_arraycopy, 2468 entry_jint_arraycopy, 2469 entry_oop_arraycopy, 2470 entry_jlong_arraycopy, 2471 entry_checkcast_arraycopy); 2472 2473 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 2474 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2475 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2476 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2477 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 2478 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2479 } 2480 2481 void generate_math_stubs() { Unimplemented(); } 2482 2483 // Arguments: 2484 // 2485 // Inputs: 2486 // c_rarg0 - source byte array address 2487 // c_rarg1 - destination 
byte array address 2488 // c_rarg2 - K (key) in little endian int array 2489 // 2490 address generate_aescrypt_encryptBlock() { 2491 __ align(CodeEntryAlignment); 2492 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 2493 2494 Label L_doLast; 2495 2496 const Register from = c_rarg0; // source array address 2497 const Register to = c_rarg1; // destination array address 2498 const Register key = c_rarg2; // key array address 2499 const Register keylen = rscratch1; 2500 2501 address start = __ pc(); 2502 __ enter(); 2503 2504 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2505 2506 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2507 2508 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2509 __ rev32(v1, __ T16B, v1); 2510 __ rev32(v2, __ T16B, v2); 2511 __ rev32(v3, __ T16B, v3); 2512 __ rev32(v4, __ T16B, v4); 2513 __ aese(v0, v1); 2514 __ aesmc(v0, v0); 2515 __ aese(v0, v2); 2516 __ aesmc(v0, v0); 2517 __ aese(v0, v3); 2518 __ aesmc(v0, v0); 2519 __ aese(v0, v4); 2520 __ aesmc(v0, v0); 2521 2522 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2523 __ rev32(v1, __ T16B, v1); 2524 __ rev32(v2, __ T16B, v2); 2525 __ rev32(v3, __ T16B, v3); 2526 __ rev32(v4, __ T16B, v4); 2527 __ aese(v0, v1); 2528 __ aesmc(v0, v0); 2529 __ aese(v0, v2); 2530 __ aesmc(v0, v0); 2531 __ aese(v0, v3); 2532 __ aesmc(v0, v0); 2533 __ aese(v0, v4); 2534 __ aesmc(v0, v0); 2535 2536 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2537 __ rev32(v1, __ T16B, v1); 2538 __ rev32(v2, __ T16B, v2); 2539 2540 __ cmpw(keylen, 44); 2541 __ br(Assembler::EQ, L_doLast); 2542 2543 __ aese(v0, v1); 2544 __ aesmc(v0, v0); 2545 __ aese(v0, v2); 2546 __ aesmc(v0, v0); 2547 2548 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2549 __ rev32(v1, __ T16B, v1); 2550 __ rev32(v2, __ T16B, v2); 2551 2552 __ cmpw(keylen, 52); 2553 __ br(Assembler::EQ, L_doLast); 2554 2555 __ aese(v0, v1); 2556 __ aesmc(v0, v0); 2557 __ aese(v0, v2); 2558 __ aesmc(v0, v0); 2559 2560 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2561 __ rev32(v1, __ T16B, v1); 2562 __ rev32(v2, __ T16B, v2); 2563 2564 __ BIND(L_doLast); 2565 2566 __ aese(v0, v1); 2567 __ aesmc(v0, v0); 2568 __ aese(v0, v2); 2569 2570 __ ld1(v1, __ T16B, key); 2571 __ rev32(v1, __ T16B, v1); 2572 __ eor(v0, __ T16B, v0, v1); 2573 2574 __ st1(v0, __ T16B, to); 2575 2576 __ mov(r0, 0); 2577 2578 __ leave(); 2579 __ ret(lr); 2580 2581 return start; 2582 } 2583 2584 // Arguments: 2585 // 2586 // Inputs: 2587 // c_rarg0 - source byte array address 2588 // c_rarg1 - destination byte array address 2589 // c_rarg2 - K (key) in little endian int array 2590 // 2591 address generate_aescrypt_decryptBlock() { 2592 assert(UseAES, "need AES instructions and misaligned SSE support"); 2593 __ align(CodeEntryAlignment); 2594 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 2595 Label L_doLast; 2596 2597 const Register from = c_rarg0; // source array address 2598 const Register to = c_rarg1; // destination array address 2599 const Register key = c_rarg2; // key array address 2600 const Register keylen = rscratch1; 2601 2602 address start = __ pc(); 2603 __ enter(); // required for proper stackwalking of RuntimeStub frame 2604 2605 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2606 2607 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2608 2609 __ ld1(v5, __ T16B, __ post(key, 16)); 2610 __ rev32(v5, __ T16B, v5); 2611 2612 __ ld1(v1, v2, v3, v4, 
__ T16B, __ post(key, 64)); 2613 __ rev32(v1, __ T16B, v1); 2614 __ rev32(v2, __ T16B, v2); 2615 __ rev32(v3, __ T16B, v3); 2616 __ rev32(v4, __ T16B, v4); 2617 __ aesd(v0, v1); 2618 __ aesimc(v0, v0); 2619 __ aesd(v0, v2); 2620 __ aesimc(v0, v0); 2621 __ aesd(v0, v3); 2622 __ aesimc(v0, v0); 2623 __ aesd(v0, v4); 2624 __ aesimc(v0, v0); 2625 2626 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2627 __ rev32(v1, __ T16B, v1); 2628 __ rev32(v2, __ T16B, v2); 2629 __ rev32(v3, __ T16B, v3); 2630 __ rev32(v4, __ T16B, v4); 2631 __ aesd(v0, v1); 2632 __ aesimc(v0, v0); 2633 __ aesd(v0, v2); 2634 __ aesimc(v0, v0); 2635 __ aesd(v0, v3); 2636 __ aesimc(v0, v0); 2637 __ aesd(v0, v4); 2638 __ aesimc(v0, v0); 2639 2640 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2641 __ rev32(v1, __ T16B, v1); 2642 __ rev32(v2, __ T16B, v2); 2643 2644 __ cmpw(keylen, 44); 2645 __ br(Assembler::EQ, L_doLast); 2646 2647 __ aesd(v0, v1); 2648 __ aesimc(v0, v0); 2649 __ aesd(v0, v2); 2650 __ aesimc(v0, v0); 2651 2652 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2653 __ rev32(v1, __ T16B, v1); 2654 __ rev32(v2, __ T16B, v2); 2655 2656 __ cmpw(keylen, 52); 2657 __ br(Assembler::EQ, L_doLast); 2658 2659 __ aesd(v0, v1); 2660 __ aesimc(v0, v0); 2661 __ aesd(v0, v2); 2662 __ aesimc(v0, v0); 2663 2664 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2665 __ rev32(v1, __ T16B, v1); 2666 __ rev32(v2, __ T16B, v2); 2667 2668 __ BIND(L_doLast); 2669 2670 __ aesd(v0, v1); 2671 __ aesimc(v0, v0); 2672 __ aesd(v0, v2); 2673 2674 __ eor(v0, __ T16B, v0, v5); 2675 2676 __ st1(v0, __ T16B, to); 2677 2678 __ mov(r0, 0); 2679 2680 __ leave(); 2681 __ ret(lr); 2682 2683 return start; 2684 } 2685 2686 // Arguments: 2687 // 2688 // Inputs: 2689 // c_rarg0 - source byte array address 2690 // c_rarg1 - destination byte array address 2691 // c_rarg2 - K (key) in little endian int array 2692 // c_rarg3 - r vector byte array address 2693 // c_rarg4 - input length 2694 // 2695 // Output: 2696 // x0 - input length 2697 // 2698 address generate_cipherBlockChaining_encryptAESCrypt() { 2699 assert(UseAES, "need AES instructions and misaligned SSE support"); 2700 __ align(CodeEntryAlignment); 2701 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 2702 2703 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2704 2705 const Register from = c_rarg0; // source array address 2706 const Register to = c_rarg1; // destination array address 2707 const Register key = c_rarg2; // key array address 2708 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2709 // and left with the results of the last encryption block 2710 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2711 const Register keylen = rscratch1; 2712 2713 address start = __ pc(); 2714 2715 __ enter(); 2716 2717 __ movw(rscratch2, len_reg); 2718 2719 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2720 2721 __ ld1(v0, __ T16B, rvec); 2722 2723 __ cmpw(keylen, 52); 2724 __ br(Assembler::CC, L_loadkeys_44); 2725 __ br(Assembler::EQ, L_loadkeys_52); 2726 2727 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2728 __ rev32(v17, __ T16B, v17); 2729 __ rev32(v18, __ T16B, v18); 2730 __ BIND(L_loadkeys_52); 2731 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2732 __ rev32(v19, __ T16B, v19); 2733 __ rev32(v20, __ T16B, v20); 2734 __ BIND(L_loadkeys_44); 2735 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2736 __ rev32(v21, __ 
T16B, v21); 2737 __ rev32(v22, __ T16B, v22); 2738 __ rev32(v23, __ T16B, v23); 2739 __ rev32(v24, __ T16B, v24); 2740 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2741 __ rev32(v25, __ T16B, v25); 2742 __ rev32(v26, __ T16B, v26); 2743 __ rev32(v27, __ T16B, v27); 2744 __ rev32(v28, __ T16B, v28); 2745 __ ld1(v29, v30, v31, __ T16B, key); 2746 __ rev32(v29, __ T16B, v29); 2747 __ rev32(v30, __ T16B, v30); 2748 __ rev32(v31, __ T16B, v31); 2749 2750 __ BIND(L_aes_loop); 2751 __ ld1(v1, __ T16B, __ post(from, 16)); 2752 __ eor(v0, __ T16B, v0, v1); 2753 2754 __ br(Assembler::CC, L_rounds_44); 2755 __ br(Assembler::EQ, L_rounds_52); 2756 2757 __ aese(v0, v17); __ aesmc(v0, v0); 2758 __ aese(v0, v18); __ aesmc(v0, v0); 2759 __ BIND(L_rounds_52); 2760 __ aese(v0, v19); __ aesmc(v0, v0); 2761 __ aese(v0, v20); __ aesmc(v0, v0); 2762 __ BIND(L_rounds_44); 2763 __ aese(v0, v21); __ aesmc(v0, v0); 2764 __ aese(v0, v22); __ aesmc(v0, v0); 2765 __ aese(v0, v23); __ aesmc(v0, v0); 2766 __ aese(v0, v24); __ aesmc(v0, v0); 2767 __ aese(v0, v25); __ aesmc(v0, v0); 2768 __ aese(v0, v26); __ aesmc(v0, v0); 2769 __ aese(v0, v27); __ aesmc(v0, v0); 2770 __ aese(v0, v28); __ aesmc(v0, v0); 2771 __ aese(v0, v29); __ aesmc(v0, v0); 2772 __ aese(v0, v30); 2773 __ eor(v0, __ T16B, v0, v31); 2774 2775 __ st1(v0, __ T16B, __ post(to, 16)); 2776 2777 __ subw(len_reg, len_reg, 16); 2778 __ cbnzw(len_reg, L_aes_loop); 2779 2780 __ st1(v0, __ T16B, rvec); 2781 2782 __ mov(r0, rscratch2); 2783 2784 __ leave(); 2785 __ ret(lr); 2786 2787 return start; 2788 } 2789 2790 // Arguments: 2791 // 2792 // Inputs: 2793 // c_rarg0 - source byte array address 2794 // c_rarg1 - destination byte array address 2795 // c_rarg2 - K (key) in little endian int array 2796 // c_rarg3 - r vector byte array address 2797 // c_rarg4 - input length 2798 // 2799 // Output: 2800 // r0 - input length 2801 // 2802 address generate_cipherBlockChaining_decryptAESCrypt() { 2803 assert(UseAES, "need AES instructions and misaligned SSE support"); 2804 __ align(CodeEntryAlignment); 2805 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 2806 2807 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2808 2809 const Register from = c_rarg0; // source array address 2810 const Register to = c_rarg1; // destination array address 2811 const Register key = c_rarg2; // key array address 2812 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2813 // and left with the results of the last encryption block 2814 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2815 const Register keylen = rscratch1; 2816 2817 address start = __ pc(); 2818 2819 __ enter(); 2820 2821 __ movw(rscratch2, len_reg); 2822 2823 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2824 2825 __ ld1(v2, __ T16B, rvec); 2826 2827 __ ld1(v31, __ T16B, __ post(key, 16)); 2828 __ rev32(v31, __ T16B, v31); 2829 2830 __ cmpw(keylen, 52); 2831 __ br(Assembler::CC, L_loadkeys_44); 2832 __ br(Assembler::EQ, L_loadkeys_52); 2833 2834 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2835 __ rev32(v17, __ T16B, v17); 2836 __ rev32(v18, __ T16B, v18); 2837 __ BIND(L_loadkeys_52); 2838 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2839 __ rev32(v19, __ T16B, v19); 2840 __ rev32(v20, __ T16B, v20); 2841 __ BIND(L_loadkeys_44); 2842 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2843 __ rev32(v21, __ T16B, v21); 2844 
__ rev32(v22, __ T16B, v22); 2845 __ rev32(v23, __ T16B, v23); 2846 __ rev32(v24, __ T16B, v24); 2847 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2848 __ rev32(v25, __ T16B, v25); 2849 __ rev32(v26, __ T16B, v26); 2850 __ rev32(v27, __ T16B, v27); 2851 __ rev32(v28, __ T16B, v28); 2852 __ ld1(v29, v30, __ T16B, key); 2853 __ rev32(v29, __ T16B, v29); 2854 __ rev32(v30, __ T16B, v30); 2855 2856 __ BIND(L_aes_loop); 2857 __ ld1(v0, __ T16B, __ post(from, 16)); 2858 __ orr(v1, __ T16B, v0, v0); 2859 2860 __ br(Assembler::CC, L_rounds_44); 2861 __ br(Assembler::EQ, L_rounds_52); 2862 2863 __ aesd(v0, v17); __ aesimc(v0, v0); 2864 __ aesd(v0, v18); __ aesimc(v0, v0); 2865 __ BIND(L_rounds_52); 2866 __ aesd(v0, v19); __ aesimc(v0, v0); 2867 __ aesd(v0, v20); __ aesimc(v0, v0); 2868 __ BIND(L_rounds_44); 2869 __ aesd(v0, v21); __ aesimc(v0, v0); 2870 __ aesd(v0, v22); __ aesimc(v0, v0); 2871 __ aesd(v0, v23); __ aesimc(v0, v0); 2872 __ aesd(v0, v24); __ aesimc(v0, v0); 2873 __ aesd(v0, v25); __ aesimc(v0, v0); 2874 __ aesd(v0, v26); __ aesimc(v0, v0); 2875 __ aesd(v0, v27); __ aesimc(v0, v0); 2876 __ aesd(v0, v28); __ aesimc(v0, v0); 2877 __ aesd(v0, v29); __ aesimc(v0, v0); 2878 __ aesd(v0, v30); 2879 __ eor(v0, __ T16B, v0, v31); 2880 __ eor(v0, __ T16B, v0, v2); 2881 2882 __ st1(v0, __ T16B, __ post(to, 16)); 2883 __ orr(v2, __ T16B, v1, v1); 2884 2885 __ subw(len_reg, len_reg, 16); 2886 __ cbnzw(len_reg, L_aes_loop); 2887 2888 __ st1(v2, __ T16B, rvec); 2889 2890 __ mov(r0, rscratch2); 2891 2892 __ leave(); 2893 __ ret(lr); 2894 2895 return start; 2896 } 2897 2898 // Arguments: 2899 // 2900 // Inputs: 2901 // c_rarg0 - byte[] source+offset 2902 // c_rarg1 - int[] SHA.state 2903 // c_rarg2 - int offset 2904 // c_rarg3 - int limit 2905 // 2906 address generate_sha1_implCompress(bool multi_block, const char *name) { 2907 __ align(CodeEntryAlignment); 2908 StubCodeMark mark(this, "StubRoutines", name); 2909 address start = __ pc(); 2910 2911 Register buf = c_rarg0; 2912 Register state = c_rarg1; 2913 Register ofs = c_rarg2; 2914 Register limit = c_rarg3; 2915 2916 Label keys; 2917 Label sha1_loop; 2918 2919 // load the keys into v0..v3 2920 __ adr(rscratch1, keys); 2921 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 2922 // load 5 words state into v6, v7 2923 __ ldrq(v6, Address(state, 0)); 2924 __ ldrs(v7, Address(state, 16)); 2925 2926 2927 __ BIND(sha1_loop); 2928 // load 64 bytes of data into v16..v19 2929 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 2930 __ rev32(v16, __ T16B, v16); 2931 __ rev32(v17, __ T16B, v17); 2932 __ rev32(v18, __ T16B, v18); 2933 __ rev32(v19, __ T16B, v19); 2934 2935 // do the sha1 2936 __ addv(v4, __ T4S, v16, v0); 2937 __ orr(v20, __ T16B, v6, v6); 2938 2939 FloatRegister d0 = v16; 2940 FloatRegister d1 = v17; 2941 FloatRegister d2 = v18; 2942 FloatRegister d3 = v19; 2943 2944 for (int round = 0; round < 20; round++) { 2945 FloatRegister tmp1 = (round & 1) ? v4 : v5; 2946 FloatRegister tmp2 = (round & 1) ? v21 : v22; 2947 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 2948 FloatRegister tmp4 = (round & 1) ? v5 : v4; 2949 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? 
v2 : v3)); 2950 2951 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 2952 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 2953 __ sha1h(tmp2, __ T4S, v20); 2954 if (round < 5) 2955 __ sha1c(v20, __ T4S, tmp3, tmp4); 2956 else if (round < 10 || round >= 15) 2957 __ sha1p(v20, __ T4S, tmp3, tmp4); 2958 else 2959 __ sha1m(v20, __ T4S, tmp3, tmp4); 2960 if (round < 16) __ sha1su1(d0, __ T4S, d3); 2961 2962 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 2963 } 2964 2965 __ addv(v7, __ T2S, v7, v21); 2966 __ addv(v6, __ T4S, v6, v20); 2967 2968 if (multi_block) { 2969 __ add(ofs, ofs, 64); 2970 __ cmp(ofs, limit); 2971 __ br(Assembler::LE, sha1_loop); 2972 __ mov(c_rarg0, ofs); // return ofs 2973 } 2974 2975 __ strq(v6, Address(state, 0)); 2976 __ strs(v7, Address(state, 16)); 2977 2978 __ ret(lr); 2979 2980 __ bind(keys); 2981 __ emit_int32(0x5a827999); 2982 __ emit_int32(0x6ed9eba1); 2983 __ emit_int32(0x8f1bbcdc); 2984 __ emit_int32(0xca62c1d6); 2985 2986 return start; 2987 } 2988 2989 2990 // Arguments: 2991 // 2992 // Inputs: 2993 // c_rarg0 - byte[] source+offset 2994 // c_rarg1 - int[] SHA.state 2995 // c_rarg2 - int offset 2996 // c_rarg3 - int limit 2997 // 2998 address generate_sha256_implCompress(bool multi_block, const char *name) { 2999 static const uint32_t round_consts[64] = { 3000 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3001 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3002 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3003 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3004 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3005 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3006 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3007 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3008 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3009 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3010 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3011 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3012 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3013 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3014 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3015 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3016 }; 3017 __ align(CodeEntryAlignment); 3018 StubCodeMark mark(this, "StubRoutines", name); 3019 address start = __ pc(); 3020 3021 Register buf = c_rarg0; 3022 Register state = c_rarg1; 3023 Register ofs = c_rarg2; 3024 Register limit = c_rarg3; 3025 3026 Label sha1_loop; 3027 3028 __ stpd(v8, v9, __ pre(sp, -32)); 3029 __ stpd(v10, v11, Address(sp, 16)); 3030 3031 // dga == v0 3032 // dgb == v1 3033 // dg0 == v2 3034 // dg1 == v3 3035 // dg2 == v4 3036 // t0 == v6 3037 // t1 == v7 3038 3039 // load 16 keys to v16..v31 3040 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3041 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3042 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3043 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3044 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3045 3046 // load 8 words (256 bits) state 3047 __ ldpq(v0, v1, state); 3048 3049 __ BIND(sha1_loop); 3050 // load 64 bytes of data into v8..v11 3051 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 3052 __ rev32(v8, __ T16B, v8); 3053 __ rev32(v9, __ T16B, v9); 3054 __ rev32(v10, __ T16B, v10); 3055 __ rev32(v11, __ T16B, v11); 3056 3057 __ addv(v6, __ T4S, v8, v16); 3058 __ orr(v2, __ T16B, v0, v0); 3059 __ orr(v3, __ T16B, v1, v1); 3060 3061 FloatRegister d0 = v8; 3062 FloatRegister d1 = v9; 3063 FloatRegister d2 = v10; 3064 FloatRegister d3 = v11; 3065 3066 3067 for (int round = 0; round < 16; round++) { 3068 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3069 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3070 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3071 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3072 3073 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3074 __ orr(v4, __ T16B, v2, v2); 3075 if (round < 15) 3076 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3077 __ sha256h(v2, __ T4S, v3, tmp2); 3078 __ sha256h2(v3, __ T4S, v4, tmp2); 3079 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3080 3081 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3082 } 3083 3084 __ addv(v0, __ T4S, v0, v2); 3085 __ addv(v1, __ T4S, v1, v3); 3086 3087 if (multi_block) { 3088 __ add(ofs, ofs, 64); 3089 __ cmp(ofs, limit); 3090 __ br(Assembler::LE, sha1_loop); 3091 __ mov(c_rarg0, ofs); // return ofs 3092 } 3093 3094 __ ldpd(v10, v11, Address(sp, 16)); 3095 __ ldpd(v8, v9, __ post(sp, 32)); 3096 3097 __ stpq(v0, v1, state); 3098 3099 __ ret(lr); 3100 3101 return start; 3102 } 3103 3104 #ifndef BUILTIN_SIM 3105 // Safefetch stubs. 3106 void generate_safefetch(const char* name, int size, address* entry, 3107 address* fault_pc, address* continuation_pc) { 3108 // safefetch signatures: 3109 // int SafeFetch32(int* adr, int errValue); 3110 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue); 3111 // 3112 // arguments: 3113 // c_rarg0 = adr 3114 // c_rarg1 = errValue 3115 // 3116 // result: 3117 // r0 = *adr or errValue 3118 3119 StubCodeMark mark(this, "StubRoutines", name); 3120 3121 // Entry point, pc or function descriptor. 3122 *entry = __ pc(); 3123 3124 // Load *adr into c_rarg1, may fault. 
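// If the load below faults, the signal handler resumes execution at
// *continuation_pc, where the move into r0 returns errValue instead.
// The intended contract, in C terms (illustrative -- can_safely_read()
// is notional):
//
//   int SafeFetch32(int* adr, int errValue) {
//     return can_safely_read(adr) ? *adr : errValue;  // never crashes the VM
//   }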
3125 *fault_pc = __ pc(); 3126 switch (size) { 3127 case 4: 3128 // int32_t 3129 __ ldrw(c_rarg1, Address(c_rarg0, 0)); 3130 break; 3131 case 8: 3132 // int64_t 3133 __ ldr(c_rarg1, Address(c_rarg0, 0)); 3134 break; 3135 default: 3136 ShouldNotReachHere(); 3137 } 3138 3139 // return errValue or *adr 3140 *continuation_pc = __ pc(); 3141 __ mov(r0, c_rarg1); 3142 __ ret(lr); 3143 } 3144 #endif 3145 3146 /** 3147 * Arguments: 3148 * 3149 * Inputs: 3150 * c_rarg0 - int crc 3151 * c_rarg1 - byte* buf 3152 * c_rarg2 - int length 3153 * 3154 * Output: 3155 * r0 - int crc result 3156 */ 3157 address generate_updateBytesCRC32() { 3158 assert(UseCRC32Intrinsics, "what are we doing here?"); 3159 3160 __ align(CodeEntryAlignment); 3161 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32"); 3162 3163 address start = __ pc(); 3164 3165 const Register crc = c_rarg0; // crc 3166 const Register buf = c_rarg1; // source java byte array address 3167 const Register len = c_rarg2; // length 3168 const Register table0 = c_rarg3; // crc_table address 3169 const Register table1 = c_rarg4; 3170 const Register table2 = c_rarg5; 3171 const Register table3 = c_rarg6; 3172 const Register tmp3 = c_rarg7; 3173 3174 BLOCK_COMMENT("Entry:"); 3175 __ enter(); // required for proper stackwalking of RuntimeStub frame 3176 3177 __ kernel_crc32(crc, buf, len, 3178 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 3179 3180 __ leave(); // required for proper stackwalking of RuntimeStub frame 3181 __ ret(lr); 3182 3183 return start; 3184 } 3185 3186 /** 3187 * Arguments: 3188 * 3189 * Inputs: 3190 * c_rarg0 - int crc 3191 * c_rarg1 - byte* buf 3192 * c_rarg2 - int length 3193 * c_rarg3 - int* table 3194 * 3195 * Output: 3196 * r0 - int crc result 3197 */ 3198 address generate_updateBytesCRC32C() { 3199 assert(UseCRC32CIntrinsics, "what are we doing here?"); 3200 3201 __ align(CodeEntryAlignment); 3202 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C"); 3203 3204 address start = __ pc(); 3205 3206 const Register crc = c_rarg0; // crc 3207 const Register buf = c_rarg1; // source java byte array address 3208 const Register len = c_rarg2; // length 3209 const Register table0 = c_rarg3; // crc_table address 3210 const Register table1 = c_rarg4; 3211 const Register table2 = c_rarg5; 3212 const Register table3 = c_rarg6; 3213 const Register tmp3 = c_rarg7; 3214 3215 BLOCK_COMMENT("Entry:"); 3216 __ enter(); // required for proper stackwalking of RuntimeStub frame 3217 3218 __ kernel_crc32c(crc, buf, len, 3219 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 3220 3221 __ leave(); // required for proper stackwalking of RuntimeStub frame 3222 __ ret(lr); 3223 3224 return start; 3225 } 3226 3227 /** 3228 * Arguments: 3229 * 3230 * Inputs: 3231 * c_rarg0 - int adler 3232 * c_rarg1 - byte* buff 3233 * c_rarg2 - int len 3234 * 3235 * Output: 3236 * c_rarg0 - int adler result 3237 */ 3238 address generate_updateBytesAdler32() { 3239 __ align(CodeEntryAlignment); 3240 StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32"); 3241 address start = __ pc(); 3242 3243 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 3244 3245 // Aliases 3246 Register adler = c_rarg0; 3247 Register s1 = c_rarg0; 3248 Register s2 = c_rarg3; 3249 Register buff = c_rarg1; 3250 Register len = c_rarg2; 3251 Register nmax = r4; 3252 Register base = r5; 3253 Register count = r6; 3254 Register temp0 = rscratch1; 3255 Register temp1 = rscratch2; 3256 Register temp2 = r7; 
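// Reference definition of Adler-32, which this stub implements
// (illustrative):
//
//   s1 = adler & 0xffff;  s2 = (adler >> 16) & 0xffff;
//   for (size_t i = 0; i < len; i++) {
//     s1 = (s1 + buff[i]) % 65521;   // 65521 == BASE below
//     s2 = (s2 + s1)      % 65521;
//   }
//   return (s2 << 16) | s1;
//
// The code below defers the expensive '%' by accumulating up to NMAX bytes
// between reductions (the largest count that cannot overflow an unsigned
// 32-bit s2), then reduces using 2^16 mod 65521 == 15: fold x to
// (x >> 16) * 15 + (x & 0xffff), applied twice, followed by one
// conditional subtract of BASE.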
3257 3258 // Max number of bytes we can process before having to take the mod 3259 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 3260 unsigned long BASE = 0xfff1; 3261 unsigned long NMAX = 0x15B0; 3262 3263 __ mov(base, BASE); 3264 __ mov(nmax, NMAX); 3265 3266 // s1 is initialized to the lower 16 bits of adler 3267 // s2 is initialized to the upper 16 bits of adler 3268 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 3269 __ uxth(s1, adler); // s1 = (adler & 0xffff) 3270 3271 // The pipelined loop needs at least 16 elements for one iteration. 3272 // It checks this itself, but it is more efficient to skip straight to the cleanup loop. 3273 __ cmp(len, 16); 3274 __ br(Assembler::HS, L_nmax); 3275 __ cbz(len, L_combine); 3276 3277 __ bind(L_simple_by1_loop); 3278 __ ldrb(temp0, Address(__ post(buff, 1))); 3279 __ add(s1, s1, temp0); 3280 __ add(s2, s2, s1); 3281 __ subs(len, len, 1); 3282 __ br(Assembler::HI, L_simple_by1_loop); 3283 3284 // s1 = s1 % BASE 3285 __ subs(temp0, s1, base); 3286 __ csel(s1, temp0, s1, Assembler::HS); 3287 3288 // s2 = s2 % BASE 3289 __ lsr(temp0, s2, 16); 3290 __ lsl(temp1, temp0, 4); 3291 __ sub(temp1, temp1, temp0); 3292 __ add(s2, temp1, s2, ext::uxth); 3293 3294 __ subs(temp0, s2, base); 3295 __ csel(s2, temp0, s2, Assembler::HS); 3296 3297 __ b(L_combine); 3298 3299 __ bind(L_nmax); 3300 __ subs(len, len, nmax); 3301 __ sub(count, nmax, 16); 3302 __ br(Assembler::LO, L_by16); 3303 3304 __ bind(L_nmax_loop); 3305 3306 __ ldp(temp0, temp1, Address(__ post(buff, 16))); 3307 3308 __ add(s1, s1, temp0, ext::uxtb); 3309 __ ubfx(temp2, temp0, 8, 8); 3310 __ add(s2, s2, s1); 3311 __ add(s1, s1, temp2); 3312 __ ubfx(temp2, temp0, 16, 8); 3313 __ add(s2, s2, s1); 3314 __ add(s1, s1, temp2); 3315 __ ubfx(temp2, temp0, 24, 8); 3316 __ add(s2, s2, s1); 3317 __ add(s1, s1, temp2); 3318 __ ubfx(temp2, temp0, 32, 8); 3319 __ add(s2, s2, s1); 3320 __ add(s1, s1, temp2); 3321 __ ubfx(temp2, temp0, 40, 8); 3322 __ add(s2, s2, s1); 3323 __ add(s1, s1, temp2); 3324 __ ubfx(temp2, temp0, 48, 8); 3325 __ add(s2, s2, s1); 3326 __ add(s1, s1, temp2); 3327 __ add(s2, s2, s1); 3328 __ add(s1, s1, temp0, Assembler::LSR, 56); 3329 __ add(s2, s2, s1); 3330 3331 __ add(s1, s1, temp1, ext::uxtb); 3332 __ ubfx(temp2, temp1, 8, 8); 3333 __ add(s2, s2, s1); 3334 __ add(s1, s1, temp2); 3335 __ ubfx(temp2, temp1, 16, 8); 3336 __ add(s2, s2, s1); 3337 __ add(s1, s1, temp2); 3338 __ ubfx(temp2, temp1, 24, 8); 3339 __ add(s2, s2, s1); 3340 __ add(s1, s1, temp2); 3341 __ ubfx(temp2, temp1, 32, 8); 3342 __ add(s2, s2, s1); 3343 __ add(s1, s1, temp2); 3344 __ ubfx(temp2, temp1, 40, 8); 3345 __ add(s2, s2, s1); 3346 __ add(s1, s1, temp2); 3347 __ ubfx(temp2, temp1, 48, 8); 3348 __ add(s2, s2, s1); 3349 __ add(s1, s1, temp2); 3350 __ add(s2, s2, s1); 3351 __ add(s1, s1, temp1, Assembler::LSR, 56); 3352 __ add(s2, s2, s1); 3353 3354 __ subs(count, count, 16); 3355 __ br(Assembler::HS, L_nmax_loop); 3356 3357 // s1 = s1 % BASE 3358 __ lsr(temp0, s1, 16); 3359 __ lsl(temp1, temp0, 4); 3360 __ sub(temp1, temp1, temp0); 3361 __ add(temp1, temp1, s1, ext::uxth); 3362 3363 __ lsr(temp0, temp1, 16); 3364 __ lsl(s1, temp0, 4); 3365 __ sub(s1, s1, temp0); 3366 __ add(s1, s1, temp1, ext::uxth); 3367 3368 __ subs(temp0, s1, base); 3369 __ csel(s1, temp0, s1, Assembler::HS); 3370 3371 // s2 = s2 % BASE 3372 __ lsr(temp0, s2, 16); 3373 __ lsl(temp1, temp0, 4); 3374 __ sub(temp1, temp1, temp0); 3375 __ add(temp1, temp1, s2, ext::uxth); 3376 3377 __ lsr(temp0, temp1, 16); 
3378 __ lsl(s2, temp0, 4); 3379 __ sub(s2, s2, temp0); 3380 __ add(s2, s2, temp1, ext::uxth); 3381 3382 __ subs(temp0, s2, base); 3383 __ csel(s2, temp0, s2, Assembler::HS); 3384 3385 __ subs(len, len, nmax); 3386 __ sub(count, nmax, 16); 3387 __ br(Assembler::HS, L_nmax_loop); 3388 3389 __ bind(L_by16); 3390 __ adds(len, len, count); 3391 __ br(Assembler::LO, L_by1); 3392 3393 __ bind(L_by16_loop); 3394 3395 __ ldp(temp0, temp1, Address(__ post(buff, 16))); 3396 3397 __ add(s1, s1, temp0, ext::uxtb); 3398 __ ubfx(temp2, temp0, 8, 8); 3399 __ add(s2, s2, s1); 3400 __ add(s1, s1, temp2); 3401 __ ubfx(temp2, temp0, 16, 8); 3402 __ add(s2, s2, s1); 3403 __ add(s1, s1, temp2); 3404 __ ubfx(temp2, temp0, 24, 8); 3405 __ add(s2, s2, s1); 3406 __ add(s1, s1, temp2); 3407 __ ubfx(temp2, temp0, 32, 8); 3408 __ add(s2, s2, s1); 3409 __ add(s1, s1, temp2); 3410 __ ubfx(temp2, temp0, 40, 8); 3411 __ add(s2, s2, s1); 3412 __ add(s1, s1, temp2); 3413 __ ubfx(temp2, temp0, 48, 8); 3414 __ add(s2, s2, s1); 3415 __ add(s1, s1, temp2); 3416 __ add(s2, s2, s1); 3417 __ add(s1, s1, temp0, Assembler::LSR, 56); 3418 __ add(s2, s2, s1); 3419 3420 __ add(s1, s1, temp1, ext::uxtb); 3421 __ ubfx(temp2, temp1, 8, 8); 3422 __ add(s2, s2, s1); 3423 __ add(s1, s1, temp2); 3424 __ ubfx(temp2, temp1, 16, 8); 3425 __ add(s2, s2, s1); 3426 __ add(s1, s1, temp2); 3427 __ ubfx(temp2, temp1, 24, 8); 3428 __ add(s2, s2, s1); 3429 __ add(s1, s1, temp2); 3430 __ ubfx(temp2, temp1, 32, 8); 3431 __ add(s2, s2, s1); 3432 __ add(s1, s1, temp2); 3433 __ ubfx(temp2, temp1, 40, 8); 3434 __ add(s2, s2, s1); 3435 __ add(s1, s1, temp2); 3436 __ ubfx(temp2, temp1, 48, 8); 3437 __ add(s2, s2, s1); 3438 __ add(s1, s1, temp2); 3439 __ add(s2, s2, s1); 3440 __ add(s1, s1, temp1, Assembler::LSR, 56); 3441 __ add(s2, s2, s1); 3442 3443 __ subs(len, len, 16); 3444 __ br(Assembler::HS, L_by16_loop); 3445 3446 __ bind(L_by1); 3447 __ adds(len, len, 15); 3448 __ br(Assembler::LO, L_do_mod); 3449 3450 __ bind(L_by1_loop); 3451 __ ldrb(temp0, Address(__ post(buff, 1))); 3452 __ add(s1, temp0, s1); 3453 __ add(s2, s2, s1); 3454 __ subs(len, len, 1); 3455 __ br(Assembler::HS, L_by1_loop); 3456 3457 __ bind(L_do_mod); 3458 // s1 = s1 % BASE 3459 __ lsr(temp0, s1, 16); 3460 __ lsl(temp1, temp0, 4); 3461 __ sub(temp1, temp1, temp0); 3462 __ add(temp1, temp1, s1, ext::uxth); 3463 3464 __ lsr(temp0, temp1, 16); 3465 __ lsl(s1, temp0, 4); 3466 __ sub(s1, s1, temp0); 3467 __ add(s1, s1, temp1, ext::uxth); 3468 3469 __ subs(temp0, s1, base); 3470 __ csel(s1, temp0, s1, Assembler::HS); 3471 3472 // s2 = s2 % BASE 3473 __ lsr(temp0, s2, 16); 3474 __ lsl(temp1, temp0, 4); 3475 __ sub(temp1, temp1, temp0); 3476 __ add(temp1, temp1, s2, ext::uxth); 3477 3478 __ lsr(temp0, temp1, 16); 3479 __ lsl(s2, temp0, 4); 3480 __ sub(s2, s2, temp0); 3481 __ add(s2, s2, temp1, ext::uxth); 3482 3483 __ subs(temp0, s2, base); 3484 __ csel(s2, temp0, s2, Assembler::HS); 3485 3486 // Combine lower bits and higher bits 3487 __ bind(L_combine); 3488 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) 3489 3490 __ ret(lr); 3491 3492 return start; 3493 } 3494 3495 /** 3496 * Arguments: 3497 * 3498 * Input: 3499 * c_rarg0 - x address 3500 * c_rarg1 - x length 3501 * c_rarg2 - y address 3502 * c_rarg3 - y length 3503 * c_rarg4 - z address 3504 * c_rarg5 - z length 3505 */ 3506 address generate_multiplyToLen() { 3507 __ align(CodeEntryAlignment); 3508 StubCodeMark mark(this, "StubRoutines", "multiplyToLen"); 3509 3510 address start = __ pc(); 3511 const Register x = r0; 
3512 const Register xlen = r1;
3513 const Register y = r2;
3514 const Register ylen = r3;
3515 const Register z = r4;
3516 const Register zlen = r5;
3517
3518 const Register tmp1 = r10;
3519 const Register tmp2 = r11;
3520 const Register tmp3 = r12;
3521 const Register tmp4 = r13;
3522 const Register tmp5 = r14;
3523 const Register tmp6 = r15;
3524 const Register tmp7 = r16;
3525
3526 BLOCK_COMMENT("Entry:");
3527 __ enter(); // required for proper stackwalking of RuntimeStub frame
3528 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3529 __ leave(); // required for proper stackwalking of RuntimeStub frame
3530 __ ret(lr);
3531
3532 return start;
3533 }
3534
3535 address generate_squareToLen() {
3536 // The squareToLen algorithm for sizes 1..127, as described in the Java code,
3537 // runs faster than multiply_to_len on some CPUs and slower on others, but
3538 // multiply_to_len shows slightly better results overall
3539 __ align(CodeEntryAlignment);
3540 StubCodeMark mark(this, "StubRoutines", "squareToLen");
3541 address start = __ pc();
3542
3543 const Register x = r0;
3544 const Register xlen = r1;
3545 const Register z = r2;
3546 const Register zlen = r3;
3547 const Register y = r4; // == x
3548 const Register ylen = r5; // == xlen
3549
3550 const Register tmp1 = r10;
3551 const Register tmp2 = r11;
3552 const Register tmp3 = r12;
3553 const Register tmp4 = r13;
3554 const Register tmp5 = r14;
3555 const Register tmp6 = r15;
3556 const Register tmp7 = r16;
3557
3558 RegSet spilled_regs = RegSet::of(y, ylen);
3559 BLOCK_COMMENT("Entry:");
3560 __ enter();
3561 __ push(spilled_regs, sp);
3562 __ mov(y, x);
3563 __ mov(ylen, xlen);
3564 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3565 __ pop(spilled_regs, sp);
3566 __ leave();
3567 __ ret(lr);
3568 return start;
3569 }
3570
3571 address generate_mulAdd() {
3572 __ align(CodeEntryAlignment);
3573 StubCodeMark mark(this, "StubRoutines", "mulAdd");
3574
3575 address start = __ pc();
3576
3577 const Register out = r0;
3578 const Register in = r1;
3579 const Register offset = r2;
3580 const Register len = r3;
3581 const Register k = r4;
3582
3583 BLOCK_COMMENT("Entry:");
3584 __ enter();
3585 __ mul_add(out, in, offset, len, k);
3586 __ leave();
3587 __ ret(lr);
3588
3589 return start;
3590 }
3591
3592 void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3593 FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3594 FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3595 // Karatsuba multiplication performs a 128*128 -> 256-bit
3596 // multiplication in three 128-bit multiplications and a few
3597 // additions.
3598 //
3599 // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1)
3600 // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0
3601 //
3602 // Inputs:
3603 //
3604 // A0 in a.d[0] (subkey)
3605 // A1 in a.d[1]
3606 // (A1+A0) in a1_xor_a0.d[0]
3607 //
3608 // B0 in b.d[0] (state)
3609 // B1 in b.d[1]
3610
3611 __ ext(tmp1, __ T16B, b, b, 0x08);
3612 __ pmull2(result_hi, __ T1Q, b, a, __ T2D); // A1*B1
3613 __ eor(tmp1, __ T16B, tmp1, b); // (B1+B0)
3614 __ pmull(result_lo, __ T1Q, b, a, __ T1D); // A0*B0
3615 __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0)
3616
3617 __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08);
3618 __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0
3619 __ eor(tmp2, __ T16B, tmp2, tmp4);
3620 __ eor(tmp2, __ T16B, tmp2, tmp3);
3621
3622 // Register pair <result_hi:result_lo> holds the result of carry-less multiplication
3623 __ ins(result_hi, __ D, tmp2, 0, 1);
3624 __ ins(result_lo, __ D, tmp2, 1, 0);
3625 }
3626
3627 void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi,
3628 FloatRegister p, FloatRegister z, FloatRegister t1) {
3629 const FloatRegister t0 = result;
3630
3631 // The GCM field polynomial f is z^128 + p(z), where p =
3632 // z^7+z^2+z+1.
3633 //
3634 // z^128 === -p(z) (mod (z^128 + p(z)))
3635 //
3636 // so, given that the product we're reducing is
3637 // a == lo + hi * z^128
3638 // substituting,
3639 // === lo - hi * p(z) (mod (z^128 + p(z)))
3640 //
3641 // we reduce by multiplying hi by p(z) and subtracting the result
3642 // from (i.e. XORing it with) lo. Because p has no nonzero high
3643 // bits we can do this with two 64-bit multiplications, lo*p and
3644 // hi*p.
3645
3646 __ pmull2(t0, __ T1Q, hi, p, __ T2D);
3647 __ ext(t1, __ T16B, t0, z, 8);
3648 __ eor(hi, __ T16B, hi, t1);
3649 __ ext(t1, __ T16B, z, t0, 8);
3650 __ eor(lo, __ T16B, lo, t1);
3651 __ pmull(t0, __ T1Q, hi, p, __ T1D);
3652 __ eor(result, __ T16B, lo, t0);
3653 }
3654
3655 address generate_has_negatives(address &has_negatives_long) {
3656 StubCodeMark mark(this, "StubRoutines", "has_negatives");
3657 const int large_loop_size = 64;
3658 const uint64_t UPPER_BIT_MASK=0x8080808080808080;
3659 int dcache_line = VM_Version::dcache_line_size();
3660
3661 Register ary1 = r1, len = r2, result = r0;
3662
3663 __ align(CodeEntryAlignment);
3664 address entry = __ pc();
3665
3666 __ enter();
3667
3668 Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE,
3669 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL;
3670
3671 __ cmp(len, 15);
3672 __ br(Assembler::GT, LEN_OVER_15);
3673 // Execution falls into this code only when the pointer is near the end of
3674 // a memory page, and we must avoid reading past it
3675 __ add(ary1, ary1, len);
3676 __ subs(len, len, 8);
3677 __ br(Assembler::GT, LEN_OVER_8);
3678 __ ldr(rscratch2, Address(ary1, -8));
3679 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes.
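// At this point len holds (original length - 8) <= 0, so rscratch1 =
// -(len * 8) = (8 - length) * 8 is the number of low-order bits of the
// word just loaded that lie before the start of the array. A rough
// illustrative C sketch (assuming 0 < length <= 8, little-endian order):
//   uint64_t w = *(uint64_t *)(ary1 - 8); // ends at the last array byte
//   w >>= (8 - length) * 8;               // drop bytes before the array
//   return (w & UPPER_BIT_MASK) != 0;     // top bit set => negative byte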
3680 __ lsrv(rscratch2, rscratch2, rscratch1);
3681 __ tst(rscratch2, UPPER_BIT_MASK);
3682 __ cset(result, Assembler::NE);
3683 __ leave();
3684 __ ret(lr);
3685 __ bind(LEN_OVER_8);
3686 __ ldp(rscratch1, rscratch2, Address(ary1, -16));
3687 __ sub(len, len, 8); // no data dep., then sub can be executed while loading
3688 __ tst(rscratch2, UPPER_BIT_MASK);
3689 __ br(Assembler::NE, RET_TRUE_NO_POP);
3690 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
3691 __ lsrv(rscratch1, rscratch1, rscratch2);
3692 __ tst(rscratch1, UPPER_BIT_MASK);
3693 __ cset(result, Assembler::NE);
3694 __ leave();
3695 __ ret(lr);
3696
3697 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
3698 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
3699
3700 has_negatives_long = __ pc(); // 2nd entry point
3701
3702 __ enter();
3703
3704 __ bind(LEN_OVER_15);
3705 __ push(spilled_regs, sp);
3706 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
3707 __ cbz(rscratch2, ALIGNED);
3708 __ ldp(tmp6, tmp1, Address(ary1));
3709 __ mov(tmp5, 16);
3710 __ sub(rscratch1, tmp5, rscratch2); // number of bytes until the aligned address
3711 __ add(ary1, ary1, rscratch1);
3712 __ sub(len, len, rscratch1);
3713 __ orr(tmp6, tmp6, tmp1);
3714 __ tst(tmp6, UPPER_BIT_MASK);
3715 __ br(Assembler::NE, RET_TRUE);
3716
3717 __ bind(ALIGNED);
3718 __ cmp(len, large_loop_size);
3719 __ br(Assembler::LT, CHECK_16);
3720 // Perform a 16-byte load here as an early return, to handle the case where
3721 // an initially aligned large array has negative values in its starting
3722 // bytes; otherwise LARGE_LOOP would do 4 reads instead of 1 in the worst
3723 // case, which is slower. Cases with negative bytes further ahead are
3724 // barely affected; in fact they become faster due to the early loads and
3725 // the fewer instructions and branches in LARGE_LOOP.
3726 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
3727 __ sub(len, len, 16);
3728 __ orr(tmp6, tmp6, tmp1);
3729 __ tst(tmp6, UPPER_BIT_MASK);
3730 __ br(Assembler::NE, RET_TRUE);
3731 __ cmp(len, large_loop_size);
3732 __ br(Assembler::LT, CHECK_16);
3733
3734 if (SoftwarePrefetchHintDistance >= 0
3735 && SoftwarePrefetchHintDistance >= dcache_line) {
3736 // initial prefetch
3737 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
3738 }
3739 __ bind(LARGE_LOOP);
3740 if (SoftwarePrefetchHintDistance >= 0) {
3741 __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
3742 }
3743 // Issue the load instructions first, since that can save a few CPU/MEM
3744 // cycles. Also, instead of 4 triples of "orr(...); andr(...); cbnz(...);"
3745 // (one per ldp) it is better to generate 7 * orr(...) + 1 andr(...) +
3746 // 1 cbnz(...), which saves 3 instructions and has fewer branches. The
3747 // drawback is that this disables early return: all 64 bytes are loaded and checked every time.
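// Roughly, one LARGE_LOOP iteration in C (illustrative only):
//   uint64_t w[8];
//   memcpy(w, ary1, 64); ary1 += 64; len -= 64;
//   uint64_t acc = (w[0] | w[1]) | (w[2] | w[3])
//                | (w[4] | w[5]) | (w[6] | w[7]);
//   if (acc & UPPER_BIT_MASK) return true; // some byte has its top bit set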
3748 __ ldp(tmp2, tmp3, Address(ary1)); 3749 __ ldp(tmp4, tmp5, Address(ary1, 16)); 3750 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 3751 __ ldp(tmp6, tmp1, Address(ary1, 48)); 3752 __ add(ary1, ary1, large_loop_size); 3753 __ sub(len, len, large_loop_size); 3754 __ orr(tmp2, tmp2, tmp3); 3755 __ orr(tmp4, tmp4, tmp5); 3756 __ orr(rscratch1, rscratch1, rscratch2); 3757 __ orr(tmp6, tmp6, tmp1); 3758 __ orr(tmp2, tmp2, tmp4); 3759 __ orr(rscratch1, rscratch1, tmp6); 3760 __ orr(tmp2, tmp2, rscratch1); 3761 __ tst(tmp2, UPPER_BIT_MASK); 3762 __ br(Assembler::NE, RET_TRUE); 3763 __ cmp(len, large_loop_size); 3764 __ br(Assembler::GE, LARGE_LOOP); 3765 3766 __ bind(CHECK_16); // small 16-byte load pre-loop 3767 __ cmp(len, 16); 3768 __ br(Assembler::LT, POST_LOOP16); 3769 3770 __ bind(LOOP16); // small 16-byte load loop 3771 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 3772 __ sub(len, len, 16); 3773 __ orr(tmp2, tmp2, tmp3); 3774 __ tst(tmp2, UPPER_BIT_MASK); 3775 __ br(Assembler::NE, RET_TRUE); 3776 __ cmp(len, 16); 3777 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 3778 3779 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 3780 __ cmp(len, 8); 3781 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 3782 __ ldr(tmp3, Address(__ post(ary1, 8))); 3783 __ sub(len, len, 8); 3784 __ tst(tmp3, UPPER_BIT_MASK); 3785 __ br(Assembler::NE, RET_TRUE); 3786 3787 __ bind(POST_LOOP16_LOAD_TAIL); 3788 __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0 3789 __ ldr(tmp1, Address(ary1)); 3790 __ mov(tmp2, 64); 3791 __ sub(tmp4, tmp2, len, __ LSL, 3); 3792 __ lslv(tmp1, tmp1, tmp4); 3793 __ tst(tmp1, UPPER_BIT_MASK); 3794 __ br(Assembler::NE, RET_TRUE); 3795 // Fallthrough 3796 3797 __ bind(RET_FALSE); 3798 __ pop(spilled_regs, sp); 3799 __ leave(); 3800 __ mov(result, zr); 3801 __ ret(lr); 3802 3803 __ bind(RET_TRUE); 3804 __ pop(spilled_regs, sp); 3805 __ bind(RET_TRUE_NO_POP); 3806 __ leave(); 3807 __ mov(result, 1); 3808 __ ret(lr); 3809 3810 __ bind(DONE); 3811 __ pop(spilled_regs, sp); 3812 __ leave(); 3813 __ ret(lr); 3814 return entry; 3815 } 3816 3817 void generate_large_array_equals_loop_nonsimd(int loopThreshold, 3818 bool usePrefetch, Label &NOT_EQUAL) { 3819 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 3820 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 3821 tmp7 = r12, tmp8 = r13; 3822 Label LOOP; 3823 3824 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3825 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3826 __ bind(LOOP); 3827 if (usePrefetch) { 3828 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 3829 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 3830 } 3831 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 3832 __ eor(tmp1, tmp1, tmp2); 3833 __ eor(tmp3, tmp3, tmp4); 3834 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 3835 __ orr(tmp1, tmp1, tmp3); 3836 __ cbnz(tmp1, NOT_EQUAL); 3837 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3838 __ eor(tmp5, tmp5, tmp6); 3839 __ eor(tmp7, tmp7, tmp8); 3840 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3841 __ orr(tmp5, tmp5, tmp7); 3842 __ cbnz(tmp5, NOT_EQUAL); 3843 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 3844 __ eor(tmp1, tmp1, tmp2); 3845 __ eor(tmp3, tmp3, tmp4); 3846 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 3847 __ orr(tmp1, tmp1, tmp3); 3848 __ cbnz(tmp1, NOT_EQUAL); 3849 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3850 __ eor(tmp5, tmp5, tmp6); 3851 __ 
sub(cnt1, cnt1, 8 * wordSize);
3852 __ eor(tmp7, tmp7, tmp8);
3853 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3854 __ cmp(cnt1, loopThreshold);
3855 __ orr(tmp5, tmp5, tmp7);
3856 __ cbnz(tmp5, NOT_EQUAL);
3857 __ br(__ GE, LOOP);
3858 // post-loop
3859 __ eor(tmp1, tmp1, tmp2);
3860 __ eor(tmp3, tmp3, tmp4);
3861 __ orr(tmp1, tmp1, tmp3);
3862 __ sub(cnt1, cnt1, 2 * wordSize);
3863 __ cbnz(tmp1, NOT_EQUAL);
3864 }
3865
3866 void generate_large_array_equals_loop_simd(int loopThreshold,
3867 bool usePrefetch, Label &NOT_EQUAL) {
3868 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3869 tmp2 = rscratch2;
3870 Label LOOP;
3871
3872 __ bind(LOOP);
3873 if (usePrefetch) {
3874 __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3875 __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3876 }
3877 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
3878 __ sub(cnt1, cnt1, 8 * wordSize);
3879 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
3880 __ cmp(cnt1, loopThreshold);
3881 __ eor(v0, __ T16B, v0, v4);
3882 __ eor(v1, __ T16B, v1, v5);
3883 __ eor(v2, __ T16B, v2, v6);
3884 __ eor(v3, __ T16B, v3, v7);
3885 __ orr(v0, __ T16B, v0, v1);
3886 __ orr(v1, __ T16B, v2, v3);
3887 __ orr(v0, __ T16B, v0, v1);
3888 __ umov(tmp1, v0, __ D, 0);
3889 __ umov(tmp2, v0, __ D, 1);
3890 __ orr(tmp1, tmp1, tmp2);
3891 __ cbnz(tmp1, NOT_EQUAL);
3892 __ br(__ GE, LOOP);
3893 }
3894
3895 // a1 = r1 - array1 address
3896 // a2 = r2 - array2 address
3897 // result = r0 - return value. Already contains "false"
3898 // cnt1 = r10 - number of elements left to check, reduced by wordSize
3899 // r3-r5 are reserved temporary registers
3900 address generate_large_array_equals() {
3901 StubCodeMark mark(this, "StubRoutines", "large_array_equals");
3902 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3903 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3904 tmp7 = r12, tmp8 = r13;
3905 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
3906 SMALL_LOOP, POST_LOOP;
3907 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
3908 // chosen so that at least 32 prefetched bytes are actually used
3909 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
3910 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
3911 RegSet spilled_regs = RegSet::range(tmp6, tmp8);
3912 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
3913 tmp5, tmp6, tmp7, tmp8);
3914
3915 __ align(CodeEntryAlignment);
3916 address entry = __ pc();
3917 __ enter();
3918 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub
3919 // also advance pointers to use post-increment instead of pre-increment
3920 __ add(a1, a1, wordSize);
3921 __ add(a2, a2, wordSize);
3922 if (AvoidUnalignedAccesses) {
3923 // Both implementations (SIMD/non-SIMD) use relatively large load
3924 // instructions (ld1/ldp), which carry a heavy penalty (up to 2x execution
3925 // time) on some CPUs when the address is not at least 16-byte aligned.
3926 // Arrays are currently 8-byte aligned, so if necessary we do one extra
3927 // 8-byte load for the first address to make it 16-byte aligned.
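// Roughly, in C (illustrative only, a1/a2 viewed as unsigned long*):
//   if (((uintptr_t)a1 & 8) != 0) {     // 8- but not 16-byte aligned
//     if (*a1++ != *a2++) return false; // compare one word to realign a1
//     cnt1 -= wordSize;
//   }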
3928 Label ALIGNED16;
3929 __ tbz(a1, 3, ALIGNED16);
3930 __ ldr(tmp1, Address(__ post(a1, wordSize)));
3931 __ ldr(tmp2, Address(__ post(a2, wordSize)));
3932 __ sub(cnt1, cnt1, wordSize);
3933 __ eor(tmp1, tmp1, tmp2);
3934 __ cbnz(tmp1, NOT_EQUAL_NO_POP);
3935 __ bind(ALIGNED16);
3936 }
3937 if (UseSIMDForArrayEquals) {
3938 if (SoftwarePrefetchHintDistance >= 0) {
3939 __ cmp(cnt1, prefetchLoopThreshold);
3940 __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3941 generate_large_array_equals_loop_simd(prefetchLoopThreshold,
3942 /* prfm = */ true, NOT_EQUAL);
3943 __ cmp(cnt1, nonPrefetchLoopThreshold);
3944 __ br(__ LT, TAIL);
3945 }
3946 __ bind(NO_PREFETCH_LARGE_LOOP);
3947 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold,
3948 /* prfm = */ false, NOT_EQUAL);
3949 } else {
3950 __ push(spilled_regs, sp);
3951 if (SoftwarePrefetchHintDistance >= 0) {
3952 __ cmp(cnt1, prefetchLoopThreshold);
3953 __ br(__ LE, NO_PREFETCH_LARGE_LOOP);
3954 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold,
3955 /* prfm = */ true, NOT_EQUAL);
3956 __ cmp(cnt1, nonPrefetchLoopThreshold);
3957 __ br(__ LT, TAIL);
3958 }
3959 __ bind(NO_PREFETCH_LARGE_LOOP);
3960 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold,
3961 /* prfm = */ false, NOT_EQUAL);
3962 }
3963 __ bind(TAIL);
3964 __ cbz(cnt1, EQUAL);
3965 __ subs(cnt1, cnt1, wordSize);
3966 __ br(__ LE, POST_LOOP);
3967 __ bind(SMALL_LOOP);
3968 __ ldr(tmp1, Address(__ post(a1, wordSize)));
3969 __ ldr(tmp2, Address(__ post(a2, wordSize)));
3970 __ subs(cnt1, cnt1, wordSize);
3971 __ eor(tmp1, tmp1, tmp2);
3972 __ cbnz(tmp1, NOT_EQUAL);
3973 __ br(__ GT, SMALL_LOOP);
3974 __ bind(POST_LOOP);
3975 __ ldr(tmp1, Address(a1, cnt1));
3976 __ ldr(tmp2, Address(a2, cnt1));
3977 __ eor(tmp1, tmp1, tmp2);
3978 __ cbnz(tmp1, NOT_EQUAL);
3979 __ bind(EQUAL);
3980 __ mov(result, true);
3981 __ bind(NOT_EQUAL);
3982 if (!UseSIMDForArrayEquals) {
3983 __ pop(spilled_regs, sp);
3984 }
3985 __ bind(NOT_EQUAL_NO_POP);
3986 __ leave();
3987 __ ret(lr);
3988 return entry;
3989 }
3990
3991
3992 /**
3993 * Arguments:
3994 *
3995 * Input:
3996 * c_rarg0 - current state address
3997 * c_rarg1 - H key address
3998 * c_rarg2 - data address
3999 * c_rarg3 - number of blocks
4000 *
4001 * Output:
4002 * Updated state at c_rarg0
4003 */
4004 address generate_ghash_processBlocks() {
4005 // Bafflingly, GCM uses little-endian for the byte order, but
4006 // big-endian for the bit order. For example, the polynomial 1 is
4007 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
4008 //
4009 // So, we must either reverse the bytes in each word and do
4010 // everything big-endian or reverse the bits in each byte and do
4011 // it little-endian. On AArch64 it's more idiomatic to reverse
4012 // the bits in each byte (we have an instruction, RBIT, to do
4013 // that) and keep the data in little-endian bit order through the
4014 // calculation, bit-reversing the inputs and outputs.
4015
4016 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4017 __ align(wordSize * 2);
4018 address p = __ pc();
4019 __ emit_int64(0x87); // The low-order bits of the field
4020 // polynomial (i.e.
p = z^7+z^2+z+1) 4021 // repeated in the low and high parts of a 4022 // 128-bit vector 4023 __ emit_int64(0x87); 4024 4025 __ align(CodeEntryAlignment); 4026 address start = __ pc(); 4027 4028 Register state = c_rarg0; 4029 Register subkeyH = c_rarg1; 4030 Register data = c_rarg2; 4031 Register blocks = c_rarg3; 4032 4033 FloatRegister vzr = v30; 4034 __ eor(vzr, __ T16B, vzr, vzr); // zero register 4035 4036 __ ldrq(v0, Address(state)); 4037 __ ldrq(v1, Address(subkeyH)); 4038 4039 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 4040 __ rbit(v0, __ T16B, v0); 4041 __ rev64(v1, __ T16B, v1); 4042 __ rbit(v1, __ T16B, v1); 4043 4044 __ ldrq(v26, p); 4045 4046 __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 4047 __ eor(v16, __ T16B, v16, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 4048 4049 { 4050 Label L_ghash_loop; 4051 __ bind(L_ghash_loop); 4052 4053 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 4054 // reversing each byte 4055 __ rbit(v2, __ T16B, v2); 4056 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 4057 4058 // Multiply state in v2 by subkey in v1 4059 ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 4060 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16, 4061 /*temps*/v6, v20, v18, v21); 4062 // Reduce v7:v5 by the field polynomial 4063 ghash_reduce(v0, v5, v7, v26, vzr, v20); 4064 4065 __ sub(blocks, blocks, 1); 4066 __ cbnz(blocks, L_ghash_loop); 4067 } 4068 4069 // The bit-reversed result is at this point in v0 4070 __ rev64(v1, __ T16B, v0); 4071 __ rbit(v1, __ T16B, v1); 4072 4073 __ st1(v1, __ T16B, state); 4074 __ ret(lr); 4075 4076 return start; 4077 } 4078 4079 // Continuation point for throwing of implicit exceptions that are 4080 // not handled in the current activation. Fabricates an exception 4081 // oop and initiates normal exception dispatching in this 4082 // frame. Since we need to preserve callee-saved values (currently 4083 // only for C2, but done for C1 as well) we need a callee-saved oop 4084 // map and therefore have to make these stubs into RuntimeStubs 4085 // rather than BufferBlobs. If the compiler needs all registers to 4086 // be preserved between the fault point and the exception handler 4087 // then it must assume responsibility for that in 4088 // AbstractCompiler::continuation_for_implicit_null_exception or 4089 // continuation_for_implicit_division_by_zero_exception. All other 4090 // implicit exceptions (e.g., NullPointerException or 4091 // AbstractMethodError on entry) are either at call sites or 4092 // otherwise assume that stack unwinding will be initiated, so 4093 // caller saved registers were assumed volatile in the compiler. 4094 4095 #undef __ 4096 #define __ masm-> 4097 4098 address generate_throw_exception(const char* name, 4099 address runtime_entry, 4100 Register arg1 = noreg, 4101 Register arg2 = noreg) { 4102 // Information about frame layout at time of blocking runtime call. 4103 // Note that we only have to preserve callee-saved registers since 4104 // the compilers are responsible for supplying a continuation point 4105 // if they expect all registers to be preserved. 4106 // n.b. 
aarch64 asserts that frame::arg_reg_save_area_bytes == 0
4107 enum layout {
4108 rfp_off = 0,
4109 rfp_off2,
4110 return_off,
4111 return_off2,
4112 framesize // inclusive of return address
4113 };
4114
4115 int insts_size = 512;
4116 int locs_size = 64;
4117
4118 CodeBuffer code(name, insts_size, locs_size);
4119 OopMapSet* oop_maps = new OopMapSet();
4120 MacroAssembler* masm = new MacroAssembler(&code);
4121
4122 address start = __ pc();
4123
4124 // This is an inlined and slightly modified version of call_VM
4125 // which has the ability to fetch the return PC out of
4126 // thread-local storage and also sets up last_Java_sp slightly
4127 // differently than the real call_VM
4128
4129 __ enter(); // Save FP and LR before call
4130
4131 assert(is_even(framesize/2), "sp not 16-byte aligned");
4132
4133 // lr and fp are already in place
4134 __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog
4135
4136 int frame_complete = __ pc() - start;
4137
4138 // Set up last_Java_sp and last_Java_fp
4139 address the_pc = __ pc();
4140 __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1);
4141
4142 // Call runtime
4143 if (arg1 != noreg) {
4144 assert(arg2 != c_rarg1, "clobbered");
4145 __ mov(c_rarg1, arg1);
4146 }
4147 if (arg2 != noreg) {
4148 __ mov(c_rarg2, arg2);
4149 }
4150 __ mov(c_rarg0, rthread);
4151 BLOCK_COMMENT("call runtime_entry");
4152 __ mov(rscratch1, runtime_entry);
4153 __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1);
4154
4155 // Generate oop map
4156 OopMap* map = new OopMap(framesize, 0);
4157
4158 oop_maps->add_gc_map(the_pc - start, map);
4159
4160 __ reset_last_Java_frame(true);
4161 __ maybe_isb();
4162
4163 __ leave();
4164
4165 // check for pending exceptions
4166 #ifdef ASSERT
4167 Label L;
4168 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
4169 __ cbnz(rscratch1, L);
4170 __ should_not_reach_here();
4171 __ bind(L);
4172 #endif // ASSERT
4173 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
4174
4175
4176 // codeBlob framesize is in words (not VMRegImpl::slot_size)
4177 RuntimeStub* stub =
4178 RuntimeStub::new_runtime_stub(name,
4179 &code,
4180 frame_complete,
4181 (framesize >> (LogBytesPerWord - LogBytesPerInt)),
4182 oop_maps, false);
4183 return stub->entry_point();
4184 }
4185
4186 class MontgomeryMultiplyGenerator : public MacroAssembler {
4187
4188 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
4189 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
4190
4191 RegSet _toSave;
4192 bool _squaring;
4193
4194 public:
4195 MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
4196 : MacroAssembler(as->code()), _squaring(squaring) {
4197
4198 // Register allocation
4199
4200 Register reg = c_rarg0;
4201 Pa_base = reg; // Argument registers
4202 if (squaring)
4203 Pb_base = Pa_base;
4204 else
4205 Pb_base = ++reg;
4206 Pn_base = ++reg;
4207 Rlen = ++reg;
4208 inv = ++reg;
4209 Pm_base = ++reg;
4210
4211 // Working registers:
4212 Ra = ++reg; // The current digit of a, b, n, and m.
4213 Rb = ++reg;
4214 Rm = ++reg;
4215 Rn = ++reg;
4216
4217 Pa = ++reg; // Pointers to the current/next digit of a, b, n, and m.
4218 Pb = ++reg;
4219 Pm = ++reg;
4220 Pn = ++reg;
4221
4222 t0 = ++reg; // Three registers which form a
4223 t1 = ++reg; // triple-precision accumulator.
4224 t2 = ++reg;
4225
4226 Ri = ++reg; // Inner and outer loop indexes.
4227 Rj = ++reg; 4228 4229 Rhi_ab = ++reg; // Product registers: low and high parts 4230 Rlo_ab = ++reg; // of a*b and m*n. 4231 Rhi_mn = ++reg; 4232 Rlo_mn = ++reg; 4233 4234 // r19 and up are callee-saved. 4235 _toSave = RegSet::range(r19, reg) + Pm_base; 4236 } 4237 4238 private: 4239 void save_regs() { 4240 push(_toSave, sp); 4241 } 4242 4243 void restore_regs() { 4244 pop(_toSave, sp); 4245 } 4246 4247 template <typename T> 4248 void unroll_2(Register count, T block) { 4249 Label loop, end, odd; 4250 tbnz(count, 0, odd); 4251 cbz(count, end); 4252 align(16); 4253 bind(loop); 4254 (this->*block)(); 4255 bind(odd); 4256 (this->*block)(); 4257 subs(count, count, 2); 4258 br(Assembler::GT, loop); 4259 bind(end); 4260 } 4261 4262 template <typename T> 4263 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 4264 Label loop, end, odd; 4265 tbnz(count, 0, odd); 4266 cbz(count, end); 4267 align(16); 4268 bind(loop); 4269 (this->*block)(d, s, tmp); 4270 bind(odd); 4271 (this->*block)(d, s, tmp); 4272 subs(count, count, 2); 4273 br(Assembler::GT, loop); 4274 bind(end); 4275 } 4276 4277 void pre1(RegisterOrConstant i) { 4278 block_comment("pre1"); 4279 // Pa = Pa_base; 4280 // Pb = Pb_base + i; 4281 // Pm = Pm_base; 4282 // Pn = Pn_base + i; 4283 // Ra = *Pa; 4284 // Rb = *Pb; 4285 // Rm = *Pm; 4286 // Rn = *Pn; 4287 ldr(Ra, Address(Pa_base)); 4288 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 4289 ldr(Rm, Address(Pm_base)); 4290 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4291 lea(Pa, Address(Pa_base)); 4292 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 4293 lea(Pm, Address(Pm_base)); 4294 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4295 4296 // Zero the m*n result. 4297 mov(Rhi_mn, zr); 4298 mov(Rlo_mn, zr); 4299 } 4300 4301 // The core multiply-accumulate step of a Montgomery 4302 // multiplication. The idea is to schedule operations as a 4303 // pipeline so that instructions with long latencies (loads and 4304 // multiplies) have time to complete before their results are 4305 // used. This most benefits in-order implementations of the 4306 // architecture but out-of-order ones also benefit. 4307 void step() { 4308 block_comment("step"); 4309 // MACC(Ra, Rb, t0, t1, t2); 4310 // Ra = *++Pa; 4311 // Rb = *--Pb; 4312 umulh(Rhi_ab, Ra, Rb); 4313 mul(Rlo_ab, Ra, Rb); 4314 ldr(Ra, pre(Pa, wordSize)); 4315 ldr(Rb, pre(Pb, -wordSize)); 4316 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 4317 // previous iteration. 
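// acc(Rhi, Rlo, t0, t1, t2) (defined below) adds the 128-bit value
// Rhi:Rlo into the triple-precision accumulator t2:t1:t0; in C, roughly:
//   t0 += Rlo;         // adds
//   t1 += Rhi + carry; // adcs
//   t2 += carry;       // adc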
4318 // MACC(Rm, Rn, t0, t1, t2); 4319 // Rm = *++Pm; 4320 // Rn = *--Pn; 4321 umulh(Rhi_mn, Rm, Rn); 4322 mul(Rlo_mn, Rm, Rn); 4323 ldr(Rm, pre(Pm, wordSize)); 4324 ldr(Rn, pre(Pn, -wordSize)); 4325 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4326 } 4327 4328 void post1() { 4329 block_comment("post1"); 4330 4331 // MACC(Ra, Rb, t0, t1, t2); 4332 // Ra = *++Pa; 4333 // Rb = *--Pb; 4334 umulh(Rhi_ab, Ra, Rb); 4335 mul(Rlo_ab, Ra, Rb); 4336 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 4337 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4338 4339 // *Pm = Rm = t0 * inv; 4340 mul(Rm, t0, inv); 4341 str(Rm, Address(Pm)); 4342 4343 // MACC(Rm, Rn, t0, t1, t2); 4344 // t0 = t1; t1 = t2; t2 = 0; 4345 umulh(Rhi_mn, Rm, Rn); 4346 4347 #ifndef PRODUCT 4348 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 4349 { 4350 mul(Rlo_mn, Rm, Rn); 4351 add(Rlo_mn, t0, Rlo_mn); 4352 Label ok; 4353 cbz(Rlo_mn, ok); { 4354 stop("broken Montgomery multiply"); 4355 } bind(ok); 4356 } 4357 #endif 4358 // We have very carefully set things up so that 4359 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 4360 // the lower half of Rm * Rn because we know the result already: 4361 // it must be -t0. t0 + (-t0) must generate a carry iff 4362 // t0 != 0. So, rather than do a mul and an adds we just set 4363 // the carry flag iff t0 is nonzero. 4364 // 4365 // mul(Rlo_mn, Rm, Rn); 4366 // adds(zr, t0, Rlo_mn); 4367 subs(zr, t0, 1); // Set carry iff t0 is nonzero 4368 adcs(t0, t1, Rhi_mn); 4369 adc(t1, t2, zr); 4370 mov(t2, zr); 4371 } 4372 4373 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 4374 block_comment("pre2"); 4375 // Pa = Pa_base + i-len; 4376 // Pb = Pb_base + len; 4377 // Pm = Pm_base + i-len; 4378 // Pn = Pn_base + len; 4379 4380 if (i.is_register()) { 4381 sub(Rj, i.as_register(), len); 4382 } else { 4383 mov(Rj, i.as_constant()); 4384 sub(Rj, Rj, len); 4385 } 4386 // Rj == i-len 4387 4388 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 4389 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 4390 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 4391 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 4392 4393 // Ra = *++Pa; 4394 // Rb = *--Pb; 4395 // Rm = *++Pm; 4396 // Rn = *--Pn; 4397 ldr(Ra, pre(Pa, wordSize)); 4398 ldr(Rb, pre(Pb, -wordSize)); 4399 ldr(Rm, pre(Pm, wordSize)); 4400 ldr(Rn, pre(Pn, -wordSize)); 4401 4402 mov(Rhi_mn, zr); 4403 mov(Rlo_mn, zr); 4404 } 4405 4406 void post2(RegisterOrConstant i, RegisterOrConstant len) { 4407 block_comment("post2"); 4408 if (i.is_constant()) { 4409 mov(Rj, i.as_constant()-len.as_constant()); 4410 } else { 4411 sub(Rj, i.as_register(), len); 4412 } 4413 4414 adds(t0, t0, Rlo_mn); // The pending m*n, low part 4415 4416 // As soon as we know the least significant digit of our result, 4417 // store it. 4418 // Pm_base[i-len] = t0; 4419 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 4420 4421 // t0 = t1; t1 = t2; t2 = 0; 4422 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 4423 adc(t1, t2, zr); 4424 mov(t2, zr); 4425 } 4426 4427 // A carry in t0 after Montgomery multiplication means that we 4428 // should subtract multiples of n from our result in m. We'll 4429 // keep doing that until there is no carry. 
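// In C, approximately:
//   while (t0 != 0) {
//     unsigned long borrow = 0;
//     for (int i = 0; i < len; i++) { // multi-word subtract: m -= n
//       unsigned long m = Pm_base[i], n = Pn_base[i];
//       Pm_base[i] = m - n - borrow;
//       borrow = (m < n) || (m == n && borrow);
//     }
//     t0 -= borrow; // t0 reaches zero once no borrow remains
//   }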
4430 void normalize(RegisterOrConstant len) { 4431 block_comment("normalize"); 4432 // while (t0) 4433 // t0 = sub(Pm_base, Pn_base, t0, len); 4434 Label loop, post, again; 4435 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 4436 cbz(t0, post); { 4437 bind(again); { 4438 mov(i, zr); 4439 mov(cnt, len); 4440 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 4441 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4442 subs(zr, zr, zr); // set carry flag, i.e. no borrow 4443 align(16); 4444 bind(loop); { 4445 sbcs(Rm, Rm, Rn); 4446 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 4447 add(i, i, 1); 4448 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 4449 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4450 sub(cnt, cnt, 1); 4451 } cbnz(cnt, loop); 4452 sbc(t0, t0, zr); 4453 } cbnz(t0, again); 4454 } bind(post); 4455 } 4456 4457 // Move memory at s to d, reversing words. 4458 // Increments d to end of copied memory 4459 // Destroys tmp1, tmp2 4460 // Preserves len 4461 // Leaves s pointing to the address which was in d at start 4462 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 4463 assert(tmp1 < r19 && tmp2 < r19, "register corruption"); 4464 4465 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 4466 mov(tmp1, len); 4467 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 4468 sub(s, d, len, ext::uxtw, LogBytesPerWord); 4469 } 4470 // where 4471 void reverse1(Register d, Register s, Register tmp) { 4472 ldr(tmp, pre(s, -wordSize)); 4473 ror(tmp, tmp, 32); 4474 str(tmp, post(d, wordSize)); 4475 } 4476 4477 void step_squaring() { 4478 // An extra ACC 4479 step(); 4480 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4481 } 4482 4483 void last_squaring(RegisterOrConstant i) { 4484 Label dont; 4485 // if ((i & 1) == 0) { 4486 tbnz(i.as_register(), 0, dont); { 4487 // MACC(Ra, Rb, t0, t1, t2); 4488 // Ra = *++Pa; 4489 // Rb = *--Pb; 4490 umulh(Rhi_ab, Ra, Rb); 4491 mul(Rlo_ab, Ra, Rb); 4492 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4493 } bind(dont); 4494 } 4495 4496 void extra_step_squaring() { 4497 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 4498 4499 // MACC(Rm, Rn, t0, t1, t2); 4500 // Rm = *++Pm; 4501 // Rn = *--Pn; 4502 umulh(Rhi_mn, Rm, Rn); 4503 mul(Rlo_mn, Rm, Rn); 4504 ldr(Rm, pre(Pm, wordSize)); 4505 ldr(Rn, pre(Pn, -wordSize)); 4506 } 4507 4508 void post1_squaring() { 4509 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 4510 4511 // *Pm = Rm = t0 * inv; 4512 mul(Rm, t0, inv); 4513 str(Rm, Address(Pm)); 4514 4515 // MACC(Rm, Rn, t0, t1, t2); 4516 // t0 = t1; t1 = t2; t2 = 0; 4517 umulh(Rhi_mn, Rm, Rn); 4518 4519 #ifndef PRODUCT 4520 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 4521 { 4522 mul(Rlo_mn, Rm, Rn); 4523 add(Rlo_mn, t0, Rlo_mn); 4524 Label ok; 4525 cbz(Rlo_mn, ok); { 4526 stop("broken Montgomery multiply"); 4527 } bind(ok); 4528 } 4529 #endif 4530 // We have very carefully set things up so that 4531 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 4532 // the lower half of Rm * Rn because we know the result already: 4533 // it must be -t0. t0 + (-t0) must generate a carry iff 4534 // t0 != 0. So, rather than do a mul and an adds we just set 4535 // the carry flag iff t0 is nonzero. 
4536 // 4537 // mul(Rlo_mn, Rm, Rn); 4538 // adds(zr, t0, Rlo_mn); 4539 subs(zr, t0, 1); // Set carry iff t0 is nonzero 4540 adcs(t0, t1, Rhi_mn); 4541 adc(t1, t2, zr); 4542 mov(t2, zr); 4543 } 4544 4545 void acc(Register Rhi, Register Rlo, 4546 Register t0, Register t1, Register t2) { 4547 adds(t0, t0, Rlo); 4548 adcs(t1, t1, Rhi); 4549 adc(t2, t2, zr); 4550 } 4551 4552 public: 4553 /** 4554 * Fast Montgomery multiplication. The derivation of the 4555 * algorithm is in A Cryptographic Library for the Motorola 4556 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 4557 * 4558 * Arguments: 4559 * 4560 * Inputs for multiplication: 4561 * c_rarg0 - int array elements a 4562 * c_rarg1 - int array elements b 4563 * c_rarg2 - int array elements n (the modulus) 4564 * c_rarg3 - int length 4565 * c_rarg4 - int inv 4566 * c_rarg5 - int array elements m (the result) 4567 * 4568 * Inputs for squaring: 4569 * c_rarg0 - int array elements a 4570 * c_rarg1 - int array elements n (the modulus) 4571 * c_rarg2 - int length 4572 * c_rarg3 - int inv 4573 * c_rarg4 - int array elements m (the result) 4574 * 4575 */ 4576 address generate_multiply() { 4577 Label argh, nothing; 4578 bind(argh); 4579 stop("MontgomeryMultiply total_allocation must be <= 8192"); 4580 4581 align(CodeEntryAlignment); 4582 address entry = pc(); 4583 4584 cbzw(Rlen, nothing); 4585 4586 enter(); 4587 4588 // Make room. 4589 cmpw(Rlen, 512); 4590 br(Assembler::HI, argh); 4591 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 4592 andr(sp, Ra, -2 * wordSize); 4593 4594 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 4595 4596 { 4597 // Copy input args, reversing as we go. We use Ra as a 4598 // temporary variable. 4599 reverse(Ra, Pa_base, Rlen, t0, t1); 4600 if (!_squaring) 4601 reverse(Ra, Pb_base, Rlen, t0, t1); 4602 reverse(Ra, Pn_base, Rlen, t0, t1); 4603 } 4604 4605 // Push all call-saved registers and also Pm_base which we'll need 4606 // at the end. 
4607 save_regs(); 4608 4609 #ifndef PRODUCT 4610 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 4611 { 4612 ldr(Rn, Address(Pn_base, 0)); 4613 mul(Rlo_mn, Rn, inv); 4614 cmp(Rlo_mn, -1); 4615 Label ok; 4616 br(EQ, ok); { 4617 stop("broken inverse in Montgomery multiply"); 4618 } bind(ok); 4619 } 4620 #endif 4621 4622 mov(Pm_base, Ra); 4623 4624 mov(t0, zr); 4625 mov(t1, zr); 4626 mov(t2, zr); 4627 4628 block_comment("for (int i = 0; i < len; i++) {"); 4629 mov(Ri, zr); { 4630 Label loop, end; 4631 cmpw(Ri, Rlen); 4632 br(Assembler::GE, end); 4633 4634 bind(loop); 4635 pre1(Ri); 4636 4637 block_comment(" for (j = i; j; j--) {"); { 4638 movw(Rj, Ri); 4639 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 4640 } block_comment(" } // j"); 4641 4642 post1(); 4643 addw(Ri, Ri, 1); 4644 cmpw(Ri, Rlen); 4645 br(Assembler::LT, loop); 4646 bind(end); 4647 block_comment("} // i"); 4648 } 4649 4650 block_comment("for (int i = len; i < 2*len; i++) {"); 4651 mov(Ri, Rlen); { 4652 Label loop, end; 4653 cmpw(Ri, Rlen, Assembler::LSL, 1); 4654 br(Assembler::GE, end); 4655 4656 bind(loop); 4657 pre2(Ri, Rlen); 4658 4659 block_comment(" for (j = len*2-i-1; j; j--) {"); { 4660 lslw(Rj, Rlen, 1); 4661 subw(Rj, Rj, Ri); 4662 subw(Rj, Rj, 1); 4663 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 4664 } block_comment(" } // j"); 4665 4666 post2(Ri, Rlen); 4667 addw(Ri, Ri, 1); 4668 cmpw(Ri, Rlen, Assembler::LSL, 1); 4669 br(Assembler::LT, loop); 4670 bind(end); 4671 } 4672 block_comment("} // i"); 4673 4674 normalize(Rlen); 4675 4676 mov(Ra, Pm_base); // Save Pm_base in Ra 4677 restore_regs(); // Restore caller's Pm_base 4678 4679 // Copy our result into caller's Pm_base 4680 reverse(Pm_base, Ra, Rlen, t0, t1); 4681 4682 leave(); 4683 bind(nothing); 4684 ret(lr); 4685 4686 return entry; 4687 } 4688 // In C, approximately: 4689 4690 // void 4691 // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[], 4692 // unsigned long Pn_base[], unsigned long Pm_base[], 4693 // unsigned long inv, int len) { 4694 // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 4695 // unsigned long *Pa, *Pb, *Pn, *Pm; 4696 // unsigned long Ra, Rb, Rn, Rm; 4697 4698 // int i; 4699 4700 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 4701 4702 // for (i = 0; i < len; i++) { 4703 // int j; 4704 4705 // Pa = Pa_base; 4706 // Pb = Pb_base + i; 4707 // Pm = Pm_base; 4708 // Pn = Pn_base + i; 4709 4710 // Ra = *Pa; 4711 // Rb = *Pb; 4712 // Rm = *Pm; 4713 // Rn = *Pn; 4714 4715 // int iters = i; 4716 // for (j = 0; iters--; j++) { 4717 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 4718 // MACC(Ra, Rb, t0, t1, t2); 4719 // Ra = *++Pa; 4720 // Rb = *--Pb; 4721 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 4722 // MACC(Rm, Rn, t0, t1, t2); 4723 // Rm = *++Pm; 4724 // Rn = *--Pn; 4725 // } 4726 4727 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 4728 // MACC(Ra, Rb, t0, t1, t2); 4729 // *Pm = Rm = t0 * inv; 4730 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 4731 // MACC(Rm, Rn, t0, t1, t2); 4732 4733 // assert(t0 == 0, "broken Montgomery multiply"); 4734 4735 // t0 = t1; t1 = t2; t2 = 0; 4736 // } 4737 4738 // for (i = len; i < 2*len; i++) { 4739 // int j; 4740 4741 // Pa = Pa_base + i-len; 4742 // Pb = Pb_base + len; 4743 // Pm = Pm_base + i-len; 4744 // Pn = Pn_base + len; 4745 4746 // Ra = *++Pa; 4747 // Rb = *--Pb; 4748 // Rm = *++Pm; 4749 // Rn = *--Pn; 4750 4751 // int iters = len*2-i-1; 4752 // 
for (j = i-len+1; iters--; j++) { 4753 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 4754 // MACC(Ra, Rb, t0, t1, t2); 4755 // Ra = *++Pa; 4756 // Rb = *--Pb; 4757 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 4758 // MACC(Rm, Rn, t0, t1, t2); 4759 // Rm = *++Pm; 4760 // Rn = *--Pn; 4761 // } 4762 4763 // Pm_base[i-len] = t0; 4764 // t0 = t1; t1 = t2; t2 = 0; 4765 // } 4766 4767 // while (t0) 4768 // t0 = sub(Pm_base, Pn_base, t0, len); 4769 // } 4770 4771 /** 4772 * Fast Montgomery squaring. This uses asymptotically 25% fewer 4773 * multiplies than Montgomery multiplication so it should be up to 4774 * 25% faster. However, its loop control is more complex and it 4775 * may actually run slower on some machines. 4776 * 4777 * Arguments: 4778 * 4779 * Inputs: 4780 * c_rarg0 - int array elements a 4781 * c_rarg1 - int array elements n (the modulus) 4782 * c_rarg2 - int length 4783 * c_rarg3 - int inv 4784 * c_rarg4 - int array elements m (the result) 4785 * 4786 */ 4787 address generate_square() { 4788 Label argh; 4789 bind(argh); 4790 stop("MontgomeryMultiply total_allocation must be <= 8192"); 4791 4792 align(CodeEntryAlignment); 4793 address entry = pc(); 4794 4795 enter(); 4796 4797 // Make room. 4798 cmpw(Rlen, 512); 4799 br(Assembler::HI, argh); 4800 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 4801 andr(sp, Ra, -2 * wordSize); 4802 4803 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 4804 4805 { 4806 // Copy input args, reversing as we go. We use Ra as a 4807 // temporary variable. 4808 reverse(Ra, Pa_base, Rlen, t0, t1); 4809 reverse(Ra, Pn_base, Rlen, t0, t1); 4810 } 4811 4812 // Push all call-saved registers and also Pm_base which we'll need 4813 // at the end. 4814 save_regs(); 4815 4816 mov(Pm_base, Ra); 4817 4818 mov(t0, zr); 4819 mov(t1, zr); 4820 mov(t2, zr); 4821 4822 block_comment("for (int i = 0; i < len; i++) {"); 4823 mov(Ri, zr); { 4824 Label loop, end; 4825 bind(loop); 4826 cmp(Ri, Rlen); 4827 br(Assembler::GE, end); 4828 4829 pre1(Ri); 4830 4831 block_comment("for (j = (i+1)/2; j; j--) {"); { 4832 add(Rj, Ri, 1); 4833 lsr(Rj, Rj, 1); 4834 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 4835 } block_comment(" } // j"); 4836 4837 last_squaring(Ri); 4838 4839 block_comment(" for (j = i/2; j; j--) {"); { 4840 lsr(Rj, Ri, 1); 4841 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 4842 } block_comment(" } // j"); 4843 4844 post1_squaring(); 4845 add(Ri, Ri, 1); 4846 cmp(Ri, Rlen); 4847 br(Assembler::LT, loop); 4848 4849 bind(end); 4850 block_comment("} // i"); 4851 } 4852 4853 block_comment("for (int i = len; i < 2*len; i++) {"); 4854 mov(Ri, Rlen); { 4855 Label loop, end; 4856 bind(loop); 4857 cmp(Ri, Rlen, Assembler::LSL, 1); 4858 br(Assembler::GE, end); 4859 4860 pre2(Ri, Rlen); 4861 4862 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 4863 lsl(Rj, Rlen, 1); 4864 sub(Rj, Rj, Ri); 4865 sub(Rj, Rj, 1); 4866 lsr(Rj, Rj, 1); 4867 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 4868 } block_comment(" } // j"); 4869 4870 last_squaring(Ri); 4871 4872 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 4873 lsl(Rj, Rlen, 1); 4874 sub(Rj, Rj, Ri); 4875 lsr(Rj, Rj, 1); 4876 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 4877 } block_comment(" } // j"); 4878 4879 post2(Ri, Rlen); 4880 add(Ri, Ri, 1); 4881 cmp(Ri, Rlen, Assembler::LSL, 1); 4882 4883 br(Assembler::LT, loop); 4884 bind(end); 4885 block_comment("} // i"); 4886 } 4887 4888 normalize(Rlen); 4889 4890 mov(Ra, 
Pm_base); // Save Pm_base in Ra
4891 restore_regs(); // Restore caller's Pm_base
4892
4893 // Copy our result into caller's Pm_base
4894 reverse(Pm_base, Ra, Rlen, t0, t1);
4895
4896 leave();
4897 ret(lr);
4898
4899 return entry;
4900 }
4901 // In C, approximately:
4902
4903 // void
4904 // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
4905 // unsigned long Pm_base[], unsigned long inv, int len) {
4906 // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
4907 // unsigned long *Pa, *Pb, *Pn, *Pm;
4908 // unsigned long Ra, Rb, Rn, Rm;
4909
4910 // int i;
4911
4912 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
4913
4914 // for (i = 0; i < len; i++) {
4915 // int j;
4916
4917 // Pa = Pa_base;
4918 // Pb = Pa_base + i;
4919 // Pm = Pm_base;
4920 // Pn = Pn_base + i;
4921
4922 // Ra = *Pa;
4923 // Rb = *Pb;
4924 // Rm = *Pm;
4925 // Rn = *Pn;
4926
4927 // int iters = (i+1)/2;
4928 // for (j = 0; iters--; j++) {
4929 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4930 // MACC2(Ra, Rb, t0, t1, t2);
4931 // Ra = *++Pa;
4932 // Rb = *--Pb;
4933 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4934 // MACC(Rm, Rn, t0, t1, t2);
4935 // Rm = *++Pm;
4936 // Rn = *--Pn;
4937 // }
4938 // if ((i & 1) == 0) {
4939 // assert(Ra == Pa_base[j], "must be");
4940 // MACC(Ra, Ra, t0, t1, t2);
4941 // }
4942 // iters = i/2;
4943 // assert(iters == i-j, "must be");
4944 // for (; iters--; j++) {
4945 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4946 // MACC(Rm, Rn, t0, t1, t2);
4947 // Rm = *++Pm;
4948 // Rn = *--Pn;
4949 // }
4950
4951 // *Pm = Rm = t0 * inv;
4952 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
4953 // MACC(Rm, Rn, t0, t1, t2);
4954
4955 // assert(t0 == 0, "broken Montgomery multiply");
4956
4957 // t0 = t1; t1 = t2; t2 = 0;
4958 // }
4959
4960 // for (i = len; i < 2*len; i++) {
4961 // int start = i-len+1;
4962 // int end = start + (len - start)/2;
4963 // int j;
4964
4965 // Pa = Pa_base + i-len;
4966 // Pb = Pa_base + len;
4967 // Pm = Pm_base + i-len;
4968 // Pn = Pn_base + len;
4969
4970 // Ra = *++Pa;
4971 // Rb = *--Pb;
4972 // Rm = *++Pm;
4973 // Rn = *--Pn;
4974
4975 // int iters = (2*len-i-1)/2;
4976 // assert(iters == end-start, "must be");
4977 // for (j = start; iters--; j++) {
4978 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
4979 // MACC2(Ra, Rb, t0, t1, t2);
4980 // Ra = *++Pa;
4981 // Rb = *--Pb;
4982 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4983 // MACC(Rm, Rn, t0, t1, t2);
4984 // Rm = *++Pm;
4985 // Rn = *--Pn;
4986 // }
4987 // if ((i & 1) == 0) {
4988 // assert(Ra == Pa_base[j], "must be");
4989 // MACC(Ra, Ra, t0, t1, t2);
4990 // }
4991 // iters = (2*len-i)/2;
4992 // assert(iters == len-j, "must be");
4993 // for (; iters--; j++) {
4994 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
4995 // MACC(Rm, Rn, t0, t1, t2);
4996 // Rm = *++Pm;
4997 // Rn = *--Pn;
4998 // }
4999 // Pm_base[i-len] = t0;
5000 // t0 = t1; t1 = t2; t2 = 0;
5001 // }
5002
5003 // while (t0)
5004 // t0 = sub(Pm_base, Pn_base, t0, len);
5005 // }
5006 };
5007
5008
5009 // Initialization
5010 void generate_initial() {
5011 // Generate initial stubs and initialize the entry points
5012
5013 // Entry points that exist on all platforms. Note: This is code
5014 // that could be shared among different platforms - however the
5015 // benefit seems to be smaller than the disadvantage of having a
5016 // much more complicated
generator structure. See also comment in
5017 // stubRoutines.hpp.
5018
5019 StubRoutines::_forward_exception_entry = generate_forward_exception();
5020
5021 StubRoutines::_call_stub_entry =
5022 generate_call_stub(StubRoutines::_call_stub_return_address);
5023
5024 // is referenced by megamorphic call
5025 StubRoutines::_catch_exception_entry = generate_catch_exception();
5026
5027 // Build this early so it's available for the interpreter.
5028 StubRoutines::_throw_StackOverflowError_entry =
5029 generate_throw_exception("StackOverflowError throw_exception",
5030 CAST_FROM_FN_PTR(address,
5031 SharedRuntime::throw_StackOverflowError));
5032 StubRoutines::_throw_delayed_StackOverflowError_entry =
5033 generate_throw_exception("delayed StackOverflowError throw_exception",
5034 CAST_FROM_FN_PTR(address,
5035 SharedRuntime::throw_delayed_StackOverflowError));
5036 if (UseCRC32Intrinsics) {
5037 // set the table address before generating the stubs that use it
5038 StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
5039 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
5040 }
5041
5042 if (UseCRC32CIntrinsics) {
5043 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
5044 }
5045 }
5046
5047 void generate_all() {
5048 // support for verify_oop (must happen after universe_init)
5049 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
5050 StubRoutines::_throw_AbstractMethodError_entry =
5051 generate_throw_exception("AbstractMethodError throw_exception",
5052 CAST_FROM_FN_PTR(address,
5053 SharedRuntime::
5054 throw_AbstractMethodError));
5055
5056 StubRoutines::_throw_IncompatibleClassChangeError_entry =
5057 generate_throw_exception("IncompatibleClassChangeError throw_exception",
5058 CAST_FROM_FN_PTR(address,
5059 SharedRuntime::
5060 throw_IncompatibleClassChangeError));
5061
5062 StubRoutines::_throw_NullPointerException_at_call_entry =
5063 generate_throw_exception("NullPointerException at call throw_exception",
5064 CAST_FROM_FN_PTR(address,
5065 SharedRuntime::
5066 throw_NullPointerException_at_call));
5067
5068 // arraycopy stubs used by compilers
5069 generate_arraycopy_stubs();
5070
5071 // has negatives stub for large arrays.
5072 StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);
5073
5074 // array equals stub for large arrays.
5075 if (!UseSimpleArrayEquals) {
5076 StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
5077 }
5078
5079 if (UseMultiplyToLenIntrinsic) {
5080 StubRoutines::_multiplyToLen = generate_multiplyToLen();
5081 }
5082
5083 if (UseSquareToLenIntrinsic) {
5084 StubRoutines::_squareToLen = generate_squareToLen();
5085 }
5086
5087 if (UseMulAddIntrinsic) {
5088 StubRoutines::_mulAdd = generate_mulAdd();
5089 }
5090
5091 if (UseMontgomeryMultiplyIntrinsic) {
5092 StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
5093 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
5094 StubRoutines::_montgomeryMultiply = g.generate_multiply();
5095 }
5096
5097 if (UseMontgomerySquareIntrinsic) {
5098 StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
5099 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
5100 // We use generate_multiply() rather than generate_square()
5101 // because it's faster for the sizes of modulus we care about.
5102 StubRoutines::_montgomerySquare = g.generate_multiply(); 5103 } 5104 5105 #ifndef BUILTIN_SIM 5106 // generate GHASH intrinsics code 5107 if (UseGHASHIntrinsics) { 5108 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); 5109 } 5110 5111 if (UseAESIntrinsics) { 5112 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 5113 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 5114 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 5115 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 5116 } 5117 5118 if (UseSHA1Intrinsics) { 5119 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress"); 5120 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB"); 5121 } 5122 if (UseSHA256Intrinsics) { 5123 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress"); 5124 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB"); 5125 } 5126 5127 // generate Adler32 intrinsics code 5128 if (UseAdler32Intrinsics) { 5129 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); 5130 } 5131 5132 // Safefetch stubs. 5133 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry, 5134 &StubRoutines::_safefetch32_fault_pc, 5135 &StubRoutines::_safefetch32_continuation_pc); 5136 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry, 5137 &StubRoutines::_safefetchN_fault_pc, 5138 &StubRoutines::_safefetchN_continuation_pc); 5139 #endif 5140 StubRoutines::aarch64::set_completed(); 5141 } 5142 5143 public: 5144 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) { 5145 if (all) { 5146 generate_all(); 5147 } else { 5148 generate_initial(); 5149 } 5150 } 5151 }; // end class declaration 5152 5153 void StubGenerator_generate(CodeBuffer* code, bool all) { 5154 StubGenerator g(code, all); 5155 }