/*
 * Copyright (c) 2003, 2020, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/universe.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "utilities/align.hpp"
#include "utilities/powerOfTwo.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_ZGC
#include "gc/z/zThreadLocalData.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->
#define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
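// n.b. TIMES_OOP is the scaled-index addressing mode for one oop-sized array
// element: a sign-extended 32-bit index shifted by log2 of the element size
// (4 bytes with compressed oops, 8 bytes otherwise).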

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(int& counter) {
    __ lea(rscratch2, ExternalAddress((address)&counter));
    __ ldrw(rscratch1, Address(rscratch2));
    __ addw(rscratch1, rscratch1, 1);
    __ strw(rscratch1, Address(rscratch2));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address              address
  //    c_rarg1:   result                            address
  //    c_rarg2:   result type                       BasicType
  //    c_rarg3:   method                            Method*
  //    c_rarg4:   (interpreter) entry point         address
  //    c_rarg5:   parameters                        intptr_t*
  //    c_rarg6:   parameter size (in words)         int
  //    c_rarg7:   thread                            Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the c arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -27 [ argument word 1      ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper   (r0)  ]
  //  -7 [ result         (r1)  ]
  //  -6 [ result type    (r2)  ]
  //  -5 [ method         (r3)  ]
  //  -4 [ entry point    (r4)  ]
  //  -3 [ parameters     (r5)  ]
  //  -2 [ parameter size (r6)  ]
  //  -1 [ thread         (r7)  ]
  //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  //   1 [ saved lr       (r30) ]

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off  = -26,

    d15_off            = -26,
    d13_off            = -24,
    d11_off            = -22,
    d9_off             = -20,

    r28_off            = -18,
    r26_off            = -16,
    r24_off            = -14,
    r22_off            = -12,
    r20_off            = -10,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0,
    retaddr_off        =   1,
  };

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    const Address sp_after_call(rfp, sp_after_call_off * wordSize);

    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off  * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);

    // stub code

    address aarch64_entry = __ pc();

    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5,  entry_point);
    __ stp(c_rarg2, c_rarg3,  result_type);
    __ stp(c_rarg0, c_rarg1,  call_wrapper);

    __ stp(r20, r19,   r20_save);
    __ stp(r22, r21,   r22_save);
    __ stp(r24, r23,   r24_save);
    __ stp(r26, r25,   r26_save);
    __ stp(r28, r27,   r28_save);

    __ stpd(v9,  v8,   d9_save);
    __ stpd(v11, v10,  d11_save);
    __ stpd(v13, v12,  d13_save);
    __ stpd(v15, v14,  d15_save);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (u1)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing methodOop, and current sp
    //      rmethod: Method*
    //      r13: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r13, sp);
    __ blr(c_rarg4);

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    __ ldr(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ldr(j_rarg1, result_type);
    __ cmp(j_rarg1, (u1)T_OBJECT);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, (u1)T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, (u1)T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(j_rarg1, (u1)T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    // restore callee-save registers
    __ ldpd(v15, v14,   d15_save);
    __ ldpd(v13, v12,   d13_save);
    __ ldpd(v11, v10,   d11_save);
    __ ldpd(v9,  v8,    d9_save);

    __ ldp(r28, r27,   r28_save);
    __ ldp(r26, r25,   r26_save);
    __ ldp(r24, r23,   r24_save);
    __ ldp(r22, r21,   r22_save);
    __ ldp(r20, r19,   r20_save);

    __ ldp(c_rarg0, c_rarg1,  call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3, method);
    __ ldp(c_rarg4, c_rarg5,  entry_point);
    __ ldp(c_rarg6, c_rarg7,  parameter_size);

    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT

    __ BIND(is_long);
    __ str(r0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to set up the
  // sp.
  //
  // r0: exception oop

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread       (rfp, thread_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception. The pending exception check happened in the runtime
  // or native call stub. The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    if (UseSVE > 0) {
      // Reinitialize the ptrue predicate register, in case the external runtime
      // call clobbers ptrue reg, as we may return to SVE compiled code.
      __ reinitialize_ptrue();
    }
    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then calls into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);
    __ mov(r19, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {

    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is NULL it is OK

#if INCLUDE_ZGC
    if (UseZGC) {
      // Check if mask is good.
      // verifies that ZAddressBadMask & r0 == 0
      __ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset()));
      __ andr(c_rarg2, r0, c_rarg3);
      __ cbnz(c_rarg2, error);
    }
#endif

    // Check if the oop is in the right area of memory
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
    __ andr(c_rarg2, r0, c_rarg3);
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());

    // Compare c_rarg2 and c_rarg3.  We don't use a compare
    // instruction here because the flags register is live.
    __ eor(c_rarg2, c_rarg2, c_rarg3);
    __ cbnz(c_rarg2, error);

    // make sure klass is 'reasonable', which is not zero.
    __ load_klass(r0, r0); // get klass
    __ cbz(r0, error);     // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blr(rscratch1);
    __ hlt(0);

    return start;
  }

  // n.b. this always branches to L_no_overlap, so the debug-only overlap
  // check in generate_checkcast_copy never fires on AArch64.
  void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }

  // The inner part of zero_words().  This is the bulk operation,
  // zeroing words in blocks, possibly using DC ZVA to do it.  The
  // caller is responsible for zeroing the last few words.
  //
  // Inputs:
  // r10: the HeapWord-aligned base address of an array to zero.
  // r11: the count in HeapWords, r11 > 0.
  //
  // Returns r10 and r11, adjusted for the caller to clear.
  // r10: the base address of the tail of words left to clear.
  // r11: the number of words in the tail.
  //      r11 < MacroAssembler::zero_words_block_size.

  address generate_zero_blocks() {
    Label done;
    Label base_aligned;

    Register base = r10, cnt = r11;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "zero_blocks");
    address start = __ pc();

    if (UseBlockZeroing) {
      int zva_length = VM_Version::zva_length();

      // Ensure ZVA length can be divided by 16. This is required by
      // the subsequent operations.
      assert (zva_length % 16 == 0, "Unexpected ZVA Length");

      __ tbz(base, 3, base_aligned);
      __ str(zr, Address(__ post(base, 8)));
      __ sub(cnt, cnt, 1);
      __ bind(base_aligned);

      // Ensure count >= zva_length * 2 so that it still deserves a zva after
      // alignment.
      Label small;
      int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
      __ subs(rscratch1, cnt, low_limit >> 3);
      __ br(Assembler::LT, small);
      __ zero_dcache_blocks(base, cnt);
      __ bind(small);
    }

    {
      // Number of stp instructions we'll unroll
      const int unroll =
        MacroAssembler::zero_words_block_size / 2;
      // Clear the remaining blocks.
      Label loop;
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::LT, done);
      __ bind(loop);
      for (int i = 0; i < unroll; i++)
        __ stp(zr, zr, __ post(base, 16));
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::GE, loop);
      __ bind(done);
      __ add(cnt, cnt, unroll * 2);
    }

    __ ret(lr);

    return start;
  }


  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(Label &start, Register s, Register d, Register count,
                           copy_direction direction) {
    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r10, t6 = r11, t7 = r12;
    const Register stride = r13;

    assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1);

    Label again, drain;
    const char *stub_name;
    if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
    else
      stub_name = "backward_copy_longs";

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", stub_name);

    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, (u1)8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }

    __ bind(again);

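    // prefetch ahead of the copy stream; when copying backwards the prefetch
    // offset is negative and, if it is too large to encode as an immediate,
    // it is passed in the stride register instead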
    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
        __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
      } else {
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        __ stp(t0, t1, Address(d, 2 * unit));
        __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.

      if (direction == copy_forwards) {
        __ sub(s, s, 16);
        __ sub(d, d, 8);
      }

      // Fill 8 registers
      //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 bit block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
      //
      // t0 at offset 0,  t1 at offset 8
      // t2 at offset 16, t3 at offset 24
      // t4 at offset 32, t5 at offset 40
      // t6 at offset 48, t7 at offset 56

      // for backwards copy s was not offset so the register contents
      // are at these offsets into the preceding 64 byte block
      // relative to that original input and so on for each successive
      // preceding 64 byte block when s is updated. this explains the
      // slightly counter-intuitive looking pattern of register usage
      // in the stp instructions for backwards copy.
      //
      // t0 at offset -16, t1 at offset -8
      // t2 at offset -32, t3 at offset -24
      // t4 at offset -48, t5 at offset -40
      // t6 at offset -64, t7 at offset -56

      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));

      __ subs(count, count, 16);
      __ br(Assembler::LO, drain);

      int prefetch = PrefetchCopyIntervalInBytes;
      bool use_stride = false;
      if (direction == copy_backwards) {
        use_stride = prefetch > 256;
        prefetch = -prefetch;
        if (use_stride) __ mov(stride, prefetch);
      }

      __ bind(again);

      if (PrefetchCopyIntervalInBytes > 0)
        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
        // allowing for the offset of -8 the store instructions place
        // registers into the target 64 bit block at the following
        // offsets
        //
        // t0 at offset 0
        // t1 at offset 8,  t2 at offset 16
        // t3 at offset 24, t4 at offset 32
        // t5 at offset 40, t6 at offset 48
        // t7 at offset 56

        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
        // d was not offset when we started so the registers are
        // written into the 64 bit block preceding d with the following
        // offsets
        //
        // t1 at offset -8
        // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
        // t7 at offset -56, t4 at offset -48
        // t6 at offset -64
        //
        // note that this matches the offsets previously noted for the
        // loads

        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
      } else {
        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
      }
      // now we need to copy any remaining part block which may
      // include a 4 word block subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ stp(t1, t2, Address(d, 2 * unit));
          __ str(t3, Address(__ pre(d, 4 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ stp(t3, t0, Address(d, 3 * unit));
          __ str(t2, Address(__ pre(d, 4 * unit)));
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ str(t1, Address(__ pre(d, 2 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ str(t0, Address(__ pre(d, 2 * unit)));
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;
    int unit = wordSize * direction;

    Label Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.
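    // each set bit of the residual count selects one tail transfer: a word
    // if bit (3 - log2(granularity)) is set, then an int, a short and a
    // byte, as permitted by the copy granularity
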
    __ tbz(count, 3 - exact_log2(granularity), Lword);
    __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
    __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }

  Label copy_f, copy_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //

  void copy_memory(bool is_aligned, Register s, Register d,
                   Register count, Register tmp, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    // <= 96 bytes do inline. Direction doesn't matter because we always
    // load all the data before writing anything
    Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
    const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
    const Register send = r17, dend = r18;

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
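    // dispatch on the total byte size: copies of at most 80 bytes (96 when
    // SIMD is used) are handled inline below, everything larger falls
    // through to copy_big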
    __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16/granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64/granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32/granularity));
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 0));
      __ ldpq(v2, v3, Address(send, -32));
      __ stpq(v0, v1, Address(d, 0));
      __ stpq(v2, v3, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(send, -32));
      __ ldp(t6, t7, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(dend, -32));
      __ stp(t6, t7, Address(dend, -16));
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    __ ldp(t0, t1, Address(s, 0));
    __ ldp(t2, t3, Address(send, -16));
    __ stp(t0, t1, Address(d, 0));
    __ stp(t2, t3, Address(dend, -16));
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
      __ ldpq(v4, v5, Address(send, -32));
      __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
      __ stpq(v4, v5, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(s, 32));
      __ ldp(t6, t7, Address(s, 48));
      __ ldp(t8, t9, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(d, 32));
      __ stp(t6, t7, Address(d, 48));
      __ stp(t8, t9, Address(dend, -16));
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, u1(8/granularity));
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    __ ldr(t0, Address(s, 0));
    __ ldr(t1, Address(send, -8));
    __ str(t0, Address(d, 0));
    __ str(t1, Address(dend, -8));
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean in the 1 byte case we load/store the same
          // byte 3 times.
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }

    __ bind(copy_big);
    if (is_backwards) {
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now we've got the small case out of the way we can align the
    // source address on a 2-word boundary.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
      __ tbz(s, exact_log2(wordSize), aligned);
      __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
      __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
      __ sub(count, count, wordSize/granularity);
    } else {
      if (is_backwards) {
        __ andr(rscratch2, s, 2 * wordSize - 1);
      } else {
        __ neg(rscratch2, s);
        __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
      }
      // rscratch2 is the byte adjustment needed to align s.
      __ cbz(rscratch2, aligned);
      int shift = exact_log2(granularity);
      if (shift)  __ lsr(rscratch2, rscratch2, shift);
      __ sub(count, count, rscratch2);

#if 0
      // ?? This code is only correct for a disjoint copy.  It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, rscratch2);
        __ sub(d, d, rscratch2);
      } else {
        __ add(s, s, rscratch2);
        __ add(d, d, rscratch2);
      }
#else
      copy_memory_small(s, d, rscratch2, rscratch1, step);
#endif
    }

    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes.  Adjust the
    // count and do a bulk copy of words.
    __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
    if (direction == copy_forwards)
      __ bl(copy_f);
    else
      __ bl(copy_b);

    // And the tail.
    copy_memory_small(s, d, count, tmp, step);

    if (granularity >= 8) __ bind(copy8);
    if (granularity >= 4) __ bind(copy4);
    __ bind(finish);
  }


  void clobber_registers() {
#ifdef ASSERT
    __ mov(rscratch1, (uint64_t)0xdeadbeef);
    __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
    for (Register r = r3; r <= r18; r++)
      if (r != rscratch1) __ mov(r, rscratch1);
#endif
  }

  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers rscratch1 and rscratch2.
  void verify_oop_array (size_t size, Register a, Register count, Register temp) {
    Label loop, end;
    __ mov(rscratch1, a);
    __ mov(rscratch2, zr);
    __ bind(loop);
    __ cmp(rscratch2, count);
    __ br(Assembler::HS, end);
    if (size == (size_t)wordSize) {
      __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ verify_oop(temp);
    } else {
      __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ decode_heap_oop(temp); // calls verify_oop
    }
    __ add(rscratch2, rscratch2, size);
    __ b(loop);
    __ bind(end);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
                                 const char *name, bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_reg = RegSet::of(s, d, count);
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    {
      // UnsafeCopyMemory page error: continue after ucm
      bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
      UnsafeCopyMemoryMark ucmm(this, add_entry, true);
      copy_memory(aligned, s, d, count, rscratch1, size);
    }

    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
    }

    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());

    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
                                 address *entry, const char *name,
                                 bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_regs = RegSet::of(s, d, count);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // use fwd copy when (d-s) above_equal (count*size)
    __ sub(rscratch1, d, s);
    __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
    __ br(Assembler::HS, nooverlap_target);

    DecoratorSet decorators = IN_HEAP | IS_ARRAY;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    {
      // UnsafeCopyMemory page error: continue after ucm
      bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
      UnsafeCopyMemoryMark ucmm(this, add_entry, true);
      copy_memory(aligned, s, d, count, rscratch1, -size);
    }
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
    }
    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
  //
  address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                      address* entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_short_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_short_copy().
  //
  address generate_disjoint_short_copy(bool aligned,
                                       address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
                                       address *entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_int_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
                                     address *entry, const char *name,
                                     bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
  }


  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_long_copy(bool aligned, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_long_copy(bool aligned,
                                      address nooverlap_target, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_oop_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
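    // an oop element occupies 4 bytes with compressed oops enabled, 8 bytes otherwise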
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_oop_copy(bool aligned,
                                     address nooverlap_target, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
                                  name, dest_uninitialized);
  }


  // Helper for generating a dynamic type check.
  // Smashes rscratch1, rscratch2.
  void generate_type_check(Register sub_klass,
                           Register super_check_offset,
                           Register super_klass,
                           Label& L_success) {
    assert_different_registers(sub_klass, super_check_offset, super_klass);

    BLOCK_COMMENT("type_check:");

    Label L_miss;

    __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
                                     super_check_offset);
    __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);

    // Fall through on failure!
    __ BIND(L_miss);
  }

  //
  //  Generate checkcasting array copy stub
  //
  //  Input:
  //    c_rarg0   - source array address
  //    c_rarg1   - destination array address
  //    c_rarg2   - element count, treated as ssize_t, can be zero
  //    c_rarg3   - size_t ckoff (super_check_offset)
  //    c_rarg4   - oop ckval (super_klass)
  //
  //  Output:
  //    r0 ==  0  -  success
  //    r0 == -1^K - failure, where K is partial transfer count
  //
  address generate_checkcast_copy(const char *name, address *entry,
                                  bool dest_uninitialized = false) {

    Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;

    // Input registers (after setup_arg_regs)
    const Register from        = c_rarg0;   // source array address
    const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // elements count
    const Register ckoff       = c_rarg3;   // super_check_offset
    const Register ckval       = c_rarg4;   // super_klass

    RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
    RegSet wb_post_saved_regs = RegSet::of(count);

    // Registers used as temps (r18, r19, r20 are save-on-entry)
    const Register count_save  = r21;       // orig elements count
    const Register start_to    = r20;       // destination array start address
    const Register copied_oop  = r18;       // actual oop copied
    const Register r19_klass   = r19;       // oop._klass

    //---------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the two arrays are subtypes of Object[] but the
    // destination array type is not equal to or a supertype
    // of the source type.  Each element must be separately
    // checked.
1734 1735 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1736 copied_oop, r19_klass, count_save); 1737 1738 __ align(CodeEntryAlignment); 1739 StubCodeMark mark(this, "StubRoutines", name); 1740 address start = __ pc(); 1741 1742 __ enter(); // required for proper stackwalking of RuntimeStub frame 1743 1744 #ifdef ASSERT 1745 // caller guarantees that the arrays really are different 1746 // otherwise, we would have to make conjoint checks 1747 { Label L; 1748 array_overlap_test(L, TIMES_OOP); 1749 __ stop("checkcast_copy within a single array"); 1750 __ bind(L); 1751 } 1752 #endif //ASSERT 1753 1754 // Caller of this entry point must set up the argument registers. 1755 if (entry != NULL) { 1756 *entry = __ pc(); 1757 BLOCK_COMMENT("Entry:"); 1758 } 1759 1760 // Empty array: Nothing to do. 1761 __ cbz(count, L_done); 1762 1763 __ push(RegSet::of(r18, r19, r20, r21), sp); 1764 1765 #ifdef ASSERT 1766 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1767 // The ckoff and ckval must be mutually consistent, 1768 // even though caller generates both. 1769 { Label L; 1770 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1771 __ ldrw(start_to, Address(ckval, sco_offset)); 1772 __ cmpw(ckoff, start_to); 1773 __ br(Assembler::EQ, L); 1774 __ stop("super_check_offset inconsistent"); 1775 __ bind(L); 1776 } 1777 #endif //ASSERT 1778 1779 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 1780 bool is_oop = true; 1781 if (dest_uninitialized) { 1782 decorators |= IS_DEST_UNINITIALIZED; 1783 } 1784 1785 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1786 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs); 1787 1788 // save the original count 1789 __ mov(count_save, count); 1790 1791 // Copy from low to high addresses 1792 __ mov(start_to, to); // Save destination array start address 1793 __ b(L_load_element); 1794 1795 // ======== begin loop ======== 1796 // (Loop is rotated; its entry is L_load_element.) 1797 // Loop control: 1798 // for (; count != 0; count--) { 1799 // copied_oop = load_heap_oop(from++); 1800 // ... generate_type_check ...; 1801 // store_heap_oop(to++, copied_oop); 1802 // } 1803 __ align(OptoLoopAlignment); 1804 1805 __ BIND(L_store_element); 1806 __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW); // store the oop 1807 __ sub(count, count, 1); 1808 __ cbz(count, L_do_card_marks); 1809 1810 // ======== loop entry is here ======== 1811 __ BIND(L_load_element); 1812 __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop 1813 __ cbz(copied_oop, L_store_element); 1814 1815 __ load_klass(r19_klass, copied_oop);// query the object klass 1816 generate_type_check(r19_klass, ckoff, ckval, L_store_element); 1817 // ======== end loop ======== 1818 1819 // It was a real error; we must depend on the caller to finish the job. 1820 // Register count = remaining oops, count_orig = total oops. 1821 // Emit GC store barriers for the oops we have copied and report 1822 // their number to the caller. 
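  //
  // Worked example of the encoding produced below: suppose count_save == 10 and
  // the type check failed on the 4th element, so 3 oops were stored and
  // count == 7. Then subs gives K = count_save - count = 3, and
  // eon(count, count, zr) computes ~3 == -4 == -1^3, which is what ends up in
  // r0; the caller recovers the partial transfer count as ~r0. If nothing at
  // all was copied (K == 0) the EQ branch below skips the card marks entirely.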
1823 1824 __ subs(count, count_save, count); // K = partially copied oop count 1825 __ eon(count, count, zr); // report (-1^K) to caller 1826 __ br(Assembler::EQ, L_done_pop); 1827 1828 __ BIND(L_do_card_marks); 1829 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs); 1830 1831 __ bind(L_done_pop); 1832 __ pop(RegSet::of(r18, r19, r20, r21), sp); 1833 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 1834 1835 __ bind(L_done); 1836 __ mov(r0, count); 1837 __ leave(); 1838 __ ret(lr); 1839 1840 return start; 1841 } 1842 1843 // Perform range checks on the proposed arraycopy. 1844 // Kills temp, but nothing else. 1845 // Also, clean the sign bits of src_pos and dst_pos. 1846 void arraycopy_range_checks(Register src, // source array oop (c_rarg0) 1847 Register src_pos, // source position (c_rarg1) 1848 Register dst, // destination array oo (c_rarg2) 1849 Register dst_pos, // destination position (c_rarg3) 1850 Register length, 1851 Register temp, 1852 Label& L_failed) { 1853 BLOCK_COMMENT("arraycopy_range_checks:"); 1854 1855 assert_different_registers(rscratch1, temp); 1856 1857 // if (src_pos + length > arrayOop(src)->length()) FAIL; 1858 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes())); 1859 __ addw(temp, length, src_pos); 1860 __ cmpw(temp, rscratch1); 1861 __ br(Assembler::HI, L_failed); 1862 1863 // if (dst_pos + length > arrayOop(dst)->length()) FAIL; 1864 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes())); 1865 __ addw(temp, length, dst_pos); 1866 __ cmpw(temp, rscratch1); 1867 __ br(Assembler::HI, L_failed); 1868 1869 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'. 1870 __ movw(src_pos, src_pos); 1871 __ movw(dst_pos, dst_pos); 1872 1873 BLOCK_COMMENT("arraycopy_range_checks done"); 1874 } 1875 1876 // These stubs get called from some dumb test routine. 1877 // I'll write them properly when they're called from 1878 // something that's actually doing something. 1879 static void fake_arraycopy_stub(address src, address dst, int count) { 1880 assert(count == 0, "huh?"); 1881 } 1882 1883 1884 // 1885 // Generate 'unsafe' array copy stub 1886 // Though just as safe as the other stubs, it takes an unscaled 1887 // size_t argument instead of an element count. 1888 // 1889 // Input: 1890 // c_rarg0 - source array address 1891 // c_rarg1 - destination array address 1892 // c_rarg2 - byte count, treated as ssize_t, can be zero 1893 // 1894 // Examines the alignment of the operands and dispatches 1895 // to a long, int, short, or byte copy loop. 
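  //
  // A sketch of the dispatch (the stub ORs the three operands together, so one
  // value carries the worst-case misalignment of source, destination and byte
  // count at once):
  //
  //   size_t bits = (size_t)s | (size_t)d | (size_t)count;
  //   if      ((bits & 7) == 0) long_copy (count >> 3);   // 8-byte aligned
  //   else if ((bits & 3) == 0) int_copy  (count >> 2);
  //   else if ((bits & 1) == 0) short_copy(count >> 1);
  //   else                      byte_copy (count);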
1896 // 1897 address generate_unsafe_copy(const char *name, 1898 address byte_copy_entry, 1899 address short_copy_entry, 1900 address int_copy_entry, 1901 address long_copy_entry) { 1902 Label L_long_aligned, L_int_aligned, L_short_aligned; 1903 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1904 1905 __ align(CodeEntryAlignment); 1906 StubCodeMark mark(this, "StubRoutines", name); 1907 address start = __ pc(); 1908 __ enter(); // required for proper stackwalking of RuntimeStub frame 1909 1910 // bump this on entry, not on exit: 1911 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 1912 1913 __ orr(rscratch1, s, d); 1914 __ orr(rscratch1, rscratch1, count); 1915 1916 __ andr(rscratch1, rscratch1, BytesPerLong-1); 1917 __ cbz(rscratch1, L_long_aligned); 1918 __ andr(rscratch1, rscratch1, BytesPerInt-1); 1919 __ cbz(rscratch1, L_int_aligned); 1920 __ tbz(rscratch1, 0, L_short_aligned); 1921 __ b(RuntimeAddress(byte_copy_entry)); 1922 1923 __ BIND(L_short_aligned); 1924 __ lsr(count, count, LogBytesPerShort); // size => short_count 1925 __ b(RuntimeAddress(short_copy_entry)); 1926 __ BIND(L_int_aligned); 1927 __ lsr(count, count, LogBytesPerInt); // size => int_count 1928 __ b(RuntimeAddress(int_copy_entry)); 1929 __ BIND(L_long_aligned); 1930 __ lsr(count, count, LogBytesPerLong); // size => long_count 1931 __ b(RuntimeAddress(long_copy_entry)); 1932 1933 return start; 1934 } 1935 1936 // 1937 // Generate generic array copy stubs 1938 // 1939 // Input: 1940 // c_rarg0 - src oop 1941 // c_rarg1 - src_pos (32-bits) 1942 // c_rarg2 - dst oop 1943 // c_rarg3 - dst_pos (32-bits) 1944 // c_rarg4 - element count (32-bits) 1945 // 1946 // Output: 1947 // r0 == 0 - success 1948 // r0 == -1^K - failure, where K is partial transfer count 1949 // 1950 address generate_generic_copy(const char *name, 1951 address byte_copy_entry, address short_copy_entry, 1952 address int_copy_entry, address oop_copy_entry, 1953 address long_copy_entry, address checkcast_copy_entry) { 1954 1955 Label L_failed, L_objArray; 1956 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 1957 1958 // Input registers 1959 const Register src = c_rarg0; // source array oop 1960 const Register src_pos = c_rarg1; // source position 1961 const Register dst = c_rarg2; // destination array oop 1962 const Register dst_pos = c_rarg3; // destination position 1963 const Register length = c_rarg4; 1964 1965 1966 // Registers used as temps 1967 const Register dst_klass = c_rarg5; 1968 1969 __ align(CodeEntryAlignment); 1970 1971 StubCodeMark mark(this, "StubRoutines", name); 1972 1973 address start = __ pc(); 1974 1975 __ enter(); // required for proper stackwalking of RuntimeStub frame 1976 1977 // bump this on entry, not on exit: 1978 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 1979 1980 //----------------------------------------------------------------------- 1981 // Assembler stub will be used for this call to arraycopy 1982 // if the following conditions are met: 1983 // 1984 // (1) src and dst must not be null. 1985 // (2) src_pos must not be negative. 1986 // (3) dst_pos must not be negative. 1987 // (4) length must not be negative. 1988 // (5) src klass and dst klass should be the same and not NULL. 1989 // (6) src and dst should be arrays. 1990 // (7) src_pos + length must not exceed length of src. 1991 // (8) dst_pos + length must not exceed length of dst. 
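  //
  // If any of these checks fails, the stub branches to L_failed and returns -1
  // in r0 without copying anything, leaving it to the caller to handle the
  // arguments some slower way. For example, a call with src_pos == -1 fails
  // check (2): the tbnz below sees the sign bit (bit 31) set and jumps straight
  // to L_failed.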
1992 // 1993 1994 // if (src == NULL) return -1; 1995 __ cbz(src, L_failed); 1996 1997 // if (src_pos < 0) return -1; 1998 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 1999 2000 // if (dst == NULL) return -1; 2001 __ cbz(dst, L_failed); 2002 2003 // if (dst_pos < 0) return -1; 2004 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2005 2006 // registers used as temp 2007 const Register scratch_length = r16; // elements count to copy 2008 const Register scratch_src_klass = r17; // array klass 2009 const Register lh = r18; // layout helper 2010 2011 // if (length < 0) return -1; 2012 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2013 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2014 2015 __ load_klass(scratch_src_klass, src); 2016 #ifdef ASSERT 2017 // assert(src->klass() != NULL); 2018 { 2019 BLOCK_COMMENT("assert klasses not null {"); 2020 Label L1, L2; 2021 __ cbnz(scratch_src_klass, L2); // it is broken if klass is NULL 2022 __ bind(L1); 2023 __ stop("broken null klass"); 2024 __ bind(L2); 2025 __ load_klass(rscratch1, dst); 2026 __ cbz(rscratch1, L1); // this would be broken also 2027 BLOCK_COMMENT("} assert klasses not null done"); 2028 } 2029 #endif 2030 2031 // Load layout helper (32-bits) 2032 // 2033 // |array_tag| | header_size | element_type | |log2_element_size| 2034 // 32 30 24 16 8 2 0 2035 // 2036 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2037 // 2038 2039 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2040 2041 // Handle objArrays completely differently... 2042 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2043 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2044 __ movw(rscratch1, objArray_lh); 2045 __ eorw(rscratch2, lh, rscratch1); 2046 __ cbzw(rscratch2, L_objArray); 2047 2048 // if (src->klass() != dst->klass()) return -1; 2049 __ load_klass(rscratch2, dst); 2050 __ eor(rscratch2, rscratch2, scratch_src_klass); 2051 __ cbnz(rscratch2, L_failed); 2052 2053 // if (!src->is_Array()) return -1; 2054 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2055 2056 // At this point, it is known to be a typeArray (array_tag 0x3). 
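  // The stub only needs two fields out of the layout helper:
  //
  //   header size        = (lh >> _lh_header_size_shift) & _lh_header_size_mask
  //                        -- byte offset of element 0, extracted with ubfx below
  //   log2(element size) = lh & _lh_log2_element_size_mask
  //                        -- 0, 1, 2 or 3 for byte, short, int and long arrays
  //
  // For example, when copying from an int[] the log2 element size is 2, so the
  // source address works out to src + header_size + (src_pos << 2), which is
  // exactly what the lea in the L_copy_ints case below computes.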
2057 #ifdef ASSERT 2058 { 2059 BLOCK_COMMENT("assert primitive array {"); 2060 Label L; 2061 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2062 __ cmpw(lh, rscratch2); 2063 __ br(Assembler::GE, L); 2064 __ stop("must be a primitive array"); 2065 __ bind(L); 2066 BLOCK_COMMENT("} assert primitive array done"); 2067 } 2068 #endif 2069 2070 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2071 rscratch2, L_failed); 2072 2073 // TypeArrayKlass 2074 // 2075 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2076 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2077 // 2078 2079 const Register rscratch1_offset = rscratch1; // array offset 2080 const Register r18_elsize = lh; // element size 2081 2082 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2083 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2084 __ add(src, src, rscratch1_offset); // src array offset 2085 __ add(dst, dst, rscratch1_offset); // dst array offset 2086 BLOCK_COMMENT("choose copy loop based on element size"); 2087 2088 // next registers should be set before the jump to corresponding stub 2089 const Register from = c_rarg0; // source array address 2090 const Register to = c_rarg1; // destination array address 2091 const Register count = c_rarg2; // elements count 2092 2093 // 'from', 'to', 'count' registers should be set in such order 2094 // since they are the same as 'src', 'src_pos', 'dst'. 2095 2096 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2097 2098 // The possible values of elsize are 0-3, i.e. exact_log2(element 2099 // size in bytes). We do a simple bitwise binary search. 2100 __ BIND(L_copy_bytes); 2101 __ tbnz(r18_elsize, 1, L_copy_ints); 2102 __ tbnz(r18_elsize, 0, L_copy_shorts); 2103 __ lea(from, Address(src, src_pos));// src_addr 2104 __ lea(to, Address(dst, dst_pos));// dst_addr 2105 __ movw(count, scratch_length); // length 2106 __ b(RuntimeAddress(byte_copy_entry)); 2107 2108 __ BIND(L_copy_shorts); 2109 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2110 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2111 __ movw(count, scratch_length); // length 2112 __ b(RuntimeAddress(short_copy_entry)); 2113 2114 __ BIND(L_copy_ints); 2115 __ tbnz(r18_elsize, 0, L_copy_longs); 2116 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2117 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2118 __ movw(count, scratch_length); // length 2119 __ b(RuntimeAddress(int_copy_entry)); 2120 2121 __ BIND(L_copy_longs); 2122 #ifdef ASSERT 2123 { 2124 BLOCK_COMMENT("assert long copy {"); 2125 Label L; 2126 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize 2127 __ cmpw(r18_elsize, LogBytesPerLong); 2128 __ br(Assembler::EQ, L); 2129 __ stop("must be long copy, but elsize is wrong"); 2130 __ bind(L); 2131 BLOCK_COMMENT("} assert long copy done"); 2132 } 2133 #endif 2134 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2135 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2136 __ movw(count, scratch_length); // length 2137 __ b(RuntimeAddress(long_copy_entry)); 2138 2139 // ObjArrayKlass 2140 __ BIND(L_objArray); 2141 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2142 2143 Label L_plain_copy, L_checkcast_copy; 2144 // test array classes for subtyping 2145 __ load_klass(r18, dst); 2146 __ cmp(scratch_src_klass, r18); // usual case is exact 
equality 2147 __ br(Assembler::NE, L_checkcast_copy); 2148 2149 // Identically typed arrays can be copied without element-wise checks. 2150 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2151 rscratch2, L_failed); 2152 2153 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2154 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2155 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2156 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2157 __ movw(count, scratch_length); // length 2158 __ BIND(L_plain_copy); 2159 __ b(RuntimeAddress(oop_copy_entry)); 2160 2161 __ BIND(L_checkcast_copy); 2162 // live at this point: scratch_src_klass, scratch_length, r18 (dst_klass) 2163 { 2164 // Before looking at dst.length, make sure dst is also an objArray. 2165 __ ldrw(rscratch1, Address(r18, lh_offset)); 2166 __ movw(rscratch2, objArray_lh); 2167 __ eorw(rscratch1, rscratch1, rscratch2); 2168 __ cbnzw(rscratch1, L_failed); 2169 2170 // It is safe to examine both src.length and dst.length. 2171 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2172 r18, L_failed); 2173 2174 __ load_klass(dst_klass, dst); // reload 2175 2176 // Marshal the base address arguments now, freeing registers. 2177 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2178 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2179 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2180 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2181 __ movw(count, length); // length (reloaded) 2182 Register sco_temp = c_rarg3; // this register is free now 2183 assert_different_registers(from, to, count, sco_temp, 2184 dst_klass, scratch_src_klass); 2185 // assert_clean_int(count, sco_temp); 2186 2187 // Generate the type check. 2188 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2189 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2190 2191 // Smashes rscratch1, rscratch2 2192 generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy); 2193 2194 // Fetch destination element klass from the ObjArrayKlass header. 2195 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2196 __ ldr(dst_klass, Address(dst_klass, ek_offset)); 2197 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2198 2199 // the checkcast_copy loop needs two extra arguments: 2200 assert(c_rarg3 == sco_temp, "#3 already in place"); 2201 // Set up arguments for checkcast_copy_entry. 2202 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass 2203 __ b(RuntimeAddress(checkcast_copy_entry)); 2204 } 2205 2206 __ BIND(L_failed); 2207 __ mov(r0, -1); 2208 __ leave(); // required for proper stackwalking of RuntimeStub frame 2209 __ ret(lr); 2210 2211 return start; 2212 } 2213 2214 // 2215 // Generate stub for array fill. If "aligned" is true, the 2216 // "to" address is assumed to be heapword aligned. 
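  // Regardless of the element type, the fill value is first widened to 64 bits
  // so the main loop can store a doubleword at a time; e.g. a T_BYTE value 0x5A
  // becomes, via the bfi sequence below,
  //   0x5A -> 0x5A5A -> 0x5A5A5A5A -> 0x5A5A5A5A5A5A5A5A.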
2217 // 2218 // Arguments for generated stub: 2219 // to: c_rarg0 2220 // value: c_rarg1 2221 // count: c_rarg2 treated as signed 2222 // 2223 address generate_fill(BasicType t, bool aligned, const char *name) { 2224 __ align(CodeEntryAlignment); 2225 StubCodeMark mark(this, "StubRoutines", name); 2226 address start = __ pc(); 2227 2228 BLOCK_COMMENT("Entry:"); 2229 2230 const Register to = c_rarg0; // source array address 2231 const Register value = c_rarg1; // value 2232 const Register count = c_rarg2; // elements count 2233 2234 const Register bz_base = r10; // base for block_zero routine 2235 const Register cnt_words = r11; // temp register 2236 2237 __ enter(); 2238 2239 Label L_fill_elements, L_exit1; 2240 2241 int shift = -1; 2242 switch (t) { 2243 case T_BYTE: 2244 shift = 0; 2245 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2246 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2247 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2248 __ br(Assembler::LO, L_fill_elements); 2249 break; 2250 case T_SHORT: 2251 shift = 1; 2252 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2253 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2254 __ br(Assembler::LO, L_fill_elements); 2255 break; 2256 case T_INT: 2257 shift = 2; 2258 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2259 __ br(Assembler::LO, L_fill_elements); 2260 break; 2261 default: ShouldNotReachHere(); 2262 } 2263 2264 // Align source address at 8 bytes address boundary. 2265 Label L_skip_align1, L_skip_align2, L_skip_align4; 2266 if (!aligned) { 2267 switch (t) { 2268 case T_BYTE: 2269 // One byte misalignment happens only for byte arrays. 2270 __ tbz(to, 0, L_skip_align1); 2271 __ strb(value, Address(__ post(to, 1))); 2272 __ subw(count, count, 1); 2273 __ bind(L_skip_align1); 2274 // Fallthrough 2275 case T_SHORT: 2276 // Two bytes misalignment happens only for byte and short (char) arrays. 2277 __ tbz(to, 1, L_skip_align2); 2278 __ strh(value, Address(__ post(to, 2))); 2279 __ subw(count, count, 2 >> shift); 2280 __ bind(L_skip_align2); 2281 // Fallthrough 2282 case T_INT: 2283 // Align to 8 bytes, we know we are 4 byte aligned to start. 2284 __ tbz(to, 2, L_skip_align4); 2285 __ strw(value, Address(__ post(to, 4))); 2286 __ subw(count, count, 4 >> shift); 2287 __ bind(L_skip_align4); 2288 break; 2289 default: ShouldNotReachHere(); 2290 } 2291 } 2292 2293 // 2294 // Fill large chunks 2295 // 2296 __ lsrw(cnt_words, count, 3 - shift); // number of words 2297 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2298 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2299 if (UseBlockZeroing) { 2300 Label non_block_zeroing, rest; 2301 // If the fill value is zero we can use the fast zero_words(). 2302 __ cbnz(value, non_block_zeroing); 2303 __ mov(bz_base, to); 2304 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2305 __ zero_words(bz_base, cnt_words); 2306 __ b(rest); 2307 __ bind(non_block_zeroing); 2308 __ fill_words(to, cnt_words, value); 2309 __ bind(rest); 2310 } else { 2311 __ fill_words(to, cnt_words, value); 2312 } 2313 2314 // Remaining count is less than 8 bytes. Fill it by a single store. 2315 // Note that the total length is no less than 8 bytes. 
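  // Example (assuming the destination was already 8-byte aligned): filling 13
  // shorts. cnt_words == 3, so fill_words writes 24 bytes and count is left at
  // 1. The code below then advances 'to' past the end of the array and stores
  // one more doubleword at (end - 8), rewriting the last three already-filled
  // shorts together with the remaining one -- hence the note above that the
  // total length must be at least 8 bytes.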
2316 if (t == T_BYTE || t == T_SHORT) { 2317 Label L_exit1; 2318 __ cbzw(count, L_exit1); 2319 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2320 __ str(value, Address(to, -8)); // overwrite some elements 2321 __ bind(L_exit1); 2322 __ leave(); 2323 __ ret(lr); 2324 } 2325 2326 // Handle copies less than 8 bytes. 2327 Label L_fill_2, L_fill_4, L_exit2; 2328 __ bind(L_fill_elements); 2329 switch (t) { 2330 case T_BYTE: 2331 __ tbz(count, 0, L_fill_2); 2332 __ strb(value, Address(__ post(to, 1))); 2333 __ bind(L_fill_2); 2334 __ tbz(count, 1, L_fill_4); 2335 __ strh(value, Address(__ post(to, 2))); 2336 __ bind(L_fill_4); 2337 __ tbz(count, 2, L_exit2); 2338 __ strw(value, Address(to)); 2339 break; 2340 case T_SHORT: 2341 __ tbz(count, 0, L_fill_4); 2342 __ strh(value, Address(__ post(to, 2))); 2343 __ bind(L_fill_4); 2344 __ tbz(count, 1, L_exit2); 2345 __ strw(value, Address(to)); 2346 break; 2347 case T_INT: 2348 __ cbzw(count, L_exit2); 2349 __ strw(value, Address(to)); 2350 break; 2351 default: ShouldNotReachHere(); 2352 } 2353 __ bind(L_exit2); 2354 __ leave(); 2355 __ ret(lr); 2356 return start; 2357 } 2358 2359 address generate_data_cache_writeback() { 2360 const Register line = c_rarg0; // address of line to write back 2361 2362 __ align(CodeEntryAlignment); 2363 2364 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback"); 2365 2366 address start = __ pc(); 2367 __ enter(); 2368 __ cache_wb(Address(line, 0)); 2369 __ leave(); 2370 __ ret(lr); 2371 2372 return start; 2373 } 2374 2375 address generate_data_cache_writeback_sync() { 2376 const Register is_pre = c_rarg0; // pre or post sync 2377 2378 __ align(CodeEntryAlignment); 2379 2380 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync"); 2381 2382 // pre wbsync is a no-op 2383 // post wbsync translates to an sfence 2384 2385 Label skip; 2386 address start = __ pc(); 2387 __ enter(); 2388 __ cbnz(is_pre, skip); 2389 __ cache_wbsync(false); 2390 __ bind(skip); 2391 __ leave(); 2392 __ ret(lr); 2393 2394 return start; 2395 } 2396 2397 void generate_arraycopy_stubs() { 2398 address entry; 2399 address entry_jbyte_arraycopy; 2400 address entry_jshort_arraycopy; 2401 address entry_jint_arraycopy; 2402 address entry_oop_arraycopy; 2403 address entry_jlong_arraycopy; 2404 address entry_checkcast_arraycopy; 2405 2406 generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards); 2407 generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards); 2408 2409 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2410 2411 //*** jbyte 2412 // Always need aligned and unaligned versions 2413 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2414 "jbyte_disjoint_arraycopy"); 2415 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2416 &entry_jbyte_arraycopy, 2417 "jbyte_arraycopy"); 2418 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2419 "arrayof_jbyte_disjoint_arraycopy"); 2420 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL, 2421 "arrayof_jbyte_arraycopy"); 2422 2423 //*** jshort 2424 // Always need aligned and unaligned versions 2425 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2426 "jshort_disjoint_arraycopy"); 2427 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2428 &entry_jshort_arraycopy, 2429 "jshort_arraycopy"); 2430 StubRoutines::_arrayof_jshort_disjoint_arraycopy = 
generate_disjoint_short_copy(true, &entry, 2431 "arrayof_jshort_disjoint_arraycopy"); 2432 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL, 2433 "arrayof_jshort_arraycopy"); 2434 2435 //*** jint 2436 // Aligned versions 2437 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2438 "arrayof_jint_disjoint_arraycopy"); 2439 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2440 "arrayof_jint_arraycopy"); 2441 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 2442 // entry_jint_arraycopy always points to the unaligned version 2443 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2444 "jint_disjoint_arraycopy"); 2445 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2446 &entry_jint_arraycopy, 2447 "jint_arraycopy"); 2448 2449 //*** jlong 2450 // It is always aligned 2451 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2452 "arrayof_jlong_disjoint_arraycopy"); 2453 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2454 "arrayof_jlong_arraycopy"); 2455 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2456 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2457 2458 //*** oops 2459 { 2460 // With compressed oops we need unaligned versions; notice that 2461 // we overwrite entry_oop_arraycopy. 2462 bool aligned = !UseCompressedOops; 2463 2464 StubRoutines::_arrayof_oop_disjoint_arraycopy 2465 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2466 /*dest_uninitialized*/false); 2467 StubRoutines::_arrayof_oop_arraycopy 2468 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2469 /*dest_uninitialized*/false); 2470 // Aligned versions without pre-barriers 2471 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2472 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2473 /*dest_uninitialized*/true); 2474 StubRoutines::_arrayof_oop_arraycopy_uninit 2475 = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit", 2476 /*dest_uninitialized*/true); 2477 } 2478 2479 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2480 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2481 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2482 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2483 2484 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2485 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, 2486 /*dest_uninitialized*/true); 2487 2488 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2489 entry_jbyte_arraycopy, 2490 entry_jshort_arraycopy, 2491 entry_jint_arraycopy, 2492 entry_jlong_arraycopy); 2493 2494 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2495 entry_jbyte_arraycopy, 2496 entry_jshort_arraycopy, 2497 entry_jint_arraycopy, 2498 entry_oop_arraycopy, 2499 entry_jlong_arraycopy, 2500 entry_checkcast_arraycopy); 2501 2502 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 
2503 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2504 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2505 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2506 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 2507 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2508 } 2509 2510 void generate_math_stubs() { Unimplemented(); } 2511 2512 // Arguments: 2513 // 2514 // Inputs: 2515 // c_rarg0 - source byte array address 2516 // c_rarg1 - destination byte array address 2517 // c_rarg2 - K (key) in little endian int array 2518 // 2519 address generate_aescrypt_encryptBlock() { 2520 __ align(CodeEntryAlignment); 2521 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 2522 2523 Label L_doLast; 2524 2525 const Register from = c_rarg0; // source array address 2526 const Register to = c_rarg1; // destination array address 2527 const Register key = c_rarg2; // key array address 2528 const Register keylen = rscratch1; 2529 2530 address start = __ pc(); 2531 __ enter(); 2532 2533 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2534 2535 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2536 2537 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2538 __ rev32(v1, __ T16B, v1); 2539 __ rev32(v2, __ T16B, v2); 2540 __ rev32(v3, __ T16B, v3); 2541 __ rev32(v4, __ T16B, v4); 2542 __ aese(v0, v1); 2543 __ aesmc(v0, v0); 2544 __ aese(v0, v2); 2545 __ aesmc(v0, v0); 2546 __ aese(v0, v3); 2547 __ aesmc(v0, v0); 2548 __ aese(v0, v4); 2549 __ aesmc(v0, v0); 2550 2551 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2552 __ rev32(v1, __ T16B, v1); 2553 __ rev32(v2, __ T16B, v2); 2554 __ rev32(v3, __ T16B, v3); 2555 __ rev32(v4, __ T16B, v4); 2556 __ aese(v0, v1); 2557 __ aesmc(v0, v0); 2558 __ aese(v0, v2); 2559 __ aesmc(v0, v0); 2560 __ aese(v0, v3); 2561 __ aesmc(v0, v0); 2562 __ aese(v0, v4); 2563 __ aesmc(v0, v0); 2564 2565 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2566 __ rev32(v1, __ T16B, v1); 2567 __ rev32(v2, __ T16B, v2); 2568 2569 __ cmpw(keylen, 44); 2570 __ br(Assembler::EQ, L_doLast); 2571 2572 __ aese(v0, v1); 2573 __ aesmc(v0, v0); 2574 __ aese(v0, v2); 2575 __ aesmc(v0, v0); 2576 2577 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2578 __ rev32(v1, __ T16B, v1); 2579 __ rev32(v2, __ T16B, v2); 2580 2581 __ cmpw(keylen, 52); 2582 __ br(Assembler::EQ, L_doLast); 2583 2584 __ aese(v0, v1); 2585 __ aesmc(v0, v0); 2586 __ aese(v0, v2); 2587 __ aesmc(v0, v0); 2588 2589 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2590 __ rev32(v1, __ T16B, v1); 2591 __ rev32(v2, __ T16B, v2); 2592 2593 __ BIND(L_doLast); 2594 2595 __ aese(v0, v1); 2596 __ aesmc(v0, v0); 2597 __ aese(v0, v2); 2598 2599 __ ld1(v1, __ T16B, key); 2600 __ rev32(v1, __ T16B, v1); 2601 __ eor(v0, __ T16B, v0, v1); 2602 2603 __ st1(v0, __ T16B, to); 2604 2605 __ mov(r0, 0); 2606 2607 __ leave(); 2608 __ ret(lr); 2609 2610 return start; 2611 } 2612 2613 // Arguments: 2614 // 2615 // Inputs: 2616 // c_rarg0 - source byte array address 2617 // c_rarg1 - destination byte array address 2618 // c_rarg2 - K (key) in little endian int array 2619 // 2620 address generate_aescrypt_decryptBlock() { 2621 assert(UseAES, "need AES instructions and misaligned SSE support"); 2622 __ align(CodeEntryAlignment); 2623 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 2624 Label L_doLast; 2625 2626 const 
Register from = c_rarg0; // source array address 2627 const Register to = c_rarg1; // destination array address 2628 const Register key = c_rarg2; // key array address 2629 const Register keylen = rscratch1; 2630 2631 address start = __ pc(); 2632 __ enter(); // required for proper stackwalking of RuntimeStub frame 2633 2634 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2635 2636 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2637 2638 __ ld1(v5, __ T16B, __ post(key, 16)); 2639 __ rev32(v5, __ T16B, v5); 2640 2641 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2642 __ rev32(v1, __ T16B, v1); 2643 __ rev32(v2, __ T16B, v2); 2644 __ rev32(v3, __ T16B, v3); 2645 __ rev32(v4, __ T16B, v4); 2646 __ aesd(v0, v1); 2647 __ aesimc(v0, v0); 2648 __ aesd(v0, v2); 2649 __ aesimc(v0, v0); 2650 __ aesd(v0, v3); 2651 __ aesimc(v0, v0); 2652 __ aesd(v0, v4); 2653 __ aesimc(v0, v0); 2654 2655 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2656 __ rev32(v1, __ T16B, v1); 2657 __ rev32(v2, __ T16B, v2); 2658 __ rev32(v3, __ T16B, v3); 2659 __ rev32(v4, __ T16B, v4); 2660 __ aesd(v0, v1); 2661 __ aesimc(v0, v0); 2662 __ aesd(v0, v2); 2663 __ aesimc(v0, v0); 2664 __ aesd(v0, v3); 2665 __ aesimc(v0, v0); 2666 __ aesd(v0, v4); 2667 __ aesimc(v0, v0); 2668 2669 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2670 __ rev32(v1, __ T16B, v1); 2671 __ rev32(v2, __ T16B, v2); 2672 2673 __ cmpw(keylen, 44); 2674 __ br(Assembler::EQ, L_doLast); 2675 2676 __ aesd(v0, v1); 2677 __ aesimc(v0, v0); 2678 __ aesd(v0, v2); 2679 __ aesimc(v0, v0); 2680 2681 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2682 __ rev32(v1, __ T16B, v1); 2683 __ rev32(v2, __ T16B, v2); 2684 2685 __ cmpw(keylen, 52); 2686 __ br(Assembler::EQ, L_doLast); 2687 2688 __ aesd(v0, v1); 2689 __ aesimc(v0, v0); 2690 __ aesd(v0, v2); 2691 __ aesimc(v0, v0); 2692 2693 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2694 __ rev32(v1, __ T16B, v1); 2695 __ rev32(v2, __ T16B, v2); 2696 2697 __ BIND(L_doLast); 2698 2699 __ aesd(v0, v1); 2700 __ aesimc(v0, v0); 2701 __ aesd(v0, v2); 2702 2703 __ eor(v0, __ T16B, v0, v5); 2704 2705 __ st1(v0, __ T16B, to); 2706 2707 __ mov(r0, 0); 2708 2709 __ leave(); 2710 __ ret(lr); 2711 2712 return start; 2713 } 2714 2715 // Arguments: 2716 // 2717 // Inputs: 2718 // c_rarg0 - source byte array address 2719 // c_rarg1 - destination byte array address 2720 // c_rarg2 - K (key) in little endian int array 2721 // c_rarg3 - r vector byte array address 2722 // c_rarg4 - input length 2723 // 2724 // Output: 2725 // x0 - input length 2726 // 2727 address generate_cipherBlockChaining_encryptAESCrypt() { 2728 assert(UseAES, "need AES instructions and misaligned SSE support"); 2729 __ align(CodeEntryAlignment); 2730 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 2731 2732 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2733 2734 const Register from = c_rarg0; // source array address 2735 const Register to = c_rarg1; // destination array address 2736 const Register key = c_rarg2; // key array address 2737 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2738 // and left with the results of the last encryption block 2739 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2740 const Register keylen = rscratch1; 2741 2742 address start = __ pc(); 2743 2744 __ enter(); 2745 2746 __ movw(rscratch2, len_reg); 2747 2748 __ ldrw(keylen, Address(key, 
arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2749 2750 __ ld1(v0, __ T16B, rvec); 2751 2752 __ cmpw(keylen, 52); 2753 __ br(Assembler::CC, L_loadkeys_44); 2754 __ br(Assembler::EQ, L_loadkeys_52); 2755 2756 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2757 __ rev32(v17, __ T16B, v17); 2758 __ rev32(v18, __ T16B, v18); 2759 __ BIND(L_loadkeys_52); 2760 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2761 __ rev32(v19, __ T16B, v19); 2762 __ rev32(v20, __ T16B, v20); 2763 __ BIND(L_loadkeys_44); 2764 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2765 __ rev32(v21, __ T16B, v21); 2766 __ rev32(v22, __ T16B, v22); 2767 __ rev32(v23, __ T16B, v23); 2768 __ rev32(v24, __ T16B, v24); 2769 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2770 __ rev32(v25, __ T16B, v25); 2771 __ rev32(v26, __ T16B, v26); 2772 __ rev32(v27, __ T16B, v27); 2773 __ rev32(v28, __ T16B, v28); 2774 __ ld1(v29, v30, v31, __ T16B, key); 2775 __ rev32(v29, __ T16B, v29); 2776 __ rev32(v30, __ T16B, v30); 2777 __ rev32(v31, __ T16B, v31); 2778 2779 __ BIND(L_aes_loop); 2780 __ ld1(v1, __ T16B, __ post(from, 16)); 2781 __ eor(v0, __ T16B, v0, v1); 2782 2783 __ br(Assembler::CC, L_rounds_44); 2784 __ br(Assembler::EQ, L_rounds_52); 2785 2786 __ aese(v0, v17); __ aesmc(v0, v0); 2787 __ aese(v0, v18); __ aesmc(v0, v0); 2788 __ BIND(L_rounds_52); 2789 __ aese(v0, v19); __ aesmc(v0, v0); 2790 __ aese(v0, v20); __ aesmc(v0, v0); 2791 __ BIND(L_rounds_44); 2792 __ aese(v0, v21); __ aesmc(v0, v0); 2793 __ aese(v0, v22); __ aesmc(v0, v0); 2794 __ aese(v0, v23); __ aesmc(v0, v0); 2795 __ aese(v0, v24); __ aesmc(v0, v0); 2796 __ aese(v0, v25); __ aesmc(v0, v0); 2797 __ aese(v0, v26); __ aesmc(v0, v0); 2798 __ aese(v0, v27); __ aesmc(v0, v0); 2799 __ aese(v0, v28); __ aesmc(v0, v0); 2800 __ aese(v0, v29); __ aesmc(v0, v0); 2801 __ aese(v0, v30); 2802 __ eor(v0, __ T16B, v0, v31); 2803 2804 __ st1(v0, __ T16B, __ post(to, 16)); 2805 2806 __ subw(len_reg, len_reg, 16); 2807 __ cbnzw(len_reg, L_aes_loop); 2808 2809 __ st1(v0, __ T16B, rvec); 2810 2811 __ mov(r0, rscratch2); 2812 2813 __ leave(); 2814 __ ret(lr); 2815 2816 return start; 2817 } 2818 2819 // Arguments: 2820 // 2821 // Inputs: 2822 // c_rarg0 - source byte array address 2823 // c_rarg1 - destination byte array address 2824 // c_rarg2 - K (key) in little endian int array 2825 // c_rarg3 - r vector byte array address 2826 // c_rarg4 - input length 2827 // 2828 // Output: 2829 // r0 - input length 2830 // 2831 address generate_cipherBlockChaining_decryptAESCrypt() { 2832 assert(UseAES, "need AES instructions and misaligned SSE support"); 2833 __ align(CodeEntryAlignment); 2834 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 2835 2836 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2837 2838 const Register from = c_rarg0; // source array address 2839 const Register to = c_rarg1; // destination array address 2840 const Register key = c_rarg2; // key array address 2841 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2842 // and left with the results of the last encryption block 2843 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2844 const Register keylen = rscratch1; 2845 2846 address start = __ pc(); 2847 2848 __ enter(); 2849 2850 __ movw(rscratch2, len_reg); 2851 2852 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2853 2854 __ 
ld1(v2, __ T16B, rvec); 2855 2856 __ ld1(v31, __ T16B, __ post(key, 16)); 2857 __ rev32(v31, __ T16B, v31); 2858 2859 __ cmpw(keylen, 52); 2860 __ br(Assembler::CC, L_loadkeys_44); 2861 __ br(Assembler::EQ, L_loadkeys_52); 2862 2863 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2864 __ rev32(v17, __ T16B, v17); 2865 __ rev32(v18, __ T16B, v18); 2866 __ BIND(L_loadkeys_52); 2867 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2868 __ rev32(v19, __ T16B, v19); 2869 __ rev32(v20, __ T16B, v20); 2870 __ BIND(L_loadkeys_44); 2871 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2872 __ rev32(v21, __ T16B, v21); 2873 __ rev32(v22, __ T16B, v22); 2874 __ rev32(v23, __ T16B, v23); 2875 __ rev32(v24, __ T16B, v24); 2876 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2877 __ rev32(v25, __ T16B, v25); 2878 __ rev32(v26, __ T16B, v26); 2879 __ rev32(v27, __ T16B, v27); 2880 __ rev32(v28, __ T16B, v28); 2881 __ ld1(v29, v30, __ T16B, key); 2882 __ rev32(v29, __ T16B, v29); 2883 __ rev32(v30, __ T16B, v30); 2884 2885 __ BIND(L_aes_loop); 2886 __ ld1(v0, __ T16B, __ post(from, 16)); 2887 __ orr(v1, __ T16B, v0, v0); 2888 2889 __ br(Assembler::CC, L_rounds_44); 2890 __ br(Assembler::EQ, L_rounds_52); 2891 2892 __ aesd(v0, v17); __ aesimc(v0, v0); 2893 __ aesd(v0, v18); __ aesimc(v0, v0); 2894 __ BIND(L_rounds_52); 2895 __ aesd(v0, v19); __ aesimc(v0, v0); 2896 __ aesd(v0, v20); __ aesimc(v0, v0); 2897 __ BIND(L_rounds_44); 2898 __ aesd(v0, v21); __ aesimc(v0, v0); 2899 __ aesd(v0, v22); __ aesimc(v0, v0); 2900 __ aesd(v0, v23); __ aesimc(v0, v0); 2901 __ aesd(v0, v24); __ aesimc(v0, v0); 2902 __ aesd(v0, v25); __ aesimc(v0, v0); 2903 __ aesd(v0, v26); __ aesimc(v0, v0); 2904 __ aesd(v0, v27); __ aesimc(v0, v0); 2905 __ aesd(v0, v28); __ aesimc(v0, v0); 2906 __ aesd(v0, v29); __ aesimc(v0, v0); 2907 __ aesd(v0, v30); 2908 __ eor(v0, __ T16B, v0, v31); 2909 __ eor(v0, __ T16B, v0, v2); 2910 2911 __ st1(v0, __ T16B, __ post(to, 16)); 2912 __ orr(v2, __ T16B, v1, v1); 2913 2914 __ subw(len_reg, len_reg, 16); 2915 __ cbnzw(len_reg, L_aes_loop); 2916 2917 __ st1(v2, __ T16B, rvec); 2918 2919 __ mov(r0, rscratch2); 2920 2921 __ leave(); 2922 __ ret(lr); 2923 2924 return start; 2925 } 2926 2927 // Arguments: 2928 // 2929 // Inputs: 2930 // c_rarg0 - byte[] source+offset 2931 // c_rarg1 - int[] SHA.state 2932 // c_rarg2 - int offset 2933 // c_rarg3 - int limit 2934 // 2935 address generate_sha1_implCompress(bool multi_block, const char *name) { 2936 __ align(CodeEntryAlignment); 2937 StubCodeMark mark(this, "StubRoutines", name); 2938 address start = __ pc(); 2939 2940 Register buf = c_rarg0; 2941 Register state = c_rarg1; 2942 Register ofs = c_rarg2; 2943 Register limit = c_rarg3; 2944 2945 Label keys; 2946 Label sha1_loop; 2947 2948 // load the keys into v0..v3 2949 __ adr(rscratch1, keys); 2950 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 2951 // load 5 words state into v6, v7 2952 __ ldrq(v6, Address(state, 0)); 2953 __ ldrs(v7, Address(state, 16)); 2954 2955 2956 __ BIND(sha1_loop); 2957 // load 64 bytes of data into v16..v19 2958 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 2959 __ rev32(v16, __ T16B, v16); 2960 __ rev32(v17, __ T16B, v17); 2961 __ rev32(v18, __ T16B, v18); 2962 __ rev32(v19, __ T16B, v19); 2963 2964 // do the sha1 2965 __ addv(v4, __ T4S, v16, v0); 2966 __ orr(v20, __ T16B, v6, v6); 2967 2968 FloatRegister d0 = v16; 2969 FloatRegister d1 = v17; 2970 FloatRegister d2 = v18; 2971 FloatRegister d3 = v19; 2972 2973 for (int round = 0; round < 20; round++) { 2974 FloatRegister tmp1 = (round & 1) ? v4 : v5; 2975 FloatRegister tmp2 = (round & 1) ? v21 : v22; 2976 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 2977 FloatRegister tmp4 = (round & 1) ? v5 : v4; 2978 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3)); 2979 2980 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 2981 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 2982 __ sha1h(tmp2, __ T4S, v20); 2983 if (round < 5) 2984 __ sha1c(v20, __ T4S, tmp3, tmp4); 2985 else if (round < 10 || round >= 15) 2986 __ sha1p(v20, __ T4S, tmp3, tmp4); 2987 else 2988 __ sha1m(v20, __ T4S, tmp3, tmp4); 2989 if (round < 16) __ sha1su1(d0, __ T4S, d3); 2990 2991 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 2992 } 2993 2994 __ addv(v7, __ T2S, v7, v21); 2995 __ addv(v6, __ T4S, v6, v20); 2996 2997 if (multi_block) { 2998 __ add(ofs, ofs, 64); 2999 __ cmp(ofs, limit); 3000 __ br(Assembler::LE, sha1_loop); 3001 __ mov(c_rarg0, ofs); // return ofs 3002 } 3003 3004 __ strq(v6, Address(state, 0)); 3005 __ strs(v7, Address(state, 16)); 3006 3007 __ ret(lr); 3008 3009 __ bind(keys); 3010 __ emit_int32(0x5a827999); 3011 __ emit_int32(0x6ed9eba1); 3012 __ emit_int32(0x8f1bbcdc); 3013 __ emit_int32(0xca62c1d6); 3014 3015 return start; 3016 } 3017 3018 3019 // Arguments: 3020 // 3021 // Inputs: 3022 // c_rarg0 - byte[] source+offset 3023 // c_rarg1 - int[] SHA.state 3024 // c_rarg2 - int offset 3025 // c_rarg3 - int limit 3026 // 3027 address generate_sha256_implCompress(bool multi_block, const char *name) { 3028 static const uint32_t round_consts[64] = { 3029 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3030 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3031 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3032 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3033 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3034 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3035 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3036 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3037 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3038 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3039 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3040 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3041 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3042 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3043 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3044 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3045 }; 3046 __ align(CodeEntryAlignment); 3047 StubCodeMark mark(this, "StubRoutines", name); 3048 address start = __ pc(); 3049 3050 Register buf = c_rarg0; 3051 Register state = c_rarg1; 3052 Register ofs = c_rarg2; 3053 Register limit = c_rarg3; 3054 3055 Label sha1_loop; 3056 3057 __ stpd(v8, v9, __ pre(sp, -32)); 3058 __ stpd(v10, v11, Address(sp, 16)); 3059 3060 // dga == v0 3061 // dgb == v1 3062 // dg0 == v2 3063 // dg1 == v3 3064 // dg2 == v4 3065 // t0 == v6 3066 // t1 == v7 3067 3068 // load 16 keys to v16..v31 3069 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3070 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3071 __ 
ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3072 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3073 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3074 3075 // load 8 words (256 bits) state 3076 __ ldpq(v0, v1, state); 3077 3078 __ BIND(sha1_loop); 3079 // load 64 bytes of data into v8..v11 3080 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf); 3081 __ rev32(v8, __ T16B, v8); 3082 __ rev32(v9, __ T16B, v9); 3083 __ rev32(v10, __ T16B, v10); 3084 __ rev32(v11, __ T16B, v11); 3085 3086 __ addv(v6, __ T4S, v8, v16); 3087 __ orr(v2, __ T16B, v0, v0); 3088 __ orr(v3, __ T16B, v1, v1); 3089 3090 FloatRegister d0 = v8; 3091 FloatRegister d1 = v9; 3092 FloatRegister d2 = v10; 3093 FloatRegister d3 = v11; 3094 3095 3096 for (int round = 0; round < 16; round++) { 3097 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3098 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3099 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3100 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3101 3102 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3103 __ orr(v4, __ T16B, v2, v2); 3104 if (round < 15) 3105 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3106 __ sha256h(v2, __ T4S, v3, tmp2); 3107 __ sha256h2(v3, __ T4S, v4, tmp2); 3108 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3109 3110 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3111 } 3112 3113 __ addv(v0, __ T4S, v0, v2); 3114 __ addv(v1, __ T4S, v1, v3); 3115 3116 if (multi_block) { 3117 __ add(ofs, ofs, 64); 3118 __ cmp(ofs, limit); 3119 __ br(Assembler::LE, sha1_loop); 3120 __ mov(c_rarg0, ofs); // return ofs 3121 } 3122 3123 __ ldpd(v10, v11, Address(sp, 16)); 3124 __ ldpd(v8, v9, __ post(sp, 32)); 3125 3126 __ stpq(v0, v1, state); 3127 3128 __ ret(lr); 3129 3130 return start; 3131 } 3132 3133 // Arguments: 3134 // 3135 // Inputs: 3136 // c_rarg0 - byte[] source+offset 3137 // c_rarg1 - int[] SHA.state 3138 // c_rarg2 - int offset 3139 // c_rarg3 - int limit 3140 // 3141 address generate_sha512_implCompress(bool multi_block, const char *name) { 3142 static const uint64_t round_consts[80] = { 3143 0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL, 3144 0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L, 3145 0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L, 3146 0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L, 3147 0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L, 3148 0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L, 3149 0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L, 3150 0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L, 3151 0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL, 3152 0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L, 3153 0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL, 3154 0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL, 3155 0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L, 3156 0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L, 3157 0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L, 3158 0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L, 3159 0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L, 3160 0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL, 3161 0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL, 3162 0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL, 3163 0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L, 3164 
0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L, 3165 0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL, 3166 0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL, 3167 0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL, 3168 0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL, 3169 0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L 3170 }; 3171 3172 // Double rounds for sha512. 3173 #define sha512_dround(dr, i0, i1, i2, i3, i4, rc0, rc1, in0, in1, in2, in3, in4) \ 3174 if (dr < 36) \ 3175 __ ld1(v##rc1, __ T2D, __ post(rscratch2, 16)); \ 3176 __ addv(v5, __ T2D, v##rc0, v##in0); \ 3177 __ ext(v6, __ T16B, v##i2, v##i3, 8); \ 3178 __ ext(v5, __ T16B, v5, v5, 8); \ 3179 __ ext(v7, __ T16B, v##i1, v##i2, 8); \ 3180 __ addv(v##i3, __ T2D, v##i3, v5); \ 3181 if (dr < 32) { \ 3182 __ ext(v5, __ T16B, v##in3, v##in4, 8); \ 3183 __ sha512su0(v##in0, __ T2D, v##in1); \ 3184 } \ 3185 __ sha512h(v##i3, __ T2D, v6, v7); \ 3186 if (dr < 32) \ 3187 __ sha512su1(v##in0, __ T2D, v##in2, v5); \ 3188 __ addv(v##i4, __ T2D, v##i1, v##i3); \ 3189 __ sha512h2(v##i3, __ T2D, v##i1, v##i0); \ 3190 3191 __ align(CodeEntryAlignment); 3192 StubCodeMark mark(this, "StubRoutines", name); 3193 address start = __ pc(); 3194 3195 Register buf = c_rarg0; 3196 Register state = c_rarg1; 3197 Register ofs = c_rarg2; 3198 Register limit = c_rarg3; 3199 3200 __ stpd(v8, v9, __ pre(sp, -64)); 3201 __ stpd(v10, v11, Address(sp, 16)); 3202 __ stpd(v12, v13, Address(sp, 32)); 3203 __ stpd(v14, v15, Address(sp, 48)); 3204 3205 Label sha512_loop; 3206 3207 // load state 3208 __ ld1(v8, v9, v10, v11, __ T2D, state); 3209 3210 // load first 4 round constants 3211 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3212 __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64)); 3213 3214 __ BIND(sha512_loop); 3215 // load 128B of data into v12..v19 3216 __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64)); 3217 __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64)); 3218 __ rev64(v12, __ T16B, v12); 3219 __ rev64(v13, __ T16B, v13); 3220 __ rev64(v14, __ T16B, v14); 3221 __ rev64(v15, __ T16B, v15); 3222 __ rev64(v16, __ T16B, v16); 3223 __ rev64(v17, __ T16B, v17); 3224 __ rev64(v18, __ T16B, v18); 3225 __ rev64(v19, __ T16B, v19); 3226 3227 __ mov(rscratch2, rscratch1); 3228 3229 __ mov(v0, __ T16B, v8); 3230 __ mov(v1, __ T16B, v9); 3231 __ mov(v2, __ T16B, v10); 3232 __ mov(v3, __ T16B, v11); 3233 3234 sha512_dround( 0, 0, 1, 2, 3, 4, 20, 24, 12, 13, 19, 16, 17); 3235 sha512_dround( 1, 3, 0, 4, 2, 1, 21, 25, 13, 14, 12, 17, 18); 3236 sha512_dround( 2, 2, 3, 1, 4, 0, 22, 26, 14, 15, 13, 18, 19); 3237 sha512_dround( 3, 4, 2, 0, 1, 3, 23, 27, 15, 16, 14, 19, 12); 3238 sha512_dround( 4, 1, 4, 3, 0, 2, 24, 28, 16, 17, 15, 12, 13); 3239 sha512_dround( 5, 0, 1, 2, 3, 4, 25, 29, 17, 18, 16, 13, 14); 3240 sha512_dround( 6, 3, 0, 4, 2, 1, 26, 30, 18, 19, 17, 14, 15); 3241 sha512_dround( 7, 2, 3, 1, 4, 0, 27, 31, 19, 12, 18, 15, 16); 3242 sha512_dround( 8, 4, 2, 0, 1, 3, 28, 24, 12, 13, 19, 16, 17); 3243 sha512_dround( 9, 1, 4, 3, 0, 2, 29, 25, 13, 14, 12, 17, 18); 3244 sha512_dround(10, 0, 1, 2, 3, 4, 30, 26, 14, 15, 13, 18, 19); 3245 sha512_dround(11, 3, 0, 4, 2, 1, 31, 27, 15, 16, 14, 19, 12); 3246 sha512_dround(12, 2, 3, 1, 4, 0, 24, 28, 16, 17, 15, 12, 13); 3247 sha512_dround(13, 4, 2, 0, 1, 3, 25, 29, 17, 18, 16, 13, 14); 3248 sha512_dround(14, 1, 4, 3, 0, 2, 26, 30, 18, 19, 17, 14, 15); 3249 sha512_dround(15, 0, 1, 2, 3, 4, 27, 31, 19, 12, 18, 15, 16); 3250 sha512_dround(16, 3, 0, 
4, 2, 1, 28, 24, 12, 13, 19, 16, 17); 3251 sha512_dround(17, 2, 3, 1, 4, 0, 29, 25, 13, 14, 12, 17, 18); 3252 sha512_dround(18, 4, 2, 0, 1, 3, 30, 26, 14, 15, 13, 18, 19); 3253 sha512_dround(19, 1, 4, 3, 0, 2, 31, 27, 15, 16, 14, 19, 12); 3254 sha512_dround(20, 0, 1, 2, 3, 4, 24, 28, 16, 17, 15, 12, 13); 3255 sha512_dround(21, 3, 0, 4, 2, 1, 25, 29, 17, 18, 16, 13, 14); 3256 sha512_dround(22, 2, 3, 1, 4, 0, 26, 30, 18, 19, 17, 14, 15); 3257 sha512_dround(23, 4, 2, 0, 1, 3, 27, 31, 19, 12, 18, 15, 16); 3258 sha512_dround(24, 1, 4, 3, 0, 2, 28, 24, 12, 13, 19, 16, 17); 3259 sha512_dround(25, 0, 1, 2, 3, 4, 29, 25, 13, 14, 12, 17, 18); 3260 sha512_dround(26, 3, 0, 4, 2, 1, 30, 26, 14, 15, 13, 18, 19); 3261 sha512_dround(27, 2, 3, 1, 4, 0, 31, 27, 15, 16, 14, 19, 12); 3262 sha512_dround(28, 4, 2, 0, 1, 3, 24, 28, 16, 17, 15, 12, 13); 3263 sha512_dround(29, 1, 4, 3, 0, 2, 25, 29, 17, 18, 16, 13, 14); 3264 sha512_dround(30, 0, 1, 2, 3, 4, 26, 30, 18, 19, 17, 14, 15); 3265 sha512_dround(31, 3, 0, 4, 2, 1, 27, 31, 19, 12, 18, 15, 16); 3266 sha512_dround(32, 2, 3, 1, 4, 0, 28, 24, 12, 0, 0, 0, 0); 3267 sha512_dround(33, 4, 2, 0, 1, 3, 29, 25, 13, 0, 0, 0, 0); 3268 sha512_dround(34, 1, 4, 3, 0, 2, 30, 26, 14, 0, 0, 0, 0); 3269 sha512_dround(35, 0, 1, 2, 3, 4, 31, 27, 15, 0, 0, 0, 0); 3270 sha512_dround(36, 3, 0, 4, 2, 1, 24, 0, 16, 0, 0, 0, 0); 3271 sha512_dround(37, 2, 3, 1, 4, 0, 25, 0, 17, 0, 0, 0, 0); 3272 sha512_dround(38, 4, 2, 0, 1, 3, 26, 0, 18, 0, 0, 0, 0); 3273 sha512_dround(39, 1, 4, 3, 0, 2, 27, 0, 19, 0, 0, 0, 0); 3274 3275 __ addv(v8, __ T2D, v8, v0); 3276 __ addv(v9, __ T2D, v9, v1); 3277 __ addv(v10, __ T2D, v10, v2); 3278 __ addv(v11, __ T2D, v11, v3); 3279 3280 if (multi_block) { 3281 __ add(ofs, ofs, 128); 3282 __ cmp(ofs, limit); 3283 __ br(Assembler::LE, sha512_loop); 3284 __ mov(c_rarg0, ofs); // return ofs 3285 } 3286 3287 __ st1(v8, v9, v10, v11, __ T2D, state); 3288 3289 __ ldpd(v14, v15, Address(sp, 48)); 3290 __ ldpd(v12, v13, Address(sp, 32)); 3291 __ ldpd(v10, v11, Address(sp, 16)); 3292 __ ldpd(v8, v9, __ post(sp, 64)); 3293 3294 __ ret(lr); 3295 3296 return start; 3297 } 3298 3299 // Safefetch stubs. 3300 void generate_safefetch(const char* name, int size, address* entry, 3301 address* fault_pc, address* continuation_pc) { 3302 // safefetch signatures: 3303 // int SafeFetch32(int* adr, int errValue); 3304 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue); 3305 // 3306 // arguments: 3307 // c_rarg0 = adr 3308 // c_rarg1 = errValue 3309 // 3310 // result: 3311 // PPC_RET = *adr or errValue 3312 3313 StubCodeMark mark(this, "StubRoutines", name); 3314 3315 // Entry point, pc or function descriptor. 3316 *entry = __ pc(); 3317 3318 // Load *adr into c_rarg1, may fault. 
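  // How this is used (a sketch; the recovery itself is done by the platform
  // signal handler, not by code in this stub): SafeFetch32(adr, errValue)
  // executes the load below with errValue already in c_rarg1. If adr is not
  // readable, the handler recognizes the faulting pc as *fault_pc and resumes
  // execution at *continuation_pc, so c_rarg1 still holds errValue and the
  // caller simply gets errValue back instead of crashing.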
3319 *fault_pc = __ pc(); 3320 switch (size) { 3321 case 4: 3322 // int32_t 3323 __ ldrw(c_rarg1, Address(c_rarg0, 0)); 3324 break; 3325 case 8: 3326 // int64_t 3327 __ ldr(c_rarg1, Address(c_rarg0, 0)); 3328 break; 3329 default: 3330 ShouldNotReachHere(); 3331 } 3332 3333 // return errValue or *adr 3334 *continuation_pc = __ pc(); 3335 __ mov(r0, c_rarg1); 3336 __ ret(lr); 3337 } 3338 3339 /** 3340 * Arguments: 3341 * 3342 * Inputs: 3343 * c_rarg0 - int crc 3344 * c_rarg1 - byte* buf 3345 * c_rarg2 - int length 3346 * 3347 * Ouput: 3348 * rax - int crc result 3349 */ 3350 address generate_updateBytesCRC32() { 3351 assert(UseCRC32Intrinsics, "what are we doing here?"); 3352 3353 __ align(CodeEntryAlignment); 3354 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32"); 3355 3356 address start = __ pc(); 3357 3358 const Register crc = c_rarg0; // crc 3359 const Register buf = c_rarg1; // source java byte array address 3360 const Register len = c_rarg2; // length 3361 const Register table0 = c_rarg3; // crc_table address 3362 const Register table1 = c_rarg4; 3363 const Register table2 = c_rarg5; 3364 const Register table3 = c_rarg6; 3365 const Register tmp3 = c_rarg7; 3366 3367 BLOCK_COMMENT("Entry:"); 3368 __ enter(); // required for proper stackwalking of RuntimeStub frame 3369 3370 __ kernel_crc32(crc, buf, len, 3371 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 3372 3373 __ leave(); // required for proper stackwalking of RuntimeStub frame 3374 __ ret(lr); 3375 3376 return start; 3377 } 3378 3379 /** 3380 * Arguments: 3381 * 3382 * Inputs: 3383 * c_rarg0 - int crc 3384 * c_rarg1 - byte* buf 3385 * c_rarg2 - int length 3386 * c_rarg3 - int* table 3387 * 3388 * Ouput: 3389 * r0 - int crc result 3390 */ 3391 address generate_updateBytesCRC32C() { 3392 assert(UseCRC32CIntrinsics, "what are we doing here?"); 3393 3394 __ align(CodeEntryAlignment); 3395 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C"); 3396 3397 address start = __ pc(); 3398 3399 const Register crc = c_rarg0; // crc 3400 const Register buf = c_rarg1; // source java byte array address 3401 const Register len = c_rarg2; // length 3402 const Register table0 = c_rarg3; // crc_table address 3403 const Register table1 = c_rarg4; 3404 const Register table2 = c_rarg5; 3405 const Register table3 = c_rarg6; 3406 const Register tmp3 = c_rarg7; 3407 3408 BLOCK_COMMENT("Entry:"); 3409 __ enter(); // required for proper stackwalking of RuntimeStub frame 3410 3411 __ kernel_crc32c(crc, buf, len, 3412 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 3413 3414 __ leave(); // required for proper stackwalking of RuntimeStub frame 3415 __ ret(lr); 3416 3417 return start; 3418 } 3419 3420 /*** 3421 * Arguments: 3422 * 3423 * Inputs: 3424 * c_rarg0 - int adler 3425 * c_rarg1 - byte* buff 3426 * c_rarg2 - int len 3427 * 3428 * Output: 3429 * c_rarg0 - int adler result 3430 */ 3431 address generate_updateBytesAdler32() { 3432 __ align(CodeEntryAlignment); 3433 StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32"); 3434 address start = __ pc(); 3435 3436 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 3437 3438 // Aliases 3439 Register adler = c_rarg0; 3440 Register s1 = c_rarg0; 3441 Register s2 = c_rarg3; 3442 Register buff = c_rarg1; 3443 Register len = c_rarg2; 3444 Register nmax = r4; 3445 Register base = r5; 3446 Register count = r6; 3447 Register temp0 = rscratch1; 3448 Register temp1 = rscratch2; 3449 FloatRegister vbytes = v0; 3450 
FloatRegister vs1acc = v1; 3451 FloatRegister vs2acc = v2; 3452 FloatRegister vtable = v3; 3453 3454 // Max number of bytes we can process before having to take the mod 3455 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 3456 uint64_t BASE = 0xfff1; 3457 uint64_t NMAX = 0x15B0; 3458 3459 __ mov(base, BASE); 3460 __ mov(nmax, NMAX); 3461 3462 // Load accumulation coefficients for the upper 16 bits 3463 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table)); 3464 __ ld1(vtable, __ T16B, Address(temp0)); 3465 3466 // s1 is initialized to the lower 16 bits of adler 3467 // s2 is initialized to the upper 16 bits of adler 3468 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 3469 __ uxth(s1, adler); // s1 = (adler & 0xffff) 3470 3471 // The pipelined loop needs at least 16 elements for 1 iteration 3472 // It does check this, but it is more effective to skip to the cleanup loop 3473 __ cmp(len, (u1)16); 3474 __ br(Assembler::HS, L_nmax); 3475 __ cbz(len, L_combine); 3476 3477 __ bind(L_simple_by1_loop); 3478 __ ldrb(temp0, Address(__ post(buff, 1))); 3479 __ add(s1, s1, temp0); 3480 __ add(s2, s2, s1); 3481 __ subs(len, len, 1); 3482 __ br(Assembler::HI, L_simple_by1_loop); 3483 3484 // s1 = s1 % BASE 3485 __ subs(temp0, s1, base); 3486 __ csel(s1, temp0, s1, Assembler::HS); 3487 3488 // s2 = s2 % BASE 3489 __ lsr(temp0, s2, 16); 3490 __ lsl(temp1, temp0, 4); 3491 __ sub(temp1, temp1, temp0); 3492 __ add(s2, temp1, s2, ext::uxth); 3493 3494 __ subs(temp0, s2, base); 3495 __ csel(s2, temp0, s2, Assembler::HS); 3496 3497 __ b(L_combine); 3498 3499 __ bind(L_nmax); 3500 __ subs(len, len, nmax); 3501 __ sub(count, nmax, 16); 3502 __ br(Assembler::LO, L_by16); 3503 3504 __ bind(L_nmax_loop); 3505 3506 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 3507 vbytes, vs1acc, vs2acc, vtable); 3508 3509 __ subs(count, count, 16); 3510 __ br(Assembler::HS, L_nmax_loop); 3511 3512 // s1 = s1 % BASE 3513 __ lsr(temp0, s1, 16); 3514 __ lsl(temp1, temp0, 4); 3515 __ sub(temp1, temp1, temp0); 3516 __ add(temp1, temp1, s1, ext::uxth); 3517 3518 __ lsr(temp0, temp1, 16); 3519 __ lsl(s1, temp0, 4); 3520 __ sub(s1, s1, temp0); 3521 __ add(s1, s1, temp1, ext:: uxth); 3522 3523 __ subs(temp0, s1, base); 3524 __ csel(s1, temp0, s1, Assembler::HS); 3525 3526 // s2 = s2 % BASE 3527 __ lsr(temp0, s2, 16); 3528 __ lsl(temp1, temp0, 4); 3529 __ sub(temp1, temp1, temp0); 3530 __ add(temp1, temp1, s2, ext::uxth); 3531 3532 __ lsr(temp0, temp1, 16); 3533 __ lsl(s2, temp0, 4); 3534 __ sub(s2, s2, temp0); 3535 __ add(s2, s2, temp1, ext:: uxth); 3536 3537 __ subs(temp0, s2, base); 3538 __ csel(s2, temp0, s2, Assembler::HS); 3539 3540 __ subs(len, len, nmax); 3541 __ sub(count, nmax, 16); 3542 __ br(Assembler::HS, L_nmax_loop); 3543 3544 __ bind(L_by16); 3545 __ adds(len, len, count); 3546 __ br(Assembler::LO, L_by1); 3547 3548 __ bind(L_by16_loop); 3549 3550 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 3551 vbytes, vs1acc, vs2acc, vtable); 3552 3553 __ subs(len, len, 16); 3554 __ br(Assembler::HS, L_by16_loop); 3555 3556 __ bind(L_by1); 3557 __ adds(len, len, 15); 3558 __ br(Assembler::LO, L_do_mod); 3559 3560 __ bind(L_by1_loop); 3561 __ ldrb(temp0, Address(__ post(buff, 1))); 3562 __ add(s1, temp0, s1); 3563 __ add(s2, s2, s1); 3564 __ subs(len, len, 1); 3565 __ br(Assembler::HS, L_by1_loop); 3566 3567 __ bind(L_do_mod); 3568 // s1 = s1 % BASE 3569 __ lsr(temp0, s1, 16); 3570 __ lsl(temp1, temp0, 4); 3571 __ sub(temp1, 
temp1, temp0); 3572 __ add(temp1, temp1, s1, ext::uxth); 3573 3574 __ lsr(temp0, temp1, 16); 3575 __ lsl(s1, temp0, 4); 3576 __ sub(s1, s1, temp0); 3577 __ add(s1, s1, temp1, ext::uxth); 3578 3579 __ subs(temp0, s1, base); 3580 __ csel(s1, temp0, s1, Assembler::HS); 3581 3582 // s2 = s2 % BASE 3583 __ lsr(temp0, s2, 16); 3584 __ lsl(temp1, temp0, 4); 3585 __ sub(temp1, temp1, temp0); 3586 __ add(temp1, temp1, s2, ext::uxth); 3587 3588 __ lsr(temp0, temp1, 16); 3589 __ lsl(s2, temp0, 4); 3590 __ sub(s2, s2, temp0); 3591 __ add(s2, s2, temp1, ext::uxth); 3592 3593 __ subs(temp0, s2, base); 3594 __ csel(s2, temp0, s2, Assembler::HS); 3595 3596 // Combine lower bits and higher bits 3597 __ bind(L_combine); 3598 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) 3599 3600 __ ret(lr); 3601 3602 return start; 3603 } 3604 3605 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff, 3606 Register temp0, Register temp1, FloatRegister vbytes, 3607 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) { 3608 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes. 3609 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration. 3610 // In non-vectorized code, we update s1 and s2 as: 3611 // s1 <- s1 + b1 3612 // s2 <- s2 + s1 3613 // s1 <- s1 + b2 3614 // s2 <- s2 + s1 3615 // ... 3616 // s1 <- s1 + b16 3617 // s2 <- s2 + s1 3618 // Putting the above assignments together, we have: 3619 // s1_new = s1 + b1 + b2 + ... + b16 3620 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16) 3621 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1) 3622 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1) 3623 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16))); 3624 3625 // s2 = s2 + s1 * 16 3626 __ add(s2, s2, s1, Assembler::LSL, 4); 3627 3628 // vs1acc = b1 + b2 + b3 + ... + b16 3629 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ...
+ (b16 * 1) 3630 __ umullv(vs2acc, __ T8B, vtable, vbytes); 3631 __ umlalv(vs2acc, __ T16B, vtable, vbytes); 3632 __ uaddlv(vs1acc, __ T16B, vbytes); 3633 __ uaddlv(vs2acc, __ T8H, vs2acc); 3634 3635 // s1 = s1 + vs1acc, s2 = s2 + vs2acc 3636 __ fmovd(temp0, vs1acc); 3637 __ fmovd(temp1, vs2acc); 3638 __ add(s1, s1, temp0); 3639 __ add(s2, s2, temp1); 3640 } 3641 3642 /** 3643 * Arguments: 3644 * 3645 * Input: 3646 * c_rarg0 - x address 3647 * c_rarg1 - x length 3648 * c_rarg2 - y address 3649 * c_rarg3 - y length 3650 * c_rarg4 - z address 3651 * c_rarg5 - z length 3652 */ 3653 address generate_multiplyToLen() { 3654 __ align(CodeEntryAlignment); 3655 StubCodeMark mark(this, "StubRoutines", "multiplyToLen"); 3656 3657 address start = __ pc(); 3658 const Register x = r0; 3659 const Register xlen = r1; 3660 const Register y = r2; 3661 const Register ylen = r3; 3662 const Register z = r4; 3663 const Register zlen = r5; 3664 3665 const Register tmp1 = r10; 3666 const Register tmp2 = r11; 3667 const Register tmp3 = r12; 3668 const Register tmp4 = r13; 3669 const Register tmp5 = r14; 3670 const Register tmp6 = r15; 3671 const Register tmp7 = r16; 3672 3673 BLOCK_COMMENT("Entry:"); 3674 __ enter(); // required for proper stackwalking of RuntimeStub frame 3675 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 3676 __ leave(); // required for proper stackwalking of RuntimeStub frame 3677 __ ret(lr); 3678 3679 return start; 3680 } 3681 3682 address generate_squareToLen() { 3683 // squareToLen algorithm for sizes 1..127 described in java code works 3684 // faster than multiply_to_len on some CPUs and slower on others, but 3685 // multiply_to_len shows slightly better overall results 3686 __ align(CodeEntryAlignment); 3687 StubCodeMark mark(this, "StubRoutines", "squareToLen"); 3688 address start = __ pc(); 3689 3690 const Register x = r0; 3691 const Register xlen = r1; 3692 const Register z = r2; 3693 const Register zlen = r3; 3694 const Register y = r4; // == x 3695 const Register ylen = r5; // == xlen 3696 3697 const Register tmp1 = r10; 3698 const Register tmp2 = r11; 3699 const Register tmp3 = r12; 3700 const Register tmp4 = r13; 3701 const Register tmp5 = r14; 3702 const Register tmp6 = r15; 3703 const Register tmp7 = r16; 3704 3705 RegSet spilled_regs = RegSet::of(y, ylen); 3706 BLOCK_COMMENT("Entry:"); 3707 __ enter(); 3708 __ push(spilled_regs, sp); 3709 __ mov(y, x); 3710 __ mov(ylen, xlen); 3711 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 3712 __ pop(spilled_regs, sp); 3713 __ leave(); 3714 __ ret(lr); 3715 return start; 3716 } 3717 3718 address generate_mulAdd() { 3719 __ align(CodeEntryAlignment); 3720 StubCodeMark mark(this, "StubRoutines", "mulAdd"); 3721 3722 address start = __ pc(); 3723 3724 const Register out = r0; 3725 const Register in = r1; 3726 const Register offset = r2; 3727 const Register len = r3; 3728 const Register k = r4; 3729 3730 BLOCK_COMMENT("Entry:"); 3731 __ enter(); 3732 __ mul_add(out, in, offset, len, k); 3733 __ leave(); 3734 __ ret(lr); 3735 3736 return start; 3737 } 3738 3739 void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi, 3740 FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0, 3741 FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) { 3742 // Karatsuba multiplication performs a 128*128 -> 256-bit 3743 // multiplication in three 128-bit multiplications and a few 3744 // additions.
3745 // 3746 // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1) 3747 // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0 3748 // 3749 // Inputs: 3750 // 3751 // A0 in a.d[0] (subkey) 3752 // A1 in a.d[1] 3753 // (A1+A0) in a1_xor_a0.d[0] 3754 // 3755 // B0 in b.d[0] (state) 3756 // B1 in b.d[1] 3757 3758 __ ext(tmp1, __ T16B, b, b, 0x08); 3759 __ pmull2(result_hi, __ T1Q, b, a, __ T2D); // A1*B1 3760 __ eor(tmp1, __ T16B, tmp1, b); // (B1+B0) 3761 __ pmull(result_lo, __ T1Q, b, a, __ T1D); // A0*B0 3762 __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0) 3763 3764 __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08); 3765 __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0 3766 __ eor(tmp2, __ T16B, tmp2, tmp4); 3767 __ eor(tmp2, __ T16B, tmp2, tmp3); 3768 3769 // Register pair <result_hi:result_lo> holds the result of carry-less multiplication 3770 __ ins(result_hi, __ D, tmp2, 0, 1); 3771 __ ins(result_lo, __ D, tmp2, 1, 0); 3772 } 3773 3774 void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi, 3775 FloatRegister p, FloatRegister z, FloatRegister t1) { 3776 const FloatRegister t0 = result; 3777 3778 // The GCM field polynomial f is z^128 + p(z), where p = 3779 // z^7+z^2+z+1. 3780 // 3781 // z^128 === -p(z) (mod (z^128 + p(z))) 3782 // 3783 // so, given that the product we're reducing is 3784 // a == lo + hi * z^128 3785 // substituting, 3786 // === lo - hi * p(z) (mod (z^128 + p(z))) 3787 // 3788 // we reduce by multiplying hi by p(z) and subtracting the result 3789 // from (i.e. XORing it with) lo. Because p has no nonzero high 3790 // bits we can do this with two 64-bit multiplications, lo*p and 3791 // hi*p. 3792 3793 __ pmull2(t0, __ T1Q, hi, p, __ T2D); 3794 __ ext(t1, __ T16B, t0, z, 8); 3795 __ eor(hi, __ T16B, hi, t1); 3796 __ ext(t1, __ T16B, z, t0, 8); 3797 __ eor(lo, __ T16B, lo, t1); 3798 __ pmull(t0, __ T1Q, hi, p, __ T1D); 3799 __ eor(result, __ T16B, lo, t0); 3800 } 3801 3802 address generate_has_negatives(address &has_negatives_long) { 3803 const u1 large_loop_size = 64; 3804 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 3805 int dcache_line = VM_Version::dcache_line_size(); 3806 3807 Register ary1 = r1, len = r2, result = r0; 3808 3809 __ align(CodeEntryAlignment); 3810 3811 StubCodeMark mark(this, "StubRoutines", "has_negatives"); 3812 3813 address entry = __ pc(); 3814 3815 __ enter(); 3816 3817 Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE, 3818 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 3819 3820 __ cmp(len, (u1)15); 3821 __ br(Assembler::GT, LEN_OVER_15); 3822 // The only case when execution falls into this code is when pointer is near 3823 // the end of memory page and we have to avoid reading next page 3824 __ add(ary1, ary1, len); 3825 __ subs(len, len, 8); 3826 __ br(Assembler::GT, LEN_OVER_8); 3827 __ ldr(rscratch2, Address(ary1, -8)); 3828 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 
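// Added commentary on the small-length path above: at this point len = original_len - 8 <= 0 and ary1 points just past the array, so the ldr above fetched the 8 bytes ending at the array end. On a little-endian load the (8 - original_len) low bytes of rscratch2 belong to memory before the array; rscratch1 = -len * 8 is exactly that count in bits, so the lsrv below discards those bytes before the per-byte sign-bit test against UPPER_BIT_MASK.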
3829 __ lsrv(rscratch2, rscratch2, rscratch1); 3830 __ tst(rscratch2, UPPER_BIT_MASK); 3831 __ cset(result, Assembler::NE); 3832 __ leave(); 3833 __ ret(lr); 3834 __ bind(LEN_OVER_8); 3835 __ ldp(rscratch1, rscratch2, Address(ary1, -16)); 3836 __ sub(len, len, 8); // no data dep., then sub can be executed while loading 3837 __ tst(rscratch2, UPPER_BIT_MASK); 3838 __ br(Assembler::NE, RET_TRUE_NO_POP); 3839 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes 3840 __ lsrv(rscratch1, rscratch1, rscratch2); 3841 __ tst(rscratch1, UPPER_BIT_MASK); 3842 __ cset(result, Assembler::NE); 3843 __ leave(); 3844 __ ret(lr); 3845 3846 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10; 3847 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6; 3848 3849 has_negatives_long = __ pc(); // 2nd entry point 3850 3851 __ enter(); 3852 3853 __ bind(LEN_OVER_15); 3854 __ push(spilled_regs, sp); 3855 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment 3856 __ cbz(rscratch2, ALIGNED); 3857 __ ldp(tmp6, tmp1, Address(ary1)); 3858 __ mov(tmp5, 16); 3859 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address 3860 __ add(ary1, ary1, rscratch1); 3861 __ sub(len, len, rscratch1); 3862 __ orr(tmp6, tmp6, tmp1); 3863 __ tst(tmp6, UPPER_BIT_MASK); 3864 __ br(Assembler::NE, RET_TRUE); 3865 3866 __ bind(ALIGNED); 3867 __ cmp(len, large_loop_size); 3868 __ br(Assembler::LT, CHECK_16); 3869 // Perform 16-byte load as early return in pre-loop to handle situation 3870 // when initially aligned large array has negative values at starting bytes, 3871 // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is 3872 // slower. Cases with negative bytes further ahead won't be affected that 3873 // much. In fact, it'll be faster due to early loads, less instructions and 3874 // less branches in LARGE_LOOP. 3875 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16))); 3876 __ sub(len, len, 16); 3877 __ orr(tmp6, tmp6, tmp1); 3878 __ tst(tmp6, UPPER_BIT_MASK); 3879 __ br(Assembler::NE, RET_TRUE); 3880 __ cmp(len, large_loop_size); 3881 __ br(Assembler::LT, CHECK_16); 3882 3883 if (SoftwarePrefetchHintDistance >= 0 3884 && SoftwarePrefetchHintDistance >= dcache_line) { 3885 // initial prefetch 3886 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line)); 3887 } 3888 __ bind(LARGE_LOOP); 3889 if (SoftwarePrefetchHintDistance >= 0) { 3890 __ prfm(Address(ary1, SoftwarePrefetchHintDistance)); 3891 } 3892 // Issue load instructions first, since it can save few CPU/MEM cycles, also 3893 // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp) 3894 // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3 3895 // instructions per cycle and have less branches, but this approach disables 3896 // early return, thus, all 64 bytes are loaded and checked every time. 
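// As a rough scalar sketch (illustrative, not literal code from this file), one LARGE_LOOP iteration below is equivalent to:
//   // uint64_t acc = 0;
//   // for (int i = 0; i < 8; i++) {            // 64 bytes = 8 x 8-byte words
//   //   acc |= ((const uint64_t*)ary1)[i];
//   // }
//   // if (acc & UPPER_BIT_MASK) return true;   // some byte has bit 7 set, i.e. is negative
// plus the pointer/counter updates; the loads are issued up front and the ORs are arranged as a reduction tree to keep the pipeline busy.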
3897 __ ldp(tmp2, tmp3, Address(ary1)); 3898 __ ldp(tmp4, tmp5, Address(ary1, 16)); 3899 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 3900 __ ldp(tmp6, tmp1, Address(ary1, 48)); 3901 __ add(ary1, ary1, large_loop_size); 3902 __ sub(len, len, large_loop_size); 3903 __ orr(tmp2, tmp2, tmp3); 3904 __ orr(tmp4, tmp4, tmp5); 3905 __ orr(rscratch1, rscratch1, rscratch2); 3906 __ orr(tmp6, tmp6, tmp1); 3907 __ orr(tmp2, tmp2, tmp4); 3908 __ orr(rscratch1, rscratch1, tmp6); 3909 __ orr(tmp2, tmp2, rscratch1); 3910 __ tst(tmp2, UPPER_BIT_MASK); 3911 __ br(Assembler::NE, RET_TRUE); 3912 __ cmp(len, large_loop_size); 3913 __ br(Assembler::GE, LARGE_LOOP); 3914 3915 __ bind(CHECK_16); // small 16-byte load pre-loop 3916 __ cmp(len, (u1)16); 3917 __ br(Assembler::LT, POST_LOOP16); 3918 3919 __ bind(LOOP16); // small 16-byte load loop 3920 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 3921 __ sub(len, len, 16); 3922 __ orr(tmp2, tmp2, tmp3); 3923 __ tst(tmp2, UPPER_BIT_MASK); 3924 __ br(Assembler::NE, RET_TRUE); 3925 __ cmp(len, (u1)16); 3926 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 3927 3928 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 3929 __ cmp(len, (u1)8); 3930 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 3931 __ ldr(tmp3, Address(__ post(ary1, 8))); 3932 __ sub(len, len, 8); 3933 __ tst(tmp3, UPPER_BIT_MASK); 3934 __ br(Assembler::NE, RET_TRUE); 3935 3936 __ bind(POST_LOOP16_LOAD_TAIL); 3937 __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0 3938 __ ldr(tmp1, Address(ary1)); 3939 __ mov(tmp2, 64); 3940 __ sub(tmp4, tmp2, len, __ LSL, 3); 3941 __ lslv(tmp1, tmp1, tmp4); 3942 __ tst(tmp1, UPPER_BIT_MASK); 3943 __ br(Assembler::NE, RET_TRUE); 3944 // Fallthrough 3945 3946 __ bind(RET_FALSE); 3947 __ pop(spilled_regs, sp); 3948 __ leave(); 3949 __ mov(result, zr); 3950 __ ret(lr); 3951 3952 __ bind(RET_TRUE); 3953 __ pop(spilled_regs, sp); 3954 __ bind(RET_TRUE_NO_POP); 3955 __ leave(); 3956 __ mov(result, 1); 3957 __ ret(lr); 3958 3959 __ bind(DONE); 3960 __ pop(spilled_regs, sp); 3961 __ leave(); 3962 __ ret(lr); 3963 return entry; 3964 } 3965 3966 void generate_large_array_equals_loop_nonsimd(int loopThreshold, 3967 bool usePrefetch, Label &NOT_EQUAL) { 3968 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 3969 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 3970 tmp7 = r12, tmp8 = r13; 3971 Label LOOP; 3972 3973 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3974 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3975 __ bind(LOOP); 3976 if (usePrefetch) { 3977 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 3978 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 3979 } 3980 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 3981 __ eor(tmp1, tmp1, tmp2); 3982 __ eor(tmp3, tmp3, tmp4); 3983 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 3984 __ orr(tmp1, tmp1, tmp3); 3985 __ cbnz(tmp1, NOT_EQUAL); 3986 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3987 __ eor(tmp5, tmp5, tmp6); 3988 __ eor(tmp7, tmp7, tmp8); 3989 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3990 __ orr(tmp5, tmp5, tmp7); 3991 __ cbnz(tmp5, NOT_EQUAL); 3992 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 3993 __ eor(tmp1, tmp1, tmp2); 3994 __ eor(tmp3, tmp3, tmp4); 3995 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 3996 __ orr(tmp1, tmp1, tmp3); 3997 __ cbnz(tmp1, NOT_EQUAL); 3998 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3999 __ eor(tmp5, tmp5, tmp6); 
4000 __ sub(cnt1, cnt1, 8 * wordSize); 4001 __ eor(tmp7, tmp7, tmp8); 4002 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 4003 // tmp6 is not used. MacroAssembler::subs is used here (rather than 4004 // cmp) because subs allows an unlimited range of immediate operand. 4005 __ subs(tmp6, cnt1, loopThreshold); 4006 __ orr(tmp5, tmp5, tmp7); 4007 __ cbnz(tmp5, NOT_EQUAL); 4008 __ br(__ GE, LOOP); 4009 // post-loop 4010 __ eor(tmp1, tmp1, tmp2); 4011 __ eor(tmp3, tmp3, tmp4); 4012 __ orr(tmp1, tmp1, tmp3); 4013 __ sub(cnt1, cnt1, 2 * wordSize); 4014 __ cbnz(tmp1, NOT_EQUAL); 4015 } 4016 4017 void generate_large_array_equals_loop_simd(int loopThreshold, 4018 bool usePrefetch, Label &NOT_EQUAL) { 4019 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 4020 tmp2 = rscratch2; 4021 Label LOOP; 4022 4023 __ bind(LOOP); 4024 if (usePrefetch) { 4025 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 4026 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 4027 } 4028 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize))); 4029 __ sub(cnt1, cnt1, 8 * wordSize); 4030 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize))); 4031 __ subs(tmp1, cnt1, loopThreshold); 4032 __ eor(v0, __ T16B, v0, v4); 4033 __ eor(v1, __ T16B, v1, v5); 4034 __ eor(v2, __ T16B, v2, v6); 4035 __ eor(v3, __ T16B, v3, v7); 4036 __ orr(v0, __ T16B, v0, v1); 4037 __ orr(v1, __ T16B, v2, v3); 4038 __ orr(v0, __ T16B, v0, v1); 4039 __ umov(tmp1, v0, __ D, 0); 4040 __ umov(tmp2, v0, __ D, 1); 4041 __ orr(tmp1, tmp1, tmp2); 4042 __ cbnz(tmp1, NOT_EQUAL); 4043 __ br(__ GE, LOOP); 4044 } 4045 4046 // a1 = r1 - array1 address 4047 // a2 = r2 - array2 address 4048 // result = r0 - return value. Already contains "false" 4049 // cnt1 = r10 - amount of elements left to check, reduced by wordSize 4050 // r3-r5 are reserved temporary registers 4051 address generate_large_array_equals() { 4052 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 4053 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 4054 tmp7 = r12, tmp8 = r13; 4055 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP, 4056 SMALL_LOOP, POST_LOOP; 4057 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16; 4058 // calculate if at least 32 prefetched bytes are used 4059 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32; 4060 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE); 4061 RegSet spilled_regs = RegSet::range(tmp6, tmp8); 4062 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4, 4063 tmp5, tmp6, tmp7, tmp8); 4064 4065 __ align(CodeEntryAlignment); 4066 4067 StubCodeMark mark(this, "StubRoutines", "large_array_equals"); 4068 4069 address entry = __ pc(); 4070 __ enter(); 4071 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub 4072 // also advance pointers to use post-increment instead of pre-increment 4073 __ add(a1, a1, wordSize); 4074 __ add(a2, a2, wordSize); 4075 if (AvoidUnalignedAccesses) { 4076 // both implementations (SIMD/nonSIMD) use relatively large load 4077 // instructions (ld1/ldp), which have a huge penalty (up to 2x execution 4078 // time) on some CPUs when the address is not at least 16-byte aligned. 4079 // Arrays are currently 8-byte aligned, so, if needed, we can do an additional 4080 // 8-byte load for the 1st address to make it 16-byte aligned.
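// For example, if a1 enters with a1 % 16 == 8, the peeled 8-byte compare below advances both pointers by one word so that a1 is 16-byte aligned for the main loop; a2 is advanced by the same amount but its alignment is not otherwise adjusted.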
4081 Label ALIGNED16; 4082 __ tbz(a1, 3, ALIGNED16); 4083 __ ldr(tmp1, Address(__ post(a1, wordSize))); 4084 __ ldr(tmp2, Address(__ post(a2, wordSize))); 4085 __ sub(cnt1, cnt1, wordSize); 4086 __ eor(tmp1, tmp1, tmp2); 4087 __ cbnz(tmp1, NOT_EQUAL_NO_POP); 4088 __ bind(ALIGNED16); 4089 } 4090 if (UseSIMDForArrayEquals) { 4091 if (SoftwarePrefetchHintDistance >= 0) { 4092 __ subs(tmp1, cnt1, prefetchLoopThreshold); 4093 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 4094 generate_large_array_equals_loop_simd(prefetchLoopThreshold, 4095 /* prfm = */ true, NOT_EQUAL); 4096 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 4097 __ br(__ LT, TAIL); 4098 } 4099 __ bind(NO_PREFETCH_LARGE_LOOP); 4100 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold, 4101 /* prfm = */ false, NOT_EQUAL); 4102 } else { 4103 __ push(spilled_regs, sp); 4104 if (SoftwarePrefetchHintDistance >= 0) { 4105 __ subs(tmp1, cnt1, prefetchLoopThreshold); 4106 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 4107 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold, 4108 /* prfm = */ true, NOT_EQUAL); 4109 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 4110 __ br(__ LT, TAIL); 4111 } 4112 __ bind(NO_PREFETCH_LARGE_LOOP); 4113 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold, 4114 /* prfm = */ false, NOT_EQUAL); 4115 } 4116 __ bind(TAIL); 4117 __ cbz(cnt1, EQUAL); 4118 __ subs(cnt1, cnt1, wordSize); 4119 __ br(__ LE, POST_LOOP); 4120 __ bind(SMALL_LOOP); 4121 __ ldr(tmp1, Address(__ post(a1, wordSize))); 4122 __ ldr(tmp2, Address(__ post(a2, wordSize))); 4123 __ subs(cnt1, cnt1, wordSize); 4124 __ eor(tmp1, tmp1, tmp2); 4125 __ cbnz(tmp1, NOT_EQUAL); 4126 __ br(__ GT, SMALL_LOOP); 4127 __ bind(POST_LOOP); 4128 __ ldr(tmp1, Address(a1, cnt1)); 4129 __ ldr(tmp2, Address(a2, cnt1)); 4130 __ eor(tmp1, tmp1, tmp2); 4131 __ cbnz(tmp1, NOT_EQUAL); 4132 __ bind(EQUAL); 4133 __ mov(result, true); 4134 __ bind(NOT_EQUAL); 4135 if (!UseSIMDForArrayEquals) { 4136 __ pop(spilled_regs, sp); 4137 } 4138 __ bind(NOT_EQUAL_NO_POP); 4139 __ leave(); 4140 __ ret(lr); 4141 return entry; 4142 } 4143 4144 address generate_dsin_dcos(bool isCos) { 4145 __ align(CodeEntryAlignment); 4146 StubCodeMark mark(this, "StubRoutines", isCos ? 
"libmDcos" : "libmDsin"); 4147 address start = __ pc(); 4148 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw, 4149 (address)StubRoutines::aarch64::_two_over_pi, 4150 (address)StubRoutines::aarch64::_pio2, 4151 (address)StubRoutines::aarch64::_dsin_coef, 4152 (address)StubRoutines::aarch64::_dcos_coef); 4153 return start; 4154 } 4155 4156 address generate_dlog() { 4157 __ align(CodeEntryAlignment); 4158 StubCodeMark mark(this, "StubRoutines", "dlog"); 4159 address entry = __ pc(); 4160 FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4, 4161 vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19; 4162 Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4; 4163 __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3, 4164 tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5); 4165 return entry; 4166 } 4167 4168 // code for comparing 16 bytes of strings with same encoding 4169 void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) { 4170 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11; 4171 __ ldr(rscratch1, Address(__ post(str1, 8))); 4172 __ eor(rscratch2, tmp1, tmp2); 4173 __ ldr(cnt1, Address(__ post(str2, 8))); 4174 __ cbnz(rscratch2, DIFF1); 4175 __ ldr(tmp1, Address(__ post(str1, 8))); 4176 __ eor(rscratch2, rscratch1, cnt1); 4177 __ ldr(tmp2, Address(__ post(str2, 8))); 4178 __ cbnz(rscratch2, DIFF2); 4179 } 4180 4181 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding 4182 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 4183 Label &DIFF2) { 4184 Register cnt1 = r2, tmp2 = r11, tmp3 = r12; 4185 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; 4186 4187 __ ldrq(vtmp, Address(__ post(tmp2, 16))); 4188 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4189 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 4190 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 4191 4192 __ fmovd(tmpL, vtmp3); 4193 __ eor(rscratch2, tmp3, tmpL); 4194 __ cbnz(rscratch2, DIFF2); 4195 4196 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4197 __ umov(tmpL, vtmp3, __ D, 1); 4198 __ eor(rscratch2, tmpU, tmpL); 4199 __ cbnz(rscratch2, DIFF1); 4200 4201 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 4202 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4203 __ fmovd(tmpL, vtmp); 4204 __ eor(rscratch2, tmp3, tmpL); 4205 __ cbnz(rscratch2, DIFF2); 4206 4207 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4208 __ umov(tmpL, vtmp, __ D, 1); 4209 __ eor(rscratch2, tmpU, tmpL); 4210 __ cbnz(rscratch2, DIFF1); 4211 } 4212 4213 // r0 = result 4214 // r1 = str1 4215 // r2 = cnt1 4216 // r3 = str2 4217 // r4 = cnt2 4218 // r10 = tmp1 4219 // r11 = tmp2 4220 address generate_compare_long_string_different_encoding(bool isLU) { 4221 __ align(CodeEntryAlignment); 4222 StubCodeMark mark(this, "StubRoutines", isLU 4223 ? 
"compare_long_string_different_encoding LU" 4224 : "compare_long_string_different_encoding UL"); 4225 address entry = __ pc(); 4226 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, 4227 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH, 4228 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; 4229 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 4230 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; 4231 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; 4232 RegSet spilled_regs = RegSet::of(tmp3, tmp4); 4233 4234 int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2); 4235 4236 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); 4237 // cnt2 == amount of characters left to compare 4238 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL)) 4239 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 4240 __ add(str1, str1, isLU ? wordSize/2 : wordSize); 4241 __ add(str2, str2, isLU ? wordSize : wordSize/2); 4242 __ fmovd(isLU ? tmp1 : tmp2, vtmp); 4243 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case. 4244 __ eor(rscratch2, tmp1, tmp2); 4245 __ mov(rscratch1, tmp2); 4246 __ cbnz(rscratch2, CALCULATE_DIFFERENCE); 4247 Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison 4248 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison 4249 __ push(spilled_regs, sp); 4250 __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load 4251 __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load 4252 4253 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4254 4255 if (SoftwarePrefetchHintDistance >= 0) { 4256 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 4257 __ br(__ LT, NO_PREFETCH); 4258 __ bind(LARGE_LOOP_PREFETCH); 4259 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance)); 4260 __ mov(tmp4, 2); 4261 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 4262 __ bind(LARGE_LOOP_PREFETCH_REPEAT1); 4263 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4264 __ subs(tmp4, tmp4, 1); 4265 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1); 4266 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 4267 __ mov(tmp4, 2); 4268 __ bind(LARGE_LOOP_PREFETCH_REPEAT2); 4269 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4270 __ subs(tmp4, tmp4, 1); 4271 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2); 4272 __ sub(cnt2, cnt2, 64); 4273 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 4274 __ br(__ GE, LARGE_LOOP_PREFETCH); 4275 } 4276 __ cbz(cnt2, LOAD_LAST); // no characters left except last load 4277 __ bind(NO_PREFETCH); 4278 __ subs(cnt2, cnt2, 16); 4279 __ br(__ LT, TAIL); 4280 __ align(OptoLoopAlignment); 4281 __ bind(SMALL_LOOP); // smaller loop 4282 __ subs(cnt2, cnt2, 16); 4283 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4284 __ br(__ GE, SMALL_LOOP); 4285 __ cmn(cnt2, (u1)16); 4286 __ br(__ EQ, LOAD_LAST); 4287 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters) 4288 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string 4289 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string 4290 __ ldr(tmp3, Address(cnt1, -8)); 4291 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load 4292 __ b(LOAD_LAST); 4293 __ bind(DIFF2); 4294 __ mov(tmpU, tmp3); 4295 __ bind(DIFF1); 4296 __ pop(spilled_regs, sp); 4297 __ b(CALCULATE_DIFFERENCE); 4298 __ bind(LOAD_LAST); 4299 // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU. 
4300 // No need to load it again 4301 __ mov(tmpU, tmp3); 4302 __ pop(spilled_regs, sp); 4303 4304 // tmp2 points to the address of the last 4 Latin1 characters right now 4305 __ ldrs(vtmp, Address(tmp2)); 4306 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 4307 __ fmovd(tmpL, vtmp); 4308 4309 __ eor(rscratch2, tmpU, tmpL); 4310 __ cbz(rscratch2, DONE); 4311 4312 // Find the first different characters in the longwords and 4313 // compute their difference. 4314 __ bind(CALCULATE_DIFFERENCE); 4315 __ rev(rscratch2, rscratch2); 4316 __ clz(rscratch2, rscratch2); 4317 __ andr(rscratch2, rscratch2, -16); 4318 __ lsrv(tmp1, tmp1, rscratch2); 4319 __ uxthw(tmp1, tmp1); 4320 __ lsrv(rscratch1, rscratch1, rscratch2); 4321 __ uxthw(rscratch1, rscratch1); 4322 __ subw(result, tmp1, rscratch1); 4323 __ bind(DONE); 4324 __ ret(lr); 4325 return entry; 4326 } 4327 4328 address generate_method_entry_barrier() { 4329 __ align(CodeEntryAlignment); 4330 StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier"); 4331 4332 Label deoptimize_label; 4333 4334 address start = __ pc(); 4335 4336 __ set_last_Java_frame(sp, rfp, lr, rscratch1); 4337 4338 __ enter(); 4339 __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr 4340 4341 __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc} 4342 4343 __ push_call_clobbered_registers(); 4344 4345 __ mov(c_rarg0, rscratch2); 4346 __ call_VM_leaf 4347 (CAST_FROM_FN_PTR 4348 (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1); 4349 4350 __ reset_last_Java_frame(true); 4351 4352 __ mov(rscratch1, r0); 4353 4354 __ pop_call_clobbered_registers(); 4355 4356 __ cbnz(rscratch1, deoptimize_label); 4357 4358 __ leave(); 4359 __ ret(lr); 4360 4361 __ BIND(deoptimize_label); 4362 4363 __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize)); 4364 __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize)); 4365 4366 __ mov(sp, rscratch1); 4367 __ br(rscratch2); 4368 4369 return start; 4370 } 4371 4372 // r0 = result 4373 // r1 = str1 4374 // r2 = cnt1 4375 // r3 = str2 4376 // r4 = cnt2 4377 // r10 = tmp1 4378 // r11 = tmp2 4379 address generate_compare_long_string_same_encoding(bool isLL) { 4380 __ align(CodeEntryAlignment); 4381 StubCodeMark mark(this, "StubRoutines", isLL 4382 ? "compare_long_string_same_encoding LL" 4383 : "compare_long_string_same_encoding UU"); 4384 address entry = __ pc(); 4385 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 4386 tmp1 = r10, tmp2 = r11; 4387 Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL, 4388 LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF, 4389 DIFF_LAST_POSITION, DIFF_LAST_POSITION2; 4390 // exit from large loop when less than 64 bytes left to read or we're about 4391 // to prefetch memory behind array border 4392 int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); 4393 // cnt1/cnt2 contains amount of characters to compare. cnt1 can be re-used 4394 // update cnt2 counter with already loaded 8 bytes 4395 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2)); 4396 // update pointers, because of previous read 4397 __ add(str1, str1, wordSize); 4398 __ add(str2, str2, wordSize); 4399 if (SoftwarePrefetchHintDistance >= 0) { 4400 __ bind(LARGE_LOOP_PREFETCH); 4401 __ prfm(Address(str1, SoftwarePrefetchHintDistance)); 4402 __ prfm(Address(str2, SoftwarePrefetchHintDistance)); 4403 compare_string_16_bytes_same(DIFF, DIFF2); 4404 compare_string_16_bytes_same(DIFF, DIFF2); 4405 __ sub(cnt2, cnt2, isLL ? 
64 : 32); 4406 compare_string_16_bytes_same(DIFF, DIFF2); 4407 __ subs(rscratch2, cnt2, largeLoopExitCondition); 4408 compare_string_16_bytes_same(DIFF, DIFF2); 4409 __ br(__ GT, LARGE_LOOP_PREFETCH); 4410 __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left? 4411 } 4412 // less than 16 bytes left? 4413 __ subs(cnt2, cnt2, isLL ? 16 : 8); 4414 __ br(__ LT, TAIL); 4415 __ align(OptoLoopAlignment); 4416 __ bind(SMALL_LOOP); 4417 compare_string_16_bytes_same(DIFF, DIFF2); 4418 __ subs(cnt2, cnt2, isLL ? 16 : 8); 4419 __ br(__ GE, SMALL_LOOP); 4420 __ bind(TAIL); 4421 __ adds(cnt2, cnt2, isLL ? 16 : 8); 4422 __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF); 4423 __ subs(cnt2, cnt2, isLL ? 8 : 4); 4424 __ br(__ LE, CHECK_LAST); 4425 __ eor(rscratch2, tmp1, tmp2); 4426 __ cbnz(rscratch2, DIFF); 4427 __ ldr(tmp1, Address(__ post(str1, 8))); 4428 __ ldr(tmp2, Address(__ post(str2, 8))); 4429 __ sub(cnt2, cnt2, isLL ? 8 : 4); 4430 __ bind(CHECK_LAST); 4431 if (!isLL) { 4432 __ add(cnt2, cnt2, cnt2); // now in bytes 4433 } 4434 __ eor(rscratch2, tmp1, tmp2); 4435 __ cbnz(rscratch2, DIFF); 4436 __ ldr(rscratch1, Address(str1, cnt2)); 4437 __ ldr(cnt1, Address(str2, cnt2)); 4438 __ eor(rscratch2, rscratch1, cnt1); 4439 __ cbz(rscratch2, LENGTH_DIFF); 4440 // Find the first different characters in the longwords and 4441 // compute their difference. 4442 __ bind(DIFF2); 4443 __ rev(rscratch2, rscratch2); 4444 __ clz(rscratch2, rscratch2); 4445 __ andr(rscratch2, rscratch2, isLL ? -8 : -16); 4446 __ lsrv(rscratch1, rscratch1, rscratch2); 4447 if (isLL) { 4448 __ lsrv(cnt1, cnt1, rscratch2); 4449 __ uxtbw(rscratch1, rscratch1); 4450 __ uxtbw(cnt1, cnt1); 4451 } else { 4452 __ lsrv(cnt1, cnt1, rscratch2); 4453 __ uxthw(rscratch1, rscratch1); 4454 __ uxthw(cnt1, cnt1); 4455 } 4456 __ subw(result, rscratch1, cnt1); 4457 __ b(LENGTH_DIFF); 4458 __ bind(DIFF); 4459 __ rev(rscratch2, rscratch2); 4460 __ clz(rscratch2, rscratch2); 4461 __ andr(rscratch2, rscratch2, isLL ? 
-8 : -16); 4462 __ lsrv(tmp1, tmp1, rscratch2); 4463 if (isLL) { 4464 __ lsrv(tmp2, tmp2, rscratch2); 4465 __ uxtbw(tmp1, tmp1); 4466 __ uxtbw(tmp2, tmp2); 4467 } else { 4468 __ lsrv(tmp2, tmp2, rscratch2); 4469 __ uxthw(tmp1, tmp1); 4470 __ uxthw(tmp2, tmp2); 4471 } 4472 __ subw(result, tmp1, tmp2); 4473 __ b(LENGTH_DIFF); 4474 __ bind(LAST_CHECK_AND_LENGTH_DIFF); 4475 __ eor(rscratch2, tmp1, tmp2); 4476 __ cbnz(rscratch2, DIFF); 4477 __ bind(LENGTH_DIFF); 4478 __ ret(lr); 4479 return entry; 4480 } 4481 4482 void generate_compare_long_strings() { 4483 StubRoutines::aarch64::_compare_long_string_LL 4484 = generate_compare_long_string_same_encoding(true); 4485 StubRoutines::aarch64::_compare_long_string_UU 4486 = generate_compare_long_string_same_encoding(false); 4487 StubRoutines::aarch64::_compare_long_string_LU 4488 = generate_compare_long_string_different_encoding(true); 4489 StubRoutines::aarch64::_compare_long_string_UL 4490 = generate_compare_long_string_different_encoding(false); 4491 } 4492 4493 // R0 = result 4494 // R1 = str2 4495 // R2 = cnt1 4496 // R3 = str1 4497 // R4 = cnt2 4498 // This generic linear code uses a few additional ideas that make it faster: 4499 // 1) we can safely keep at least the 1st register of the pattern (since length >= 8) 4500 // in order to skip the initial load (helps on systems with 1 ld pipeline) 4501 // 2) we can use a "fast" algorithm for finding the first symbol with fewer branches 4502 // (1 branch per loaded register instead of a branch per symbol): the SWAR test 4503 // (x - 0x01..01) & ~x & 0x80..80, with x = loaded chunk ^ repeated first character, 4504 // flags candidate match positions; this is where constants like 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from 4505 // 3) after loading and analyzing the 1st register of the source string, it can be 4506 // used to search for every 1st-character entry, saving a few loads in 4507 // comparison with a "simpler-but-slower" implementation 4508 // 4) in order to avoid lots of push/pop operations, the code below heavily 4509 // re-uses/re-initializes/compresses register values, which makes the code 4510 // larger and a bit less readable; however, most of the extra operations are 4511 // issued during loads or branches, so the penalty is minimal 4512 address generate_string_indexof_linear(bool str1_isL, bool str2_isL) { 4513 const char* stubName = str1_isL 4514 ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul") 4515 : "indexof_linear_uu"; 4516 __ align(CodeEntryAlignment); 4517 StubCodeMark mark(this, "StubRoutines", stubName); 4518 address entry = __ pc(); 4519 4520 int str1_chr_size = str1_isL ? 1 : 2; 4521 int str2_chr_size = str2_isL ? 1 : 2; 4522 int str1_chr_shift = str1_isL ? 0 : 1; 4523 int str2_chr_shift = str2_isL ? 0 : 1; 4524 bool isL = str1_isL && str2_isL; 4525 // parameters 4526 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4; 4527 // temporary registers 4528 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23; 4529 RegSet spilled_regs = RegSet::range(tmp1, tmp4); 4530 // redefinitions 4531 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3; 4532 4533 __ push(spilled_regs, sp); 4534 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, 4535 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 4536 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 4537 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 4538 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 4539 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 4540 // Read whole register from str1.
It is safe, because length >=8 here 4541 __ ldr(ch1, Address(str1)); 4542 // Read whole register from str2. It is safe, because length >=8 here 4543 __ ldr(ch2, Address(str2)); 4544 __ sub(cnt2, cnt2, cnt1); 4545 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF); 4546 if (str1_isL != str2_isL) { 4547 __ eor(v0, __ T16B, v0, v0); 4548 } 4549 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 4550 __ mul(first, first, tmp1); 4551 // check if we have less than 1 register to check 4552 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1); 4553 if (str1_isL != str2_isL) { 4554 __ fmovd(v1, ch1); 4555 } 4556 __ br(__ LE, L_SMALL); 4557 __ eor(ch2, first, ch2); 4558 if (str1_isL != str2_isL) { 4559 __ zip1(v1, __ T16B, v1, v0); 4560 } 4561 __ sub(tmp2, ch2, tmp1); 4562 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4563 __ bics(tmp2, tmp2, ch2); 4564 if (str1_isL != str2_isL) { 4565 __ fmovd(ch1, v1); 4566 } 4567 __ br(__ NE, L_HAS_ZERO); 4568 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 4569 __ add(result, result, wordSize/str2_chr_size); 4570 __ add(str2, str2, wordSize); 4571 __ br(__ LT, L_POST_LOOP); 4572 __ BIND(L_LOOP); 4573 __ ldr(ch2, Address(str2)); 4574 __ eor(ch2, first, ch2); 4575 __ sub(tmp2, ch2, tmp1); 4576 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4577 __ bics(tmp2, tmp2, ch2); 4578 __ br(__ NE, L_HAS_ZERO); 4579 __ BIND(L_LOOP_PROCEED); 4580 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 4581 __ add(str2, str2, wordSize); 4582 __ add(result, result, wordSize/str2_chr_size); 4583 __ br(__ GE, L_LOOP); 4584 __ BIND(L_POST_LOOP); 4585 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check 4586 __ br(__ LE, NOMATCH); 4587 __ ldr(ch2, Address(str2)); 4588 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 4589 __ eor(ch2, first, ch2); 4590 __ sub(tmp2, ch2, tmp1); 4591 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4592 __ mov(tmp4, -1); // all bits set 4593 __ b(L_SMALL_PROCEED); 4594 __ align(OptoLoopAlignment); 4595 __ BIND(L_SMALL); 4596 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 4597 __ eor(ch2, first, ch2); 4598 if (str1_isL != str2_isL) { 4599 __ zip1(v1, __ T16B, v1, v0); 4600 } 4601 __ sub(tmp2, ch2, tmp1); 4602 __ mov(tmp4, -1); // all bits set 4603 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4604 if (str1_isL != str2_isL) { 4605 __ fmovd(ch1, v1); // move converted 4 symbols 4606 } 4607 __ BIND(L_SMALL_PROCEED); 4608 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits. 4609 __ bic(tmp2, tmp2, ch2); 4610 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check 4611 __ rbit(tmp2, tmp2); 4612 __ br(__ EQ, NOMATCH); 4613 __ BIND(L_SMALL_HAS_ZERO_LOOP); 4614 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's 4615 __ cmp(cnt1, u1(wordSize/str2_chr_size)); 4616 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2); 4617 if (str2_isL) { // LL 4618 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 4619 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 4620 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 4621 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 4622 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4623 } else { 4624 __ mov(ch2, 0xE); // all bits in byte set except last one 4625 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4626 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. 
Safe. 4627 __ lslv(tmp2, tmp2, tmp4); 4628 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4629 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4630 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4631 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4632 } 4633 __ cmp(ch1, ch2); 4634 __ mov(tmp4, wordSize/str2_chr_size); 4635 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4636 __ BIND(L_SMALL_CMP_LOOP); 4637 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 4638 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 4639 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 4640 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 4641 __ add(tmp4, tmp4, 1); 4642 __ cmp(tmp4, cnt1); 4643 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP); 4644 __ cmp(first, ch2); 4645 __ br(__ EQ, L_SMALL_CMP_LOOP); 4646 __ BIND(L_SMALL_CMP_LOOP_NOMATCH); 4647 __ cbz(tmp2, NOMATCH); // no more matches. exit 4648 __ clz(tmp4, tmp2); 4649 __ add(result, result, 1); // advance index 4650 __ add(str2, str2, str2_chr_size); // advance pointer 4651 __ b(L_SMALL_HAS_ZERO_LOOP); 4652 __ align(OptoLoopAlignment); 4653 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP); 4654 __ cmp(first, ch2); 4655 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4656 __ b(DONE); 4657 __ align(OptoLoopAlignment); 4658 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2); 4659 if (str2_isL) { // LL 4660 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 4661 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 4662 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 4663 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 4664 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4665 } else { 4666 __ mov(ch2, 0xE); // all bits in byte set except last one 4667 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4668 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4669 __ lslv(tmp2, tmp2, tmp4); 4670 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4671 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4672 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4673 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4674 } 4675 __ cmp(ch1, ch2); 4676 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4677 __ b(DONE); 4678 __ align(OptoLoopAlignment); 4679 __ BIND(L_HAS_ZERO); 4680 __ rbit(tmp2, tmp2); 4681 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's 4682 // Now, perform compression of counters(cnt2 and cnt1) into one register. 4683 // It's fine because both counters are 32bit and are not changed in this 4684 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop. 4685 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2); 4686 __ sub(result, result, 1); 4687 __ BIND(L_HAS_ZERO_LOOP); 4688 __ mov(cnt1, wordSize/str2_chr_size); 4689 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2); 4690 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare 4691 if (str2_isL) { 4692 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 4693 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
4694 __ lslv(tmp2, tmp2, tmp4); 4695 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4696 __ add(tmp4, tmp4, 1); 4697 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4698 __ lsl(tmp2, tmp2, 1); 4699 __ mov(tmp4, wordSize/str2_chr_size); 4700 } else { 4701 __ mov(ch2, 0xE); 4702 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4703 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4704 __ lslv(tmp2, tmp2, tmp4); 4705 __ add(tmp4, tmp4, 1); 4706 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4707 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 4708 __ lsl(tmp2, tmp2, 1); 4709 __ mov(tmp4, wordSize/str2_chr_size); 4710 __ sub(str2, str2, str2_chr_size); 4711 } 4712 __ cmp(ch1, ch2); 4713 __ mov(tmp4, wordSize/str2_chr_size); 4714 __ br(__ NE, L_CMP_LOOP_NOMATCH); 4715 __ BIND(L_CMP_LOOP); 4716 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 4717 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 4718 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 4719 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 4720 __ add(tmp4, tmp4, 1); 4721 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2); 4722 __ br(__ GE, L_CMP_LOOP_LAST_CMP); 4723 __ cmp(cnt1, ch2); 4724 __ br(__ EQ, L_CMP_LOOP); 4725 __ BIND(L_CMP_LOOP_NOMATCH); 4726 // here we're not matched 4727 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop 4728 __ clz(tmp4, tmp2); 4729 __ add(str2, str2, str2_chr_size); // advance pointer 4730 __ b(L_HAS_ZERO_LOOP); 4731 __ align(OptoLoopAlignment); 4732 __ BIND(L_CMP_LOOP_LAST_CMP); 4733 __ cmp(cnt1, ch2); 4734 __ br(__ NE, L_CMP_LOOP_NOMATCH); 4735 __ b(DONE); 4736 __ align(OptoLoopAlignment); 4737 __ BIND(L_CMP_LOOP_LAST_CMP2); 4738 if (str2_isL) { 4739 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 4740 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4741 __ lslv(tmp2, tmp2, tmp4); 4742 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4743 __ add(tmp4, tmp4, 1); 4744 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4745 __ lsl(tmp2, tmp2, 1); 4746 } else { 4747 __ mov(ch2, 0xE); 4748 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4749 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4750 __ lslv(tmp2, tmp2, tmp4); 4751 __ add(tmp4, tmp4, 1); 4752 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4753 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 4754 __ lsl(tmp2, tmp2, 1); 4755 __ sub(str2, str2, str2_chr_size); 4756 } 4757 __ cmp(ch1, ch2); 4758 __ br(__ NE, L_CMP_LOOP_NOMATCH); 4759 __ b(DONE); 4760 __ align(OptoLoopAlignment); 4761 __ BIND(L_HAS_ZERO_LOOP_NOMATCH); 4762 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until 4763 // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP, 4764 // so, result was increased at max by wordSize/str2_chr_size - 1, so, 4765 // respective high bit wasn't changed. L_LOOP_PROCEED will increase 4766 // result by analyzed characters value, so, we can just reset lower bits 4767 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL 4768 // 2) restore cnt1 and cnt2 values from "compressed" cnt2 4769 // 3) advance str2 value to represent next str2 octet. result & 7/3 is 4770 // index of last analyzed substring inside current octet. 
So, str2 in at 4771 // respective start address. We need to advance it to next octet 4772 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed 4773 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2); 4774 __ bfm(result, zr, 0, 2 - str2_chr_shift); 4775 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2 4776 __ movw(cnt2, cnt2); 4777 __ b(L_LOOP_PROCEED); 4778 __ align(OptoLoopAlignment); 4779 __ BIND(NOMATCH); 4780 __ mov(result, -1); 4781 __ BIND(DONE); 4782 __ pop(spilled_regs, sp); 4783 __ ret(lr); 4784 return entry; 4785 } 4786 4787 void generate_string_indexof_stubs() { 4788 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true); 4789 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false); 4790 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); 4791 } 4792 4793 void inflate_and_store_2_fp_registers(bool generatePrfm, 4794 FloatRegister src1, FloatRegister src2) { 4795 Register dst = r1; 4796 __ zip1(v1, __ T16B, src1, v0); 4797 __ zip2(v2, __ T16B, src1, v0); 4798 if (generatePrfm) { 4799 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM); 4800 } 4801 __ zip1(v3, __ T16B, src2, v0); 4802 __ zip2(v4, __ T16B, src2, v0); 4803 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64))); 4804 } 4805 4806 // R0 = src 4807 // R1 = dst 4808 // R2 = len 4809 // R3 = len >> 3 4810 // V0 = 0 4811 // v1 = loaded 8 bytes 4812 address generate_large_byte_array_inflate() { 4813 __ align(CodeEntryAlignment); 4814 StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate"); 4815 address entry = __ pc(); 4816 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE; 4817 Register src = r0, dst = r1, len = r2, octetCounter = r3; 4818 const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4; 4819 4820 // do one more 8-byte read to have address 16-byte aligned in most cases 4821 // also use single store instruction 4822 __ ldrd(v2, __ post(src, 8)); 4823 __ sub(octetCounter, octetCounter, 2); 4824 __ zip1(v1, __ T16B, v1, v0); 4825 __ zip1(v2, __ T16B, v2, v0); 4826 __ st1(v1, v2, __ T16B, __ post(dst, 32)); 4827 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 4828 __ subs(rscratch1, octetCounter, large_loop_threshold); 4829 __ br(__ LE, LOOP_START); 4830 __ b(LOOP_PRFM_START); 4831 __ bind(LOOP_PRFM); 4832 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 4833 __ bind(LOOP_PRFM_START); 4834 __ prfm(Address(src, SoftwarePrefetchHintDistance)); 4835 __ sub(octetCounter, octetCounter, 8); 4836 __ subs(rscratch1, octetCounter, large_loop_threshold); 4837 inflate_and_store_2_fp_registers(true, v3, v4); 4838 inflate_and_store_2_fp_registers(true, v5, v6); 4839 __ br(__ GT, LOOP_PRFM); 4840 __ cmp(octetCounter, (u1)8); 4841 __ br(__ LT, DONE); 4842 __ bind(LOOP); 4843 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 4844 __ bind(LOOP_START); 4845 __ sub(octetCounter, octetCounter, 8); 4846 __ cmp(octetCounter, (u1)8); 4847 inflate_and_store_2_fp_registers(false, v3, v4); 4848 inflate_and_store_2_fp_registers(false, v5, v6); 4849 __ br(__ GE, LOOP); 4850 __ bind(DONE); 4851 __ ret(lr); 4852 return entry; 4853 } 4854 4855 /** 4856 * Arguments: 4857 * 4858 * Input: 4859 * c_rarg0 - current state address 4860 * c_rarg1 - H key address 4861 * c_rarg2 - data address 4862 * c_rarg3 - number of blocks 4863 * 4864 * Output: 4865 * Updated state at c_rarg0 4866 */ 4867 address 
generate_ghash_processBlocks() { 4868 // Bafflingly, GCM uses little-endian for the byte order, but 4869 // big-endian for the bit order. For example, the polynomial 1 is 4870 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 4871 // 4872 // So, we must either reverse the bytes in each word and do 4873 // everything big-endian or reverse the bits in each byte and do 4874 // it little-endian. On AArch64 it's more idiomatic to reverse 4875 // the bits in each byte (we have an instruction, RBIT, to do 4876 // that) and keep the data in little-endian bit order throught the 4877 // calculation, bit-reversing the inputs and outputs. 4878 4879 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks"); 4880 __ align(wordSize * 2); 4881 address p = __ pc(); 4882 __ emit_int64(0x87); // The low-order bits of the field 4883 // polynomial (i.e. p = z^7+z^2+z+1) 4884 // repeated in the low and high parts of a 4885 // 128-bit vector 4886 __ emit_int64(0x87); 4887 4888 __ align(CodeEntryAlignment); 4889 address start = __ pc(); 4890 4891 Register state = c_rarg0; 4892 Register subkeyH = c_rarg1; 4893 Register data = c_rarg2; 4894 Register blocks = c_rarg3; 4895 4896 FloatRegister vzr = v30; 4897 __ eor(vzr, __ T16B, vzr, vzr); // zero register 4898 4899 __ ldrq(v0, Address(state)); 4900 __ ldrq(v1, Address(subkeyH)); 4901 4902 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 4903 __ rbit(v0, __ T16B, v0); 4904 __ rev64(v1, __ T16B, v1); 4905 __ rbit(v1, __ T16B, v1); 4906 4907 __ ldrq(v26, p); 4908 4909 __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 4910 __ eor(v16, __ T16B, v16, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 4911 4912 { 4913 Label L_ghash_loop; 4914 __ bind(L_ghash_loop); 4915 4916 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 4917 // reversing each byte 4918 __ rbit(v2, __ T16B, v2); 4919 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 4920 4921 // Multiply state in v2 by subkey in v1 4922 ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 4923 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16, 4924 /*temps*/v6, v20, v18, v21); 4925 // Reduce v7:v5 by the field polynomial 4926 ghash_reduce(v0, v5, v7, v26, vzr, v20); 4927 4928 __ sub(blocks, blocks, 1); 4929 __ cbnz(blocks, L_ghash_loop); 4930 } 4931 4932 // The bit-reversed result is at this point in v0 4933 __ rev64(v1, __ T16B, v0); 4934 __ rbit(v1, __ T16B, v1); 4935 4936 __ st1(v1, __ T16B, state); 4937 __ ret(lr); 4938 4939 return start; 4940 } 4941 4942 // Continuation point for throwing of implicit exceptions that are 4943 // not handled in the current activation. Fabricates an exception 4944 // oop and initiates normal exception dispatching in this 4945 // frame. Since we need to preserve callee-saved values (currently 4946 // only for C2, but done for C1 as well) we need a callee-saved oop 4947 // map and therefore have to make these stubs into RuntimeStubs 4948 // rather than BufferBlobs. If the compiler needs all registers to 4949 // be preserved between the fault point and the exception handler 4950 // then it must assume responsibility for that in 4951 // AbstractCompiler::continuation_for_implicit_null_exception or 4952 // continuation_for_implicit_division_by_zero_exception. 
All other 4953 // implicit exceptions (e.g., NullPointerException or 4954 // AbstractMethodError on entry) are either at call sites or 4955 // otherwise assume that stack unwinding will be initiated, so 4956 // caller saved registers were assumed volatile in the compiler. 4957 4958 #undef __ 4959 #define __ masm-> 4960 4961 address generate_throw_exception(const char* name, 4962 address runtime_entry, 4963 Register arg1 = noreg, 4964 Register arg2 = noreg) { 4965 // Information about frame layout at time of blocking runtime call. 4966 // Note that we only have to preserve callee-saved registers since 4967 // the compilers are responsible for supplying a continuation point 4968 // if they expect all registers to be preserved. 4969 // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0 4970 enum layout { 4971 rfp_off = 0, 4972 rfp_off2, 4973 return_off, 4974 return_off2, 4975 framesize // inclusive of return address 4976 }; 4977 4978 int insts_size = 512; 4979 int locs_size = 64; 4980 4981 CodeBuffer code(name, insts_size, locs_size); 4982 OopMapSet* oop_maps = new OopMapSet(); 4983 MacroAssembler* masm = new MacroAssembler(&code); 4984 4985 address start = __ pc(); 4986 4987 // This is an inlined and slightly modified version of call_VM 4988 // which has the ability to fetch the return PC out of 4989 // thread-local storage and also sets up last_Java_sp slightly 4990 // differently than the real call_VM 4991 4992 __ enter(); // Save FP and LR before call 4993 4994 assert(is_even(framesize/2), "sp not 16-byte aligned"); 4995 4996 // lr and fp are already in place 4997 __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog 4998 4999 int frame_complete = __ pc() - start; 5000 5001 // Set up last_Java_sp and last_Java_fp 5002 address the_pc = __ pc(); 5003 __ set_last_Java_frame(sp, rfp, the_pc, rscratch1); 5004 5005 // Call runtime 5006 if (arg1 != noreg) { 5007 assert(arg2 != c_rarg1, "clobbered"); 5008 __ mov(c_rarg1, arg1); 5009 } 5010 if (arg2 != noreg) { 5011 __ mov(c_rarg2, arg2); 5012 } 5013 __ mov(c_rarg0, rthread); 5014 BLOCK_COMMENT("call runtime_entry"); 5015 __ mov(rscratch1, runtime_entry); 5016 __ blr(rscratch1); 5017 5018 // Generate oop map 5019 OopMap* map = new OopMap(framesize, 0); 5020 5021 oop_maps->add_gc_map(the_pc - start, map); 5022 5023 __ reset_last_Java_frame(true); 5024 __ maybe_isb(); 5025 5026 if (UseSVE > 0) { 5027 // Reinitialize the ptrue predicate register, in case the external runtime 5028 // call clobbers ptrue reg, as we may return to SVE compiled code. 
5029 __ reinitialize_ptrue();
5030 }
5031
5032 __ leave();
5033
5034 // check for pending exceptions
5035 #ifdef ASSERT
5036 Label L;
5037 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
5038 __ cbnz(rscratch1, L);
5039 __ should_not_reach_here();
5040 __ bind(L);
5041 #endif // ASSERT
5042 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry()));
5043
5044
5045 // codeBlob framesize is in words (not VMRegImpl::slot_size)
5046 RuntimeStub* stub =
5047 RuntimeStub::new_runtime_stub(name,
5048 &code,
5049 frame_complete,
5050 (framesize >> (LogBytesPerWord - LogBytesPerInt)),
5051 oop_maps, false);
5052 return stub->entry_point();
5053 }
5054
5055 class MontgomeryMultiplyGenerator : public MacroAssembler {
5056
5057 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn,
5058 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj;
5059
5060 RegSet _toSave;
5061 bool _squaring;
5062
5063 public:
5064 MontgomeryMultiplyGenerator (Assembler *as, bool squaring)
5065 : MacroAssembler(as->code()), _squaring(squaring) {
5066
5067 // Register allocation
5068
5069 Register reg = c_rarg0;
5070 Pa_base = reg; // Argument registers
5071 if (squaring)
5072 Pb_base = Pa_base;
5073 else
5074 Pb_base = ++reg;
5075 Pn_base = ++reg;
5076 Rlen = ++reg;
5077 inv = ++reg;
5078 Pm_base = ++reg;
5079
5080 // Working registers:
5081 Ra = ++reg; // The current digit of a, b, n, and m.
5082 Rb = ++reg;
5083 Rm = ++reg;
5084 Rn = ++reg;
5085
5086 Pa = ++reg; // Pointers to the current/next digit of a, b, n, and m.
5087 Pb = ++reg;
5088 Pm = ++reg;
5089 Pn = ++reg;
5090
5091 t0 = ++reg; // Three registers which form a
5092 t1 = ++reg; // triple-precision accumulator.
5093 t2 = ++reg;
5094
5095 Ri = ++reg; // Inner and outer loop indexes.
5096 Rj = ++reg;
5097
5098 Rhi_ab = ++reg; // Product registers: low and high parts
5099 Rlo_ab = ++reg; // of a*b and m*n.
5100 Rhi_mn = ++reg;
5101 Rlo_mn = ++reg;
5102
5103 // r19 and up are callee-saved.
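// (Note: Pm_base is added to the save set below because generate_multiply()
// repoints it at an on-stack scratch area while computing; the caller's
// value is needed again at the end, when the result is copied back.)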
5104 _toSave = RegSet::range(r19, reg) + Pm_base; 5105 } 5106 5107 private: 5108 void save_regs() { 5109 push(_toSave, sp); 5110 } 5111 5112 void restore_regs() { 5113 pop(_toSave, sp); 5114 } 5115 5116 template <typename T> 5117 void unroll_2(Register count, T block) { 5118 Label loop, end, odd; 5119 tbnz(count, 0, odd); 5120 cbz(count, end); 5121 align(16); 5122 bind(loop); 5123 (this->*block)(); 5124 bind(odd); 5125 (this->*block)(); 5126 subs(count, count, 2); 5127 br(Assembler::GT, loop); 5128 bind(end); 5129 } 5130 5131 template <typename T> 5132 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 5133 Label loop, end, odd; 5134 tbnz(count, 0, odd); 5135 cbz(count, end); 5136 align(16); 5137 bind(loop); 5138 (this->*block)(d, s, tmp); 5139 bind(odd); 5140 (this->*block)(d, s, tmp); 5141 subs(count, count, 2); 5142 br(Assembler::GT, loop); 5143 bind(end); 5144 } 5145 5146 void pre1(RegisterOrConstant i) { 5147 block_comment("pre1"); 5148 // Pa = Pa_base; 5149 // Pb = Pb_base + i; 5150 // Pm = Pm_base; 5151 // Pn = Pn_base + i; 5152 // Ra = *Pa; 5153 // Rb = *Pb; 5154 // Rm = *Pm; 5155 // Rn = *Pn; 5156 ldr(Ra, Address(Pa_base)); 5157 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 5158 ldr(Rm, Address(Pm_base)); 5159 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 5160 lea(Pa, Address(Pa_base)); 5161 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 5162 lea(Pm, Address(Pm_base)); 5163 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 5164 5165 // Zero the m*n result. 5166 mov(Rhi_mn, zr); 5167 mov(Rlo_mn, zr); 5168 } 5169 5170 // The core multiply-accumulate step of a Montgomery 5171 // multiplication. The idea is to schedule operations as a 5172 // pipeline so that instructions with long latencies (loads and 5173 // multiplies) have time to complete before their results are 5174 // used. This most benefits in-order implementations of the 5175 // architecture but out-of-order ones also benefit. 5176 void step() { 5177 block_comment("step"); 5178 // MACC(Ra, Rb, t0, t1, t2); 5179 // Ra = *++Pa; 5180 // Rb = *--Pb; 5181 umulh(Rhi_ab, Ra, Rb); 5182 mul(Rlo_ab, Ra, Rb); 5183 ldr(Ra, pre(Pa, wordSize)); 5184 ldr(Rb, pre(Pb, -wordSize)); 5185 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 5186 // previous iteration. 5187 // MACC(Rm, Rn, t0, t1, t2); 5188 // Rm = *++Pm; 5189 // Rn = *--Pn; 5190 umulh(Rhi_mn, Rm, Rn); 5191 mul(Rlo_mn, Rm, Rn); 5192 ldr(Rm, pre(Pm, wordSize)); 5193 ldr(Rn, pre(Pn, -wordSize)); 5194 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 5195 } 5196 5197 void post1() { 5198 block_comment("post1"); 5199 5200 // MACC(Ra, Rb, t0, t1, t2); 5201 // Ra = *++Pa; 5202 // Rb = *--Pb; 5203 umulh(Rhi_ab, Ra, Rb); 5204 mul(Rlo_ab, Ra, Rb); 5205 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 5206 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 5207 5208 // *Pm = Rm = t0 * inv; 5209 mul(Rm, t0, inv); 5210 str(Rm, Address(Pm)); 5211 5212 // MACC(Rm, Rn, t0, t1, t2); 5213 // t0 = t1; t1 = t2; t2 = 0; 5214 umulh(Rhi_mn, Rm, Rn); 5215 5216 #ifndef PRODUCT 5217 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 5218 { 5219 mul(Rlo_mn, Rm, Rn); 5220 add(Rlo_mn, t0, Rlo_mn); 5221 Label ok; 5222 cbz(Rlo_mn, ok); { 5223 stop("broken Montgomery multiply"); 5224 } bind(ok); 5225 } 5226 #endif 5227 // We have very carefully set things up so that 5228 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 5229 // the lower half of Rm * Rn because we know the result already: 5230 // it must be -t0. 
t0 + (-t0) must generate a carry iff 5231 // t0 != 0. So, rather than do a mul and an adds we just set 5232 // the carry flag iff t0 is nonzero. 5233 // 5234 // mul(Rlo_mn, Rm, Rn); 5235 // adds(zr, t0, Rlo_mn); 5236 subs(zr, t0, 1); // Set carry iff t0 is nonzero 5237 adcs(t0, t1, Rhi_mn); 5238 adc(t1, t2, zr); 5239 mov(t2, zr); 5240 } 5241 5242 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 5243 block_comment("pre2"); 5244 // Pa = Pa_base + i-len; 5245 // Pb = Pb_base + len; 5246 // Pm = Pm_base + i-len; 5247 // Pn = Pn_base + len; 5248 5249 if (i.is_register()) { 5250 sub(Rj, i.as_register(), len); 5251 } else { 5252 mov(Rj, i.as_constant()); 5253 sub(Rj, Rj, len); 5254 } 5255 // Rj == i-len 5256 5257 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 5258 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 5259 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 5260 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 5261 5262 // Ra = *++Pa; 5263 // Rb = *--Pb; 5264 // Rm = *++Pm; 5265 // Rn = *--Pn; 5266 ldr(Ra, pre(Pa, wordSize)); 5267 ldr(Rb, pre(Pb, -wordSize)); 5268 ldr(Rm, pre(Pm, wordSize)); 5269 ldr(Rn, pre(Pn, -wordSize)); 5270 5271 mov(Rhi_mn, zr); 5272 mov(Rlo_mn, zr); 5273 } 5274 5275 void post2(RegisterOrConstant i, RegisterOrConstant len) { 5276 block_comment("post2"); 5277 if (i.is_constant()) { 5278 mov(Rj, i.as_constant()-len.as_constant()); 5279 } else { 5280 sub(Rj, i.as_register(), len); 5281 } 5282 5283 adds(t0, t0, Rlo_mn); // The pending m*n, low part 5284 5285 // As soon as we know the least significant digit of our result, 5286 // store it. 5287 // Pm_base[i-len] = t0; 5288 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 5289 5290 // t0 = t1; t1 = t2; t2 = 0; 5291 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 5292 adc(t1, t2, zr); 5293 mov(t2, zr); 5294 } 5295 5296 // A carry in t0 after Montgomery multiplication means that we 5297 // should subtract multiples of n from our result in m. We'll 5298 // keep doing that until there is no carry. 5299 void normalize(RegisterOrConstant len) { 5300 block_comment("normalize"); 5301 // while (t0) 5302 // t0 = sub(Pm_base, Pn_base, t0, len); 5303 Label loop, post, again; 5304 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 5305 cbz(t0, post); { 5306 bind(again); { 5307 mov(i, zr); 5308 mov(cnt, len); 5309 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 5310 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 5311 subs(zr, zr, zr); // set carry flag, i.e. no borrow 5312 align(16); 5313 bind(loop); { 5314 sbcs(Rm, Rm, Rn); 5315 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 5316 add(i, i, 1); 5317 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 5318 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 5319 sub(cnt, cnt, 1); 5320 } cbnz(cnt, loop); 5321 sbc(t0, t0, zr); 5322 } cbnz(t0, again); 5323 } bind(post); 5324 } 5325 5326 // Move memory at s to d, reversing words. 
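// (Concretely: the words are copied in reverse order, and each 64-bit
// word is rotated by 32 bits on the way, swapping its two 32-bit halves.)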
5327 // Increments d to end of copied memory 5328 // Destroys tmp1, tmp2 5329 // Preserves len 5330 // Leaves s pointing to the address which was in d at start 5331 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 5332 assert(tmp1 < r19 && tmp2 < r19, "register corruption"); 5333 5334 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 5335 mov(tmp1, len); 5336 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 5337 sub(s, d, len, ext::uxtw, LogBytesPerWord); 5338 } 5339 // where 5340 void reverse1(Register d, Register s, Register tmp) { 5341 ldr(tmp, pre(s, -wordSize)); 5342 ror(tmp, tmp, 32); 5343 str(tmp, post(d, wordSize)); 5344 } 5345 5346 void step_squaring() { 5347 // An extra ACC 5348 step(); 5349 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 5350 } 5351 5352 void last_squaring(RegisterOrConstant i) { 5353 Label dont; 5354 // if ((i & 1) == 0) { 5355 tbnz(i.as_register(), 0, dont); { 5356 // MACC(Ra, Rb, t0, t1, t2); 5357 // Ra = *++Pa; 5358 // Rb = *--Pb; 5359 umulh(Rhi_ab, Ra, Rb); 5360 mul(Rlo_ab, Ra, Rb); 5361 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 5362 } bind(dont); 5363 } 5364 5365 void extra_step_squaring() { 5366 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 5367 5368 // MACC(Rm, Rn, t0, t1, t2); 5369 // Rm = *++Pm; 5370 // Rn = *--Pn; 5371 umulh(Rhi_mn, Rm, Rn); 5372 mul(Rlo_mn, Rm, Rn); 5373 ldr(Rm, pre(Pm, wordSize)); 5374 ldr(Rn, pre(Pn, -wordSize)); 5375 } 5376 5377 void post1_squaring() { 5378 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 5379 5380 // *Pm = Rm = t0 * inv; 5381 mul(Rm, t0, inv); 5382 str(Rm, Address(Pm)); 5383 5384 // MACC(Rm, Rn, t0, t1, t2); 5385 // t0 = t1; t1 = t2; t2 = 0; 5386 umulh(Rhi_mn, Rm, Rn); 5387 5388 #ifndef PRODUCT 5389 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 5390 { 5391 mul(Rlo_mn, Rm, Rn); 5392 add(Rlo_mn, t0, Rlo_mn); 5393 Label ok; 5394 cbz(Rlo_mn, ok); { 5395 stop("broken Montgomery multiply"); 5396 } bind(ok); 5397 } 5398 #endif 5399 // We have very carefully set things up so that 5400 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 5401 // the lower half of Rm * Rn because we know the result already: 5402 // it must be -t0. t0 + (-t0) must generate a carry iff 5403 // t0 != 0. So, rather than do a mul and an adds we just set 5404 // the carry flag iff t0 is nonzero. 5405 // 5406 // mul(Rlo_mn, Rm, Rn); 5407 // adds(zr, t0, Rlo_mn); 5408 subs(zr, t0, 1); // Set carry iff t0 is nonzero 5409 adcs(t0, t1, Rhi_mn); 5410 adc(t1, t2, zr); 5411 mov(t2, zr); 5412 } 5413 5414 void acc(Register Rhi, Register Rlo, 5415 Register t0, Register t1, Register t2) { 5416 adds(t0, t0, Rlo); 5417 adcs(t1, t1, Rhi); 5418 adc(t2, t2, zr); 5419 } 5420 5421 public: 5422 /** 5423 * Fast Montgomery multiplication. The derivation of the 5424 * algorithm is in A Cryptographic Library for the Motorola 5425 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 
5426 * 5427 * Arguments: 5428 * 5429 * Inputs for multiplication: 5430 * c_rarg0 - int array elements a 5431 * c_rarg1 - int array elements b 5432 * c_rarg2 - int array elements n (the modulus) 5433 * c_rarg3 - int length 5434 * c_rarg4 - int inv 5435 * c_rarg5 - int array elements m (the result) 5436 * 5437 * Inputs for squaring: 5438 * c_rarg0 - int array elements a 5439 * c_rarg1 - int array elements n (the modulus) 5440 * c_rarg2 - int length 5441 * c_rarg3 - int inv 5442 * c_rarg4 - int array elements m (the result) 5443 * 5444 */ 5445 address generate_multiply() { 5446 Label argh, nothing; 5447 bind(argh); 5448 stop("MontgomeryMultiply total_allocation must be <= 8192"); 5449 5450 align(CodeEntryAlignment); 5451 address entry = pc(); 5452 5453 cbzw(Rlen, nothing); 5454 5455 enter(); 5456 5457 // Make room. 5458 cmpw(Rlen, 512); 5459 br(Assembler::HI, argh); 5460 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 5461 andr(sp, Ra, -2 * wordSize); 5462 5463 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 5464 5465 { 5466 // Copy input args, reversing as we go. We use Ra as a 5467 // temporary variable. 5468 reverse(Ra, Pa_base, Rlen, t0, t1); 5469 if (!_squaring) 5470 reverse(Ra, Pb_base, Rlen, t0, t1); 5471 reverse(Ra, Pn_base, Rlen, t0, t1); 5472 } 5473 5474 // Push all call-saved registers and also Pm_base which we'll need 5475 // at the end. 5476 save_regs(); 5477 5478 #ifndef PRODUCT 5479 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 5480 { 5481 ldr(Rn, Address(Pn_base, 0)); 5482 mul(Rlo_mn, Rn, inv); 5483 subs(zr, Rlo_mn, -1); 5484 Label ok; 5485 br(EQ, ok); { 5486 stop("broken inverse in Montgomery multiply"); 5487 } bind(ok); 5488 } 5489 #endif 5490 5491 mov(Pm_base, Ra); 5492 5493 mov(t0, zr); 5494 mov(t1, zr); 5495 mov(t2, zr); 5496 5497 block_comment("for (int i = 0; i < len; i++) {"); 5498 mov(Ri, zr); { 5499 Label loop, end; 5500 cmpw(Ri, Rlen); 5501 br(Assembler::GE, end); 5502 5503 bind(loop); 5504 pre1(Ri); 5505 5506 block_comment(" for (j = i; j; j--) {"); { 5507 movw(Rj, Ri); 5508 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 5509 } block_comment(" } // j"); 5510 5511 post1(); 5512 addw(Ri, Ri, 1); 5513 cmpw(Ri, Rlen); 5514 br(Assembler::LT, loop); 5515 bind(end); 5516 block_comment("} // i"); 5517 } 5518 5519 block_comment("for (int i = len; i < 2*len; i++) {"); 5520 mov(Ri, Rlen); { 5521 Label loop, end; 5522 cmpw(Ri, Rlen, Assembler::LSL, 1); 5523 br(Assembler::GE, end); 5524 5525 bind(loop); 5526 pre2(Ri, Rlen); 5527 5528 block_comment(" for (j = len*2-i-1; j; j--) {"); { 5529 lslw(Rj, Rlen, 1); 5530 subw(Rj, Rj, Ri); 5531 subw(Rj, Rj, 1); 5532 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 5533 } block_comment(" } // j"); 5534 5535 post2(Ri, Rlen); 5536 addw(Ri, Ri, 1); 5537 cmpw(Ri, Rlen, Assembler::LSL, 1); 5538 br(Assembler::LT, loop); 5539 bind(end); 5540 } 5541 block_comment("} // i"); 5542 5543 normalize(Rlen); 5544 5545 mov(Ra, Pm_base); // Save Pm_base in Ra 5546 restore_regs(); // Restore caller's Pm_base 5547 5548 // Copy our result into caller's Pm_base 5549 reverse(Pm_base, Ra, Rlen, t0, t1); 5550 5551 leave(); 5552 bind(nothing); 5553 ret(lr); 5554 5555 return entry; 5556 } 5557 // In C, approximately: 5558 5559 // void 5560 // montgomery_multiply(julong Pa_base[], julong Pb_base[], 5561 // julong Pn_base[], julong Pm_base[], 5562 // julong inv, int len) { 5563 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 5564 // julong *Pa, *Pb, *Pn, *Pm; 5565 // julong Ra, Rb, Rn, Rm; 5566 5567 // 
int i; 5568 5569 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 5570 5571 // for (i = 0; i < len; i++) { 5572 // int j; 5573 5574 // Pa = Pa_base; 5575 // Pb = Pb_base + i; 5576 // Pm = Pm_base; 5577 // Pn = Pn_base + i; 5578 5579 // Ra = *Pa; 5580 // Rb = *Pb; 5581 // Rm = *Pm; 5582 // Rn = *Pn; 5583 5584 // int iters = i; 5585 // for (j = 0; iters--; j++) { 5586 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 5587 // MACC(Ra, Rb, t0, t1, t2); 5588 // Ra = *++Pa; 5589 // Rb = *--Pb; 5590 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5591 // MACC(Rm, Rn, t0, t1, t2); 5592 // Rm = *++Pm; 5593 // Rn = *--Pn; 5594 // } 5595 5596 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 5597 // MACC(Ra, Rb, t0, t1, t2); 5598 // *Pm = Rm = t0 * inv; 5599 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 5600 // MACC(Rm, Rn, t0, t1, t2); 5601 5602 // assert(t0 == 0, "broken Montgomery multiply"); 5603 5604 // t0 = t1; t1 = t2; t2 = 0; 5605 // } 5606 5607 // for (i = len; i < 2*len; i++) { 5608 // int j; 5609 5610 // Pa = Pa_base + i-len; 5611 // Pb = Pb_base + len; 5612 // Pm = Pm_base + i-len; 5613 // Pn = Pn_base + len; 5614 5615 // Ra = *++Pa; 5616 // Rb = *--Pb; 5617 // Rm = *++Pm; 5618 // Rn = *--Pn; 5619 5620 // int iters = len*2-i-1; 5621 // for (j = i-len+1; iters--; j++) { 5622 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 5623 // MACC(Ra, Rb, t0, t1, t2); 5624 // Ra = *++Pa; 5625 // Rb = *--Pb; 5626 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5627 // MACC(Rm, Rn, t0, t1, t2); 5628 // Rm = *++Pm; 5629 // Rn = *--Pn; 5630 // } 5631 5632 // Pm_base[i-len] = t0; 5633 // t0 = t1; t1 = t2; t2 = 0; 5634 // } 5635 5636 // while (t0) 5637 // t0 = sub(Pm_base, Pn_base, t0, len); 5638 // } 5639 5640 /** 5641 * Fast Montgomery squaring. This uses asymptotically 25% fewer 5642 * multiplies than Montgomery multiplication so it should be up to 5643 * 25% faster. However, its loop control is more complex and it 5644 * may actually run slower on some machines. 5645 * 5646 * Arguments: 5647 * 5648 * Inputs: 5649 * c_rarg0 - int array elements a 5650 * c_rarg1 - int array elements n (the modulus) 5651 * c_rarg2 - int length 5652 * c_rarg3 - int inv 5653 * c_rarg4 - int array elements m (the result) 5654 * 5655 */ 5656 address generate_square() { 5657 Label argh; 5658 bind(argh); 5659 stop("MontgomeryMultiply total_allocation must be <= 8192"); 5660 5661 align(CodeEntryAlignment); 5662 address entry = pc(); 5663 5664 enter(); 5665 5666 // Make room. 5667 cmpw(Rlen, 512); 5668 br(Assembler::HI, argh); 5669 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 5670 andr(sp, Ra, -2 * wordSize); 5671 5672 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 5673 5674 { 5675 // Copy input args, reversing as we go. We use Ra as a 5676 // temporary variable. 5677 reverse(Ra, Pa_base, Rlen, t0, t1); 5678 reverse(Ra, Pn_base, Rlen, t0, t1); 5679 } 5680 5681 // Push all call-saved registers and also Pm_base which we'll need 5682 // at the end. 
5683 save_regs(); 5684 5685 mov(Pm_base, Ra); 5686 5687 mov(t0, zr); 5688 mov(t1, zr); 5689 mov(t2, zr); 5690 5691 block_comment("for (int i = 0; i < len; i++) {"); 5692 mov(Ri, zr); { 5693 Label loop, end; 5694 bind(loop); 5695 cmp(Ri, Rlen); 5696 br(Assembler::GE, end); 5697 5698 pre1(Ri); 5699 5700 block_comment("for (j = (i+1)/2; j; j--) {"); { 5701 add(Rj, Ri, 1); 5702 lsr(Rj, Rj, 1); 5703 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 5704 } block_comment(" } // j"); 5705 5706 last_squaring(Ri); 5707 5708 block_comment(" for (j = i/2; j; j--) {"); { 5709 lsr(Rj, Ri, 1); 5710 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 5711 } block_comment(" } // j"); 5712 5713 post1_squaring(); 5714 add(Ri, Ri, 1); 5715 cmp(Ri, Rlen); 5716 br(Assembler::LT, loop); 5717 5718 bind(end); 5719 block_comment("} // i"); 5720 } 5721 5722 block_comment("for (int i = len; i < 2*len; i++) {"); 5723 mov(Ri, Rlen); { 5724 Label loop, end; 5725 bind(loop); 5726 cmp(Ri, Rlen, Assembler::LSL, 1); 5727 br(Assembler::GE, end); 5728 5729 pre2(Ri, Rlen); 5730 5731 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 5732 lsl(Rj, Rlen, 1); 5733 sub(Rj, Rj, Ri); 5734 sub(Rj, Rj, 1); 5735 lsr(Rj, Rj, 1); 5736 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 5737 } block_comment(" } // j"); 5738 5739 last_squaring(Ri); 5740 5741 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 5742 lsl(Rj, Rlen, 1); 5743 sub(Rj, Rj, Ri); 5744 lsr(Rj, Rj, 1); 5745 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 5746 } block_comment(" } // j"); 5747 5748 post2(Ri, Rlen); 5749 add(Ri, Ri, 1); 5750 cmp(Ri, Rlen, Assembler::LSL, 1); 5751 5752 br(Assembler::LT, loop); 5753 bind(end); 5754 block_comment("} // i"); 5755 } 5756 5757 normalize(Rlen); 5758 5759 mov(Ra, Pm_base); // Save Pm_base in Ra 5760 restore_regs(); // Restore caller's Pm_base 5761 5762 // Copy our result into caller's Pm_base 5763 reverse(Pm_base, Ra, Rlen, t0, t1); 5764 5765 leave(); 5766 ret(lr); 5767 5768 return entry; 5769 } 5770 // In C, approximately: 5771 5772 // void 5773 // montgomery_square(julong Pa_base[], julong Pn_base[], 5774 // julong Pm_base[], julong inv, int len) { 5775 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 5776 // julong *Pa, *Pb, *Pn, *Pm; 5777 // julong Ra, Rb, Rn, Rm; 5778 5779 // int i; 5780 5781 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 5782 5783 // for (i = 0; i < len; i++) { 5784 // int j; 5785 5786 // Pa = Pa_base; 5787 // Pb = Pa_base + i; 5788 // Pm = Pm_base; 5789 // Pn = Pn_base + i; 5790 5791 // Ra = *Pa; 5792 // Rb = *Pb; 5793 // Rm = *Pm; 5794 // Rn = *Pn; 5795 5796 // int iters = (i+1)/2; 5797 // for (j = 0; iters--; j++) { 5798 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 5799 // MACC2(Ra, Rb, t0, t1, t2); 5800 // Ra = *++Pa; 5801 // Rb = *--Pb; 5802 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5803 // MACC(Rm, Rn, t0, t1, t2); 5804 // Rm = *++Pm; 5805 // Rn = *--Pn; 5806 // } 5807 // if ((i & 1) == 0) { 5808 // assert(Ra == Pa_base[j], "must be"); 5809 // MACC(Ra, Ra, t0, t1, t2); 5810 // } 5811 // iters = i/2; 5812 // assert(iters == i-j, "must be"); 5813 // for (; iters--; j++) { 5814 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5815 // MACC(Rm, Rn, t0, t1, t2); 5816 // Rm = *++Pm; 5817 // Rn = *--Pn; 5818 // } 5819 5820 // *Pm = Rm = t0 * inv; 5821 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 5822 // MACC(Rm, Rn, t0, t1, t2); 5823 5824 // 
assert(t0 == 0, "broken Montgomery multiply");
5825
5826 // t0 = t1; t1 = t2; t2 = 0;
5827 // }
5828
5829 // for (i = len; i < 2*len; i++) {
5830 // int start = i-len+1;
5831 // int end = start + (len - start)/2;
5832 // int j;
5833
5834 // Pa = Pa_base + i-len;
5835 // Pb = Pa_base + len;
5836 // Pm = Pm_base + i-len;
5837 // Pn = Pn_base + len;
5838
5839 // Ra = *++Pa;
5840 // Rb = *--Pb;
5841 // Rm = *++Pm;
5842 // Rn = *--Pn;
5843
5844 // int iters = (2*len-i-1)/2;
5845 // assert(iters == end-start, "must be");
5846 // for (j = start; iters--; j++) {
5847 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5848 // MACC2(Ra, Rb, t0, t1, t2);
5849 // Ra = *++Pa;
5850 // Rb = *--Pb;
5851 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5852 // MACC(Rm, Rn, t0, t1, t2);
5853 // Rm = *++Pm;
5854 // Rn = *--Pn;
5855 // }
5856 // if ((i & 1) == 0) {
5857 // assert(Ra == Pa_base[j], "must be");
5858 // MACC(Ra, Ra, t0, t1, t2);
5859 // }
5860 // iters = (2*len-i)/2;
5861 // assert(iters == len-j, "must be");
5862 // for (; iters--; j++) {
5863 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5864 // MACC(Rm, Rn, t0, t1, t2);
5865 // Rm = *++Pm;
5866 // Rn = *--Pn;
5867 // }
5868 // Pm_base[i-len] = t0;
5869 // t0 = t1; t1 = t2; t2 = 0;
5870 // }
5871
5872 // while (t0)
5873 // t0 = sub(Pm_base, Pn_base, t0, len);
5874 // }
5875 };
5876
5877
5878 // Initialization
5879 void generate_initial() {
5880 // Generate initial stubs and initialize the entry points
5881
5882 // entry points that exist on all platforms. Note: this is code
5883 // that could be shared among different platforms; however, the
5884 // benefit seems to be smaller than the disadvantage of having a
5885 // much more complicated generator structure. See also the comment in
5886 // stubRoutines.hpp.
5887
5888 StubRoutines::_forward_exception_entry = generate_forward_exception();
5889
5890 StubRoutines::_call_stub_entry =
5891 generate_call_stub(StubRoutines::_call_stub_return_address);
5892
5893 // is referenced by megamorphic call
5894 StubRoutines::_catch_exception_entry = generate_catch_exception();
5895
5896 // Build this early so it's available for the interpreter.
5897 StubRoutines::_throw_StackOverflowError_entry =
5898 generate_throw_exception("StackOverflowError throw_exception",
5899 CAST_FROM_FN_PTR(address,
5900 SharedRuntime::throw_StackOverflowError));
5901 StubRoutines::_throw_delayed_StackOverflowError_entry =
5902 generate_throw_exception("delayed StackOverflowError throw_exception",
5903 CAST_FROM_FN_PTR(address,
5904 SharedRuntime::throw_delayed_StackOverflowError));
5905 if (UseCRC32Intrinsics) {
5906 // set the table address before stub generation, which uses it
5907 StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
5908 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
5909 }
5910
5911 if (UseCRC32CIntrinsics) {
5912 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
5913 }
5914
5915 // Disabled until JDK-8210858 is fixed
5916 // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
5917 // StubRoutines::_dlog = generate_dlog();
5918 // }
5919
5920 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
5921 StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
5922 }
5923
5924 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
5925 StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
5926 }
5927
5928 // Safefetch stubs.
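// (Roughly, the contract these stubs implement, sketched in C:
//
//   int SafeFetch32(int* adr, int errValue) {
//     return *adr;   // if this load faults, the signal handler resumes
//   }                // at the continuation PC and errValue is returned
//
// SafeFetchN does the same for intptr_t-sized loads; the fault and
// continuation PCs registered below are what the signal handler uses.)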
5929 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry, 5930 &StubRoutines::_safefetch32_fault_pc, 5931 &StubRoutines::_safefetch32_continuation_pc); 5932 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry, 5933 &StubRoutines::_safefetchN_fault_pc, 5934 &StubRoutines::_safefetchN_continuation_pc); 5935 } 5936 5937 void generate_all() { 5938 // support for verify_oop (must happen after universe_init) 5939 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); 5940 StubRoutines::_throw_AbstractMethodError_entry = 5941 generate_throw_exception("AbstractMethodError throw_exception", 5942 CAST_FROM_FN_PTR(address, 5943 SharedRuntime:: 5944 throw_AbstractMethodError)); 5945 5946 StubRoutines::_throw_IncompatibleClassChangeError_entry = 5947 generate_throw_exception("IncompatibleClassChangeError throw_exception", 5948 CAST_FROM_FN_PTR(address, 5949 SharedRuntime:: 5950 throw_IncompatibleClassChangeError)); 5951 5952 StubRoutines::_throw_NullPointerException_at_call_entry = 5953 generate_throw_exception("NullPointerException at call throw_exception", 5954 CAST_FROM_FN_PTR(address, 5955 SharedRuntime:: 5956 throw_NullPointerException_at_call)); 5957 5958 // arraycopy stubs used by compilers 5959 generate_arraycopy_stubs(); 5960 5961 // has negatives stub for large arrays. 5962 StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long); 5963 5964 // array equals stub for large arrays. 5965 if (!UseSimpleArrayEquals) { 5966 StubRoutines::aarch64::_large_array_equals = generate_large_array_equals(); 5967 } 5968 5969 generate_compare_long_strings(); 5970 5971 generate_string_indexof_stubs(); 5972 5973 // byte_array_inflate stub for large arrays. 5974 StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate(); 5975 5976 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); 5977 if (bs_nm != NULL) { 5978 StubRoutines::aarch64::_method_entry_barrier = generate_method_entry_barrier(); 5979 } 5980 #ifdef COMPILER2 5981 if (UseMultiplyToLenIntrinsic) { 5982 StubRoutines::_multiplyToLen = generate_multiplyToLen(); 5983 } 5984 5985 if (UseSquareToLenIntrinsic) { 5986 StubRoutines::_squareToLen = generate_squareToLen(); 5987 } 5988 5989 if (UseMulAddIntrinsic) { 5990 StubRoutines::_mulAdd = generate_mulAdd(); 5991 } 5992 5993 if (UseMontgomeryMultiplyIntrinsic) { 5994 StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply"); 5995 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false); 5996 StubRoutines::_montgomeryMultiply = g.generate_multiply(); 5997 } 5998 5999 if (UseMontgomerySquareIntrinsic) { 6000 StubCodeMark mark(this, "StubRoutines", "montgomerySquare"); 6001 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true); 6002 // We use generate_multiply() rather than generate_square() 6003 // because it's faster for the sizes of modulus we care about. 
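// (With squaring == true the generator's constructor aliases Pb_base to
// Pa_base, so generate_multiply() computes a * a here.)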
6004 StubRoutines::_montgomerySquare = g.generate_multiply(); 6005 } 6006 #endif // COMPILER2 6007 6008 // generate GHASH intrinsics code 6009 if (UseGHASHIntrinsics) { 6010 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); 6011 } 6012 6013 // data cache line writeback 6014 StubRoutines::_data_cache_writeback = generate_data_cache_writeback(); 6015 StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync(); 6016 6017 if (UseAESIntrinsics) { 6018 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 6019 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 6020 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 6021 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 6022 } 6023 6024 if (UseSHA1Intrinsics) { 6025 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress"); 6026 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB"); 6027 } 6028 if (UseSHA256Intrinsics) { 6029 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress"); 6030 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB"); 6031 } 6032 if (UseSHA512Intrinsics) { 6033 StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress"); 6034 StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB"); 6035 } 6036 6037 // generate Adler32 intrinsics code 6038 if (UseAdler32Intrinsics) { 6039 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); 6040 } 6041 6042 StubRoutines::aarch64::set_completed(); 6043 } 6044 6045 public: 6046 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) { 6047 if (all) { 6048 generate_all(); 6049 } else { 6050 generate_initial(); 6051 } 6052 } 6053 }; // end class declaration 6054 6055 #define UCM_TABLE_MAX_ENTRIES 8 6056 void StubGenerator_generate(CodeBuffer* code, bool all) { 6057 if (UnsafeCopyMemory::_table == NULL) { 6058 UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES); 6059 } 6060 StubGenerator g(code, all); 6061 }