1 /* 2 * Copyright (c) 2003, 2020, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 23 * 24 */ 25 26 #include "precompiled.hpp" 27 #include "asm/macroAssembler.hpp" 28 #include "asm/macroAssembler.inline.hpp" 29 #include "gc/shared/barrierSet.hpp" 30 #include "gc/shared/barrierSetAssembler.hpp" 31 #include "interpreter/interpreter.hpp" 32 #include "memory/universe.hpp" 33 #include "nativeInst_aarch64.hpp" 34 #include "oops/instanceOop.hpp" 35 #include "oops/method.hpp" 36 #include "oops/objArrayKlass.hpp" 37 #include "oops/oop.inline.hpp" 38 #include "prims/methodHandles.hpp" 39 #include "runtime/frame.inline.hpp" 40 #include "runtime/handles.inline.hpp" 41 #include "runtime/sharedRuntime.hpp" 42 #include "runtime/stubCodeGenerator.hpp" 43 #include "runtime/stubRoutines.hpp" 44 #include "runtime/thread.inline.hpp" 45 #include "utilities/align.hpp" 46 #include "utilities/powerOfTwo.hpp" 47 #ifdef COMPILER2 48 #include "opto/runtime.hpp" 49 #endif 50 #if INCLUDE_ZGC 51 #include "gc/z/zThreadLocalData.hpp" 52 #endif 53 54 // Declaration and definition of StubGenerator (no .hpp file). 55 // For a more detailed description of the stub routine structure 56 // see the comment in stubRoutines.hpp 57 58 #undef __ 59 #define __ _masm-> 60 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 
4 : 8)) 61 62 #ifdef PRODUCT 63 #define BLOCK_COMMENT(str) /* nothing */ 64 #else 65 #define BLOCK_COMMENT(str) __ block_comment(str) 66 #endif 67 68 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 69 70 // Stub Code definitions 71 72 class StubGenerator: public StubCodeGenerator { 73 private: 74 75 #ifdef PRODUCT 76 #define inc_counter_np(counter) ((void)0) 77 #else 78 void inc_counter_np_(int& counter) { 79 __ lea(rscratch2, ExternalAddress((address)&counter)); 80 __ ldrw(rscratch1, Address(rscratch2)); 81 __ addw(rscratch1, rscratch1, 1); 82 __ strw(rscratch1, Address(rscratch2)); 83 } 84 #define inc_counter_np(counter) \ 85 BLOCK_COMMENT("inc_counter " #counter); \ 86 inc_counter_np_(counter); 87 #endif 88 89 // Call stubs are used to call Java from C 90 // 91 // Arguments: 92 // c_rarg0: call wrapper address address 93 // c_rarg1: result address 94 // c_rarg2: result type BasicType 95 // c_rarg3: method Method* 96 // c_rarg4: (interpreter) entry point address 97 // c_rarg5: parameters intptr_t* 98 // c_rarg6: parameter size (in words) int 99 // c_rarg7: thread Thread* 100 // 101 // There is no return from the stub itself as any Java result 102 // is written to result 103 // 104 // we save r30 (lr) as the return PC at the base of the frame and 105 // link r29 (fp) below it as the frame pointer installing sp (r31) 106 // into fp. 107 // 108 // we save r0-r7, which accounts for all the c arguments. 109 // 110 // TODO: strictly do we need to save them all? they are treated as 111 // volatile by C so could we omit saving the ones we are going to 112 // place in global registers (thread? method?) or those we only use 113 // during setup of the Java call? 114 // 115 // we don't need to save r8 which C uses as an indirect result location 116 // return register. 117 // 118 // we don't need to save r9-r15 which both C and Java treat as 119 // volatile 120 // 121 // we don't need to save r16-18 because Java does not use them 122 // 123 // we save r19-r28 which Java uses as scratch registers and C 124 // expects to be callee-save 125 // 126 // we save the bottom 64 bits of each value stored in v8-v15; it is 127 // the responsibility of the caller to preserve larger values. 128 // 129 // so the stub frame looks like this when we enter Java code 130 // 131 // [ return_from_Java ] <--- sp 132 // [ argument word n ] 133 // ... 
134 // -27 [ argument word 1 ] 135 // -26 [ saved v15 ] <--- sp_after_call 136 // -25 [ saved v14 ] 137 // -24 [ saved v13 ] 138 // -23 [ saved v12 ] 139 // -22 [ saved v11 ] 140 // -21 [ saved v10 ] 141 // -20 [ saved v9 ] 142 // -19 [ saved v8 ] 143 // -18 [ saved r28 ] 144 // -17 [ saved r27 ] 145 // -16 [ saved r26 ] 146 // -15 [ saved r25 ] 147 // -14 [ saved r24 ] 148 // -13 [ saved r23 ] 149 // -12 [ saved r22 ] 150 // -11 [ saved r21 ] 151 // -10 [ saved r20 ] 152 // -9 [ saved r19 ] 153 // -8 [ call wrapper (r0) ] 154 // -7 [ result (r1) ] 155 // -6 [ result type (r2) ] 156 // -5 [ method (r3) ] 157 // -4 [ entry point (r4) ] 158 // -3 [ parameters (r5) ] 159 // -2 [ parameter size (r6) ] 160 // -1 [ thread (r7) ] 161 // 0 [ saved fp (r29) ] <--- fp == saved sp (r31) 162 // 1 [ saved lr (r30) ] 163 164 // Call stub stack layout word offsets from fp 165 enum call_stub_layout { 166 sp_after_call_off = -26, 167 168 d15_off = -26, 169 d13_off = -24, 170 d11_off = -22, 171 d9_off = -20, 172 173 r28_off = -18, 174 r26_off = -16, 175 r24_off = -14, 176 r22_off = -12, 177 r20_off = -10, 178 call_wrapper_off = -8, 179 result_off = -7, 180 result_type_off = -6, 181 method_off = -5, 182 entry_point_off = -4, 183 parameter_size_off = -2, 184 thread_off = -1, 185 fp_f = 0, 186 retaddr_off = 1, 187 }; 188 189 address generate_call_stub(address& return_address) { 190 assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 && 191 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off, 192 "adjust this code"); 193 194 StubCodeMark mark(this, "StubRoutines", "call_stub"); 195 address start = __ pc(); 196 197 const Address sp_after_call(rfp, sp_after_call_off * wordSize); 198 199 const Address call_wrapper (rfp, call_wrapper_off * wordSize); 200 const Address result (rfp, result_off * wordSize); 201 const Address result_type (rfp, result_type_off * wordSize); 202 const Address method (rfp, method_off * wordSize); 203 const Address entry_point (rfp, entry_point_off * wordSize); 204 const Address parameter_size(rfp, parameter_size_off * wordSize); 205 206 const Address thread (rfp, thread_off * wordSize); 207 208 const Address d15_save (rfp, d15_off * wordSize); 209 const Address d13_save (rfp, d13_off * wordSize); 210 const Address d11_save (rfp, d11_off * wordSize); 211 const Address d9_save (rfp, d9_off * wordSize); 212 213 const Address r28_save (rfp, r28_off * wordSize); 214 const Address r26_save (rfp, r26_off * wordSize); 215 const Address r24_save (rfp, r24_off * wordSize); 216 const Address r22_save (rfp, r22_off * wordSize); 217 const Address r20_save (rfp, r20_off * wordSize); 218 219 // stub code 220 221 address aarch64_entry = __ pc(); 222 223 // set up frame and move sp to end of save area 224 __ enter(); 225 __ sub(sp, rfp, -sp_after_call_off * wordSize); 226 227 // save register parameters and Java scratch/global registers 228 // n.b. 
we save thread even though it gets installed in 229 // rthread because we want to sanity check rthread later 230 __ str(c_rarg7, thread); 231 __ strw(c_rarg6, parameter_size); 232 __ stp(c_rarg4, c_rarg5, entry_point); 233 __ stp(c_rarg2, c_rarg3, result_type); 234 __ stp(c_rarg0, c_rarg1, call_wrapper); 235 236 __ stp(r20, r19, r20_save); 237 __ stp(r22, r21, r22_save); 238 __ stp(r24, r23, r24_save); 239 __ stp(r26, r25, r26_save); 240 __ stp(r28, r27, r28_save); 241 242 __ stpd(v9, v8, d9_save); 243 __ stpd(v11, v10, d11_save); 244 __ stpd(v13, v12, d13_save); 245 __ stpd(v15, v14, d15_save); 246 247 // install Java thread in global register now we have saved 248 // whatever value it held 249 __ mov(rthread, c_rarg7); 250 // And method 251 __ mov(rmethod, c_rarg3); 252 253 // set up the heapbase register 254 __ reinit_heapbase(); 255 256 #ifdef ASSERT 257 // make sure we have no pending exceptions 258 { 259 Label L; 260 __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset()))); 261 __ cmp(rscratch1, (u1)NULL_WORD); 262 __ br(Assembler::EQ, L); 263 __ stop("StubRoutines::call_stub: entered with pending exception"); 264 __ BIND(L); 265 } 266 #endif 267 // pass parameters if any 268 __ mov(esp, sp); 269 __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way 270 __ andr(sp, rscratch1, -2 * wordSize); 271 272 BLOCK_COMMENT("pass parameters if any"); 273 Label parameters_done; 274 // parameter count is still in c_rarg6 275 // and parameter pointer identifying param 1 is in c_rarg5 276 __ cbzw(c_rarg6, parameters_done); 277 278 address loop = __ pc(); 279 __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize))); 280 __ subsw(c_rarg6, c_rarg6, 1); 281 __ push(rscratch1); 282 __ br(Assembler::GT, loop); 283 284 __ BIND(parameters_done); 285 286 // call Java entry -- passing methdoOop, and current sp 287 // rmethod: Method* 288 // r13: sender sp 289 BLOCK_COMMENT("call Java function"); 290 __ mov(r13, sp); 291 __ blr(c_rarg4); 292 293 // we do this here because the notify will already have been done 294 // if we get to the next instruction via an exception 295 // 296 // n.b. adding this instruction here affects the calculation of 297 // whether or not a routine returns to the call stub (used when 298 // doing stack walks) since the normal test is to check the return 299 // pc against the address saved below. so we may need to allow for 300 // this extra instruction in the check. 301 302 // save current address for use by exception handling code 303 304 return_address = __ pc(); 305 306 // store result depending on type (everything that is not 307 // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT) 308 // n.b. 
this assumes Java returns an integral result in r0 309 // and a floating result in j_farg0 310 __ ldr(j_rarg2, result); 311 Label is_long, is_float, is_double, exit; 312 __ ldr(j_rarg1, result_type); 313 __ cmp(j_rarg1, (u1)T_OBJECT); 314 __ br(Assembler::EQ, is_long); 315 __ cmp(j_rarg1, (u1)T_LONG); 316 __ br(Assembler::EQ, is_long); 317 __ cmp(j_rarg1, (u1)T_FLOAT); 318 __ br(Assembler::EQ, is_float); 319 __ cmp(j_rarg1, (u1)T_DOUBLE); 320 __ br(Assembler::EQ, is_double); 321 322 // handle T_INT case 323 __ strw(r0, Address(j_rarg2)); 324 325 __ BIND(exit); 326 327 // pop parameters 328 __ sub(esp, rfp, -sp_after_call_off * wordSize); 329 330 #ifdef ASSERT 331 // verify that threads correspond 332 { 333 Label L, S; 334 __ ldr(rscratch1, thread); 335 __ cmp(rthread, rscratch1); 336 __ br(Assembler::NE, S); 337 __ get_thread(rscratch1); 338 __ cmp(rthread, rscratch1); 339 __ br(Assembler::EQ, L); 340 __ BIND(S); 341 __ stop("StubRoutines::call_stub: threads must correspond"); 342 __ BIND(L); 343 } 344 #endif 345 346 // restore callee-save registers 347 __ ldpd(v15, v14, d15_save); 348 __ ldpd(v13, v12, d13_save); 349 __ ldpd(v11, v10, d11_save); 350 __ ldpd(v9, v8, d9_save); 351 352 __ ldp(r28, r27, r28_save); 353 __ ldp(r26, r25, r26_save); 354 __ ldp(r24, r23, r24_save); 355 __ ldp(r22, r21, r22_save); 356 __ ldp(r20, r19, r20_save); 357 358 __ ldp(c_rarg0, c_rarg1, call_wrapper); 359 __ ldrw(c_rarg2, result_type); 360 __ ldr(c_rarg3, method); 361 __ ldp(c_rarg4, c_rarg5, entry_point); 362 __ ldp(c_rarg6, c_rarg7, parameter_size); 363 364 // leave frame and return to caller 365 __ leave(); 366 __ ret(lr); 367 368 // handle return types different from T_INT 369 370 __ BIND(is_long); 371 __ str(r0, Address(j_rarg2, 0)); 372 __ br(Assembler::AL, exit); 373 374 __ BIND(is_float); 375 __ strs(j_farg0, Address(j_rarg2, 0)); 376 __ br(Assembler::AL, exit); 377 378 __ BIND(is_double); 379 __ strd(j_farg0, Address(j_rarg2, 0)); 380 __ br(Assembler::AL, exit); 381 382 return start; 383 } 384 385 // Return point for a Java call if there's an exception thrown in 386 // Java code. The exception is caught and transformed into a 387 // pending exception stored in JavaThread that can be tested from 388 // within the VM. 389 // 390 // Note: Usually the parameters are removed by the callee. In case 391 // of an exception crossing an activation frame boundary, that is 392 // not the case if the callee is compiled code => need to setup the 393 // rsp. 
394 // 395 // r0: exception oop 396 397 address generate_catch_exception() { 398 StubCodeMark mark(this, "StubRoutines", "catch_exception"); 399 address start = __ pc(); 400 401 // same as in generate_call_stub(): 402 const Address sp_after_call(rfp, sp_after_call_off * wordSize); 403 const Address thread (rfp, thread_off * wordSize); 404 405 #ifdef ASSERT 406 // verify that threads correspond 407 { 408 Label L, S; 409 __ ldr(rscratch1, thread); 410 __ cmp(rthread, rscratch1); 411 __ br(Assembler::NE, S); 412 __ get_thread(rscratch1); 413 __ cmp(rthread, rscratch1); 414 __ br(Assembler::EQ, L); 415 __ bind(S); 416 __ stop("StubRoutines::catch_exception: threads must correspond"); 417 __ bind(L); 418 } 419 #endif 420 421 // set pending exception 422 __ verify_oop(r0); 423 424 __ str(r0, Address(rthread, Thread::pending_exception_offset())); 425 __ mov(rscratch1, (address)__FILE__); 426 __ str(rscratch1, Address(rthread, Thread::exception_file_offset())); 427 __ movw(rscratch1, (int)__LINE__); 428 __ strw(rscratch1, Address(rthread, Thread::exception_line_offset())); 429 430 // complete return to VM 431 assert(StubRoutines::_call_stub_return_address != NULL, 432 "_call_stub_return_address must have been generated before"); 433 __ b(StubRoutines::_call_stub_return_address); 434 435 return start; 436 } 437 438 // Continuation point for runtime calls returning with a pending 439 // exception. The pending exception check happened in the runtime 440 // or native call stub. The pending exception in Thread is 441 // converted into a Java-level exception. 442 // 443 // Contract with Java-level exception handlers: 444 // r0: exception 445 // r3: throwing pc 446 // 447 // NOTE: At entry of this stub, exception-pc must be in LR !! 448 449 // NOTE: this is always used as a jump target within generated code 450 // so it just needs to be generated code wiht no x86 prolog 451 452 address generate_forward_exception() { 453 StubCodeMark mark(this, "StubRoutines", "forward exception"); 454 address start = __ pc(); 455 456 // Upon entry, LR points to the return address returning into 457 // Java (interpreted or compiled) code; i.e., the return address 458 // becomes the throwing pc. 459 // 460 // Arguments pushed before the runtime call are still on the stack 461 // but the exception handler will reset the stack pointer -> 462 // ignore them. A potential result in registers can be ignored as 463 // well. 464 465 #ifdef ASSERT 466 // make sure this code is only executed if there is a pending exception 467 { 468 Label L; 469 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 470 __ cbnz(rscratch1, L); 471 __ stop("StubRoutines::forward exception: no pending exception (1)"); 472 __ bind(L); 473 } 474 #endif 475 476 // compute exception handler into r19 477 478 // call the VM to find the handler address associated with the 479 // caller address. pass thread in r0 and caller pc (ret address) 480 // in r1. n.b. the caller pc is in lr, unlike x86 where it is on 481 // the stack. 482 __ mov(c_rarg1, lr); 483 // lr will be trashed by the VM call so we move it to R19 484 // (callee-saved) because we also need to pass it to the handler 485 // returned by this call. 486 __ mov(r19, lr); 487 BLOCK_COMMENT("call exception_handler_for_return_address"); 488 __ call_VM_leaf(CAST_FROM_FN_PTR(address, 489 SharedRuntime::exception_handler_for_return_address), 490 rthread, c_rarg1); 491 // we should not really care that lr is no longer the callee 492 // address. 
we saved the value the handler needs in r19 so we can 493 // just copy it to r3. however, the C2 handler will push its own 494 // frame and then calls into the VM and the VM code asserts that 495 // the PC for the frame above the handler belongs to a compiled 496 // Java method. So, we restore lr here to satisfy that assert. 497 __ mov(lr, r19); 498 // setup r0 & r3 & clear pending exception 499 __ mov(r3, r19); 500 __ mov(r19, r0); 501 __ ldr(r0, Address(rthread, Thread::pending_exception_offset())); 502 __ str(zr, Address(rthread, Thread::pending_exception_offset())); 503 504 #ifdef ASSERT 505 // make sure exception is set 506 { 507 Label L; 508 __ cbnz(r0, L); 509 __ stop("StubRoutines::forward exception: no pending exception (2)"); 510 __ bind(L); 511 } 512 #endif 513 514 // continue at exception handler 515 // r0: exception 516 // r3: throwing pc 517 // r19: exception handler 518 __ verify_oop(r0); 519 __ br(r19); 520 521 return start; 522 } 523 524 // Non-destructive plausibility checks for oops 525 // 526 // Arguments: 527 // r0: oop to verify 528 // rscratch1: error message 529 // 530 // Stack after saving c_rarg3: 531 // [tos + 0]: saved c_rarg3 532 // [tos + 1]: saved c_rarg2 533 // [tos + 2]: saved lr 534 // [tos + 3]: saved rscratch2 535 // [tos + 4]: saved r0 536 // [tos + 5]: saved rscratch1 537 address generate_verify_oop() { 538 539 StubCodeMark mark(this, "StubRoutines", "verify_oop"); 540 address start = __ pc(); 541 542 Label exit, error; 543 544 // save c_rarg2 and c_rarg3 545 __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16))); 546 547 // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 548 __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 549 __ ldr(c_rarg3, Address(c_rarg2)); 550 __ add(c_rarg3, c_rarg3, 1); 551 __ str(c_rarg3, Address(c_rarg2)); 552 553 // object is in r0 554 // make sure object is 'reasonable' 555 __ cbz(r0, exit); // if obj is NULL it is OK 556 557 #if INCLUDE_ZGC 558 if (UseZGC) { 559 // Check if mask is good. 560 // verifies that ZAddressBadMask & r0 == 0 561 __ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset())); 562 __ andr(c_rarg2, r0, c_rarg3); 563 __ cbnz(c_rarg2, error); 564 } 565 #endif 566 567 // Check if the oop is in the right area of memory 568 __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask()); 569 __ andr(c_rarg2, r0, c_rarg3); 570 __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits()); 571 572 // Compare c_rarg2 and c_rarg3. We don't use a compare 573 // instruction here because the flags register is live. 574 __ eor(c_rarg2, c_rarg2, c_rarg3); 575 __ cbnz(c_rarg2, error); 576 577 // make sure klass is 'reasonable', which is not zero. 
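    // For illustration: the mask test above amounts to checking
    //   (r0 & Universe::verify_oop_mask()) == Universe::verify_oop_bits()
    // using eor + cbnz so that the live condition flags are not
    // clobbered. The klass load below is the second plausibility check.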
578 __ load_klass(r0, r0); // get klass 579 __ cbz(r0, error); // if klass is NULL it is broken 580 581 // return if everything seems ok 582 __ bind(exit); 583 584 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 585 __ ret(lr); 586 587 // handle errors 588 __ bind(error); 589 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 590 591 __ push(RegSet::range(r0, r29), sp); 592 // debug(char* msg, int64_t pc, int64_t regs[]) 593 __ mov(c_rarg0, rscratch1); // pass address of error message 594 __ mov(c_rarg1, lr); // pass return address 595 __ mov(c_rarg2, sp); // pass address of regs on stack 596 #ifndef PRODUCT 597 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area"); 598 #endif 599 BLOCK_COMMENT("call MacroAssembler::debug"); 600 __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); 601 __ blr(rscratch1); 602 __ hlt(0); 603 604 return start; 605 } 606 607 void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); } 608 609 // The inner part of zero_words(). This is the bulk operation, 610 // zeroing words in blocks, possibly using DC ZVA to do it. The 611 // caller is responsible for zeroing the last few words. 612 // 613 // Inputs: 614 // r10: the HeapWord-aligned base address of an array to zero. 615 // r11: the count in HeapWords, r11 > 0. 616 // 617 // Returns r10 and r11, adjusted for the caller to clear. 618 // r10: the base address of the tail of words left to clear. 619 // r11: the number of words in the tail. 620 // r11 < MacroAssembler::zero_words_block_size. 621 622 address generate_zero_blocks() { 623 Label done; 624 Label base_aligned; 625 626 Register base = r10, cnt = r11; 627 628 __ align(CodeEntryAlignment); 629 StubCodeMark mark(this, "StubRoutines", "zero_blocks"); 630 address start = __ pc(); 631 632 if (UseBlockZeroing) { 633 int zva_length = VM_Version::zva_length(); 634 635 // Ensure ZVA length can be divided by 16. This is required by 636 // the subsequent operations. 637 assert (zva_length % 16 == 0, "Unexpected ZVA Length"); 638 639 __ tbz(base, 3, base_aligned); 640 __ str(zr, Address(__ post(base, 8))); 641 __ sub(cnt, cnt, 1); 642 __ bind(base_aligned); 643 644 // Ensure count >= zva_length * 2 so that it still deserves a zva after 645 // alignment. 646 Label small; 647 int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit); 648 __ subs(rscratch1, cnt, low_limit >> 3); 649 __ br(Assembler::LT, small); 650 __ zero_dcache_blocks(base, cnt); 651 __ bind(small); 652 } 653 654 { 655 // Number of stp instructions we'll unroll 656 const int unroll = 657 MacroAssembler::zero_words_block_size / 2; 658 // Clear the remaining blocks. 659 Label loop; 660 __ subs(cnt, cnt, unroll * 2); 661 __ br(Assembler::LT, done); 662 __ bind(loop); 663 for (int i = 0; i < unroll; i++) 664 __ stp(zr, zr, __ post(base, 16)); 665 __ subs(cnt, cnt, unroll * 2); 666 __ br(Assembler::GE, loop); 667 __ bind(done); 668 __ add(cnt, cnt, unroll * 2); 669 } 670 671 __ ret(lr); 672 673 return start; 674 } 675 676 677 typedef enum { 678 copy_forwards = 1, 679 copy_backwards = -1 680 } copy_direction; 681 682 // Bulk copy of blocks of 8 words. 683 // 684 // count is a count of words. 685 // 686 // Precondition: count >= 8 687 // 688 // Postconditions: 689 // 690 // The least significant bit of count contains the remaining count 691 // of words to copy. The rest of count is trash. 
692 // 693 // s and d are adjusted to point to the remaining words to copy 694 // 695 void generate_copy_longs(Label &start, Register s, Register d, Register count, 696 copy_direction direction) { 697 int unit = wordSize * direction; 698 int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize; 699 700 const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6, 701 t4 = r7, t5 = r10, t6 = r11, t7 = r12; 702 const Register stride = r13; 703 704 assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7); 705 assert_different_registers(s, d, count, rscratch1); 706 707 Label again, drain; 708 const char *stub_name; 709 if (direction == copy_forwards) 710 stub_name = "forward_copy_longs"; 711 else 712 stub_name = "backward_copy_longs"; 713 714 __ align(CodeEntryAlignment); 715 716 StubCodeMark mark(this, "StubRoutines", stub_name); 717 718 __ bind(start); 719 720 Label unaligned_copy_long; 721 if (AvoidUnalignedAccesses) { 722 __ tbnz(d, 3, unaligned_copy_long); 723 } 724 725 if (direction == copy_forwards) { 726 __ sub(s, s, bias); 727 __ sub(d, d, bias); 728 } 729 730 #ifdef ASSERT 731 // Make sure we are never given < 8 words 732 { 733 Label L; 734 __ cmp(count, (u1)8); 735 __ br(Assembler::GE, L); 736 __ stop("genrate_copy_longs called with < 8 words"); 737 __ bind(L); 738 } 739 #endif 740 741 // Fill 8 registers 742 if (UseSIMDForMemoryOps) { 743 __ ldpq(v0, v1, Address(s, 4 * unit)); 744 __ ldpq(v2, v3, Address(__ pre(s, 8 * unit))); 745 } else { 746 __ ldp(t0, t1, Address(s, 2 * unit)); 747 __ ldp(t2, t3, Address(s, 4 * unit)); 748 __ ldp(t4, t5, Address(s, 6 * unit)); 749 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 750 } 751 752 __ subs(count, count, 16); 753 __ br(Assembler::LO, drain); 754 755 int prefetch = PrefetchCopyIntervalInBytes; 756 bool use_stride = false; 757 if (direction == copy_backwards) { 758 use_stride = prefetch > 256; 759 prefetch = -prefetch; 760 if (use_stride) __ mov(stride, prefetch); 761 } 762 763 __ bind(again); 764 765 if (PrefetchCopyIntervalInBytes > 0) 766 __ prfm(use_stride ? 
    Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
        __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
      } else {
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        __ stp(t0, t1, Address(d, 2 * unit));
        __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.
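      //
      // As a concrete illustration: in the forwards case, with unit == 1
      // and d already biased by -8, one 64 byte block is written by the
      // loop below as
      //   str t0      -> d + 1*8   (the odd leading word)
      //   stp t1, t2  -> d + 2*8
      //   stp t3, t4  -> d + 4*8
      //   stp t5, t6  -> d + 6*8
      //   str t7      -> d + 8*8   (pre-indexed, so d advances by 64)
      // i.e. exactly the "1 word, 3 pairs, 1 word" pattern described
      // above, expressed with the biased offsets.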
849 850 if (direction == copy_forwards) { 851 __ sub(s, s, 16); 852 __ sub(d, d, 8); 853 } 854 855 // Fill 8 registers 856 // 857 // for forwards copy s was offset by -16 from the original input 858 // value of s so the register contents are at these offsets 859 // relative to the 64 bit block addressed by that original input 860 // and so on for each successive 64 byte block when s is updated 861 // 862 // t0 at offset 0, t1 at offset 8 863 // t2 at offset 16, t3 at offset 24 864 // t4 at offset 32, t5 at offset 40 865 // t6 at offset 48, t7 at offset 56 866 867 // for backwards copy s was not offset so the register contents 868 // are at these offsets into the preceding 64 byte block 869 // relative to that original input and so on for each successive 870 // preceding 64 byte block when s is updated. this explains the 871 // slightly counter-intuitive looking pattern of register usage 872 // in the stp instructions for backwards copy. 873 // 874 // t0 at offset -16, t1 at offset -8 875 // t2 at offset -32, t3 at offset -24 876 // t4 at offset -48, t5 at offset -40 877 // t6 at offset -64, t7 at offset -56 878 879 __ ldp(t0, t1, Address(s, 2 * unit)); 880 __ ldp(t2, t3, Address(s, 4 * unit)); 881 __ ldp(t4, t5, Address(s, 6 * unit)); 882 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 883 884 __ subs(count, count, 16); 885 __ br(Assembler::LO, drain); 886 887 int prefetch = PrefetchCopyIntervalInBytes; 888 bool use_stride = false; 889 if (direction == copy_backwards) { 890 use_stride = prefetch > 256; 891 prefetch = -prefetch; 892 if (use_stride) __ mov(stride, prefetch); 893 } 894 895 __ bind(again); 896 897 if (PrefetchCopyIntervalInBytes > 0) 898 __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP); 899 900 if (direction == copy_forwards) { 901 // allowing for the offset of -8 the store instructions place 902 // registers into the target 64 bit block at the following 903 // offsets 904 // 905 // t0 at offset 0 906 // t1 at offset 8, t2 at offset 16 907 // t3 at offset 24, t4 at offset 32 908 // t5 at offset 40, t6 at offset 48 909 // t7 at offset 56 910 911 __ str(t0, Address(d, 1 * unit)); 912 __ stp(t1, t2, Address(d, 2 * unit)); 913 __ ldp(t0, t1, Address(s, 2 * unit)); 914 __ stp(t3, t4, Address(d, 4 * unit)); 915 __ ldp(t2, t3, Address(s, 4 * unit)); 916 __ stp(t5, t6, Address(d, 6 * unit)); 917 __ ldp(t4, t5, Address(s, 6 * unit)); 918 __ str(t7, Address(__ pre(d, 8 * unit))); 919 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 920 } else { 921 // d was not offset when we started so the registers are 922 // written into the 64 bit block preceding d with the following 923 // offsets 924 // 925 // t1 at offset -8 926 // t3 at offset -24, t0 at offset -16 927 // t5 at offset -48, t2 at offset -32 928 // t7 at offset -56, t4 at offset -48 929 // t6 at offset -64 930 // 931 // note that this matches the offsets previously noted for the 932 // loads 933 934 __ str(t1, Address(d, 1 * unit)); 935 __ stp(t3, t0, Address(d, 3 * unit)); 936 __ ldp(t0, t1, Address(s, 2 * unit)); 937 __ stp(t5, t2, Address(d, 5 * unit)); 938 __ ldp(t2, t3, Address(s, 4 * unit)); 939 __ stp(t7, t4, Address(d, 7 * unit)); 940 __ ldp(t4, t5, Address(s, 6 * unit)); 941 __ str(t6, Address(__ pre(d, 8 * unit))); 942 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 943 } 944 945 __ subs(count, count, 8); 946 __ br(Assembler::HS, again); 947 948 // Drain 949 // 950 // this uses the same pattern of offsets and register arguments 951 // as above 952 __ bind(drain); 953 if (direction == copy_forwards) { 
        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
      } else {
        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
      }
      // now we need to copy any remaining part block which may
      // include a 4 word block subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ stp(t1, t2, Address(d, 2 * unit));
          __ str(t3, Address(__ pre(d, 4 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ stp(t3, t0, Address(d, 3 * unit));
          __ str(t2, Address(__ pre(d, 4 * unit)));
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ str(t1, Address(__ pre(d, 2 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ str(t0, Address(__ pre(d, 2 * unit)));
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;
    int unit = wordSize * direction;

    Label Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do. It does a lot of jumping, resulting in several
    // mispredicted branches. It might make more sense to do this
    // with something like Duff's device with a single computed branch.
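    // As a rough illustration: for a byte copy (granularity == 1) with
    // 13 bytes left (0b1101), the tests below fire as
    //   bit 3 set   -> copy one 8 byte word
    //   bit 2 set   -> copy one 4 byte int
    //   bit 1 clear -> skip the 2 byte short
    //   bit 0 set   -> copy the final byte
    // For wider granularities the same fixed-size chunks are copied, but
    // the bit tested is shifted down by log2(granularity) because count
    // is in elements rather than bytes, and the too-small cases are
    // compiled out.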
1041 1042 __ tbz(count, 3 - exact_log2(granularity), Lword); 1043 __ ldr(tmp, Address(__ adjust(s, unit, is_backwards))); 1044 __ str(tmp, Address(__ adjust(d, unit, is_backwards))); 1045 __ bind(Lword); 1046 1047 if (granularity <= sizeof (jint)) { 1048 __ tbz(count, 2 - exact_log2(granularity), Lint); 1049 __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards))); 1050 __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards))); 1051 __ bind(Lint); 1052 } 1053 1054 if (granularity <= sizeof (jshort)) { 1055 __ tbz(count, 1 - exact_log2(granularity), Lshort); 1056 __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards))); 1057 __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards))); 1058 __ bind(Lshort); 1059 } 1060 1061 if (granularity <= sizeof (jbyte)) { 1062 __ tbz(count, 0, Lbyte); 1063 __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards))); 1064 __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards))); 1065 __ bind(Lbyte); 1066 } 1067 } 1068 1069 Label copy_f, copy_b; 1070 1071 // All-singing all-dancing memory copy. 1072 // 1073 // Copy count units of memory from s to d. The size of a unit is 1074 // step, which can be positive or negative depending on the direction 1075 // of copy. If is_aligned is false, we align the source address. 1076 // 1077 1078 void copy_memory(bool is_aligned, Register s, Register d, 1079 Register count, Register tmp, int step) { 1080 copy_direction direction = step < 0 ? copy_backwards : copy_forwards; 1081 bool is_backwards = step < 0; 1082 int granularity = uabs(step); 1083 const Register t0 = r3, t1 = r4; 1084 1085 // <= 96 bytes do inline. Direction doesn't matter because we always 1086 // load all the data before writing anything 1087 Label copy4, copy8, copy16, copy32, copy80, copy_big, finish; 1088 const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8; 1089 const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12; 1090 const Register send = r17, dend = r16; 1091 1092 if (PrefetchCopyIntervalInBytes > 0) 1093 __ prfm(Address(s, 0), PLDL1KEEP); 1094 __ cmp(count, u1((UseSIMDForMemoryOps ? 
96:80)/granularity)); 1095 __ br(Assembler::HI, copy_big); 1096 1097 __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity)))); 1098 __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity)))); 1099 1100 __ cmp(count, u1(16/granularity)); 1101 __ br(Assembler::LS, copy16); 1102 1103 __ cmp(count, u1(64/granularity)); 1104 __ br(Assembler::HI, copy80); 1105 1106 __ cmp(count, u1(32/granularity)); 1107 __ br(Assembler::LS, copy32); 1108 1109 // 33..64 bytes 1110 if (UseSIMDForMemoryOps) { 1111 __ ldpq(v0, v1, Address(s, 0)); 1112 __ ldpq(v2, v3, Address(send, -32)); 1113 __ stpq(v0, v1, Address(d, 0)); 1114 __ stpq(v2, v3, Address(dend, -32)); 1115 } else { 1116 __ ldp(t0, t1, Address(s, 0)); 1117 __ ldp(t2, t3, Address(s, 16)); 1118 __ ldp(t4, t5, Address(send, -32)); 1119 __ ldp(t6, t7, Address(send, -16)); 1120 1121 __ stp(t0, t1, Address(d, 0)); 1122 __ stp(t2, t3, Address(d, 16)); 1123 __ stp(t4, t5, Address(dend, -32)); 1124 __ stp(t6, t7, Address(dend, -16)); 1125 } 1126 __ b(finish); 1127 1128 // 17..32 bytes 1129 __ bind(copy32); 1130 __ ldp(t0, t1, Address(s, 0)); 1131 __ ldp(t2, t3, Address(send, -16)); 1132 __ stp(t0, t1, Address(d, 0)); 1133 __ stp(t2, t3, Address(dend, -16)); 1134 __ b(finish); 1135 1136 // 65..80/96 bytes 1137 // (96 bytes if SIMD because we do 32 byes per instruction) 1138 __ bind(copy80); 1139 if (UseSIMDForMemoryOps) { 1140 __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0)); 1141 __ ldpq(v4, v5, Address(send, -32)); 1142 __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0)); 1143 __ stpq(v4, v5, Address(dend, -32)); 1144 } else { 1145 __ ldp(t0, t1, Address(s, 0)); 1146 __ ldp(t2, t3, Address(s, 16)); 1147 __ ldp(t4, t5, Address(s, 32)); 1148 __ ldp(t6, t7, Address(s, 48)); 1149 __ ldp(t8, t9, Address(send, -16)); 1150 1151 __ stp(t0, t1, Address(d, 0)); 1152 __ stp(t2, t3, Address(d, 16)); 1153 __ stp(t4, t5, Address(d, 32)); 1154 __ stp(t6, t7, Address(d, 48)); 1155 __ stp(t8, t9, Address(dend, -16)); 1156 } 1157 __ b(finish); 1158 1159 // 0..16 bytes 1160 __ bind(copy16); 1161 __ cmp(count, u1(8/granularity)); 1162 __ br(Assembler::LO, copy8); 1163 1164 // 8..16 bytes 1165 __ ldr(t0, Address(s, 0)); 1166 __ ldr(t1, Address(send, -8)); 1167 __ str(t0, Address(d, 0)); 1168 __ str(t1, Address(dend, -8)); 1169 __ b(finish); 1170 1171 if (granularity < 8) { 1172 // 4..7 bytes 1173 __ bind(copy8); 1174 __ tbz(count, 2 - exact_log2(granularity), copy4); 1175 __ ldrw(t0, Address(s, 0)); 1176 __ ldrw(t1, Address(send, -4)); 1177 __ strw(t0, Address(d, 0)); 1178 __ strw(t1, Address(dend, -4)); 1179 __ b(finish); 1180 if (granularity < 4) { 1181 // 0..3 bytes 1182 __ bind(copy4); 1183 __ cbz(count, finish); // get rid of 0 case 1184 if (granularity == 2) { 1185 __ ldrh(t0, Address(s, 0)); 1186 __ strh(t0, Address(d, 0)); 1187 } else { // granularity == 1 1188 // Now 1..3 bytes. Handle the 1 and 2 byte case by copying 1189 // the first and last byte. 1190 // Handle the 3 byte case by loading and storing base + count/2 1191 // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1)) 1192 // This does means in the 1 byte case we load/store the same 1193 // byte 3 times. 
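          // Worked example: with count == 3 the three byte stores below
          // hit d + 0 (first byte), dend - 1 == d + 2 (last byte) and
          // d + count/2 == d + 1 (middle byte); with count == 2 the
          // count/2 store simply rewrites the last byte, and with
          // count == 1 all three stores land on the same byte, as noted
          // above.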
1194 __ lsr(count, count, 1); 1195 __ ldrb(t0, Address(s, 0)); 1196 __ ldrb(t1, Address(send, -1)); 1197 __ ldrb(t2, Address(s, count)); 1198 __ strb(t0, Address(d, 0)); 1199 __ strb(t1, Address(dend, -1)); 1200 __ strb(t2, Address(d, count)); 1201 } 1202 __ b(finish); 1203 } 1204 } 1205 1206 __ bind(copy_big); 1207 if (is_backwards) { 1208 __ lea(s, Address(s, count, Address::lsl(exact_log2(-step)))); 1209 __ lea(d, Address(d, count, Address::lsl(exact_log2(-step)))); 1210 } 1211 1212 // Now we've got the small case out of the way we can align the 1213 // source address on a 2-word boundary. 1214 1215 Label aligned; 1216 1217 if (is_aligned) { 1218 // We may have to adjust by 1 word to get s 2-word-aligned. 1219 __ tbz(s, exact_log2(wordSize), aligned); 1220 __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards))); 1221 __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards))); 1222 __ sub(count, count, wordSize/granularity); 1223 } else { 1224 if (is_backwards) { 1225 __ andr(rscratch2, s, 2 * wordSize - 1); 1226 } else { 1227 __ neg(rscratch2, s); 1228 __ andr(rscratch2, rscratch2, 2 * wordSize - 1); 1229 } 1230 // rscratch2 is the byte adjustment needed to align s. 1231 __ cbz(rscratch2, aligned); 1232 int shift = exact_log2(granularity); 1233 if (shift) __ lsr(rscratch2, rscratch2, shift); 1234 __ sub(count, count, rscratch2); 1235 1236 #if 0 1237 // ?? This code is only correct for a disjoint copy. It may or 1238 // may not make sense to use it in that case. 1239 1240 // Copy the first pair; s and d may not be aligned. 1241 __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0)); 1242 __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0)); 1243 1244 // Align s and d, adjust count 1245 if (is_backwards) { 1246 __ sub(s, s, rscratch2); 1247 __ sub(d, d, rscratch2); 1248 } else { 1249 __ add(s, s, rscratch2); 1250 __ add(d, d, rscratch2); 1251 } 1252 #else 1253 copy_memory_small(s, d, rscratch2, rscratch1, step); 1254 #endif 1255 } 1256 1257 __ bind(aligned); 1258 1259 // s is now 2-word-aligned. 1260 1261 // We have a count of units and some trailing bytes. Adjust the 1262 // count and do a bulk copy of words. 1263 __ lsr(rscratch2, count, exact_log2(wordSize/granularity)); 1264 if (direction == copy_forwards) 1265 __ bl(copy_f); 1266 else 1267 __ bl(copy_b); 1268 1269 // And the tail. 1270 copy_memory_small(s, d, count, tmp, step); 1271 1272 if (granularity >= 8) __ bind(copy8); 1273 if (granularity >= 4) __ bind(copy4); 1274 __ bind(finish); 1275 } 1276 1277 1278 void clobber_registers() { 1279 #ifdef ASSERT 1280 RegSet clobbered 1281 = MacroAssembler::call_clobbered_registers() - rscratch1; 1282 __ mov(rscratch1, (uint64_t)0xdeadbeef); 1283 __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32); 1284 for (RegSetIterator it = clobbered.begin(); *it != noreg; ++it) { 1285 __ mov(*it, rscratch1); 1286 } 1287 #endif 1288 1289 } 1290 1291 // Scan over array at a for count oops, verifying each one. 1292 // Preserves a and count, clobbers rscratch1 and rscratch2. 
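  // In outline (a sketch, not the emitted code):
  //   for (i = 0; i < count; i++)
  //     verify_oop(a[i]);   // directly for 8 byte oops, via
  //                         // decode_heap_oop for 4 byte narrow oops
  // with rscratch2 serving as the index i.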
1293 void verify_oop_array (size_t size, Register a, Register count, Register temp) { 1294 Label loop, end; 1295 __ mov(rscratch1, a); 1296 __ mov(rscratch2, zr); 1297 __ bind(loop); 1298 __ cmp(rscratch2, count); 1299 __ br(Assembler::HS, end); 1300 if (size == (size_t)wordSize) { 1301 __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1302 __ verify_oop(temp); 1303 } else { 1304 __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1305 __ decode_heap_oop(temp); // calls verify_oop 1306 } 1307 __ add(rscratch2, rscratch2, size); 1308 __ b(loop); 1309 __ bind(end); 1310 } 1311 1312 // Arguments: 1313 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1314 // ignored 1315 // is_oop - true => oop array, so generate store check code 1316 // name - stub name string 1317 // 1318 // Inputs: 1319 // c_rarg0 - source array address 1320 // c_rarg1 - destination array address 1321 // c_rarg2 - element count, treated as ssize_t, can be zero 1322 // 1323 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1324 // the hardware handle it. The two dwords within qwords that span 1325 // cache line boundaries will still be loaded and stored atomicly. 1326 // 1327 // Side Effects: 1328 // disjoint_int_copy_entry is set to the no-overlap entry point 1329 // used by generate_conjoint_int_oop_copy(). 1330 // 1331 address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry, 1332 const char *name, bool dest_uninitialized = false) { 1333 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1334 RegSet saved_reg = RegSet::of(s, d, count); 1335 __ align(CodeEntryAlignment); 1336 StubCodeMark mark(this, "StubRoutines", name); 1337 address start = __ pc(); 1338 __ enter(); 1339 1340 if (entry != NULL) { 1341 *entry = __ pc(); 1342 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1343 BLOCK_COMMENT("Entry:"); 1344 } 1345 1346 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; 1347 if (dest_uninitialized) { 1348 decorators |= IS_DEST_UNINITIALIZED; 1349 } 1350 if (aligned) { 1351 decorators |= ARRAYCOPY_ALIGNED; 1352 } 1353 1354 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1355 bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg); 1356 1357 if (is_oop) { 1358 // save regs before copy_memory 1359 __ push(RegSet::of(d, count), sp); 1360 } 1361 { 1362 // UnsafeCopyMemory page error: continue after ucm 1363 bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size); 1364 UnsafeCopyMemoryMark ucmm(this, add_entry, true); 1365 copy_memory(aligned, s, d, count, rscratch1, size); 1366 } 1367 1368 if (is_oop) { 1369 __ pop(RegSet::of(d, count), sp); 1370 if (VerifyOops) 1371 verify_oop_array(size, d, count, r16); 1372 } 1373 1374 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1375 1376 __ leave(); 1377 __ mov(r0, zr); // return 0 1378 __ ret(lr); 1379 return start; 1380 } 1381 1382 // Arguments: 1383 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1384 // ignored 1385 // is_oop - true => oop array, so generate store check code 1386 // name - stub name string 1387 // 1388 // Inputs: 1389 // c_rarg0 - source array address 1390 // c_rarg1 - destination array address 1391 // c_rarg2 - element count, treated as ssize_t, can be zero 1392 // 1393 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1394 // the hardware handle it. 
The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
                                 address *entry, const char *name,
                                 bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_regs = RegSet::of(s, d, count);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // use fwd copy when (d-s) above_equal (count*size)
    __ sub(rscratch1, d, s);
    __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
    __ br(Assembler::HS, nooverlap_target);

    DecoratorSet decorators = IN_HEAP | IS_ARRAY;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    {
      // UnsafeCopyMemory page error: continue after ucm
      bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
      UnsafeCopyMemoryMark ucmm(this, add_entry, true);
      copy_memory(aligned, s, d, count, rscratch1, -size);
    }
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
    }
    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0 - source array address
  //   c_rarg1 - destination array address
  //   c_rarg2 - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it. The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
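  //
  // A note on the overlap test in generate_conjoint_copy above: the
  // unsigned comparison of (d - s) against (count << log2(size)) sends
  // the copy to the forward (no-overlap) entry both when the regions do
  // not overlap at all and when d is below s (the subtraction wraps to a
  // large unsigned value), because a forward copy is safe in either
  // case; the stub only falls through to the backward copy when the
  // destination starts inside the source range.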
1475 // 1476 address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) { 1477 const bool not_oop = false; 1478 return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name); 1479 } 1480 1481 // Arguments: 1482 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1483 // ignored 1484 // name - stub name string 1485 // 1486 // Inputs: 1487 // c_rarg0 - source array address 1488 // c_rarg1 - destination array address 1489 // c_rarg2 - element count, treated as ssize_t, can be zero 1490 // 1491 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1492 // we let the hardware handle it. The one to eight bytes within words, 1493 // dwords or qwords that span cache line boundaries will still be loaded 1494 // and stored atomically. 1495 // 1496 address generate_conjoint_byte_copy(bool aligned, address nooverlap_target, 1497 address* entry, const char *name) { 1498 const bool not_oop = false; 1499 return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name); 1500 } 1501 1502 // Arguments: 1503 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1504 // ignored 1505 // name - stub name string 1506 // 1507 // Inputs: 1508 // c_rarg0 - source array address 1509 // c_rarg1 - destination array address 1510 // c_rarg2 - element count, treated as ssize_t, can be zero 1511 // 1512 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1513 // let the hardware handle it. The two or four words within dwords 1514 // or qwords that span cache line boundaries will still be loaded 1515 // and stored atomically. 1516 // 1517 // Side Effects: 1518 // disjoint_short_copy_entry is set to the no-overlap entry point 1519 // used by generate_conjoint_short_copy(). 1520 // 1521 address generate_disjoint_short_copy(bool aligned, 1522 address* entry, const char *name) { 1523 const bool not_oop = false; 1524 return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name); 1525 } 1526 1527 // Arguments: 1528 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1529 // ignored 1530 // name - stub name string 1531 // 1532 // Inputs: 1533 // c_rarg0 - source array address 1534 // c_rarg1 - destination array address 1535 // c_rarg2 - element count, treated as ssize_t, can be zero 1536 // 1537 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1538 // let the hardware handle it. The two or four words within dwords 1539 // or qwords that span cache line boundaries will still be loaded 1540 // and stored atomically. 1541 // 1542 address generate_conjoint_short_copy(bool aligned, address nooverlap_target, 1543 address *entry, const char *name) { 1544 const bool not_oop = false; 1545 return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name); 1546 1547 } 1548 // Arguments: 1549 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1550 // ignored 1551 // name - stub name string 1552 // 1553 // Inputs: 1554 // c_rarg0 - source array address 1555 // c_rarg1 - destination array address 1556 // c_rarg2 - element count, treated as ssize_t, can be zero 1557 // 1558 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1559 // the hardware handle it. The two dwords within qwords that span 1560 // cache line boundaries will still be loaded and stored atomicly. 
1561 // 1562 // Side Effects: 1563 // disjoint_int_copy_entry is set to the no-overlap entry point 1564 // used by generate_conjoint_int_oop_copy(). 1565 // 1566 address generate_disjoint_int_copy(bool aligned, address *entry, 1567 const char *name, bool dest_uninitialized = false) { 1568 const bool not_oop = false; 1569 return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name); 1570 } 1571 1572 // Arguments: 1573 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1574 // ignored 1575 // name - stub name string 1576 // 1577 // Inputs: 1578 // c_rarg0 - source array address 1579 // c_rarg1 - destination array address 1580 // c_rarg2 - element count, treated as ssize_t, can be zero 1581 // 1582 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1583 // the hardware handle it. The two dwords within qwords that span 1584 // cache line boundaries will still be loaded and stored atomicly. 1585 // 1586 address generate_conjoint_int_copy(bool aligned, address nooverlap_target, 1587 address *entry, const char *name, 1588 bool dest_uninitialized = false) { 1589 const bool not_oop = false; 1590 return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name); 1591 } 1592 1593 1594 // Arguments: 1595 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1596 // ignored 1597 // name - stub name string 1598 // 1599 // Inputs: 1600 // c_rarg0 - source array address 1601 // c_rarg1 - destination array address 1602 // c_rarg2 - element count, treated as size_t, can be zero 1603 // 1604 // Side Effects: 1605 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1606 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1607 // 1608 address generate_disjoint_long_copy(bool aligned, address *entry, 1609 const char *name, bool dest_uninitialized = false) { 1610 const bool not_oop = false; 1611 return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name); 1612 } 1613 1614 // Arguments: 1615 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1616 // ignored 1617 // name - stub name string 1618 // 1619 // Inputs: 1620 // c_rarg0 - source array address 1621 // c_rarg1 - destination array address 1622 // c_rarg2 - element count, treated as size_t, can be zero 1623 // 1624 address generate_conjoint_long_copy(bool aligned, 1625 address nooverlap_target, address *entry, 1626 const char *name, bool dest_uninitialized = false) { 1627 const bool not_oop = false; 1628 return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name); 1629 } 1630 1631 // Arguments: 1632 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1633 // ignored 1634 // name - stub name string 1635 // 1636 // Inputs: 1637 // c_rarg0 - source array address 1638 // c_rarg1 - destination array address 1639 // c_rarg2 - element count, treated as size_t, can be zero 1640 // 1641 // Side Effects: 1642 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1643 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1644 // 1645 address generate_disjoint_oop_copy(bool aligned, address *entry, 1646 const char *name, bool dest_uninitialized) { 1647 const bool is_oop = true; 1648 const size_t size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1649 return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized); 1650 } 1651 1652 // Arguments: 1653 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1654 // ignored 1655 // name - stub name string 1656 // 1657 // Inputs: 1658 // c_rarg0 - source array address 1659 // c_rarg1 - destination array address 1660 // c_rarg2 - element count, treated as size_t, can be zero 1661 // 1662 address generate_conjoint_oop_copy(bool aligned, 1663 address nooverlap_target, address *entry, 1664 const char *name, bool dest_uninitialized) { 1665 const bool is_oop = true; 1666 const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1667 return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry, 1668 name, dest_uninitialized); 1669 } 1670 1671 1672 // Helper for generating a dynamic type check. 1673 // Smashes rscratch1, rscratch2. 1674 void generate_type_check(Register sub_klass, 1675 Register super_check_offset, 1676 Register super_klass, 1677 Label& L_success) { 1678 assert_different_registers(sub_klass, super_check_offset, super_klass); 1679 1680 BLOCK_COMMENT("type_check:"); 1681 1682 Label L_miss; 1683 1684 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL, 1685 super_check_offset); 1686 __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL); 1687 1688 // Fall through on failure! 1689 __ BIND(L_miss); 1690 } 1691 1692 // 1693 // Generate checkcasting array copy stub 1694 // 1695 // Input: 1696 // c_rarg0 - source array address 1697 // c_rarg1 - destination array address 1698 // c_rarg2 - element count, treated as ssize_t, can be zero 1699 // c_rarg3 - size_t ckoff (super_check_offset) 1700 // c_rarg4 - oop ckval (super_klass) 1701 // 1702 // Output: 1703 // r0 == 0 - success 1704 // r0 == -1^K - failure, where K is partial transfer count 1705 // 1706 address generate_checkcast_copy(const char *name, address *entry, 1707 bool dest_uninitialized = false) { 1708 1709 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop; 1710 1711 // Input registers (after setup_arg_regs) 1712 const Register from = c_rarg0; // source array address 1713 const Register to = c_rarg1; // destination array address 1714 const Register count = c_rarg2; // elementscount 1715 const Register ckoff = c_rarg3; // super_check_offset 1716 const Register ckval = c_rarg4; // super_klass 1717 1718 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4); 1719 RegSet wb_post_saved_regs = RegSet::of(count); 1720 1721 // Registers used as temps (r19, r20, r21, r22 are save-on-entry) 1722 const Register copied_oop = r22; // actual oop copied 1723 const Register count_save = r21; // orig elementscount 1724 const Register start_to = r20; // destination array start address 1725 const Register r19_klass = r19; // oop._klass 1726 1727 //--------------------------------------------------------------- 1728 // Assembler stub will be used for this call to arraycopy 1729 // if the two arrays are subtypes of Object[] but the 1730 // destination array type is not equal to or a supertype 1731 // of the source type. Each element must be separately 1732 // checked. 
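    // Roughly, the generated loop behaves like:
    //   for (; count != 0; count--) {
    //     oop o = *from++;
    //     if (o != NULL && !o->klass()->is_subtype_of(ckval)) break;  // report partial copy
    //     *to++ = o;
    //   }
    // with r0 set to 0 on success and to ~K (i.e. -1^K) when only K
    // elements could be copied; the fast subtype test uses ckoff as a
    // shortcut before falling back to the slow path.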
1733 1734 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1735 copied_oop, r19_klass, count_save); 1736 1737 __ align(CodeEntryAlignment); 1738 StubCodeMark mark(this, "StubRoutines", name); 1739 address start = __ pc(); 1740 1741 __ enter(); // required for proper stackwalking of RuntimeStub frame 1742 1743 #ifdef ASSERT 1744 // caller guarantees that the arrays really are different 1745 // otherwise, we would have to make conjoint checks 1746 { Label L; 1747 array_overlap_test(L, TIMES_OOP); 1748 __ stop("checkcast_copy within a single array"); 1749 __ bind(L); 1750 } 1751 #endif //ASSERT 1752 1753 // Caller of this entry point must set up the argument registers. 1754 if (entry != NULL) { 1755 *entry = __ pc(); 1756 BLOCK_COMMENT("Entry:"); 1757 } 1758 1759 // Empty array: Nothing to do. 1760 __ cbz(count, L_done); 1761 __ push(RegSet::of(r19, r20, r21, r22), sp); 1762 1763 #ifdef ASSERT 1764 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1765 // The ckoff and ckval must be mutually consistent, 1766 // even though caller generates both. 1767 { Label L; 1768 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1769 __ ldrw(start_to, Address(ckval, sco_offset)); 1770 __ cmpw(ckoff, start_to); 1771 __ br(Assembler::EQ, L); 1772 __ stop("super_check_offset inconsistent"); 1773 __ bind(L); 1774 } 1775 #endif //ASSERT 1776 1777 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 1778 bool is_oop = true; 1779 if (dest_uninitialized) { 1780 decorators |= IS_DEST_UNINITIALIZED; 1781 } 1782 1783 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1784 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs); 1785 1786 // save the original count 1787 __ mov(count_save, count); 1788 1789 // Copy from low to high addresses 1790 __ mov(start_to, to); // Save destination array start address 1791 __ b(L_load_element); 1792 1793 // ======== begin loop ======== 1794 // (Loop is rotated; its entry is L_load_element.) 1795 // Loop control: 1796 // for (; count != 0; count--) { 1797 // copied_oop = load_heap_oop(from++); 1798 // ... generate_type_check ...; 1799 // store_heap_oop(to++, copied_oop); 1800 // } 1801 __ align(OptoLoopAlignment); 1802 1803 __ BIND(L_store_element); 1804 __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW); // store the oop 1805 __ sub(count, count, 1); 1806 __ cbz(count, L_do_card_marks); 1807 1808 // ======== loop entry is here ======== 1809 __ BIND(L_load_element); 1810 __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop 1811 __ cbz(copied_oop, L_store_element); 1812 1813 __ load_klass(r19_klass, copied_oop);// query the object klass 1814 generate_type_check(r19_klass, ckoff, ckval, L_store_element); 1815 // ======== end loop ======== 1816 1817 // It was a real error; we must depend on the caller to finish the job. 1818 // Register count = remaining oops, count_orig = total oops. 1819 // Emit GC store barriers for the oops we have copied and report 1820 // their number to the caller. 
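// Worked example of the encoding below (illustrative): suppose 3 of 10
// elements were stored before the type check failed. 'count' still holds
// the remaining 7, so subs yields K = 10 - 7 = 3, and eon with zr
// (exclusive-OR with ~0) turns that into ~3 = -4 = -1^3, which is what
// the caller sees in r0. The caller recovers K as ~r0; r0 == 0 means the
// whole copy succeeded.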
1821 1822 __ subs(count, count_save, count); // K = partially copied oop count 1823 __ eon(count, count, zr); // report (-1^K) to caller 1824 __ br(Assembler::EQ, L_done_pop); 1825 1826 __ BIND(L_do_card_marks); 1827 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs); 1828 1829 __ bind(L_done_pop); 1830 __ pop(RegSet::of(r19, r20, r21, r22), sp); 1831 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 1832 1833 __ bind(L_done); 1834 __ mov(r0, count); 1835 __ leave(); 1836 __ ret(lr); 1837 1838 return start; 1839 } 1840 1841 // Perform range checks on the proposed arraycopy. 1842 // Kills temp, but nothing else. 1843 // Also, clean the sign bits of src_pos and dst_pos. 1844 void arraycopy_range_checks(Register src, // source array oop (c_rarg0) 1845 Register src_pos, // source position (c_rarg1) 1846 Register dst, // destination array oop (c_rarg2) 1847 Register dst_pos, // destination position (c_rarg3) 1848 Register length, 1849 Register temp, 1850 Label& L_failed) { 1851 BLOCK_COMMENT("arraycopy_range_checks:"); 1852 1853 assert_different_registers(rscratch1, temp); 1854 1855 // if (src_pos + length > arrayOop(src)->length()) FAIL; 1856 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes())); 1857 __ addw(temp, length, src_pos); 1858 __ cmpw(temp, rscratch1); 1859 __ br(Assembler::HI, L_failed); 1860 1861 // if (dst_pos + length > arrayOop(dst)->length()) FAIL; 1862 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes())); 1863 __ addw(temp, length, dst_pos); 1864 __ cmpw(temp, rscratch1); 1865 __ br(Assembler::HI, L_failed); 1866 1867 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'. 1868 __ movw(src_pos, src_pos); 1869 __ movw(dst_pos, dst_pos); 1870 1871 BLOCK_COMMENT("arraycopy_range_checks done"); 1872 } 1873 1874 // These stubs get called from some dumb test routine. 1875 // I'll write them properly when they're called from 1876 // something that's actually doing something. 1877 static void fake_arraycopy_stub(address src, address dst, int count) { 1878 assert(count == 0, "huh?"); 1879 } 1880 1881 1882 // 1883 // Generate 'unsafe' array copy stub 1884 // Though just as safe as the other stubs, it takes an unscaled 1885 // size_t argument instead of an element count. 1886 // 1887 // Input: 1888 // c_rarg0 - source array address 1889 // c_rarg1 - destination array address 1890 // c_rarg2 - byte count, treated as ssize_t, can be zero 1891 // 1892 // Examines the alignment of the operands and dispatches 1893 // to a long, int, short, or byte copy loop.
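// The body below is equivalent to the following pseudo-code (an
// illustrative sketch; 'size' is the raw byte count in c_rarg2):
//
//   int bits = (src | dst | size) & (BytesPerLong - 1);
//   if (bits == 0)            goto long_copy;   // count = size >> 3
//   else if ((bits & 3) == 0) goto int_copy;    // count = size >> 2
//   else if ((bits & 1) == 0) goto short_copy;  // count = size >> 1
//   else                      goto byte_copy;   // count = size
//
// ORing the three operands together lets a single test cover the source
// address, the destination address and the byte count at once.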
1894 // 1895 address generate_unsafe_copy(const char *name, 1896 address byte_copy_entry, 1897 address short_copy_entry, 1898 address int_copy_entry, 1899 address long_copy_entry) { 1900 Label L_long_aligned, L_int_aligned, L_short_aligned; 1901 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1902 1903 __ align(CodeEntryAlignment); 1904 StubCodeMark mark(this, "StubRoutines", name); 1905 address start = __ pc(); 1906 __ enter(); // required for proper stackwalking of RuntimeStub frame 1907 1908 // bump this on entry, not on exit: 1909 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 1910 1911 __ orr(rscratch1, s, d); 1912 __ orr(rscratch1, rscratch1, count); 1913 1914 __ andr(rscratch1, rscratch1, BytesPerLong-1); 1915 __ cbz(rscratch1, L_long_aligned); 1916 __ andr(rscratch1, rscratch1, BytesPerInt-1); 1917 __ cbz(rscratch1, L_int_aligned); 1918 __ tbz(rscratch1, 0, L_short_aligned); 1919 __ b(RuntimeAddress(byte_copy_entry)); 1920 1921 __ BIND(L_short_aligned); 1922 __ lsr(count, count, LogBytesPerShort); // size => short_count 1923 __ b(RuntimeAddress(short_copy_entry)); 1924 __ BIND(L_int_aligned); 1925 __ lsr(count, count, LogBytesPerInt); // size => int_count 1926 __ b(RuntimeAddress(int_copy_entry)); 1927 __ BIND(L_long_aligned); 1928 __ lsr(count, count, LogBytesPerLong); // size => long_count 1929 __ b(RuntimeAddress(long_copy_entry)); 1930 1931 return start; 1932 } 1933 1934 // 1935 // Generate generic array copy stubs 1936 // 1937 // Input: 1938 // c_rarg0 - src oop 1939 // c_rarg1 - src_pos (32-bits) 1940 // c_rarg2 - dst oop 1941 // c_rarg3 - dst_pos (32-bits) 1942 // c_rarg4 - element count (32-bits) 1943 // 1944 // Output: 1945 // r0 == 0 - success 1946 // r0 == -1^K - failure, where K is partial transfer count 1947 // 1948 address generate_generic_copy(const char *name, 1949 address byte_copy_entry, address short_copy_entry, 1950 address int_copy_entry, address oop_copy_entry, 1951 address long_copy_entry, address checkcast_copy_entry) { 1952 1953 Label L_failed, L_objArray; 1954 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 1955 1956 // Input registers 1957 const Register src = c_rarg0; // source array oop 1958 const Register src_pos = c_rarg1; // source position 1959 const Register dst = c_rarg2; // destination array oop 1960 const Register dst_pos = c_rarg3; // destination position 1961 const Register length = c_rarg4; 1962 1963 1964 // Registers used as temps 1965 const Register dst_klass = c_rarg5; 1966 1967 __ align(CodeEntryAlignment); 1968 1969 StubCodeMark mark(this, "StubRoutines", name); 1970 1971 address start = __ pc(); 1972 1973 __ enter(); // required for proper stackwalking of RuntimeStub frame 1974 1975 // bump this on entry, not on exit: 1976 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 1977 1978 //----------------------------------------------------------------------- 1979 // Assembler stub will be used for this call to arraycopy 1980 // if the following conditions are met: 1981 // 1982 // (1) src and dst must not be null. 1983 // (2) src_pos must not be negative. 1984 // (3) dst_pos must not be negative. 1985 // (4) length must not be negative. 1986 // (5) src klass and dst klass should be the same and not NULL. 1987 // (6) src and dst should be arrays. 1988 // (7) src_pos + length must not exceed length of src. 1989 // (8) dst_pos + length must not exceed length of dst. 
1990 // 1991 1992 // if (src == NULL) return -1; 1993 __ cbz(src, L_failed); 1994 1995 // if (src_pos < 0) return -1; 1996 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 1997 1998 // if (dst == NULL) return -1; 1999 __ cbz(dst, L_failed); 2000 2001 // if (dst_pos < 0) return -1; 2002 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2003 2004 // registers used as temp 2005 const Register scratch_length = r16; // elements count to copy 2006 const Register scratch_src_klass = r17; // array klass 2007 const Register lh = r15; // layout helper 2008 2009 // if (length < 0) return -1; 2010 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2011 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2012 2013 __ load_klass(scratch_src_klass, src); 2014 #ifdef ASSERT 2015 // assert(src->klass() != NULL); 2016 { 2017 BLOCK_COMMENT("assert klasses not null {"); 2018 Label L1, L2; 2019 __ cbnz(scratch_src_klass, L2); // it is broken if klass is NULL 2020 __ bind(L1); 2021 __ stop("broken null klass"); 2022 __ bind(L2); 2023 __ load_klass(rscratch1, dst); 2024 __ cbz(rscratch1, L1); // this would be broken also 2025 BLOCK_COMMENT("} assert klasses not null done"); 2026 } 2027 #endif 2028 2029 // Load layout helper (32-bits) 2030 // 2031 // |array_tag| | header_size | element_type | |log2_element_size| 2032 // 32 30 24 16 8 2 0 2033 // 2034 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2035 // 2036 2037 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2038 2039 // Handle objArrays completely differently... 2040 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2041 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2042 __ movw(rscratch1, objArray_lh); 2043 __ eorw(rscratch2, lh, rscratch1); 2044 __ cbzw(rscratch2, L_objArray); 2045 2046 // if (src->klass() != dst->klass()) return -1; 2047 __ load_klass(rscratch2, dst); 2048 __ eor(rscratch2, rscratch2, scratch_src_klass); 2049 __ cbnz(rscratch2, L_failed); 2050 2051 // if (!src->is_Array()) return -1; 2052 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2053 2054 // At this point, it is known to be a typeArray (array_tag 0x3). 
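// Illustrative decode of a typeArray layout helper, following the field
// layout sketched above (the header size is an assumption that varies with
// the build, so treat the constants as an example only): for an int[]
// with a 16-byte array header,
//
//   lh = (0x3 << 30) | (16 << 16) | (T_INT << 8) | 2
//
// The array tag in the top bits makes lh negative, which is why the tbz
// on bit 31 above rejects non-arrays, and the low byte (log2_element_size
// == 2) is what steers the element-size dispatch below to the int copy.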
2055 #ifdef ASSERT 2056 { 2057 BLOCK_COMMENT("assert primitive array {"); 2058 Label L; 2059 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2060 __ cmpw(lh, rscratch2); 2061 __ br(Assembler::GE, L); 2062 __ stop("must be a primitive array"); 2063 __ bind(L); 2064 BLOCK_COMMENT("} assert primitive array done"); 2065 } 2066 #endif 2067 2068 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2069 rscratch2, L_failed); 2070 2071 // TypeArrayKlass 2072 // 2073 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2074 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2075 // 2076 2077 const Register rscratch1_offset = rscratch1; // array offset 2078 const Register r15_elsize = lh; // element size 2079 2080 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2081 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2082 __ add(src, src, rscratch1_offset); // src array offset 2083 __ add(dst, dst, rscratch1_offset); // dst array offset 2084 BLOCK_COMMENT("choose copy loop based on element size"); 2085 2086 // next registers should be set before the jump to corresponding stub 2087 const Register from = c_rarg0; // source array address 2088 const Register to = c_rarg1; // destination array address 2089 const Register count = c_rarg2; // elements count 2090 2091 // 'from', 'to', 'count' registers should be set in such order 2092 // since they are the same as 'src', 'src_pos', 'dst'. 2093 2094 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2095 2096 // The possible values of elsize are 0-3, i.e. exact_log2(element 2097 // size in bytes). We do a simple bitwise binary search. 2098 __ BIND(L_copy_bytes); 2099 __ tbnz(r15_elsize, 1, L_copy_ints); 2100 __ tbnz(r15_elsize, 0, L_copy_shorts); 2101 __ lea(from, Address(src, src_pos));// src_addr 2102 __ lea(to, Address(dst, dst_pos));// dst_addr 2103 __ movw(count, scratch_length); // length 2104 __ b(RuntimeAddress(byte_copy_entry)); 2105 2106 __ BIND(L_copy_shorts); 2107 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2108 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2109 __ movw(count, scratch_length); // length 2110 __ b(RuntimeAddress(short_copy_entry)); 2111 2112 __ BIND(L_copy_ints); 2113 __ tbnz(r15_elsize, 0, L_copy_longs); 2114 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2115 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2116 __ movw(count, scratch_length); // length 2117 __ b(RuntimeAddress(int_copy_entry)); 2118 2119 __ BIND(L_copy_longs); 2120 #ifdef ASSERT 2121 { 2122 BLOCK_COMMENT("assert long copy {"); 2123 Label L; 2124 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize 2125 __ cmpw(r15_elsize, LogBytesPerLong); 2126 __ br(Assembler::EQ, L); 2127 __ stop("must be long copy, but elsize is wrong"); 2128 __ bind(L); 2129 BLOCK_COMMENT("} assert long copy done"); 2130 } 2131 #endif 2132 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2133 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2134 __ movw(count, scratch_length); // length 2135 __ b(RuntimeAddress(long_copy_entry)); 2136 2137 // ObjArrayKlass 2138 __ BIND(L_objArray); 2139 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2140 2141 Label L_plain_copy, L_checkcast_copy; 2142 // test array classes for subtyping 2143 __ load_klass(r15, dst); 2144 __ cmp(scratch_src_klass, r15); // usual case is exact 
equality 2145 __ br(Assembler::NE, L_checkcast_copy); 2146 2147 // Identically typed arrays can be copied without element-wise checks. 2148 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2149 rscratch2, L_failed); 2150 2151 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2152 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2153 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2154 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2155 __ movw(count, scratch_length); // length 2156 __ BIND(L_plain_copy); 2157 __ b(RuntimeAddress(oop_copy_entry)); 2158 2159 __ BIND(L_checkcast_copy); 2160 // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass) 2161 { 2162 // Before looking at dst.length, make sure dst is also an objArray. 2163 __ ldrw(rscratch1, Address(r15, lh_offset)); 2164 __ movw(rscratch2, objArray_lh); 2165 __ eorw(rscratch1, rscratch1, rscratch2); 2166 __ cbnzw(rscratch1, L_failed); 2167 2168 // It is safe to examine both src.length and dst.length. 2169 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2170 r15, L_failed); 2171 2172 __ load_klass(dst_klass, dst); // reload 2173 2174 // Marshal the base address arguments now, freeing registers. 2175 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2176 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2177 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2178 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2179 __ movw(count, length); // length (reloaded) 2180 Register sco_temp = c_rarg3; // this register is free now 2181 assert_different_registers(from, to, count, sco_temp, 2182 dst_klass, scratch_src_klass); 2183 // assert_clean_int(count, sco_temp); 2184 2185 // Generate the type check. 2186 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2187 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2188 2189 // Smashes rscratch1, rscratch2 2190 generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy); 2191 2192 // Fetch destination element klass from the ObjArrayKlass header. 2193 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2194 __ ldr(dst_klass, Address(dst_klass, ek_offset)); 2195 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2196 2197 // the checkcast_copy loop needs two extra arguments: 2198 assert(c_rarg3 == sco_temp, "#3 already in place"); 2199 // Set up arguments for checkcast_copy_entry. 2200 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass 2201 __ b(RuntimeAddress(checkcast_copy_entry)); 2202 } 2203 2204 __ BIND(L_failed); 2205 __ mov(r0, -1); 2206 __ leave(); // required for proper stackwalking of RuntimeStub frame 2207 __ ret(lr); 2208 2209 return start; 2210 } 2211 2212 // 2213 // Generate stub for array fill. If "aligned" is true, the 2214 // "to" address is assumed to be heapword aligned. 
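// As an example of the value replication the body performs (illustrative
// only): for a byte fill with value 0xAB, the bfi sequence widens the
// pattern step by step,
//
//   0x000000AB -> 0x0000ABAB -> 0xABABABAB -> 0xABABABABABABABAB
//
// so the bulk loop can store whole 64-bit words; short and int fills
// simply join this widening chain further along.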
2215 // 2216 // Arguments for generated stub: 2217 // to: c_rarg0 2218 // value: c_rarg1 2219 // count: c_rarg2 treated as signed 2220 // 2221 address generate_fill(BasicType t, bool aligned, const char *name) { 2222 __ align(CodeEntryAlignment); 2223 StubCodeMark mark(this, "StubRoutines", name); 2224 address start = __ pc(); 2225 2226 BLOCK_COMMENT("Entry:"); 2227 2228 const Register to = c_rarg0; // destination array address 2229 const Register value = c_rarg1; // value 2230 const Register count = c_rarg2; // elements count 2231 2232 const Register bz_base = r10; // base for block_zero routine 2233 const Register cnt_words = r11; // temp register 2234 2235 __ enter(); 2236 2237 Label L_fill_elements, L_exit1; 2238 2239 int shift = -1; 2240 switch (t) { 2241 case T_BYTE: 2242 shift = 0; 2243 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2244 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2245 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2246 __ br(Assembler::LO, L_fill_elements); 2247 break; 2248 case T_SHORT: 2249 shift = 1; 2250 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2251 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2252 __ br(Assembler::LO, L_fill_elements); 2253 break; 2254 case T_INT: 2255 shift = 2; 2256 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2257 __ br(Assembler::LO, L_fill_elements); 2258 break; 2259 default: ShouldNotReachHere(); 2260 } 2261 2262 // Align the destination address to an 8-byte boundary. 2263 Label L_skip_align1, L_skip_align2, L_skip_align4; 2264 if (!aligned) { 2265 switch (t) { 2266 case T_BYTE: 2267 // One byte misalignment happens only for byte arrays. 2268 __ tbz(to, 0, L_skip_align1); 2269 __ strb(value, Address(__ post(to, 1))); 2270 __ subw(count, count, 1); 2271 __ bind(L_skip_align1); 2272 // Fallthrough 2273 case T_SHORT: 2274 // Two bytes misalignment happens only for byte and short (char) arrays. 2275 __ tbz(to, 1, L_skip_align2); 2276 __ strh(value, Address(__ post(to, 2))); 2277 __ subw(count, count, 2 >> shift); 2278 __ bind(L_skip_align2); 2279 // Fallthrough 2280 case T_INT: 2281 // Align to 8 bytes, we know we are 4 byte aligned to start. 2282 __ tbz(to, 2, L_skip_align4); 2283 __ strw(value, Address(__ post(to, 4))); 2284 __ subw(count, count, 4 >> shift); 2285 __ bind(L_skip_align4); 2286 break; 2287 default: ShouldNotReachHere(); 2288 } 2289 } 2290 2291 // 2292 // Fill large chunks 2293 // 2294 __ lsrw(cnt_words, count, 3 - shift); // number of words 2295 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2296 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2297 if (UseBlockZeroing) { 2298 Label non_block_zeroing, rest; 2299 // If the fill value is zero we can use the fast zero_words(). 2300 __ cbnz(value, non_block_zeroing); 2301 __ mov(bz_base, to); 2302 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2303 __ zero_words(bz_base, cnt_words); 2304 __ b(rest); 2305 __ bind(non_block_zeroing); 2306 __ fill_words(to, cnt_words, value); 2307 __ bind(rest); 2308 } else { 2309 __ fill_words(to, cnt_words, value); 2310 } 2311 2312 // Remaining count is less than 8 bytes. Fill it by a single store. 2313 // Note that the total length is no less than 8 bytes.
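// Illustrative example of the tail store below: for a 13-element byte
// fill, the word loop above wrote bytes 0..7 and left count == 5 with
// 'to' pointing at byte 8. The code below advances 'to' past the end
// (byte 13) and issues one 8-byte store at [to - 8], covering bytes
// 5..12; bytes 5..7 are rewritten with the same fill pattern, so the
// overlap is harmless. This is why the total length must be at least
// 8 bytes.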
2314 if (t == T_BYTE || t == T_SHORT) { 2315 Label L_exit1; 2316 __ cbzw(count, L_exit1); 2317 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2318 __ str(value, Address(to, -8)); // overwrite some elements 2319 __ bind(L_exit1); 2320 __ leave(); 2321 __ ret(lr); 2322 } 2323 2324 // Handle copies less than 8 bytes. 2325 Label L_fill_2, L_fill_4, L_exit2; 2326 __ bind(L_fill_elements); 2327 switch (t) { 2328 case T_BYTE: 2329 __ tbz(count, 0, L_fill_2); 2330 __ strb(value, Address(__ post(to, 1))); 2331 __ bind(L_fill_2); 2332 __ tbz(count, 1, L_fill_4); 2333 __ strh(value, Address(__ post(to, 2))); 2334 __ bind(L_fill_4); 2335 __ tbz(count, 2, L_exit2); 2336 __ strw(value, Address(to)); 2337 break; 2338 case T_SHORT: 2339 __ tbz(count, 0, L_fill_4); 2340 __ strh(value, Address(__ post(to, 2))); 2341 __ bind(L_fill_4); 2342 __ tbz(count, 1, L_exit2); 2343 __ strw(value, Address(to)); 2344 break; 2345 case T_INT: 2346 __ cbzw(count, L_exit2); 2347 __ strw(value, Address(to)); 2348 break; 2349 default: ShouldNotReachHere(); 2350 } 2351 __ bind(L_exit2); 2352 __ leave(); 2353 __ ret(lr); 2354 return start; 2355 } 2356 2357 address generate_data_cache_writeback() { 2358 const Register line = c_rarg0; // address of line to write back 2359 2360 __ align(CodeEntryAlignment); 2361 2362 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback"); 2363 2364 address start = __ pc(); 2365 __ enter(); 2366 __ cache_wb(Address(line, 0)); 2367 __ leave(); 2368 __ ret(lr); 2369 2370 return start; 2371 } 2372 2373 address generate_data_cache_writeback_sync() { 2374 const Register is_pre = c_rarg0; // pre or post sync 2375 2376 __ align(CodeEntryAlignment); 2377 2378 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync"); 2379 2380 // pre wbsync is a no-op 2381 // post wbsync translates to an sfence 2382 2383 Label skip; 2384 address start = __ pc(); 2385 __ enter(); 2386 __ cbnz(is_pre, skip); 2387 __ cache_wbsync(false); 2388 __ bind(skip); 2389 __ leave(); 2390 __ ret(lr); 2391 2392 return start; 2393 } 2394 2395 void generate_arraycopy_stubs() { 2396 address entry; 2397 address entry_jbyte_arraycopy; 2398 address entry_jshort_arraycopy; 2399 address entry_jint_arraycopy; 2400 address entry_oop_arraycopy; 2401 address entry_jlong_arraycopy; 2402 address entry_checkcast_arraycopy; 2403 2404 generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards); 2405 generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards); 2406 2407 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2408 2409 //*** jbyte 2410 // Always need aligned and unaligned versions 2411 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2412 "jbyte_disjoint_arraycopy"); 2413 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2414 &entry_jbyte_arraycopy, 2415 "jbyte_arraycopy"); 2416 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2417 "arrayof_jbyte_disjoint_arraycopy"); 2418 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL, 2419 "arrayof_jbyte_arraycopy"); 2420 2421 //*** jshort 2422 // Always need aligned and unaligned versions 2423 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2424 "jshort_disjoint_arraycopy"); 2425 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2426 &entry_jshort_arraycopy, 2427 "jshort_arraycopy"); 2428 StubRoutines::_arrayof_jshort_disjoint_arraycopy = 
generate_disjoint_short_copy(true, &entry, 2429 "arrayof_jshort_disjoint_arraycopy"); 2430 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL, 2431 "arrayof_jshort_arraycopy"); 2432 2433 //*** jint 2434 // Aligned versions 2435 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2436 "arrayof_jint_disjoint_arraycopy"); 2437 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2438 "arrayof_jint_arraycopy"); 2439 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 2440 // entry_jint_arraycopy always points to the unaligned version 2441 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2442 "jint_disjoint_arraycopy"); 2443 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2444 &entry_jint_arraycopy, 2445 "jint_arraycopy"); 2446 2447 //*** jlong 2448 // It is always aligned 2449 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2450 "arrayof_jlong_disjoint_arraycopy"); 2451 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2452 "arrayof_jlong_arraycopy"); 2453 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2454 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2455 2456 //*** oops 2457 { 2458 // With compressed oops we need unaligned versions; notice that 2459 // we overwrite entry_oop_arraycopy. 2460 bool aligned = !UseCompressedOops; 2461 2462 StubRoutines::_arrayof_oop_disjoint_arraycopy 2463 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2464 /*dest_uninitialized*/false); 2465 StubRoutines::_arrayof_oop_arraycopy 2466 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2467 /*dest_uninitialized*/false); 2468 // Aligned versions without pre-barriers 2469 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2470 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2471 /*dest_uninitialized*/true); 2472 StubRoutines::_arrayof_oop_arraycopy_uninit 2473 = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit", 2474 /*dest_uninitialized*/true); 2475 } 2476 2477 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2478 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2479 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2480 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2481 2482 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2483 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, 2484 /*dest_uninitialized*/true); 2485 2486 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2487 entry_jbyte_arraycopy, 2488 entry_jshort_arraycopy, 2489 entry_jint_arraycopy, 2490 entry_jlong_arraycopy); 2491 2492 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2493 entry_jbyte_arraycopy, 2494 entry_jshort_arraycopy, 2495 entry_jint_arraycopy, 2496 entry_oop_arraycopy, 2497 entry_jlong_arraycopy, 2498 entry_checkcast_arraycopy); 2499 2500 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 
2501 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2502 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2503 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2504 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 2505 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2506 } 2507 2508 void generate_math_stubs() { Unimplemented(); } 2509 2510 // Arguments: 2511 // 2512 // Inputs: 2513 // c_rarg0 - source byte array address 2514 // c_rarg1 - destination byte array address 2515 // c_rarg2 - K (key) in little endian int array 2516 // 2517 address generate_aescrypt_encryptBlock() { 2518 __ align(CodeEntryAlignment); 2519 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 2520 2521 Label L_doLast; 2522 2523 const Register from = c_rarg0; // source array address 2524 const Register to = c_rarg1; // destination array address 2525 const Register key = c_rarg2; // key array address 2526 const Register keylen = rscratch1; 2527 2528 address start = __ pc(); 2529 __ enter(); 2530 2531 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2532 2533 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2534 2535 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2536 __ rev32(v1, __ T16B, v1); 2537 __ rev32(v2, __ T16B, v2); 2538 __ rev32(v3, __ T16B, v3); 2539 __ rev32(v4, __ T16B, v4); 2540 __ aese(v0, v1); 2541 __ aesmc(v0, v0); 2542 __ aese(v0, v2); 2543 __ aesmc(v0, v0); 2544 __ aese(v0, v3); 2545 __ aesmc(v0, v0); 2546 __ aese(v0, v4); 2547 __ aesmc(v0, v0); 2548 2549 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2550 __ rev32(v1, __ T16B, v1); 2551 __ rev32(v2, __ T16B, v2); 2552 __ rev32(v3, __ T16B, v3); 2553 __ rev32(v4, __ T16B, v4); 2554 __ aese(v0, v1); 2555 __ aesmc(v0, v0); 2556 __ aese(v0, v2); 2557 __ aesmc(v0, v0); 2558 __ aese(v0, v3); 2559 __ aesmc(v0, v0); 2560 __ aese(v0, v4); 2561 __ aesmc(v0, v0); 2562 2563 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2564 __ rev32(v1, __ T16B, v1); 2565 __ rev32(v2, __ T16B, v2); 2566 2567 __ cmpw(keylen, 44); 2568 __ br(Assembler::EQ, L_doLast); 2569 2570 __ aese(v0, v1); 2571 __ aesmc(v0, v0); 2572 __ aese(v0, v2); 2573 __ aesmc(v0, v0); 2574 2575 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2576 __ rev32(v1, __ T16B, v1); 2577 __ rev32(v2, __ T16B, v2); 2578 2579 __ cmpw(keylen, 52); 2580 __ br(Assembler::EQ, L_doLast); 2581 2582 __ aese(v0, v1); 2583 __ aesmc(v0, v0); 2584 __ aese(v0, v2); 2585 __ aesmc(v0, v0); 2586 2587 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2588 __ rev32(v1, __ T16B, v1); 2589 __ rev32(v2, __ T16B, v2); 2590 2591 __ BIND(L_doLast); 2592 2593 __ aese(v0, v1); 2594 __ aesmc(v0, v0); 2595 __ aese(v0, v2); 2596 2597 __ ld1(v1, __ T16B, key); 2598 __ rev32(v1, __ T16B, v1); 2599 __ eor(v0, __ T16B, v0, v1); 2600 2601 __ st1(v0, __ T16B, to); 2602 2603 __ mov(r0, 0); 2604 2605 __ leave(); 2606 __ ret(lr); 2607 2608 return start; 2609 } 2610 2611 // Arguments: 2612 // 2613 // Inputs: 2614 // c_rarg0 - source byte array address 2615 // c_rarg1 - destination byte array address 2616 // c_rarg2 - K (key) in little endian int array 2617 // 2618 address generate_aescrypt_decryptBlock() { 2619 assert(UseAES, "need AES instructions and misaligned SSE support"); 2620 __ align(CodeEntryAlignment); 2621 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 2622 Label L_doLast; 2623 2624 const 
Register from = c_rarg0; // source array address 2625 const Register to = c_rarg1; // destination array address 2626 const Register key = c_rarg2; // key array address 2627 const Register keylen = rscratch1; 2628 2629 address start = __ pc(); 2630 __ enter(); // required for proper stackwalking of RuntimeStub frame 2631 2632 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2633 2634 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2635 2636 __ ld1(v5, __ T16B, __ post(key, 16)); 2637 __ rev32(v5, __ T16B, v5); 2638 2639 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2640 __ rev32(v1, __ T16B, v1); 2641 __ rev32(v2, __ T16B, v2); 2642 __ rev32(v3, __ T16B, v3); 2643 __ rev32(v4, __ T16B, v4); 2644 __ aesd(v0, v1); 2645 __ aesimc(v0, v0); 2646 __ aesd(v0, v2); 2647 __ aesimc(v0, v0); 2648 __ aesd(v0, v3); 2649 __ aesimc(v0, v0); 2650 __ aesd(v0, v4); 2651 __ aesimc(v0, v0); 2652 2653 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2654 __ rev32(v1, __ T16B, v1); 2655 __ rev32(v2, __ T16B, v2); 2656 __ rev32(v3, __ T16B, v3); 2657 __ rev32(v4, __ T16B, v4); 2658 __ aesd(v0, v1); 2659 __ aesimc(v0, v0); 2660 __ aesd(v0, v2); 2661 __ aesimc(v0, v0); 2662 __ aesd(v0, v3); 2663 __ aesimc(v0, v0); 2664 __ aesd(v0, v4); 2665 __ aesimc(v0, v0); 2666 2667 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2668 __ rev32(v1, __ T16B, v1); 2669 __ rev32(v2, __ T16B, v2); 2670 2671 __ cmpw(keylen, 44); 2672 __ br(Assembler::EQ, L_doLast); 2673 2674 __ aesd(v0, v1); 2675 __ aesimc(v0, v0); 2676 __ aesd(v0, v2); 2677 __ aesimc(v0, v0); 2678 2679 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2680 __ rev32(v1, __ T16B, v1); 2681 __ rev32(v2, __ T16B, v2); 2682 2683 __ cmpw(keylen, 52); 2684 __ br(Assembler::EQ, L_doLast); 2685 2686 __ aesd(v0, v1); 2687 __ aesimc(v0, v0); 2688 __ aesd(v0, v2); 2689 __ aesimc(v0, v0); 2690 2691 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2692 __ rev32(v1, __ T16B, v1); 2693 __ rev32(v2, __ T16B, v2); 2694 2695 __ BIND(L_doLast); 2696 2697 __ aesd(v0, v1); 2698 __ aesimc(v0, v0); 2699 __ aesd(v0, v2); 2700 2701 __ eor(v0, __ T16B, v0, v5); 2702 2703 __ st1(v0, __ T16B, to); 2704 2705 __ mov(r0, 0); 2706 2707 __ leave(); 2708 __ ret(lr); 2709 2710 return start; 2711 } 2712 2713 // Arguments: 2714 // 2715 // Inputs: 2716 // c_rarg0 - source byte array address 2717 // c_rarg1 - destination byte array address 2718 // c_rarg2 - K (key) in little endian int array 2719 // c_rarg3 - r vector byte array address 2720 // c_rarg4 - input length 2721 // 2722 // Output: 2723 // x0 - input length 2724 // 2725 address generate_cipherBlockChaining_encryptAESCrypt() { 2726 assert(UseAES, "need AES instructions and misaligned SSE support"); 2727 __ align(CodeEntryAlignment); 2728 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 2729 2730 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2731 2732 const Register from = c_rarg0; // source array address 2733 const Register to = c_rarg1; // destination array address 2734 const Register key = c_rarg2; // key array address 2735 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2736 // and left with the results of the last encryption block 2737 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2738 const Register keylen = rscratch1; 2739 2740 address start = __ pc(); 2741 2742 __ enter(); 2743 2744 __ movw(rscratch2, len_reg); 2745 2746 __ ldrw(keylen, Address(key, 
arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2747 2748 __ ld1(v0, __ T16B, rvec); 2749 2750 __ cmpw(keylen, 52); 2751 __ br(Assembler::CC, L_loadkeys_44); 2752 __ br(Assembler::EQ, L_loadkeys_52); 2753 2754 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2755 __ rev32(v17, __ T16B, v17); 2756 __ rev32(v18, __ T16B, v18); 2757 __ BIND(L_loadkeys_52); 2758 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2759 __ rev32(v19, __ T16B, v19); 2760 __ rev32(v20, __ T16B, v20); 2761 __ BIND(L_loadkeys_44); 2762 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2763 __ rev32(v21, __ T16B, v21); 2764 __ rev32(v22, __ T16B, v22); 2765 __ rev32(v23, __ T16B, v23); 2766 __ rev32(v24, __ T16B, v24); 2767 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2768 __ rev32(v25, __ T16B, v25); 2769 __ rev32(v26, __ T16B, v26); 2770 __ rev32(v27, __ T16B, v27); 2771 __ rev32(v28, __ T16B, v28); 2772 __ ld1(v29, v30, v31, __ T16B, key); 2773 __ rev32(v29, __ T16B, v29); 2774 __ rev32(v30, __ T16B, v30); 2775 __ rev32(v31, __ T16B, v31); 2776 2777 __ BIND(L_aes_loop); 2778 __ ld1(v1, __ T16B, __ post(from, 16)); 2779 __ eor(v0, __ T16B, v0, v1); 2780 2781 __ br(Assembler::CC, L_rounds_44); 2782 __ br(Assembler::EQ, L_rounds_52); 2783 2784 __ aese(v0, v17); __ aesmc(v0, v0); 2785 __ aese(v0, v18); __ aesmc(v0, v0); 2786 __ BIND(L_rounds_52); 2787 __ aese(v0, v19); __ aesmc(v0, v0); 2788 __ aese(v0, v20); __ aesmc(v0, v0); 2789 __ BIND(L_rounds_44); 2790 __ aese(v0, v21); __ aesmc(v0, v0); 2791 __ aese(v0, v22); __ aesmc(v0, v0); 2792 __ aese(v0, v23); __ aesmc(v0, v0); 2793 __ aese(v0, v24); __ aesmc(v0, v0); 2794 __ aese(v0, v25); __ aesmc(v0, v0); 2795 __ aese(v0, v26); __ aesmc(v0, v0); 2796 __ aese(v0, v27); __ aesmc(v0, v0); 2797 __ aese(v0, v28); __ aesmc(v0, v0); 2798 __ aese(v0, v29); __ aesmc(v0, v0); 2799 __ aese(v0, v30); 2800 __ eor(v0, __ T16B, v0, v31); 2801 2802 __ st1(v0, __ T16B, __ post(to, 16)); 2803 2804 __ subw(len_reg, len_reg, 16); 2805 __ cbnzw(len_reg, L_aes_loop); 2806 2807 __ st1(v0, __ T16B, rvec); 2808 2809 __ mov(r0, rscratch2); 2810 2811 __ leave(); 2812 __ ret(lr); 2813 2814 return start; 2815 } 2816 2817 // Arguments: 2818 // 2819 // Inputs: 2820 // c_rarg0 - source byte array address 2821 // c_rarg1 - destination byte array address 2822 // c_rarg2 - K (key) in little endian int array 2823 // c_rarg3 - r vector byte array address 2824 // c_rarg4 - input length 2825 // 2826 // Output: 2827 // r0 - input length 2828 // 2829 address generate_cipherBlockChaining_decryptAESCrypt() { 2830 assert(UseAES, "need AES instructions and misaligned SSE support"); 2831 __ align(CodeEntryAlignment); 2832 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 2833 2834 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2835 2836 const Register from = c_rarg0; // source array address 2837 const Register to = c_rarg1; // destination array address 2838 const Register key = c_rarg2; // key array address 2839 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2840 // and left with the results of the last encryption block 2841 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2842 const Register keylen = rscratch1; 2843 2844 address start = __ pc(); 2845 2846 __ enter(); 2847 2848 __ movw(rscratch2, len_reg); 2849 2850 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2851 2852 __ 
ld1(v2, __ T16B, rvec); 2853 2854 __ ld1(v31, __ T16B, __ post(key, 16)); 2855 __ rev32(v31, __ T16B, v31); 2856 2857 __ cmpw(keylen, 52); 2858 __ br(Assembler::CC, L_loadkeys_44); 2859 __ br(Assembler::EQ, L_loadkeys_52); 2860 2861 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2862 __ rev32(v17, __ T16B, v17); 2863 __ rev32(v18, __ T16B, v18); 2864 __ BIND(L_loadkeys_52); 2865 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2866 __ rev32(v19, __ T16B, v19); 2867 __ rev32(v20, __ T16B, v20); 2868 __ BIND(L_loadkeys_44); 2869 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2870 __ rev32(v21, __ T16B, v21); 2871 __ rev32(v22, __ T16B, v22); 2872 __ rev32(v23, __ T16B, v23); 2873 __ rev32(v24, __ T16B, v24); 2874 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2875 __ rev32(v25, __ T16B, v25); 2876 __ rev32(v26, __ T16B, v26); 2877 __ rev32(v27, __ T16B, v27); 2878 __ rev32(v28, __ T16B, v28); 2879 __ ld1(v29, v30, __ T16B, key); 2880 __ rev32(v29, __ T16B, v29); 2881 __ rev32(v30, __ T16B, v30); 2882 2883 __ BIND(L_aes_loop); 2884 __ ld1(v0, __ T16B, __ post(from, 16)); 2885 __ orr(v1, __ T16B, v0, v0); 2886 2887 __ br(Assembler::CC, L_rounds_44); 2888 __ br(Assembler::EQ, L_rounds_52); 2889 2890 __ aesd(v0, v17); __ aesimc(v0, v0); 2891 __ aesd(v0, v18); __ aesimc(v0, v0); 2892 __ BIND(L_rounds_52); 2893 __ aesd(v0, v19); __ aesimc(v0, v0); 2894 __ aesd(v0, v20); __ aesimc(v0, v0); 2895 __ BIND(L_rounds_44); 2896 __ aesd(v0, v21); __ aesimc(v0, v0); 2897 __ aesd(v0, v22); __ aesimc(v0, v0); 2898 __ aesd(v0, v23); __ aesimc(v0, v0); 2899 __ aesd(v0, v24); __ aesimc(v0, v0); 2900 __ aesd(v0, v25); __ aesimc(v0, v0); 2901 __ aesd(v0, v26); __ aesimc(v0, v0); 2902 __ aesd(v0, v27); __ aesimc(v0, v0); 2903 __ aesd(v0, v28); __ aesimc(v0, v0); 2904 __ aesd(v0, v29); __ aesimc(v0, v0); 2905 __ aesd(v0, v30); 2906 __ eor(v0, __ T16B, v0, v31); 2907 __ eor(v0, __ T16B, v0, v2); 2908 2909 __ st1(v0, __ T16B, __ post(to, 16)); 2910 __ orr(v2, __ T16B, v1, v1); 2911 2912 __ subw(len_reg, len_reg, 16); 2913 __ cbnzw(len_reg, L_aes_loop); 2914 2915 __ st1(v2, __ T16B, rvec); 2916 2917 __ mov(r0, rscratch2); 2918 2919 __ leave(); 2920 __ ret(lr); 2921 2922 return start; 2923 } 2924 2925 // Arguments: 2926 // 2927 // Inputs: 2928 // c_rarg0 - byte[] source+offset 2929 // c_rarg1 - int[] SHA.state 2930 // c_rarg2 - int offset 2931 // c_rarg3 - int limit 2932 // 2933 address generate_sha1_implCompress(bool multi_block, const char *name) { 2934 __ align(CodeEntryAlignment); 2935 StubCodeMark mark(this, "StubRoutines", name); 2936 address start = __ pc(); 2937 2938 Register buf = c_rarg0; 2939 Register state = c_rarg1; 2940 Register ofs = c_rarg2; 2941 Register limit = c_rarg3; 2942 2943 Label keys; 2944 Label sha1_loop; 2945 2946 // load the keys into v0..v3 2947 __ adr(rscratch1, keys); 2948 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 2949 // load 5 words state into v6, v7 2950 __ ldrq(v6, Address(state, 0)); 2951 __ ldrs(v7, Address(state, 16)); 2952 2953 2954 __ BIND(sha1_loop); 2955 // load 64 bytes of data into v16..v19 2956 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 2957 __ rev32(v16, __ T16B, v16); 2958 __ rev32(v17, __ T16B, v17); 2959 __ rev32(v18, __ T16B, v18); 2960 __ rev32(v19, __ T16B, v19); 2961 2962 // do the sha1 2963 __ addv(v4, __ T4S, v16, v0); 2964 __ orr(v20, __ T16B, v6, v6); 2965 2966 FloatRegister d0 = v16; 2967 FloatRegister d1 = v17; 2968 FloatRegister d2 = v18; 2969 FloatRegister d3 = v19; 2970 2971 for (int round = 0; round < 20; round++) { 2972 FloatRegister tmp1 = (round & 1) ? v4 : v5; 2973 FloatRegister tmp2 = (round & 1) ? v21 : v22; 2974 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 2975 FloatRegister tmp4 = (round & 1) ? v5 : v4; 2976 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3)); 2977 2978 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 2979 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 2980 __ sha1h(tmp2, __ T4S, v20); 2981 if (round < 5) 2982 __ sha1c(v20, __ T4S, tmp3, tmp4); 2983 else if (round < 10 || round >= 15) 2984 __ sha1p(v20, __ T4S, tmp3, tmp4); 2985 else 2986 __ sha1m(v20, __ T4S, tmp3, tmp4); 2987 if (round < 16) __ sha1su1(d0, __ T4S, d3); 2988 2989 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 2990 } 2991 2992 __ addv(v7, __ T2S, v7, v21); 2993 __ addv(v6, __ T4S, v6, v20); 2994 2995 if (multi_block) { 2996 __ add(ofs, ofs, 64); 2997 __ cmp(ofs, limit); 2998 __ br(Assembler::LE, sha1_loop); 2999 __ mov(c_rarg0, ofs); // return ofs 3000 } 3001 3002 __ strq(v6, Address(state, 0)); 3003 __ strs(v7, Address(state, 16)); 3004 3005 __ ret(lr); 3006 3007 __ bind(keys); 3008 __ emit_int32(0x5a827999); 3009 __ emit_int32(0x6ed9eba1); 3010 __ emit_int32(0x8f1bbcdc); 3011 __ emit_int32(0xca62c1d6); 3012 3013 return start; 3014 } 3015 3016 3017 // Arguments: 3018 // 3019 // Inputs: 3020 // c_rarg0 - byte[] source+offset 3021 // c_rarg1 - int[] SHA.state 3022 // c_rarg2 - int offset 3023 // c_rarg3 - int limit 3024 // 3025 address generate_sha256_implCompress(bool multi_block, const char *name) { 3026 static const uint32_t round_consts[64] = { 3027 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3028 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3029 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3030 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3031 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3032 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3033 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3034 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3035 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3036 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3037 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3038 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3039 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3040 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3041 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3042 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3043 }; 3044 __ align(CodeEntryAlignment); 3045 StubCodeMark mark(this, "StubRoutines", name); 3046 address start = __ pc(); 3047 3048 Register buf = c_rarg0; 3049 Register state = c_rarg1; 3050 Register ofs = c_rarg2; 3051 Register limit = c_rarg3; 3052 3053 Label sha1_loop; 3054 3055 __ stpd(v8, v9, __ pre(sp, -32)); 3056 __ stpd(v10, v11, Address(sp, 16)); 3057 3058 // dga == v0 3059 // dgb == v1 3060 // dg0 == v2 3061 // dg1 == v3 3062 // dg2 == v4 3063 // t0 == v6 3064 // t1 == v7 3065 3066 // load 16 keys to v16..v31 3067 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3068 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3069 __ 
ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3070 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3071 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3072 3073 // load 8 words (256 bits) state 3074 __ ldpq(v0, v1, state); 3075 3076 __ BIND(sha1_loop); 3077 // load 64 bytes of data into v8..v11 3078 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf); 3079 __ rev32(v8, __ T16B, v8); 3080 __ rev32(v9, __ T16B, v9); 3081 __ rev32(v10, __ T16B, v10); 3082 __ rev32(v11, __ T16B, v11); 3083 3084 __ addv(v6, __ T4S, v8, v16); 3085 __ orr(v2, __ T16B, v0, v0); 3086 __ orr(v3, __ T16B, v1, v1); 3087 3088 FloatRegister d0 = v8; 3089 FloatRegister d1 = v9; 3090 FloatRegister d2 = v10; 3091 FloatRegister d3 = v11; 3092 3093 3094 for (int round = 0; round < 16; round++) { 3095 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3096 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3097 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3098 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3099 3100 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3101 __ orr(v4, __ T16B, v2, v2); 3102 if (round < 15) 3103 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3104 __ sha256h(v2, __ T4S, v3, tmp2); 3105 __ sha256h2(v3, __ T4S, v4, tmp2); 3106 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3107 3108 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3109 } 3110 3111 __ addv(v0, __ T4S, v0, v2); 3112 __ addv(v1, __ T4S, v1, v3); 3113 3114 if (multi_block) { 3115 __ add(ofs, ofs, 64); 3116 __ cmp(ofs, limit); 3117 __ br(Assembler::LE, sha1_loop); 3118 __ mov(c_rarg0, ofs); // return ofs 3119 } 3120 3121 __ ldpd(v10, v11, Address(sp, 16)); 3122 __ ldpd(v8, v9, __ post(sp, 32)); 3123 3124 __ stpq(v0, v1, state); 3125 3126 __ ret(lr); 3127 3128 return start; 3129 } 3130 3131 // Arguments: 3132 // 3133 // Inputs: 3134 // c_rarg0 - byte[] source+offset 3135 // c_rarg1 - int[] SHA.state 3136 // c_rarg2 - int offset 3137 // c_rarg3 - int limit 3138 // 3139 address generate_sha512_implCompress(bool multi_block, const char *name) { 3140 static const uint64_t round_consts[80] = { 3141 0x428A2F98D728AE22L, 0x7137449123EF65CDL, 0xB5C0FBCFEC4D3B2FL, 3142 0xE9B5DBA58189DBBCL, 0x3956C25BF348B538L, 0x59F111F1B605D019L, 3143 0x923F82A4AF194F9BL, 0xAB1C5ED5DA6D8118L, 0xD807AA98A3030242L, 3144 0x12835B0145706FBEL, 0x243185BE4EE4B28CL, 0x550C7DC3D5FFB4E2L, 3145 0x72BE5D74F27B896FL, 0x80DEB1FE3B1696B1L, 0x9BDC06A725C71235L, 3146 0xC19BF174CF692694L, 0xE49B69C19EF14AD2L, 0xEFBE4786384F25E3L, 3147 0x0FC19DC68B8CD5B5L, 0x240CA1CC77AC9C65L, 0x2DE92C6F592B0275L, 3148 0x4A7484AA6EA6E483L, 0x5CB0A9DCBD41FBD4L, 0x76F988DA831153B5L, 3149 0x983E5152EE66DFABL, 0xA831C66D2DB43210L, 0xB00327C898FB213FL, 3150 0xBF597FC7BEEF0EE4L, 0xC6E00BF33DA88FC2L, 0xD5A79147930AA725L, 3151 0x06CA6351E003826FL, 0x142929670A0E6E70L, 0x27B70A8546D22FFCL, 3152 0x2E1B21385C26C926L, 0x4D2C6DFC5AC42AEDL, 0x53380D139D95B3DFL, 3153 0x650A73548BAF63DEL, 0x766A0ABB3C77B2A8L, 0x81C2C92E47EDAEE6L, 3154 0x92722C851482353BL, 0xA2BFE8A14CF10364L, 0xA81A664BBC423001L, 3155 0xC24B8B70D0F89791L, 0xC76C51A30654BE30L, 0xD192E819D6EF5218L, 3156 0xD69906245565A910L, 0xF40E35855771202AL, 0x106AA07032BBD1B8L, 3157 0x19A4C116B8D2D0C8L, 0x1E376C085141AB53L, 0x2748774CDF8EEB99L, 3158 0x34B0BCB5E19B48A8L, 0x391C0CB3C5C95A63L, 0x4ED8AA4AE3418ACBL, 3159 0x5B9CCA4F7763E373L, 0x682E6FF3D6B2B8A3L, 0x748F82EE5DEFB2FCL, 3160 0x78A5636F43172F60L, 0x84C87814A1F0AB72L, 0x8CC702081A6439ECL, 3161 0x90BEFFFA23631E28L, 0xA4506CEBDE82BDE9L, 0xBEF9A3F7B2C67915L, 3162 
0xC67178F2E372532BL, 0xCA273ECEEA26619CL, 0xD186B8C721C0C207L, 3163 0xEADA7DD6CDE0EB1EL, 0xF57D4F7FEE6ED178L, 0x06F067AA72176FBAL, 3164 0x0A637DC5A2C898A6L, 0x113F9804BEF90DAEL, 0x1B710B35131C471BL, 3165 0x28DB77F523047D84L, 0x32CAAB7B40C72493L, 0x3C9EBE0A15C9BEBCL, 3166 0x431D67C49C100D4CL, 0x4CC5D4BECB3E42B6L, 0x597F299CFC657E2AL, 3167 0x5FCB6FAB3AD6FAECL, 0x6C44198C4A475817L 3168 }; 3169 3170 // Double rounds for sha512. 3171 #define sha512_dround(dr, i0, i1, i2, i3, i4, rc0, rc1, in0, in1, in2, in3, in4) \ 3172 if (dr < 36) \ 3173 __ ld1(v##rc1, __ T2D, __ post(rscratch2, 16)); \ 3174 __ addv(v5, __ T2D, v##rc0, v##in0); \ 3175 __ ext(v6, __ T16B, v##i2, v##i3, 8); \ 3176 __ ext(v5, __ T16B, v5, v5, 8); \ 3177 __ ext(v7, __ T16B, v##i1, v##i2, 8); \ 3178 __ addv(v##i3, __ T2D, v##i3, v5); \ 3179 if (dr < 32) { \ 3180 __ ext(v5, __ T16B, v##in3, v##in4, 8); \ 3181 __ sha512su0(v##in0, __ T2D, v##in1); \ 3182 } \ 3183 __ sha512h(v##i3, __ T2D, v6, v7); \ 3184 if (dr < 32) \ 3185 __ sha512su1(v##in0, __ T2D, v##in2, v5); \ 3186 __ addv(v##i4, __ T2D, v##i1, v##i3); \ 3187 __ sha512h2(v##i3, __ T2D, v##i1, v##i0); \ 3188 3189 __ align(CodeEntryAlignment); 3190 StubCodeMark mark(this, "StubRoutines", name); 3191 address start = __ pc(); 3192 3193 Register buf = c_rarg0; 3194 Register state = c_rarg1; 3195 Register ofs = c_rarg2; 3196 Register limit = c_rarg3; 3197 3198 __ stpd(v8, v9, __ pre(sp, -64)); 3199 __ stpd(v10, v11, Address(sp, 16)); 3200 __ stpd(v12, v13, Address(sp, 32)); 3201 __ stpd(v14, v15, Address(sp, 48)); 3202 3203 Label sha512_loop; 3204 3205 // load state 3206 __ ld1(v8, v9, v10, v11, __ T2D, state); 3207 3208 // load first 4 round constants 3209 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3210 __ ld1(v20, v21, v22, v23, __ T2D, __ post(rscratch1, 64)); 3211 3212 __ BIND(sha512_loop); 3213 // load 128B of data into v12..v19 3214 __ ld1(v12, v13, v14, v15, __ T2D, __ post(buf, 64)); 3215 __ ld1(v16, v17, v18, v19, __ T2D, __ post(buf, 64)); 3216 __ rev64(v12, __ T16B, v12); 3217 __ rev64(v13, __ T16B, v13); 3218 __ rev64(v14, __ T16B, v14); 3219 __ rev64(v15, __ T16B, v15); 3220 __ rev64(v16, __ T16B, v16); 3221 __ rev64(v17, __ T16B, v17); 3222 __ rev64(v18, __ T16B, v18); 3223 __ rev64(v19, __ T16B, v19); 3224 3225 __ mov(rscratch2, rscratch1); 3226 3227 __ mov(v0, __ T16B, v8); 3228 __ mov(v1, __ T16B, v9); 3229 __ mov(v2, __ T16B, v10); 3230 __ mov(v3, __ T16B, v11); 3231 3232 sha512_dround( 0, 0, 1, 2, 3, 4, 20, 24, 12, 13, 19, 16, 17); 3233 sha512_dround( 1, 3, 0, 4, 2, 1, 21, 25, 13, 14, 12, 17, 18); 3234 sha512_dround( 2, 2, 3, 1, 4, 0, 22, 26, 14, 15, 13, 18, 19); 3235 sha512_dround( 3, 4, 2, 0, 1, 3, 23, 27, 15, 16, 14, 19, 12); 3236 sha512_dround( 4, 1, 4, 3, 0, 2, 24, 28, 16, 17, 15, 12, 13); 3237 sha512_dround( 5, 0, 1, 2, 3, 4, 25, 29, 17, 18, 16, 13, 14); 3238 sha512_dround( 6, 3, 0, 4, 2, 1, 26, 30, 18, 19, 17, 14, 15); 3239 sha512_dround( 7, 2, 3, 1, 4, 0, 27, 31, 19, 12, 18, 15, 16); 3240 sha512_dround( 8, 4, 2, 0, 1, 3, 28, 24, 12, 13, 19, 16, 17); 3241 sha512_dround( 9, 1, 4, 3, 0, 2, 29, 25, 13, 14, 12, 17, 18); 3242 sha512_dround(10, 0, 1, 2, 3, 4, 30, 26, 14, 15, 13, 18, 19); 3243 sha512_dround(11, 3, 0, 4, 2, 1, 31, 27, 15, 16, 14, 19, 12); 3244 sha512_dround(12, 2, 3, 1, 4, 0, 24, 28, 16, 17, 15, 12, 13); 3245 sha512_dround(13, 4, 2, 0, 1, 3, 25, 29, 17, 18, 16, 13, 14); 3246 sha512_dround(14, 1, 4, 3, 0, 2, 26, 30, 18, 19, 17, 14, 15); 3247 sha512_dround(15, 0, 1, 2, 3, 4, 27, 31, 19, 12, 18, 15, 16); 3248 sha512_dround(16, 3, 0, 
4, 2, 1, 28, 24, 12, 13, 19, 16, 17); 3249 sha512_dround(17, 2, 3, 1, 4, 0, 29, 25, 13, 14, 12, 17, 18); 3250 sha512_dround(18, 4, 2, 0, 1, 3, 30, 26, 14, 15, 13, 18, 19); 3251 sha512_dround(19, 1, 4, 3, 0, 2, 31, 27, 15, 16, 14, 19, 12); 3252 sha512_dround(20, 0, 1, 2, 3, 4, 24, 28, 16, 17, 15, 12, 13); 3253 sha512_dround(21, 3, 0, 4, 2, 1, 25, 29, 17, 18, 16, 13, 14); 3254 sha512_dround(22, 2, 3, 1, 4, 0, 26, 30, 18, 19, 17, 14, 15); 3255 sha512_dround(23, 4, 2, 0, 1, 3, 27, 31, 19, 12, 18, 15, 16); 3256 sha512_dround(24, 1, 4, 3, 0, 2, 28, 24, 12, 13, 19, 16, 17); 3257 sha512_dround(25, 0, 1, 2, 3, 4, 29, 25, 13, 14, 12, 17, 18); 3258 sha512_dround(26, 3, 0, 4, 2, 1, 30, 26, 14, 15, 13, 18, 19); 3259 sha512_dround(27, 2, 3, 1, 4, 0, 31, 27, 15, 16, 14, 19, 12); 3260 sha512_dround(28, 4, 2, 0, 1, 3, 24, 28, 16, 17, 15, 12, 13); 3261 sha512_dround(29, 1, 4, 3, 0, 2, 25, 29, 17, 18, 16, 13, 14); 3262 sha512_dround(30, 0, 1, 2, 3, 4, 26, 30, 18, 19, 17, 14, 15); 3263 sha512_dround(31, 3, 0, 4, 2, 1, 27, 31, 19, 12, 18, 15, 16); 3264 sha512_dround(32, 2, 3, 1, 4, 0, 28, 24, 12, 0, 0, 0, 0); 3265 sha512_dround(33, 4, 2, 0, 1, 3, 29, 25, 13, 0, 0, 0, 0); 3266 sha512_dround(34, 1, 4, 3, 0, 2, 30, 26, 14, 0, 0, 0, 0); 3267 sha512_dround(35, 0, 1, 2, 3, 4, 31, 27, 15, 0, 0, 0, 0); 3268 sha512_dround(36, 3, 0, 4, 2, 1, 24, 0, 16, 0, 0, 0, 0); 3269 sha512_dround(37, 2, 3, 1, 4, 0, 25, 0, 17, 0, 0, 0, 0); 3270 sha512_dround(38, 4, 2, 0, 1, 3, 26, 0, 18, 0, 0, 0, 0); 3271 sha512_dround(39, 1, 4, 3, 0, 2, 27, 0, 19, 0, 0, 0, 0); 3272 3273 __ addv(v8, __ T2D, v8, v0); 3274 __ addv(v9, __ T2D, v9, v1); 3275 __ addv(v10, __ T2D, v10, v2); 3276 __ addv(v11, __ T2D, v11, v3); 3277 3278 if (multi_block) { 3279 __ add(ofs, ofs, 128); 3280 __ cmp(ofs, limit); 3281 __ br(Assembler::LE, sha512_loop); 3282 __ mov(c_rarg0, ofs); // return ofs 3283 } 3284 3285 __ st1(v8, v9, v10, v11, __ T2D, state); 3286 3287 __ ldpd(v14, v15, Address(sp, 48)); 3288 __ ldpd(v12, v13, Address(sp, 32)); 3289 __ ldpd(v10, v11, Address(sp, 16)); 3290 __ ldpd(v8, v9, __ post(sp, 64)); 3291 3292 __ ret(lr); 3293 3294 return start; 3295 } 3296 3297 // Safefetch stubs. 3298 void generate_safefetch(const char* name, int size, address* entry, 3299 address* fault_pc, address* continuation_pc) { 3300 // safefetch signatures: 3301 // int SafeFetch32(int* adr, int errValue); 3302 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue); 3303 // 3304 // arguments: 3305 // c_rarg0 = adr 3306 // c_rarg1 = errValue 3307 // 3308 // result: 3309 // PPC_RET = *adr or errValue 3310 3311 StubCodeMark mark(this, "StubRoutines", name); 3312 3313 // Entry point, pc or function descriptor. 3314 *entry = __ pc(); 3315 3316 // Load *adr into c_rarg1, may fault. 
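// If that load faults, the signal handler recognises *fault_pc and
// resumes at *continuation_pc with c_rarg1 untouched, so errValue is
// returned instead. Typical use from VM code (illustrative values):
//
//   int v = SafeFetch32((int*) addr, -1);
//   if (v == -1) { /* addr was unreadable, or really held -1 */ }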
3317 *fault_pc = __ pc(); 3318 switch (size) { 3319 case 4: 3320 // int32_t 3321 __ ldrw(c_rarg1, Address(c_rarg0, 0)); 3322 break; 3323 case 8: 3324 // int64_t 3325 __ ldr(c_rarg1, Address(c_rarg0, 0)); 3326 break; 3327 default: 3328 ShouldNotReachHere(); 3329 } 3330 3331 // return errValue or *adr 3332 *continuation_pc = __ pc(); 3333 __ mov(r0, c_rarg1); 3334 __ ret(lr); 3335 } 3336 3337 /** 3338 * Arguments: 3339 * 3340 * Inputs: 3341 * c_rarg0 - int crc 3342 * c_rarg1 - byte* buf 3343 * c_rarg2 - int length 3344 * 3345 * Output: 3346 * r0 - int crc result 3347 */ 3348 address generate_updateBytesCRC32() { 3349 assert(UseCRC32Intrinsics, "what are we doing here?"); 3350 3351 __ align(CodeEntryAlignment); 3352 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32"); 3353 3354 address start = __ pc(); 3355 3356 const Register crc = c_rarg0; // crc 3357 const Register buf = c_rarg1; // source java byte array address 3358 const Register len = c_rarg2; // length 3359 const Register table0 = c_rarg3; // crc_table address 3360 const Register table1 = c_rarg4; 3361 const Register table2 = c_rarg5; 3362 const Register table3 = c_rarg6; 3363 const Register tmp3 = c_rarg7; 3364 3365 BLOCK_COMMENT("Entry:"); 3366 __ enter(); // required for proper stackwalking of RuntimeStub frame 3367 3368 __ kernel_crc32(crc, buf, len, 3369 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 3370 3371 __ leave(); // required for proper stackwalking of RuntimeStub frame 3372 __ ret(lr); 3373 3374 return start; 3375 } 3376 3377 /** 3378 * Arguments: 3379 * 3380 * Inputs: 3381 * c_rarg0 - int crc 3382 * c_rarg1 - byte* buf 3383 * c_rarg2 - int length 3384 * c_rarg3 - int* table 3385 * 3386 * Output: 3387 * r0 - int crc result 3388 */ 3389 address generate_updateBytesCRC32C() { 3390 assert(UseCRC32CIntrinsics, "what are we doing here?"); 3391 3392 __ align(CodeEntryAlignment); 3393 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C"); 3394 3395 address start = __ pc(); 3396 3397 const Register crc = c_rarg0; // crc 3398 const Register buf = c_rarg1; // source java byte array address 3399 const Register len = c_rarg2; // length 3400 const Register table0 = c_rarg3; // crc_table address 3401 const Register table1 = c_rarg4; 3402 const Register table2 = c_rarg5; 3403 const Register table3 = c_rarg6; 3404 const Register tmp3 = c_rarg7; 3405 3406 BLOCK_COMMENT("Entry:"); 3407 __ enter(); // required for proper stackwalking of RuntimeStub frame 3408 3409 __ kernel_crc32c(crc, buf, len, 3410 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 3411 3412 __ leave(); // required for proper stackwalking of RuntimeStub frame 3413 __ ret(lr); 3414 3415 return start; 3416 } 3417 3418 /** 3419 * Arguments: 3420 * 3421 * Inputs: 3422 * c_rarg0 - int adler 3423 * c_rarg1 - byte* buff 3424 * c_rarg2 - int len 3425 * 3426 * Output: 3427 * c_rarg0 - int adler result 3428 */ 3429 address generate_updateBytesAdler32() { 3430 __ align(CodeEntryAlignment); 3431 StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32"); 3432 address start = __ pc(); 3433 3434 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 3435 3436 // Aliases 3437 Register adler = c_rarg0; 3438 Register s1 = c_rarg0; 3439 Register s2 = c_rarg3; 3440 Register buff = c_rarg1; 3441 Register len = c_rarg2; 3442 Register nmax = r4; 3443 Register base = r5; 3444 Register count = r6; 3445 Register temp0 = rscratch1; 3446 Register temp1 = rscratch2; 3447 FloatRegister vbytes = v0; 3448
FloatRegister vs1acc = v1; 3449 FloatRegister vs2acc = v2; 3450 FloatRegister vtable = v3; 3451 3452 // Max number of bytes we can process before having to take the mod 3453 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 3454 uint64_t BASE = 0xfff1; 3455 uint64_t NMAX = 0x15B0; 3456 3457 __ mov(base, BASE); 3458 __ mov(nmax, NMAX); 3459 3460 // Load accumulation coefficients for the upper 16 bits 3461 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table)); 3462 __ ld1(vtable, __ T16B, Address(temp0)); 3463 3464 // s1 is initialized to the lower 16 bits of adler 3465 // s2 is initialized to the upper 16 bits of adler 3466 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 3467 __ uxth(s1, adler); // s1 = (adler & 0xffff) 3468 3469 // The pipelined loop needs at least 16 elements for 1 iteration 3470 // It does check this, but it is more effective to skip to the cleanup loop 3471 __ cmp(len, (u1)16); 3472 __ br(Assembler::HS, L_nmax); 3473 __ cbz(len, L_combine); 3474 3475 __ bind(L_simple_by1_loop); 3476 __ ldrb(temp0, Address(__ post(buff, 1))); 3477 __ add(s1, s1, temp0); 3478 __ add(s2, s2, s1); 3479 __ subs(len, len, 1); 3480 __ br(Assembler::HI, L_simple_by1_loop); 3481 3482 // s1 = s1 % BASE 3483 __ subs(temp0, s1, base); 3484 __ csel(s1, temp0, s1, Assembler::HS); 3485 3486 // s2 = s2 % BASE 3487 __ lsr(temp0, s2, 16); 3488 __ lsl(temp1, temp0, 4); 3489 __ sub(temp1, temp1, temp0); 3490 __ add(s2, temp1, s2, ext::uxth); 3491 3492 __ subs(temp0, s2, base); 3493 __ csel(s2, temp0, s2, Assembler::HS); 3494 3495 __ b(L_combine); 3496 3497 __ bind(L_nmax); 3498 __ subs(len, len, nmax); 3499 __ sub(count, nmax, 16); 3500 __ br(Assembler::LO, L_by16); 3501 3502 __ bind(L_nmax_loop); 3503 3504 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 3505 vbytes, vs1acc, vs2acc, vtable); 3506 3507 __ subs(count, count, 16); 3508 __ br(Assembler::HS, L_nmax_loop); 3509 3510 // s1 = s1 % BASE 3511 __ lsr(temp0, s1, 16); 3512 __ lsl(temp1, temp0, 4); 3513 __ sub(temp1, temp1, temp0); 3514 __ add(temp1, temp1, s1, ext::uxth); 3515 3516 __ lsr(temp0, temp1, 16); 3517 __ lsl(s1, temp0, 4); 3518 __ sub(s1, s1, temp0); 3519 __ add(s1, s1, temp1, ext:: uxth); 3520 3521 __ subs(temp0, s1, base); 3522 __ csel(s1, temp0, s1, Assembler::HS); 3523 3524 // s2 = s2 % BASE 3525 __ lsr(temp0, s2, 16); 3526 __ lsl(temp1, temp0, 4); 3527 __ sub(temp1, temp1, temp0); 3528 __ add(temp1, temp1, s2, ext::uxth); 3529 3530 __ lsr(temp0, temp1, 16); 3531 __ lsl(s2, temp0, 4); 3532 __ sub(s2, s2, temp0); 3533 __ add(s2, s2, temp1, ext:: uxth); 3534 3535 __ subs(temp0, s2, base); 3536 __ csel(s2, temp0, s2, Assembler::HS); 3537 3538 __ subs(len, len, nmax); 3539 __ sub(count, nmax, 16); 3540 __ br(Assembler::HS, L_nmax_loop); 3541 3542 __ bind(L_by16); 3543 __ adds(len, len, count); 3544 __ br(Assembler::LO, L_by1); 3545 3546 __ bind(L_by16_loop); 3547 3548 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 3549 vbytes, vs1acc, vs2acc, vtable); 3550 3551 __ subs(len, len, 16); 3552 __ br(Assembler::HS, L_by16_loop); 3553 3554 __ bind(L_by1); 3555 __ adds(len, len, 15); 3556 __ br(Assembler::LO, L_do_mod); 3557 3558 __ bind(L_by1_loop); 3559 __ ldrb(temp0, Address(__ post(buff, 1))); 3560 __ add(s1, temp0, s1); 3561 __ add(s2, s2, s1); 3562 __ subs(len, len, 1); 3563 __ br(Assembler::HS, L_by1_loop); 3564 3565 __ bind(L_do_mod); 3566 // s1 = s1 % BASE 3567 __ lsr(temp0, s1, 16); 3568 __ lsl(temp1, temp0, 4); 3569 __ sub(temp1, 
temp1, temp0); 3570 __ add(temp1, temp1, s1, ext::uxth); 3571 3572 __ lsr(temp0, temp1, 16); 3573 __ lsl(s1, temp0, 4); 3574 __ sub(s1, s1, temp0); 3575 __ add(s1, s1, temp1, ext::uxth); 3576 3577 __ subs(temp0, s1, base); 3578 __ csel(s1, temp0, s1, Assembler::HS); 3579 3580 // s2 = s2 % BASE 3581 __ lsr(temp0, s2, 16); 3582 __ lsl(temp1, temp0, 4); 3583 __ sub(temp1, temp1, temp0); 3584 __ add(temp1, temp1, s2, ext::uxth); 3585 3586 __ lsr(temp0, temp1, 16); 3587 __ lsl(s2, temp0, 4); 3588 __ sub(s2, s2, temp0); 3589 __ add(s2, s2, temp1, ext::uxth); 3590 3591 __ subs(temp0, s2, base); 3592 __ csel(s2, temp0, s2, Assembler::HS); 3593 3594 // Combine lower bits and higher bits 3595 __ bind(L_combine); 3596 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) 3597 3598 __ ret(lr); 3599 3600 return start; 3601 } 3602 3603 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff, 3604 Register temp0, Register temp1, FloatRegister vbytes, 3605 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) { 3606 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes. 3607 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration. 3608 // In non-vectorized code, we update s1 and s2 as: 3609 // s1 <- s1 + b1 3610 // s2 <- s2 + s1 3611 // s1 <- s1 + b2 3612 // s2 <- s2 + s1 3613 // ... 3614 // s1 <- s1 + b16 3615 // s2 <- s2 + s1 3616 // Putting the above assignments together, we have: 3617 // s1_new = s1 + b1 + b2 + ... + b16 3618 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16) 3619 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1) 3620 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1) 3621 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16))); 3622 3623 // s2 = s2 + s1 * 16 3624 __ add(s2, s2, s1, Assembler::LSL, 4); 3625 3626 // vs1acc = b1 + b2 + b3 + ... + b16 3627 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ...
+ (b16 * 1) 3628 __ umullv(vs2acc, __ T8B, vtable, vbytes); 3629 __ umlalv(vs2acc, __ T16B, vtable, vbytes); 3630 __ uaddlv(vs1acc, __ T16B, vbytes); 3631 __ uaddlv(vs2acc, __ T8H, vs2acc); 3632 3633 // s1 = s1 + vs1acc, s2 = s2 + vs2acc 3634 __ fmovd(temp0, vs1acc); 3635 __ fmovd(temp1, vs2acc); 3636 __ add(s1, s1, temp0); 3637 __ add(s2, s2, temp1); 3638 } 3639 3640 /** 3641 * Arguments: 3642 * 3643 * Input: 3644 * c_rarg0 - x address 3645 * c_rarg1 - x length 3646 * c_rarg2 - y address 3647 * c_rarg3 - y length 3648 * c_rarg4 - z address 3649 * c_rarg5 - z length 3650 */ 3651 address generate_multiplyToLen() { 3652 __ align(CodeEntryAlignment); 3653 StubCodeMark mark(this, "StubRoutines", "multiplyToLen"); 3654 3655 address start = __ pc(); 3656 const Register x = r0; 3657 const Register xlen = r1; 3658 const Register y = r2; 3659 const Register ylen = r3; 3660 const Register z = r4; 3661 const Register zlen = r5; 3662 3663 const Register tmp1 = r10; 3664 const Register tmp2 = r11; 3665 const Register tmp3 = r12; 3666 const Register tmp4 = r13; 3667 const Register tmp5 = r14; 3668 const Register tmp6 = r15; 3669 const Register tmp7 = r16; 3670 3671 BLOCK_COMMENT("Entry:"); 3672 __ enter(); // required for proper stackwalking of RuntimeStub frame 3673 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 3674 __ leave(); // required for proper stackwalking of RuntimeStub frame 3675 __ ret(lr); 3676 3677 return start; 3678 } 3679 3680 address generate_squareToLen() { 3681 // The squareToLen algorithm for sizes 1..127, as described in the Java code, works 3682 // faster than multiply_to_len on some CPUs and slower on others, but 3683 // multiply_to_len shows slightly better overall results 3684 __ align(CodeEntryAlignment); 3685 StubCodeMark mark(this, "StubRoutines", "squareToLen"); 3686 address start = __ pc(); 3687 3688 const Register x = r0; 3689 const Register xlen = r1; 3690 const Register z = r2; 3691 const Register zlen = r3; 3692 const Register y = r4; // == x 3693 const Register ylen = r5; // == xlen 3694 3695 const Register tmp1 = r10; 3696 const Register tmp2 = r11; 3697 const Register tmp3 = r12; 3698 const Register tmp4 = r13; 3699 const Register tmp5 = r14; 3700 const Register tmp6 = r15; 3701 const Register tmp7 = r16; 3702 3703 RegSet spilled_regs = RegSet::of(y, ylen); 3704 BLOCK_COMMENT("Entry:"); 3705 __ enter(); 3706 __ push(spilled_regs, sp); 3707 __ mov(y, x); 3708 __ mov(ylen, xlen); 3709 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 3710 __ pop(spilled_regs, sp); 3711 __ leave(); 3712 __ ret(lr); 3713 return start; 3714 } 3715 3716 address generate_mulAdd() { 3717 __ align(CodeEntryAlignment); 3718 StubCodeMark mark(this, "StubRoutines", "mulAdd"); 3719 3720 address start = __ pc(); 3721 3722 const Register out = r0; 3723 const Register in = r1; 3724 const Register offset = r2; 3725 const Register len = r3; 3726 const Register k = r4; 3727 3728 BLOCK_COMMENT("Entry:"); 3729 __ enter(); 3730 __ mul_add(out, in, offset, len, k); 3731 __ leave(); 3732 __ ret(lr); 3733 3734 return start; 3735 } 3736 3737 void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi, 3738 FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0, 3739 FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) { 3740 // Karatsuba multiplication performs a 128*128 -> 256-bit 3741 // multiplication in three 128-bit multiplications and a few 3742 // additions.
3743 // 3744 // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1) 3745 // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0 3746 // 3747 // Inputs: 3748 // 3749 // A0 in a.d[0] (subkey) 3750 // A1 in a.d[1] 3751 // (A1+A0) in a1_xor_a0.d[0] 3752 // 3753 // B0 in b.d[0] (state) 3754 // B1 in b.d[1] 3755 3756 __ ext(tmp1, __ T16B, b, b, 0x08); 3757 __ pmull2(result_hi, __ T1Q, b, a, __ T2D); // A1*B1 3758 __ eor(tmp1, __ T16B, tmp1, b); // (B1+B0) 3759 __ pmull(result_lo, __ T1Q, b, a, __ T1D); // A0*B0 3760 __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0) 3761 3762 __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08); 3763 __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0 3764 __ eor(tmp2, __ T16B, tmp2, tmp4); 3765 __ eor(tmp2, __ T16B, tmp2, tmp3); 3766 3767 // Register pair <result_hi:result_lo> holds the result of carry-less multiplication 3768 __ ins(result_hi, __ D, tmp2, 0, 1); 3769 __ ins(result_lo, __ D, tmp2, 1, 0); 3770 } 3771 3772 void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi, 3773 FloatRegister p, FloatRegister z, FloatRegister t1) { 3774 const FloatRegister t0 = result; 3775 3776 // The GCM field polynomial f is z^128 + p(z), where p = 3777 // z^7+z^2+z+1. 3778 // 3779 // z^128 === -p(z) (mod (z^128 + p(z))) 3780 // 3781 // so, given that the product we're reducing is 3782 // a == lo + hi * z^128 3783 // substituting, 3784 // === lo - hi * p(z) (mod (z^128 + p(z))) 3785 // 3786 // we reduce by multiplying hi by p(z) and subtracting the result 3787 // from (i.e. XORing it with) lo. Because p has no nonzero high 3788 // bits we can do this with two 64-bit multiplications, lo*p and 3789 // hi*p. 3790 3791 __ pmull2(t0, __ T1Q, hi, p, __ T2D); 3792 __ ext(t1, __ T16B, t0, z, 8); 3793 __ eor(hi, __ T16B, hi, t1); 3794 __ ext(t1, __ T16B, z, t0, 8); 3795 __ eor(lo, __ T16B, lo, t1); 3796 __ pmull(t0, __ T1Q, hi, p, __ T1D); 3797 __ eor(result, __ T16B, lo, t0); 3798 } 3799 3800 address generate_has_negatives(address &has_negatives_long) { 3801 const u1 large_loop_size = 64; 3802 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 3803 int dcache_line = VM_Version::dcache_line_size(); 3804 3805 Register ary1 = r1, len = r2, result = r0; 3806 3807 __ align(CodeEntryAlignment); 3808 3809 StubCodeMark mark(this, "StubRoutines", "has_negatives"); 3810 3811 address entry = __ pc(); 3812 3813 __ enter(); 3814 3815 Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE, 3816 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 3817 3818 __ cmp(len, (u1)15); 3819 __ br(Assembler::GT, LEN_OVER_15); 3820 // The only case when execution falls into this code is when pointer is near 3821 // the end of memory page and we have to avoid reading next page 3822 __ add(ary1, ary1, len); 3823 __ subs(len, len, 8); 3824 __ br(Assembler::GT, LEN_OVER_8); 3825 __ ldr(rscratch2, Address(ary1, -8)); 3826 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 
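// Illustrative example (assuming the little-endian layout of the ldr above): if the
// original len was 3, that load read the 8 bytes ending at the array's last byte and
// len is now 3 - 8 = -5, so rscratch1 = 40; the lsrv below then shifts out the five
// low-order bytes that precede the array, leaving only the three valid bytes to be
// tested against UPPER_BIT_MASK.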
3827 __ lsrv(rscratch2, rscratch2, rscratch1); 3828 __ tst(rscratch2, UPPER_BIT_MASK); 3829 __ cset(result, Assembler::NE); 3830 __ leave(); 3831 __ ret(lr); 3832 __ bind(LEN_OVER_8); 3833 __ ldp(rscratch1, rscratch2, Address(ary1, -16)); 3834 __ sub(len, len, 8); // no data dependency, so the sub can execute while the load is in flight 3835 __ tst(rscratch2, UPPER_BIT_MASK); 3836 __ br(Assembler::NE, RET_TRUE_NO_POP); 3837 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes 3838 __ lsrv(rscratch1, rscratch1, rscratch2); 3839 __ tst(rscratch1, UPPER_BIT_MASK); 3840 __ cset(result, Assembler::NE); 3841 __ leave(); 3842 __ ret(lr); 3843 3844 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10; 3845 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6; 3846 3847 has_negatives_long = __ pc(); // 2nd entry point 3848 3849 __ enter(); 3850 3851 __ bind(LEN_OVER_15); 3852 __ push(spilled_regs, sp); 3853 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment 3854 __ cbz(rscratch2, ALIGNED); 3855 __ ldp(tmp6, tmp1, Address(ary1)); 3856 __ mov(tmp5, 16); 3857 __ sub(rscratch1, tmp5, rscratch2); // number of bytes until the aligned address 3858 __ add(ary1, ary1, rscratch1); 3859 __ sub(len, len, rscratch1); 3860 __ orr(tmp6, tmp6, tmp1); 3861 __ tst(tmp6, UPPER_BIT_MASK); 3862 __ br(Assembler::NE, RET_TRUE); 3863 3864 __ bind(ALIGNED); 3865 __ cmp(len, large_loop_size); 3866 __ br(Assembler::LT, CHECK_16); 3867 // Perform a 16-byte load as an early return in the pre-loop to handle the case 3868 // where an initially aligned large array has negative values in its starting bytes, 3869 // since LARGE_LOOP would then do 4 reads instead of 1 (in the worst case), which is 3870 // slower. Cases with negative bytes further ahead won't be affected that 3871 // much. In fact, they'll be faster due to the early loads and the fewer instructions 3872 // and branches in LARGE_LOOP. 3873 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16))); 3874 __ sub(len, len, 16); 3875 __ orr(tmp6, tmp6, tmp1); 3876 __ tst(tmp6, UPPER_BIT_MASK); 3877 __ br(Assembler::NE, RET_TRUE); 3878 __ cmp(len, large_loop_size); 3879 __ br(Assembler::LT, CHECK_16); 3880 3881 if (SoftwarePrefetchHintDistance >= 0 3882 && SoftwarePrefetchHintDistance >= dcache_line) { 3883 // initial prefetch 3884 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line)); 3885 } 3886 __ bind(LARGE_LOOP); 3887 if (SoftwarePrefetchHintDistance >= 0) { 3888 __ prfm(Address(ary1, SoftwarePrefetchHintDistance)); 3889 } 3890 // Issue the load instructions first, since this can save a few CPU/MEM cycles; also, 3891 // instead of 4 triples of "orr(...); andr(...); cbnz(...);" (one per ldp) it is 3892 // better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which saves 3 3893 // instructions per loop iteration and has fewer branches, but this approach disables 3894 // early return, so all 64 bytes are loaded and checked every time.
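// For reference (illustrative only, not generated code): the loop below is the wide
// equivalent of the byte-wise check
//   for (int i = 0; i < len; i++) { if (ary1[i] & 0x80) return true; }
// performed 64 bytes at a time.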
3895 __ ldp(tmp2, tmp3, Address(ary1)); 3896 __ ldp(tmp4, tmp5, Address(ary1, 16)); 3897 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 3898 __ ldp(tmp6, tmp1, Address(ary1, 48)); 3899 __ add(ary1, ary1, large_loop_size); 3900 __ sub(len, len, large_loop_size); 3901 __ orr(tmp2, tmp2, tmp3); 3902 __ orr(tmp4, tmp4, tmp5); 3903 __ orr(rscratch1, rscratch1, rscratch2); 3904 __ orr(tmp6, tmp6, tmp1); 3905 __ orr(tmp2, tmp2, tmp4); 3906 __ orr(rscratch1, rscratch1, tmp6); 3907 __ orr(tmp2, tmp2, rscratch1); 3908 __ tst(tmp2, UPPER_BIT_MASK); 3909 __ br(Assembler::NE, RET_TRUE); 3910 __ cmp(len, large_loop_size); 3911 __ br(Assembler::GE, LARGE_LOOP); 3912 3913 __ bind(CHECK_16); // small 16-byte load pre-loop 3914 __ cmp(len, (u1)16); 3915 __ br(Assembler::LT, POST_LOOP16); 3916 3917 __ bind(LOOP16); // small 16-byte load loop 3918 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 3919 __ sub(len, len, 16); 3920 __ orr(tmp2, tmp2, tmp3); 3921 __ tst(tmp2, UPPER_BIT_MASK); 3922 __ br(Assembler::NE, RET_TRUE); 3923 __ cmp(len, (u1)16); 3924 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 3925 3926 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 3927 __ cmp(len, (u1)8); 3928 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 3929 __ ldr(tmp3, Address(__ post(ary1, 8))); 3930 __ sub(len, len, 8); 3931 __ tst(tmp3, UPPER_BIT_MASK); 3932 __ br(Assembler::NE, RET_TRUE); 3933 3934 __ bind(POST_LOOP16_LOAD_TAIL); 3935 __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0 3936 __ ldr(tmp1, Address(ary1)); 3937 __ mov(tmp2, 64); 3938 __ sub(tmp4, tmp2, len, __ LSL, 3); 3939 __ lslv(tmp1, tmp1, tmp4); 3940 __ tst(tmp1, UPPER_BIT_MASK); 3941 __ br(Assembler::NE, RET_TRUE); 3942 // Fallthrough 3943 3944 __ bind(RET_FALSE); 3945 __ pop(spilled_regs, sp); 3946 __ leave(); 3947 __ mov(result, zr); 3948 __ ret(lr); 3949 3950 __ bind(RET_TRUE); 3951 __ pop(spilled_regs, sp); 3952 __ bind(RET_TRUE_NO_POP); 3953 __ leave(); 3954 __ mov(result, 1); 3955 __ ret(lr); 3956 3957 __ bind(DONE); 3958 __ pop(spilled_regs, sp); 3959 __ leave(); 3960 __ ret(lr); 3961 return entry; 3962 } 3963 3964 void generate_large_array_equals_loop_nonsimd(int loopThreshold, 3965 bool usePrefetch, Label &NOT_EQUAL) { 3966 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 3967 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 3968 tmp7 = r12, tmp8 = r13; 3969 Label LOOP; 3970 3971 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3972 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3973 __ bind(LOOP); 3974 if (usePrefetch) { 3975 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 3976 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 3977 } 3978 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 3979 __ eor(tmp1, tmp1, tmp2); 3980 __ eor(tmp3, tmp3, tmp4); 3981 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 3982 __ orr(tmp1, tmp1, tmp3); 3983 __ cbnz(tmp1, NOT_EQUAL); 3984 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3985 __ eor(tmp5, tmp5, tmp6); 3986 __ eor(tmp7, tmp7, tmp8); 3987 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3988 __ orr(tmp5, tmp5, tmp7); 3989 __ cbnz(tmp5, NOT_EQUAL); 3990 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 3991 __ eor(tmp1, tmp1, tmp2); 3992 __ eor(tmp3, tmp3, tmp4); 3993 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 3994 __ orr(tmp1, tmp1, tmp3); 3995 __ cbnz(tmp1, NOT_EQUAL); 3996 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3997 __ eor(tmp5, tmp5, tmp6); 
3998 __ sub(cnt1, cnt1, 8 * wordSize); 3999 __ eor(tmp7, tmp7, tmp8); 4000 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 4001 // tmp6 is not used. MacroAssembler::subs is used here (rather than 4002 // cmp) because subs allows an unrestricted range of immediate operands. 4003 __ subs(tmp6, cnt1, loopThreshold); 4004 __ orr(tmp5, tmp5, tmp7); 4005 __ cbnz(tmp5, NOT_EQUAL); 4006 __ br(__ GE, LOOP); 4007 // post-loop 4008 __ eor(tmp1, tmp1, tmp2); 4009 __ eor(tmp3, tmp3, tmp4); 4010 __ orr(tmp1, tmp1, tmp3); 4011 __ sub(cnt1, cnt1, 2 * wordSize); 4012 __ cbnz(tmp1, NOT_EQUAL); 4013 } 4014 4015 void generate_large_array_equals_loop_simd(int loopThreshold, 4016 bool usePrefetch, Label &NOT_EQUAL) { 4017 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 4018 tmp2 = rscratch2; 4019 Label LOOP; 4020 4021 __ bind(LOOP); 4022 if (usePrefetch) { 4023 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 4024 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 4025 } 4026 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize))); 4027 __ sub(cnt1, cnt1, 8 * wordSize); 4028 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize))); 4029 __ subs(tmp1, cnt1, loopThreshold); 4030 __ eor(v0, __ T16B, v0, v4); 4031 __ eor(v1, __ T16B, v1, v5); 4032 __ eor(v2, __ T16B, v2, v6); 4033 __ eor(v3, __ T16B, v3, v7); 4034 __ orr(v0, __ T16B, v0, v1); 4035 __ orr(v1, __ T16B, v2, v3); 4036 __ orr(v0, __ T16B, v0, v1); 4037 __ umov(tmp1, v0, __ D, 0); 4038 __ umov(tmp2, v0, __ D, 1); 4039 __ orr(tmp1, tmp1, tmp2); 4040 __ cbnz(tmp1, NOT_EQUAL); 4041 __ br(__ GE, LOOP); 4042 } 4043 4044 // a1 = r1 - array1 address 4045 // a2 = r2 - array2 address 4046 // result = r0 - return value. Already contains "false" 4047 // cnt1 = r10 - number of elements left to check, reduced by wordSize 4048 // r3-r5 are reserved temporary registers 4049 address generate_large_array_equals() { 4050 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 4051 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 4052 tmp7 = r12, tmp8 = r13; 4053 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP, 4054 SMALL_LOOP, POST_LOOP; 4055 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16; 4056 // threshold chosen so that at least 32 of the prefetched bytes are actually used 4057 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32; 4058 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE); 4059 RegSet spilled_regs = RegSet::range(tmp6, tmp8); 4060 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4, 4061 tmp5, tmp6, tmp7, tmp8); 4062 4063 __ align(CodeEntryAlignment); 4064 4065 StubCodeMark mark(this, "StubRoutines", "large_array_equals"); 4066 4067 address entry = __ pc(); 4068 __ enter(); 4069 __ sub(cnt1, cnt1, wordSize); // the first 8 bytes were loaded outside of the stub 4070 // also advance pointers to use post-increment instead of pre-increment 4071 __ add(a1, a1, wordSize); 4072 __ add(a2, a2, wordSize); 4073 if (AvoidUnalignedAccesses) { 4074 // both implementations (SIMD/non-SIMD) use relatively large load 4075 // instructions (ld1/ldp), which carry a significant penalty (up to 2x execution time) 4076 // on some CPUs when the address is not at least 16-byte aligned. 4077 // Arrays are currently 8-byte aligned, so, if needed, we do an additional 8-byte 4078 // load for the 1st address to make it 16-byte aligned.
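// For example (illustrative): if a1 % 16 == 8, the tbz below falls through, one
// 8-byte word from each array is compared immediately, and the remaining accesses
// through a1 start on a 16-byte boundary; a2's own alignment is not adjusted.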
4079 Label ALIGNED16; 4080 __ tbz(a1, 3, ALIGNED16); 4081 __ ldr(tmp1, Address(__ post(a1, wordSize))); 4082 __ ldr(tmp2, Address(__ post(a2, wordSize))); 4083 __ sub(cnt1, cnt1, wordSize); 4084 __ eor(tmp1, tmp1, tmp2); 4085 __ cbnz(tmp1, NOT_EQUAL_NO_POP); 4086 __ bind(ALIGNED16); 4087 } 4088 if (UseSIMDForArrayEquals) { 4089 if (SoftwarePrefetchHintDistance >= 0) { 4090 __ subs(tmp1, cnt1, prefetchLoopThreshold); 4091 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 4092 generate_large_array_equals_loop_simd(prefetchLoopThreshold, 4093 /* prfm = */ true, NOT_EQUAL); 4094 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 4095 __ br(__ LT, TAIL); 4096 } 4097 __ bind(NO_PREFETCH_LARGE_LOOP); 4098 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold, 4099 /* prfm = */ false, NOT_EQUAL); 4100 } else { 4101 __ push(spilled_regs, sp); 4102 if (SoftwarePrefetchHintDistance >= 0) { 4103 __ subs(tmp1, cnt1, prefetchLoopThreshold); 4104 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 4105 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold, 4106 /* prfm = */ true, NOT_EQUAL); 4107 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 4108 __ br(__ LT, TAIL); 4109 } 4110 __ bind(NO_PREFETCH_LARGE_LOOP); 4111 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold, 4112 /* prfm = */ false, NOT_EQUAL); 4113 } 4114 __ bind(TAIL); 4115 __ cbz(cnt1, EQUAL); 4116 __ subs(cnt1, cnt1, wordSize); 4117 __ br(__ LE, POST_LOOP); 4118 __ bind(SMALL_LOOP); 4119 __ ldr(tmp1, Address(__ post(a1, wordSize))); 4120 __ ldr(tmp2, Address(__ post(a2, wordSize))); 4121 __ subs(cnt1, cnt1, wordSize); 4122 __ eor(tmp1, tmp1, tmp2); 4123 __ cbnz(tmp1, NOT_EQUAL); 4124 __ br(__ GT, SMALL_LOOP); 4125 __ bind(POST_LOOP); 4126 __ ldr(tmp1, Address(a1, cnt1)); 4127 __ ldr(tmp2, Address(a2, cnt1)); 4128 __ eor(tmp1, tmp1, tmp2); 4129 __ cbnz(tmp1, NOT_EQUAL); 4130 __ bind(EQUAL); 4131 __ mov(result, true); 4132 __ bind(NOT_EQUAL); 4133 if (!UseSIMDForArrayEquals) { 4134 __ pop(spilled_regs, sp); 4135 } 4136 __ bind(NOT_EQUAL_NO_POP); 4137 __ leave(); 4138 __ ret(lr); 4139 return entry; 4140 } 4141 4142 address generate_dsin_dcos(bool isCos) { 4143 __ align(CodeEntryAlignment); 4144 StubCodeMark mark(this, "StubRoutines", isCos ? 
"libmDcos" : "libmDsin"); 4145 address start = __ pc(); 4146 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw, 4147 (address)StubRoutines::aarch64::_two_over_pi, 4148 (address)StubRoutines::aarch64::_pio2, 4149 (address)StubRoutines::aarch64::_dsin_coef, 4150 (address)StubRoutines::aarch64::_dcos_coef); 4151 return start; 4152 } 4153 4154 address generate_dlog() { 4155 __ align(CodeEntryAlignment); 4156 StubCodeMark mark(this, "StubRoutines", "dlog"); 4157 address entry = __ pc(); 4158 FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4, 4159 vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19; 4160 Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4; 4161 __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3, 4162 tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5); 4163 return entry; 4164 } 4165 4166 // code for comparing 16 bytes of strings with same encoding 4167 void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) { 4168 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11; 4169 __ ldr(rscratch1, Address(__ post(str1, 8))); 4170 __ eor(rscratch2, tmp1, tmp2); 4171 __ ldr(cnt1, Address(__ post(str2, 8))); 4172 __ cbnz(rscratch2, DIFF1); 4173 __ ldr(tmp1, Address(__ post(str1, 8))); 4174 __ eor(rscratch2, rscratch1, cnt1); 4175 __ ldr(tmp2, Address(__ post(str2, 8))); 4176 __ cbnz(rscratch2, DIFF2); 4177 } 4178 4179 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding 4180 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 4181 Label &DIFF2) { 4182 Register cnt1 = r2, tmp2 = r11, tmp3 = r12; 4183 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; 4184 4185 __ ldrq(vtmp, Address(__ post(tmp2, 16))); 4186 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4187 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 4188 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 4189 4190 __ fmovd(tmpL, vtmp3); 4191 __ eor(rscratch2, tmp3, tmpL); 4192 __ cbnz(rscratch2, DIFF2); 4193 4194 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4195 __ umov(tmpL, vtmp3, __ D, 1); 4196 __ eor(rscratch2, tmpU, tmpL); 4197 __ cbnz(rscratch2, DIFF1); 4198 4199 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 4200 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4201 __ fmovd(tmpL, vtmp); 4202 __ eor(rscratch2, tmp3, tmpL); 4203 __ cbnz(rscratch2, DIFF2); 4204 4205 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4206 __ umov(tmpL, vtmp, __ D, 1); 4207 __ eor(rscratch2, tmpU, tmpL); 4208 __ cbnz(rscratch2, DIFF1); 4209 } 4210 4211 // r0 = result 4212 // r1 = str1 4213 // r2 = cnt1 4214 // r3 = str2 4215 // r4 = cnt2 4216 // r10 = tmp1 4217 // r11 = tmp2 4218 address generate_compare_long_string_different_encoding(bool isLU) { 4219 __ align(CodeEntryAlignment); 4220 StubCodeMark mark(this, "StubRoutines", isLU 4221 ? 
"compare_long_string_different_encoding LU" 4222 : "compare_long_string_different_encoding UL"); 4223 address entry = __ pc(); 4224 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, 4225 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH, 4226 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; 4227 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 4228 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; 4229 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; 4230 RegSet spilled_regs = RegSet::of(tmp3, tmp4); 4231 4232 int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2); 4233 4234 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); 4235 // cnt2 == amount of characters left to compare 4236 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL)) 4237 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 4238 __ add(str1, str1, isLU ? wordSize/2 : wordSize); 4239 __ add(str2, str2, isLU ? wordSize : wordSize/2); 4240 __ fmovd(isLU ? tmp1 : tmp2, vtmp); 4241 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case. 4242 __ eor(rscratch2, tmp1, tmp2); 4243 __ mov(rscratch1, tmp2); 4244 __ cbnz(rscratch2, CALCULATE_DIFFERENCE); 4245 Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison 4246 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison 4247 __ push(spilled_regs, sp); 4248 __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load 4249 __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load 4250 4251 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4252 4253 if (SoftwarePrefetchHintDistance >= 0) { 4254 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 4255 __ br(__ LT, NO_PREFETCH); 4256 __ bind(LARGE_LOOP_PREFETCH); 4257 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance)); 4258 __ mov(tmp4, 2); 4259 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 4260 __ bind(LARGE_LOOP_PREFETCH_REPEAT1); 4261 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4262 __ subs(tmp4, tmp4, 1); 4263 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1); 4264 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 4265 __ mov(tmp4, 2); 4266 __ bind(LARGE_LOOP_PREFETCH_REPEAT2); 4267 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4268 __ subs(tmp4, tmp4, 1); 4269 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2); 4270 __ sub(cnt2, cnt2, 64); 4271 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 4272 __ br(__ GE, LARGE_LOOP_PREFETCH); 4273 } 4274 __ cbz(cnt2, LOAD_LAST); // no characters left except last load 4275 __ bind(NO_PREFETCH); 4276 __ subs(cnt2, cnt2, 16); 4277 __ br(__ LT, TAIL); 4278 __ align(OptoLoopAlignment); 4279 __ bind(SMALL_LOOP); // smaller loop 4280 __ subs(cnt2, cnt2, 16); 4281 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4282 __ br(__ GE, SMALL_LOOP); 4283 __ cmn(cnt2, (u1)16); 4284 __ br(__ EQ, LOAD_LAST); 4285 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters) 4286 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string 4287 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string 4288 __ ldr(tmp3, Address(cnt1, -8)); 4289 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load 4290 __ b(LOAD_LAST); 4291 __ bind(DIFF2); 4292 __ mov(tmpU, tmp3); 4293 __ bind(DIFF1); 4294 __ pop(spilled_regs, sp); 4295 __ b(CALCULATE_DIFFERENCE); 4296 __ bind(LOAD_LAST); 4297 // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU. 
4298 // No need to load it again 4299 __ mov(tmpU, tmp3); 4300 __ pop(spilled_regs, sp); 4301 4302 // tmp2 points to the address of the last 4 Latin1 characters right now 4303 __ ldrs(vtmp, Address(tmp2)); 4304 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 4305 __ fmovd(tmpL, vtmp); 4306 4307 __ eor(rscratch2, tmpU, tmpL); 4308 __ cbz(rscratch2, DONE); 4309 4310 // Find the first different characters in the longwords and 4311 // compute their difference. 4312 __ bind(CALCULATE_DIFFERENCE); 4313 __ rev(rscratch2, rscratch2); 4314 __ clz(rscratch2, rscratch2); 4315 __ andr(rscratch2, rscratch2, -16); 4316 __ lsrv(tmp1, tmp1, rscratch2); 4317 __ uxthw(tmp1, tmp1); 4318 __ lsrv(rscratch1, rscratch1, rscratch2); 4319 __ uxthw(rscratch1, rscratch1); 4320 __ subw(result, tmp1, rscratch1); 4321 __ bind(DONE); 4322 __ ret(lr); 4323 return entry; 4324 } 4325 4326 address generate_method_entry_barrier() { 4327 __ align(CodeEntryAlignment); 4328 StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier"); 4329 4330 Label deoptimize_label; 4331 4332 address start = __ pc(); 4333 4334 __ set_last_Java_frame(sp, rfp, lr, rscratch1); 4335 4336 __ enter(); 4337 __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr 4338 4339 __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc} 4340 4341 __ push_call_clobbered_registers(); 4342 4343 __ mov(c_rarg0, rscratch2); 4344 __ call_VM_leaf 4345 (CAST_FROM_FN_PTR 4346 (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1); 4347 4348 __ reset_last_Java_frame(true); 4349 4350 __ mov(rscratch1, r0); 4351 4352 __ pop_call_clobbered_registers(); 4353 4354 __ cbnz(rscratch1, deoptimize_label); 4355 4356 __ leave(); 4357 __ ret(lr); 4358 4359 __ BIND(deoptimize_label); 4360 4361 __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize)); 4362 __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize)); 4363 4364 __ mov(sp, rscratch1); 4365 __ br(rscratch2); 4366 4367 return start; 4368 } 4369 4370 // r0 = result 4371 // r1 = str1 4372 // r2 = cnt1 4373 // r3 = str2 4374 // r4 = cnt2 4375 // r10 = tmp1 4376 // r11 = tmp2 4377 address generate_compare_long_string_same_encoding(bool isLL) { 4378 __ align(CodeEntryAlignment); 4379 StubCodeMark mark(this, "StubRoutines", isLL 4380 ? "compare_long_string_same_encoding LL" 4381 : "compare_long_string_same_encoding UU"); 4382 address entry = __ pc(); 4383 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 4384 tmp1 = r10, tmp2 = r11; 4385 Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL, 4386 LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF, 4387 DIFF_LAST_POSITION, DIFF_LAST_POSITION2; 4388 // exit from large loop when less than 64 bytes left to read or we're about 4389 // to prefetch memory behind array border 4390 int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); 4391 // cnt1/cnt2 contains amount of characters to compare. cnt1 can be re-used 4392 // update cnt2 counter with already loaded 8 bytes 4393 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2)); 4394 // update pointers, because of previous read 4395 __ add(str1, str1, wordSize); 4396 __ add(str2, str2, wordSize); 4397 if (SoftwarePrefetchHintDistance >= 0) { 4398 __ bind(LARGE_LOOP_PREFETCH); 4399 __ prfm(Address(str1, SoftwarePrefetchHintDistance)); 4400 __ prfm(Address(str2, SoftwarePrefetchHintDistance)); 4401 compare_string_16_bytes_same(DIFF, DIFF2); 4402 compare_string_16_bytes_same(DIFF, DIFF2); 4403 __ sub(cnt2, cnt2, isLL ? 
64 : 32); 4404 compare_string_16_bytes_same(DIFF, DIFF2); 4405 __ subs(rscratch2, cnt2, largeLoopExitCondition); 4406 compare_string_16_bytes_same(DIFF, DIFF2); 4407 __ br(__ GT, LARGE_LOOP_PREFETCH); 4408 __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left? 4409 } 4410 // less than 16 bytes left? 4411 __ subs(cnt2, cnt2, isLL ? 16 : 8); 4412 __ br(__ LT, TAIL); 4413 __ align(OptoLoopAlignment); 4414 __ bind(SMALL_LOOP); 4415 compare_string_16_bytes_same(DIFF, DIFF2); 4416 __ subs(cnt2, cnt2, isLL ? 16 : 8); 4417 __ br(__ GE, SMALL_LOOP); 4418 __ bind(TAIL); 4419 __ adds(cnt2, cnt2, isLL ? 16 : 8); 4420 __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF); 4421 __ subs(cnt2, cnt2, isLL ? 8 : 4); 4422 __ br(__ LE, CHECK_LAST); 4423 __ eor(rscratch2, tmp1, tmp2); 4424 __ cbnz(rscratch2, DIFF); 4425 __ ldr(tmp1, Address(__ post(str1, 8))); 4426 __ ldr(tmp2, Address(__ post(str2, 8))); 4427 __ sub(cnt2, cnt2, isLL ? 8 : 4); 4428 __ bind(CHECK_LAST); 4429 if (!isLL) { 4430 __ add(cnt2, cnt2, cnt2); // now in bytes 4431 } 4432 __ eor(rscratch2, tmp1, tmp2); 4433 __ cbnz(rscratch2, DIFF); 4434 __ ldr(rscratch1, Address(str1, cnt2)); 4435 __ ldr(cnt1, Address(str2, cnt2)); 4436 __ eor(rscratch2, rscratch1, cnt1); 4437 __ cbz(rscratch2, LENGTH_DIFF); 4438 // Find the first different characters in the longwords and 4439 // compute their difference. 4440 __ bind(DIFF2); 4441 __ rev(rscratch2, rscratch2); 4442 __ clz(rscratch2, rscratch2); 4443 __ andr(rscratch2, rscratch2, isLL ? -8 : -16); 4444 __ lsrv(rscratch1, rscratch1, rscratch2); 4445 if (isLL) { 4446 __ lsrv(cnt1, cnt1, rscratch2); 4447 __ uxtbw(rscratch1, rscratch1); 4448 __ uxtbw(cnt1, cnt1); 4449 } else { 4450 __ lsrv(cnt1, cnt1, rscratch2); 4451 __ uxthw(rscratch1, rscratch1); 4452 __ uxthw(cnt1, cnt1); 4453 } 4454 __ subw(result, rscratch1, cnt1); 4455 __ b(LENGTH_DIFF); 4456 __ bind(DIFF); 4457 __ rev(rscratch2, rscratch2); 4458 __ clz(rscratch2, rscratch2); 4459 __ andr(rscratch2, rscratch2, isLL ? 
-8 : -16); 4460 __ lsrv(tmp1, tmp1, rscratch2); 4461 if (isLL) { 4462 __ lsrv(tmp2, tmp2, rscratch2); 4463 __ uxtbw(tmp1, tmp1); 4464 __ uxtbw(tmp2, tmp2); 4465 } else { 4466 __ lsrv(tmp2, tmp2, rscratch2); 4467 __ uxthw(tmp1, tmp1); 4468 __ uxthw(tmp2, tmp2); 4469 } 4470 __ subw(result, tmp1, tmp2); 4471 __ b(LENGTH_DIFF); 4472 __ bind(LAST_CHECK_AND_LENGTH_DIFF); 4473 __ eor(rscratch2, tmp1, tmp2); 4474 __ cbnz(rscratch2, DIFF); 4475 __ bind(LENGTH_DIFF); 4476 __ ret(lr); 4477 return entry; 4478 } 4479 4480 void generate_compare_long_strings() { 4481 StubRoutines::aarch64::_compare_long_string_LL 4482 = generate_compare_long_string_same_encoding(true); 4483 StubRoutines::aarch64::_compare_long_string_UU 4484 = generate_compare_long_string_same_encoding(false); 4485 StubRoutines::aarch64::_compare_long_string_LU 4486 = generate_compare_long_string_different_encoding(true); 4487 StubRoutines::aarch64::_compare_long_string_UL 4488 = generate_compare_long_string_different_encoding(false); 4489 } 4490 4491 // R0 = result 4492 // R1 = str2 4493 // R2 = cnt1 4494 // R3 = str1 4495 // R4 = cnt2 4496 // This generic linear code uses a few additional ideas that make it faster: 4497 // 1) we can safely keep at least the 1st register of the pattern (since length >= 8) 4498 // in order to skip the initial loading (helps on systems with 1 load pipeline) 4499 // 2) we can use the "fast" single-character search algorithm to look for the 4500 // first symbol with fewer branches (1 branch per loaded register instead 4501 // of a branch per symbol); this is where constants like 4502 // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from 4503 // 3) after loading and analyzing the 1st register of the source string, it can be 4504 // used to search for every 1st-character entry, saving a few loads in 4505 // comparison with a "simpler-but-slower" implementation 4506 // 4) in order to avoid lots of push/pop operations, the code below heavily 4507 // re-uses/re-initializes/compresses register values, which makes the code 4508 // larger and a bit less readable; however, most of the extra operations are 4509 // issued during loads or branches, so the penalty is minimal 4510 address generate_string_indexof_linear(bool str1_isL, bool str2_isL) { 4511 const char* stubName = str1_isL 4512 ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul") 4513 : "indexof_linear_uu"; 4514 __ align(CodeEntryAlignment); 4515 StubCodeMark mark(this, "StubRoutines", stubName); 4516 address entry = __ pc(); 4517 4518 int str1_chr_size = str1_isL ? 1 : 2; 4519 int str2_chr_size = str2_isL ? 1 : 2; 4520 int str1_chr_shift = str1_isL ? 0 : 1; 4521 int str2_chr_shift = str2_isL ? 0 : 1; 4522 bool isL = str1_isL && str2_isL; 4523 // parameters 4524 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4; 4525 // temporary registers 4526 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23; 4527 RegSet spilled_regs = RegSet::range(tmp1, tmp4); 4528 // redefinitions 4529 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3; 4530 4531 __ push(spilled_regs, sp); 4532 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, 4533 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 4534 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 4535 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 4536 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 4537 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 4538 // Read whole register from str1.
It is safe, because length >=8 here 4539 __ ldr(ch1, Address(str1)); 4540 // Read whole register from str2. It is safe, because length >=8 here 4541 __ ldr(ch2, Address(str2)); 4542 __ sub(cnt2, cnt2, cnt1); 4543 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF); 4544 if (str1_isL != str2_isL) { 4545 __ eor(v0, __ T16B, v0, v0); 4546 } 4547 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 4548 __ mul(first, first, tmp1); 4549 // check if we have less than 1 register to check 4550 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1); 4551 if (str1_isL != str2_isL) { 4552 __ fmovd(v1, ch1); 4553 } 4554 __ br(__ LE, L_SMALL); 4555 __ eor(ch2, first, ch2); 4556 if (str1_isL != str2_isL) { 4557 __ zip1(v1, __ T16B, v1, v0); 4558 } 4559 __ sub(tmp2, ch2, tmp1); 4560 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4561 __ bics(tmp2, tmp2, ch2); 4562 if (str1_isL != str2_isL) { 4563 __ fmovd(ch1, v1); 4564 } 4565 __ br(__ NE, L_HAS_ZERO); 4566 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 4567 __ add(result, result, wordSize/str2_chr_size); 4568 __ add(str2, str2, wordSize); 4569 __ br(__ LT, L_POST_LOOP); 4570 __ BIND(L_LOOP); 4571 __ ldr(ch2, Address(str2)); 4572 __ eor(ch2, first, ch2); 4573 __ sub(tmp2, ch2, tmp1); 4574 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4575 __ bics(tmp2, tmp2, ch2); 4576 __ br(__ NE, L_HAS_ZERO); 4577 __ BIND(L_LOOP_PROCEED); 4578 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 4579 __ add(str2, str2, wordSize); 4580 __ add(result, result, wordSize/str2_chr_size); 4581 __ br(__ GE, L_LOOP); 4582 __ BIND(L_POST_LOOP); 4583 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check 4584 __ br(__ LE, NOMATCH); 4585 __ ldr(ch2, Address(str2)); 4586 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 4587 __ eor(ch2, first, ch2); 4588 __ sub(tmp2, ch2, tmp1); 4589 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4590 __ mov(tmp4, -1); // all bits set 4591 __ b(L_SMALL_PROCEED); 4592 __ align(OptoLoopAlignment); 4593 __ BIND(L_SMALL); 4594 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 4595 __ eor(ch2, first, ch2); 4596 if (str1_isL != str2_isL) { 4597 __ zip1(v1, __ T16B, v1, v0); 4598 } 4599 __ sub(tmp2, ch2, tmp1); 4600 __ mov(tmp4, -1); // all bits set 4601 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4602 if (str1_isL != str2_isL) { 4603 __ fmovd(ch1, v1); // move converted 4 symbols 4604 } 4605 __ BIND(L_SMALL_PROCEED); 4606 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits. 4607 __ bic(tmp2, tmp2, ch2); 4608 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check 4609 __ rbit(tmp2, tmp2); 4610 __ br(__ EQ, NOMATCH); 4611 __ BIND(L_SMALL_HAS_ZERO_LOOP); 4612 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's 4613 __ cmp(cnt1, u1(wordSize/str2_chr_size)); 4614 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2); 4615 if (str2_isL) { // LL 4616 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 4617 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 4618 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 4619 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 4620 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4621 } else { 4622 __ mov(ch2, 0xE); // all bits in byte set except last one 4623 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4624 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. 
Safe. 4625 __ lslv(tmp2, tmp2, tmp4); 4626 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4627 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4628 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4629 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4630 } 4631 __ cmp(ch1, ch2); 4632 __ mov(tmp4, wordSize/str2_chr_size); 4633 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4634 __ BIND(L_SMALL_CMP_LOOP); 4635 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 4636 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 4637 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 4638 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 4639 __ add(tmp4, tmp4, 1); 4640 __ cmp(tmp4, cnt1); 4641 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP); 4642 __ cmp(first, ch2); 4643 __ br(__ EQ, L_SMALL_CMP_LOOP); 4644 __ BIND(L_SMALL_CMP_LOOP_NOMATCH); 4645 __ cbz(tmp2, NOMATCH); // no more matches. exit 4646 __ clz(tmp4, tmp2); 4647 __ add(result, result, 1); // advance index 4648 __ add(str2, str2, str2_chr_size); // advance pointer 4649 __ b(L_SMALL_HAS_ZERO_LOOP); 4650 __ align(OptoLoopAlignment); 4651 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP); 4652 __ cmp(first, ch2); 4653 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4654 __ b(DONE); 4655 __ align(OptoLoopAlignment); 4656 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2); 4657 if (str2_isL) { // LL 4658 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 4659 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 4660 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 4661 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 4662 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4663 } else { 4664 __ mov(ch2, 0xE); // all bits in byte set except last one 4665 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4666 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4667 __ lslv(tmp2, tmp2, tmp4); 4668 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4669 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4670 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4671 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4672 } 4673 __ cmp(ch1, ch2); 4674 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4675 __ b(DONE); 4676 __ align(OptoLoopAlignment); 4677 __ BIND(L_HAS_ZERO); 4678 __ rbit(tmp2, tmp2); 4679 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's 4680 // Now, perform compression of counters(cnt2 and cnt1) into one register. 4681 // It's fine because both counters are 32bit and are not changed in this 4682 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop. 4683 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2); 4684 __ sub(result, result, 1); 4685 __ BIND(L_HAS_ZERO_LOOP); 4686 __ mov(cnt1, wordSize/str2_chr_size); 4687 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2); 4688 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare 4689 if (str2_isL) { 4690 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 4691 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
4692 __ lslv(tmp2, tmp2, tmp4); 4693 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4694 __ add(tmp4, tmp4, 1); 4695 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4696 __ lsl(tmp2, tmp2, 1); 4697 __ mov(tmp4, wordSize/str2_chr_size); 4698 } else { 4699 __ mov(ch2, 0xE); 4700 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4701 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4702 __ lslv(tmp2, tmp2, tmp4); 4703 __ add(tmp4, tmp4, 1); 4704 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4705 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 4706 __ lsl(tmp2, tmp2, 1); 4707 __ mov(tmp4, wordSize/str2_chr_size); 4708 __ sub(str2, str2, str2_chr_size); 4709 } 4710 __ cmp(ch1, ch2); 4711 __ mov(tmp4, wordSize/str2_chr_size); 4712 __ br(__ NE, L_CMP_LOOP_NOMATCH); 4713 __ BIND(L_CMP_LOOP); 4714 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 4715 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 4716 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 4717 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 4718 __ add(tmp4, tmp4, 1); 4719 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2); 4720 __ br(__ GE, L_CMP_LOOP_LAST_CMP); 4721 __ cmp(cnt1, ch2); 4722 __ br(__ EQ, L_CMP_LOOP); 4723 __ BIND(L_CMP_LOOP_NOMATCH); 4724 // here we're not matched 4725 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop 4726 __ clz(tmp4, tmp2); 4727 __ add(str2, str2, str2_chr_size); // advance pointer 4728 __ b(L_HAS_ZERO_LOOP); 4729 __ align(OptoLoopAlignment); 4730 __ BIND(L_CMP_LOOP_LAST_CMP); 4731 __ cmp(cnt1, ch2); 4732 __ br(__ NE, L_CMP_LOOP_NOMATCH); 4733 __ b(DONE); 4734 __ align(OptoLoopAlignment); 4735 __ BIND(L_CMP_LOOP_LAST_CMP2); 4736 if (str2_isL) { 4737 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 4738 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4739 __ lslv(tmp2, tmp2, tmp4); 4740 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4741 __ add(tmp4, tmp4, 1); 4742 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4743 __ lsl(tmp2, tmp2, 1); 4744 } else { 4745 __ mov(ch2, 0xE); 4746 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4747 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4748 __ lslv(tmp2, tmp2, tmp4); 4749 __ add(tmp4, tmp4, 1); 4750 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4751 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 4752 __ lsl(tmp2, tmp2, 1); 4753 __ sub(str2, str2, str2_chr_size); 4754 } 4755 __ cmp(ch1, ch2); 4756 __ br(__ NE, L_CMP_LOOP_NOMATCH); 4757 __ b(DONE); 4758 __ align(OptoLoopAlignment); 4759 __ BIND(L_HAS_ZERO_LOOP_NOMATCH); 4760 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until 4761 // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP, 4762 // so, result was increased at max by wordSize/str2_chr_size - 1, so, 4763 // respective high bit wasn't changed. L_LOOP_PROCEED will increase 4764 // result by analyzed characters value, so, we can just reset lower bits 4765 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL 4766 // 2) restore cnt1 and cnt2 values from "compressed" cnt2 4767 // 3) advance str2 value to represent next str2 octet. result & 7/3 is 4768 // index of last analyzed substring inside current octet. 
So, str2 in at 4769 // respective start address. We need to advance it to next octet 4770 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed 4771 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2); 4772 __ bfm(result, zr, 0, 2 - str2_chr_shift); 4773 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2 4774 __ movw(cnt2, cnt2); 4775 __ b(L_LOOP_PROCEED); 4776 __ align(OptoLoopAlignment); 4777 __ BIND(NOMATCH); 4778 __ mov(result, -1); 4779 __ BIND(DONE); 4780 __ pop(spilled_regs, sp); 4781 __ ret(lr); 4782 return entry; 4783 } 4784 4785 void generate_string_indexof_stubs() { 4786 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true); 4787 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false); 4788 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); 4789 } 4790 4791 void inflate_and_store_2_fp_registers(bool generatePrfm, 4792 FloatRegister src1, FloatRegister src2) { 4793 Register dst = r1; 4794 __ zip1(v1, __ T16B, src1, v0); 4795 __ zip2(v2, __ T16B, src1, v0); 4796 if (generatePrfm) { 4797 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM); 4798 } 4799 __ zip1(v3, __ T16B, src2, v0); 4800 __ zip2(v4, __ T16B, src2, v0); 4801 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64))); 4802 } 4803 4804 // R0 = src 4805 // R1 = dst 4806 // R2 = len 4807 // R3 = len >> 3 4808 // V0 = 0 4809 // v1 = loaded 8 bytes 4810 address generate_large_byte_array_inflate() { 4811 __ align(CodeEntryAlignment); 4812 StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate"); 4813 address entry = __ pc(); 4814 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE; 4815 Register src = r0, dst = r1, len = r2, octetCounter = r3; 4816 const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4; 4817 4818 // do one more 8-byte read to have address 16-byte aligned in most cases 4819 // also use single store instruction 4820 __ ldrd(v2, __ post(src, 8)); 4821 __ sub(octetCounter, octetCounter, 2); 4822 __ zip1(v1, __ T16B, v1, v0); 4823 __ zip1(v2, __ T16B, v2, v0); 4824 __ st1(v1, v2, __ T16B, __ post(dst, 32)); 4825 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 4826 __ subs(rscratch1, octetCounter, large_loop_threshold); 4827 __ br(__ LE, LOOP_START); 4828 __ b(LOOP_PRFM_START); 4829 __ bind(LOOP_PRFM); 4830 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 4831 __ bind(LOOP_PRFM_START); 4832 __ prfm(Address(src, SoftwarePrefetchHintDistance)); 4833 __ sub(octetCounter, octetCounter, 8); 4834 __ subs(rscratch1, octetCounter, large_loop_threshold); 4835 inflate_and_store_2_fp_registers(true, v3, v4); 4836 inflate_and_store_2_fp_registers(true, v5, v6); 4837 __ br(__ GT, LOOP_PRFM); 4838 __ cmp(octetCounter, (u1)8); 4839 __ br(__ LT, DONE); 4840 __ bind(LOOP); 4841 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 4842 __ bind(LOOP_START); 4843 __ sub(octetCounter, octetCounter, 8); 4844 __ cmp(octetCounter, (u1)8); 4845 inflate_and_store_2_fp_registers(false, v3, v4); 4846 inflate_and_store_2_fp_registers(false, v5, v6); 4847 __ br(__ GE, LOOP); 4848 __ bind(DONE); 4849 __ ret(lr); 4850 return entry; 4851 } 4852 4853 /** 4854 * Arguments: 4855 * 4856 * Input: 4857 * c_rarg0 - current state address 4858 * c_rarg1 - H key address 4859 * c_rarg2 - data address 4860 * c_rarg3 - number of blocks 4861 * 4862 * Output: 4863 * Updated state at c_rarg0 4864 */ 4865 address 
generate_ghash_processBlocks() { 4866 // Bafflingly, GCM uses little-endian for the byte order, but 4867 // big-endian for the bit order. For example, the polynomial 1 is 4868 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 4869 // 4870 // So, we must either reverse the bytes in each word and do 4871 // everything big-endian or reverse the bits in each byte and do 4872 // it little-endian. On AArch64 it's more idiomatic to reverse 4873 // the bits in each byte (we have an instruction, RBIT, to do 4874 // that) and keep the data in little-endian bit order throught the 4875 // calculation, bit-reversing the inputs and outputs. 4876 4877 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks"); 4878 __ align(wordSize * 2); 4879 address p = __ pc(); 4880 __ emit_int64(0x87); // The low-order bits of the field 4881 // polynomial (i.e. p = z^7+z^2+z+1) 4882 // repeated in the low and high parts of a 4883 // 128-bit vector 4884 __ emit_int64(0x87); 4885 4886 __ align(CodeEntryAlignment); 4887 address start = __ pc(); 4888 4889 Register state = c_rarg0; 4890 Register subkeyH = c_rarg1; 4891 Register data = c_rarg2; 4892 Register blocks = c_rarg3; 4893 4894 FloatRegister vzr = v30; 4895 __ eor(vzr, __ T16B, vzr, vzr); // zero register 4896 4897 __ ldrq(v0, Address(state)); 4898 __ ldrq(v1, Address(subkeyH)); 4899 4900 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 4901 __ rbit(v0, __ T16B, v0); 4902 __ rev64(v1, __ T16B, v1); 4903 __ rbit(v1, __ T16B, v1); 4904 4905 __ ldrq(v26, p); 4906 4907 __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 4908 __ eor(v16, __ T16B, v16, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 4909 4910 { 4911 Label L_ghash_loop; 4912 __ bind(L_ghash_loop); 4913 4914 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 4915 // reversing each byte 4916 __ rbit(v2, __ T16B, v2); 4917 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 4918 4919 // Multiply state in v2 by subkey in v1 4920 ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 4921 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16, 4922 /*temps*/v6, v20, v18, v21); 4923 // Reduce v7:v5 by the field polynomial 4924 ghash_reduce(v0, v5, v7, v26, vzr, v20); 4925 4926 __ sub(blocks, blocks, 1); 4927 __ cbnz(blocks, L_ghash_loop); 4928 } 4929 4930 // The bit-reversed result is at this point in v0 4931 __ rev64(v1, __ T16B, v0); 4932 __ rbit(v1, __ T16B, v1); 4933 4934 __ st1(v1, __ T16B, state); 4935 __ ret(lr); 4936 4937 return start; 4938 } 4939 4940 // Continuation point for throwing of implicit exceptions that are 4941 // not handled in the current activation. Fabricates an exception 4942 // oop and initiates normal exception dispatching in this 4943 // frame. Since we need to preserve callee-saved values (currently 4944 // only for C2, but done for C1 as well) we need a callee-saved oop 4945 // map and therefore have to make these stubs into RuntimeStubs 4946 // rather than BufferBlobs. If the compiler needs all registers to 4947 // be preserved between the fault point and the exception handler 4948 // then it must assume responsibility for that in 4949 // AbstractCompiler::continuation_for_implicit_null_exception or 4950 // continuation_for_implicit_division_by_zero_exception. 
All other 4951 // implicit exceptions (e.g., NullPointerException or 4952 // AbstractMethodError on entry) are either at call sites or 4953 // otherwise assume that stack unwinding will be initiated, so 4954 // caller saved registers were assumed volatile in the compiler. 4955 4956 #undef __ 4957 #define __ masm-> 4958 4959 address generate_throw_exception(const char* name, 4960 address runtime_entry, 4961 Register arg1 = noreg, 4962 Register arg2 = noreg) { 4963 // Information about frame layout at time of blocking runtime call. 4964 // Note that we only have to preserve callee-saved registers since 4965 // the compilers are responsible for supplying a continuation point 4966 // if they expect all registers to be preserved. 4967 // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0 4968 enum layout { 4969 rfp_off = 0, 4970 rfp_off2, 4971 return_off, 4972 return_off2, 4973 framesize // inclusive of return address 4974 }; 4975 4976 int insts_size = 512; 4977 int locs_size = 64; 4978 4979 CodeBuffer code(name, insts_size, locs_size); 4980 OopMapSet* oop_maps = new OopMapSet(); 4981 MacroAssembler* masm = new MacroAssembler(&code); 4982 4983 address start = __ pc(); 4984 4985 // This is an inlined and slightly modified version of call_VM 4986 // which has the ability to fetch the return PC out of 4987 // thread-local storage and also sets up last_Java_sp slightly 4988 // differently than the real call_VM 4989 4990 __ enter(); // Save FP and LR before call 4991 4992 assert(is_even(framesize/2), "sp not 16-byte aligned"); 4993 4994 // lr and fp are already in place 4995 __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog 4996 4997 int frame_complete = __ pc() - start; 4998 4999 // Set up last_Java_sp and last_Java_fp 5000 address the_pc = __ pc(); 5001 __ set_last_Java_frame(sp, rfp, the_pc, rscratch1); 5002 5003 // Call runtime 5004 if (arg1 != noreg) { 5005 assert(arg2 != c_rarg1, "clobbered"); 5006 __ mov(c_rarg1, arg1); 5007 } 5008 if (arg2 != noreg) { 5009 __ mov(c_rarg2, arg2); 5010 } 5011 __ mov(c_rarg0, rthread); 5012 BLOCK_COMMENT("call runtime_entry"); 5013 __ mov(rscratch1, runtime_entry); 5014 __ blr(rscratch1); 5015 5016 // Generate oop map 5017 OopMap* map = new OopMap(framesize, 0); 5018 5019 oop_maps->add_gc_map(the_pc - start, map); 5020 5021 __ reset_last_Java_frame(true); 5022 __ maybe_isb(); 5023 5024 __ leave(); 5025 5026 // check for pending exceptions 5027 #ifdef ASSERT 5028 Label L; 5029 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 5030 __ cbnz(rscratch1, L); 5031 __ should_not_reach_here(); 5032 __ bind(L); 5033 #endif // ASSERT 5034 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 5035 5036 5037 // codeBlob framesize is in words (not VMRegImpl::slot_size) 5038 RuntimeStub* stub = 5039 RuntimeStub::new_runtime_stub(name, 5040 &code, 5041 frame_complete, 5042 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 5043 oop_maps, false); 5044 return stub->entry_point(); 5045 } 5046 5047 class MontgomeryMultiplyGenerator : public MacroAssembler { 5048 5049 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 5050 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 5051 5052 RegSet _toSave; 5053 bool _squaring; 5054 5055 public: 5056 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 5057 : MacroAssembler(as->code()), _squaring(squaring) { 5058 5059 // Register allocation 5060 5061 RegSetIterator regs = (RegSet::range(r0, r26) - r18_tls).begin(); 5062 Pa_base 
= *regs; // Argument registers 5063 if (squaring) 5064 Pb_base = Pa_base; 5065 else 5066 Pb_base = *++regs; 5067 Pn_base = *++regs; 5068 Rlen = *++regs; 5069 inv = *++regs; 5070 Pm_base = *++regs; 5071 5072 // Working registers: 5073 Ra = *++regs; // The current digit of a, b, n, and m. 5074 Rb = *++regs; 5075 Rm = *++regs; 5076 Rn = *++regs; 5077 5078 Pa = *++regs; // Pointers to the current/next digit of a, b, n, and m. 5079 Pb = *++regs; 5080 Pm = *++regs; 5081 Pn = *++regs; 5082 5083 t0 = *++regs; // Three registers which form a 5084 t1 = *++regs; // triple-precision accumulator. 5085 t2 = *++regs; 5086 5087 Ri = *++regs; // Inner and outer loop indexes. 5088 Rj = *++regs; 5089 5090 Rhi_ab = *++regs; // Product registers: low and high parts 5091 Rlo_ab = *++regs; // of a*b and m*n. 5092 Rhi_mn = *++regs; 5093 Rlo_mn = *++regs; 5094 5095 // r19 and up are callee-saved. 5096 _toSave = RegSet::range(r19, *regs) + Pm_base; 5097 } 5098 5099 private: 5100 void save_regs() { 5101 push(_toSave, sp); 5102 } 5103 5104 void restore_regs() { 5105 pop(_toSave, sp); 5106 } 5107 5108 template <typename T> 5109 void unroll_2(Register count, T block) { 5110 Label loop, end, odd; 5111 tbnz(count, 0, odd); 5112 cbz(count, end); 5113 align(16); 5114 bind(loop); 5115 (this->*block)(); 5116 bind(odd); 5117 (this->*block)(); 5118 subs(count, count, 2); 5119 br(Assembler::GT, loop); 5120 bind(end); 5121 } 5122 5123 template <typename T> 5124 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 5125 Label loop, end, odd; 5126 tbnz(count, 0, odd); 5127 cbz(count, end); 5128 align(16); 5129 bind(loop); 5130 (this->*block)(d, s, tmp); 5131 bind(odd); 5132 (this->*block)(d, s, tmp); 5133 subs(count, count, 2); 5134 br(Assembler::GT, loop); 5135 bind(end); 5136 } 5137 5138 void pre1(RegisterOrConstant i) { 5139 block_comment("pre1"); 5140 // Pa = Pa_base; 5141 // Pb = Pb_base + i; 5142 // Pm = Pm_base; 5143 // Pn = Pn_base + i; 5144 // Ra = *Pa; 5145 // Rb = *Pb; 5146 // Rm = *Pm; 5147 // Rn = *Pn; 5148 ldr(Ra, Address(Pa_base)); 5149 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 5150 ldr(Rm, Address(Pm_base)); 5151 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 5152 lea(Pa, Address(Pa_base)); 5153 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 5154 lea(Pm, Address(Pm_base)); 5155 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 5156 5157 // Zero the m*n result. 5158 mov(Rhi_mn, zr); 5159 mov(Rlo_mn, zr); 5160 } 5161 5162 // The core multiply-accumulate step of a Montgomery 5163 // multiplication. The idea is to schedule operations as a 5164 // pipeline so that instructions with long latencies (loads and 5165 // multiplies) have time to complete before their results are 5166 // used. This benefits in-order implementations of the 5167 // architecture most, but out-of-order ones also benefit. 5168 void step() { 5169 block_comment("step"); 5170 // MACC(Ra, Rb, t0, t1, t2); 5171 // Ra = *++Pa; 5172 // Rb = *--Pb; 5173 umulh(Rhi_ab, Ra, Rb); 5174 mul(Rlo_ab, Ra, Rb); 5175 ldr(Ra, pre(Pa, wordSize)); 5176 ldr(Rb, pre(Pb, -wordSize)); 5177 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 5178 // previous iteration.
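// For reference, a sketch in C of what one MACC(x, y, t0, t1, t2) from the
// pseudo-code above computes (illustrative only: it is not part of the
// generated stub, and it assumes a compiler-provided unsigned __int128):
//
//   static void MACC(julong x, julong y,
//                    julong *t0, julong *t1, julong *t2) {
//     unsigned __int128 prod = (unsigned __int128)x * y;
//     unsigned __int128 sum  = (unsigned __int128)*t0 + (julong)prod;
//     *t0  = (julong)sum;                                  // adds t0, t0, Rlo
//     sum  = (unsigned __int128)*t1 + (julong)(prod >> 64) + (julong)(sum >> 64);
//     *t1  = (julong)sum;                                  // adcs t1, t1, Rhi
//     *t2 += (julong)(sum >> 64);                          // adc  t2, t2, zr
//   }
//
// The mul/umulh pair above computes prod, and the carry-propagating adds are
// done by acc() further down; splitting them this way lets the loads of the
// next digits issue while the multiplies are still in flight.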
5179 // MACC(Rm, Rn, t0, t1, t2); 5180 // Rm = *++Pm; 5181 // Rn = *--Pn; 5182 umulh(Rhi_mn, Rm, Rn); 5183 mul(Rlo_mn, Rm, Rn); 5184 ldr(Rm, pre(Pm, wordSize)); 5185 ldr(Rn, pre(Pn, -wordSize)); 5186 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 5187 } 5188 5189 void post1() { 5190 block_comment("post1"); 5191 5192 // MACC(Ra, Rb, t0, t1, t2); 5193 // Ra = *++Pa; 5194 // Rb = *--Pb; 5195 umulh(Rhi_ab, Ra, Rb); 5196 mul(Rlo_ab, Ra, Rb); 5197 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 5198 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 5199 5200 // *Pm = Rm = t0 * inv; 5201 mul(Rm, t0, inv); 5202 str(Rm, Address(Pm)); 5203 5204 // MACC(Rm, Rn, t0, t1, t2); 5205 // t0 = t1; t1 = t2; t2 = 0; 5206 umulh(Rhi_mn, Rm, Rn); 5207 5208 #ifndef PRODUCT 5209 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 5210 { 5211 mul(Rlo_mn, Rm, Rn); 5212 add(Rlo_mn, t0, Rlo_mn); 5213 Label ok; 5214 cbz(Rlo_mn, ok); { 5215 stop("broken Montgomery multiply"); 5216 } bind(ok); 5217 } 5218 #endif 5219 // We have very carefully set things up so that 5220 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 5221 // the lower half of Rm * Rn because we know the result already: 5222 // it must be -t0. t0 + (-t0) must generate a carry iff 5223 // t0 != 0. So, rather than do a mul and an adds we just set 5224 // the carry flag iff t0 is nonzero. 5225 // 5226 // mul(Rlo_mn, Rm, Rn); 5227 // adds(zr, t0, Rlo_mn); 5228 subs(zr, t0, 1); // Set carry iff t0 is nonzero 5229 adcs(t0, t1, Rhi_mn); 5230 adc(t1, t2, zr); 5231 mov(t2, zr); 5232 } 5233 5234 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 5235 block_comment("pre2"); 5236 // Pa = Pa_base + i-len; 5237 // Pb = Pb_base + len; 5238 // Pm = Pm_base + i-len; 5239 // Pn = Pn_base + len; 5240 5241 if (i.is_register()) { 5242 sub(Rj, i.as_register(), len); 5243 } else { 5244 mov(Rj, i.as_constant()); 5245 sub(Rj, Rj, len); 5246 } 5247 // Rj == i-len 5248 5249 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 5250 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 5251 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 5252 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 5253 5254 // Ra = *++Pa; 5255 // Rb = *--Pb; 5256 // Rm = *++Pm; 5257 // Rn = *--Pn; 5258 ldr(Ra, pre(Pa, wordSize)); 5259 ldr(Rb, pre(Pb, -wordSize)); 5260 ldr(Rm, pre(Pm, wordSize)); 5261 ldr(Rn, pre(Pn, -wordSize)); 5262 5263 mov(Rhi_mn, zr); 5264 mov(Rlo_mn, zr); 5265 } 5266 5267 void post2(RegisterOrConstant i, RegisterOrConstant len) { 5268 block_comment("post2"); 5269 if (i.is_constant()) { 5270 mov(Rj, i.as_constant()-len.as_constant()); 5271 } else { 5272 sub(Rj, i.as_register(), len); 5273 } 5274 5275 adds(t0, t0, Rlo_mn); // The pending m*n, low part 5276 5277 // As soon as we know the least significant digit of our result, 5278 // store it. 5279 // Pm_base[i-len] = t0; 5280 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 5281 5282 // t0 = t1; t1 = t2; t2 = 0; 5283 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 5284 adc(t1, t2, zr); 5285 mov(t2, zr); 5286 } 5287 5288 // A carry in t0 after Montgomery multiplication means that we 5289 // should subtract multiples of n from our result in m. We'll 5290 // keep doing that until there is no carry. 
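// In C, approximately (a sketch of the sub() helper that the reference
// comments below call; it is not part of the generated code, it just
// spells out what one pass of the loop in normalize() does):
//
//   static julong sub(julong Pm_base[], julong Pn_base[],
//                     julong t0, int len) {
//     julong borrow = 0;
//     for (int i = 0; i < len; i++) {
//       julong d = Pm_base[i] - Pn_base[i] - borrow;       // sbcs Rm, Rm, Rn
//       borrow = (d > Pm_base[i]) || (borrow && d == Pm_base[i]);
//       Pm_base[i] = d;
//     }
//     return t0 - borrow;                                  // sbc t0, t0, zr
//   }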
5291 void normalize(RegisterOrConstant len) { 5292 block_comment("normalize"); 5293 // while (t0) 5294 // t0 = sub(Pm_base, Pn_base, t0, len); 5295 Label loop, post, again; 5296 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 5297 cbz(t0, post); { 5298 bind(again); { 5299 mov(i, zr); 5300 mov(cnt, len); 5301 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 5302 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 5303 subs(zr, zr, zr); // set carry flag, i.e. no borrow 5304 align(16); 5305 bind(loop); { 5306 sbcs(Rm, Rm, Rn); 5307 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 5308 add(i, i, 1); 5309 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 5310 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 5311 sub(cnt, cnt, 1); 5312 } cbnz(cnt, loop); 5313 sbc(t0, t0, zr); 5314 } cbnz(t0, again); 5315 } bind(post); 5316 } 5317 5318 // Move memory at s to d, reversing words. 5319 // Increments d to end of copied memory 5320 // Destroys tmp1, tmp2 5321 // Preserves len 5322 // Leaves s pointing to the address which was in d at start 5323 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 5324 assert(tmp1 < r19 && tmp2 < r19, "register corruption"); 5325 5326 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 5327 mov(tmp1, len); 5328 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 5329 sub(s, d, len, ext::uxtw, LogBytesPerWord); 5330 } 5331 // where 5332 void reverse1(Register d, Register s, Register tmp) { 5333 ldr(tmp, pre(s, -wordSize)); 5334 ror(tmp, tmp, 32); 5335 str(tmp, post(d, wordSize)); 5336 } 5337 5338 void step_squaring() { 5339 // An extra ACC 5340 step(); 5341 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 5342 } 5343 5344 void last_squaring(RegisterOrConstant i) { 5345 Label dont; 5346 // if ((i & 1) == 0) { 5347 tbnz(i.as_register(), 0, dont); { 5348 // MACC(Ra, Rb, t0, t1, t2); 5349 // Ra = *++Pa; 5350 // Rb = *--Pb; 5351 umulh(Rhi_ab, Ra, Rb); 5352 mul(Rlo_ab, Ra, Rb); 5353 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 5354 } bind(dont); 5355 } 5356 5357 void extra_step_squaring() { 5358 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 5359 5360 // MACC(Rm, Rn, t0, t1, t2); 5361 // Rm = *++Pm; 5362 // Rn = *--Pn; 5363 umulh(Rhi_mn, Rm, Rn); 5364 mul(Rlo_mn, Rm, Rn); 5365 ldr(Rm, pre(Pm, wordSize)); 5366 ldr(Rn, pre(Pn, -wordSize)); 5367 } 5368 5369 void post1_squaring() { 5370 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 5371 5372 // *Pm = Rm = t0 * inv; 5373 mul(Rm, t0, inv); 5374 str(Rm, Address(Pm)); 5375 5376 // MACC(Rm, Rn, t0, t1, t2); 5377 // t0 = t1; t1 = t2; t2 = 0; 5378 umulh(Rhi_mn, Rm, Rn); 5379 5380 #ifndef PRODUCT 5381 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 5382 { 5383 mul(Rlo_mn, Rm, Rn); 5384 add(Rlo_mn, t0, Rlo_mn); 5385 Label ok; 5386 cbz(Rlo_mn, ok); { 5387 stop("broken Montgomery multiply"); 5388 } bind(ok); 5389 } 5390 #endif 5391 // We have very carefully set things up so that 5392 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 5393 // the lower half of Rm * Rn because we know the result already: 5394 // it must be -t0. t0 + (-t0) must generate a carry iff 5395 // t0 != 0. So, rather than do a mul and an adds we just set 5396 // the carry flag iff t0 is nonzero. 
5397 // 5398 // mul(Rlo_mn, Rm, Rn); 5399 // adds(zr, t0, Rlo_mn); 5400 subs(zr, t0, 1); // Set carry iff t0 is nonzero 5401 adcs(t0, t1, Rhi_mn); 5402 adc(t1, t2, zr); 5403 mov(t2, zr); 5404 } 5405 5406 void acc(Register Rhi, Register Rlo, 5407 Register t0, Register t1, Register t2) { 5408 adds(t0, t0, Rlo); 5409 adcs(t1, t1, Rhi); 5410 adc(t2, t2, zr); 5411 } 5412 5413 public: 5414 /** 5415 * Fast Montgomery multiplication. The derivation of the 5416 * algorithm is in A Cryptographic Library for the Motorola 5417 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 5418 * 5419 * Arguments: 5420 * 5421 * Inputs for multiplication: 5422 * c_rarg0 - int array elements a 5423 * c_rarg1 - int array elements b 5424 * c_rarg2 - int array elements n (the modulus) 5425 * c_rarg3 - int length 5426 * c_rarg4 - int inv 5427 * c_rarg5 - int array elements m (the result) 5428 * 5429 * Inputs for squaring: 5430 * c_rarg0 - int array elements a 5431 * c_rarg1 - int array elements n (the modulus) 5432 * c_rarg2 - int length 5433 * c_rarg3 - int inv 5434 * c_rarg4 - int array elements m (the result) 5435 * 5436 */ 5437 address generate_multiply() { 5438 Label argh, nothing; 5439 bind(argh); 5440 stop("MontgomeryMultiply total_allocation must be <= 8192"); 5441 5442 align(CodeEntryAlignment); 5443 address entry = pc(); 5444 5445 cbzw(Rlen, nothing); 5446 5447 enter(); 5448 5449 // Make room. 5450 cmpw(Rlen, 512); 5451 br(Assembler::HI, argh); 5452 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 5453 andr(sp, Ra, -2 * wordSize); 5454 5455 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 5456 5457 { 5458 // Copy input args, reversing as we go. We use Ra as a 5459 // temporary variable. 5460 reverse(Ra, Pa_base, Rlen, t0, t1); 5461 if (!_squaring) 5462 reverse(Ra, Pb_base, Rlen, t0, t1); 5463 reverse(Ra, Pn_base, Rlen, t0, t1); 5464 } 5465 5466 // Push all call-saved registers and also Pm_base which we'll need 5467 // at the end. 
5468 save_regs(); 5469 5470 #ifndef PRODUCT 5471 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 5472 { 5473 ldr(Rn, Address(Pn_base, 0)); 5474 mul(Rlo_mn, Rn, inv); 5475 subs(zr, Rlo_mn, -1); 5476 Label ok; 5477 br(EQ, ok); { 5478 stop("broken inverse in Montgomery multiply"); 5479 } bind(ok); 5480 } 5481 #endif 5482 5483 mov(Pm_base, Ra); 5484 5485 mov(t0, zr); 5486 mov(t1, zr); 5487 mov(t2, zr); 5488 5489 block_comment("for (int i = 0; i < len; i++) {"); 5490 mov(Ri, zr); { 5491 Label loop, end; 5492 cmpw(Ri, Rlen); 5493 br(Assembler::GE, end); 5494 5495 bind(loop); 5496 pre1(Ri); 5497 5498 block_comment(" for (j = i; j; j--) {"); { 5499 movw(Rj, Ri); 5500 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 5501 } block_comment(" } // j"); 5502 5503 post1(); 5504 addw(Ri, Ri, 1); 5505 cmpw(Ri, Rlen); 5506 br(Assembler::LT, loop); 5507 bind(end); 5508 block_comment("} // i"); 5509 } 5510 5511 block_comment("for (int i = len; i < 2*len; i++) {"); 5512 mov(Ri, Rlen); { 5513 Label loop, end; 5514 cmpw(Ri, Rlen, Assembler::LSL, 1); 5515 br(Assembler::GE, end); 5516 5517 bind(loop); 5518 pre2(Ri, Rlen); 5519 5520 block_comment(" for (j = len*2-i-1; j; j--) {"); { 5521 lslw(Rj, Rlen, 1); 5522 subw(Rj, Rj, Ri); 5523 subw(Rj, Rj, 1); 5524 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 5525 } block_comment(" } // j"); 5526 5527 post2(Ri, Rlen); 5528 addw(Ri, Ri, 1); 5529 cmpw(Ri, Rlen, Assembler::LSL, 1); 5530 br(Assembler::LT, loop); 5531 bind(end); 5532 } 5533 block_comment("} // i"); 5534 5535 normalize(Rlen); 5536 5537 mov(Ra, Pm_base); // Save Pm_base in Ra 5538 restore_regs(); // Restore caller's Pm_base 5539 5540 // Copy our result into caller's Pm_base 5541 reverse(Pm_base, Ra, Rlen, t0, t1); 5542 5543 leave(); 5544 bind(nothing); 5545 ret(lr); 5546 5547 return entry; 5548 } 5549 // In C, approximately: 5550 5551 // void 5552 // montgomery_multiply(julong Pa_base[], julong Pb_base[], 5553 // julong Pn_base[], julong Pm_base[], 5554 // julong inv, int len) { 5555 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 5556 // julong *Pa, *Pb, *Pn, *Pm; 5557 // julong Ra, Rb, Rn, Rm; 5558 5559 // int i; 5560 5561 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 5562 5563 // for (i = 0; i < len; i++) { 5564 // int j; 5565 5566 // Pa = Pa_base; 5567 // Pb = Pb_base + i; 5568 // Pm = Pm_base; 5569 // Pn = Pn_base + i; 5570 5571 // Ra = *Pa; 5572 // Rb = *Pb; 5573 // Rm = *Pm; 5574 // Rn = *Pn; 5575 5576 // int iters = i; 5577 // for (j = 0; iters--; j++) { 5578 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 5579 // MACC(Ra, Rb, t0, t1, t2); 5580 // Ra = *++Pa; 5581 // Rb = *--Pb; 5582 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5583 // MACC(Rm, Rn, t0, t1, t2); 5584 // Rm = *++Pm; 5585 // Rn = *--Pn; 5586 // } 5587 5588 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 5589 // MACC(Ra, Rb, t0, t1, t2); 5590 // *Pm = Rm = t0 * inv; 5591 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 5592 // MACC(Rm, Rn, t0, t1, t2); 5593 5594 // assert(t0 == 0, "broken Montgomery multiply"); 5595 5596 // t0 = t1; t1 = t2; t2 = 0; 5597 // } 5598 5599 // for (i = len; i < 2*len; i++) { 5600 // int j; 5601 5602 // Pa = Pa_base + i-len; 5603 // Pb = Pb_base + len; 5604 // Pm = Pm_base + i-len; 5605 // Pn = Pn_base + len; 5606 5607 // Ra = *++Pa; 5608 // Rb = *--Pb; 5609 // Rm = *++Pm; 5610 // Rn = *--Pn; 5611 5612 // int iters = len*2-i-1; 5613 // for (j = i-len+1; iters--; j++) { 5614 // 
assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 5615 // MACC(Ra, Rb, t0, t1, t2); 5616 // Ra = *++Pa; 5617 // Rb = *--Pb; 5618 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5619 // MACC(Rm, Rn, t0, t1, t2); 5620 // Rm = *++Pm; 5621 // Rn = *--Pn; 5622 // } 5623 5624 // Pm_base[i-len] = t0; 5625 // t0 = t1; t1 = t2; t2 = 0; 5626 // } 5627 5628 // while (t0) 5629 // t0 = sub(Pm_base, Pn_base, t0, len); 5630 // } 5631 5632 /** 5633 * Fast Montgomery squaring. This uses asymptotically 25% fewer 5634 * multiplies than Montgomery multiplication so it should be up to 5635 * 25% faster. However, its loop control is more complex and it 5636 * may actually run slower on some machines. 5637 * 5638 * Arguments: 5639 * 5640 * Inputs: 5641 * c_rarg0 - int array elements a 5642 * c_rarg1 - int array elements n (the modulus) 5643 * c_rarg2 - int length 5644 * c_rarg3 - int inv 5645 * c_rarg4 - int array elements m (the result) 5646 * 5647 */ 5648 address generate_square() { 5649 Label argh; 5650 bind(argh); 5651 stop("MontgomeryMultiply total_allocation must be <= 8192"); 5652 5653 align(CodeEntryAlignment); 5654 address entry = pc(); 5655 5656 enter(); 5657 5658 // Make room. 5659 cmpw(Rlen, 512); 5660 br(Assembler::HI, argh); 5661 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 5662 andr(sp, Ra, -2 * wordSize); 5663 5664 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 5665 5666 { 5667 // Copy input args, reversing as we go. We use Ra as a 5668 // temporary variable. 5669 reverse(Ra, Pa_base, Rlen, t0, t1); 5670 reverse(Ra, Pn_base, Rlen, t0, t1); 5671 } 5672 5673 // Push all call-saved registers and also Pm_base which we'll need 5674 // at the end. 5675 save_regs(); 5676 5677 mov(Pm_base, Ra); 5678 5679 mov(t0, zr); 5680 mov(t1, zr); 5681 mov(t2, zr); 5682 5683 block_comment("for (int i = 0; i < len; i++) {"); 5684 mov(Ri, zr); { 5685 Label loop, end; 5686 bind(loop); 5687 cmp(Ri, Rlen); 5688 br(Assembler::GE, end); 5689 5690 pre1(Ri); 5691 5692 block_comment("for (j = (i+1)/2; j; j--) {"); { 5693 add(Rj, Ri, 1); 5694 lsr(Rj, Rj, 1); 5695 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 5696 } block_comment(" } // j"); 5697 5698 last_squaring(Ri); 5699 5700 block_comment(" for (j = i/2; j; j--) {"); { 5701 lsr(Rj, Ri, 1); 5702 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 5703 } block_comment(" } // j"); 5704 5705 post1_squaring(); 5706 add(Ri, Ri, 1); 5707 cmp(Ri, Rlen); 5708 br(Assembler::LT, loop); 5709 5710 bind(end); 5711 block_comment("} // i"); 5712 } 5713 5714 block_comment("for (int i = len; i < 2*len; i++) {"); 5715 mov(Ri, Rlen); { 5716 Label loop, end; 5717 bind(loop); 5718 cmp(Ri, Rlen, Assembler::LSL, 1); 5719 br(Assembler::GE, end); 5720 5721 pre2(Ri, Rlen); 5722 5723 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 5724 lsl(Rj, Rlen, 1); 5725 sub(Rj, Rj, Ri); 5726 sub(Rj, Rj, 1); 5727 lsr(Rj, Rj, 1); 5728 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 5729 } block_comment(" } // j"); 5730 5731 last_squaring(Ri); 5732 5733 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 5734 lsl(Rj, Rlen, 1); 5735 sub(Rj, Rj, Ri); 5736 lsr(Rj, Rj, 1); 5737 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 5738 } block_comment(" } // j"); 5739 5740 post2(Ri, Rlen); 5741 add(Ri, Ri, 1); 5742 cmp(Ri, Rlen, Assembler::LSL, 1); 5743 5744 br(Assembler::LT, loop); 5745 bind(end); 5746 block_comment("} // i"); 5747 } 5748 5749 normalize(Rlen); 5750 5751 mov(Ra, Pm_base); // Save Pm_base in Ra 5752 
restore_regs(); // Restore caller's Pm_base 5753 5754 // Copy our result into caller's Pm_base 5755 reverse(Pm_base, Ra, Rlen, t0, t1); 5756 5757 leave(); 5758 ret(lr); 5759 5760 return entry; 5761 } 5762 // In C, approximately: 5763 5764 // void 5765 // montgomery_square(julong Pa_base[], julong Pn_base[], 5766 // julong Pm_base[], julong inv, int len) { 5767 // julong t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 5768 // julong *Pa, *Pb, *Pn, *Pm; 5769 // julong Ra, Rb, Rn, Rm; 5770 5771 // int i; 5772 5773 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 5774 5775 // for (i = 0; i < len; i++) { 5776 // int j; 5777 5778 // Pa = Pa_base; 5779 // Pb = Pa_base + i; 5780 // Pm = Pm_base; 5781 // Pn = Pn_base + i; 5782 5783 // Ra = *Pa; 5784 // Rb = *Pb; 5785 // Rm = *Pm; 5786 // Rn = *Pn; 5787 5788 // int iters = (i+1)/2; 5789 // for (j = 0; iters--; j++) { 5790 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 5791 // MACC2(Ra, Rb, t0, t1, t2); 5792 // Ra = *++Pa; 5793 // Rb = *--Pb; 5794 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5795 // MACC(Rm, Rn, t0, t1, t2); 5796 // Rm = *++Pm; 5797 // Rn = *--Pn; 5798 // } 5799 // if ((i & 1) == 0) { 5800 // assert(Ra == Pa_base[j], "must be"); 5801 // MACC(Ra, Ra, t0, t1, t2); 5802 // } 5803 // iters = i/2; 5804 // assert(iters == i-j, "must be"); 5805 // for (; iters--; j++) { 5806 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5807 // MACC(Rm, Rn, t0, t1, t2); 5808 // Rm = *++Pm; 5809 // Rn = *--Pn; 5810 // } 5811 5812 // *Pm = Rm = t0 * inv; 5813 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 5814 // MACC(Rm, Rn, t0, t1, t2); 5815 5816 // assert(t0 == 0, "broken Montgomery multiply"); 5817 5818 // t0 = t1; t1 = t2; t2 = 0; 5819 // } 5820 5821 // for (i = len; i < 2*len; i++) { 5822 // int start = i-len+1; 5823 // int end = start + (len - start)/2; 5824 // int j; 5825 5826 // Pa = Pa_base + i-len; 5827 // Pb = Pa_base + len; 5828 // Pm = Pm_base + i-len; 5829 // Pn = Pn_base + len; 5830 5831 // Ra = *++Pa; 5832 // Rb = *--Pb; 5833 // Rm = *++Pm; 5834 // Rn = *--Pn; 5835 5836 // int iters = (2*len-i-1)/2; 5837 // assert(iters == end-start, "must be"); 5838 // for (j = start; iters--; j++) { 5839 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 5840 // MACC2(Ra, Rb, t0, t1, t2); 5841 // Ra = *++Pa; 5842 // Rb = *--Pb; 5843 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5844 // MACC(Rm, Rn, t0, t1, t2); 5845 // Rm = *++Pm; 5846 // Rn = *--Pn; 5847 // } 5848 // if ((i & 1) == 0) { 5849 // assert(Ra == Pa_base[j], "must be"); 5850 // MACC(Ra, Ra, t0, t1, t2); 5851 // } 5852 // iters = (2*len-i)/2; 5853 // assert(iters == len-j, "must be"); 5854 // for (; iters--; j++) { 5855 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5856 // MACC(Rm, Rn, t0, t1, t2); 5857 // Rm = *++Pm; 5858 // Rn = *--Pn; 5859 // } 5860 // Pm_base[i-len] = t0; 5861 // t0 = t1; t1 = t2; t2 = 0; 5862 // } 5863 5864 // while (t0) 5865 // t0 = sub(Pm_base, Pn_base, t0, len); 5866 // } 5867 }; 5868 5869 5870 // Initialization 5871 void generate_initial() { 5872 // Generate initial stubs and initialize the entry points 5873 5874 // Entry points that exist in all platforms. Note: This is code 5875 // that could be shared among different platforms - however the 5876 // benefit seems to be smaller than the disadvantage of having a 5877 // much more complicated generator structure. See also comment in 5878 // stubRoutines.hpp.
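// For context, a rough sketch of how the two generation phases are driven
// from shared code (the names are from stubRoutines.cpp and init.cpp in
// this tree; the snippet is illustrative rather than a verbatim copy):
//
//   void StubRoutines::initialize1() {            // early in VM startup,
//     CodeBuffer buffer(_code1);                  // before universe_init()
//     StubGenerator_generate(&buffer, false);     // -> generate_initial()
//   }
//
//   void StubRoutines::initialize2() {            // later, once the heap
//     CodeBuffer buffer(_code2);                  // and universe are set up
//     StubGenerator_generate(&buffer, true);      // -> generate_all()
//   }
//
// so nothing generated in this first phase may depend on the universe
// being initialized.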
5879 5880 StubRoutines::_forward_exception_entry = generate_forward_exception(); 5881 5882 StubRoutines::_call_stub_entry = 5883 generate_call_stub(StubRoutines::_call_stub_return_address); 5884 5885 // is referenced by megamorphic call 5886 StubRoutines::_catch_exception_entry = generate_catch_exception(); 5887 5888 // Build this early so it's available for the interpreter. 5889 StubRoutines::_throw_StackOverflowError_entry = 5890 generate_throw_exception("StackOverflowError throw_exception", 5891 CAST_FROM_FN_PTR(address, 5892 SharedRuntime::throw_StackOverflowError)); 5893 StubRoutines::_throw_delayed_StackOverflowError_entry = 5894 generate_throw_exception("delayed StackOverflowError throw_exception", 5895 CAST_FROM_FN_PTR(address, 5896 SharedRuntime::throw_delayed_StackOverflowError)); 5897 if (UseCRC32Intrinsics) { 5898 // set the CRC table address before generating the stub, which uses it 5899 StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table; 5900 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32(); 5901 } 5902 5903 if (UseCRC32CIntrinsics) { 5904 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(); 5905 } 5906 5907 // Disabled until JDK-8210858 is fixed 5908 // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) { 5909 // StubRoutines::_dlog = generate_dlog(); 5910 // } 5911 5912 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) { 5913 StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false); 5914 } 5915 5916 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) { 5917 StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true); 5918 } 5919 5920 // Safefetch stubs. 5921 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry, 5922 &StubRoutines::_safefetch32_fault_pc, 5923 &StubRoutines::_safefetch32_continuation_pc); 5924 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry, 5925 &StubRoutines::_safefetchN_fault_pc, 5926 &StubRoutines::_safefetchN_continuation_pc); 5927 } 5928 5929 void generate_all() { 5930 // support for verify_oop (must happen after universe_init) 5931 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); 5932 StubRoutines::_throw_AbstractMethodError_entry = 5933 generate_throw_exception("AbstractMethodError throw_exception", 5934 CAST_FROM_FN_PTR(address, 5935 SharedRuntime:: 5936 throw_AbstractMethodError)); 5937 5938 StubRoutines::_throw_IncompatibleClassChangeError_entry = 5939 generate_throw_exception("IncompatibleClassChangeError throw_exception", 5940 CAST_FROM_FN_PTR(address, 5941 SharedRuntime:: 5942 throw_IncompatibleClassChangeError)); 5943 5944 StubRoutines::_throw_NullPointerException_at_call_entry = 5945 generate_throw_exception("NullPointerException at call throw_exception", 5946 CAST_FROM_FN_PTR(address, 5947 SharedRuntime:: 5948 throw_NullPointerException_at_call)); 5949 5950 // arraycopy stubs used by compilers 5951 generate_arraycopy_stubs(); 5952 5953 // has negatives stub for large arrays. 5954 StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long); 5955 5956 // array equals stub for large arrays. 5957 if (!UseSimpleArrayEquals) { 5958 StubRoutines::aarch64::_large_array_equals = generate_large_array_equals(); 5959 } 5960 5961 generate_compare_long_strings(); 5962 5963 generate_string_indexof_stubs(); 5964 5965 // byte_array_inflate stub for large arrays.
5966 StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate(); 5967 5968 BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod(); 5969 if (bs_nm != NULL) { 5970 StubRoutines::aarch64::_method_entry_barrier = generate_method_entry_barrier(); 5971 } 5972 #ifdef COMPILER2 5973 if (UseMultiplyToLenIntrinsic) { 5974 StubRoutines::_multiplyToLen = generate_multiplyToLen(); 5975 } 5976 5977 if (UseSquareToLenIntrinsic) { 5978 StubRoutines::_squareToLen = generate_squareToLen(); 5979 } 5980 5981 if (UseMulAddIntrinsic) { 5982 StubRoutines::_mulAdd = generate_mulAdd(); 5983 } 5984 5985 if (UseMontgomeryMultiplyIntrinsic) { 5986 StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply"); 5987 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false); 5988 StubRoutines::_montgomeryMultiply = g.generate_multiply(); 5989 } 5990 5991 if (UseMontgomerySquareIntrinsic) { 5992 StubCodeMark mark(this, "StubRoutines", "montgomerySquare"); 5993 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true); 5994 // We use generate_multiply() rather than generate_square() 5995 // because it's faster for the sizes of modulus we care about. 5996 StubRoutines::_montgomerySquare = g.generate_multiply(); 5997 } 5998 #endif // COMPILER2 5999 6000 // generate GHASH intrinsics code 6001 if (UseGHASHIntrinsics) { 6002 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); 6003 } 6004 6005 // data cache line writeback 6006 StubRoutines::_data_cache_writeback = generate_data_cache_writeback(); 6007 StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync(); 6008 6009 if (UseAESIntrinsics) { 6010 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 6011 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 6012 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 6013 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 6014 } 6015 6016 if (UseSHA1Intrinsics) { 6017 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress"); 6018 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB"); 6019 } 6020 if (UseSHA256Intrinsics) { 6021 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress"); 6022 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB"); 6023 } 6024 if (UseSHA512Intrinsics) { 6025 StubRoutines::_sha512_implCompress = generate_sha512_implCompress(false, "sha512_implCompress"); 6026 StubRoutines::_sha512_implCompressMB = generate_sha512_implCompress(true, "sha512_implCompressMB"); 6027 } 6028 6029 // generate Adler32 intrinsics code 6030 if (UseAdler32Intrinsics) { 6031 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); 6032 } 6033 6034 StubRoutines::aarch64::set_completed(); 6035 } 6036 6037 public: 6038 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) { 6039 if (all) { 6040 generate_all(); 6041 } else { 6042 generate_initial(); 6043 } 6044 } 6045 }; // end class declaration 6046 6047 #define UCM_TABLE_MAX_ENTRIES 8 6048 void StubGenerator_generate(CodeBuffer* code, bool all) { 6049 if (UnsafeCopyMemory::_table == NULL) { 6050 UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES); 6051 } 6052 StubGenerator g(code, all); 6053 }