/*
 * Copyright (c) 2003, 2020, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2020, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/universe.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "utilities/align.hpp"
#include "utilities/powerOfTwo.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_ZGC
#include "gc/z/zThreadLocalData.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->
#define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(int& counter) {
    __ lea(rscratch2, ExternalAddress((address)&counter));
    __ ldrw(rscratch1, Address(rscratch2));
    __ addw(rscratch1, rscratch1, 1);
    __ strw(rscratch1, Address(rscratch2));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    c_rarg6:   parameter size (in words)              int
  //    c_rarg7:   thread                                 Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the c arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -27 [ argument word 1      ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper   (r0)  ]
  //  -7 [ result         (r1)  ]
  //  -6 [ result type    (r2)  ]
  //  -5 [ method         (r3)  ]
  //  -4 [ entry point    (r4)  ]
  //  -3 [ parameters     (r5)  ]
  //  -2 [ parameter size (r6)  ]
  //  -1 [ thread         (r7)  ]
  //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  //   1 [ saved lr       (r30) ]

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off  = -26,

    d15_off            = -26,
    d13_off            = -24,
    d11_off            = -22,
    d9_off             = -20,

    r28_off            = -18,
    r26_off            = -16,
    r24_off            = -14,
    r22_off            = -12,
    r20_off            = -10,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0,
    retaddr_off        =   1,
  };

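  // Added note (illustrative, not part of the original comments): the
  // offsets above are in words relative to the saved fp, so
  // generate_call_stub() below turns each one into an Address of the form
  // Address(rfp, off * wordSize); for example the call wrapper slot lives
  // at rfp - 8 * wordSize and the saved lr at rfp + 1 * wordSize.
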
  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    const Address sp_after_call(rfp, sp_after_call_off * wordSize);

    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off  * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);

    // stub code

    address aarch64_entry = __ pc();

    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5, entry_point);
    __ stp(c_rarg2, c_rarg3, result_type);
    __ stp(c_rarg0, c_rarg1, call_wrapper);

    __ stp(r20, r19,  r20_save);
    __ stp(r22, r21,  r22_save);
    __ stp(r24, r23,  r24_save);
    __ stp(r26, r25,  r26_save);
    __ stp(r28, r27,  r28_save);

    __ stpd(v9,  v8,  d9_save);
    __ stpd(v11, v10, d11_save);
    __ stpd(v13, v12, d13_save);
    __ stpd(v15, v14, d15_save);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (u1)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing methodOop, and current sp
    //      rmethod: Method*
    //      r13: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r13, sp);
    __ blr(c_rarg4);

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    __ ldr(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ldr(j_rarg1, result_type);
    __ cmp(j_rarg1, (u1)T_OBJECT);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, (u1)T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, (u1)T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(j_rarg1, (u1)T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    // restore callee-save registers
    __ ldpd(v15, v14, d15_save);
    __ ldpd(v13, v12, d13_save);
    __ ldpd(v11, v10, d11_save);
    __ ldpd(v9,  v8,  d9_save);

    __ ldp(r28, r27,  r28_save);
    __ ldp(r26, r25,  r26_save);
    __ ldp(r24, r23,  r24_save);
    __ ldp(r22, r21,  r22_save);
    __ ldp(r20, r19,  r20_save);

    __ ldp(c_rarg0, c_rarg1, call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3, method);
    __ ldp(c_rarg4, c_rarg5, entry_point);
    __ ldp(c_rarg6, c_rarg7, parameter_size);

    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT

    __ BIND(is_long);
    __ str(r0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // rsp.
  //
  // r0: exception oop

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread       (rfp, thread_off        * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then calls into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);
    __ mov(r19, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {

    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is NULL it is OK

#if INCLUDE_ZGC
    if (UseZGC) {
      // Check if mask is good.
      // verifies that ZAddressBadMask & r0 == 0
      __ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset()));
      __ andr(c_rarg2, r0, c_rarg3);
      __ cbnz(c_rarg2, error);
    }
#endif

    // Check if the oop is in the right area of memory
    // Make sure we cast to `address` or it ends up calling the wrong `mov`
    // with MSVC, leading to a crash.
    __ mov(c_rarg3, (address) Universe::verify_oop_mask());
    __ andr(c_rarg2, r0, c_rarg3);
    __ mov(c_rarg3, (address) Universe::verify_oop_bits());

    // Compare c_rarg2 and c_rarg3.  We don't use a compare
    // instruction here because the flags register is live.
    __ eor(c_rarg2, c_rarg2, c_rarg3);
    __ cbnz(c_rarg2, error);

    // make sure klass is 'reasonable', which is not zero.
    __ load_klass(r0, r0); // get klass
    __ cbz(r0, error);     // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blr(rscratch1);
    __ hlt(0);

    return start;
  }

  void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }

  // The inner part of zero_words().  This is the bulk operation,
  // zeroing words in blocks, possibly using DC ZVA to do it.  The
  // caller is responsible for zeroing the last few words.
  //
  // Inputs:
  // r10: the HeapWord-aligned base address of an array to zero.
  // r11: the count in HeapWords, r11 > 0.
  //
  // Returns r10 and r11, adjusted for the caller to clear.
  // r10: the base address of the tail of words left to clear.
  // r11: the number of words in the tail.
  //      r11 < MacroAssembler::zero_words_block_size.

  address generate_zero_blocks() {
    Label done;
    Label base_aligned;

    Register base = r10, cnt = r11;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "zero_blocks");
    address start = __ pc();

    if (UseBlockZeroing) {
      int zva_length = VM_Version::zva_length();

      // Ensure ZVA length can be divided by 16. This is required by
      // the subsequent operations.
      assert (zva_length % 16 == 0, "Unexpected ZVA Length");

      __ tbz(base, 3, base_aligned);
      __ str(zr, Address(__ post(base, 8)));
      __ sub(cnt, cnt, 1);
      __ bind(base_aligned);

      // Ensure count >= zva_length * 2 so that it still deserves a zva after
      // alignment.
      Label small;
      int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
      __ subs(rscratch1, cnt, low_limit >> 3);
      __ br(Assembler::LT, small);
      __ zero_dcache_blocks(base, cnt);
      __ bind(small);
    }

    {
      // Number of stp instructions we'll unroll
      const int unroll =
        MacroAssembler::zero_words_block_size / 2;
      // Clear the remaining blocks.
      Label loop;
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::LT, done);
      __ bind(loop);
      for (int i = 0; i < unroll; i++)
        __ stp(zr, zr, __ post(base, 16));
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::GE, loop);
      __ bind(done);
      __ add(cnt, cnt, unroll * 2);
    }

    __ ret(lr);

    return start;
  }


  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
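  // Added note (illustrative): the main loop generated below is software
  // pipelined so that loads run ahead of stores.  Roughly, it behaves like:
  //
  //   preload 8 words into t0..t7 (or v0..v3 with SIMD)
  //   loop:  store the 8 buffered words to d, reload the next 8 from s,
  //          count -= 8
  //   drain: store the last 8 buffered words
  //
  // so each iteration stores data that was loaded on the previous one.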
  void generate_copy_longs(Label &start, Register s, Register d, Register count,
                           copy_direction direction) {
    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r10, t6 = r11, t7 = r12;
    const Register stride = r13;

    assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1);

    Label again, drain;
    const char *stub_name;
    if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
    else
      stub_name = "backward_copy_longs";

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", stub_name);

    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, (u1)8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
        __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
      } else {
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        __ stp(t0, t1, Address(d, 2 * unit));
        __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.
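      //
      // Added worked example (forward copy): after the sub(s, s, 16) and
      // sub(d, d, 8) adjustments below, the first iteration loads the 8
      // longs at the original s+0..s+56 and stores them to the original
      // d+0..d+56; because d was odd-word aligned, d-8 is 16-byte aligned,
      // so the three stp stores all land on 16-byte boundaries while the
      // two single-word str stores need only 8-byte alignment.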

      if (direction == copy_forwards) {
        __ sub(s, s, 16);
        __ sub(d, d, 8);
      }

      // Fill 8 registers
      //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 bit block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
      //
      // t0 at offset 0,  t1 at offset 8
      // t2 at offset 16, t3 at offset 24
      // t4 at offset 32, t5 at offset 40
      // t6 at offset 48, t7 at offset 56

      // for backwards copy s was not offset so the register contents
      // are at these offsets into the preceding 64 byte block
      // relative to that original input and so on for each successive
      // preceding 64 byte block when s is updated. this explains the
      // slightly counter-intuitive looking pattern of register usage
      // in the stp instructions for backwards copy.
      //
      // t0 at offset -16, t1 at offset -8
      // t2 at offset -32, t3 at offset -24
      // t4 at offset -48, t5 at offset -40
      // t6 at offset -64, t7 at offset -56

      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));

      __ subs(count, count, 16);
      __ br(Assembler::LO, drain);

      int prefetch = PrefetchCopyIntervalInBytes;
      bool use_stride = false;
      if (direction == copy_backwards) {
        use_stride = prefetch > 256;
        prefetch = -prefetch;
        if (use_stride) __ mov(stride, prefetch);
      }

      __ bind(again);

      if (PrefetchCopyIntervalInBytes > 0)
        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
        // allowing for the offset of -8 the store instructions place
        // registers into the target 64 bit block at the following
        // offsets
        //
        // t0 at offset 0
        // t1 at offset 8,  t2 at offset 16
        // t3 at offset 24, t4 at offset 32
        // t5 at offset 40, t6 at offset 48
        // t7 at offset 56

        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
        // d was not offset when we started so the registers are
        // written into the 64 bit block preceding d with the following
        // offsets
        //
        // t1 at offset -8
        // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
        // t7 at offset -56, t4 at offset -48
        // t6 at offset -64
        //
        // note that this matches the offsets previously noted for the
        // loads

        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
      } else {
        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
      }
      // now we need to copy any remaining part block which may
      // include a 4 word block subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ stp(t1, t2, Address(d, 2 * unit));
          __ str(t3, Address(__ pre(d, 4 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ stp(t3, t0, Address(d, 3 * unit));
          __ str(t2, Address(__ pre(d, 4 * unit)));
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ str(t1, Address(__ pre(d, 2 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ str(t0, Address(__ pre(d, 2 * unit)));
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;
    int unit = wordSize * direction;

    Label Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.
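    //
    // Added note (illustrative): each tbz below tests the bit of `count`
    // that corresponds to one power-of-two chunk, scaled by the element
    // granularity.  For a byte copy (granularity == 1) bit 3 selects an
    // 8-byte move, bit 2 a 4-byte move, bit 1 a 2-byte move and bit 0 a
    // single byte, so e.g. count == 13 (0b1101) performs an 8-, a 4- and
    // a 1-byte move.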

    __ tbz(count, 3 - exact_log2(granularity), Lword);
    __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
    __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }

  Label copy_f, copy_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //

  void copy_memory(bool is_aligned, Register s, Register d,
                   Register count, Register tmp, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    // <= 96 bytes do inline. Direction doesn't matter because we always
    // load all the data before writing anything
    Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
    const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
    const Register send = r17, dend = r16;

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
    __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16/granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64/granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32/granularity));
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 0));
      __ ldpq(v2, v3, Address(send, -32));
      __ stpq(v0, v1, Address(d, 0));
      __ stpq(v2, v3, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(send, -32));
      __ ldp(t6, t7, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(dend, -32));
      __ stp(t6, t7, Address(dend, -16));
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    __ ldp(t0, t1, Address(s, 0));
    __ ldp(t2, t3, Address(send, -16));
    __ stp(t0, t1, Address(d, 0));
    __ stp(t2, t3, Address(dend, -16));
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
      __ ldpq(v4, v5, Address(send, -32));
      __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
      __ stpq(v4, v5, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(s, 32));
      __ ldp(t6, t7, Address(s, 48));
      __ ldp(t8, t9, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(d, 32));
      __ stp(t6, t7, Address(d, 48));
      __ stp(t8, t9, Address(dend, -16));
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, u1(8/granularity));
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    __ ldr(t0, Address(s, 0));
    __ ldr(t1, Address(send, -8));
    __ str(t0, Address(d, 0));
    __ str(t1, Address(dend, -8));
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean in the 1 byte case we load/store the same
          // byte 3 times.
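          //
          // Added worked example: for count == 3 the three strb stores
          // below hit offsets 0 (first byte), 2 (last byte, dend - 1) and
          // count/2 == 1 (middle byte), covering all three; for count == 1
          // all three stores collapse onto offset 0.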
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }

    __ bind(copy_big);
    if (is_backwards) {
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now we've got the small case out of the way we can align the
    // source address on a 2-word boundary.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
      __ tbz(s, exact_log2(wordSize), aligned);
      __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
      __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
      __ sub(count, count, wordSize/granularity);
    } else {
      if (is_backwards) {
        __ andr(rscratch2, s, 2 * wordSize - 1);
      } else {
        __ neg(rscratch2, s);
        __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
      }
      // rscratch2 is the byte adjustment needed to align s.
      __ cbz(rscratch2, aligned);
      int shift = exact_log2(granularity);
      if (shift)  __ lsr(rscratch2, rscratch2, shift);
      __ sub(count, count, rscratch2);

#if 0
      // ?? This code is only correct for a disjoint copy.  It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, rscratch2);
        __ sub(d, d, rscratch2);
      } else {
        __ add(s, s, rscratch2);
        __ add(d, d, rscratch2);
      }
#else
      copy_memory_small(s, d, rscratch2, rscratch1, step);
#endif
    }

    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes.  Adjust the
    // count and do a bulk copy of words.
    __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
    if (direction == copy_forwards)
      __ bl(copy_f);
    else
      __ bl(copy_b);

    // And the tail.
    copy_memory_small(s, d, count, tmp, step);

    if (granularity >= 8) __ bind(copy8);
    if (granularity >= 4) __ bind(copy4);
    __ bind(finish);
  }


  void clobber_registers() {
#ifdef ASSERT
    __ mov(rscratch1, (uint64_t)0xdeadbeef);
    __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
    for (Register r = r3; r <= NOT_WIN64(r18) WIN64_ONLY(r17); r++)
      if (r != rscratch1) __ mov(r, rscratch1);
#endif

  }

  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers rscratch1 and rscratch2.
  void verify_oop_array (size_t size, Register a, Register count, Register temp) {
    Label loop, end;
    __ mov(rscratch1, a);
    __ mov(rscratch2, zr);
    __ bind(loop);
    __ cmp(rscratch2, count);
    __ br(Assembler::HS, end);
    if (size == (size_t)wordSize) {
      __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ verify_oop(temp);
    } else {
      __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ decode_heap_oop(temp); // calls verify_oop
    }
    __ add(rscratch2, rscratch2, size);
    __ b(loop);
    __ bind(end);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
                                 const char *name, bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_reg = RegSet::of(s, d, count);
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    {
      // UnsafeCopyMemory page error: continue after ucm
      bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
      UnsafeCopyMemoryMark ucmm(this, add_entry, true);
      copy_memory(aligned, s, d, count, rscratch1, size);
    }

    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
    }

    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());

    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
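  // Added note (illustrative): the conjoint stub below branches to the
  // disjoint (forward-copy) entry when (d - s), treated as an unsigned
  // value, is >= count * size.  That single unsigned comparison covers
  // both the d < s case (the subtraction wraps to a huge value, and a
  // forward copy is safe even if the ranges overlap) and the genuinely
  // non-overlapping d > s case, so only truly overlapping copies with
  // d > s take the backward path.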
  address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
                                 address *entry, const char *name,
                                 bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_regs = RegSet::of(s, d, count);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // use fwd copy when (d-s) above_equal (count*size)
    __ sub(rscratch1, d, s);
    __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
    __ br(Assembler::HS, nooverlap_target);

    DecoratorSet decorators = IN_HEAP | IS_ARRAY;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    {
      // UnsafeCopyMemory page error: continue after ucm
      bool add_entry = !is_oop && (!aligned || sizeof(jlong) == size);
      UnsafeCopyMemoryMark ucmm(this, add_entry, true);
      copy_memory(aligned, s, d, count, rscratch1, -size);
    }
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
    }
    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
  //
  address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                      address* entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_short_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_short_copy().
  //
  address generate_disjoint_short_copy(bool aligned,
                                       address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
                                       address *entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);

  }
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_int_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
                                     address *entry, const char *name,
                                     bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
  }


  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_long_copy(bool aligned, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_long_copy(bool aligned,
                                      address nooverlap_target, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_oop_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_oop_copy(bool aligned,
                                     address nooverlap_target, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
                                  name, dest_uninitialized);
  }


  // Helper for generating a dynamic type check.
  // Smashes rscratch1, rscratch2.
  void generate_type_check(Register sub_klass,
                           Register super_check_offset,
                           Register super_klass,
                           Label& L_success) {
    assert_different_registers(sub_klass, super_check_offset, super_klass);

    BLOCK_COMMENT("type_check:");

    Label L_miss;

    __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
                                     super_check_offset);
    __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);

    // Fall through on failure!
    __ BIND(L_miss);
  }

  //
  //  Generate checkcasting array copy stub
  //
  //  Input:
  //    c_rarg0   - source array address
  //    c_rarg1   - destination array address
  //    c_rarg2   - element count, treated as ssize_t, can be zero
  //    c_rarg3   - size_t ckoff (super_check_offset)
  //    c_rarg4   - oop ckval (super_klass)
  //
  //  Output:
  //    r0 ==  0  -  success
  //    r0 == -1^K - failure, where K is partial transfer count
  //
  address generate_checkcast_copy(const char *name, address *entry,
                                  bool dest_uninitialized = false) {

    Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;

    // Input registers (after setup_arg_regs)
    const Register from        = c_rarg0;   // source array address
    const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // elements count
    const Register ckoff       = c_rarg3;   // super_check_offset
    const Register ckval       = c_rarg4;   // super_klass

    RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
    RegSet wb_post_saved_regs = RegSet::of(count);

    // Registers used as temps (r19, r20, r21, r22 are save-on-entry)
    const Register copied_oop  = r22;       // actual oop copied
    const Register count_save  = r21;       // orig elements count
    const Register start_to    = r20;       // destination array start address
    const Register r19_klass   = r19;       // oop._klass

    //---------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the two arrays are subtypes of Object[] but the
    // destination array type is not equal to or a supertype
    // of the source type.  Each element must be separately
    // checked.
1732 1733 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1734 copied_oop, r19_klass, count_save); 1735 1736 __ align(CodeEntryAlignment); 1737 StubCodeMark mark(this, "StubRoutines", name); 1738 address start = __ pc(); 1739 1740 __ enter(); // required for proper stackwalking of RuntimeStub frame 1741 1742 #ifdef ASSERT 1743 // caller guarantees that the arrays really are different 1744 // otherwise, we would have to make conjoint checks 1745 { Label L; 1746 array_overlap_test(L, TIMES_OOP); 1747 __ stop("checkcast_copy within a single array"); 1748 __ bind(L); 1749 } 1750 #endif //ASSERT 1751 1752 // Caller of this entry point must set up the argument registers. 1753 if (entry != NULL) { 1754 *entry = __ pc(); 1755 BLOCK_COMMENT("Entry:"); 1756 } 1757 1758 // Empty array: Nothing to do. 1759 __ cbz(count, L_done); 1760 __ push(RegSet::of(r19, r20, r21, r22), sp); 1761 1762 #ifdef ASSERT 1763 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1764 // The ckoff and ckval must be mutually consistent, 1765 // even though caller generates both. 1766 { Label L; 1767 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1768 __ ldrw(start_to, Address(ckval, sco_offset)); 1769 __ cmpw(ckoff, start_to); 1770 __ br(Assembler::EQ, L); 1771 __ stop("super_check_offset inconsistent"); 1772 __ bind(L); 1773 } 1774 #endif //ASSERT 1775 1776 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 1777 bool is_oop = true; 1778 if (dest_uninitialized) { 1779 decorators |= IS_DEST_UNINITIALIZED; 1780 } 1781 1782 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1783 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs); 1784 1785 // save the original count 1786 __ mov(count_save, count); 1787 1788 // Copy from low to high addresses 1789 __ mov(start_to, to); // Save destination array start address 1790 __ b(L_load_element); 1791 1792 // ======== begin loop ======== 1793 // (Loop is rotated; its entry is L_load_element.) 1794 // Loop control: 1795 // for (; count != 0; count--) { 1796 // copied_oop = load_heap_oop(from++); 1797 // ... generate_type_check ...; 1798 // store_heap_oop(to++, copied_oop); 1799 // } 1800 __ align(OptoLoopAlignment); 1801 1802 __ BIND(L_store_element); 1803 __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW); // store the oop 1804 __ sub(count, count, 1); 1805 __ cbz(count, L_do_card_marks); 1806 1807 // ======== loop entry is here ======== 1808 __ BIND(L_load_element); 1809 __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop 1810 __ cbz(copied_oop, L_store_element); 1811 1812 __ load_klass(r19_klass, copied_oop);// query the object klass 1813 generate_type_check(r19_klass, ckoff, ckval, L_store_element); 1814 // ======== end loop ======== 1815 1816 // It was a real error; we must depend on the caller to finish the job. 1817 // Register count = remaining oops, count_orig = total oops. 1818 // Emit GC store barriers for the oops we have copied and report 1819 // their number to the caller. 
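    // Worked example of the encoding below (illustrative only): if
    // count_save == 10 and the type check fails with count == 7 elements
    // still uncopied, then K == 10 - 7 == 3, and the eon with zr produces
    // r0 == ~3 == -4.  A caller recovers K as ~r0.  The success path never
    // reaches this code and returns r0 == 0 at L_done.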
1820 1821 __ subs(count, count_save, count); // K = partially copied oop count 1822 __ eon(count, count, zr); // report (-1^K) to caller 1823 __ br(Assembler::EQ, L_done_pop); 1824 1825 __ BIND(L_do_card_marks); 1826 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs); 1827 1828 __ bind(L_done_pop); 1829 __ pop(RegSet::of(r19, r20, r21, r22), sp); 1830 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 1831 1832 __ bind(L_done); 1833 __ mov(r0, count); 1834 __ leave(); 1835 __ ret(lr); 1836 1837 return start; 1838 } 1839 1840 // Perform range checks on the proposed arraycopy. 1841 // Kills temp, but nothing else. 1842 // Also, clean the sign bits of src_pos and dst_pos. 1843 void arraycopy_range_checks(Register src, // source array oop (c_rarg0) 1844 Register src_pos, // source position (c_rarg1) 1845 Register dst, // destination array oo (c_rarg2) 1846 Register dst_pos, // destination position (c_rarg3) 1847 Register length, 1848 Register temp, 1849 Label& L_failed) { 1850 BLOCK_COMMENT("arraycopy_range_checks:"); 1851 1852 assert_different_registers(rscratch1, temp); 1853 1854 // if (src_pos + length > arrayOop(src)->length()) FAIL; 1855 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes())); 1856 __ addw(temp, length, src_pos); 1857 __ cmpw(temp, rscratch1); 1858 __ br(Assembler::HI, L_failed); 1859 1860 // if (dst_pos + length > arrayOop(dst)->length()) FAIL; 1861 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes())); 1862 __ addw(temp, length, dst_pos); 1863 __ cmpw(temp, rscratch1); 1864 __ br(Assembler::HI, L_failed); 1865 1866 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'. 1867 __ movw(src_pos, src_pos); 1868 __ movw(dst_pos, dst_pos); 1869 1870 BLOCK_COMMENT("arraycopy_range_checks done"); 1871 } 1872 1873 // These stubs get called from some dumb test routine. 1874 // I'll write them properly when they're called from 1875 // something that's actually doing something. 1876 static void fake_arraycopy_stub(address src, address dst, int count) { 1877 assert(count == 0, "huh?"); 1878 } 1879 1880 1881 // 1882 // Generate 'unsafe' array copy stub 1883 // Though just as safe as the other stubs, it takes an unscaled 1884 // size_t argument instead of an element count. 1885 // 1886 // Input: 1887 // c_rarg0 - source array address 1888 // c_rarg1 - destination array address 1889 // c_rarg2 - byte count, treated as ssize_t, can be zero 1890 // 1891 // Examines the alignment of the operands and dispatches 1892 // to a long, int, short, or byte copy loop. 
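  // Dispatch logic in C-like pseudocode (illustrative sketch; the generated
  // code below is authoritative):
  //
  //   uintptr_t bits = (uintptr_t)s | (uintptr_t)d | (uintptr_t)count;
  //   if ((bits & (BytesPerLong - 1)) == 0)      { count >>= 3; goto long_copy;  }
  //   else if ((bits & (BytesPerInt - 1)) == 0)  { count >>= 2; goto int_copy;   }
  //   else if ((bits & 1) == 0)                  { count >>= 1; goto short_copy; }
  //   else                                       {              goto byte_copy;  }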
1893 // 1894 address generate_unsafe_copy(const char *name, 1895 address byte_copy_entry, 1896 address short_copy_entry, 1897 address int_copy_entry, 1898 address long_copy_entry) { 1899 Label L_long_aligned, L_int_aligned, L_short_aligned; 1900 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1901 1902 __ align(CodeEntryAlignment); 1903 StubCodeMark mark(this, "StubRoutines", name); 1904 address start = __ pc(); 1905 __ enter(); // required for proper stackwalking of RuntimeStub frame 1906 1907 // bump this on entry, not on exit: 1908 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 1909 1910 __ orr(rscratch1, s, d); 1911 __ orr(rscratch1, rscratch1, count); 1912 1913 __ andr(rscratch1, rscratch1, BytesPerLong-1); 1914 __ cbz(rscratch1, L_long_aligned); 1915 __ andr(rscratch1, rscratch1, BytesPerInt-1); 1916 __ cbz(rscratch1, L_int_aligned); 1917 __ tbz(rscratch1, 0, L_short_aligned); 1918 __ b(RuntimeAddress(byte_copy_entry)); 1919 1920 __ BIND(L_short_aligned); 1921 __ lsr(count, count, LogBytesPerShort); // size => short_count 1922 __ b(RuntimeAddress(short_copy_entry)); 1923 __ BIND(L_int_aligned); 1924 __ lsr(count, count, LogBytesPerInt); // size => int_count 1925 __ b(RuntimeAddress(int_copy_entry)); 1926 __ BIND(L_long_aligned); 1927 __ lsr(count, count, LogBytesPerLong); // size => long_count 1928 __ b(RuntimeAddress(long_copy_entry)); 1929 1930 return start; 1931 } 1932 1933 // 1934 // Generate generic array copy stubs 1935 // 1936 // Input: 1937 // c_rarg0 - src oop 1938 // c_rarg1 - src_pos (32-bits) 1939 // c_rarg2 - dst oop 1940 // c_rarg3 - dst_pos (32-bits) 1941 // c_rarg4 - element count (32-bits) 1942 // 1943 // Output: 1944 // r0 == 0 - success 1945 // r0 == -1^K - failure, where K is partial transfer count 1946 // 1947 address generate_generic_copy(const char *name, 1948 address byte_copy_entry, address short_copy_entry, 1949 address int_copy_entry, address oop_copy_entry, 1950 address long_copy_entry, address checkcast_copy_entry) { 1951 1952 Label L_failed, L_objArray; 1953 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 1954 1955 // Input registers 1956 const Register src = c_rarg0; // source array oop 1957 const Register src_pos = c_rarg1; // source position 1958 const Register dst = c_rarg2; // destination array oop 1959 const Register dst_pos = c_rarg3; // destination position 1960 const Register length = c_rarg4; 1961 1962 1963 // Registers used as temps 1964 const Register dst_klass = c_rarg5; 1965 1966 __ align(CodeEntryAlignment); 1967 1968 StubCodeMark mark(this, "StubRoutines", name); 1969 1970 address start = __ pc(); 1971 1972 __ enter(); // required for proper stackwalking of RuntimeStub frame 1973 1974 // bump this on entry, not on exit: 1975 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 1976 1977 //----------------------------------------------------------------------- 1978 // Assembler stub will be used for this call to arraycopy 1979 // if the following conditions are met: 1980 // 1981 // (1) src and dst must not be null. 1982 // (2) src_pos must not be negative. 1983 // (3) dst_pos must not be negative. 1984 // (4) length must not be negative. 1985 // (5) src klass and dst klass should be the same and not NULL. 1986 // (6) src and dst should be arrays. 1987 // (7) src_pos + length must not exceed length of src. 1988 // (8) dst_pos + length must not exceed length of dst. 
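  // Equivalent guard in C-like pseudocode (illustrative simplification; the
  // objArray path additionally allows distinct but compatible element types
  // by tail-calling the checkcast stub):
  //
  //   if (src == NULL || dst == NULL)                        return -1;
  //   if (src_pos < 0 || dst_pos < 0 || length < 0)          return -1;
  //   if (src->klass() != dst->klass() || !src->is_array())  return -1;
  //   if ((uint)(src_pos + length) > (uint)src->length())    return -1;
  //   if ((uint)(dst_pos + length) > (uint)dst->length())    return -1;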
1989 // 1990 1991 // if (src == NULL) return -1; 1992 __ cbz(src, L_failed); 1993 1994 // if (src_pos < 0) return -1; 1995 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 1996 1997 // if (dst == NULL) return -1; 1998 __ cbz(dst, L_failed); 1999 2000 // if (dst_pos < 0) return -1; 2001 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2002 2003 // registers used as temp 2004 const Register scratch_length = r16; // elements count to copy 2005 const Register scratch_src_klass = r17; // array klass 2006 const Register lh = r15; // layout helper 2007 2008 // if (length < 0) return -1; 2009 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2010 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2011 2012 __ load_klass(scratch_src_klass, src); 2013 #ifdef ASSERT 2014 // assert(src->klass() != NULL); 2015 { 2016 BLOCK_COMMENT("assert klasses not null {"); 2017 Label L1, L2; 2018 __ cbnz(scratch_src_klass, L2); // it is broken if klass is NULL 2019 __ bind(L1); 2020 __ stop("broken null klass"); 2021 __ bind(L2); 2022 __ load_klass(rscratch1, dst); 2023 __ cbz(rscratch1, L1); // this would be broken also 2024 BLOCK_COMMENT("} assert klasses not null done"); 2025 } 2026 #endif 2027 2028 // Load layout helper (32-bits) 2029 // 2030 // |array_tag| | header_size | element_type | |log2_element_size| 2031 // 32 30 24 16 8 2 0 2032 // 2033 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2034 // 2035 2036 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2037 2038 // Handle objArrays completely differently... 2039 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2040 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2041 __ movw(rscratch1, objArray_lh); 2042 __ eorw(rscratch2, lh, rscratch1); 2043 __ cbzw(rscratch2, L_objArray); 2044 2045 // if (src->klass() != dst->klass()) return -1; 2046 __ load_klass(rscratch2, dst); 2047 __ eor(rscratch2, rscratch2, scratch_src_klass); 2048 __ cbnz(rscratch2, L_failed); 2049 2050 // if (!src->is_Array()) return -1; 2051 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2052 2053 // At this point, it is known to be a typeArray (array_tag 0x3). 
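    // Illustrative decode of the layout helper (example values for a jint[]
    // on a 64-bit build): array_tag == 0x3 (typeArray), element_type == T_INT,
    // log2_element_size == 2, and header_size == arrayOopDesc::
    // base_offset_in_bytes(T_INT), i.e. the byte offset of element 0, which
    // is what gets extracted and added to src/dst further below.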
2054 #ifdef ASSERT 2055 { 2056 BLOCK_COMMENT("assert primitive array {"); 2057 Label L; 2058 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2059 __ cmpw(lh, rscratch2); 2060 __ br(Assembler::GE, L); 2061 __ stop("must be a primitive array"); 2062 __ bind(L); 2063 BLOCK_COMMENT("} assert primitive array done"); 2064 } 2065 #endif 2066 2067 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2068 rscratch2, L_failed); 2069 2070 // TypeArrayKlass 2071 // 2072 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2073 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2074 // 2075 2076 const Register rscratch1_offset = rscratch1; // array offset 2077 const Register r15_elsize = lh; // element size 2078 2079 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2080 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2081 __ add(src, src, rscratch1_offset); // src array offset 2082 __ add(dst, dst, rscratch1_offset); // dst array offset 2083 BLOCK_COMMENT("choose copy loop based on element size"); 2084 2085 // next registers should be set before the jump to corresponding stub 2086 const Register from = c_rarg0; // source array address 2087 const Register to = c_rarg1; // destination array address 2088 const Register count = c_rarg2; // elements count 2089 2090 // 'from', 'to', 'count' registers should be set in such order 2091 // since they are the same as 'src', 'src_pos', 'dst'. 2092 2093 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2094 2095 // The possible values of elsize are 0-3, i.e. exact_log2(element 2096 // size in bytes). We do a simple bitwise binary search. 2097 __ BIND(L_copy_bytes); 2098 __ tbnz(r15_elsize, 1, L_copy_ints); 2099 __ tbnz(r15_elsize, 0, L_copy_shorts); 2100 __ lea(from, Address(src, src_pos));// src_addr 2101 __ lea(to, Address(dst, dst_pos));// dst_addr 2102 __ movw(count, scratch_length); // length 2103 __ b(RuntimeAddress(byte_copy_entry)); 2104 2105 __ BIND(L_copy_shorts); 2106 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2107 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2108 __ movw(count, scratch_length); // length 2109 __ b(RuntimeAddress(short_copy_entry)); 2110 2111 __ BIND(L_copy_ints); 2112 __ tbnz(r15_elsize, 0, L_copy_longs); 2113 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2114 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2115 __ movw(count, scratch_length); // length 2116 __ b(RuntimeAddress(int_copy_entry)); 2117 2118 __ BIND(L_copy_longs); 2119 #ifdef ASSERT 2120 { 2121 BLOCK_COMMENT("assert long copy {"); 2122 Label L; 2123 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r15_elsize 2124 __ cmpw(r15_elsize, LogBytesPerLong); 2125 __ br(Assembler::EQ, L); 2126 __ stop("must be long copy, but elsize is wrong"); 2127 __ bind(L); 2128 BLOCK_COMMENT("} assert long copy done"); 2129 } 2130 #endif 2131 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2132 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2133 __ movw(count, scratch_length); // length 2134 __ b(RuntimeAddress(long_copy_entry)); 2135 2136 // ObjArrayKlass 2137 __ BIND(L_objArray); 2138 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2139 2140 Label L_plain_copy, L_checkcast_copy; 2141 // test array classes for subtyping 2142 __ load_klass(r15, dst); 2143 __ cmp(scratch_src_klass, r15); // usual case is exact 
equality 2144 __ br(Assembler::NE, L_checkcast_copy); 2145 2146 // Identically typed arrays can be copied without element-wise checks. 2147 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2148 rscratch2, L_failed); 2149 2150 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2151 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2152 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2153 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2154 __ movw(count, scratch_length); // length 2155 __ BIND(L_plain_copy); 2156 __ b(RuntimeAddress(oop_copy_entry)); 2157 2158 __ BIND(L_checkcast_copy); 2159 // live at this point: scratch_src_klass, scratch_length, r15 (dst_klass) 2160 { 2161 // Before looking at dst.length, make sure dst is also an objArray. 2162 __ ldrw(rscratch1, Address(r15, lh_offset)); 2163 __ movw(rscratch2, objArray_lh); 2164 __ eorw(rscratch1, rscratch1, rscratch2); 2165 __ cbnzw(rscratch1, L_failed); 2166 2167 // It is safe to examine both src.length and dst.length. 2168 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2169 r15, L_failed); 2170 2171 __ load_klass(dst_klass, dst); // reload 2172 2173 // Marshal the base address arguments now, freeing registers. 2174 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2175 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2176 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2177 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2178 __ movw(count, length); // length (reloaded) 2179 Register sco_temp = c_rarg3; // this register is free now 2180 assert_different_registers(from, to, count, sco_temp, 2181 dst_klass, scratch_src_klass); 2182 // assert_clean_int(count, sco_temp); 2183 2184 // Generate the type check. 2185 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2186 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2187 2188 // Smashes rscratch1, rscratch2 2189 generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy); 2190 2191 // Fetch destination element klass from the ObjArrayKlass header. 2192 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2193 __ ldr(dst_klass, Address(dst_klass, ek_offset)); 2194 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2195 2196 // the checkcast_copy loop needs two extra arguments: 2197 assert(c_rarg3 == sco_temp, "#3 already in place"); 2198 // Set up arguments for checkcast_copy_entry. 2199 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass 2200 __ b(RuntimeAddress(checkcast_copy_entry)); 2201 } 2202 2203 __ BIND(L_failed); 2204 __ mov(r0, -1); 2205 __ leave(); // required for proper stackwalking of RuntimeStub frame 2206 __ ret(lr); 2207 2208 return start; 2209 } 2210 2211 // 2212 // Generate stub for array fill. If "aligned" is true, the 2213 // "to" address is assumed to be heapword aligned. 
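  // Conceptually the stub implements (illustrative C sketch for the T_BYTE
  // variant, not the generated code; T_SHORT/T_INT are analogous):
  //
  //   void fill(jbyte* to, jbyte value, int count) {
  //     for (int i = 0; i < count; i++) to[i] = value;
  //   }
  //
  // but with 'value' replicated to 32 and then 64 bits so the bulk of the
  // work is done with 8-byte stores (or zero_words when the fill value is
  // zero and UseBlockZeroing is enabled).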
2214 // 2215 // Arguments for generated stub: 2216 // to: c_rarg0 2217 // value: c_rarg1 2218 // count: c_rarg2 treated as signed 2219 // 2220 address generate_fill(BasicType t, bool aligned, const char *name) { 2221 __ align(CodeEntryAlignment); 2222 StubCodeMark mark(this, "StubRoutines", name); 2223 address start = __ pc(); 2224 2225 BLOCK_COMMENT("Entry:"); 2226 2227 const Register to = c_rarg0; // source array address 2228 const Register value = c_rarg1; // value 2229 const Register count = c_rarg2; // elements count 2230 2231 const Register bz_base = r10; // base for block_zero routine 2232 const Register cnt_words = r11; // temp register 2233 2234 __ enter(); 2235 2236 Label L_fill_elements, L_exit1; 2237 2238 int shift = -1; 2239 switch (t) { 2240 case T_BYTE: 2241 shift = 0; 2242 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2243 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2244 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2245 __ br(Assembler::LO, L_fill_elements); 2246 break; 2247 case T_SHORT: 2248 shift = 1; 2249 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2250 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2251 __ br(Assembler::LO, L_fill_elements); 2252 break; 2253 case T_INT: 2254 shift = 2; 2255 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2256 __ br(Assembler::LO, L_fill_elements); 2257 break; 2258 default: ShouldNotReachHere(); 2259 } 2260 2261 // Align source address at 8 bytes address boundary. 2262 Label L_skip_align1, L_skip_align2, L_skip_align4; 2263 if (!aligned) { 2264 switch (t) { 2265 case T_BYTE: 2266 // One byte misalignment happens only for byte arrays. 2267 __ tbz(to, 0, L_skip_align1); 2268 __ strb(value, Address(__ post(to, 1))); 2269 __ subw(count, count, 1); 2270 __ bind(L_skip_align1); 2271 // Fallthrough 2272 case T_SHORT: 2273 // Two bytes misalignment happens only for byte and short (char) arrays. 2274 __ tbz(to, 1, L_skip_align2); 2275 __ strh(value, Address(__ post(to, 2))); 2276 __ subw(count, count, 2 >> shift); 2277 __ bind(L_skip_align2); 2278 // Fallthrough 2279 case T_INT: 2280 // Align to 8 bytes, we know we are 4 byte aligned to start. 2281 __ tbz(to, 2, L_skip_align4); 2282 __ strw(value, Address(__ post(to, 4))); 2283 __ subw(count, count, 4 >> shift); 2284 __ bind(L_skip_align4); 2285 break; 2286 default: ShouldNotReachHere(); 2287 } 2288 } 2289 2290 // 2291 // Fill large chunks 2292 // 2293 __ lsrw(cnt_words, count, 3 - shift); // number of words 2294 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2295 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2296 if (UseBlockZeroing) { 2297 Label non_block_zeroing, rest; 2298 // If the fill value is zero we can use the fast zero_words(). 2299 __ cbnz(value, non_block_zeroing); 2300 __ mov(bz_base, to); 2301 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2302 __ zero_words(bz_base, cnt_words); 2303 __ b(rest); 2304 __ bind(non_block_zeroing); 2305 __ fill_words(to, cnt_words, value); 2306 __ bind(rest); 2307 } else { 2308 __ fill_words(to, cnt_words, value); 2309 } 2310 2311 // Remaining count is less than 8 bytes. Fill it by a single store. 2312 // Note that the total length is no less than 8 bytes. 
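    // The tail handling below deliberately overlaps data already written by
    // the main loop: 'to' is advanced to the end of the array and the
    // replicated 64-bit 'value' is stored at end - 8.  Because the register
    // holds the fill pattern repeated across its lanes, rewriting element
    // positions that were already filled stores the same bytes again, so no
    // per-element tail loop is needed.  (This relies on the note above that
    // at least 8 bytes are being filled; shorter requests were diverted to
    // L_fill_elements.)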
2313 if (t == T_BYTE || t == T_SHORT) { 2314 Label L_exit1; 2315 __ cbzw(count, L_exit1); 2316 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2317 __ str(value, Address(to, -8)); // overwrite some elements 2318 __ bind(L_exit1); 2319 __ leave(); 2320 __ ret(lr); 2321 } 2322 2323 // Handle copies less than 8 bytes. 2324 Label L_fill_2, L_fill_4, L_exit2; 2325 __ bind(L_fill_elements); 2326 switch (t) { 2327 case T_BYTE: 2328 __ tbz(count, 0, L_fill_2); 2329 __ strb(value, Address(__ post(to, 1))); 2330 __ bind(L_fill_2); 2331 __ tbz(count, 1, L_fill_4); 2332 __ strh(value, Address(__ post(to, 2))); 2333 __ bind(L_fill_4); 2334 __ tbz(count, 2, L_exit2); 2335 __ strw(value, Address(to)); 2336 break; 2337 case T_SHORT: 2338 __ tbz(count, 0, L_fill_4); 2339 __ strh(value, Address(__ post(to, 2))); 2340 __ bind(L_fill_4); 2341 __ tbz(count, 1, L_exit2); 2342 __ strw(value, Address(to)); 2343 break; 2344 case T_INT: 2345 __ cbzw(count, L_exit2); 2346 __ strw(value, Address(to)); 2347 break; 2348 default: ShouldNotReachHere(); 2349 } 2350 __ bind(L_exit2); 2351 __ leave(); 2352 __ ret(lr); 2353 return start; 2354 } 2355 2356 address generate_data_cache_writeback() { 2357 const Register line = c_rarg0; // address of line to write back 2358 2359 __ align(CodeEntryAlignment); 2360 2361 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback"); 2362 2363 address start = __ pc(); 2364 __ enter(); 2365 __ cache_wb(Address(line, 0)); 2366 __ leave(); 2367 __ ret(lr); 2368 2369 return start; 2370 } 2371 2372 address generate_data_cache_writeback_sync() { 2373 const Register is_pre = c_rarg0; // pre or post sync 2374 2375 __ align(CodeEntryAlignment); 2376 2377 StubCodeMark mark(this, "StubRoutines", "_data_cache_writeback_sync"); 2378 2379 // pre wbsync is a no-op 2380 // post wbsync translates to an sfence 2381 2382 Label skip; 2383 address start = __ pc(); 2384 __ enter(); 2385 __ cbnz(is_pre, skip); 2386 __ cache_wbsync(false); 2387 __ bind(skip); 2388 __ leave(); 2389 __ ret(lr); 2390 2391 return start; 2392 } 2393 2394 void generate_arraycopy_stubs() { 2395 address entry; 2396 address entry_jbyte_arraycopy; 2397 address entry_jshort_arraycopy; 2398 address entry_jint_arraycopy; 2399 address entry_oop_arraycopy; 2400 address entry_jlong_arraycopy; 2401 address entry_checkcast_arraycopy; 2402 2403 generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards); 2404 generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards); 2405 2406 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2407 2408 //*** jbyte 2409 // Always need aligned and unaligned versions 2410 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2411 "jbyte_disjoint_arraycopy"); 2412 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2413 &entry_jbyte_arraycopy, 2414 "jbyte_arraycopy"); 2415 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2416 "arrayof_jbyte_disjoint_arraycopy"); 2417 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL, 2418 "arrayof_jbyte_arraycopy"); 2419 2420 //*** jshort 2421 // Always need aligned and unaligned versions 2422 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2423 "jshort_disjoint_arraycopy"); 2424 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2425 &entry_jshort_arraycopy, 2426 "jshort_arraycopy"); 2427 StubRoutines::_arrayof_jshort_disjoint_arraycopy = 
generate_disjoint_short_copy(true, &entry, 2428 "arrayof_jshort_disjoint_arraycopy"); 2429 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL, 2430 "arrayof_jshort_arraycopy"); 2431 2432 //*** jint 2433 // Aligned versions 2434 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2435 "arrayof_jint_disjoint_arraycopy"); 2436 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2437 "arrayof_jint_arraycopy"); 2438 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 2439 // entry_jint_arraycopy always points to the unaligned version 2440 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2441 "jint_disjoint_arraycopy"); 2442 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2443 &entry_jint_arraycopy, 2444 "jint_arraycopy"); 2445 2446 //*** jlong 2447 // It is always aligned 2448 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2449 "arrayof_jlong_disjoint_arraycopy"); 2450 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2451 "arrayof_jlong_arraycopy"); 2452 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2453 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2454 2455 //*** oops 2456 { 2457 // With compressed oops we need unaligned versions; notice that 2458 // we overwrite entry_oop_arraycopy. 2459 bool aligned = !UseCompressedOops; 2460 2461 StubRoutines::_arrayof_oop_disjoint_arraycopy 2462 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2463 /*dest_uninitialized*/false); 2464 StubRoutines::_arrayof_oop_arraycopy 2465 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2466 /*dest_uninitialized*/false); 2467 // Aligned versions without pre-barriers 2468 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2469 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2470 /*dest_uninitialized*/true); 2471 StubRoutines::_arrayof_oop_arraycopy_uninit 2472 = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit", 2473 /*dest_uninitialized*/true); 2474 } 2475 2476 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2477 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2478 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2479 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2480 2481 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2482 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, 2483 /*dest_uninitialized*/true); 2484 2485 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2486 entry_jbyte_arraycopy, 2487 entry_jshort_arraycopy, 2488 entry_jint_arraycopy, 2489 entry_jlong_arraycopy); 2490 2491 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2492 entry_jbyte_arraycopy, 2493 entry_jshort_arraycopy, 2494 entry_jint_arraycopy, 2495 entry_oop_arraycopy, 2496 entry_jlong_arraycopy, 2497 entry_checkcast_arraycopy); 2498 2499 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 
2500 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2501 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2502 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2503 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 2504 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2505 } 2506 2507 void generate_math_stubs() { Unimplemented(); } 2508 2509 // Arguments: 2510 // 2511 // Inputs: 2512 // c_rarg0 - source byte array address 2513 // c_rarg1 - destination byte array address 2514 // c_rarg2 - K (key) in little endian int array 2515 // 2516 address generate_aescrypt_encryptBlock() { 2517 __ align(CodeEntryAlignment); 2518 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 2519 2520 Label L_doLast; 2521 2522 const Register from = c_rarg0; // source array address 2523 const Register to = c_rarg1; // destination array address 2524 const Register key = c_rarg2; // key array address 2525 const Register keylen = rscratch1; 2526 2527 address start = __ pc(); 2528 __ enter(); 2529 2530 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2531 2532 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2533 2534 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2535 __ rev32(v1, __ T16B, v1); 2536 __ rev32(v2, __ T16B, v2); 2537 __ rev32(v3, __ T16B, v3); 2538 __ rev32(v4, __ T16B, v4); 2539 __ aese(v0, v1); 2540 __ aesmc(v0, v0); 2541 __ aese(v0, v2); 2542 __ aesmc(v0, v0); 2543 __ aese(v0, v3); 2544 __ aesmc(v0, v0); 2545 __ aese(v0, v4); 2546 __ aesmc(v0, v0); 2547 2548 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2549 __ rev32(v1, __ T16B, v1); 2550 __ rev32(v2, __ T16B, v2); 2551 __ rev32(v3, __ T16B, v3); 2552 __ rev32(v4, __ T16B, v4); 2553 __ aese(v0, v1); 2554 __ aesmc(v0, v0); 2555 __ aese(v0, v2); 2556 __ aesmc(v0, v0); 2557 __ aese(v0, v3); 2558 __ aesmc(v0, v0); 2559 __ aese(v0, v4); 2560 __ aesmc(v0, v0); 2561 2562 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2563 __ rev32(v1, __ T16B, v1); 2564 __ rev32(v2, __ T16B, v2); 2565 2566 __ cmpw(keylen, 44); 2567 __ br(Assembler::EQ, L_doLast); 2568 2569 __ aese(v0, v1); 2570 __ aesmc(v0, v0); 2571 __ aese(v0, v2); 2572 __ aesmc(v0, v0); 2573 2574 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2575 __ rev32(v1, __ T16B, v1); 2576 __ rev32(v2, __ T16B, v2); 2577 2578 __ cmpw(keylen, 52); 2579 __ br(Assembler::EQ, L_doLast); 2580 2581 __ aese(v0, v1); 2582 __ aesmc(v0, v0); 2583 __ aese(v0, v2); 2584 __ aesmc(v0, v0); 2585 2586 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2587 __ rev32(v1, __ T16B, v1); 2588 __ rev32(v2, __ T16B, v2); 2589 2590 __ BIND(L_doLast); 2591 2592 __ aese(v0, v1); 2593 __ aesmc(v0, v0); 2594 __ aese(v0, v2); 2595 2596 __ ld1(v1, __ T16B, key); 2597 __ rev32(v1, __ T16B, v1); 2598 __ eor(v0, __ T16B, v0, v1); 2599 2600 __ st1(v0, __ T16B, to); 2601 2602 __ mov(r0, 0); 2603 2604 __ leave(); 2605 __ ret(lr); 2606 2607 return start; 2608 } 2609 2610 // Arguments: 2611 // 2612 // Inputs: 2613 // c_rarg0 - source byte array address 2614 // c_rarg1 - destination byte array address 2615 // c_rarg2 - K (key) in little endian int array 2616 // 2617 address generate_aescrypt_decryptBlock() { 2618 assert(UseAES, "need AES instructions and misaligned SSE support"); 2619 __ align(CodeEntryAlignment); 2620 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 2621 Label L_doLast; 2622 2623 const 
Register from = c_rarg0; // source array address 2624 const Register to = c_rarg1; // destination array address 2625 const Register key = c_rarg2; // key array address 2626 const Register keylen = rscratch1; 2627 2628 address start = __ pc(); 2629 __ enter(); // required for proper stackwalking of RuntimeStub frame 2630 2631 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2632 2633 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2634 2635 __ ld1(v5, __ T16B, __ post(key, 16)); 2636 __ rev32(v5, __ T16B, v5); 2637 2638 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2639 __ rev32(v1, __ T16B, v1); 2640 __ rev32(v2, __ T16B, v2); 2641 __ rev32(v3, __ T16B, v3); 2642 __ rev32(v4, __ T16B, v4); 2643 __ aesd(v0, v1); 2644 __ aesimc(v0, v0); 2645 __ aesd(v0, v2); 2646 __ aesimc(v0, v0); 2647 __ aesd(v0, v3); 2648 __ aesimc(v0, v0); 2649 __ aesd(v0, v4); 2650 __ aesimc(v0, v0); 2651 2652 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2653 __ rev32(v1, __ T16B, v1); 2654 __ rev32(v2, __ T16B, v2); 2655 __ rev32(v3, __ T16B, v3); 2656 __ rev32(v4, __ T16B, v4); 2657 __ aesd(v0, v1); 2658 __ aesimc(v0, v0); 2659 __ aesd(v0, v2); 2660 __ aesimc(v0, v0); 2661 __ aesd(v0, v3); 2662 __ aesimc(v0, v0); 2663 __ aesd(v0, v4); 2664 __ aesimc(v0, v0); 2665 2666 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2667 __ rev32(v1, __ T16B, v1); 2668 __ rev32(v2, __ T16B, v2); 2669 2670 __ cmpw(keylen, 44); 2671 __ br(Assembler::EQ, L_doLast); 2672 2673 __ aesd(v0, v1); 2674 __ aesimc(v0, v0); 2675 __ aesd(v0, v2); 2676 __ aesimc(v0, v0); 2677 2678 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2679 __ rev32(v1, __ T16B, v1); 2680 __ rev32(v2, __ T16B, v2); 2681 2682 __ cmpw(keylen, 52); 2683 __ br(Assembler::EQ, L_doLast); 2684 2685 __ aesd(v0, v1); 2686 __ aesimc(v0, v0); 2687 __ aesd(v0, v2); 2688 __ aesimc(v0, v0); 2689 2690 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2691 __ rev32(v1, __ T16B, v1); 2692 __ rev32(v2, __ T16B, v2); 2693 2694 __ BIND(L_doLast); 2695 2696 __ aesd(v0, v1); 2697 __ aesimc(v0, v0); 2698 __ aesd(v0, v2); 2699 2700 __ eor(v0, __ T16B, v0, v5); 2701 2702 __ st1(v0, __ T16B, to); 2703 2704 __ mov(r0, 0); 2705 2706 __ leave(); 2707 __ ret(lr); 2708 2709 return start; 2710 } 2711 2712 // Arguments: 2713 // 2714 // Inputs: 2715 // c_rarg0 - source byte array address 2716 // c_rarg1 - destination byte array address 2717 // c_rarg2 - K (key) in little endian int array 2718 // c_rarg3 - r vector byte array address 2719 // c_rarg4 - input length 2720 // 2721 // Output: 2722 // x0 - input length 2723 // 2724 address generate_cipherBlockChaining_encryptAESCrypt() { 2725 assert(UseAES, "need AES instructions and misaligned SSE support"); 2726 __ align(CodeEntryAlignment); 2727 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 2728 2729 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2730 2731 const Register from = c_rarg0; // source array address 2732 const Register to = c_rarg1; // destination array address 2733 const Register key = c_rarg2; // key array address 2734 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2735 // and left with the results of the last encryption block 2736 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2737 const Register keylen = rscratch1; 2738 2739 address start = __ pc(); 2740 2741 __ enter(); 2742 2743 __ movw(rscratch2, len_reg); 2744 2745 __ ldrw(keylen, Address(key, 
arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2746 2747 __ ld1(v0, __ T16B, rvec); 2748 2749 __ cmpw(keylen, 52); 2750 __ br(Assembler::CC, L_loadkeys_44); 2751 __ br(Assembler::EQ, L_loadkeys_52); 2752 2753 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2754 __ rev32(v17, __ T16B, v17); 2755 __ rev32(v18, __ T16B, v18); 2756 __ BIND(L_loadkeys_52); 2757 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2758 __ rev32(v19, __ T16B, v19); 2759 __ rev32(v20, __ T16B, v20); 2760 __ BIND(L_loadkeys_44); 2761 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2762 __ rev32(v21, __ T16B, v21); 2763 __ rev32(v22, __ T16B, v22); 2764 __ rev32(v23, __ T16B, v23); 2765 __ rev32(v24, __ T16B, v24); 2766 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2767 __ rev32(v25, __ T16B, v25); 2768 __ rev32(v26, __ T16B, v26); 2769 __ rev32(v27, __ T16B, v27); 2770 __ rev32(v28, __ T16B, v28); 2771 __ ld1(v29, v30, v31, __ T16B, key); 2772 __ rev32(v29, __ T16B, v29); 2773 __ rev32(v30, __ T16B, v30); 2774 __ rev32(v31, __ T16B, v31); 2775 2776 __ BIND(L_aes_loop); 2777 __ ld1(v1, __ T16B, __ post(from, 16)); 2778 __ eor(v0, __ T16B, v0, v1); 2779 2780 __ br(Assembler::CC, L_rounds_44); 2781 __ br(Assembler::EQ, L_rounds_52); 2782 2783 __ aese(v0, v17); __ aesmc(v0, v0); 2784 __ aese(v0, v18); __ aesmc(v0, v0); 2785 __ BIND(L_rounds_52); 2786 __ aese(v0, v19); __ aesmc(v0, v0); 2787 __ aese(v0, v20); __ aesmc(v0, v0); 2788 __ BIND(L_rounds_44); 2789 __ aese(v0, v21); __ aesmc(v0, v0); 2790 __ aese(v0, v22); __ aesmc(v0, v0); 2791 __ aese(v0, v23); __ aesmc(v0, v0); 2792 __ aese(v0, v24); __ aesmc(v0, v0); 2793 __ aese(v0, v25); __ aesmc(v0, v0); 2794 __ aese(v0, v26); __ aesmc(v0, v0); 2795 __ aese(v0, v27); __ aesmc(v0, v0); 2796 __ aese(v0, v28); __ aesmc(v0, v0); 2797 __ aese(v0, v29); __ aesmc(v0, v0); 2798 __ aese(v0, v30); 2799 __ eor(v0, __ T16B, v0, v31); 2800 2801 __ st1(v0, __ T16B, __ post(to, 16)); 2802 2803 __ subw(len_reg, len_reg, 16); 2804 __ cbnzw(len_reg, L_aes_loop); 2805 2806 __ st1(v0, __ T16B, rvec); 2807 2808 __ mov(r0, rscratch2); 2809 2810 __ leave(); 2811 __ ret(lr); 2812 2813 return start; 2814 } 2815 2816 // Arguments: 2817 // 2818 // Inputs: 2819 // c_rarg0 - source byte array address 2820 // c_rarg1 - destination byte array address 2821 // c_rarg2 - K (key) in little endian int array 2822 // c_rarg3 - r vector byte array address 2823 // c_rarg4 - input length 2824 // 2825 // Output: 2826 // r0 - input length 2827 // 2828 address generate_cipherBlockChaining_decryptAESCrypt() { 2829 assert(UseAES, "need AES instructions and misaligned SSE support"); 2830 __ align(CodeEntryAlignment); 2831 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 2832 2833 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2834 2835 const Register from = c_rarg0; // source array address 2836 const Register to = c_rarg1; // destination array address 2837 const Register key = c_rarg2; // key array address 2838 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2839 // and left with the results of the last encryption block 2840 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2841 const Register keylen = rscratch1; 2842 2843 address start = __ pc(); 2844 2845 __ enter(); 2846 2847 __ movw(rscratch2, len_reg); 2848 2849 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2850 2851 __ 
ld1(v2, __ T16B, rvec); 2852 2853 __ ld1(v31, __ T16B, __ post(key, 16)); 2854 __ rev32(v31, __ T16B, v31); 2855 2856 __ cmpw(keylen, 52); 2857 __ br(Assembler::CC, L_loadkeys_44); 2858 __ br(Assembler::EQ, L_loadkeys_52); 2859 2860 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2861 __ rev32(v17, __ T16B, v17); 2862 __ rev32(v18, __ T16B, v18); 2863 __ BIND(L_loadkeys_52); 2864 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2865 __ rev32(v19, __ T16B, v19); 2866 __ rev32(v20, __ T16B, v20); 2867 __ BIND(L_loadkeys_44); 2868 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2869 __ rev32(v21, __ T16B, v21); 2870 __ rev32(v22, __ T16B, v22); 2871 __ rev32(v23, __ T16B, v23); 2872 __ rev32(v24, __ T16B, v24); 2873 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2874 __ rev32(v25, __ T16B, v25); 2875 __ rev32(v26, __ T16B, v26); 2876 __ rev32(v27, __ T16B, v27); 2877 __ rev32(v28, __ T16B, v28); 2878 __ ld1(v29, v30, __ T16B, key); 2879 __ rev32(v29, __ T16B, v29); 2880 __ rev32(v30, __ T16B, v30); 2881 2882 __ BIND(L_aes_loop); 2883 __ ld1(v0, __ T16B, __ post(from, 16)); 2884 __ orr(v1, __ T16B, v0, v0); 2885 2886 __ br(Assembler::CC, L_rounds_44); 2887 __ br(Assembler::EQ, L_rounds_52); 2888 2889 __ aesd(v0, v17); __ aesimc(v0, v0); 2890 __ aesd(v0, v18); __ aesimc(v0, v0); 2891 __ BIND(L_rounds_52); 2892 __ aesd(v0, v19); __ aesimc(v0, v0); 2893 __ aesd(v0, v20); __ aesimc(v0, v0); 2894 __ BIND(L_rounds_44); 2895 __ aesd(v0, v21); __ aesimc(v0, v0); 2896 __ aesd(v0, v22); __ aesimc(v0, v0); 2897 __ aesd(v0, v23); __ aesimc(v0, v0); 2898 __ aesd(v0, v24); __ aesimc(v0, v0); 2899 __ aesd(v0, v25); __ aesimc(v0, v0); 2900 __ aesd(v0, v26); __ aesimc(v0, v0); 2901 __ aesd(v0, v27); __ aesimc(v0, v0); 2902 __ aesd(v0, v28); __ aesimc(v0, v0); 2903 __ aesd(v0, v29); __ aesimc(v0, v0); 2904 __ aesd(v0, v30); 2905 __ eor(v0, __ T16B, v0, v31); 2906 __ eor(v0, __ T16B, v0, v2); 2907 2908 __ st1(v0, __ T16B, __ post(to, 16)); 2909 __ orr(v2, __ T16B, v1, v1); 2910 2911 __ subw(len_reg, len_reg, 16); 2912 __ cbnzw(len_reg, L_aes_loop); 2913 2914 __ st1(v2, __ T16B, rvec); 2915 2916 __ mov(r0, rscratch2); 2917 2918 __ leave(); 2919 __ ret(lr); 2920 2921 return start; 2922 } 2923 2924 // Arguments: 2925 // 2926 // Inputs: 2927 // c_rarg0 - byte[] source+offset 2928 // c_rarg1 - int[] SHA.state 2929 // c_rarg2 - int offset 2930 // c_rarg3 - int limit 2931 // 2932 address generate_sha1_implCompress(bool multi_block, const char *name) { 2933 __ align(CodeEntryAlignment); 2934 StubCodeMark mark(this, "StubRoutines", name); 2935 address start = __ pc(); 2936 2937 Register buf = c_rarg0; 2938 Register state = c_rarg1; 2939 Register ofs = c_rarg2; 2940 Register limit = c_rarg3; 2941 2942 Label keys; 2943 Label sha1_loop; 2944 2945 // load the keys into v0..v3 2946 __ adr(rscratch1, keys); 2947 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 2948 // load 5 words state into v6, v7 2949 __ ldrq(v6, Address(state, 0)); 2950 __ ldrs(v7, Address(state, 16)); 2951 2952 2953 __ BIND(sha1_loop); 2954 // load 64 bytes of data into v16..v19 2955 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 2956 __ rev32(v16, __ T16B, v16); 2957 __ rev32(v17, __ T16B, v17); 2958 __ rev32(v18, __ T16B, v18); 2959 __ rev32(v19, __ T16B, v19); 2960 2961 // do the sha1 2962 __ addv(v4, __ T4S, v16, v0); 2963 __ orr(v20, __ T16B, v6, v6); 2964 2965 FloatRegister d0 = v16; 2966 FloatRegister d1 = v17; 2967 FloatRegister d2 = v18; 2968 FloatRegister d3 = v19; 2969 2970 for (int round = 0; round < 20; round++) { 2971 FloatRegister tmp1 = (round & 1) ? v4 : v5; 2972 FloatRegister tmp2 = (round & 1) ? v21 : v22; 2973 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 2974 FloatRegister tmp4 = (round & 1) ? v5 : v4; 2975 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? v2 : v3)); 2976 2977 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 2978 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 2979 __ sha1h(tmp2, __ T4S, v20); 2980 if (round < 5) 2981 __ sha1c(v20, __ T4S, tmp3, tmp4); 2982 else if (round < 10 || round >= 15) 2983 __ sha1p(v20, __ T4S, tmp3, tmp4); 2984 else 2985 __ sha1m(v20, __ T4S, tmp3, tmp4); 2986 if (round < 16) __ sha1su1(d0, __ T4S, d3); 2987 2988 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 2989 } 2990 2991 __ addv(v7, __ T2S, v7, v21); 2992 __ addv(v6, __ T4S, v6, v20); 2993 2994 if (multi_block) { 2995 __ add(ofs, ofs, 64); 2996 __ cmp(ofs, limit); 2997 __ br(Assembler::LE, sha1_loop); 2998 __ mov(c_rarg0, ofs); // return ofs 2999 } 3000 3001 __ strq(v6, Address(state, 0)); 3002 __ strs(v7, Address(state, 16)); 3003 3004 __ ret(lr); 3005 3006 __ bind(keys); 3007 __ emit_int32(0x5a827999); 3008 __ emit_int32(0x6ed9eba1); 3009 __ emit_int32(0x8f1bbcdc); 3010 __ emit_int32(0xca62c1d6); 3011 3012 return start; 3013 } 3014 3015 3016 // Arguments: 3017 // 3018 // Inputs: 3019 // c_rarg0 - byte[] source+offset 3020 // c_rarg1 - int[] SHA.state 3021 // c_rarg2 - int offset 3022 // c_rarg3 - int limit 3023 // 3024 address generate_sha256_implCompress(bool multi_block, const char *name) { 3025 static const uint32_t round_consts[64] = { 3026 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3027 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3028 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3029 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3030 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3031 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3032 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3033 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3034 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3035 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3036 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3037 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3038 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3039 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3040 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3041 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3042 }; 3043 __ align(CodeEntryAlignment); 3044 StubCodeMark mark(this, "StubRoutines", name); 3045 address start = __ pc(); 3046 3047 Register buf = c_rarg0; 3048 Register state = c_rarg1; 3049 Register ofs = c_rarg2; 3050 Register limit = c_rarg3; 3051 3052 Label sha1_loop; 3053 3054 __ stpd(v8, v9, __ pre(sp, -32)); 3055 __ stpd(v10, v11, Address(sp, 16)); 3056 3057 // dga == v0 3058 // dgb == v1 3059 // dg0 == v2 3060 // dg1 == v3 3061 // dg2 == v4 3062 // t0 == v6 3063 // t1 == v7 3064 3065 // load 16 keys to v16..v31 3066 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3067 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3068 __ 
ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3069 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3070 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3071 3072 // load 8 words (256 bits) state 3073 __ ldpq(v0, v1, state); 3074 3075 __ BIND(sha1_loop); 3076 // load 64 bytes of data into v8..v11 3077 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? __ post(buf, 64) : buf); 3078 __ rev32(v8, __ T16B, v8); 3079 __ rev32(v9, __ T16B, v9); 3080 __ rev32(v10, __ T16B, v10); 3081 __ rev32(v11, __ T16B, v11); 3082 3083 __ addv(v6, __ T4S, v8, v16); 3084 __ orr(v2, __ T16B, v0, v0); 3085 __ orr(v3, __ T16B, v1, v1); 3086 3087 FloatRegister d0 = v8; 3088 FloatRegister d1 = v9; 3089 FloatRegister d2 = v10; 3090 FloatRegister d3 = v11; 3091 3092 3093 for (int round = 0; round < 16; round++) { 3094 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3095 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3096 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3097 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3098 3099 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3100 __ orr(v4, __ T16B, v2, v2); 3101 if (round < 15) 3102 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3103 __ sha256h(v2, __ T4S, v3, tmp2); 3104 __ sha256h2(v3, __ T4S, v4, tmp2); 3105 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3106 3107 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3108 } 3109 3110 __ addv(v0, __ T4S, v0, v2); 3111 __ addv(v1, __ T4S, v1, v3); 3112 3113 if (multi_block) { 3114 __ add(ofs, ofs, 64); 3115 __ cmp(ofs, limit); 3116 __ br(Assembler::LE, sha1_loop); 3117 __ mov(c_rarg0, ofs); // return ofs 3118 } 3119 3120 __ ldpd(v10, v11, Address(sp, 16)); 3121 __ ldpd(v8, v9, __ post(sp, 32)); 3122 3123 __ stpq(v0, v1, state); 3124 3125 __ ret(lr); 3126 3127 return start; 3128 } 3129 3130 // Safefetch stubs. 3131 void generate_safefetch(const char* name, int size, address* entry, 3132 address* fault_pc, address* continuation_pc) { 3133 // safefetch signatures: 3134 // int SafeFetch32(int* adr, int errValue); 3135 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue); 3136 // 3137 // arguments: 3138 // c_rarg0 = adr 3139 // c_rarg1 = errValue 3140 // 3141 // result: 3142 // PPC_RET = *adr or errValue 3143 3144 StubCodeMark mark(this, "StubRoutines", name); 3145 3146 // Entry point, pc or function descriptor. 3147 *entry = __ pc(); 3148 3149 // Load *adr into c_rarg1, may fault. 
3150 *fault_pc = __ pc(); 3151 switch (size) { 3152 case 4: 3153 // int32_t 3154 __ ldrw(c_rarg1, Address(c_rarg0, 0)); 3155 break; 3156 case 8: 3157 // int64_t 3158 __ ldr(c_rarg1, Address(c_rarg0, 0)); 3159 break; 3160 default: 3161 ShouldNotReachHere(); 3162 } 3163 3164 // return errValue or *adr 3165 *continuation_pc = __ pc(); 3166 __ mov(r0, c_rarg1); 3167 __ ret(lr); 3168 } 3169 3170 /** 3171 * Arguments: 3172 * 3173 * Inputs: 3174 * c_rarg0 - int crc 3175 * c_rarg1 - byte* buf 3176 * c_rarg2 - int length 3177 * 3178 * Ouput: 3179 * rax - int crc result 3180 */ 3181 address generate_updateBytesCRC32() { 3182 assert(UseCRC32Intrinsics, "what are we doing here?"); 3183 3184 __ align(CodeEntryAlignment); 3185 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32"); 3186 3187 address start = __ pc(); 3188 3189 const Register crc = c_rarg0; // crc 3190 const Register buf = c_rarg1; // source java byte array address 3191 const Register len = c_rarg2; // length 3192 const Register table0 = c_rarg3; // crc_table address 3193 const Register table1 = c_rarg4; 3194 const Register table2 = c_rarg5; 3195 const Register table3 = c_rarg6; 3196 const Register tmp3 = c_rarg7; 3197 3198 BLOCK_COMMENT("Entry:"); 3199 __ enter(); // required for proper stackwalking of RuntimeStub frame 3200 3201 __ kernel_crc32(crc, buf, len, 3202 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 3203 3204 __ leave(); // required for proper stackwalking of RuntimeStub frame 3205 __ ret(lr); 3206 3207 return start; 3208 } 3209 3210 /** 3211 * Arguments: 3212 * 3213 * Inputs: 3214 * c_rarg0 - int crc 3215 * c_rarg1 - byte* buf 3216 * c_rarg2 - int length 3217 * c_rarg3 - int* table 3218 * 3219 * Ouput: 3220 * r0 - int crc result 3221 */ 3222 address generate_updateBytesCRC32C() { 3223 assert(UseCRC32CIntrinsics, "what are we doing here?"); 3224 3225 __ align(CodeEntryAlignment); 3226 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C"); 3227 3228 address start = __ pc(); 3229 3230 const Register crc = c_rarg0; // crc 3231 const Register buf = c_rarg1; // source java byte array address 3232 const Register len = c_rarg2; // length 3233 const Register table0 = c_rarg3; // crc_table address 3234 const Register table1 = c_rarg4; 3235 const Register table2 = c_rarg5; 3236 const Register table3 = c_rarg6; 3237 const Register tmp3 = c_rarg7; 3238 3239 BLOCK_COMMENT("Entry:"); 3240 __ enter(); // required for proper stackwalking of RuntimeStub frame 3241 3242 __ kernel_crc32c(crc, buf, len, 3243 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 3244 3245 __ leave(); // required for proper stackwalking of RuntimeStub frame 3246 __ ret(lr); 3247 3248 return start; 3249 } 3250 3251 /*** 3252 * Arguments: 3253 * 3254 * Inputs: 3255 * c_rarg0 - int adler 3256 * c_rarg1 - byte* buff 3257 * c_rarg2 - int len 3258 * 3259 * Output: 3260 * c_rarg0 - int adler result 3261 */ 3262 address generate_updateBytesAdler32() { 3263 __ align(CodeEntryAlignment); 3264 StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32"); 3265 address start = __ pc(); 3266 3267 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 3268 3269 // Aliases 3270 Register adler = c_rarg0; 3271 Register s1 = c_rarg0; 3272 Register s2 = c_rarg3; 3273 Register buff = c_rarg1; 3274 Register len = c_rarg2; 3275 Register nmax = r4; 3276 Register base = r5; 3277 Register count = r6; 3278 Register temp0 = rscratch1; 3279 Register temp1 = rscratch2; 3280 FloatRegister vbytes = v0; 3281 
FloatRegister vs1acc = v1; 3282 FloatRegister vs2acc = v2; 3283 FloatRegister vtable = v3; 3284 3285 // Max number of bytes we can process before having to take the mod 3286 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 3287 uint64_t BASE = 0xfff1; 3288 uint64_t NMAX = 0x15B0; 3289 3290 __ mov(base, BASE); 3291 __ mov(nmax, NMAX); 3292 3293 // Load accumulation coefficients for the upper 16 bits 3294 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table)); 3295 __ ld1(vtable, __ T16B, Address(temp0)); 3296 3297 // s1 is initialized to the lower 16 bits of adler 3298 // s2 is initialized to the upper 16 bits of adler 3299 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 3300 __ uxth(s1, adler); // s1 = (adler & 0xffff) 3301 3302 // The pipelined loop needs at least 16 elements for 1 iteration 3303 // It does check this, but it is more effective to skip to the cleanup loop 3304 __ cmp(len, (u1)16); 3305 __ br(Assembler::HS, L_nmax); 3306 __ cbz(len, L_combine); 3307 3308 __ bind(L_simple_by1_loop); 3309 __ ldrb(temp0, Address(__ post(buff, 1))); 3310 __ add(s1, s1, temp0); 3311 __ add(s2, s2, s1); 3312 __ subs(len, len, 1); 3313 __ br(Assembler::HI, L_simple_by1_loop); 3314 3315 // s1 = s1 % BASE 3316 __ subs(temp0, s1, base); 3317 __ csel(s1, temp0, s1, Assembler::HS); 3318 3319 // s2 = s2 % BASE 3320 __ lsr(temp0, s2, 16); 3321 __ lsl(temp1, temp0, 4); 3322 __ sub(temp1, temp1, temp0); 3323 __ add(s2, temp1, s2, ext::uxth); 3324 3325 __ subs(temp0, s2, base); 3326 __ csel(s2, temp0, s2, Assembler::HS); 3327 3328 __ b(L_combine); 3329 3330 __ bind(L_nmax); 3331 __ subs(len, len, nmax); 3332 __ sub(count, nmax, 16); 3333 __ br(Assembler::LO, L_by16); 3334 3335 __ bind(L_nmax_loop); 3336 3337 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 3338 vbytes, vs1acc, vs2acc, vtable); 3339 3340 __ subs(count, count, 16); 3341 __ br(Assembler::HS, L_nmax_loop); 3342 3343 // s1 = s1 % BASE 3344 __ lsr(temp0, s1, 16); 3345 __ lsl(temp1, temp0, 4); 3346 __ sub(temp1, temp1, temp0); 3347 __ add(temp1, temp1, s1, ext::uxth); 3348 3349 __ lsr(temp0, temp1, 16); 3350 __ lsl(s1, temp0, 4); 3351 __ sub(s1, s1, temp0); 3352 __ add(s1, s1, temp1, ext:: uxth); 3353 3354 __ subs(temp0, s1, base); 3355 __ csel(s1, temp0, s1, Assembler::HS); 3356 3357 // s2 = s2 % BASE 3358 __ lsr(temp0, s2, 16); 3359 __ lsl(temp1, temp0, 4); 3360 __ sub(temp1, temp1, temp0); 3361 __ add(temp1, temp1, s2, ext::uxth); 3362 3363 __ lsr(temp0, temp1, 16); 3364 __ lsl(s2, temp0, 4); 3365 __ sub(s2, s2, temp0); 3366 __ add(s2, s2, temp1, ext:: uxth); 3367 3368 __ subs(temp0, s2, base); 3369 __ csel(s2, temp0, s2, Assembler::HS); 3370 3371 __ subs(len, len, nmax); 3372 __ sub(count, nmax, 16); 3373 __ br(Assembler::HS, L_nmax_loop); 3374 3375 __ bind(L_by16); 3376 __ adds(len, len, count); 3377 __ br(Assembler::LO, L_by1); 3378 3379 __ bind(L_by16_loop); 3380 3381 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 3382 vbytes, vs1acc, vs2acc, vtable); 3383 3384 __ subs(len, len, 16); 3385 __ br(Assembler::HS, L_by16_loop); 3386 3387 __ bind(L_by1); 3388 __ adds(len, len, 15); 3389 __ br(Assembler::LO, L_do_mod); 3390 3391 __ bind(L_by1_loop); 3392 __ ldrb(temp0, Address(__ post(buff, 1))); 3393 __ add(s1, temp0, s1); 3394 __ add(s2, s2, s1); 3395 __ subs(len, len, 1); 3396 __ br(Assembler::HS, L_by1_loop); 3397 3398 __ bind(L_do_mod); 3399 // s1 = s1 % BASE 3400 __ lsr(temp0, s1, 16); 3401 __ lsl(temp1, temp0, 4); 3402 __ sub(temp1, 
temp1, temp0); 3403 __ add(temp1, temp1, s1, ext::uxth); 3404 3405 __ lsr(temp0, temp1, 16); 3406 __ lsl(s1, temp0, 4); 3407 __ sub(s1, s1, temp0); 3408 __ add(s1, s1, temp1, ext:: uxth); 3409 3410 __ subs(temp0, s1, base); 3411 __ csel(s1, temp0, s1, Assembler::HS); 3412 3413 // s2 = s2 % BASE 3414 __ lsr(temp0, s2, 16); 3415 __ lsl(temp1, temp0, 4); 3416 __ sub(temp1, temp1, temp0); 3417 __ add(temp1, temp1, s2, ext::uxth); 3418 3419 __ lsr(temp0, temp1, 16); 3420 __ lsl(s2, temp0, 4); 3421 __ sub(s2, s2, temp0); 3422 __ add(s2, s2, temp1, ext:: uxth); 3423 3424 __ subs(temp0, s2, base); 3425 __ csel(s2, temp0, s2, Assembler::HS); 3426 3427 // Combine lower bits and higher bits 3428 __ bind(L_combine); 3429 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) 3430 3431 __ ret(lr); 3432 3433 return start; 3434 } 3435 3436 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff, 3437 Register temp0, Register temp1, FloatRegister vbytes, 3438 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) { 3439 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes. 3440 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration. 3441 // In non-vectorized code, we update s1 and s2 as: 3442 // s1 <- s1 + b1 3443 // s2 <- s2 + s1 3444 // s1 <- s1 + b2 3445 // s2 <- s2 + b1 3446 // ... 3447 // s1 <- s1 + b16 3448 // s2 <- s2 + s1 3449 // Putting above assignments together, we have: 3450 // s1_new = s1 + b1 + b2 + ... + b16 3451 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16) 3452 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1) 3453 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1) 3454 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16))); 3455 3456 // s2 = s2 + s1 * 16 3457 __ add(s2, s2, s1, Assembler::LSL, 4); 3458 3459 // vs1acc = b1 + b2 + b3 + ... + b16 3460 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... 
+ (b16 * 1) 3461 __ umullv(vs2acc, __ T8B, vtable, vbytes); 3462 __ umlalv(vs2acc, __ T16B, vtable, vbytes); 3463 __ uaddlv(vs1acc, __ T16B, vbytes); 3464 __ uaddlv(vs2acc, __ T8H, vs2acc); 3465 3466 // s1 = s1 + vs1acc, s2 = s2 + vs2acc 3467 __ fmovd(temp0, vs1acc); 3468 __ fmovd(temp1, vs2acc); 3469 __ add(s1, s1, temp0); 3470 __ add(s2, s2, temp1); 3471 } 3472 3473 /** 3474 * Arguments: 3475 * 3476 * Input: 3477 * c_rarg0 - x address 3478 * c_rarg1 - x length 3479 * c_rarg2 - y address 3480 * c_rarg3 - y lenth 3481 * c_rarg4 - z address 3482 * c_rarg5 - z length 3483 */ 3484 address generate_multiplyToLen() { 3485 __ align(CodeEntryAlignment); 3486 StubCodeMark mark(this, "StubRoutines", "multiplyToLen"); 3487 3488 address start = __ pc(); 3489 const Register x = r0; 3490 const Register xlen = r1; 3491 const Register y = r2; 3492 const Register ylen = r3; 3493 const Register z = r4; 3494 const Register zlen = r5; 3495 3496 const Register tmp1 = r10; 3497 const Register tmp2 = r11; 3498 const Register tmp3 = r12; 3499 const Register tmp4 = r13; 3500 const Register tmp5 = r14; 3501 const Register tmp6 = r15; 3502 const Register tmp7 = r16; 3503 3504 BLOCK_COMMENT("Entry:"); 3505 __ enter(); // required for proper stackwalking of RuntimeStub frame 3506 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 3507 __ leave(); // required for proper stackwalking of RuntimeStub frame 3508 __ ret(lr); 3509 3510 return start; 3511 } 3512 3513 address generate_squareToLen() { 3514 // squareToLen algorithm for sizes 1..127 described in java code works 3515 // faster than multiply_to_len on some CPUs and slower on others, but 3516 // multiply_to_len shows a bit better overall results 3517 __ align(CodeEntryAlignment); 3518 StubCodeMark mark(this, "StubRoutines", "squareToLen"); 3519 address start = __ pc(); 3520 3521 const Register x = r0; 3522 const Register xlen = r1; 3523 const Register z = r2; 3524 const Register zlen = r3; 3525 const Register y = r4; // == x 3526 const Register ylen = r5; // == xlen 3527 3528 const Register tmp1 = r10; 3529 const Register tmp2 = r11; 3530 const Register tmp3 = r12; 3531 const Register tmp4 = r13; 3532 const Register tmp5 = r14; 3533 const Register tmp6 = r15; 3534 const Register tmp7 = r16; 3535 3536 RegSet spilled_regs = RegSet::of(y, ylen); 3537 BLOCK_COMMENT("Entry:"); 3538 __ enter(); 3539 __ push(spilled_regs, sp); 3540 __ mov(y, x); 3541 __ mov(ylen, xlen); 3542 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 3543 __ pop(spilled_regs, sp); 3544 __ leave(); 3545 __ ret(lr); 3546 return start; 3547 } 3548 3549 address generate_mulAdd() { 3550 __ align(CodeEntryAlignment); 3551 StubCodeMark mark(this, "StubRoutines", "mulAdd"); 3552 3553 address start = __ pc(); 3554 3555 const Register out = r0; 3556 const Register in = r1; 3557 const Register offset = r2; 3558 const Register len = r3; 3559 const Register k = r4; 3560 3561 BLOCK_COMMENT("Entry:"); 3562 __ enter(); 3563 __ mul_add(out, in, offset, len, k); 3564 __ leave(); 3565 __ ret(lr); 3566 3567 return start; 3568 } 3569 3570 void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi, 3571 FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0, 3572 FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) { 3573 // Karatsuba multiplication performs a 128*128 -> 256-bit 3574 // multiplication in three 128-bit multiplications and a few 3575 // additions. 
3576 // 3577 // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1) 3578 // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0 3579 // 3580 // Inputs: 3581 // 3582 // A0 in a.d[0] (subkey) 3583 // A1 in a.d[1] 3584 // (A1+A0) in a1_xor_a0.d[0] 3585 // 3586 // B0 in b.d[0] (state) 3587 // B1 in b.d[1] 3588 3589 __ ext(tmp1, __ T16B, b, b, 0x08); 3590 __ pmull2(result_hi, __ T1Q, b, a, __ T2D); // A1*B1 3591 __ eor(tmp1, __ T16B, tmp1, b); // (B1+B0) 3592 __ pmull(result_lo, __ T1Q, b, a, __ T1D); // A0*B0 3593 __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0) 3594 3595 __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08); 3596 __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0 3597 __ eor(tmp2, __ T16B, tmp2, tmp4); 3598 __ eor(tmp2, __ T16B, tmp2, tmp3); 3599 3600 // Register pair <result_hi:result_lo> holds the result of carry-less multiplication 3601 __ ins(result_hi, __ D, tmp2, 0, 1); 3602 __ ins(result_lo, __ D, tmp2, 1, 0); 3603 } 3604 3605 void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi, 3606 FloatRegister p, FloatRegister z, FloatRegister t1) { 3607 const FloatRegister t0 = result; 3608 3609 // The GCM field polynomial f is z^128 + p(z), where p = 3610 // z^7+z^2+z+1. 3611 // 3612 // z^128 === -p(z) (mod (z^128 + p(z))) 3613 // 3614 // so, given that the product we're reducing is 3615 // a == lo + hi * z^128 3616 // substituting, 3617 // === lo - hi * p(z) (mod (z^128 + p(z))) 3618 // 3619 // we reduce by multiplying hi by p(z) and subtracting the result 3620 // from (i.e. XORing it with) lo. Because p has no nonzero high 3621 // bits we can do this with two 64-bit multiplications, lo*p and 3622 // hi*p. 3623 3624 __ pmull2(t0, __ T1Q, hi, p, __ T2D); 3625 __ ext(t1, __ T16B, t0, z, 8); 3626 __ eor(hi, __ T16B, hi, t1); 3627 __ ext(t1, __ T16B, z, t0, 8); 3628 __ eor(lo, __ T16B, lo, t1); 3629 __ pmull(t0, __ T1Q, hi, p, __ T1D); 3630 __ eor(result, __ T16B, lo, t0); 3631 } 3632 3633 address generate_has_negatives(address &has_negatives_long) { 3634 const u1 large_loop_size = 64; 3635 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 3636 int dcache_line = VM_Version::dcache_line_size(); 3637 3638 Register ary1 = r1, len = r2, result = r0; 3639 3640 __ align(CodeEntryAlignment); 3641 3642 StubCodeMark mark(this, "StubRoutines", "has_negatives"); 3643 3644 address entry = __ pc(); 3645 3646 __ enter(); 3647 3648 Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE, 3649 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 3650 3651 __ cmp(len, (u1)15); 3652 __ br(Assembler::GT, LEN_OVER_15); 3653 // The only case when execution falls into this code is when pointer is near 3654 // the end of memory page and we have to avoid reading next page 3655 __ add(ary1, ary1, len); 3656 __ subs(len, len, 8); 3657 __ br(Assembler::GT, LEN_OVER_8); 3658 __ ldr(rscratch2, Address(ary1, -8)); 3659 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 
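// At this point len holds (original length - 8) and is <= 0, so rscratch1 = -len * 8 is the
// number of bits in the word just loaded that come from memory *before* the array (the load
// above was deliberately anchored at the array's end so it cannot run onto the next page).
// The shift below drops those low-order bytes; the zero bytes shifted in can never set an
// upper bit, so only the array's own bytes take part in the UPPER_BIT_MASK test.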
3660 __ lsrv(rscratch2, rscratch2, rscratch1); 3661 __ tst(rscratch2, UPPER_BIT_MASK); 3662 __ cset(result, Assembler::NE); 3663 __ leave(); 3664 __ ret(lr); 3665 __ bind(LEN_OVER_8); 3666 __ ldp(rscratch1, rscratch2, Address(ary1, -16)); 3667 __ sub(len, len, 8); // no data dep., then sub can be executed while loading 3668 __ tst(rscratch2, UPPER_BIT_MASK); 3669 __ br(Assembler::NE, RET_TRUE_NO_POP); 3670 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes 3671 __ lsrv(rscratch1, rscratch1, rscratch2); 3672 __ tst(rscratch1, UPPER_BIT_MASK); 3673 __ cset(result, Assembler::NE); 3674 __ leave(); 3675 __ ret(lr); 3676 3677 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10; 3678 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6; 3679 3680 has_negatives_long = __ pc(); // 2nd entry point 3681 3682 __ enter(); 3683 3684 __ bind(LEN_OVER_15); 3685 __ push(spilled_regs, sp); 3686 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment 3687 __ cbz(rscratch2, ALIGNED); 3688 __ ldp(tmp6, tmp1, Address(ary1)); 3689 __ mov(tmp5, 16); 3690 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address 3691 __ add(ary1, ary1, rscratch1); 3692 __ sub(len, len, rscratch1); 3693 __ orr(tmp6, tmp6, tmp1); 3694 __ tst(tmp6, UPPER_BIT_MASK); 3695 __ br(Assembler::NE, RET_TRUE); 3696 3697 __ bind(ALIGNED); 3698 __ cmp(len, large_loop_size); 3699 __ br(Assembler::LT, CHECK_16); 3700 // Perform 16-byte load as early return in pre-loop to handle situation 3701 // when initially aligned large array has negative values at starting bytes, 3702 // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is 3703 // slower. Cases with negative bytes further ahead won't be affected that 3704 // much. In fact, it'll be faster due to early loads, less instructions and 3705 // less branches in LARGE_LOOP. 3706 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16))); 3707 __ sub(len, len, 16); 3708 __ orr(tmp6, tmp6, tmp1); 3709 __ tst(tmp6, UPPER_BIT_MASK); 3710 __ br(Assembler::NE, RET_TRUE); 3711 __ cmp(len, large_loop_size); 3712 __ br(Assembler::LT, CHECK_16); 3713 3714 if (SoftwarePrefetchHintDistance >= 0 3715 && SoftwarePrefetchHintDistance >= dcache_line) { 3716 // initial prefetch 3717 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line)); 3718 } 3719 __ bind(LARGE_LOOP); 3720 if (SoftwarePrefetchHintDistance >= 0) { 3721 __ prfm(Address(ary1, SoftwarePrefetchHintDistance)); 3722 } 3723 // Issue load instructions first, since it can save few CPU/MEM cycles, also 3724 // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp) 3725 // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3 3726 // instructions per cycle and have less branches, but this approach disables 3727 // early return, thus, all 64 bytes are loaded and checked every time. 
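// Each iteration below therefore consumes large_loop_size (64) bytes: four ldp loads fill
// eight registers, which are then OR-ed together pairwise so that a single tst against
// UPPER_BIT_MASK decides whether any of the 64 bytes had its sign bit set.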
3728 __ ldp(tmp2, tmp3, Address(ary1)); 3729 __ ldp(tmp4, tmp5, Address(ary1, 16)); 3730 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 3731 __ ldp(tmp6, tmp1, Address(ary1, 48)); 3732 __ add(ary1, ary1, large_loop_size); 3733 __ sub(len, len, large_loop_size); 3734 __ orr(tmp2, tmp2, tmp3); 3735 __ orr(tmp4, tmp4, tmp5); 3736 __ orr(rscratch1, rscratch1, rscratch2); 3737 __ orr(tmp6, tmp6, tmp1); 3738 __ orr(tmp2, tmp2, tmp4); 3739 __ orr(rscratch1, rscratch1, tmp6); 3740 __ orr(tmp2, tmp2, rscratch1); 3741 __ tst(tmp2, UPPER_BIT_MASK); 3742 __ br(Assembler::NE, RET_TRUE); 3743 __ cmp(len, large_loop_size); 3744 __ br(Assembler::GE, LARGE_LOOP); 3745 3746 __ bind(CHECK_16); // small 16-byte load pre-loop 3747 __ cmp(len, (u1)16); 3748 __ br(Assembler::LT, POST_LOOP16); 3749 3750 __ bind(LOOP16); // small 16-byte load loop 3751 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 3752 __ sub(len, len, 16); 3753 __ orr(tmp2, tmp2, tmp3); 3754 __ tst(tmp2, UPPER_BIT_MASK); 3755 __ br(Assembler::NE, RET_TRUE); 3756 __ cmp(len, (u1)16); 3757 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 3758 3759 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 3760 __ cmp(len, (u1)8); 3761 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 3762 __ ldr(tmp3, Address(__ post(ary1, 8))); 3763 __ sub(len, len, 8); 3764 __ tst(tmp3, UPPER_BIT_MASK); 3765 __ br(Assembler::NE, RET_TRUE); 3766 3767 __ bind(POST_LOOP16_LOAD_TAIL); 3768 __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0 3769 __ ldr(tmp1, Address(ary1)); 3770 __ mov(tmp2, 64); 3771 __ sub(tmp4, tmp2, len, __ LSL, 3); 3772 __ lslv(tmp1, tmp1, tmp4); 3773 __ tst(tmp1, UPPER_BIT_MASK); 3774 __ br(Assembler::NE, RET_TRUE); 3775 // Fallthrough 3776 3777 __ bind(RET_FALSE); 3778 __ pop(spilled_regs, sp); 3779 __ leave(); 3780 __ mov(result, zr); 3781 __ ret(lr); 3782 3783 __ bind(RET_TRUE); 3784 __ pop(spilled_regs, sp); 3785 __ bind(RET_TRUE_NO_POP); 3786 __ leave(); 3787 __ mov(result, 1); 3788 __ ret(lr); 3789 3790 __ bind(DONE); 3791 __ pop(spilled_regs, sp); 3792 __ leave(); 3793 __ ret(lr); 3794 return entry; 3795 } 3796 3797 void generate_large_array_equals_loop_nonsimd(int loopThreshold, 3798 bool usePrefetch, Label &NOT_EQUAL) { 3799 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 3800 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 3801 tmp7 = r12, tmp8 = r13; 3802 Label LOOP; 3803 3804 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3805 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3806 __ bind(LOOP); 3807 if (usePrefetch) { 3808 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 3809 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 3810 } 3811 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 3812 __ eor(tmp1, tmp1, tmp2); 3813 __ eor(tmp3, tmp3, tmp4); 3814 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 3815 __ orr(tmp1, tmp1, tmp3); 3816 __ cbnz(tmp1, NOT_EQUAL); 3817 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3818 __ eor(tmp5, tmp5, tmp6); 3819 __ eor(tmp7, tmp7, tmp8); 3820 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3821 __ orr(tmp5, tmp5, tmp7); 3822 __ cbnz(tmp5, NOT_EQUAL); 3823 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 3824 __ eor(tmp1, tmp1, tmp2); 3825 __ eor(tmp3, tmp3, tmp4); 3826 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 3827 __ orr(tmp1, tmp1, tmp3); 3828 __ cbnz(tmp1, NOT_EQUAL); 3829 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3830 __ eor(tmp5, tmp5, tmp6); 
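// cnt1 counts remaining bytes; one pass of this unrolled body compares 8 * wordSize (64)
// bytes from each array, so the counter is adjusted once per pass, interleaved with the
// loads to help hide their latency.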
3831 __ sub(cnt1, cnt1, 8 * wordSize); 3832 __ eor(tmp7, tmp7, tmp8); 3833 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3834 // tmp6 is not used. MacroAssembler::subs is used here (rather than 3835 // cmp) because subs allows an unlimited range of immediate operand. 3836 __ subs(tmp6, cnt1, loopThreshold); 3837 __ orr(tmp5, tmp5, tmp7); 3838 __ cbnz(tmp5, NOT_EQUAL); 3839 __ br(__ GE, LOOP); 3840 // post-loop 3841 __ eor(tmp1, tmp1, tmp2); 3842 __ eor(tmp3, tmp3, tmp4); 3843 __ orr(tmp1, tmp1, tmp3); 3844 __ sub(cnt1, cnt1, 2 * wordSize); 3845 __ cbnz(tmp1, NOT_EQUAL); 3846 } 3847 3848 void generate_large_array_equals_loop_simd(int loopThreshold, 3849 bool usePrefetch, Label &NOT_EQUAL) { 3850 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 3851 tmp2 = rscratch2; 3852 Label LOOP; 3853 3854 __ bind(LOOP); 3855 if (usePrefetch) { 3856 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 3857 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 3858 } 3859 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize))); 3860 __ sub(cnt1, cnt1, 8 * wordSize); 3861 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize))); 3862 __ subs(tmp1, cnt1, loopThreshold); 3863 __ eor(v0, __ T16B, v0, v4); 3864 __ eor(v1, __ T16B, v1, v5); 3865 __ eor(v2, __ T16B, v2, v6); 3866 __ eor(v3, __ T16B, v3, v7); 3867 __ orr(v0, __ T16B, v0, v1); 3868 __ orr(v1, __ T16B, v2, v3); 3869 __ orr(v0, __ T16B, v0, v1); 3870 __ umov(tmp1, v0, __ D, 0); 3871 __ umov(tmp2, v0, __ D, 1); 3872 __ orr(tmp1, tmp1, tmp2); 3873 __ cbnz(tmp1, NOT_EQUAL); 3874 __ br(__ GE, LOOP); 3875 } 3876 3877 // a1 = r1 - array1 address 3878 // a2 = r2 - array2 address 3879 // result = r0 - return value. Already contains "false" 3880 // cnt1 = r10 - amount of elements left to check, reduced by wordSize 3881 // r3-r5 are reserved temporary registers 3882 address generate_large_array_equals() { 3883 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 3884 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 3885 tmp7 = r12, tmp8 = r13; 3886 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP, 3887 SMALL_LOOP, POST_LOOP; 3888 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16; 3889 // calculate if at least 32 prefetched bytes are used 3890 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32; 3891 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE); 3892 RegSet spilled_regs = RegSet::range(tmp6, tmp8); 3893 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4, 3894 tmp5, tmp6, tmp7, tmp8); 3895 3896 __ align(CodeEntryAlignment); 3897 3898 StubCodeMark mark(this, "StubRoutines", "large_array_equals"); 3899 3900 address entry = __ pc(); 3901 __ enter(); 3902 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub 3903 // also advance pointers to use post-increment instead of pre-increment 3904 __ add(a1, a1, wordSize); 3905 __ add(a2, a2, wordSize); 3906 if (AvoidUnalignedAccesses) { 3907 // both implementations (SIMD/nonSIMD) are using relatively large load 3908 // instructions (ld1/ldp), which has huge penalty (up to x2 exec time) 3909 // on some CPUs in case of address is not at least 16-byte aligned. 3910 // Arrays are 8-byte aligned currently, so, we can make additional 8-byte 3911 // load if needed at least for 1st address and make if 16-byte aligned. 
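// Concretely: if bit 3 of a1 is set, the data starts on an odd 8-byte boundary, so one
// extra word is compared here and both pointers are advanced; after that a1 (though not
// necessarily a2) is 16-byte aligned for the main loops.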
3912 Label ALIGNED16; 3913 __ tbz(a1, 3, ALIGNED16); 3914 __ ldr(tmp1, Address(__ post(a1, wordSize))); 3915 __ ldr(tmp2, Address(__ post(a2, wordSize))); 3916 __ sub(cnt1, cnt1, wordSize); 3917 __ eor(tmp1, tmp1, tmp2); 3918 __ cbnz(tmp1, NOT_EQUAL_NO_POP); 3919 __ bind(ALIGNED16); 3920 } 3921 if (UseSIMDForArrayEquals) { 3922 if (SoftwarePrefetchHintDistance >= 0) { 3923 __ subs(tmp1, cnt1, prefetchLoopThreshold); 3924 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 3925 generate_large_array_equals_loop_simd(prefetchLoopThreshold, 3926 /* prfm = */ true, NOT_EQUAL); 3927 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 3928 __ br(__ LT, TAIL); 3929 } 3930 __ bind(NO_PREFETCH_LARGE_LOOP); 3931 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold, 3932 /* prfm = */ false, NOT_EQUAL); 3933 } else { 3934 __ push(spilled_regs, sp); 3935 if (SoftwarePrefetchHintDistance >= 0) { 3936 __ subs(tmp1, cnt1, prefetchLoopThreshold); 3937 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 3938 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold, 3939 /* prfm = */ true, NOT_EQUAL); 3940 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 3941 __ br(__ LT, TAIL); 3942 } 3943 __ bind(NO_PREFETCH_LARGE_LOOP); 3944 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold, 3945 /* prfm = */ false, NOT_EQUAL); 3946 } 3947 __ bind(TAIL); 3948 __ cbz(cnt1, EQUAL); 3949 __ subs(cnt1, cnt1, wordSize); 3950 __ br(__ LE, POST_LOOP); 3951 __ bind(SMALL_LOOP); 3952 __ ldr(tmp1, Address(__ post(a1, wordSize))); 3953 __ ldr(tmp2, Address(__ post(a2, wordSize))); 3954 __ subs(cnt1, cnt1, wordSize); 3955 __ eor(tmp1, tmp1, tmp2); 3956 __ cbnz(tmp1, NOT_EQUAL); 3957 __ br(__ GT, SMALL_LOOP); 3958 __ bind(POST_LOOP); 3959 __ ldr(tmp1, Address(a1, cnt1)); 3960 __ ldr(tmp2, Address(a2, cnt1)); 3961 __ eor(tmp1, tmp1, tmp2); 3962 __ cbnz(tmp1, NOT_EQUAL); 3963 __ bind(EQUAL); 3964 __ mov(result, true); 3965 __ bind(NOT_EQUAL); 3966 if (!UseSIMDForArrayEquals) { 3967 __ pop(spilled_regs, sp); 3968 } 3969 __ bind(NOT_EQUAL_NO_POP); 3970 __ leave(); 3971 __ ret(lr); 3972 return entry; 3973 } 3974 3975 address generate_dsin_dcos(bool isCos) { 3976 __ align(CodeEntryAlignment); 3977 StubCodeMark mark(this, "StubRoutines", isCos ? 
"libmDcos" : "libmDsin"); 3978 address start = __ pc(); 3979 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw, 3980 (address)StubRoutines::aarch64::_two_over_pi, 3981 (address)StubRoutines::aarch64::_pio2, 3982 (address)StubRoutines::aarch64::_dsin_coef, 3983 (address)StubRoutines::aarch64::_dcos_coef); 3984 return start; 3985 } 3986 3987 address generate_dlog() { 3988 __ align(CodeEntryAlignment); 3989 StubCodeMark mark(this, "StubRoutines", "dlog"); 3990 address entry = __ pc(); 3991 FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4, 3992 vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19; 3993 Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4; 3994 __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3, 3995 tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5); 3996 return entry; 3997 } 3998 3999 // code for comparing 16 bytes of strings with same encoding 4000 void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) { 4001 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11; 4002 __ ldr(rscratch1, Address(__ post(str1, 8))); 4003 __ eor(rscratch2, tmp1, tmp2); 4004 __ ldr(cnt1, Address(__ post(str2, 8))); 4005 __ cbnz(rscratch2, DIFF1); 4006 __ ldr(tmp1, Address(__ post(str1, 8))); 4007 __ eor(rscratch2, rscratch1, cnt1); 4008 __ ldr(tmp2, Address(__ post(str2, 8))); 4009 __ cbnz(rscratch2, DIFF2); 4010 } 4011 4012 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding 4013 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 4014 Label &DIFF2) { 4015 Register cnt1 = r2, tmp2 = r11, tmp3 = r12; 4016 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; 4017 4018 __ ldrq(vtmp, Address(__ post(tmp2, 16))); 4019 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4020 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 4021 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 4022 4023 __ fmovd(tmpL, vtmp3); 4024 __ eor(rscratch2, tmp3, tmpL); 4025 __ cbnz(rscratch2, DIFF2); 4026 4027 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4028 __ umov(tmpL, vtmp3, __ D, 1); 4029 __ eor(rscratch2, tmpU, tmpL); 4030 __ cbnz(rscratch2, DIFF1); 4031 4032 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 4033 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4034 __ fmovd(tmpL, vtmp); 4035 __ eor(rscratch2, tmp3, tmpL); 4036 __ cbnz(rscratch2, DIFF2); 4037 4038 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4039 __ umov(tmpL, vtmp, __ D, 1); 4040 __ eor(rscratch2, tmpU, tmpL); 4041 __ cbnz(rscratch2, DIFF1); 4042 } 4043 4044 // r0 = result 4045 // r1 = str1 4046 // r2 = cnt1 4047 // r3 = str2 4048 // r4 = cnt2 4049 // r10 = tmp1 4050 // r11 = tmp2 4051 address generate_compare_long_string_different_encoding(bool isLU) { 4052 __ align(CodeEntryAlignment); 4053 StubCodeMark mark(this, "StubRoutines", isLU 4054 ? 
"compare_long_string_different_encoding LU" 4055 : "compare_long_string_different_encoding UL"); 4056 address entry = __ pc(); 4057 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, 4058 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH, 4059 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; 4060 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 4061 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; 4062 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; 4063 RegSet spilled_regs = RegSet::of(tmp3, tmp4); 4064 4065 int prefetchLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance/2); 4066 4067 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); 4068 // cnt2 == amount of characters left to compare 4069 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL)) 4070 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 4071 __ add(str1, str1, isLU ? wordSize/2 : wordSize); 4072 __ add(str2, str2, isLU ? wordSize : wordSize/2); 4073 __ fmovd(isLU ? tmp1 : tmp2, vtmp); 4074 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case. 4075 __ eor(rscratch2, tmp1, tmp2); 4076 __ mov(rscratch1, tmp2); 4077 __ cbnz(rscratch2, CALCULATE_DIFFERENCE); 4078 Register tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison 4079 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison 4080 __ push(spilled_regs, sp); 4081 __ mov(tmp2, isLU ? str1 : str2); // init the pointer to L next load 4082 __ mov(cnt1, isLU ? str2 : str1); // init the pointer to U next load 4083 4084 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4085 4086 if (SoftwarePrefetchHintDistance >= 0) { 4087 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 4088 __ br(__ LT, NO_PREFETCH); 4089 __ bind(LARGE_LOOP_PREFETCH); 4090 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance)); 4091 __ mov(tmp4, 2); 4092 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 4093 __ bind(LARGE_LOOP_PREFETCH_REPEAT1); 4094 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4095 __ subs(tmp4, tmp4, 1); 4096 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1); 4097 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 4098 __ mov(tmp4, 2); 4099 __ bind(LARGE_LOOP_PREFETCH_REPEAT2); 4100 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4101 __ subs(tmp4, tmp4, 1); 4102 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2); 4103 __ sub(cnt2, cnt2, 64); 4104 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 4105 __ br(__ GE, LARGE_LOOP_PREFETCH); 4106 } 4107 __ cbz(cnt2, LOAD_LAST); // no characters left except last load 4108 __ bind(NO_PREFETCH); 4109 __ subs(cnt2, cnt2, 16); 4110 __ br(__ LT, TAIL); 4111 __ align(OptoLoopAlignment); 4112 __ bind(SMALL_LOOP); // smaller loop 4113 __ subs(cnt2, cnt2, 16); 4114 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4115 __ br(__ GE, SMALL_LOOP); 4116 __ cmn(cnt2, (u1)16); 4117 __ br(__ EQ, LOAD_LAST); 4118 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters) 4119 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 32 bytes before last 4 characters in UTF-16 string 4120 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string 4121 __ ldr(tmp3, Address(cnt1, -8)); 4122 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load 4123 __ b(LOAD_LAST); 4124 __ bind(DIFF2); 4125 __ mov(tmpU, tmp3); 4126 __ bind(DIFF1); 4127 __ pop(spilled_regs, sp); 4128 __ b(CALCULATE_DIFFERENCE); 4129 __ bind(LOAD_LAST); 4130 // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU. 
4131 // No need to load it again 4132 __ mov(tmpU, tmp3); 4133 __ pop(spilled_regs, sp); 4134 4135 // tmp2 points to the address of the last 4 Latin1 characters right now 4136 __ ldrs(vtmp, Address(tmp2)); 4137 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 4138 __ fmovd(tmpL, vtmp); 4139 4140 __ eor(rscratch2, tmpU, tmpL); 4141 __ cbz(rscratch2, DONE); 4142 4143 // Find the first different characters in the longwords and 4144 // compute their difference. 4145 __ bind(CALCULATE_DIFFERENCE); 4146 __ rev(rscratch2, rscratch2); 4147 __ clz(rscratch2, rscratch2); 4148 __ andr(rscratch2, rscratch2, -16); 4149 __ lsrv(tmp1, tmp1, rscratch2); 4150 __ uxthw(tmp1, tmp1); 4151 __ lsrv(rscratch1, rscratch1, rscratch2); 4152 __ uxthw(rscratch1, rscratch1); 4153 __ subw(result, tmp1, rscratch1); 4154 __ bind(DONE); 4155 __ ret(lr); 4156 return entry; 4157 } 4158 4159 address generate_method_entry_barrier() { 4160 __ align(CodeEntryAlignment); 4161 StubCodeMark mark(this, "StubRoutines", "nmethod_entry_barrier"); 4162 4163 Label deoptimize_label; 4164 4165 address start = __ pc(); 4166 4167 __ set_last_Java_frame(sp, rfp, lr, rscratch1); 4168 4169 __ enter(); 4170 __ add(rscratch2, sp, wordSize); // rscratch2 points to the saved lr 4171 4172 __ sub(sp, sp, 4 * wordSize); // four words for the returned {sp, fp, lr, pc} 4173 4174 __ push_call_clobbered_registers(); 4175 4176 __ mov(c_rarg0, rscratch2); 4177 __ call_VM_leaf 4178 (CAST_FROM_FN_PTR 4179 (address, BarrierSetNMethod::nmethod_stub_entry_barrier), 1); 4180 4181 __ reset_last_Java_frame(true); 4182 4183 __ mov(rscratch1, r0); 4184 4185 __ pop_call_clobbered_registers(); 4186 4187 __ cbnz(rscratch1, deoptimize_label); 4188 4189 __ leave(); 4190 __ ret(lr); 4191 4192 __ BIND(deoptimize_label); 4193 4194 __ ldp(/* new sp */ rscratch1, rfp, Address(sp, 0 * wordSize)); 4195 __ ldp(lr, /* new pc*/ rscratch2, Address(sp, 2 * wordSize)); 4196 4197 __ mov(sp, rscratch1); 4198 __ br(rscratch2); 4199 4200 return start; 4201 } 4202 4203 // r0 = result 4204 // r1 = str1 4205 // r2 = cnt1 4206 // r3 = str2 4207 // r4 = cnt2 4208 // r10 = tmp1 4209 // r11 = tmp2 4210 address generate_compare_long_string_same_encoding(bool isLL) { 4211 __ align(CodeEntryAlignment); 4212 StubCodeMark mark(this, "StubRoutines", isLL 4213 ? "compare_long_string_same_encoding LL" 4214 : "compare_long_string_same_encoding UU"); 4215 address entry = __ pc(); 4216 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 4217 tmp1 = r10, tmp2 = r11; 4218 Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL, 4219 LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF, 4220 DIFF_LAST_POSITION, DIFF_LAST_POSITION2; 4221 // exit from large loop when less than 64 bytes left to read or we're about 4222 // to prefetch memory behind array border 4223 int largeLoopExitCondition = MAX2(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); 4224 // cnt1/cnt2 contains amount of characters to compare. cnt1 can be re-used 4225 // update cnt2 counter with already loaded 8 bytes 4226 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2)); 4227 // update pointers, because of previous read 4228 __ add(str1, str1, wordSize); 4229 __ add(str2, str2, wordSize); 4230 if (SoftwarePrefetchHintDistance >= 0) { 4231 __ bind(LARGE_LOOP_PREFETCH); 4232 __ prfm(Address(str1, SoftwarePrefetchHintDistance)); 4233 __ prfm(Address(str2, SoftwarePrefetchHintDistance)); 4234 compare_string_16_bytes_same(DIFF, DIFF2); 4235 compare_string_16_bytes_same(DIFF, DIFF2); 4236 __ sub(cnt2, cnt2, isLL ? 
64 : 32); 4237 compare_string_16_bytes_same(DIFF, DIFF2); 4238 __ subs(rscratch2, cnt2, largeLoopExitCondition); 4239 compare_string_16_bytes_same(DIFF, DIFF2); 4240 __ br(__ GT, LARGE_LOOP_PREFETCH); 4241 __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left? 4242 } 4243 // less than 16 bytes left? 4244 __ subs(cnt2, cnt2, isLL ? 16 : 8); 4245 __ br(__ LT, TAIL); 4246 __ align(OptoLoopAlignment); 4247 __ bind(SMALL_LOOP); 4248 compare_string_16_bytes_same(DIFF, DIFF2); 4249 __ subs(cnt2, cnt2, isLL ? 16 : 8); 4250 __ br(__ GE, SMALL_LOOP); 4251 __ bind(TAIL); 4252 __ adds(cnt2, cnt2, isLL ? 16 : 8); 4253 __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF); 4254 __ subs(cnt2, cnt2, isLL ? 8 : 4); 4255 __ br(__ LE, CHECK_LAST); 4256 __ eor(rscratch2, tmp1, tmp2); 4257 __ cbnz(rscratch2, DIFF); 4258 __ ldr(tmp1, Address(__ post(str1, 8))); 4259 __ ldr(tmp2, Address(__ post(str2, 8))); 4260 __ sub(cnt2, cnt2, isLL ? 8 : 4); 4261 __ bind(CHECK_LAST); 4262 if (!isLL) { 4263 __ add(cnt2, cnt2, cnt2); // now in bytes 4264 } 4265 __ eor(rscratch2, tmp1, tmp2); 4266 __ cbnz(rscratch2, DIFF); 4267 __ ldr(rscratch1, Address(str1, cnt2)); 4268 __ ldr(cnt1, Address(str2, cnt2)); 4269 __ eor(rscratch2, rscratch1, cnt1); 4270 __ cbz(rscratch2, LENGTH_DIFF); 4271 // Find the first different characters in the longwords and 4272 // compute their difference. 4273 __ bind(DIFF2); 4274 __ rev(rscratch2, rscratch2); 4275 __ clz(rscratch2, rscratch2); 4276 __ andr(rscratch2, rscratch2, isLL ? -8 : -16); 4277 __ lsrv(rscratch1, rscratch1, rscratch2); 4278 if (isLL) { 4279 __ lsrv(cnt1, cnt1, rscratch2); 4280 __ uxtbw(rscratch1, rscratch1); 4281 __ uxtbw(cnt1, cnt1); 4282 } else { 4283 __ lsrv(cnt1, cnt1, rscratch2); 4284 __ uxthw(rscratch1, rscratch1); 4285 __ uxthw(cnt1, cnt1); 4286 } 4287 __ subw(result, rscratch1, cnt1); 4288 __ b(LENGTH_DIFF); 4289 __ bind(DIFF); 4290 __ rev(rscratch2, rscratch2); 4291 __ clz(rscratch2, rscratch2); 4292 __ andr(rscratch2, rscratch2, isLL ? 
-8 : -16);
4293 __ lsrv(tmp1, tmp1, rscratch2);
4294 if (isLL) {
4295 __ lsrv(tmp2, tmp2, rscratch2);
4296 __ uxtbw(tmp1, tmp1);
4297 __ uxtbw(tmp2, tmp2);
4298 } else {
4299 __ lsrv(tmp2, tmp2, rscratch2);
4300 __ uxthw(tmp1, tmp1);
4301 __ uxthw(tmp2, tmp2);
4302 }
4303 __ subw(result, tmp1, tmp2);
4304 __ b(LENGTH_DIFF);
4305 __ bind(LAST_CHECK_AND_LENGTH_DIFF);
4306 __ eor(rscratch2, tmp1, tmp2);
4307 __ cbnz(rscratch2, DIFF);
4308 __ bind(LENGTH_DIFF);
4309 __ ret(lr);
4310 return entry;
4311 }
4312
4313 void generate_compare_long_strings() {
4314 StubRoutines::aarch64::_compare_long_string_LL
4315 = generate_compare_long_string_same_encoding(true);
4316 StubRoutines::aarch64::_compare_long_string_UU
4317 = generate_compare_long_string_same_encoding(false);
4318 StubRoutines::aarch64::_compare_long_string_LU
4319 = generate_compare_long_string_different_encoding(true);
4320 StubRoutines::aarch64::_compare_long_string_UL
4321 = generate_compare_long_string_different_encoding(false);
4322 }
4323
4324 // R0 = result
4325 // R1 = str2
4326 // R2 = cnt1
4327 // R3 = str1
4328 // R4 = cnt2
4329 // This generic linear code uses a few additional ideas that make it faster:
4330 // 1) we can safely keep at least the 1st register of the pattern (since length >= 8)
4331 // in order to skip the initial load (helps on systems with a single load pipeline)
4332 // 2) we can use the "fast" single-character search algorithm to look for the
4333 // first symbol with fewer branches (one branch per loaded register instead of
4334 // one branch per symbol); this is where constants like
4335 // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
4336 // 3) after loading and analyzing the 1st register of the source string, it can be
4337 // reused to search for every occurrence of the 1st character, saving a few loads
4338 // compared with a simpler-but-slower implementation
4339 // 4) in order to avoid lots of push/pop operations, the code below heavily
4340 // re-uses/re-initializes/compresses register values, which makes the code
4341 // larger and a bit less readable; however, most of the extra operations are
4342 // issued during loads or branches, so the penalty is minimal
4343 address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
4344 const char* stubName = str1_isL
4345 ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
4346 : "indexof_linear_uu";
4347 __ align(CodeEntryAlignment);
4348 StubCodeMark mark(this, "StubRoutines", stubName);
4349 address entry = __ pc();
4350
4351 int str1_chr_size = str1_isL ? 1 : 2;
4352 int str2_chr_size = str2_isL ? 1 : 2;
4353 int str1_chr_shift = str1_isL ? 0 : 1;
4354 int str2_chr_shift = str2_isL ? 0 : 1;
4355 bool isL = str1_isL && str2_isL;
4356 // parameters
4357 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
4358 // temporary registers
4359 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
4360 RegSet spilled_regs = RegSet::range(tmp1, tmp4);
4361 // redefinitions
4362 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
4363
4364 __ push(spilled_regs, sp);
4365 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
4366 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
4367 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
4368 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
4369 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
4370 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
4371 // Read whole register from str1.
It is safe, because length >=8 here 4372 __ ldr(ch1, Address(str1)); 4373 // Read whole register from str2. It is safe, because length >=8 here 4374 __ ldr(ch2, Address(str2)); 4375 __ sub(cnt2, cnt2, cnt1); 4376 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF); 4377 if (str1_isL != str2_isL) { 4378 __ eor(v0, __ T16B, v0, v0); 4379 } 4380 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 4381 __ mul(first, first, tmp1); 4382 // check if we have less than 1 register to check 4383 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1); 4384 if (str1_isL != str2_isL) { 4385 __ fmovd(v1, ch1); 4386 } 4387 __ br(__ LE, L_SMALL); 4388 __ eor(ch2, first, ch2); 4389 if (str1_isL != str2_isL) { 4390 __ zip1(v1, __ T16B, v1, v0); 4391 } 4392 __ sub(tmp2, ch2, tmp1); 4393 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4394 __ bics(tmp2, tmp2, ch2); 4395 if (str1_isL != str2_isL) { 4396 __ fmovd(ch1, v1); 4397 } 4398 __ br(__ NE, L_HAS_ZERO); 4399 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 4400 __ add(result, result, wordSize/str2_chr_size); 4401 __ add(str2, str2, wordSize); 4402 __ br(__ LT, L_POST_LOOP); 4403 __ BIND(L_LOOP); 4404 __ ldr(ch2, Address(str2)); 4405 __ eor(ch2, first, ch2); 4406 __ sub(tmp2, ch2, tmp1); 4407 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4408 __ bics(tmp2, tmp2, ch2); 4409 __ br(__ NE, L_HAS_ZERO); 4410 __ BIND(L_LOOP_PROCEED); 4411 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 4412 __ add(str2, str2, wordSize); 4413 __ add(result, result, wordSize/str2_chr_size); 4414 __ br(__ GE, L_LOOP); 4415 __ BIND(L_POST_LOOP); 4416 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check 4417 __ br(__ LE, NOMATCH); 4418 __ ldr(ch2, Address(str2)); 4419 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 4420 __ eor(ch2, first, ch2); 4421 __ sub(tmp2, ch2, tmp1); 4422 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4423 __ mov(tmp4, -1); // all bits set 4424 __ b(L_SMALL_PROCEED); 4425 __ align(OptoLoopAlignment); 4426 __ BIND(L_SMALL); 4427 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 4428 __ eor(ch2, first, ch2); 4429 if (str1_isL != str2_isL) { 4430 __ zip1(v1, __ T16B, v1, v0); 4431 } 4432 __ sub(tmp2, ch2, tmp1); 4433 __ mov(tmp4, -1); // all bits set 4434 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4435 if (str1_isL != str2_isL) { 4436 __ fmovd(ch1, v1); // move converted 4 symbols 4437 } 4438 __ BIND(L_SMALL_PROCEED); 4439 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits. 4440 __ bic(tmp2, tmp2, ch2); 4441 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check 4442 __ rbit(tmp2, tmp2); 4443 __ br(__ EQ, NOMATCH); 4444 __ BIND(L_SMALL_HAS_ZERO_LOOP); 4445 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's 4446 __ cmp(cnt1, u1(wordSize/str2_chr_size)); 4447 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2); 4448 if (str2_isL) { // LL 4449 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 4450 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 4451 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 4452 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 4453 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4454 } else { 4455 __ mov(ch2, 0xE); // all bits in byte set except last one 4456 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4457 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. 
Safe. 4458 __ lslv(tmp2, tmp2, tmp4); 4459 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4460 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4461 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4462 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4463 } 4464 __ cmp(ch1, ch2); 4465 __ mov(tmp4, wordSize/str2_chr_size); 4466 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4467 __ BIND(L_SMALL_CMP_LOOP); 4468 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 4469 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 4470 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 4471 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 4472 __ add(tmp4, tmp4, 1); 4473 __ cmp(tmp4, cnt1); 4474 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP); 4475 __ cmp(first, ch2); 4476 __ br(__ EQ, L_SMALL_CMP_LOOP); 4477 __ BIND(L_SMALL_CMP_LOOP_NOMATCH); 4478 __ cbz(tmp2, NOMATCH); // no more matches. exit 4479 __ clz(tmp4, tmp2); 4480 __ add(result, result, 1); // advance index 4481 __ add(str2, str2, str2_chr_size); // advance pointer 4482 __ b(L_SMALL_HAS_ZERO_LOOP); 4483 __ align(OptoLoopAlignment); 4484 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP); 4485 __ cmp(first, ch2); 4486 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4487 __ b(DONE); 4488 __ align(OptoLoopAlignment); 4489 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2); 4490 if (str2_isL) { // LL 4491 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 4492 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 4493 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 4494 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 4495 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4496 } else { 4497 __ mov(ch2, 0xE); // all bits in byte set except last one 4498 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4499 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4500 __ lslv(tmp2, tmp2, tmp4); 4501 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4502 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4503 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4504 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4505 } 4506 __ cmp(ch1, ch2); 4507 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4508 __ b(DONE); 4509 __ align(OptoLoopAlignment); 4510 __ BIND(L_HAS_ZERO); 4511 __ rbit(tmp2, tmp2); 4512 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's 4513 // Now, perform compression of counters(cnt2 and cnt1) into one register. 4514 // It's fine because both counters are 32bit and are not changed in this 4515 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop. 4516 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2); 4517 __ sub(result, result, 1); 4518 __ BIND(L_HAS_ZERO_LOOP); 4519 __ mov(cnt1, wordSize/str2_chr_size); 4520 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2); 4521 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare 4522 if (str2_isL) { 4523 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 4524 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
4525 __ lslv(tmp2, tmp2, tmp4); 4526 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4527 __ add(tmp4, tmp4, 1); 4528 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4529 __ lsl(tmp2, tmp2, 1); 4530 __ mov(tmp4, wordSize/str2_chr_size); 4531 } else { 4532 __ mov(ch2, 0xE); 4533 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4534 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4535 __ lslv(tmp2, tmp2, tmp4); 4536 __ add(tmp4, tmp4, 1); 4537 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4538 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 4539 __ lsl(tmp2, tmp2, 1); 4540 __ mov(tmp4, wordSize/str2_chr_size); 4541 __ sub(str2, str2, str2_chr_size); 4542 } 4543 __ cmp(ch1, ch2); 4544 __ mov(tmp4, wordSize/str2_chr_size); 4545 __ br(__ NE, L_CMP_LOOP_NOMATCH); 4546 __ BIND(L_CMP_LOOP); 4547 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 4548 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 4549 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 4550 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 4551 __ add(tmp4, tmp4, 1); 4552 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2); 4553 __ br(__ GE, L_CMP_LOOP_LAST_CMP); 4554 __ cmp(cnt1, ch2); 4555 __ br(__ EQ, L_CMP_LOOP); 4556 __ BIND(L_CMP_LOOP_NOMATCH); 4557 // here we're not matched 4558 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop 4559 __ clz(tmp4, tmp2); 4560 __ add(str2, str2, str2_chr_size); // advance pointer 4561 __ b(L_HAS_ZERO_LOOP); 4562 __ align(OptoLoopAlignment); 4563 __ BIND(L_CMP_LOOP_LAST_CMP); 4564 __ cmp(cnt1, ch2); 4565 __ br(__ NE, L_CMP_LOOP_NOMATCH); 4566 __ b(DONE); 4567 __ align(OptoLoopAlignment); 4568 __ BIND(L_CMP_LOOP_LAST_CMP2); 4569 if (str2_isL) { 4570 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 4571 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4572 __ lslv(tmp2, tmp2, tmp4); 4573 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4574 __ add(tmp4, tmp4, 1); 4575 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4576 __ lsl(tmp2, tmp2, 1); 4577 } else { 4578 __ mov(ch2, 0xE); 4579 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4580 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4581 __ lslv(tmp2, tmp2, tmp4); 4582 __ add(tmp4, tmp4, 1); 4583 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4584 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 4585 __ lsl(tmp2, tmp2, 1); 4586 __ sub(str2, str2, str2_chr_size); 4587 } 4588 __ cmp(ch1, ch2); 4589 __ br(__ NE, L_CMP_LOOP_NOMATCH); 4590 __ b(DONE); 4591 __ align(OptoLoopAlignment); 4592 __ BIND(L_HAS_ZERO_LOOP_NOMATCH); 4593 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until 4594 // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP, 4595 // so, result was increased at max by wordSize/str2_chr_size - 1, so, 4596 // respective high bit wasn't changed. L_LOOP_PROCEED will increase 4597 // result by analyzed characters value, so, we can just reset lower bits 4598 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL 4599 // 2) restore cnt1 and cnt2 values from "compressed" cnt2 4600 // 3) advance str2 value to represent next str2 octet. result & 7/3 is 4601 // index of last analyzed substring inside current octet. 
So, str2 in at 4602 // respective start address. We need to advance it to next octet 4603 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed 4604 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2); 4605 __ bfm(result, zr, 0, 2 - str2_chr_shift); 4606 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2 4607 __ movw(cnt2, cnt2); 4608 __ b(L_LOOP_PROCEED); 4609 __ align(OptoLoopAlignment); 4610 __ BIND(NOMATCH); 4611 __ mov(result, -1); 4612 __ BIND(DONE); 4613 __ pop(spilled_regs, sp); 4614 __ ret(lr); 4615 return entry; 4616 } 4617 4618 void generate_string_indexof_stubs() { 4619 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true); 4620 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false); 4621 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); 4622 } 4623 4624 void inflate_and_store_2_fp_registers(bool generatePrfm, 4625 FloatRegister src1, FloatRegister src2) { 4626 Register dst = r1; 4627 __ zip1(v1, __ T16B, src1, v0); 4628 __ zip2(v2, __ T16B, src1, v0); 4629 if (generatePrfm) { 4630 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM); 4631 } 4632 __ zip1(v3, __ T16B, src2, v0); 4633 __ zip2(v4, __ T16B, src2, v0); 4634 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64))); 4635 } 4636 4637 // R0 = src 4638 // R1 = dst 4639 // R2 = len 4640 // R3 = len >> 3 4641 // V0 = 0 4642 // v1 = loaded 8 bytes 4643 address generate_large_byte_array_inflate() { 4644 __ align(CodeEntryAlignment); 4645 StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate"); 4646 address entry = __ pc(); 4647 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE; 4648 Register src = r0, dst = r1, len = r2, octetCounter = r3; 4649 const int large_loop_threshold = MAX2(64, SoftwarePrefetchHintDistance)/8 + 4; 4650 4651 // do one more 8-byte read to have address 16-byte aligned in most cases 4652 // also use single store instruction 4653 __ ldrd(v2, __ post(src, 8)); 4654 __ sub(octetCounter, octetCounter, 2); 4655 __ zip1(v1, __ T16B, v1, v0); 4656 __ zip1(v2, __ T16B, v2, v0); 4657 __ st1(v1, v2, __ T16B, __ post(dst, 32)); 4658 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 4659 __ subs(rscratch1, octetCounter, large_loop_threshold); 4660 __ br(__ LE, LOOP_START); 4661 __ b(LOOP_PRFM_START); 4662 __ bind(LOOP_PRFM); 4663 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 4664 __ bind(LOOP_PRFM_START); 4665 __ prfm(Address(src, SoftwarePrefetchHintDistance)); 4666 __ sub(octetCounter, octetCounter, 8); 4667 __ subs(rscratch1, octetCounter, large_loop_threshold); 4668 inflate_and_store_2_fp_registers(true, v3, v4); 4669 inflate_and_store_2_fp_registers(true, v5, v6); 4670 __ br(__ GT, LOOP_PRFM); 4671 __ cmp(octetCounter, (u1)8); 4672 __ br(__ LT, DONE); 4673 __ bind(LOOP); 4674 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 4675 __ bind(LOOP_START); 4676 __ sub(octetCounter, octetCounter, 8); 4677 __ cmp(octetCounter, (u1)8); 4678 inflate_and_store_2_fp_registers(false, v3, v4); 4679 inflate_and_store_2_fp_registers(false, v5, v6); 4680 __ br(__ GE, LOOP); 4681 __ bind(DONE); 4682 __ ret(lr); 4683 return entry; 4684 } 4685 4686 /** 4687 * Arguments: 4688 * 4689 * Input: 4690 * c_rarg0 - current state address 4691 * c_rarg1 - H key address 4692 * c_rarg2 - data address 4693 * c_rarg3 - number of blocks 4694 * 4695 * Output: 4696 * Updated state at c_rarg0 4697 */ 4698 address 
generate_ghash_processBlocks() { 4699 // Bafflingly, GCM uses little-endian for the byte order, but 4700 // big-endian for the bit order. For example, the polynomial 1 is 4701 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 4702 // 4703 // So, we must either reverse the bytes in each word and do 4704 // everything big-endian or reverse the bits in each byte and do 4705 // it little-endian. On AArch64 it's more idiomatic to reverse 4706 // the bits in each byte (we have an instruction, RBIT, to do 4707 // that) and keep the data in little-endian bit order throught the 4708 // calculation, bit-reversing the inputs and outputs. 4709 4710 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks"); 4711 __ align(wordSize * 2); 4712 address p = __ pc(); 4713 __ emit_int64(0x87); // The low-order bits of the field 4714 // polynomial (i.e. p = z^7+z^2+z+1) 4715 // repeated in the low and high parts of a 4716 // 128-bit vector 4717 __ emit_int64(0x87); 4718 4719 __ align(CodeEntryAlignment); 4720 address start = __ pc(); 4721 4722 Register state = c_rarg0; 4723 Register subkeyH = c_rarg1; 4724 Register data = c_rarg2; 4725 Register blocks = c_rarg3; 4726 4727 FloatRegister vzr = v30; 4728 __ eor(vzr, __ T16B, vzr, vzr); // zero register 4729 4730 __ ldrq(v0, Address(state)); 4731 __ ldrq(v1, Address(subkeyH)); 4732 4733 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 4734 __ rbit(v0, __ T16B, v0); 4735 __ rev64(v1, __ T16B, v1); 4736 __ rbit(v1, __ T16B, v1); 4737 4738 __ ldrq(v26, p); 4739 4740 __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 4741 __ eor(v16, __ T16B, v16, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 4742 4743 { 4744 Label L_ghash_loop; 4745 __ bind(L_ghash_loop); 4746 4747 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 4748 // reversing each byte 4749 __ rbit(v2, __ T16B, v2); 4750 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 4751 4752 // Multiply state in v2 by subkey in v1 4753 ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 4754 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16, 4755 /*temps*/v6, v20, v18, v21); 4756 // Reduce v7:v5 by the field polynomial 4757 ghash_reduce(v0, v5, v7, v26, vzr, v20); 4758 4759 __ sub(blocks, blocks, 1); 4760 __ cbnz(blocks, L_ghash_loop); 4761 } 4762 4763 // The bit-reversed result is at this point in v0 4764 __ rev64(v1, __ T16B, v0); 4765 __ rbit(v1, __ T16B, v1); 4766 4767 __ st1(v1, __ T16B, state); 4768 __ ret(lr); 4769 4770 return start; 4771 } 4772 4773 // Continuation point for throwing of implicit exceptions that are 4774 // not handled in the current activation. Fabricates an exception 4775 // oop and initiates normal exception dispatching in this 4776 // frame. Since we need to preserve callee-saved values (currently 4777 // only for C2, but done for C1 as well) we need a callee-saved oop 4778 // map and therefore have to make these stubs into RuntimeStubs 4779 // rather than BufferBlobs. If the compiler needs all registers to 4780 // be preserved between the fault point and the exception handler 4781 // then it must assume responsibility for that in 4782 // AbstractCompiler::continuation_for_implicit_null_exception or 4783 // continuation_for_implicit_division_by_zero_exception. 
All other 4784 // implicit exceptions (e.g., NullPointerException or 4785 // AbstractMethodError on entry) are either at call sites or 4786 // otherwise assume that stack unwinding will be initiated, so 4787 // caller saved registers were assumed volatile in the compiler. 4788 4789 #undef __ 4790 #define __ masm-> 4791 4792 address generate_throw_exception(const char* name, 4793 address runtime_entry, 4794 Register arg1 = noreg, 4795 Register arg2 = noreg) { 4796 // Information about frame layout at time of blocking runtime call. 4797 // Note that we only have to preserve callee-saved registers since 4798 // the compilers are responsible for supplying a continuation point 4799 // if they expect all registers to be preserved. 4800 // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0 4801 enum layout { 4802 rfp_off = 0, 4803 rfp_off2, 4804 return_off, 4805 return_off2, 4806 framesize // inclusive of return address 4807 }; 4808 4809 int insts_size = 512; 4810 int locs_size = 64; 4811 4812 CodeBuffer code(name, insts_size, locs_size); 4813 OopMapSet* oop_maps = new OopMapSet(); 4814 MacroAssembler* masm = new MacroAssembler(&code); 4815 4816 address start = __ pc(); 4817 4818 // This is an inlined and slightly modified version of call_VM 4819 // which has the ability to fetch the return PC out of 4820 // thread-local storage and also sets up last_Java_sp slightly 4821 // differently than the real call_VM 4822 4823 __ enter(); // Save FP and LR before call 4824 4825 assert(is_even(framesize/2), "sp not 16-byte aligned"); 4826 4827 // lr and fp are already in place 4828 __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog 4829 4830 int frame_complete = __ pc() - start; 4831 4832 // Set up last_Java_sp and last_Java_fp 4833 address the_pc = __ pc(); 4834 __ set_last_Java_frame(sp, rfp, the_pc, rscratch1); 4835 4836 // Call runtime 4837 if (arg1 != noreg) { 4838 assert(arg2 != c_rarg1, "clobbered"); 4839 __ mov(c_rarg1, arg1); 4840 } 4841 if (arg2 != noreg) { 4842 __ mov(c_rarg2, arg2); 4843 } 4844 __ mov(c_rarg0, rthread); 4845 BLOCK_COMMENT("call runtime_entry"); 4846 __ mov(rscratch1, runtime_entry); 4847 __ blr(rscratch1); 4848 4849 // Generate oop map 4850 OopMap* map = new OopMap(framesize, 0); 4851 4852 oop_maps->add_gc_map(the_pc - start, map); 4853 4854 __ reset_last_Java_frame(true); 4855 __ maybe_isb(); 4856 4857 __ leave(); 4858 4859 // check for pending exceptions 4860 #ifdef ASSERT 4861 Label L; 4862 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 4863 __ cbnz(rscratch1, L); 4864 __ should_not_reach_here(); 4865 __ bind(L); 4866 #endif // ASSERT 4867 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 4868 4869 4870 // codeBlob framesize is in words (not VMRegImpl::slot_size) 4871 RuntimeStub* stub = 4872 RuntimeStub::new_runtime_stub(name, 4873 &code, 4874 frame_complete, 4875 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 4876 oop_maps, false); 4877 return stub->entry_point(); 4878 } 4879 4880 class MontgomeryMultiplyGenerator : public MacroAssembler { 4881 4882 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 4883 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 4884 4885 RegSet _toSave; 4886 bool _squaring; 4887 4888 public: 4889 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 4890 : MacroAssembler(as->code()), _squaring(squaring) { 4891 4892 // Register allocation 4893 4894 Register reg = c_rarg0; 4895 Pa_base = reg; // Argument registers 4896 if 
(squaring) 4897 Pb_base = Pa_base; 4898 else 4899 Pb_base = next_reg(reg); 4900 Pn_base = next_reg(reg); 4901 Rlen= next_reg(reg); 4902 inv = next_reg(reg); 4903 Pm_base = next_reg(reg); 4904 4905 // Working registers: 4906 Ra = next_reg(reg); // The current digit of a, b, n, and m. 4907 Rb = next_reg(reg); 4908 Rm = next_reg(reg); 4909 Rn = next_reg(reg); 4910 4911 Pa = next_reg(reg); // Pointers to the current/next digit of a, b, n, and m. 4912 Pb = next_reg(reg); 4913 Pm = next_reg(reg); 4914 Pn = next_reg(reg); 4915 4916 t0 = next_reg(reg); // Three registers which form a 4917 t1 = next_reg(reg); // triple-precision accumuator. 4918 t2 = next_reg(reg); 4919 4920 Ri = next_reg(reg); // Inner and outer loop indexes. 4921 Rj = next_reg(reg); 4922 4923 Rhi_ab = next_reg(reg); // Product registers: low and high parts 4924 Rlo_ab = next_reg(reg); // of a*b and m*n. 4925 Rhi_mn = next_reg(reg); 4926 Rlo_mn = next_reg(reg); 4927 4928 // r19 and up are callee-saved. 4929 _toSave = RegSet::range(r19, reg) + Pm_base; 4930 } 4931 4932 private: 4933 Register next_reg(Register ®) { 4934 #ifdef _WIN64 4935 // skip r18 on Windows, it's used by native TLS 4936 return ++reg == r18 ? ++reg : reg; 4937 #else 4938 return ++reg; 4939 #endif 4940 } 4941 4942 void save_regs() { 4943 push(_toSave, sp); 4944 } 4945 4946 void restore_regs() { 4947 pop(_toSave, sp); 4948 } 4949 4950 template <typename T> 4951 void unroll_2(Register count, T block) { 4952 Label loop, end, odd; 4953 tbnz(count, 0, odd); 4954 cbz(count, end); 4955 align(16); 4956 bind(loop); 4957 (this->*block)(); 4958 bind(odd); 4959 (this->*block)(); 4960 subs(count, count, 2); 4961 br(Assembler::GT, loop); 4962 bind(end); 4963 } 4964 4965 template <typename T> 4966 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 4967 Label loop, end, odd; 4968 tbnz(count, 0, odd); 4969 cbz(count, end); 4970 align(16); 4971 bind(loop); 4972 (this->*block)(d, s, tmp); 4973 bind(odd); 4974 (this->*block)(d, s, tmp); 4975 subs(count, count, 2); 4976 br(Assembler::GT, loop); 4977 bind(end); 4978 } 4979 4980 void pre1(RegisterOrConstant i) { 4981 block_comment("pre1"); 4982 // Pa = Pa_base; 4983 // Pb = Pb_base + i; 4984 // Pm = Pm_base; 4985 // Pn = Pn_base + i; 4986 // Ra = *Pa; 4987 // Rb = *Pb; 4988 // Rm = *Pm; 4989 // Rn = *Pn; 4990 ldr(Ra, Address(Pa_base)); 4991 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 4992 ldr(Rm, Address(Pm_base)); 4993 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4994 lea(Pa, Address(Pa_base)); 4995 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 4996 lea(Pm, Address(Pm_base)); 4997 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4998 4999 // Zero the m*n result. 5000 mov(Rhi_mn, zr); 5001 mov(Rlo_mn, zr); 5002 } 5003 5004 // The core multiply-accumulate step of a Montgomery 5005 // multiplication. The idea is to schedule operations as a 5006 // pipeline so that instructions with long latencies (loads and 5007 // multiplies) have time to complete before their results are 5008 // used. This most benefits in-order implementations of the 5009 // architecture but out-of-order ones also benefit. 5010 void step() { 5011 block_comment("step"); 5012 // MACC(Ra, Rb, t0, t1, t2); 5013 // Ra = *++Pa; 5014 // Rb = *--Pb; 5015 umulh(Rhi_ab, Ra, Rb); 5016 mul(Rlo_ab, Ra, Rb); 5017 ldr(Ra, pre(Pa, wordSize)); 5018 ldr(Rb, pre(Pb, -wordSize)); 5019 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 5020 // previous iteration. 
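// Second half of the step: start this iteration's m*n product and fold in the a*b product
// computed above. The freshly started m*n is left pending on purpose; it is not consumed
// until the next call (or post1), which is what hides the multiply latency.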

    // The core multiply-accumulate step of a Montgomery
    // multiplication. The idea is to schedule operations as a
    // pipeline so that instructions with long latencies (loads and
    // multiplies) have time to complete before their results are
    // used. This most benefits in-order implementations of the
    // architecture but out-of-order ones also benefit.
    void step() {
      block_comment("step");
      // MACC(Ra, Rb, t0, t1, t2);
      // Ra = *++Pa;
      // Rb = *--Pb;
      umulh(Rhi_ab, Ra, Rb);
      mul(Rlo_ab, Ra, Rb);
      ldr(Ra, pre(Pa, wordSize));
      ldr(Rb, pre(Pb, -wordSize));
      acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
                                       // previous iteration.
      // MACC(Rm, Rn, t0, t1, t2);
      // Rm = *++Pm;
      // Rn = *--Pn;
      umulh(Rhi_mn, Rm, Rn);
      mul(Rlo_mn, Rm, Rn);
      ldr(Rm, pre(Pm, wordSize));
      ldr(Rn, pre(Pn, -wordSize));
      acc(Rhi_ab, Rlo_ab, t0, t1, t2);
    }

    void post1() {
      block_comment("post1");

      // MACC(Ra, Rb, t0, t1, t2);
      // Ra = *++Pa;
      // Rb = *--Pb;
      umulh(Rhi_ab, Ra, Rb);
      mul(Rlo_ab, Ra, Rb);
      acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n
      acc(Rhi_ab, Rlo_ab, t0, t1, t2);

      // *Pm = Rm = t0 * inv;
      mul(Rm, t0, inv);
      str(Rm, Address(Pm));

      // MACC(Rm, Rn, t0, t1, t2);
      // t0 = t1; t1 = t2; t2 = 0;
      umulh(Rhi_mn, Rm, Rn);

#ifndef PRODUCT
      // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
      {
        mul(Rlo_mn, Rm, Rn);
        add(Rlo_mn, t0, Rlo_mn);
        Label ok;
        cbz(Rlo_mn, ok); {
          stop("broken Montgomery multiply");
        } bind(ok);
      }
#endif
      // We have very carefully set things up so that
      // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
      // the lower half of Rm * Rn because we know the result already:
      // it must be -t0. t0 + (-t0) must generate a carry iff
      // t0 != 0. So, rather than do a mul and an adds we just set
      // the carry flag iff t0 is nonzero.
      //
      // mul(Rlo_mn, Rm, Rn);
      // adds(zr, t0, Rlo_mn);
      subs(zr, t0, 1); // Set carry iff t0 is nonzero
      adcs(t0, t1, Rhi_mn);
      adc(t1, t2, zr);
      mov(t2, zr);
    }

    void pre2(RegisterOrConstant i, RegisterOrConstant len) {
      block_comment("pre2");
      // Pa = Pa_base + i-len;
      // Pb = Pb_base + len;
      // Pm = Pm_base + i-len;
      // Pn = Pn_base + len;

      if (i.is_register()) {
        sub(Rj, i.as_register(), len);
      } else {
        mov(Rj, i.as_constant());
        sub(Rj, Rj, len);
      }
      // Rj == i-len

      lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
      lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
      lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
      lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));

      // Ra = *++Pa;
      // Rb = *--Pb;
      // Rm = *++Pm;
      // Rn = *--Pn;
      ldr(Ra, pre(Pa, wordSize));
      ldr(Rb, pre(Pb, -wordSize));
      ldr(Rm, pre(Pm, wordSize));
      ldr(Rn, pre(Pn, -wordSize));

      mov(Rhi_mn, zr);
      mov(Rlo_mn, zr);
    }

    void post2(RegisterOrConstant i, RegisterOrConstant len) {
      block_comment("post2");
      if (i.is_constant()) {
        mov(Rj, i.as_constant()-len.as_constant());
      } else {
        sub(Rj, i.as_register(), len);
      }

      adds(t0, t0, Rlo_mn); // The pending m*n, low part

      // As soon as we know the least significant digit of our result,
      // store it.
      // Pm_base[i-len] = t0;
      str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));

      // t0 = t1; t1 = t2; t2 = 0;
      adcs(t0, t1, Rhi_mn); // The pending m*n, high part
      adc(t1, t2, zr);
      mov(t2, zr);
    }

    // A carry in t0 after Montgomery multiplication means that we
    // should subtract multiples of n from our result in m. We'll
    // keep doing that until there is no carry.
    void normalize(RegisterOrConstant len) {
      block_comment("normalize");
      // while (t0)
      //   t0 = sub(Pm_base, Pn_base, t0, len);
      Label loop, post, again;
      Register cnt = t1, i = t2; // Re-use registers; we're done with them now
      cbz(t0, post); {
        bind(again); {
          mov(i, zr);
          mov(cnt, len);
          ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
          ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
          subs(zr, zr, zr); // set carry flag, i.e. no borrow
          align(16);
          bind(loop); {
            sbcs(Rm, Rm, Rn);
            str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
            add(i, i, 1);
            ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
            ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
            sub(cnt, cnt, 1);
          } cbnz(cnt, loop);
          sbc(t0, t0, zr);
        } cbnz(t0, again);
      } bind(post);
    }
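
    // The sub() in normalize()'s pseudocode comment is a borrow-propagating
    // multi-precision subtraction of the modulus n from the result m; the
    // sbcs loop above is its assembly form.  A sketch in C:
    //
    //   static uint64_t sub(uint64_t Pm_base[], uint64_t Pn_base[],
    //                       uint64_t t0, int len) {
    //     uint64_t borrow = 0;
    //     for (int i = 0; i < len; i++) {
    //       uint64_t m = Pm_base[i], n = Pn_base[i];
    //       Pm_base[i] = m - n - borrow;
    //       borrow = (m < n) || (m == n && borrow);  // borrow out of this word
    //     }
    //     return t0 - borrow;   // what the final sbc(t0, t0, zr) computes
    //   }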

    // Move memory at s to d, reversing words.
    //    Increments d to end of copied memory
    //    Destroys tmp1, tmp2
    //    Preserves len
    //    Leaves s pointing to the address which was in d at start
    void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
      assert(tmp1 < r19 && tmp2 < r19, "register corruption");

      lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
      mov(tmp1, len);
      unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
      sub(s, d, len, ext::uxtw, LogBytesPerWord);
    }
    // where
    void reverse1(Register d, Register s, Register tmp) {
      ldr(tmp, pre(s, -wordSize));
      ror(tmp, tmp, 32);
      str(tmp, post(d, wordSize));
    }
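
    // In the squaring case each cross product a[i]*a[j] with i != j appears
    // twice in the result, which is why the pseudocode for generate_square
    // uses MACC2 and why step_squaring() below accumulates the a*b product a
    // second time.  Roughly:
    //
    //   static inline void MACC2(uint64_t x, uint64_t y,
    //                            uint64_t &t0, uint64_t &t1, uint64_t &t2) {
    //     MACC(x, y, t0, t1, t2);
    //     MACC(x, y, t0, t1, t2);   // the symmetric partner a[j]*a[i]
    //   }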

    void step_squaring() {
      // An extra ACC
      step();
      acc(Rhi_ab, Rlo_ab, t0, t1, t2);
    }

    void last_squaring(RegisterOrConstant i) {
      Label dont;
      // if ((i & 1) == 0) {
      tbnz(i.as_register(), 0, dont); {
        // MACC(Ra, Rb, t0, t1, t2);
        // Ra = *++Pa;
        // Rb = *--Pb;
        umulh(Rhi_ab, Ra, Rb);
        mul(Rlo_ab, Ra, Rb);
        acc(Rhi_ab, Rlo_ab, t0, t1, t2);
      } bind(dont);
    }

    void extra_step_squaring() {
      acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n

      // MACC(Rm, Rn, t0, t1, t2);
      // Rm = *++Pm;
      // Rn = *--Pn;
      umulh(Rhi_mn, Rm, Rn);
      mul(Rlo_mn, Rm, Rn);
      ldr(Rm, pre(Pm, wordSize));
      ldr(Rn, pre(Pn, -wordSize));
    }

    void post1_squaring() {
      acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n

      // *Pm = Rm = t0 * inv;
      mul(Rm, t0, inv);
      str(Rm, Address(Pm));

      // MACC(Rm, Rn, t0, t1, t2);
      // t0 = t1; t1 = t2; t2 = 0;
      umulh(Rhi_mn, Rm, Rn);

#ifndef PRODUCT
      // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
      {
        mul(Rlo_mn, Rm, Rn);
        add(Rlo_mn, t0, Rlo_mn);
        Label ok;
        cbz(Rlo_mn, ok); {
          stop("broken Montgomery multiply");
        } bind(ok);
      }
#endif
      // We have very carefully set things up so that
      // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
      // the lower half of Rm * Rn because we know the result already:
      // it must be -t0. t0 + (-t0) must generate a carry iff
      // t0 != 0. So, rather than do a mul and an adds we just set
      // the carry flag iff t0 is nonzero.
      //
      // mul(Rlo_mn, Rm, Rn);
      // adds(zr, t0, Rlo_mn);
      subs(zr, t0, 1); // Set carry iff t0 is nonzero
      adcs(t0, t1, Rhi_mn);
      adc(t1, t2, zr);
      mov(t2, zr);
    }

    void acc(Register Rhi, Register Rlo,
             Register t0, Register t1, Register t2) {
      adds(t0, t0, Rlo);
      adcs(t1, t1, Rhi);
      adc(t2, t2, zr);
    }

  public:
    /**
     * Fast Montgomery multiplication. The derivation of the
     * algorithm is in A Cryptographic Library for the Motorola
     * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
     *
     * Arguments:
     *
     * Inputs for multiplication:
     *   c_rarg0   - int array elements a
     *   c_rarg1   - int array elements b
     *   c_rarg2   - int array elements n (the modulus)
     *   c_rarg3   - int length
     *   c_rarg4   - int inv
     *   c_rarg5   - int array elements m (the result)
     *
     * Inputs for squaring:
     *   c_rarg0   - int array elements a
     *   c_rarg1   - int array elements n (the modulus)
     *   c_rarg2   - int length
     *   c_rarg3   - int inv
     *   c_rarg4   - int array elements m (the result)
     *
     */
    address generate_multiply() {
      Label argh, nothing;
      bind(argh);
      stop("MontgomeryMultiply total_allocation must be <= 8192");

      align(CodeEntryAlignment);
      address entry = pc();

      cbzw(Rlen, nothing);

      enter();

      // Make room.
      cmpw(Rlen, 512);
      br(Assembler::HI, argh);
      sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
      andr(sp, Ra, -2 * wordSize);

      lsrw(Rlen, Rlen, 1); // length in longwords = len/2

      {
        // Copy input args, reversing as we go. We use Ra as a
        // temporary variable.
        reverse(Ra, Pa_base, Rlen, t0, t1);
        if (!_squaring)
          reverse(Ra, Pb_base, Rlen, t0, t1);
        reverse(Ra, Pn_base, Rlen, t0, t1);
      }

      // Push all call-saved registers and also Pm_base which we'll need
      // at the end.
      save_regs();

#ifndef PRODUCT
      // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
      {
        ldr(Rn, Address(Pn_base, 0));
        mul(Rlo_mn, Rn, inv);
        subs(zr, Rlo_mn, -1);
        Label ok;
        br(EQ, ok); {
          stop("broken inverse in Montgomery multiply");
        } bind(ok);
      }
#endif

      mov(Pm_base, Ra);

      mov(t0, zr);
      mov(t1, zr);
      mov(t2, zr);

      block_comment("for (int i = 0; i < len; i++) {");
      mov(Ri, zr); {
        Label loop, end;
        cmpw(Ri, Rlen);
        br(Assembler::GE, end);

        bind(loop);
        pre1(Ri);

        block_comment(" for (j = i; j; j--) {"); {
          movw(Rj, Ri);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
        } block_comment(" } // j");

        post1();
        addw(Ri, Ri, 1);
        cmpw(Ri, Rlen);
        br(Assembler::LT, loop);
        bind(end);
        block_comment("} // i");
      }

      block_comment("for (int i = len; i < 2*len; i++) {");
      mov(Ri, Rlen); {
        Label loop, end;
        cmpw(Ri, Rlen, Assembler::LSL, 1);
        br(Assembler::GE, end);

        bind(loop);
        pre2(Ri, Rlen);

        block_comment(" for (j = len*2-i-1; j; j--) {"); {
          lslw(Rj, Rlen, 1);
          subw(Rj, Rj, Ri);
          subw(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
        } block_comment(" } // j");

        post2(Ri, Rlen);
        addw(Ri, Ri, 1);
        cmpw(Ri, Rlen, Assembler::LSL, 1);
        br(Assembler::LT, loop);
        bind(end);
      }
      block_comment("} // i");

      normalize(Rlen);

      mov(Ra, Pm_base);  // Save Pm_base in Ra
      restore_regs();    // Restore caller's Pm_base

      // Copy our result into caller's Pm_base
      reverse(Pm_base, Ra, Rlen, t0, t1);

      leave();
      bind(nothing);
      ret(lr);

      return entry;
    }
    // In C, approximately:

    // void
    // montgomery_multiply(uint64_t Pa_base[], uint64_t Pb_base[],
    //                     uint64_t Pn_base[], uint64_t Pm_base[],
    //                     uint64_t inv, int len) {
    //   uint64_t t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
    //   uint64_t *Pa, *Pb, *Pn, *Pm;
    //   uint64_t Ra, Rb, Rn, Rm;

    //   int i;

    //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");

    //   for (i = 0; i < len; i++) {
    //     int j;

    //     Pa = Pa_base;
    //     Pb = Pb_base + i;
    //     Pm = Pm_base;
    //     Pn = Pn_base + i;

    //     Ra = *Pa;
    //     Rb = *Pb;
    //     Rm = *Pm;
    //     Rn = *Pn;

    //     int iters = i;
    //     for (j = 0; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
    //       MACC(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }

    //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
    //     MACC(Ra, Rb, t0, t1, t2);
    //     *Pm = Rm = t0 * inv;
    //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
    //     MACC(Rm, Rn, t0, t1, t2);

    //     assert(t0 == 0, "broken Montgomery multiply");

    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   for (i = len; i < 2*len; i++) {
    //     int j;

    //     Pa = Pa_base + i-len;
    //     Pb = Pb_base + len;
    //     Pm = Pm_base + i-len;
    //     Pn = Pn_base + len;

    //     Ra = *++Pa;
    //     Rb = *--Pb;
    //     Rm = *++Pm;
    //     Rn = *--Pn;

    //     int iters = len*2-i-1;
    //     for (j = i-len+1; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
    //       MACC(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }

    //     Pm_base[i-len] = t0;
    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   while (t0)
    //     t0 = sub(Pm_base, Pn_base, t0, len);
    // }

    /**
     * Fast Montgomery squaring. This uses asymptotically 25% fewer
     * multiplies than Montgomery multiplication so it should be up to
     * 25% faster. However, its loop control is more complex and it
     * may actually run slower on some machines.
     *
     * Arguments:
     *
     * Inputs:
     *   c_rarg0   - int array elements a
     *   c_rarg1   - int array elements n (the modulus)
     *   c_rarg2   - int length
     *   c_rarg3   - int inv
     *   c_rarg4   - int array elements m (the result)
     *
     */
    address generate_square() {
      Label argh;
      bind(argh);
      stop("MontgomeryMultiply total_allocation must be <= 8192");

      align(CodeEntryAlignment);
      address entry = pc();

      enter();

      // Make room.
      cmpw(Rlen, 512);
      br(Assembler::HI, argh);
      sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
      andr(sp, Ra, -2 * wordSize);

      lsrw(Rlen, Rlen, 1); // length in longwords = len/2

      {
        // Copy input args, reversing as we go. We use Ra as a
        // temporary variable.
        reverse(Ra, Pa_base, Rlen, t0, t1);
        reverse(Ra, Pn_base, Rlen, t0, t1);
      }

      // Push all call-saved registers and also Pm_base which we'll need
      // at the end.
      save_regs();

      mov(Pm_base, Ra);

      mov(t0, zr);
      mov(t1, zr);
      mov(t2, zr);

      block_comment("for (int i = 0; i < len; i++) {");
      mov(Ri, zr); {
        Label loop, end;
        bind(loop);
        cmp(Ri, Rlen);
        br(Assembler::GE, end);

        pre1(Ri);

        block_comment("for (j = (i+1)/2; j; j--) {"); {
          add(Rj, Ri, 1);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
        } block_comment(" } // j");

        last_squaring(Ri);

        block_comment(" for (j = i/2; j; j--) {"); {
          lsr(Rj, Ri, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
        } block_comment(" } // j");

        post1_squaring();
        add(Ri, Ri, 1);
        cmp(Ri, Rlen);
        br(Assembler::LT, loop);

        bind(end);
        block_comment("} // i");
      }

      block_comment("for (int i = len; i < 2*len; i++) {");
      mov(Ri, Rlen); {
        Label loop, end;
        bind(loop);
        cmp(Ri, Rlen, Assembler::LSL, 1);
        br(Assembler::GE, end);

        pre2(Ri, Rlen);

        block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); {
          lsl(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          sub(Rj, Rj, 1);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
        } block_comment(" } // j");

        last_squaring(Ri);

        block_comment(" for (j = (2*len-i)/2; j; j--) {"); {
          lsl(Rj, Rlen, 1);
          sub(Rj, Rj, Ri);
          lsr(Rj, Rj, 1);
          unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
        } block_comment(" } // j");

        post2(Ri, Rlen);
        add(Ri, Ri, 1);
        cmp(Ri, Rlen, Assembler::LSL, 1);

        br(Assembler::LT, loop);
        bind(end);
        block_comment("} // i");
      }

      normalize(Rlen);

      mov(Ra, Pm_base);  // Save Pm_base in Ra
      restore_regs();    // Restore caller's Pm_base

      // Copy our result into caller's Pm_base
      reverse(Pm_base, Ra, Rlen, t0, t1);

      leave();
      ret(lr);

      return entry;
    }
    // In C, approximately:

    // void
    // montgomery_square(uint64_t Pa_base[], uint64_t Pn_base[],
    //                   uint64_t Pm_base[], uint64_t inv, int len) {
    //   uint64_t t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
    //   uint64_t *Pa, *Pb, *Pn, *Pm;
    //   uint64_t Ra, Rb, Rn, Rm;

    //   int i;

    //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");

    //   for (i = 0; i < len; i++) {
    //     int j;

    //     Pa = Pa_base;
    //     Pb = Pa_base + i;
    //     Pm = Pm_base;
    //     Pn = Pn_base + i;

    //     Ra = *Pa;
    //     Rb = *Pb;
    //     Rm = *Pm;
    //     Rn = *Pn;

    //     int iters = (i+1)/2;
    //     for (j = 0; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
    //       MACC2(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     if ((i & 1) == 0) {
    //       assert(Ra == Pa_base[j], "must be");
    //       MACC(Ra, Ra, t0, t1, t2);
    //     }
    //     iters = i/2;
    //     assert(iters == i-j, "must be");
    //     for (; iters--; j++) {
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }

    //     *Pm = Rm = t0 * inv;
    //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
    //     MACC(Rm, Rn, t0, t1, t2);

    //     assert(t0 == 0, "broken Montgomery multiply");

    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   for (i = len; i < 2*len; i++) {
    //     int start = i-len+1;
    //     int end = start + (len - start)/2;
    //     int j;

    //     Pa = Pa_base + i-len;
    //     Pb = Pa_base + len;
    //     Pm = Pm_base + i-len;
    //     Pn = Pn_base + len;

    //     Ra = *++Pa;
    //     Rb = *--Pb;
    //     Rm = *++Pm;
    //     Rn = *--Pn;

    //     int iters = (2*len-i-1)/2;
    //     assert(iters == end-start, "must be");
    //     for (j = start; iters--; j++) {
    //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
    //       MACC2(Ra, Rb, t0, t1, t2);
    //       Ra = *++Pa;
    //       Rb = *--Pb;
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     if ((i & 1) == 0) {
    //       assert(Ra == Pa_base[j], "must be");
    //       MACC(Ra, Ra, t0, t1, t2);
    //     }
    //     iters = (2*len-i)/2;
    //     assert(iters == len-j, "must be");
    //     for (; iters--; j++) {
    //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
    //       MACC(Rm, Rn, t0, t1, t2);
    //       Rm = *++Pm;
    //       Rn = *--Pn;
    //     }
    //     Pm_base[i-len] = t0;
    //     t0 = t1; t1 = t2; t2 = 0;
    //   }

    //   while (t0)
    //     t0 = sub(Pm_base, Pn_base, t0, len);
    // }
  };


  // Initialization
  void generate_initial() {
    // Generate initial stubs and initialize the entry points.

    // Entry points that exist on all platforms. Note: this is code
    // that could be shared among different platforms; however, the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also the comment
    // in stubRoutines.hpp.

    StubRoutines::_forward_exception_entry = generate_forward_exception();

    StubRoutines::_call_stub_entry =
      generate_call_stub(StubRoutines::_call_stub_return_address);

    // is referenced by megamorphic call
    StubRoutines::_catch_exception_entry = generate_catch_exception();

    // Build this early so it's available for the interpreter.
    StubRoutines::_throw_StackOverflowError_entry =
      generate_throw_exception("StackOverflowError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::throw_StackOverflowError));
    StubRoutines::_throw_delayed_StackOverflowError_entry =
      generate_throw_exception("delayed StackOverflowError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::throw_delayed_StackOverflowError));
    if (UseCRC32Intrinsics) {
      // set table address before stub generation, which uses it
      StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
    }

    if (UseCRC32CIntrinsics) {
      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
    }

    // Disabled until JDK-8210858 is fixed
    // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
    //   StubRoutines::_dlog = generate_dlog();
    // }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
      StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
      StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
    }

    // Safefetch stubs.
    generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
                       &StubRoutines::_safefetch32_fault_pc,
                       &StubRoutines::_safefetch32_continuation_pc);
    generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
                       &StubRoutines::_safefetchN_fault_pc,
                       &StubRoutines::_safefetchN_continuation_pc);
  }

  void generate_all() {
    // support for verify_oop (must happen after universe_init)
    StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
    StubRoutines::_throw_AbstractMethodError_entry =
      generate_throw_exception("AbstractMethodError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_AbstractMethodError));

    StubRoutines::_throw_IncompatibleClassChangeError_entry =
      generate_throw_exception("IncompatibleClassChangeError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_IncompatibleClassChangeError));

    StubRoutines::_throw_NullPointerException_at_call_entry =
      generate_throw_exception("NullPointerException at call throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_NullPointerException_at_call));

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    // has_negatives stub for large arrays.
    StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);

    // array equals stub for large arrays.
    if (!UseSimpleArrayEquals) {
      StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
    }

    generate_compare_long_strings();

    generate_string_indexof_stubs();

    // byte_array_inflate stub for large arrays.
    StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();

    BarrierSetNMethod* bs_nm = BarrierSet::barrier_set()->barrier_set_nmethod();
    if (bs_nm != NULL) {
      StubRoutines::aarch64::_method_entry_barrier = generate_method_entry_barrier();
    }
#ifdef COMPILER2
    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }

    if (UseSquareToLenIntrinsic) {
      StubRoutines::_squareToLen = generate_squareToLen();
    }

    if (UseMulAddIntrinsic) {
      StubRoutines::_mulAdd = generate_mulAdd();
    }

    if (UseMontgomeryMultiplyIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }

    if (UseMontgomerySquareIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      // We use generate_multiply() rather than generate_square()
      // because it's faster for the sizes of modulus we care about.
      StubRoutines::_montgomerySquare = g.generate_multiply();
    }
#endif // COMPILER2

    // generate GHASH intrinsics code
    if (UseGHASHIntrinsics) {
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
    }

    // data cache line writeback
    StubRoutines::_data_cache_writeback = generate_data_cache_writeback();
    StubRoutines::_data_cache_writeback_sync = generate_data_cache_writeback_sync();

    if (UseAESIntrinsics) {
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
    }

    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress");
      StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB");
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress");
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB");
    }

    // generate Adler32 intrinsics code
    if (UseAdler32Intrinsics) {
      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
    }

    StubRoutines::aarch64::set_completed();
  }

 public:
  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
    if (all) {
      generate_all();
    } else {
      generate_initial();
    }
  }
}; // end class declaration

#define UCM_TABLE_MAX_ENTRIES 8
void StubGenerator_generate(CodeBuffer* code, bool all) {
  if (UnsafeCopyMemory::_table == NULL) {
    UnsafeCopyMemory::create_table(UCM_TABLE_MAX_ENTRIES);
  }
  StubGenerator g(code, all);
}
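
// For context: StubGenerator_generate is expected to be driven from
// stubRoutines.cpp, once early in VM startup with all == false for the stubs
// the interpreter needs, and once later (after universe_init) with all == true
// for the rest -- roughly:
//
//   StubGenerator_generate(initial_stubs_code, /*all*/ false);
//   StubGenerator_generate(final_stubs_code,   /*all*/ true);
//
// The buffer names above are illustrative, not the actual variable names.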