/*
 * Copyright (c) 2003, 2019, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2019, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "utilities/align.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_ZGC
#include "gc/z/zThreadLocalData.hpp"
#endif

#ifdef BUILTIN_SIM
#include "../../../../../../simulator/simulator.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->
#define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
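// n.b. TIMES_OOP scales an array index by the size of a heap oop:
// 4 bytes when compressed oops are in use, 8 bytes otherwise; the
// 32-bit index register is sign-extended (sxtw) before scaling.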

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(int& counter) {
    __ lea(rscratch2, ExternalAddress((address)&counter));
    __ ldrw(rscratch1, Address(rscratch2));
    __ addw(rscratch1, rscratch1, 1);
    __ strw(rscratch1, Address(rscratch2));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    c_rarg6:   parameter size (in words)              int
  //    c_rarg7:   thread                                 Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the c arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -27 [ argument word 1      ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper    (r0) ]
  //  -7 [ result          (r1) ]
  //  -6 [ result type     (r2) ]
  //  -5 [ method          (r3) ]
  //  -4 [ entry point     (r4) ]
  //  -3 [ parameters      (r5) ]
  //  -2 [ parameter size  (r6) ]
  //  -1 [ thread          (r7) ]
  //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  //   1 [ saved lr       (r30) ]

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off  = -26,

    d15_off            = -26,
    d13_off            = -24,
    d11_off            = -22,
    d9_off             = -20,

    r28_off            = -18,
    r26_off            = -16,
    r24_off            = -14,
    r22_off            = -12,
    r20_off            = -10,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0,
    retaddr_off        =   1,
  };
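
  // n.b. the callee-save registers (r19-r28 and v8-v15) are saved and
  // restored in pairs with stp/stpd, so only the lower (even) offset of
  // each pair is named above; the pair's other register occupies the
  // next slot up, as shown in the frame diagram.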

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    const Address sp_after_call(rfp, sp_after_call_off * wordSize);

    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off  * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);

    // stub code

    // we need a C prolog to bootstrap the x86 caller into the sim
    __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);

    address aarch64_entry = __ pc();

#ifdef BUILTIN_SIM
    // Save sender's SP for stack traces.
    __ mov(rscratch1, sp);
    __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
#endif
    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7, thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5, entry_point);
    __ stp(c_rarg2, c_rarg3, result_type);
    __ stp(c_rarg0, c_rarg1, call_wrapper);

    __ stp(r20, r19, r20_save);
    __ stp(r22, r21, r22_save);
    __ stp(r24, r23, r24_save);
    __ stp(r26, r25, r26_save);
    __ stp(r28, r27, r28_save);

    __ stpd(v9,  v8,  d9_save);
    __ stpd(v11, v10, d11_save);
    __ stpd(v13, v12, d13_save);
    __ stpd(v15, v14, d15_save);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (u1)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing methodOop, and current sp
    //      rmethod: Method*
    //      r13: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r13, sp);
    __ blr(c_rarg4);

    // tell the simulator we have returned to the stub

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    if (NotifySimulator) {
      __ notify(Assembler::method_reentry);
    }
    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    __ ldr(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ldr(j_rarg1, result_type);
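    // n.b. a T_OBJECT result comes back in r0 just like a T_LONG, so
    // both types share the 64-bit store at is_long below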
    __ cmp(j_rarg1, (u1)T_OBJECT);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, (u1)T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, (u1)T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(j_rarg1, (u1)T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    // restore callee-save registers
    __ ldpd(v15, v14, d15_save);
    __ ldpd(v13, v12, d13_save);
    __ ldpd(v11, v10, d11_save);
    __ ldpd(v9,  v8,  d9_save);

    __ ldp(r28, r27, r28_save);
    __ ldp(r26, r25, r26_save);
    __ ldp(r24, r23, r24_save);
    __ ldp(r22, r21, r22_save);
    __ ldp(r20, r19, r20_save);

    __ ldp(c_rarg0, c_rarg1, call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3, method);
    __ ldp(c_rarg4, c_rarg5, entry_point);
    __ ldp(c_rarg6, c_rarg7, parameter_size);

#ifndef PRODUCT
    // tell the simulator we are about to end Java execution
    if (NotifySimulator) {
      __ notify(Assembler::method_exit);
    }
#endif
    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT

    __ BIND(is_long);
    __ str(r0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // rsp.
  //
  // r0: exception oop

  // NOTE: this is used as a target from the signal handler so it
  // needs an x86 prolog which returns into the current simulator
  // executing the generated catch_exception code. so the prolog
  // needs to install rax in a sim register and adjust the sim's
  // restart pc to enter the generated code at the start position
  // then return from native to simulated execution.

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread       (rfp, thread_off        * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                        SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);
    __ mov(r19, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {

    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is NULL it is OK

#if INCLUDE_ZGC
    if (UseZGC) {
      // Check if mask is good.
      // verifies that ZAddressBadMask & r0 == 0
      __ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset()));
      __ andr(c_rarg2, r0, c_rarg3);
      __ cbnz(c_rarg2, error);
    }
#endif

    // Check if the oop is in the right area of memory
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
    __ andr(c_rarg2, r0, c_rarg3);
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());

    // Compare c_rarg2 and c_rarg3.  We don't use a compare
    // instruction here because the flags register is live.
    __ eor(c_rarg2, c_rarg2, c_rarg3);
    __ cbnz(c_rarg2, error);

    // make sure klass is 'reasonable', i.e. not zero.
    __ load_klass(r0, r0); // get klass
    __ cbz(r0, error);     // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blrt(rscratch1, 3, 0, 1);

    return start;
  }

  void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }

  // The inner part of zero_words().  This is the bulk operation,
  // zeroing words in blocks, possibly using DC ZVA to do it.  The
  // caller is responsible for zeroing the last few words.
  //
  // Inputs:
  // r10: the HeapWord-aligned base address of an array to zero.
  // r11: the count in HeapWords, r11 > 0.
  //
  // Returns r10 and r11, adjusted for the caller to clear.
  // r10: the base address of the tail of words left to clear.
  // r11: the number of words in the tail.
  //      r11 < MacroAssembler::zero_words_block_size.

  address generate_zero_blocks() {
    Label done;
    Label base_aligned;

    Register base = r10, cnt = r11;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "zero_blocks");
    address start = __ pc();

    if (UseBlockZeroing) {
      int zva_length = VM_Version::zva_length();

      // Ensure ZVA length can be divided by 16. This is required by
      // the subsequent operations.
      assert (zva_length % 16 == 0, "Unexpected ZVA Length");

      __ tbz(base, 3, base_aligned);
      __ str(zr, Address(__ post(base, 8)));
      __ sub(cnt, cnt, 1);
      __ bind(base_aligned);

      // Ensure count >= zva_length * 2 so that it still deserves a zva after
      // alignment.
      Label small;
      int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
      __ subs(rscratch1, cnt, low_limit >> 3);
      __ br(Assembler::LT, small);
      __ zero_dcache_blocks(base, cnt);
      __ bind(small);
    }

    {
      // Number of stp instructions we'll unroll
      const int unroll =
        MacroAssembler::zero_words_block_size / 2;
      // Clear the remaining blocks.
      Label loop;
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::LT, done);
      __ bind(loop);
      for (int i = 0; i < unroll; i++)
        __ stp(zr, zr, __ post(base, 16));
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::GE, loop);
      __ bind(done);
      __ add(cnt, cnt, unroll * 2);
    }

    __ ret(lr);

    return start;
  }


  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(Label &start, Register s, Register d, Register count,
                           copy_direction direction) {
    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;

    int offset;
    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r10, t6 = r11, t7 = r12;
    const Register stride = r13;

    assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1);

    Label again, drain;
    const char *stub_name;
    if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
    else
      stub_name = "backward_copy_longs";

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", stub_name);

    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, (u1)8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }
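
    // n.b. the main loop below is software pipelined: each pass stores
    // the eight registers filled on the previous pass and immediately
    // refills them for the next one; the final batch left in registers
    // when count runs out is flushed by the drain code after the loop.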

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
        __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
      } else {
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        __ stp(t0, t1, Address(d, 2 * unit));
        __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
      //
      // when backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.

      if (direction == copy_forwards) {
        __ sub(s, s, 16);
        __ sub(d, d, 8);
      }

      // Fill 8 registers
      //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 bit block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
      //
      // t0 at offset 0,  t1 at offset 8
      // t2 at offset 16, t3 at offset 24
      // t4 at offset 32, t5 at offset 40
      // t6 at offset 48, t7 at offset 56

      // for backwards copy s was not offset so the register contents
      // are at these offsets into the preceding 64 byte block
      // relative to that original input and so on for each successive
      // preceding 64 byte block when s is updated. this explains the
      // slightly counter-intuitive looking pattern of register usage
      // in the stp instructions for backwards copy.
      //
      // t0 at offset -16, t1 at offset -8
      // t2 at offset -32, t3 at offset -24
      // t4 at offset -48, t5 at offset -40
      // t6 at offset -64, t7 at offset -56

      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));

      __ subs(count, count, 16);
      __ br(Assembler::LO, drain);

      int prefetch = PrefetchCopyIntervalInBytes;
      bool use_stride = false;
      if (direction == copy_backwards) {
        use_stride = prefetch > 256;
        prefetch = -prefetch;
        if (use_stride) __ mov(stride, prefetch);
      }

      __ bind(again);

      if (PrefetchCopyIntervalInBytes > 0)
        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
        // allowing for the offset of -8 the store instructions place
        // registers into the target 64 bit block at the following
        // offsets
        //
        // t0 at offset 0
        // t1 at offset 8,  t2 at offset 16
        // t3 at offset 24, t4 at offset 32
        // t5 at offset 40, t6 at offset 48
        // t7 at offset 56

        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
        // d was not offset when we started so the registers are
        // written into the 64 bit block preceding d with the following
        // offsets
        //
        // t1 at offset -8
        // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
        // t7 at offset -56, t4 at offset -48
        // t6 at offset -64
        //
        // note that this matches the offsets previously noted for the
        // loads

        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
      } else {
        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
      }
      // now we need to copy any remaining part block which may
      // include a 4 word block subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ stp(t1, t2, Address(d, 2 * unit));
          __ str(t3, Address(__ pre(d, 4 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ stp(t3, t0, Address(d, 3 * unit));
          __ str(t2, Address(__ pre(d, 4 * unit)));
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ str(t1, Address(__ pre(d, 2 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ str(t0, Address(__ pre(d, 2 * unit)));
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;
    int unit = wordSize * direction;

    Label Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.
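
    // n.b. each tbz below tests the bit of count that corresponds to
    // one word, int, short or byte at the current granularity, and
    // copies that chunk only if the bit is set.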

    __ tbz(count, 3 - exact_log2(granularity), Lword);
    __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
    __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }

  Label copy_f, copy_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //

  void copy_memory(bool is_aligned, Register s, Register d,
                   Register count, Register tmp, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    // <= 96 bytes do inline. Direction doesn't matter because we always
    // load all the data before writing anything
    Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
    const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
    const Register send = r17, dend = r18;

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
    __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16/granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64/granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32/granularity));
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 0));
      __ ldpq(v2, v3, Address(send, -32));
      __ stpq(v0, v1, Address(d, 0));
      __ stpq(v2, v3, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(send, -32));
      __ ldp(t6, t7, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(dend, -32));
      __ stp(t6, t7, Address(dend, -16));
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    __ ldp(t0, t1, Address(s, 0));
    __ ldp(t2, t3, Address(send, -16));
    __ stp(t0, t1, Address(d, 0));
    __ stp(t2, t3, Address(dend, -16));
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
      __ ldpq(v4, v5, Address(send, -32));
      __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
      __ stpq(v4, v5, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(s, 32));
      __ ldp(t6, t7, Address(s, 48));
      __ ldp(t8, t9, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(d, 32));
      __ stp(t6, t7, Address(d, 48));
      __ stp(t8, t9, Address(dend, -16));
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, u1(8/granularity));
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    __ ldr(t0, Address(s, 0));
    __ ldr(t1, Address(send, -8));
    __ str(t0, Address(d, 0));
    __ str(t1, Address(dend, -8));
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean in the 1 byte case we load/store the same
          // byte 3 times.
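          // e.g. with count == 3 this copies s[0]->d[0], s[2]->d[2]
          // and s[1]->d[1]; with count == 1 all three transfers hit
          // the same single byte.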
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }

    __ bind(copy_big);
    if (is_backwards) {
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now we've got the small case out of the way we can align the
    // source address on a 2-word boundary.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
      __ tbz(s, exact_log2(wordSize), aligned);
      __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
      __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
      __ sub(count, count, wordSize/granularity);
    } else {
      if (is_backwards) {
        __ andr(rscratch2, s, 2 * wordSize - 1);
      } else {
        __ neg(rscratch2, s);
        __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
      }
      // rscratch2 is the byte adjustment needed to align s.
      __ cbz(rscratch2, aligned);
      int shift = exact_log2(granularity);
      if (shift)  __ lsr(rscratch2, rscratch2, shift);
      __ sub(count, count, rscratch2);

#if 0
      // ?? This code is only correct for a disjoint copy.  It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, rscratch2);
        __ sub(d, d, rscratch2);
      } else {
        __ add(s, s, rscratch2);
        __ add(d, d, rscratch2);
      }
#else
      copy_memory_small(s, d, rscratch2, rscratch1, step);
#endif
    }

    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes.  Adjust the
    // count and do a bulk copy of words.
    __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
    if (direction == copy_forwards)
      __ bl(copy_f);
    else
      __ bl(copy_b);

    // And the tail.
    copy_memory_small(s, d, count, tmp, step);

    if (granularity >= 8) __ bind(copy8);
    if (granularity >= 4) __ bind(copy4);
    __ bind(finish);
  }

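  // n.b. fills r3..r18 with the 64-bit pattern 0xdeadbeefdeadbeef
  // (debug builds only) so that any stale register contents are easy
  // to spot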
  void clobber_registers() {
#ifdef ASSERT
    __ mov(rscratch1, (uint64_t)0xdeadbeef);
    __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
    for (Register r = r3; r <= r18; r++)
      if (r != rscratch1) __ mov(r, rscratch1);
#endif
  }

  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers rscratch1 and rscratch2.
  void verify_oop_array (size_t size, Register a, Register count, Register temp) {
    Label loop, end;
    __ mov(rscratch1, a);
    __ mov(rscratch2, zr);
    __ bind(loop);
    __ cmp(rscratch2, count);
    __ br(Assembler::HS, end);
    if (size == (size_t)wordSize) {
      __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ verify_oop(temp);
    } else {
      __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ decode_heap_oop(temp); // calls verify_oop
    }
    __ add(rscratch2, rscratch2, size);
    __ b(loop);
    __ bind(end);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
                                 const char *name, bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_reg = RegSet::of(s, d, count);
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    copy_memory(aligned, s, d, count, rscratch1, size);

    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
    }

    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());

    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
                                 address *entry, const char *name,
                                 bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_regs = RegSet::of(s, d, count);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // use fwd copy when (d-s) above_equal (count*size)
    __ sub(rscratch1, d, s);
    __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
    __ br(Assembler::HS, nooverlap_target);

    DecoratorSet decorators = IN_HEAP | IS_ARRAY;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    copy_memory(aligned, s, d, count, rscratch1, -size);
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
    }
    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }

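  // n.b. the byte/short/int/long/oop entry points that follow are thin
  // wrappers which instantiate generate_disjoint_copy or
  // generate_conjoint_copy with the appropriate element size
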
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
  //
  address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                      address* entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_short_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_short_copy().
  //
  address generate_disjoint_short_copy(bool aligned,
                                       address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
                                       address *entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_int_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
                                     address *entry, const char *name,
                                     bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
  }


  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_long_copy(bool aligned, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_long_copy(bool aligned,
                                      address nooverlap_target, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_oop_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
sizeof (jint) : sizeof (jlong); 1679 return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized); 1680 } 1681 1682 // Arguments: 1683 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1684 // ignored 1685 // name - stub name string 1686 // 1687 // Inputs: 1688 // c_rarg0 - source array address 1689 // c_rarg1 - destination array address 1690 // c_rarg2 - element count, treated as size_t, can be zero 1691 // 1692 address generate_conjoint_oop_copy(bool aligned, 1693 address nooverlap_target, address *entry, 1694 const char *name, bool dest_uninitialized) { 1695 const bool is_oop = true; 1696 const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1697 return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry, 1698 name, dest_uninitialized); 1699 } 1700 1701 1702 // Helper for generating a dynamic type check. 1703 // Smashes rscratch1, rscratch2. 1704 void generate_type_check(Register sub_klass, 1705 Register super_check_offset, 1706 Register super_klass, 1707 Label& L_success) { 1708 assert_different_registers(sub_klass, super_check_offset, super_klass); 1709 1710 BLOCK_COMMENT("type_check:"); 1711 1712 Label L_miss; 1713 1714 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL, 1715 super_check_offset); 1716 __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL); 1717 1718 // Fall through on failure! 1719 __ BIND(L_miss); 1720 } 1721 1722 // 1723 // Generate checkcasting array copy stub 1724 // 1725 // Input: 1726 // c_rarg0 - source array address 1727 // c_rarg1 - destination array address 1728 // c_rarg2 - element count, treated as ssize_t, can be zero 1729 // c_rarg3 - size_t ckoff (super_check_offset) 1730 // c_rarg4 - oop ckval (super_klass) 1731 // 1732 // Output: 1733 // r0 == 0 - success 1734 // r0 == -1^K - failure, where K is partial transfer count 1735 // 1736 address generate_checkcast_copy(const char *name, address *entry, 1737 bool dest_uninitialized = false) { 1738 1739 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop; 1740 1741 // Input registers (after setup_arg_regs) 1742 const Register from = c_rarg0; // source array address 1743 const Register to = c_rarg1; // destination array address 1744 const Register count = c_rarg2; // elementscount 1745 const Register ckoff = c_rarg3; // super_check_offset 1746 const Register ckval = c_rarg4; // super_klass 1747 1748 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4); 1749 RegSet wb_post_saved_regs = RegSet::of(count); 1750 1751 // Registers used as temps (r18, r19, r20 are save-on-entry) 1752 const Register count_save = r21; // orig elementscount 1753 const Register start_to = r20; // destination array start address 1754 const Register copied_oop = r18; // actual oop copied 1755 const Register r19_klass = r19; // oop._klass 1756 1757 //--------------------------------------------------------------- 1758 // Assembler stub will be used for this call to arraycopy 1759 // if the two arrays are subtypes of Object[] but the 1760 // destination array type is not equal to or a supertype 1761 // of the source type. Each element must be separately 1762 // checked. 
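    //
    // A rough C-level sketch of what the generated stub computes
    // (illustrative only; is_subtype_of stands in for the fast/slow
    // subtype checks emitted by generate_type_check()):
    //
    //   ssize_t checkcast_copy(oop* from, oop* to, size_t count,
    //                          int ckoff, Klass* ckval) {
    //     for (size_t K = 0; K < count; K++) {
    //       oop o = from[K];                      // load_heap_oop
    //       if (o != NULL && !is_subtype_of(o->klass(), ckval, ckoff))
    //         return ~(ssize_t)K;                 // K elements were copied
    //       to[K] = o;                            // store_heap_oop
    //     }
    //     return 0;                               // success
    //   }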
1763 1764 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1765 copied_oop, r19_klass, count_save); 1766 1767 __ align(CodeEntryAlignment); 1768 StubCodeMark mark(this, "StubRoutines", name); 1769 address start = __ pc(); 1770 1771 __ enter(); // required for proper stackwalking of RuntimeStub frame 1772 1773 #ifdef ASSERT 1774 // caller guarantees that the arrays really are different 1775 // otherwise, we would have to make conjoint checks 1776 { Label L; 1777 array_overlap_test(L, TIMES_OOP); 1778 __ stop("checkcast_copy within a single array"); 1779 __ bind(L); 1780 } 1781 #endif //ASSERT 1782 1783 // Caller of this entry point must set up the argument registers. 1784 if (entry != NULL) { 1785 *entry = __ pc(); 1786 BLOCK_COMMENT("Entry:"); 1787 } 1788 1789 // Empty array: Nothing to do. 1790 __ cbz(count, L_done); 1791 1792 __ push(RegSet::of(r18, r19, r20, r21), sp); 1793 1794 #ifdef ASSERT 1795 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1796 // The ckoff and ckval must be mutually consistent, 1797 // even though caller generates both. 1798 { Label L; 1799 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1800 __ ldrw(start_to, Address(ckval, sco_offset)); 1801 __ cmpw(ckoff, start_to); 1802 __ br(Assembler::EQ, L); 1803 __ stop("super_check_offset inconsistent"); 1804 __ bind(L); 1805 } 1806 #endif //ASSERT 1807 1808 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST; 1809 bool is_oop = true; 1810 if (dest_uninitialized) { 1811 decorators |= IS_DEST_UNINITIALIZED; 1812 } 1813 1814 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1815 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs); 1816 1817 // save the original count 1818 __ mov(count_save, count); 1819 1820 // Copy from low to high addresses 1821 __ mov(start_to, to); // Save destination array start address 1822 __ b(L_load_element); 1823 1824 // ======== begin loop ======== 1825 // (Loop is rotated; its entry is L_load_element.) 1826 // Loop control: 1827 // for (; count != 0; count--) { 1828 // copied_oop = load_heap_oop(from++); 1829 // ... generate_type_check ...; 1830 // store_heap_oop(to++, copied_oop); 1831 // } 1832 __ align(OptoLoopAlignment); 1833 1834 __ BIND(L_store_element); 1835 __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW); // store the oop 1836 __ sub(count, count, 1); 1837 __ cbz(count, L_do_card_marks); 1838 1839 // ======== loop entry is here ======== 1840 __ BIND(L_load_element); 1841 __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop 1842 __ cbz(copied_oop, L_store_element); 1843 1844 __ load_klass(r19_klass, copied_oop);// query the object klass 1845 generate_type_check(r19_klass, ckoff, ckval, L_store_element); 1846 // ======== end loop ======== 1847 1848 // It was a real error; we must depend on the caller to finish the job. 1849 // Register count = remaining oops, count_orig = total oops. 1850 // Emit GC store barriers for the oops we have copied and report 1851 // their number to the caller. 
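    // The subs/eon pair below computes r0 = -1 ^ K = ~K. Worked example
    // (illustrative): count_save = 10 total oops, count = 7 still
    // remaining  =>  K = 10 - 7 = 3 oops copied, and the stub reports
    // ~3 = -4; the caller recovers K as ~r0.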
1852 1853 __ subs(count, count_save, count); // K = partially copied oop count 1854 __ eon(count, count, zr); // report (-1^K) to caller 1855 __ br(Assembler::EQ, L_done_pop); 1856 1857 __ BIND(L_do_card_marks); 1858 __ add(to, to, -heapOopSize); // make an inclusive end pointer 1859 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, to, rscratch1, wb_post_saved_regs); 1860 1861 __ bind(L_done_pop); 1862 __ pop(RegSet::of(r18, r19, r20, r21), sp); 1863 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 1864 1865 __ bind(L_done); 1866 __ mov(r0, count); 1867 __ leave(); 1868 __ ret(lr); 1869 1870 return start; 1871 } 1872 1873 // Perform range checks on the proposed arraycopy. 1874 // Kills temp, but nothing else. 1875 // Also, clean the sign bits of src_pos and dst_pos. 1876 void arraycopy_range_checks(Register src, // source array oop (c_rarg0) 1877 Register src_pos, // source position (c_rarg1) 1878 Register dst, // destination array oo (c_rarg2) 1879 Register dst_pos, // destination position (c_rarg3) 1880 Register length, 1881 Register temp, 1882 Label& L_failed) { 1883 BLOCK_COMMENT("arraycopy_range_checks:"); 1884 1885 assert_different_registers(rscratch1, temp); 1886 1887 // if (src_pos + length > arrayOop(src)->length()) FAIL; 1888 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes())); 1889 __ addw(temp, length, src_pos); 1890 __ cmpw(temp, rscratch1); 1891 __ br(Assembler::HI, L_failed); 1892 1893 // if (dst_pos + length > arrayOop(dst)->length()) FAIL; 1894 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes())); 1895 __ addw(temp, length, dst_pos); 1896 __ cmpw(temp, rscratch1); 1897 __ br(Assembler::HI, L_failed); 1898 1899 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'. 1900 __ movw(src_pos, src_pos); 1901 __ movw(dst_pos, dst_pos); 1902 1903 BLOCK_COMMENT("arraycopy_range_checks done"); 1904 } 1905 1906 // These stubs get called from some dumb test routine. 1907 // I'll write them properly when they're called from 1908 // something that's actually doing something. 1909 static void fake_arraycopy_stub(address src, address dst, int count) { 1910 assert(count == 0, "huh?"); 1911 } 1912 1913 1914 // 1915 // Generate 'unsafe' array copy stub 1916 // Though just as safe as the other stubs, it takes an unscaled 1917 // size_t argument instead of an element count. 1918 // 1919 // Input: 1920 // c_rarg0 - source array address 1921 // c_rarg1 - destination array address 1922 // c_rarg2 - byte count, treated as ssize_t, can be zero 1923 // 1924 // Examines the alignment of the operands and dispatches 1925 // to a long, int, short, or byte copy loop. 
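  //
  // Roughly (an illustrative C sketch, not the generated code):
  //
  //   uintptr_t bits = (uintptr_t)s | (uintptr_t)d | count;
  //   if      ((bits & (BytesPerLong - 1)) == 0) goto long_copy;   // 8-byte aligned
  //   else if ((bits & (BytesPerInt  - 1)) == 0) goto int_copy;    // 4-byte aligned
  //   else if ((bits & 1) == 0)                  goto short_copy;  // 2-byte aligned
  //   else                                       goto byte_copy;
  //
  // with the byte count shifted down to an element count before each
  // tail call.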
1926 // 1927 address generate_unsafe_copy(const char *name, 1928 address byte_copy_entry, 1929 address short_copy_entry, 1930 address int_copy_entry, 1931 address long_copy_entry) { 1932 Label L_long_aligned, L_int_aligned, L_short_aligned; 1933 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1934 1935 __ align(CodeEntryAlignment); 1936 StubCodeMark mark(this, "StubRoutines", name); 1937 address start = __ pc(); 1938 __ enter(); // required for proper stackwalking of RuntimeStub frame 1939 1940 // bump this on entry, not on exit: 1941 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 1942 1943 __ orr(rscratch1, s, d); 1944 __ orr(rscratch1, rscratch1, count); 1945 1946 __ andr(rscratch1, rscratch1, BytesPerLong-1); 1947 __ cbz(rscratch1, L_long_aligned); 1948 __ andr(rscratch1, rscratch1, BytesPerInt-1); 1949 __ cbz(rscratch1, L_int_aligned); 1950 __ tbz(rscratch1, 0, L_short_aligned); 1951 __ b(RuntimeAddress(byte_copy_entry)); 1952 1953 __ BIND(L_short_aligned); 1954 __ lsr(count, count, LogBytesPerShort); // size => short_count 1955 __ b(RuntimeAddress(short_copy_entry)); 1956 __ BIND(L_int_aligned); 1957 __ lsr(count, count, LogBytesPerInt); // size => int_count 1958 __ b(RuntimeAddress(int_copy_entry)); 1959 __ BIND(L_long_aligned); 1960 __ lsr(count, count, LogBytesPerLong); // size => long_count 1961 __ b(RuntimeAddress(long_copy_entry)); 1962 1963 return start; 1964 } 1965 1966 // 1967 // Generate generic array copy stubs 1968 // 1969 // Input: 1970 // c_rarg0 - src oop 1971 // c_rarg1 - src_pos (32-bits) 1972 // c_rarg2 - dst oop 1973 // c_rarg3 - dst_pos (32-bits) 1974 // c_rarg4 - element count (32-bits) 1975 // 1976 // Output: 1977 // r0 == 0 - success 1978 // r0 == -1^K - failure, where K is partial transfer count 1979 // 1980 address generate_generic_copy(const char *name, 1981 address byte_copy_entry, address short_copy_entry, 1982 address int_copy_entry, address oop_copy_entry, 1983 address long_copy_entry, address checkcast_copy_entry) { 1984 1985 Label L_failed, L_objArray; 1986 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 1987 1988 // Input registers 1989 const Register src = c_rarg0; // source array oop 1990 const Register src_pos = c_rarg1; // source position 1991 const Register dst = c_rarg2; // destination array oop 1992 const Register dst_pos = c_rarg3; // destination position 1993 const Register length = c_rarg4; 1994 1995 1996 // Registers used as temps 1997 const Register dst_klass = c_rarg5; 1998 1999 __ align(CodeEntryAlignment); 2000 2001 StubCodeMark mark(this, "StubRoutines", name); 2002 2003 address start = __ pc(); 2004 2005 __ enter(); // required for proper stackwalking of RuntimeStub frame 2006 2007 // bump this on entry, not on exit: 2008 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 2009 2010 //----------------------------------------------------------------------- 2011 // Assembler stub will be used for this call to arraycopy 2012 // if the following conditions are met: 2013 // 2014 // (1) src and dst must not be null. 2015 // (2) src_pos must not be negative. 2016 // (3) dst_pos must not be negative. 2017 // (4) length must not be negative. 2018 // (5) src klass and dst klass should be the same and not NULL. 2019 // (6) src and dst should be arrays. 2020 // (7) src_pos + length must not exceed length of src. 2021 // (8) dst_pos + length must not exceed length of dst. 
2022 // 2023 2024 // if (src == NULL) return -1; 2025 __ cbz(src, L_failed); 2026 2027 // if (src_pos < 0) return -1; 2028 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 2029 2030 // if (dst == NULL) return -1; 2031 __ cbz(dst, L_failed); 2032 2033 // if (dst_pos < 0) return -1; 2034 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2035 2036 // registers used as temp 2037 const Register scratch_length = r16; // elements count to copy 2038 const Register scratch_src_klass = r17; // array klass 2039 const Register lh = r18; // layout helper 2040 2041 // if (length < 0) return -1; 2042 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2043 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2044 2045 __ load_klass(scratch_src_klass, src); 2046 #ifdef ASSERT 2047 // assert(src->klass() != NULL); 2048 { 2049 BLOCK_COMMENT("assert klasses not null {"); 2050 Label L1, L2; 2051 __ cbnz(scratch_src_klass, L2); // it is broken if klass is NULL 2052 __ bind(L1); 2053 __ stop("broken null klass"); 2054 __ bind(L2); 2055 __ load_klass(rscratch1, dst); 2056 __ cbz(rscratch1, L1); // this would be broken also 2057 BLOCK_COMMENT("} assert klasses not null done"); 2058 } 2059 #endif 2060 2061 // Load layout helper (32-bits) 2062 // 2063 // |array_tag| | header_size | element_type | |log2_element_size| 2064 // 32 30 24 16 8 2 0 2065 // 2066 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2067 // 2068 2069 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2070 2071 // Handle objArrays completely differently... 2072 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2073 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2074 __ movw(rscratch1, objArray_lh); 2075 __ eorw(rscratch2, lh, rscratch1); 2076 __ cbzw(rscratch2, L_objArray); 2077 2078 // if (src->klass() != dst->klass()) return -1; 2079 __ load_klass(rscratch2, dst); 2080 __ eor(rscratch2, rscratch2, scratch_src_klass); 2081 __ cbnz(rscratch2, L_failed); 2082 2083 // if (!src->is_Array()) return -1; 2084 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2085 2086 // At this point, it is known to be a typeArray (array_tag 0x3). 
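    // Worked example (illustrative): for an int[] the layout helper has
    // array_tag == 0x3 (typeArray) and log2_element_size == 2, so an
    // element address is formed as
    //
    //   addr = array_base + header_size_in_bytes + (index << 2);
    //
    // The assert and the bit-field extracts below pull header_size and
    // log2_element_size out of lh before choosing the copy loop.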
2087 #ifdef ASSERT 2088 { 2089 BLOCK_COMMENT("assert primitive array {"); 2090 Label L; 2091 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2092 __ cmpw(lh, rscratch2); 2093 __ br(Assembler::GE, L); 2094 __ stop("must be a primitive array"); 2095 __ bind(L); 2096 BLOCK_COMMENT("} assert primitive array done"); 2097 } 2098 #endif 2099 2100 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2101 rscratch2, L_failed); 2102 2103 // TypeArrayKlass 2104 // 2105 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2106 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2107 // 2108 2109 const Register rscratch1_offset = rscratch1; // array offset 2110 const Register r18_elsize = lh; // element size 2111 2112 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2113 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2114 __ add(src, src, rscratch1_offset); // src array offset 2115 __ add(dst, dst, rscratch1_offset); // dst array offset 2116 BLOCK_COMMENT("choose copy loop based on element size"); 2117 2118 // next registers should be set before the jump to corresponding stub 2119 const Register from = c_rarg0; // source array address 2120 const Register to = c_rarg1; // destination array address 2121 const Register count = c_rarg2; // elements count 2122 2123 // 'from', 'to', 'count' registers should be set in such order 2124 // since they are the same as 'src', 'src_pos', 'dst'. 2125 2126 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2127 2128 // The possible values of elsize are 0-3, i.e. exact_log2(element 2129 // size in bytes). We do a simple bitwise binary search. 2130 __ BIND(L_copy_bytes); 2131 __ tbnz(r18_elsize, 1, L_copy_ints); 2132 __ tbnz(r18_elsize, 0, L_copy_shorts); 2133 __ lea(from, Address(src, src_pos));// src_addr 2134 __ lea(to, Address(dst, dst_pos));// dst_addr 2135 __ movw(count, scratch_length); // length 2136 __ b(RuntimeAddress(byte_copy_entry)); 2137 2138 __ BIND(L_copy_shorts); 2139 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2140 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2141 __ movw(count, scratch_length); // length 2142 __ b(RuntimeAddress(short_copy_entry)); 2143 2144 __ BIND(L_copy_ints); 2145 __ tbnz(r18_elsize, 0, L_copy_longs); 2146 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2147 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2148 __ movw(count, scratch_length); // length 2149 __ b(RuntimeAddress(int_copy_entry)); 2150 2151 __ BIND(L_copy_longs); 2152 #ifdef ASSERT 2153 { 2154 BLOCK_COMMENT("assert long copy {"); 2155 Label L; 2156 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize 2157 __ cmpw(r18_elsize, LogBytesPerLong); 2158 __ br(Assembler::EQ, L); 2159 __ stop("must be long copy, but elsize is wrong"); 2160 __ bind(L); 2161 BLOCK_COMMENT("} assert long copy done"); 2162 } 2163 #endif 2164 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2165 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2166 __ movw(count, scratch_length); // length 2167 __ b(RuntimeAddress(long_copy_entry)); 2168 2169 // ObjArrayKlass 2170 __ BIND(L_objArray); 2171 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2172 2173 Label L_plain_copy, L_checkcast_copy; 2174 // test array classes for subtyping 2175 __ load_klass(r18, dst); 2176 __ cmp(scratch_src_klass, r18); // usual case is exact 
equality 2177 __ br(Assembler::NE, L_checkcast_copy); 2178 2179 // Identically typed arrays can be copied without element-wise checks. 2180 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2181 rscratch2, L_failed); 2182 2183 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2184 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2185 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2186 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2187 __ movw(count, scratch_length); // length 2188 __ BIND(L_plain_copy); 2189 __ b(RuntimeAddress(oop_copy_entry)); 2190 2191 __ BIND(L_checkcast_copy); 2192 // live at this point: scratch_src_klass, scratch_length, r18 (dst_klass) 2193 { 2194 // Before looking at dst.length, make sure dst is also an objArray. 2195 __ ldrw(rscratch1, Address(r18, lh_offset)); 2196 __ movw(rscratch2, objArray_lh); 2197 __ eorw(rscratch1, rscratch1, rscratch2); 2198 __ cbnzw(rscratch1, L_failed); 2199 2200 // It is safe to examine both src.length and dst.length. 2201 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2202 r18, L_failed); 2203 2204 __ load_klass(dst_klass, dst); // reload 2205 2206 // Marshal the base address arguments now, freeing registers. 2207 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2208 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2209 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2210 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2211 __ movw(count, length); // length (reloaded) 2212 Register sco_temp = c_rarg3; // this register is free now 2213 assert_different_registers(from, to, count, sco_temp, 2214 dst_klass, scratch_src_klass); 2215 // assert_clean_int(count, sco_temp); 2216 2217 // Generate the type check. 2218 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2219 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2220 2221 // Smashes rscratch1, rscratch2 2222 generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy); 2223 2224 // Fetch destination element klass from the ObjArrayKlass header. 2225 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2226 __ ldr(dst_klass, Address(dst_klass, ek_offset)); 2227 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2228 2229 // the checkcast_copy loop needs two extra arguments: 2230 assert(c_rarg3 == sco_temp, "#3 already in place"); 2231 // Set up arguments for checkcast_copy_entry. 2232 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass 2233 __ b(RuntimeAddress(checkcast_copy_entry)); 2234 } 2235 2236 __ BIND(L_failed); 2237 __ mov(r0, -1); 2238 __ leave(); // required for proper stackwalking of RuntimeStub frame 2239 __ ret(lr); 2240 2241 return start; 2242 } 2243 2244 // 2245 // Generate stub for array fill. If "aligned" is true, the 2246 // "to" address is assumed to be heapword aligned. 
2247 // 2248 // Arguments for generated stub: 2249 // to: c_rarg0 2250 // value: c_rarg1 2251 // count: c_rarg2 treated as signed 2252 // 2253 address generate_fill(BasicType t, bool aligned, const char *name) { 2254 __ align(CodeEntryAlignment); 2255 StubCodeMark mark(this, "StubRoutines", name); 2256 address start = __ pc(); 2257 2258 BLOCK_COMMENT("Entry:"); 2259 2260 const Register to = c_rarg0; // source array address 2261 const Register value = c_rarg1; // value 2262 const Register count = c_rarg2; // elements count 2263 2264 const Register bz_base = r10; // base for block_zero routine 2265 const Register cnt_words = r11; // temp register 2266 2267 __ enter(); 2268 2269 Label L_fill_elements, L_exit1; 2270 2271 int shift = -1; 2272 switch (t) { 2273 case T_BYTE: 2274 shift = 0; 2275 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2276 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2277 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2278 __ br(Assembler::LO, L_fill_elements); 2279 break; 2280 case T_SHORT: 2281 shift = 1; 2282 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2283 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2284 __ br(Assembler::LO, L_fill_elements); 2285 break; 2286 case T_INT: 2287 shift = 2; 2288 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2289 __ br(Assembler::LO, L_fill_elements); 2290 break; 2291 default: ShouldNotReachHere(); 2292 } 2293 2294 // Align source address at 8 bytes address boundary. 2295 Label L_skip_align1, L_skip_align2, L_skip_align4; 2296 if (!aligned) { 2297 switch (t) { 2298 case T_BYTE: 2299 // One byte misalignment happens only for byte arrays. 2300 __ tbz(to, 0, L_skip_align1); 2301 __ strb(value, Address(__ post(to, 1))); 2302 __ subw(count, count, 1); 2303 __ bind(L_skip_align1); 2304 // Fallthrough 2305 case T_SHORT: 2306 // Two bytes misalignment happens only for byte and short (char) arrays. 2307 __ tbz(to, 1, L_skip_align2); 2308 __ strh(value, Address(__ post(to, 2))); 2309 __ subw(count, count, 2 >> shift); 2310 __ bind(L_skip_align2); 2311 // Fallthrough 2312 case T_INT: 2313 // Align to 8 bytes, we know we are 4 byte aligned to start. 2314 __ tbz(to, 2, L_skip_align4); 2315 __ strw(value, Address(__ post(to, 4))); 2316 __ subw(count, count, 4 >> shift); 2317 __ bind(L_skip_align4); 2318 break; 2319 default: ShouldNotReachHere(); 2320 } 2321 } 2322 2323 // 2324 // Fill large chunks 2325 // 2326 __ lsrw(cnt_words, count, 3 - shift); // number of words 2327 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2328 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2329 if (UseBlockZeroing) { 2330 Label non_block_zeroing, rest; 2331 // If the fill value is zero we can use the fast zero_words(). 2332 __ cbnz(value, non_block_zeroing); 2333 __ mov(bz_base, to); 2334 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2335 __ zero_words(bz_base, cnt_words); 2336 __ b(rest); 2337 __ bind(non_block_zeroing); 2338 __ fill_words(to, cnt_words, value); 2339 __ bind(rest); 2340 } else { 2341 __ fill_words(to, cnt_words, value); 2342 } 2343 2344 // Remaining count is less than 8 bytes. Fill it by a single store. 2345 // Note that the total length is no less than 8 bytes. 
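    // Worked example (illustrative): filling a short[] with 3 elements
    // (6 bytes) left over. 'to' is advanced to the end of the array and
    // the full 8-byte pattern is stored at (end - 8), rewriting the last
    // two bytes of the already-filled region with the same value, which
    // is harmless.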
2346 if (t == T_BYTE || t == T_SHORT) { 2347 Label L_exit1; 2348 __ cbzw(count, L_exit1); 2349 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2350 __ str(value, Address(to, -8)); // overwrite some elements 2351 __ bind(L_exit1); 2352 __ leave(); 2353 __ ret(lr); 2354 } 2355 2356 // Handle copies less than 8 bytes. 2357 Label L_fill_2, L_fill_4, L_exit2; 2358 __ bind(L_fill_elements); 2359 switch (t) { 2360 case T_BYTE: 2361 __ tbz(count, 0, L_fill_2); 2362 __ strb(value, Address(__ post(to, 1))); 2363 __ bind(L_fill_2); 2364 __ tbz(count, 1, L_fill_4); 2365 __ strh(value, Address(__ post(to, 2))); 2366 __ bind(L_fill_4); 2367 __ tbz(count, 2, L_exit2); 2368 __ strw(value, Address(to)); 2369 break; 2370 case T_SHORT: 2371 __ tbz(count, 0, L_fill_4); 2372 __ strh(value, Address(__ post(to, 2))); 2373 __ bind(L_fill_4); 2374 __ tbz(count, 1, L_exit2); 2375 __ strw(value, Address(to)); 2376 break; 2377 case T_INT: 2378 __ cbzw(count, L_exit2); 2379 __ strw(value, Address(to)); 2380 break; 2381 default: ShouldNotReachHere(); 2382 } 2383 __ bind(L_exit2); 2384 __ leave(); 2385 __ ret(lr); 2386 return start; 2387 } 2388 2389 void generate_arraycopy_stubs() { 2390 address entry; 2391 address entry_jbyte_arraycopy; 2392 address entry_jshort_arraycopy; 2393 address entry_jint_arraycopy; 2394 address entry_oop_arraycopy; 2395 address entry_jlong_arraycopy; 2396 address entry_checkcast_arraycopy; 2397 2398 generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards); 2399 generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards); 2400 2401 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2402 2403 //*** jbyte 2404 // Always need aligned and unaligned versions 2405 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2406 "jbyte_disjoint_arraycopy"); 2407 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2408 &entry_jbyte_arraycopy, 2409 "jbyte_arraycopy"); 2410 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2411 "arrayof_jbyte_disjoint_arraycopy"); 2412 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL, 2413 "arrayof_jbyte_arraycopy"); 2414 2415 //*** jshort 2416 // Always need aligned and unaligned versions 2417 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2418 "jshort_disjoint_arraycopy"); 2419 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2420 &entry_jshort_arraycopy, 2421 "jshort_arraycopy"); 2422 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, 2423 "arrayof_jshort_disjoint_arraycopy"); 2424 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL, 2425 "arrayof_jshort_arraycopy"); 2426 2427 //*** jint 2428 // Aligned versions 2429 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2430 "arrayof_jint_disjoint_arraycopy"); 2431 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2432 "arrayof_jint_arraycopy"); 2433 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 
2434 // entry_jint_arraycopy always points to the unaligned version 2435 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2436 "jint_disjoint_arraycopy"); 2437 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2438 &entry_jint_arraycopy, 2439 "jint_arraycopy"); 2440 2441 //*** jlong 2442 // It is always aligned 2443 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2444 "arrayof_jlong_disjoint_arraycopy"); 2445 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2446 "arrayof_jlong_arraycopy"); 2447 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2448 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2449 2450 //*** oops 2451 { 2452 // With compressed oops we need unaligned versions; notice that 2453 // we overwrite entry_oop_arraycopy. 2454 bool aligned = !UseCompressedOops; 2455 2456 StubRoutines::_arrayof_oop_disjoint_arraycopy 2457 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2458 /*dest_uninitialized*/false); 2459 StubRoutines::_arrayof_oop_arraycopy 2460 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2461 /*dest_uninitialized*/false); 2462 // Aligned versions without pre-barriers 2463 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2464 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2465 /*dest_uninitialized*/true); 2466 StubRoutines::_arrayof_oop_arraycopy_uninit 2467 = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit", 2468 /*dest_uninitialized*/true); 2469 } 2470 2471 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2472 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2473 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2474 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2475 2476 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2477 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, 2478 /*dest_uninitialized*/true); 2479 2480 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2481 entry_jbyte_arraycopy, 2482 entry_jshort_arraycopy, 2483 entry_jint_arraycopy, 2484 entry_jlong_arraycopy); 2485 2486 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2487 entry_jbyte_arraycopy, 2488 entry_jshort_arraycopy, 2489 entry_jint_arraycopy, 2490 entry_oop_arraycopy, 2491 entry_jlong_arraycopy, 2492 entry_checkcast_arraycopy); 2493 2494 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 2495 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2496 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2497 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2498 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 2499 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2500 } 2501 2502 void generate_math_stubs() { Unimplemented(); } 2503 2504 // Arguments: 2505 // 2506 // Inputs: 2507 // c_rarg0 - source byte array address 2508 // c_rarg1 - destination 
byte array address 2509 // c_rarg2 - K (key) in little endian int array 2510 // 2511 address generate_aescrypt_encryptBlock() { 2512 __ align(CodeEntryAlignment); 2513 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 2514 2515 Label L_doLast; 2516 2517 const Register from = c_rarg0; // source array address 2518 const Register to = c_rarg1; // destination array address 2519 const Register key = c_rarg2; // key array address 2520 const Register keylen = rscratch1; 2521 2522 address start = __ pc(); 2523 __ enter(); 2524 2525 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2526 2527 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2528 2529 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2530 __ rev32(v1, __ T16B, v1); 2531 __ rev32(v2, __ T16B, v2); 2532 __ rev32(v3, __ T16B, v3); 2533 __ rev32(v4, __ T16B, v4); 2534 __ aese(v0, v1); 2535 __ aesmc(v0, v0); 2536 __ aese(v0, v2); 2537 __ aesmc(v0, v0); 2538 __ aese(v0, v3); 2539 __ aesmc(v0, v0); 2540 __ aese(v0, v4); 2541 __ aesmc(v0, v0); 2542 2543 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2544 __ rev32(v1, __ T16B, v1); 2545 __ rev32(v2, __ T16B, v2); 2546 __ rev32(v3, __ T16B, v3); 2547 __ rev32(v4, __ T16B, v4); 2548 __ aese(v0, v1); 2549 __ aesmc(v0, v0); 2550 __ aese(v0, v2); 2551 __ aesmc(v0, v0); 2552 __ aese(v0, v3); 2553 __ aesmc(v0, v0); 2554 __ aese(v0, v4); 2555 __ aesmc(v0, v0); 2556 2557 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2558 __ rev32(v1, __ T16B, v1); 2559 __ rev32(v2, __ T16B, v2); 2560 2561 __ cmpw(keylen, 44); 2562 __ br(Assembler::EQ, L_doLast); 2563 2564 __ aese(v0, v1); 2565 __ aesmc(v0, v0); 2566 __ aese(v0, v2); 2567 __ aesmc(v0, v0); 2568 2569 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2570 __ rev32(v1, __ T16B, v1); 2571 __ rev32(v2, __ T16B, v2); 2572 2573 __ cmpw(keylen, 52); 2574 __ br(Assembler::EQ, L_doLast); 2575 2576 __ aese(v0, v1); 2577 __ aesmc(v0, v0); 2578 __ aese(v0, v2); 2579 __ aesmc(v0, v0); 2580 2581 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2582 __ rev32(v1, __ T16B, v1); 2583 __ rev32(v2, __ T16B, v2); 2584 2585 __ BIND(L_doLast); 2586 2587 __ aese(v0, v1); 2588 __ aesmc(v0, v0); 2589 __ aese(v0, v2); 2590 2591 __ ld1(v1, __ T16B, key); 2592 __ rev32(v1, __ T16B, v1); 2593 __ eor(v0, __ T16B, v0, v1); 2594 2595 __ st1(v0, __ T16B, to); 2596 2597 __ mov(r0, 0); 2598 2599 __ leave(); 2600 __ ret(lr); 2601 2602 return start; 2603 } 2604 2605 // Arguments: 2606 // 2607 // Inputs: 2608 // c_rarg0 - source byte array address 2609 // c_rarg1 - destination byte array address 2610 // c_rarg2 - K (key) in little endian int array 2611 // 2612 address generate_aescrypt_decryptBlock() { 2613 assert(UseAES, "need AES instructions and misaligned SSE support"); 2614 __ align(CodeEntryAlignment); 2615 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 2616 Label L_doLast; 2617 2618 const Register from = c_rarg0; // source array address 2619 const Register to = c_rarg1; // destination array address 2620 const Register key = c_rarg2; // key array address 2621 const Register keylen = rscratch1; 2622 2623 address start = __ pc(); 2624 __ enter(); // required for proper stackwalking of RuntimeStub frame 2625 2626 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2627 2628 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2629 2630 __ ld1(v5, __ T16B, __ post(key, 16)); 2631 __ rev32(v5, __ T16B, v5); 2632 2633 __ ld1(v1, v2, v3, v4, 
__ T16B, __ post(key, 64)); 2634 __ rev32(v1, __ T16B, v1); 2635 __ rev32(v2, __ T16B, v2); 2636 __ rev32(v3, __ T16B, v3); 2637 __ rev32(v4, __ T16B, v4); 2638 __ aesd(v0, v1); 2639 __ aesimc(v0, v0); 2640 __ aesd(v0, v2); 2641 __ aesimc(v0, v0); 2642 __ aesd(v0, v3); 2643 __ aesimc(v0, v0); 2644 __ aesd(v0, v4); 2645 __ aesimc(v0, v0); 2646 2647 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2648 __ rev32(v1, __ T16B, v1); 2649 __ rev32(v2, __ T16B, v2); 2650 __ rev32(v3, __ T16B, v3); 2651 __ rev32(v4, __ T16B, v4); 2652 __ aesd(v0, v1); 2653 __ aesimc(v0, v0); 2654 __ aesd(v0, v2); 2655 __ aesimc(v0, v0); 2656 __ aesd(v0, v3); 2657 __ aesimc(v0, v0); 2658 __ aesd(v0, v4); 2659 __ aesimc(v0, v0); 2660 2661 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2662 __ rev32(v1, __ T16B, v1); 2663 __ rev32(v2, __ T16B, v2); 2664 2665 __ cmpw(keylen, 44); 2666 __ br(Assembler::EQ, L_doLast); 2667 2668 __ aesd(v0, v1); 2669 __ aesimc(v0, v0); 2670 __ aesd(v0, v2); 2671 __ aesimc(v0, v0); 2672 2673 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2674 __ rev32(v1, __ T16B, v1); 2675 __ rev32(v2, __ T16B, v2); 2676 2677 __ cmpw(keylen, 52); 2678 __ br(Assembler::EQ, L_doLast); 2679 2680 __ aesd(v0, v1); 2681 __ aesimc(v0, v0); 2682 __ aesd(v0, v2); 2683 __ aesimc(v0, v0); 2684 2685 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2686 __ rev32(v1, __ T16B, v1); 2687 __ rev32(v2, __ T16B, v2); 2688 2689 __ BIND(L_doLast); 2690 2691 __ aesd(v0, v1); 2692 __ aesimc(v0, v0); 2693 __ aesd(v0, v2); 2694 2695 __ eor(v0, __ T16B, v0, v5); 2696 2697 __ st1(v0, __ T16B, to); 2698 2699 __ mov(r0, 0); 2700 2701 __ leave(); 2702 __ ret(lr); 2703 2704 return start; 2705 } 2706 2707 // Arguments: 2708 // 2709 // Inputs: 2710 // c_rarg0 - source byte array address 2711 // c_rarg1 - destination byte array address 2712 // c_rarg2 - K (key) in little endian int array 2713 // c_rarg3 - r vector byte array address 2714 // c_rarg4 - input length 2715 // 2716 // Output: 2717 // x0 - input length 2718 // 2719 address generate_cipherBlockChaining_encryptAESCrypt() { 2720 assert(UseAES, "need AES instructions and misaligned SSE support"); 2721 __ align(CodeEntryAlignment); 2722 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 2723 2724 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2725 2726 const Register from = c_rarg0; // source array address 2727 const Register to = c_rarg1; // destination array address 2728 const Register key = c_rarg2; // key array address 2729 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2730 // and left with the results of the last encryption block 2731 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2732 const Register keylen = rscratch1; 2733 2734 address start = __ pc(); 2735 2736 __ enter(); 2737 2738 __ movw(rscratch2, len_reg); 2739 2740 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2741 2742 __ ld1(v0, __ T16B, rvec); 2743 2744 __ cmpw(keylen, 52); 2745 __ br(Assembler::CC, L_loadkeys_44); 2746 __ br(Assembler::EQ, L_loadkeys_52); 2747 2748 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2749 __ rev32(v17, __ T16B, v17); 2750 __ rev32(v18, __ T16B, v18); 2751 __ BIND(L_loadkeys_52); 2752 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2753 __ rev32(v19, __ T16B, v19); 2754 __ rev32(v20, __ T16B, v20); 2755 __ BIND(L_loadkeys_44); 2756 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2757 __ rev32(v21, __ 
T16B, v21); 2758 __ rev32(v22, __ T16B, v22); 2759 __ rev32(v23, __ T16B, v23); 2760 __ rev32(v24, __ T16B, v24); 2761 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2762 __ rev32(v25, __ T16B, v25); 2763 __ rev32(v26, __ T16B, v26); 2764 __ rev32(v27, __ T16B, v27); 2765 __ rev32(v28, __ T16B, v28); 2766 __ ld1(v29, v30, v31, __ T16B, key); 2767 __ rev32(v29, __ T16B, v29); 2768 __ rev32(v30, __ T16B, v30); 2769 __ rev32(v31, __ T16B, v31); 2770 2771 __ BIND(L_aes_loop); 2772 __ ld1(v1, __ T16B, __ post(from, 16)); 2773 __ eor(v0, __ T16B, v0, v1); 2774 2775 __ br(Assembler::CC, L_rounds_44); 2776 __ br(Assembler::EQ, L_rounds_52); 2777 2778 __ aese(v0, v17); __ aesmc(v0, v0); 2779 __ aese(v0, v18); __ aesmc(v0, v0); 2780 __ BIND(L_rounds_52); 2781 __ aese(v0, v19); __ aesmc(v0, v0); 2782 __ aese(v0, v20); __ aesmc(v0, v0); 2783 __ BIND(L_rounds_44); 2784 __ aese(v0, v21); __ aesmc(v0, v0); 2785 __ aese(v0, v22); __ aesmc(v0, v0); 2786 __ aese(v0, v23); __ aesmc(v0, v0); 2787 __ aese(v0, v24); __ aesmc(v0, v0); 2788 __ aese(v0, v25); __ aesmc(v0, v0); 2789 __ aese(v0, v26); __ aesmc(v0, v0); 2790 __ aese(v0, v27); __ aesmc(v0, v0); 2791 __ aese(v0, v28); __ aesmc(v0, v0); 2792 __ aese(v0, v29); __ aesmc(v0, v0); 2793 __ aese(v0, v30); 2794 __ eor(v0, __ T16B, v0, v31); 2795 2796 __ st1(v0, __ T16B, __ post(to, 16)); 2797 2798 __ subw(len_reg, len_reg, 16); 2799 __ cbnzw(len_reg, L_aes_loop); 2800 2801 __ st1(v0, __ T16B, rvec); 2802 2803 __ mov(r0, rscratch2); 2804 2805 __ leave(); 2806 __ ret(lr); 2807 2808 return start; 2809 } 2810 2811 // Arguments: 2812 // 2813 // Inputs: 2814 // c_rarg0 - source byte array address 2815 // c_rarg1 - destination byte array address 2816 // c_rarg2 - K (key) in little endian int array 2817 // c_rarg3 - r vector byte array address 2818 // c_rarg4 - input length 2819 // 2820 // Output: 2821 // r0 - input length 2822 // 2823 address generate_cipherBlockChaining_decryptAESCrypt() { 2824 assert(UseAES, "need AES instructions and misaligned SSE support"); 2825 __ align(CodeEntryAlignment); 2826 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 2827 2828 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2829 2830 const Register from = c_rarg0; // source array address 2831 const Register to = c_rarg1; // destination array address 2832 const Register key = c_rarg2; // key array address 2833 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2834 // and left with the results of the last encryption block 2835 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2836 const Register keylen = rscratch1; 2837 2838 address start = __ pc(); 2839 2840 __ enter(); 2841 2842 __ movw(rscratch2, len_reg); 2843 2844 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2845 2846 __ ld1(v2, __ T16B, rvec); 2847 2848 __ ld1(v31, __ T16B, __ post(key, 16)); 2849 __ rev32(v31, __ T16B, v31); 2850 2851 __ cmpw(keylen, 52); 2852 __ br(Assembler::CC, L_loadkeys_44); 2853 __ br(Assembler::EQ, L_loadkeys_52); 2854 2855 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2856 __ rev32(v17, __ T16B, v17); 2857 __ rev32(v18, __ T16B, v18); 2858 __ BIND(L_loadkeys_52); 2859 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2860 __ rev32(v19, __ T16B, v19); 2861 __ rev32(v20, __ T16B, v20); 2862 __ BIND(L_loadkeys_44); 2863 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2864 __ rev32(v21, __ T16B, v21); 2865 
__ rev32(v22, __ T16B, v22); 2866 __ rev32(v23, __ T16B, v23); 2867 __ rev32(v24, __ T16B, v24); 2868 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2869 __ rev32(v25, __ T16B, v25); 2870 __ rev32(v26, __ T16B, v26); 2871 __ rev32(v27, __ T16B, v27); 2872 __ rev32(v28, __ T16B, v28); 2873 __ ld1(v29, v30, __ T16B, key); 2874 __ rev32(v29, __ T16B, v29); 2875 __ rev32(v30, __ T16B, v30); 2876 2877 __ BIND(L_aes_loop); 2878 __ ld1(v0, __ T16B, __ post(from, 16)); 2879 __ orr(v1, __ T16B, v0, v0); 2880 2881 __ br(Assembler::CC, L_rounds_44); 2882 __ br(Assembler::EQ, L_rounds_52); 2883 2884 __ aesd(v0, v17); __ aesimc(v0, v0); 2885 __ aesd(v0, v18); __ aesimc(v0, v0); 2886 __ BIND(L_rounds_52); 2887 __ aesd(v0, v19); __ aesimc(v0, v0); 2888 __ aesd(v0, v20); __ aesimc(v0, v0); 2889 __ BIND(L_rounds_44); 2890 __ aesd(v0, v21); __ aesimc(v0, v0); 2891 __ aesd(v0, v22); __ aesimc(v0, v0); 2892 __ aesd(v0, v23); __ aesimc(v0, v0); 2893 __ aesd(v0, v24); __ aesimc(v0, v0); 2894 __ aesd(v0, v25); __ aesimc(v0, v0); 2895 __ aesd(v0, v26); __ aesimc(v0, v0); 2896 __ aesd(v0, v27); __ aesimc(v0, v0); 2897 __ aesd(v0, v28); __ aesimc(v0, v0); 2898 __ aesd(v0, v29); __ aesimc(v0, v0); 2899 __ aesd(v0, v30); 2900 __ eor(v0, __ T16B, v0, v31); 2901 __ eor(v0, __ T16B, v0, v2); 2902 2903 __ st1(v0, __ T16B, __ post(to, 16)); 2904 __ orr(v2, __ T16B, v1, v1); 2905 2906 __ subw(len_reg, len_reg, 16); 2907 __ cbnzw(len_reg, L_aes_loop); 2908 2909 __ st1(v2, __ T16B, rvec); 2910 2911 __ mov(r0, rscratch2); 2912 2913 __ leave(); 2914 __ ret(lr); 2915 2916 return start; 2917 } 2918 2919 // Arguments: 2920 // 2921 // Inputs: 2922 // c_rarg0 - byte[] source+offset 2923 // c_rarg1 - int[] SHA.state 2924 // c_rarg2 - int offset 2925 // c_rarg3 - int limit 2926 // 2927 address generate_sha1_implCompress(bool multi_block, const char *name) { 2928 __ align(CodeEntryAlignment); 2929 StubCodeMark mark(this, "StubRoutines", name); 2930 address start = __ pc(); 2931 2932 Register buf = c_rarg0; 2933 Register state = c_rarg1; 2934 Register ofs = c_rarg2; 2935 Register limit = c_rarg3; 2936 2937 Label keys; 2938 Label sha1_loop; 2939 2940 // load the keys into v0..v3 2941 __ adr(rscratch1, keys); 2942 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 2943 // load 5 words state into v6, v7 2944 __ ldrq(v6, Address(state, 0)); 2945 __ ldrs(v7, Address(state, 16)); 2946 2947 2948 __ BIND(sha1_loop); 2949 // load 64 bytes of data into v16..v19 2950 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 2951 __ rev32(v16, __ T16B, v16); 2952 __ rev32(v17, __ T16B, v17); 2953 __ rev32(v18, __ T16B, v18); 2954 __ rev32(v19, __ T16B, v19); 2955 2956 // do the sha1 2957 __ addv(v4, __ T4S, v16, v0); 2958 __ orr(v20, __ T16B, v6, v6); 2959 2960 FloatRegister d0 = v16; 2961 FloatRegister d1 = v17; 2962 FloatRegister d2 = v18; 2963 FloatRegister d3 = v19; 2964 2965 for (int round = 0; round < 20; round++) { 2966 FloatRegister tmp1 = (round & 1) ? v4 : v5; 2967 FloatRegister tmp2 = (round & 1) ? v21 : v22; 2968 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 2969 FloatRegister tmp4 = (round & 1) ? v5 : v4; 2970 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? 
v2 : v3)); 2971 2972 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 2973 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 2974 __ sha1h(tmp2, __ T4S, v20); 2975 if (round < 5) 2976 __ sha1c(v20, __ T4S, tmp3, tmp4); 2977 else if (round < 10 || round >= 15) 2978 __ sha1p(v20, __ T4S, tmp3, tmp4); 2979 else 2980 __ sha1m(v20, __ T4S, tmp3, tmp4); 2981 if (round < 16) __ sha1su1(d0, __ T4S, d3); 2982 2983 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 2984 } 2985 2986 __ addv(v7, __ T2S, v7, v21); 2987 __ addv(v6, __ T4S, v6, v20); 2988 2989 if (multi_block) { 2990 __ add(ofs, ofs, 64); 2991 __ cmp(ofs, limit); 2992 __ br(Assembler::LE, sha1_loop); 2993 __ mov(c_rarg0, ofs); // return ofs 2994 } 2995 2996 __ strq(v6, Address(state, 0)); 2997 __ strs(v7, Address(state, 16)); 2998 2999 __ ret(lr); 3000 3001 __ bind(keys); 3002 __ emit_int32(0x5a827999); 3003 __ emit_int32(0x6ed9eba1); 3004 __ emit_int32(0x8f1bbcdc); 3005 __ emit_int32(0xca62c1d6); 3006 3007 return start; 3008 } 3009 3010 3011 // Arguments: 3012 // 3013 // Inputs: 3014 // c_rarg0 - byte[] source+offset 3015 // c_rarg1 - int[] SHA.state 3016 // c_rarg2 - int offset 3017 // c_rarg3 - int limit 3018 // 3019 address generate_sha256_implCompress(bool multi_block, const char *name) { 3020 static const uint32_t round_consts[64] = { 3021 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3022 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3023 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3024 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3025 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3026 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3027 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3028 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3029 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3030 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3031 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3032 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3033 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3034 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3035 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3036 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3037 }; 3038 __ align(CodeEntryAlignment); 3039 StubCodeMark mark(this, "StubRoutines", name); 3040 address start = __ pc(); 3041 3042 Register buf = c_rarg0; 3043 Register state = c_rarg1; 3044 Register ofs = c_rarg2; 3045 Register limit = c_rarg3; 3046 3047 Label sha1_loop; 3048 3049 __ stpd(v8, v9, __ pre(sp, -32)); 3050 __ stpd(v10, v11, Address(sp, 16)); 3051 3052 // dga == v0 3053 // dgb == v1 3054 // dg0 == v2 3055 // dg1 == v3 3056 // dg2 == v4 3057 // t0 == v6 3058 // t1 == v7 3059 3060 // load 16 keys to v16..v31 3061 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3062 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3063 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3064 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3065 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3066 3067 // load 8 words (256 bits) state 3068 __ ldpq(v0, v1, state); 3069 3070 __ BIND(sha1_loop); 3071 // load 64 bytes of data into v8..v11 3072 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 3073 __ rev32(v8, __ T16B, v8); 3074 __ rev32(v9, __ T16B, v9); 3075 __ rev32(v10, __ T16B, v10); 3076 __ rev32(v11, __ T16B, v11); 3077 3078 __ addv(v6, __ T4S, v8, v16); 3079 __ orr(v2, __ T16B, v0, v0); 3080 __ orr(v3, __ T16B, v1, v1); 3081 3082 FloatRegister d0 = v8; 3083 FloatRegister d1 = v9; 3084 FloatRegister d2 = v10; 3085 FloatRegister d3 = v11; 3086 3087 3088 for (int round = 0; round < 16; round++) { 3089 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3090 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3091 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3092 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3093 3094 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3095 __ orr(v4, __ T16B, v2, v2); 3096 if (round < 15) 3097 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3098 __ sha256h(v2, __ T4S, v3, tmp2); 3099 __ sha256h2(v3, __ T4S, v4, tmp2); 3100 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3101 3102 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3103 } 3104 3105 __ addv(v0, __ T4S, v0, v2); 3106 __ addv(v1, __ T4S, v1, v3); 3107 3108 if (multi_block) { 3109 __ add(ofs, ofs, 64); 3110 __ cmp(ofs, limit); 3111 __ br(Assembler::LE, sha1_loop); 3112 __ mov(c_rarg0, ofs); // return ofs 3113 } 3114 3115 __ ldpd(v10, v11, Address(sp, 16)); 3116 __ ldpd(v8, v9, __ post(sp, 32)); 3117 3118 __ stpq(v0, v1, state); 3119 3120 __ ret(lr); 3121 3122 return start; 3123 } 3124 3125 #ifndef BUILTIN_SIM 3126 // Safefetch stubs. 3127 void generate_safefetch(const char* name, int size, address* entry, 3128 address* fault_pc, address* continuation_pc) { 3129 // safefetch signatures: 3130 // int SafeFetch32(int* adr, int errValue); 3131 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue); 3132 // 3133 // arguments: 3134 // c_rarg0 = adr 3135 // c_rarg1 = errValue 3136 // 3137 // result: 3138 // PPC_RET = *adr or errValue 3139 3140 StubCodeMark mark(this, "StubRoutines", name); 3141 3142 // Entry point, pc or function descriptor. 3143 *entry = __ pc(); 3144 3145 // Load *adr into c_rarg1, may fault. 
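    // If the load below faults, the signal handler recognizes *fault_pc
    // and resumes execution at *continuation_pc, so the errValue still
    // sitting in c_rarg1 becomes the result. Illustrative use from C++:
    //
    //   int v = SafeFetch32((int*)addr, -1);
    //   if (v == -1) { /* addr was unreadable (or really contained -1) */ }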
3146 *fault_pc = __ pc(); 3147 switch (size) { 3148 case 4: 3149 // int32_t 3150 __ ldrw(c_rarg1, Address(c_rarg0, 0)); 3151 break; 3152 case 8: 3153 // int64_t 3154 __ ldr(c_rarg1, Address(c_rarg0, 0)); 3155 break; 3156 default: 3157 ShouldNotReachHere(); 3158 } 3159 3160 // return errValue or *adr 3161 *continuation_pc = __ pc(); 3162 __ mov(r0, c_rarg1); 3163 __ ret(lr); 3164 } 3165 #endif 3166 3167 /** 3168 * Arguments: 3169 * 3170 * Inputs: 3171 * c_rarg0 - int crc 3172 * c_rarg1 - byte* buf 3173 * c_rarg2 - int length 3174 * 3175 * Ouput: 3176 * rax - int crc result 3177 */ 3178 address generate_updateBytesCRC32() { 3179 assert(UseCRC32Intrinsics, "what are we doing here?"); 3180 3181 __ align(CodeEntryAlignment); 3182 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32"); 3183 3184 address start = __ pc(); 3185 3186 const Register crc = c_rarg0; // crc 3187 const Register buf = c_rarg1; // source java byte array address 3188 const Register len = c_rarg2; // length 3189 const Register table0 = c_rarg3; // crc_table address 3190 const Register table1 = c_rarg4; 3191 const Register table2 = c_rarg5; 3192 const Register table3 = c_rarg6; 3193 const Register tmp3 = c_rarg7; 3194 3195 BLOCK_COMMENT("Entry:"); 3196 __ enter(); // required for proper stackwalking of RuntimeStub frame 3197 3198 __ kernel_crc32(crc, buf, len, 3199 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 3200 3201 __ leave(); // required for proper stackwalking of RuntimeStub frame 3202 __ ret(lr); 3203 3204 return start; 3205 } 3206 3207 /** 3208 * Arguments: 3209 * 3210 * Inputs: 3211 * c_rarg0 - int crc 3212 * c_rarg1 - byte* buf 3213 * c_rarg2 - int length 3214 * c_rarg3 - int* table 3215 * 3216 * Ouput: 3217 * r0 - int crc result 3218 */ 3219 address generate_updateBytesCRC32C() { 3220 assert(UseCRC32CIntrinsics, "what are we doing here?"); 3221 3222 __ align(CodeEntryAlignment); 3223 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C"); 3224 3225 address start = __ pc(); 3226 3227 const Register crc = c_rarg0; // crc 3228 const Register buf = c_rarg1; // source java byte array address 3229 const Register len = c_rarg2; // length 3230 const Register table0 = c_rarg3; // crc_table address 3231 const Register table1 = c_rarg4; 3232 const Register table2 = c_rarg5; 3233 const Register table3 = c_rarg6; 3234 const Register tmp3 = c_rarg7; 3235 3236 BLOCK_COMMENT("Entry:"); 3237 __ enter(); // required for proper stackwalking of RuntimeStub frame 3238 3239 __ kernel_crc32c(crc, buf, len, 3240 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 3241 3242 __ leave(); // required for proper stackwalking of RuntimeStub frame 3243 __ ret(lr); 3244 3245 return start; 3246 } 3247 3248 /*** 3249 * Arguments: 3250 * 3251 * Inputs: 3252 * c_rarg0 - int adler 3253 * c_rarg1 - byte* buff 3254 * c_rarg2 - int len 3255 * 3256 * Output: 3257 * c_rarg0 - int adler result 3258 */ 3259 address generate_updateBytesAdler32() { 3260 __ align(CodeEntryAlignment); 3261 StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32"); 3262 address start = __ pc(); 3263 3264 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 3265 3266 // Aliases 3267 Register adler = c_rarg0; 3268 Register s1 = c_rarg0; 3269 Register s2 = c_rarg3; 3270 Register buff = c_rarg1; 3271 Register len = c_rarg2; 3272 Register nmax = r4; 3273 Register base = r5; 3274 Register count = r6; 3275 Register temp0 = rscratch1; 3276 Register temp1 = rscratch2; 3277 FloatRegister vbytes = 
v0; 3278 FloatRegister vs1acc = v1; 3279 FloatRegister vs2acc = v2; 3280 FloatRegister vtable = v3; 3281 3282 // Max number of bytes we can process before having to take the mod 3283 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 3284 unsigned long BASE = 0xfff1; 3285 unsigned long NMAX = 0x15B0; 3286 3287 __ mov(base, BASE); 3288 __ mov(nmax, NMAX); 3289 3290 // Load accumulation coefficients for the upper 16 bits 3291 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table)); 3292 __ ld1(vtable, __ T16B, Address(temp0)); 3293 3294 // s1 is initialized to the lower 16 bits of adler 3295 // s2 is initialized to the upper 16 bits of adler 3296 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 3297 __ uxth(s1, adler); // s1 = (adler & 0xffff) 3298 3299 // The pipelined loop needs at least 16 elements for 1 iteration 3300 // It does check this, but it is more effective to skip to the cleanup loop 3301 __ cmp(len, (u1)16); 3302 __ br(Assembler::HS, L_nmax); 3303 __ cbz(len, L_combine); 3304 3305 __ bind(L_simple_by1_loop); 3306 __ ldrb(temp0, Address(__ post(buff, 1))); 3307 __ add(s1, s1, temp0); 3308 __ add(s2, s2, s1); 3309 __ subs(len, len, 1); 3310 __ br(Assembler::HI, L_simple_by1_loop); 3311 3312 // s1 = s1 % BASE 3313 __ subs(temp0, s1, base); 3314 __ csel(s1, temp0, s1, Assembler::HS); 3315 3316 // s2 = s2 % BASE 3317 __ lsr(temp0, s2, 16); 3318 __ lsl(temp1, temp0, 4); 3319 __ sub(temp1, temp1, temp0); 3320 __ add(s2, temp1, s2, ext::uxth); 3321 3322 __ subs(temp0, s2, base); 3323 __ csel(s2, temp0, s2, Assembler::HS); 3324 3325 __ b(L_combine); 3326 3327 __ bind(L_nmax); 3328 __ subs(len, len, nmax); 3329 __ sub(count, nmax, 16); 3330 __ br(Assembler::LO, L_by16); 3331 3332 __ bind(L_nmax_loop); 3333 3334 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 3335 vbytes, vs1acc, vs2acc, vtable); 3336 3337 __ subs(count, count, 16); 3338 __ br(Assembler::HS, L_nmax_loop); 3339 3340 // s1 = s1 % BASE 3341 __ lsr(temp0, s1, 16); 3342 __ lsl(temp1, temp0, 4); 3343 __ sub(temp1, temp1, temp0); 3344 __ add(temp1, temp1, s1, ext::uxth); 3345 3346 __ lsr(temp0, temp1, 16); 3347 __ lsl(s1, temp0, 4); 3348 __ sub(s1, s1, temp0); 3349 __ add(s1, s1, temp1, ext:: uxth); 3350 3351 __ subs(temp0, s1, base); 3352 __ csel(s1, temp0, s1, Assembler::HS); 3353 3354 // s2 = s2 % BASE 3355 __ lsr(temp0, s2, 16); 3356 __ lsl(temp1, temp0, 4); 3357 __ sub(temp1, temp1, temp0); 3358 __ add(temp1, temp1, s2, ext::uxth); 3359 3360 __ lsr(temp0, temp1, 16); 3361 __ lsl(s2, temp0, 4); 3362 __ sub(s2, s2, temp0); 3363 __ add(s2, s2, temp1, ext:: uxth); 3364 3365 __ subs(temp0, s2, base); 3366 __ csel(s2, temp0, s2, Assembler::HS); 3367 3368 __ subs(len, len, nmax); 3369 __ sub(count, nmax, 16); 3370 __ br(Assembler::HS, L_nmax_loop); 3371 3372 __ bind(L_by16); 3373 __ adds(len, len, count); 3374 __ br(Assembler::LO, L_by1); 3375 3376 __ bind(L_by16_loop); 3377 3378 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 3379 vbytes, vs1acc, vs2acc, vtable); 3380 3381 __ subs(len, len, 16); 3382 __ br(Assembler::HS, L_by16_loop); 3383 3384 __ bind(L_by1); 3385 __ adds(len, len, 15); 3386 __ br(Assembler::LO, L_do_mod); 3387 3388 __ bind(L_by1_loop); 3389 __ ldrb(temp0, Address(__ post(buff, 1))); 3390 __ add(s1, temp0, s1); 3391 __ add(s2, s2, s1); 3392 __ subs(len, len, 1); 3393 __ br(Assembler::HS, L_by1_loop); 3394 3395 __ bind(L_do_mod); 3396 // s1 = s1 % BASE 3397 __ lsr(temp0, s1, 16); 3398 __ lsl(temp1, temp0, 4); 
3399 __ sub(temp1, temp1, temp0); 3400 __ add(temp1, temp1, s1, ext::uxth); 3401 3402 __ lsr(temp0, temp1, 16); 3403 __ lsl(s1, temp0, 4); 3404 __ sub(s1, s1, temp0); 3405 __ add(s1, s1, temp1, ext:: uxth); 3406 3407 __ subs(temp0, s1, base); 3408 __ csel(s1, temp0, s1, Assembler::HS); 3409 3410 // s2 = s2 % BASE 3411 __ lsr(temp0, s2, 16); 3412 __ lsl(temp1, temp0, 4); 3413 __ sub(temp1, temp1, temp0); 3414 __ add(temp1, temp1, s2, ext::uxth); 3415 3416 __ lsr(temp0, temp1, 16); 3417 __ lsl(s2, temp0, 4); 3418 __ sub(s2, s2, temp0); 3419 __ add(s2, s2, temp1, ext:: uxth); 3420 3421 __ subs(temp0, s2, base); 3422 __ csel(s2, temp0, s2, Assembler::HS); 3423 3424 // Combine lower bits and higher bits 3425 __ bind(L_combine); 3426 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) 3427 3428 __ ret(lr); 3429 3430 return start; 3431 } 3432 3433 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff, 3434 Register temp0, Register temp1, FloatRegister vbytes, 3435 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) { 3436 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes. 3437 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration. 3438 // In non-vectorized code, we update s1 and s2 as: 3439 // s1 <- s1 + b1 3440 // s2 <- s2 + s1 3441 // s1 <- s1 + b2 3442 // s2 <- s2 + b1 3443 // ... 3444 // s1 <- s1 + b16 3445 // s2 <- s2 + s1 3446 // Putting above assignments together, we have: 3447 // s1_new = s1 + b1 + b2 + ... + b16 3448 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16) 3449 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1) 3450 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1) 3451 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16))); 3452 3453 // s2 = s2 + s1 * 16 3454 __ add(s2, s2, s1, Assembler::LSL, 4); 3455 3456 // vs1acc = b1 + b2 + b3 + ... + b16 3457 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... 
+ (b16 * 1) 3458 __ umullv(vs2acc, __ T8B, vtable, vbytes); 3459 __ umlalv(vs2acc, __ T16B, vtable, vbytes); 3460 __ uaddlv(vs1acc, __ T16B, vbytes); 3461 __ uaddlv(vs2acc, __ T8H, vs2acc); 3462 3463 // s1 = s1 + vs1acc, s2 = s2 + vs2acc 3464 __ fmovd(temp0, vs1acc); 3465 __ fmovd(temp1, vs2acc); 3466 __ add(s1, s1, temp0); 3467 __ add(s2, s2, temp1); 3468 } 3469 3470 /** 3471 * Arguments: 3472 * 3473 * Input: 3474 * c_rarg0 - x address 3475 * c_rarg1 - x length 3476 * c_rarg2 - y address 3477 * c_rarg3 - y length 3478 * c_rarg4 - z address 3479 * c_rarg5 - z length 3480 */ 3481 address generate_multiplyToLen() { 3482 __ align(CodeEntryAlignment); 3483 StubCodeMark mark(this, "StubRoutines", "multiplyToLen"); 3484 3485 address start = __ pc(); 3486 const Register x = r0; 3487 const Register xlen = r1; 3488 const Register y = r2; 3489 const Register ylen = r3; 3490 const Register z = r4; 3491 const Register zlen = r5; 3492 3493 const Register tmp1 = r10; 3494 const Register tmp2 = r11; 3495 const Register tmp3 = r12; 3496 const Register tmp4 = r13; 3497 const Register tmp5 = r14; 3498 const Register tmp6 = r15; 3499 const Register tmp7 = r16; 3500 3501 BLOCK_COMMENT("Entry:"); 3502 __ enter(); // required for proper stackwalking of RuntimeStub frame 3503 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 3504 __ leave(); // required for proper stackwalking of RuntimeStub frame 3505 __ ret(lr); 3506 3507 return start; 3508 } 3509 3510 address generate_squareToLen() { 3511 // The squareToLen algorithm for sizes 1..127 described in the Java code runs 3512 // faster than multiply_to_len on some CPUs and slower on others, but 3513 // multiply_to_len shows slightly better results overall 3514 __ align(CodeEntryAlignment); 3515 StubCodeMark mark(this, "StubRoutines", "squareToLen"); 3516 address start = __ pc(); 3517 3518 const Register x = r0; 3519 const Register xlen = r1; 3520 const Register z = r2; 3521 const Register zlen = r3; 3522 const Register y = r4; // == x 3523 const Register ylen = r5; // == xlen 3524 3525 const Register tmp1 = r10; 3526 const Register tmp2 = r11; 3527 const Register tmp3 = r12; 3528 const Register tmp4 = r13; 3529 const Register tmp5 = r14; 3530 const Register tmp6 = r15; 3531 const Register tmp7 = r16; 3532 3533 RegSet spilled_regs = RegSet::of(y, ylen); 3534 BLOCK_COMMENT("Entry:"); 3535 __ enter(); 3536 __ push(spilled_regs, sp); 3537 __ mov(y, x); 3538 __ mov(ylen, xlen); 3539 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 3540 __ pop(spilled_regs, sp); 3541 __ leave(); 3542 __ ret(lr); 3543 return start; 3544 } 3545 3546 address generate_mulAdd() { 3547 __ align(CodeEntryAlignment); 3548 StubCodeMark mark(this, "StubRoutines", "mulAdd"); 3549 3550 address start = __ pc(); 3551 3552 const Register out = r0; 3553 const Register in = r1; 3554 const Register offset = r2; 3555 const Register len = r3; 3556 const Register k = r4; 3557 3558 BLOCK_COMMENT("Entry:"); 3559 __ enter(); 3560 __ mul_add(out, in, offset, len, k); 3561 __ leave(); 3562 __ ret(lr); 3563 3564 return start; 3565 } 3566 3567 void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi, 3568 FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0, 3569 FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) { 3570 // Karatsuba multiplication performs a 128*128 -> 256-bit 3571 // multiplication in three 128-bit multiplications and a few 3572 // additions.
3573 // 3574 // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1) 3575 // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0 3576 // 3577 // Inputs: 3578 // 3579 // A0 in a.d[0] (subkey) 3580 // A1 in a.d[1] 3581 // (A1+A0) in a1_xor_a0.d[0] 3582 // 3583 // B0 in b.d[0] (state) 3584 // B1 in b.d[1] 3585 3586 __ ext(tmp1, __ T16B, b, b, 0x08); 3587 __ pmull2(result_hi, __ T1Q, b, a, __ T2D); // A1*B1 3588 __ eor(tmp1, __ T16B, tmp1, b); // (B1+B0) 3589 __ pmull(result_lo, __ T1Q, b, a, __ T1D); // A0*B0 3590 __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0) 3591 3592 __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08); 3593 __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0 3594 __ eor(tmp2, __ T16B, tmp2, tmp4); 3595 __ eor(tmp2, __ T16B, tmp2, tmp3); 3596 3597 // Register pair <result_hi:result_lo> holds the result of carry-less multiplication 3598 __ ins(result_hi, __ D, tmp2, 0, 1); 3599 __ ins(result_lo, __ D, tmp2, 1, 0); 3600 } 3601 3602 void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi, 3603 FloatRegister p, FloatRegister z, FloatRegister t1) { 3604 const FloatRegister t0 = result; 3605 3606 // The GCM field polynomial f is z^128 + p(z), where p = 3607 // z^7+z^2+z+1. 3608 // 3609 // z^128 === -p(z) (mod (z^128 + p(z))) 3610 // 3611 // so, given that the product we're reducing is 3612 // a == lo + hi * z^128 3613 // substituting, 3614 // === lo - hi * p(z) (mod (z^128 + p(z))) 3615 // 3616 // we reduce by multiplying hi by p(z) and subtracting the result 3617 // from (i.e. XORing it with) lo. Because p has no nonzero high 3618 // bits we can do this with two 64-bit multiplications, lo*p and 3619 // hi*p. 3620 3621 __ pmull2(t0, __ T1Q, hi, p, __ T2D); 3622 __ ext(t1, __ T16B, t0, z, 8); 3623 __ eor(hi, __ T16B, hi, t1); 3624 __ ext(t1, __ T16B, z, t0, 8); 3625 __ eor(lo, __ T16B, lo, t1); 3626 __ pmull(t0, __ T1Q, hi, p, __ T1D); 3627 __ eor(result, __ T16B, lo, t0); 3628 } 3629 3630 address generate_has_negatives(address &has_negatives_long) { 3631 const u1 large_loop_size = 64; 3632 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 3633 int dcache_line = VM_Version::dcache_line_size(); 3634 3635 Register ary1 = r1, len = r2, result = r0; 3636 3637 __ align(CodeEntryAlignment); 3638 3639 StubCodeMark mark(this, "StubRoutines", "has_negatives"); 3640 3641 address entry = __ pc(); 3642 3643 __ enter(); 3644 3645 Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE, 3646 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 3647 3648 __ cmp(len, (u1)15); 3649 __ br(Assembler::GT, LEN_OVER_15); 3650 // The only case when execution falls into this code is when pointer is near 3651 // the end of memory page and we have to avoid reading next page 3652 __ add(ary1, ary1, len); 3653 __ subs(len, len, 8); 3654 __ br(Assembler::GT, LEN_OVER_8); 3655 __ ldr(rscratch2, Address(ary1, -8)); 3656 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 
3657 __ lsrv(rscratch2, rscratch2, rscratch1); 3658 __ tst(rscratch2, UPPER_BIT_MASK); 3659 __ cset(result, Assembler::NE); 3660 __ leave(); 3661 __ ret(lr); 3662 __ bind(LEN_OVER_8); 3663 __ ldp(rscratch1, rscratch2, Address(ary1, -16)); 3664 __ sub(len, len, 8); // no data dep., then sub can be executed while loading 3665 __ tst(rscratch2, UPPER_BIT_MASK); 3666 __ br(Assembler::NE, RET_TRUE_NO_POP); 3667 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes 3668 __ lsrv(rscratch1, rscratch1, rscratch2); 3669 __ tst(rscratch1, UPPER_BIT_MASK); 3670 __ cset(result, Assembler::NE); 3671 __ leave(); 3672 __ ret(lr); 3673 3674 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10; 3675 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6; 3676 3677 has_negatives_long = __ pc(); // 2nd entry point 3678 3679 __ enter(); 3680 3681 __ bind(LEN_OVER_15); 3682 __ push(spilled_regs, sp); 3683 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment 3684 __ cbz(rscratch2, ALIGNED); 3685 __ ldp(tmp6, tmp1, Address(ary1)); 3686 __ mov(tmp5, 16); 3687 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address 3688 __ add(ary1, ary1, rscratch1); 3689 __ sub(len, len, rscratch1); 3690 __ orr(tmp6, tmp6, tmp1); 3691 __ tst(tmp6, UPPER_BIT_MASK); 3692 __ br(Assembler::NE, RET_TRUE); 3693 3694 __ bind(ALIGNED); 3695 __ cmp(len, large_loop_size); 3696 __ br(Assembler::LT, CHECK_16); 3697 // Perform a 16-byte load as an early return in the pre-loop to handle the case 3698 // where an initially aligned large array has negative values in its starting bytes, 3699 // so that LARGE_LOOP would otherwise do 4 reads instead of 1 (in the worst case), 3700 // which is slower. Cases with negative bytes further ahead are not affected nearly 3701 // as much; in fact they become faster due to the early loads, fewer instructions and 3702 // fewer branches in LARGE_LOOP. 3703 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16))); 3704 __ sub(len, len, 16); 3705 __ orr(tmp6, tmp6, tmp1); 3706 __ tst(tmp6, UPPER_BIT_MASK); 3707 __ br(Assembler::NE, RET_TRUE); 3708 __ cmp(len, large_loop_size); 3709 __ br(Assembler::LT, CHECK_16); 3710 3711 if (SoftwarePrefetchHintDistance >= 0 3712 && SoftwarePrefetchHintDistance >= dcache_line) { 3713 // initial prefetch 3714 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line)); 3715 } 3716 __ bind(LARGE_LOOP); 3717 if (SoftwarePrefetchHintDistance >= 0) { 3718 __ prfm(Address(ary1, SoftwarePrefetchHintDistance)); 3719 } 3720 // Issue the load instructions first, since this can save a few CPU/memory cycles; also, 3721 // instead of 4 triples of "orr(...); andr(...); cbnz(...);" (one per ldp) it is 3722 // better to generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...), which saves 3 3723 // instructions per iteration and has fewer branches, but this approach disables 3724 // early return, so all 64 bytes are loaded and checked every time.
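// A byte is negative exactly when its top bit is set, so OR-ing all eight
// loaded words together and testing the result against UPPER_BIT_MASK is
// enough to detect a negative byte anywhere in the 64-byte block.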
3725 __ ldp(tmp2, tmp3, Address(ary1)); 3726 __ ldp(tmp4, tmp5, Address(ary1, 16)); 3727 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 3728 __ ldp(tmp6, tmp1, Address(ary1, 48)); 3729 __ add(ary1, ary1, large_loop_size); 3730 __ sub(len, len, large_loop_size); 3731 __ orr(tmp2, tmp2, tmp3); 3732 __ orr(tmp4, tmp4, tmp5); 3733 __ orr(rscratch1, rscratch1, rscratch2); 3734 __ orr(tmp6, tmp6, tmp1); 3735 __ orr(tmp2, tmp2, tmp4); 3736 __ orr(rscratch1, rscratch1, tmp6); 3737 __ orr(tmp2, tmp2, rscratch1); 3738 __ tst(tmp2, UPPER_BIT_MASK); 3739 __ br(Assembler::NE, RET_TRUE); 3740 __ cmp(len, large_loop_size); 3741 __ br(Assembler::GE, LARGE_LOOP); 3742 3743 __ bind(CHECK_16); // small 16-byte load pre-loop 3744 __ cmp(len, (u1)16); 3745 __ br(Assembler::LT, POST_LOOP16); 3746 3747 __ bind(LOOP16); // small 16-byte load loop 3748 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 3749 __ sub(len, len, 16); 3750 __ orr(tmp2, tmp2, tmp3); 3751 __ tst(tmp2, UPPER_BIT_MASK); 3752 __ br(Assembler::NE, RET_TRUE); 3753 __ cmp(len, (u1)16); 3754 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 3755 3756 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 3757 __ cmp(len, (u1)8); 3758 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 3759 __ ldr(tmp3, Address(__ post(ary1, 8))); 3760 __ sub(len, len, 8); 3761 __ tst(tmp3, UPPER_BIT_MASK); 3762 __ br(Assembler::NE, RET_TRUE); 3763 3764 __ bind(POST_LOOP16_LOAD_TAIL); 3765 __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0 3766 __ ldr(tmp1, Address(ary1)); 3767 __ mov(tmp2, 64); 3768 __ sub(tmp4, tmp2, len, __ LSL, 3); 3769 __ lslv(tmp1, tmp1, tmp4); 3770 __ tst(tmp1, UPPER_BIT_MASK); 3771 __ br(Assembler::NE, RET_TRUE); 3772 // Fallthrough 3773 3774 __ bind(RET_FALSE); 3775 __ pop(spilled_regs, sp); 3776 __ leave(); 3777 __ mov(result, zr); 3778 __ ret(lr); 3779 3780 __ bind(RET_TRUE); 3781 __ pop(spilled_regs, sp); 3782 __ bind(RET_TRUE_NO_POP); 3783 __ leave(); 3784 __ mov(result, 1); 3785 __ ret(lr); 3786 3787 __ bind(DONE); 3788 __ pop(spilled_regs, sp); 3789 __ leave(); 3790 __ ret(lr); 3791 return entry; 3792 } 3793 3794 void generate_large_array_equals_loop_nonsimd(int loopThreshold, 3795 bool usePrefetch, Label &NOT_EQUAL) { 3796 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 3797 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 3798 tmp7 = r12, tmp8 = r13; 3799 Label LOOP; 3800 3801 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3802 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3803 __ bind(LOOP); 3804 if (usePrefetch) { 3805 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 3806 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 3807 } 3808 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 3809 __ eor(tmp1, tmp1, tmp2); 3810 __ eor(tmp3, tmp3, tmp4); 3811 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 3812 __ orr(tmp1, tmp1, tmp3); 3813 __ cbnz(tmp1, NOT_EQUAL); 3814 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3815 __ eor(tmp5, tmp5, tmp6); 3816 __ eor(tmp7, tmp7, tmp8); 3817 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3818 __ orr(tmp5, tmp5, tmp7); 3819 __ cbnz(tmp5, NOT_EQUAL); 3820 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 3821 __ eor(tmp1, tmp1, tmp2); 3822 __ eor(tmp3, tmp3, tmp4); 3823 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 3824 __ orr(tmp1, tmp1, tmp3); 3825 __ cbnz(tmp1, NOT_EQUAL); 3826 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3827 __ eor(tmp5, tmp5, tmp6); 
3828 __ sub(cnt1, cnt1, 8 * wordSize); 3829 __ eor(tmp7, tmp7, tmp8); 3830 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3831 // tmp6 is not used. MacroAssembler::subs is used here (rather than 3832 // cmp) because subs allows an unlimited range for the immediate operand. 3833 __ subs(tmp6, cnt1, loopThreshold); 3834 __ orr(tmp5, tmp5, tmp7); 3835 __ cbnz(tmp5, NOT_EQUAL); 3836 __ br(__ GE, LOOP); 3837 // post-loop 3838 __ eor(tmp1, tmp1, tmp2); 3839 __ eor(tmp3, tmp3, tmp4); 3840 __ orr(tmp1, tmp1, tmp3); 3841 __ sub(cnt1, cnt1, 2 * wordSize); 3842 __ cbnz(tmp1, NOT_EQUAL); 3843 } 3844 3845 void generate_large_array_equals_loop_simd(int loopThreshold, 3846 bool usePrefetch, Label &NOT_EQUAL) { 3847 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 3848 tmp2 = rscratch2; 3849 Label LOOP; 3850 3851 __ bind(LOOP); 3852 if (usePrefetch) { 3853 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 3854 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 3855 } 3856 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize))); 3857 __ sub(cnt1, cnt1, 8 * wordSize); 3858 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize))); 3859 __ subs(tmp1, cnt1, loopThreshold); 3860 __ eor(v0, __ T16B, v0, v4); 3861 __ eor(v1, __ T16B, v1, v5); 3862 __ eor(v2, __ T16B, v2, v6); 3863 __ eor(v3, __ T16B, v3, v7); 3864 __ orr(v0, __ T16B, v0, v1); 3865 __ orr(v1, __ T16B, v2, v3); 3866 __ orr(v0, __ T16B, v0, v1); 3867 __ umov(tmp1, v0, __ D, 0); 3868 __ umov(tmp2, v0, __ D, 1); 3869 __ orr(tmp1, tmp1, tmp2); 3870 __ cbnz(tmp1, NOT_EQUAL); 3871 __ br(__ GE, LOOP); 3872 } 3873 3874 // a1 = r1 - array1 address 3875 // a2 = r2 - array2 address 3876 // result = r0 - return value. Already contains "false" 3877 // cnt1 = r10 - number of elements left to check, reduced by wordSize 3878 // r3-r5 are reserved temporary registers 3879 address generate_large_array_equals() { 3880 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 3881 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 3882 tmp7 = r12, tmp8 = r13; 3883 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP, 3884 SMALL_LOOP, POST_LOOP; 3885 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16; 3886 // prefetch loop threshold: ensures that at least 32 of the prefetched bytes are actually used 3887 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32; 3888 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE); 3889 RegSet spilled_regs = RegSet::range(tmp6, tmp8); 3890 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4, 3891 tmp5, tmp6, tmp7, tmp8); 3892 3893 __ align(CodeEntryAlignment); 3894 3895 StubCodeMark mark(this, "StubRoutines", "large_array_equals"); 3896 3897 address entry = __ pc(); 3898 __ enter(); 3899 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub 3900 // also advance pointers to use post-increment instead of pre-increment 3901 __ add(a1, a1, wordSize); 3902 __ add(a2, a2, wordSize); 3903 if (AvoidUnalignedAccesses) { 3904 // Both implementations (SIMD/non-SIMD) use relatively wide load 3905 // instructions (ld1/ldp), which carry a large penalty (up to 2x execution time) 3906 // on some CPUs when the address is not at least 16-byte aligned. 3907 // Arrays are currently 8-byte aligned, so, if needed, perform one extra 8-byte 3908 // load for the first address to make it 16-byte aligned.
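// Only the first array's pointer is re-aligned here; in the worst case the
// second array stays merely 8-byte aligned, the assumption being that
// aligning at least one of the two streams recovers most of the penalty.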
3909 Label ALIGNED16; 3910 __ tbz(a1, 3, ALIGNED16); 3911 __ ldr(tmp1, Address(__ post(a1, wordSize))); 3912 __ ldr(tmp2, Address(__ post(a2, wordSize))); 3913 __ sub(cnt1, cnt1, wordSize); 3914 __ eor(tmp1, tmp1, tmp2); 3915 __ cbnz(tmp1, NOT_EQUAL_NO_POP); 3916 __ bind(ALIGNED16); 3917 } 3918 if (UseSIMDForArrayEquals) { 3919 if (SoftwarePrefetchHintDistance >= 0) { 3920 __ subs(tmp1, cnt1, prefetchLoopThreshold); 3921 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 3922 generate_large_array_equals_loop_simd(prefetchLoopThreshold, 3923 /* prfm = */ true, NOT_EQUAL); 3924 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 3925 __ br(__ LT, TAIL); 3926 } 3927 __ bind(NO_PREFETCH_LARGE_LOOP); 3928 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold, 3929 /* prfm = */ false, NOT_EQUAL); 3930 } else { 3931 __ push(spilled_regs, sp); 3932 if (SoftwarePrefetchHintDistance >= 0) { 3933 __ subs(tmp1, cnt1, prefetchLoopThreshold); 3934 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 3935 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold, 3936 /* prfm = */ true, NOT_EQUAL); 3937 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 3938 __ br(__ LT, TAIL); 3939 } 3940 __ bind(NO_PREFETCH_LARGE_LOOP); 3941 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold, 3942 /* prfm = */ false, NOT_EQUAL); 3943 } 3944 __ bind(TAIL); 3945 __ cbz(cnt1, EQUAL); 3946 __ subs(cnt1, cnt1, wordSize); 3947 __ br(__ LE, POST_LOOP); 3948 __ bind(SMALL_LOOP); 3949 __ ldr(tmp1, Address(__ post(a1, wordSize))); 3950 __ ldr(tmp2, Address(__ post(a2, wordSize))); 3951 __ subs(cnt1, cnt1, wordSize); 3952 __ eor(tmp1, tmp1, tmp2); 3953 __ cbnz(tmp1, NOT_EQUAL); 3954 __ br(__ GT, SMALL_LOOP); 3955 __ bind(POST_LOOP); 3956 __ ldr(tmp1, Address(a1, cnt1)); 3957 __ ldr(tmp2, Address(a2, cnt1)); 3958 __ eor(tmp1, tmp1, tmp2); 3959 __ cbnz(tmp1, NOT_EQUAL); 3960 __ bind(EQUAL); 3961 __ mov(result, true); 3962 __ bind(NOT_EQUAL); 3963 if (!UseSIMDForArrayEquals) { 3964 __ pop(spilled_regs, sp); 3965 } 3966 __ bind(NOT_EQUAL_NO_POP); 3967 __ leave(); 3968 __ ret(lr); 3969 return entry; 3970 } 3971 3972 address generate_dsin_dcos(bool isCos) { 3973 __ align(CodeEntryAlignment); 3974 StubCodeMark mark(this, "StubRoutines", isCos ? 
"libmDcos" : "libmDsin"); 3975 address start = __ pc(); 3976 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw, 3977 (address)StubRoutines::aarch64::_two_over_pi, 3978 (address)StubRoutines::aarch64::_pio2, 3979 (address)StubRoutines::aarch64::_dsin_coef, 3980 (address)StubRoutines::aarch64::_dcos_coef); 3981 return start; 3982 } 3983 3984 address generate_dlog() { 3985 __ align(CodeEntryAlignment); 3986 StubCodeMark mark(this, "StubRoutines", "dlog"); 3987 address entry = __ pc(); 3988 FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4, 3989 vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19; 3990 Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4; 3991 __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3, 3992 tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5); 3993 return entry; 3994 } 3995 3996 // code for comparing 16 bytes of strings with same encoding 3997 void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) { 3998 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11; 3999 __ ldr(rscratch1, Address(__ post(str1, 8))); 4000 __ eor(rscratch2, tmp1, tmp2); 4001 __ ldr(cnt1, Address(__ post(str2, 8))); 4002 __ cbnz(rscratch2, DIFF1); 4003 __ ldr(tmp1, Address(__ post(str1, 8))); 4004 __ eor(rscratch2, rscratch1, cnt1); 4005 __ ldr(tmp2, Address(__ post(str2, 8))); 4006 __ cbnz(rscratch2, DIFF2); 4007 } 4008 4009 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding 4010 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 4011 Label &DIFF2) { 4012 Register cnt1 = r2, tmp1 = r10, tmp2 = r11, tmp3 = r12; 4013 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; 4014 4015 __ ldrq(vtmp, Address(__ post(tmp2, 16))); 4016 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4017 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 4018 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 4019 4020 __ fmovd(tmpL, vtmp3); 4021 __ eor(rscratch2, tmp3, tmpL); 4022 __ cbnz(rscratch2, DIFF2); 4023 4024 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4025 __ umov(tmpL, vtmp3, __ D, 1); 4026 __ eor(rscratch2, tmpU, tmpL); 4027 __ cbnz(rscratch2, DIFF1); 4028 4029 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 4030 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4031 __ fmovd(tmpL, vtmp); 4032 __ eor(rscratch2, tmp3, tmpL); 4033 __ cbnz(rscratch2, DIFF2); 4034 4035 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4036 __ umov(tmpL, vtmp, __ D, 1); 4037 __ eor(rscratch2, tmpU, tmpL); 4038 __ cbnz(rscratch2, DIFF1); 4039 } 4040 4041 // r0 = result 4042 // r1 = str1 4043 // r2 = cnt1 4044 // r3 = str2 4045 // r4 = cnt2 4046 // r10 = tmp1 4047 // r11 = tmp2 4048 address generate_compare_long_string_different_encoding(bool isLU) { 4049 __ align(CodeEntryAlignment); 4050 StubCodeMark mark(this, "StubRoutines", isLU 4051 ? 
"compare_long_string_different_encoding LU" 4052 : "compare_long_string_different_encoding UL"); 4053 address entry = __ pc(); 4054 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, 4055 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, SMALL_LOOP_ENTER, 4056 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; 4057 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 4058 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; 4059 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; 4060 RegSet spilled_regs = RegSet::of(tmp3, tmp4); 4061 4062 int prefetchLoopExitCondition = MAX(32, SoftwarePrefetchHintDistance/2); 4063 4064 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); 4065 // cnt2 == amount of characters left to compare 4066 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL)) 4067 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 4068 __ add(str1, str1, isLU ? wordSize/2 : wordSize); 4069 __ add(str2, str2, isLU ? wordSize : wordSize/2); 4070 __ fmovd(isLU ? tmp1 : tmp2, vtmp); 4071 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case. 4072 __ add(str1, str1, cnt2, __ LSL, isLU ? 0 : 1); 4073 __ eor(rscratch2, tmp1, tmp2); 4074 __ add(str2, str2, cnt2, __ LSL, isLU ? 1 : 0); 4075 __ mov(rscratch1, tmp2); 4076 __ cbnz(rscratch2, CALCULATE_DIFFERENCE); 4077 Register strU = isLU ? str2 : str1, 4078 strL = isLU ? str1 : str2, 4079 tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison 4080 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison 4081 __ push(spilled_regs, sp); 4082 __ sub(tmp2, strL, cnt2); // strL pointer to load from 4083 __ sub(cnt1, strU, cnt2, __ LSL, 1); // strU pointer to load from 4084 4085 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4086 4087 if (SoftwarePrefetchHintDistance >= 0) { 4088 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 4089 __ br(__ LT, SMALL_LOOP); 4090 __ bind(LARGE_LOOP_PREFETCH); 4091 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance)); 4092 __ mov(tmp4, 2); 4093 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 4094 __ bind(LARGE_LOOP_PREFETCH_REPEAT1); 4095 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4096 __ subs(tmp4, tmp4, 1); 4097 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1); 4098 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 4099 __ mov(tmp4, 2); 4100 __ bind(LARGE_LOOP_PREFETCH_REPEAT2); 4101 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4102 __ subs(tmp4, tmp4, 1); 4103 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2); 4104 __ sub(cnt2, cnt2, 64); 4105 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 4106 __ br(__ GE, LARGE_LOOP_PREFETCH); 4107 } 4108 __ cbz(cnt2, LOAD_LAST); // no characters left except last load 4109 __ subs(cnt2, cnt2, 16); 4110 __ br(__ LT, TAIL); 4111 __ b(SMALL_LOOP_ENTER); 4112 __ bind(SMALL_LOOP); // smaller loop 4113 __ subs(cnt2, cnt2, 16); 4114 __ bind(SMALL_LOOP_ENTER); 4115 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4116 __ br(__ GE, SMALL_LOOP); 4117 __ cbz(cnt2, LOAD_LAST); 4118 __ bind(TAIL); // 1..15 characters left 4119 __ subs(zr, cnt2, -8); 4120 __ br(__ GT, TAIL_LOAD_16); 4121 __ ldrd(vtmp, Address(tmp2)); 4122 __ zip1(vtmp3, __ T8B, vtmp, vtmpZ); 4123 4124 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4125 __ fmovd(tmpL, vtmp3); 4126 __ eor(rscratch2, tmp3, tmpL); 4127 __ cbnz(rscratch2, DIFF2); 4128 __ umov(tmpL, vtmp3, __ D, 1); 4129 __ eor(rscratch2, tmpU, tmpL); 4130 __ cbnz(rscratch2, DIFF1); 4131 __ b(LOAD_LAST); 4132 __ bind(TAIL_LOAD_16); 4133 __ ldrq(vtmp, Address(tmp2)); 4134 __ ldr(tmpU, Address(__ post(cnt1, 
8))); 4135 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 4136 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 4137 __ fmovd(tmpL, vtmp3); 4138 __ eor(rscratch2, tmp3, tmpL); 4139 __ cbnz(rscratch2, DIFF2); 4140 4141 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4142 __ umov(tmpL, vtmp3, __ D, 1); 4143 __ eor(rscratch2, tmpU, tmpL); 4144 __ cbnz(rscratch2, DIFF1); 4145 4146 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4147 __ fmovd(tmpL, vtmp); 4148 __ eor(rscratch2, tmp3, tmpL); 4149 __ cbnz(rscratch2, DIFF2); 4150 4151 __ umov(tmpL, vtmp, __ D, 1); 4152 __ eor(rscratch2, tmpU, tmpL); 4153 __ cbnz(rscratch2, DIFF1); 4154 __ b(LOAD_LAST); 4155 __ bind(DIFF2); 4156 __ mov(tmpU, tmp3); 4157 __ bind(DIFF1); 4158 __ pop(spilled_regs, sp); 4159 __ b(CALCULATE_DIFFERENCE); 4160 __ bind(LOAD_LAST); 4161 __ pop(spilled_regs, sp); 4162 4163 __ ldrs(vtmp, Address(strL)); 4164 __ ldr(tmpU, Address(strU)); 4165 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 4166 __ fmovd(tmpL, vtmp); 4167 4168 __ eor(rscratch2, tmpU, tmpL); 4169 __ cbz(rscratch2, DONE); 4170 4171 // Find the first different characters in the longwords and 4172 // compute their difference. 4173 __ bind(CALCULATE_DIFFERENCE); 4174 __ rev(rscratch2, rscratch2); 4175 __ clz(rscratch2, rscratch2); 4176 __ andr(rscratch2, rscratch2, -16); 4177 __ lsrv(tmp1, tmp1, rscratch2); 4178 __ uxthw(tmp1, tmp1); 4179 __ lsrv(rscratch1, rscratch1, rscratch2); 4180 __ uxthw(rscratch1, rscratch1); 4181 __ subw(result, tmp1, rscratch1); 4182 __ bind(DONE); 4183 __ ret(lr); 4184 return entry; 4185 } 4186 4187 // r0 = result 4188 // r1 = str1 4189 // r2 = cnt1 4190 // r3 = str2 4191 // r4 = cnt2 4192 // r10 = tmp1 4193 // r11 = tmp2 4194 address generate_compare_long_string_same_encoding(bool isLL) { 4195 __ align(CodeEntryAlignment); 4196 StubCodeMark mark(this, "StubRoutines", isLL 4197 ? "compare_long_string_same_encoding LL" 4198 : "compare_long_string_same_encoding UU"); 4199 address entry = __ pc(); 4200 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 4201 tmp1 = r10, tmp2 = r11; 4202 Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL, 4203 LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF, 4204 DIFF_LAST_POSITION, DIFF_LAST_POSITION2; 4205 // exit from large loop when less than 64 bytes left to read or we're about 4206 // to prefetch memory behind array border 4207 int largeLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); 4208 // cnt1/cnt2 contains amount of characters to compare. cnt1 can be re-used 4209 // update cnt2 counter with already loaded 8 bytes 4210 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2)); 4211 // update pointers, because of previous read 4212 __ add(str1, str1, wordSize); 4213 __ add(str2, str2, wordSize); 4214 if (SoftwarePrefetchHintDistance >= 0) { 4215 __ bind(LARGE_LOOP_PREFETCH); 4216 __ prfm(Address(str1, SoftwarePrefetchHintDistance)); 4217 __ prfm(Address(str2, SoftwarePrefetchHintDistance)); 4218 compare_string_16_bytes_same(DIFF, DIFF2); 4219 compare_string_16_bytes_same(DIFF, DIFF2); 4220 __ sub(cnt2, cnt2, isLL ? 64 : 32); 4221 compare_string_16_bytes_same(DIFF, DIFF2); 4222 __ subs(rscratch2, cnt2, largeLoopExitCondition); 4223 compare_string_16_bytes_same(DIFF, DIFF2); 4224 __ br(__ GT, LARGE_LOOP_PREFETCH); 4225 __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left? 4226 // less than 16 bytes left? 4227 __ subs(cnt2, cnt2, isLL ? 16 : 8); 4228 __ br(__ LT, TAIL); 4229 } 4230 __ bind(SMALL_LOOP); 4231 compare_string_16_bytes_same(DIFF, DIFF2); 4232 __ subs(cnt2, cnt2, isLL ? 
16 : 8); 4233 __ br(__ GE, SMALL_LOOP); 4234 __ bind(TAIL); 4235 __ adds(cnt2, cnt2, isLL ? 16 : 8); 4236 __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF); 4237 __ subs(cnt2, cnt2, isLL ? 8 : 4); 4238 __ br(__ LE, CHECK_LAST); 4239 __ eor(rscratch2, tmp1, tmp2); 4240 __ cbnz(rscratch2, DIFF); 4241 __ ldr(tmp1, Address(__ post(str1, 8))); 4242 __ ldr(tmp2, Address(__ post(str2, 8))); 4243 __ sub(cnt2, cnt2, isLL ? 8 : 4); 4244 __ bind(CHECK_LAST); 4245 if (!isLL) { 4246 __ add(cnt2, cnt2, cnt2); // now in bytes 4247 } 4248 __ eor(rscratch2, tmp1, tmp2); 4249 __ cbnz(rscratch2, DIFF); 4250 __ ldr(rscratch1, Address(str1, cnt2)); 4251 __ ldr(cnt1, Address(str2, cnt2)); 4252 __ eor(rscratch2, rscratch1, cnt1); 4253 __ cbz(rscratch2, LENGTH_DIFF); 4254 // Find the first different characters in the longwords and 4255 // compute their difference. 4256 __ bind(DIFF2); 4257 __ rev(rscratch2, rscratch2); 4258 __ clz(rscratch2, rscratch2); 4259 __ andr(rscratch2, rscratch2, isLL ? -8 : -16); 4260 __ lsrv(rscratch1, rscratch1, rscratch2); 4261 if (isLL) { 4262 __ lsrv(cnt1, cnt1, rscratch2); 4263 __ uxtbw(rscratch1, rscratch1); 4264 __ uxtbw(cnt1, cnt1); 4265 } else { 4266 __ lsrv(cnt1, cnt1, rscratch2); 4267 __ uxthw(rscratch1, rscratch1); 4268 __ uxthw(cnt1, cnt1); 4269 } 4270 __ subw(result, rscratch1, cnt1); 4271 __ b(LENGTH_DIFF); 4272 __ bind(DIFF); 4273 __ rev(rscratch2, rscratch2); 4274 __ clz(rscratch2, rscratch2); 4275 __ andr(rscratch2, rscratch2, isLL ? -8 : -16); 4276 __ lsrv(tmp1, tmp1, rscratch2); 4277 if (isLL) { 4278 __ lsrv(tmp2, tmp2, rscratch2); 4279 __ uxtbw(tmp1, tmp1); 4280 __ uxtbw(tmp2, tmp2); 4281 } else { 4282 __ lsrv(tmp2, tmp2, rscratch2); 4283 __ uxthw(tmp1, tmp1); 4284 __ uxthw(tmp2, tmp2); 4285 } 4286 __ subw(result, tmp1, tmp2); 4287 __ b(LENGTH_DIFF); 4288 __ bind(LAST_CHECK_AND_LENGTH_DIFF); 4289 __ eor(rscratch2, tmp1, tmp2); 4290 __ cbnz(rscratch2, DIFF); 4291 __ bind(LENGTH_DIFF); 4292 __ ret(lr); 4293 return entry; 4294 } 4295 4296 void generate_compare_long_strings() { 4297 StubRoutines::aarch64::_compare_long_string_LL 4298 = generate_compare_long_string_same_encoding(true); 4299 StubRoutines::aarch64::_compare_long_string_UU 4300 = generate_compare_long_string_same_encoding(false); 4301 StubRoutines::aarch64::_compare_long_string_LU 4302 = generate_compare_long_string_different_encoding(true); 4303 StubRoutines::aarch64::_compare_long_string_UL 4304 = generate_compare_long_string_different_encoding(false); 4305 } 4306 4307 // R0 = result 4308 // R1 = str2 4309 // R2 = cnt1 4310 // R3 = str1 4311 // R4 = cnt2 4312 // This generic linear code uses a few additional ideas which make it faster: 4313 // 1) we can safely keep at least the 1st register of the pattern (since length >= 8) 4314 // in order to skip the initial load (helps on systems with 1 load pipeline) 4315 // 2) we can use the "fast" algorithm of finding a single character to search for 4316 // the first symbol with fewer branches (1 branch per loaded register instead 4317 // of a branch per symbol), which is where constants like 4318 // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from 4319 // 3) after loading and analyzing the 1st register of the source string, it can be 4320 // used to search for every occurrence of the 1st character, saving a few loads 4321 // compared with a simpler-but-slower implementation 4322 // 4) in order to avoid lots of push/pop operations, the code below heavily 4323 // re-uses/re-initializes/compresses register values, which makes the code 4324 // larger and a bit less readable, however,
most of extra operations are 4325 // issued during loads or branches, so, penalty is minimal 4326 address generate_string_indexof_linear(bool str1_isL, bool str2_isL) { 4327 const char* stubName = str1_isL 4328 ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul") 4329 : "indexof_linear_uu"; 4330 __ align(CodeEntryAlignment); 4331 StubCodeMark mark(this, "StubRoutines", stubName); 4332 address entry = __ pc(); 4333 4334 int str1_chr_size = str1_isL ? 1 : 2; 4335 int str2_chr_size = str2_isL ? 1 : 2; 4336 int str1_chr_shift = str1_isL ? 0 : 1; 4337 int str2_chr_shift = str2_isL ? 0 : 1; 4338 bool isL = str1_isL && str2_isL; 4339 // parameters 4340 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4; 4341 // temporary registers 4342 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23; 4343 RegSet spilled_regs = RegSet::range(tmp1, tmp4); 4344 // redefinitions 4345 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3; 4346 4347 __ push(spilled_regs, sp); 4348 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, 4349 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 4350 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 4351 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 4352 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 4353 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 4354 // Read whole register from str1. It is safe, because length >=8 here 4355 __ ldr(ch1, Address(str1)); 4356 // Read whole register from str2. It is safe, because length >=8 here 4357 __ ldr(ch2, Address(str2)); 4358 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF); 4359 if (str1_isL != str2_isL) { 4360 __ eor(v0, __ T16B, v0, v0); 4361 } 4362 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 4363 __ mul(first, first, tmp1); 4364 // check if we have less than 1 register to check 4365 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1); 4366 if (str1_isL != str2_isL) { 4367 __ fmovd(v1, ch1); 4368 } 4369 __ br(__ LE, L_SMALL); 4370 __ eor(ch2, first, ch2); 4371 if (str1_isL != str2_isL) { 4372 __ zip1(v1, __ T16B, v1, v0); 4373 } 4374 __ sub(tmp2, ch2, tmp1); 4375 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4376 __ bics(tmp2, tmp2, ch2); 4377 if (str1_isL != str2_isL) { 4378 __ fmovd(ch1, v1); 4379 } 4380 __ br(__ NE, L_HAS_ZERO); 4381 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 4382 __ add(result, result, wordSize/str2_chr_size); 4383 __ add(str2, str2, wordSize); 4384 __ br(__ LT, L_POST_LOOP); 4385 __ BIND(L_LOOP); 4386 __ ldr(ch2, Address(str2)); 4387 __ eor(ch2, first, ch2); 4388 __ sub(tmp2, ch2, tmp1); 4389 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4390 __ bics(tmp2, tmp2, ch2); 4391 __ br(__ NE, L_HAS_ZERO); 4392 __ BIND(L_LOOP_PROCEED); 4393 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 4394 __ add(str2, str2, wordSize); 4395 __ add(result, result, wordSize/str2_chr_size); 4396 __ br(__ GE, L_LOOP); 4397 __ BIND(L_POST_LOOP); 4398 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check 4399 __ br(__ LE, NOMATCH); 4400 __ ldr(ch2, Address(str2)); 4401 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 4402 __ eor(ch2, first, ch2); 4403 __ sub(tmp2, ch2, tmp1); 4404 __ orr(ch2, ch2, str2_isL ? 
0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4405 __ mov(tmp4, -1); // all bits set 4406 __ b(L_SMALL_PROCEED); 4407 __ align(OptoLoopAlignment); 4408 __ BIND(L_SMALL); 4409 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 4410 __ eor(ch2, first, ch2); 4411 if (str1_isL != str2_isL) { 4412 __ zip1(v1, __ T16B, v1, v0); 4413 } 4414 __ sub(tmp2, ch2, tmp1); 4415 __ mov(tmp4, -1); // all bits set 4416 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4417 if (str1_isL != str2_isL) { 4418 __ fmovd(ch1, v1); // move converted 4 symbols 4419 } 4420 __ BIND(L_SMALL_PROCEED); 4421 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits. 4422 __ bic(tmp2, tmp2, ch2); 4423 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check 4424 __ rbit(tmp2, tmp2); 4425 __ br(__ EQ, NOMATCH); 4426 __ BIND(L_SMALL_HAS_ZERO_LOOP); 4427 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's 4428 __ cmp(cnt1, u1(wordSize/str2_chr_size)); 4429 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2); 4430 if (str2_isL) { // LL 4431 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 4432 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 4433 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 4434 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 4435 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4436 } else { 4437 __ mov(ch2, 0xE); // all bits in byte set except last one 4438 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4439 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4440 __ lslv(tmp2, tmp2, tmp4); 4441 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4442 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4443 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4444 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4445 } 4446 __ cmp(ch1, ch2); 4447 __ mov(tmp4, wordSize/str2_chr_size); 4448 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4449 __ BIND(L_SMALL_CMP_LOOP); 4450 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 4451 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 4452 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 4453 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 4454 __ add(tmp4, tmp4, 1); 4455 __ cmp(tmp4, cnt1); 4456 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP); 4457 __ cmp(first, ch2); 4458 __ br(__ EQ, L_SMALL_CMP_LOOP); 4459 __ BIND(L_SMALL_CMP_LOOP_NOMATCH); 4460 __ cbz(tmp2, NOMATCH); // no more matches. exit 4461 __ clz(tmp4, tmp2); 4462 __ add(result, result, 1); // advance index 4463 __ add(str2, str2, str2_chr_size); // advance pointer 4464 __ b(L_SMALL_HAS_ZERO_LOOP); 4465 __ align(OptoLoopAlignment); 4466 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP); 4467 __ cmp(first, ch2); 4468 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4469 __ b(DONE); 4470 __ align(OptoLoopAlignment); 4471 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2); 4472 if (str2_isL) { // LL 4473 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 4474 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 
4475 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 4476 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 4477 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4478 } else { 4479 __ mov(ch2, 0xE); // all bits in byte set except last one 4480 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4481 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4482 __ lslv(tmp2, tmp2, tmp4); 4483 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4484 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4485 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4486 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4487 } 4488 __ cmp(ch1, ch2); 4489 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4490 __ b(DONE); 4491 __ align(OptoLoopAlignment); 4492 __ BIND(L_HAS_ZERO); 4493 __ rbit(tmp2, tmp2); 4494 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's 4495 // Now, perform compression of counters(cnt2 and cnt1) into one register. 4496 // It's fine because both counters are 32bit and are not changed in this 4497 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop. 4498 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2); 4499 __ sub(result, result, 1); 4500 __ BIND(L_HAS_ZERO_LOOP); 4501 __ mov(cnt1, wordSize/str2_chr_size); 4502 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2); 4503 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare 4504 if (str2_isL) { 4505 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 4506 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4507 __ lslv(tmp2, tmp2, tmp4); 4508 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4509 __ add(tmp4, tmp4, 1); 4510 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4511 __ lsl(tmp2, tmp2, 1); 4512 __ mov(tmp4, wordSize/str2_chr_size); 4513 } else { 4514 __ mov(ch2, 0xE); 4515 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4516 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4517 __ lslv(tmp2, tmp2, tmp4); 4518 __ add(tmp4, tmp4, 1); 4519 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4520 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 4521 __ lsl(tmp2, tmp2, 1); 4522 __ mov(tmp4, wordSize/str2_chr_size); 4523 __ sub(str2, str2, str2_chr_size); 4524 } 4525 __ cmp(ch1, ch2); 4526 __ mov(tmp4, wordSize/str2_chr_size); 4527 __ br(__ NE, L_CMP_LOOP_NOMATCH); 4528 __ BIND(L_CMP_LOOP); 4529 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 4530 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 4531 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 4532 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 4533 __ add(tmp4, tmp4, 1); 4534 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2); 4535 __ br(__ GE, L_CMP_LOOP_LAST_CMP); 4536 __ cmp(cnt1, ch2); 4537 __ br(__ EQ, L_CMP_LOOP); 4538 __ BIND(L_CMP_LOOP_NOMATCH); 4539 // here we're not matched 4540 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. 
Proceed to main loop 4541 __ clz(tmp4, tmp2); 4542 __ add(str2, str2, str2_chr_size); // advance pointer 4543 __ b(L_HAS_ZERO_LOOP); 4544 __ align(OptoLoopAlignment); 4545 __ BIND(L_CMP_LOOP_LAST_CMP); 4546 __ cmp(cnt1, ch2); 4547 __ br(__ NE, L_CMP_LOOP_NOMATCH); 4548 __ b(DONE); 4549 __ align(OptoLoopAlignment); 4550 __ BIND(L_CMP_LOOP_LAST_CMP2); 4551 if (str2_isL) { 4552 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 4553 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4554 __ lslv(tmp2, tmp2, tmp4); 4555 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4556 __ add(tmp4, tmp4, 1); 4557 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4558 __ lsl(tmp2, tmp2, 1); 4559 } else { 4560 __ mov(ch2, 0xE); 4561 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4562 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4563 __ lslv(tmp2, tmp2, tmp4); 4564 __ add(tmp4, tmp4, 1); 4565 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4566 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 4567 __ lsl(tmp2, tmp2, 1); 4568 __ sub(str2, str2, str2_chr_size); 4569 } 4570 __ cmp(ch1, ch2); 4571 __ br(__ NE, L_CMP_LOOP_NOMATCH); 4572 __ b(DONE); 4573 __ align(OptoLoopAlignment); 4574 __ BIND(L_HAS_ZERO_LOOP_NOMATCH); 4575 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until 4576 // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP, 4577 // so, result was increased at max by wordSize/str2_chr_size - 1, so, 4578 // respective high bit wasn't changed. L_LOOP_PROCEED will increase 4579 // result by analyzed characters value, so, we can just reset lower bits 4580 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL 4581 // 2) restore cnt1 and cnt2 values from "compressed" cnt2 4582 // 3) advance str2 value to represent next str2 octet. result & 7/3 is 4583 // index of last analyzed substring inside current octet. So, str2 in at 4584 // respective start address. 
We need to advance it to next octet 4585 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed 4586 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2); 4587 __ bfm(result, zr, 0, 2 - str2_chr_shift); 4588 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2 4589 __ movw(cnt2, cnt2); 4590 __ b(L_LOOP_PROCEED); 4591 __ align(OptoLoopAlignment); 4592 __ BIND(NOMATCH); 4593 __ mov(result, -1); 4594 __ BIND(DONE); 4595 __ pop(spilled_regs, sp); 4596 __ ret(lr); 4597 return entry; 4598 } 4599 4600 void generate_string_indexof_stubs() { 4601 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true); 4602 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false); 4603 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); 4604 } 4605 4606 void inflate_and_store_2_fp_registers(bool generatePrfm, 4607 FloatRegister src1, FloatRegister src2) { 4608 Register dst = r1; 4609 __ zip1(v1, __ T16B, src1, v0); 4610 __ zip2(v2, __ T16B, src1, v0); 4611 if (generatePrfm) { 4612 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM); 4613 } 4614 __ zip1(v3, __ T16B, src2, v0); 4615 __ zip2(v4, __ T16B, src2, v0); 4616 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64))); 4617 } 4618 4619 // R0 = src 4620 // R1 = dst 4621 // R2 = len 4622 // R3 = len >> 3 4623 // V0 = 0 4624 // v1 = loaded 8 bytes 4625 address generate_large_byte_array_inflate() { 4626 __ align(CodeEntryAlignment); 4627 StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate"); 4628 address entry = __ pc(); 4629 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE; 4630 Register src = r0, dst = r1, len = r2, octetCounter = r3; 4631 const int large_loop_threshold = MAX(64, SoftwarePrefetchHintDistance)/8 + 4; 4632 4633 // do one more 8-byte read to have address 16-byte aligned in most cases 4634 // also use single store instruction 4635 __ ldrd(v2, __ post(src, 8)); 4636 __ sub(octetCounter, octetCounter, 2); 4637 __ zip1(v1, __ T16B, v1, v0); 4638 __ zip1(v2, __ T16B, v2, v0); 4639 __ st1(v1, v2, __ T16B, __ post(dst, 32)); 4640 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 4641 __ subs(rscratch1, octetCounter, large_loop_threshold); 4642 __ br(__ LE, LOOP_START); 4643 __ b(LOOP_PRFM_START); 4644 __ bind(LOOP_PRFM); 4645 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 4646 __ bind(LOOP_PRFM_START); 4647 __ prfm(Address(src, SoftwarePrefetchHintDistance)); 4648 __ sub(octetCounter, octetCounter, 8); 4649 __ subs(rscratch1, octetCounter, large_loop_threshold); 4650 inflate_and_store_2_fp_registers(true, v3, v4); 4651 inflate_and_store_2_fp_registers(true, v5, v6); 4652 __ br(__ GT, LOOP_PRFM); 4653 __ cmp(octetCounter, (u1)8); 4654 __ br(__ LT, DONE); 4655 __ bind(LOOP); 4656 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 4657 __ bind(LOOP_START); 4658 __ sub(octetCounter, octetCounter, 8); 4659 __ cmp(octetCounter, (u1)8); 4660 inflate_and_store_2_fp_registers(false, v3, v4); 4661 inflate_and_store_2_fp_registers(false, v5, v6); 4662 __ br(__ GE, LOOP); 4663 __ bind(DONE); 4664 __ ret(lr); 4665 return entry; 4666 } 4667 4668 /** 4669 * Arguments: 4670 * 4671 * Input: 4672 * c_rarg0 - current state address 4673 * c_rarg1 - H key address 4674 * c_rarg2 - data address 4675 * c_rarg3 - number of blocks 4676 * 4677 * Output: 4678 * Updated state at c_rarg0 4679 */ 4680 address generate_ghash_processBlocks() { 4681 // Bafflingly, GCM uses 
little-endian for the byte order, but 4682 // big-endian for the bit order. For example, the polynomial 1 is 4683 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 4684 // 4685 // So, we must either reverse the bytes in each word and do 4686 // everything big-endian or reverse the bits in each byte and do 4687 // it little-endian. On AArch64 it's more idiomatic to reverse 4688 // the bits in each byte (we have an instruction, RBIT, to do 4689 // that) and keep the data in little-endian bit order throughout the 4690 // calculation, bit-reversing the inputs and outputs. 4691 4692 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks"); 4693 __ align(wordSize * 2); 4694 address p = __ pc(); 4695 __ emit_int64(0x87); // The low-order bits of the field 4696 // polynomial (i.e. p = z^7+z^2+z+1) 4697 // repeated in the low and high parts of a 4698 // 128-bit vector 4699 __ emit_int64(0x87); 4700 4701 __ align(CodeEntryAlignment); 4702 address start = __ pc(); 4703 4704 Register state = c_rarg0; 4705 Register subkeyH = c_rarg1; 4706 Register data = c_rarg2; 4707 Register blocks = c_rarg3; 4708 4709 FloatRegister vzr = v30; 4710 __ eor(vzr, __ T16B, vzr, vzr); // zero register 4711 4712 __ ldrq(v0, Address(state)); 4713 __ ldrq(v1, Address(subkeyH)); 4714 4715 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 4716 __ rbit(v0, __ T16B, v0); 4717 __ rev64(v1, __ T16B, v1); 4718 __ rbit(v1, __ T16B, v1); 4719 4720 __ ldrq(v26, p); 4721 4722 __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 4723 __ eor(v16, __ T16B, v16, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 4724 4725 { 4726 Label L_ghash_loop; 4727 __ bind(L_ghash_loop); 4728 4729 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 4730 // reversing each byte 4731 __ rbit(v2, __ T16B, v2); 4732 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 4733 4734 // Multiply state in v2 by subkey in v1 4735 ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 4736 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16, 4737 /*temps*/v6, v20, v18, v21); 4738 // Reduce v7:v5 by the field polynomial 4739 ghash_reduce(v0, v5, v7, v26, vzr, v20); 4740 4741 __ sub(blocks, blocks, 1); 4742 __ cbnz(blocks, L_ghash_loop); 4743 } 4744 4745 // The bit-reversed result is at this point in v0 4746 __ rev64(v1, __ T16B, v0); 4747 __ rbit(v1, __ T16B, v1); 4748 4749 __ st1(v1, __ T16B, state); 4750 __ ret(lr); 4751 4752 return start; 4753 } 4754 4755 // Continuation point for throwing of implicit exceptions that are 4756 // not handled in the current activation. Fabricates an exception 4757 // oop and initiates normal exception dispatching in this 4758 // frame. Since we need to preserve callee-saved values (currently 4759 // only for C2, but done for C1 as well) we need a callee-saved oop 4760 // map and therefore have to make these stubs into RuntimeStubs 4761 // rather than BufferBlobs. If the compiler needs all registers to 4762 // be preserved between the fault point and the exception handler 4763 // then it must assume responsibility for that in 4764 // AbstractCompiler::continuation_for_implicit_null_exception or 4765 // continuation_for_implicit_division_by_zero_exception. All other 4766 // implicit exceptions (e.g., NullPointerException or 4767 // AbstractMethodError on entry) are either at call sites or 4768 // otherwise assume that stack unwinding will be initiated, so 4769 // caller saved registers were assumed volatile in the compiler.
4770 4771 #undef __ 4772 #define __ masm-> 4773 4774 address generate_throw_exception(const char* name, 4775 address runtime_entry, 4776 Register arg1 = noreg, 4777 Register arg2 = noreg) { 4778 // Information about frame layout at time of blocking runtime call. 4779 // Note that we only have to preserve callee-saved registers since 4780 // the compilers are responsible for supplying a continuation point 4781 // if they expect all registers to be preserved. 4782 // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0 4783 enum layout { 4784 rfp_off = 0, 4785 rfp_off2, 4786 return_off, 4787 return_off2, 4788 framesize // inclusive of return address 4789 }; 4790 4791 int insts_size = 512; 4792 int locs_size = 64; 4793 4794 CodeBuffer code(name, insts_size, locs_size); 4795 OopMapSet* oop_maps = new OopMapSet(); 4796 MacroAssembler* masm = new MacroAssembler(&code); 4797 4798 address start = __ pc(); 4799 4800 // This is an inlined and slightly modified version of call_VM 4801 // which has the ability to fetch the return PC out of 4802 // thread-local storage and also sets up last_Java_sp slightly 4803 // differently than the real call_VM 4804 4805 __ enter(); // Save FP and LR before call 4806 4807 assert(is_even(framesize/2), "sp not 16-byte aligned"); 4808 4809 // lr and fp are already in place 4810 __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog 4811 4812 int frame_complete = __ pc() - start; 4813 4814 // Set up last_Java_sp and last_Java_fp 4815 address the_pc = __ pc(); 4816 __ set_last_Java_frame(sp, rfp, the_pc, rscratch1); 4817 4818 // Call runtime 4819 if (arg1 != noreg) { 4820 assert(arg2 != c_rarg1, "clobbered"); 4821 __ mov(c_rarg1, arg1); 4822 } 4823 if (arg2 != noreg) { 4824 __ mov(c_rarg2, arg2); 4825 } 4826 __ mov(c_rarg0, rthread); 4827 BLOCK_COMMENT("call runtime_entry"); 4828 __ mov(rscratch1, runtime_entry); 4829 __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1); 4830 4831 // Generate oop map 4832 OopMap* map = new OopMap(framesize, 0); 4833 4834 oop_maps->add_gc_map(the_pc - start, map); 4835 4836 __ reset_last_Java_frame(true); 4837 __ maybe_isb(); 4838 4839 __ leave(); 4840 4841 // check for pending exceptions 4842 #ifdef ASSERT 4843 Label L; 4844 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 4845 __ cbnz(rscratch1, L); 4846 __ should_not_reach_here(); 4847 __ bind(L); 4848 #endif // ASSERT 4849 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 4850 4851 4852 // codeBlob framesize is in words (not VMRegImpl::slot_size) 4853 RuntimeStub* stub = 4854 RuntimeStub::new_runtime_stub(name, 4855 &code, 4856 frame_complete, 4857 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 4858 oop_maps, false); 4859 return stub->entry_point(); 4860 } 4861 4862 class MontgomeryMultiplyGenerator : public MacroAssembler { 4863 4864 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 4865 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 4866 4867 RegSet _toSave; 4868 bool _squaring; 4869 4870 public: 4871 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 4872 : MacroAssembler(as->code()), _squaring(squaring) { 4873 4874 // Register allocation 4875 4876 Register reg = c_rarg0; 4877 Pa_base = reg; // Argument registers 4878 if (squaring) 4879 Pb_base = Pa_base; 4880 else 4881 Pb_base = ++reg; 4882 Pn_base = ++reg; 4883 Rlen= ++reg; 4884 inv = ++reg; 4885 Pm_base = ++reg; 4886 4887 // Working registers: 4888 Ra = ++reg; // The current digit of a, b, n, and m. 
4889 Rb = ++reg; 4890 Rm = ++reg; 4891 Rn = ++reg; 4892 4893 Pa = ++reg; // Pointers to the current/next digit of a, b, n, and m. 4894 Pb = ++reg; 4895 Pm = ++reg; 4896 Pn = ++reg; 4897 4898 t0 = ++reg; // Three registers which form a 4899 t1 = ++reg; // triple-precision accumulator. 4900 t2 = ++reg; 4901 4902 Ri = ++reg; // Inner and outer loop indexes. 4903 Rj = ++reg; 4904 4905 Rhi_ab = ++reg; // Product registers: low and high parts 4906 Rlo_ab = ++reg; // of a*b and m*n. 4907 Rhi_mn = ++reg; 4908 Rlo_mn = ++reg; 4909 4910 // r19 and up are callee-saved. 4911 _toSave = RegSet::range(r19, reg) + Pm_base; 4912 } 4913 4914 private: 4915 void save_regs() { 4916 push(_toSave, sp); 4917 } 4918 4919 void restore_regs() { 4920 pop(_toSave, sp); 4921 } 4922 4923 template <typename T> 4924 void unroll_2(Register count, T block) { 4925 Label loop, end, odd; 4926 tbnz(count, 0, odd); 4927 cbz(count, end); 4928 align(16); 4929 bind(loop); 4930 (this->*block)(); 4931 bind(odd); 4932 (this->*block)(); 4933 subs(count, count, 2); 4934 br(Assembler::GT, loop); 4935 bind(end); 4936 } 4937 4938 template <typename T> 4939 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 4940 Label loop, end, odd; 4941 tbnz(count, 0, odd); 4942 cbz(count, end); 4943 align(16); 4944 bind(loop); 4945 (this->*block)(d, s, tmp); 4946 bind(odd); 4947 (this->*block)(d, s, tmp); 4948 subs(count, count, 2); 4949 br(Assembler::GT, loop); 4950 bind(end); 4951 } 4952 4953 void pre1(RegisterOrConstant i) { 4954 block_comment("pre1"); 4955 // Pa = Pa_base; 4956 // Pb = Pb_base + i; 4957 // Pm = Pm_base; 4958 // Pn = Pn_base + i; 4959 // Ra = *Pa; 4960 // Rb = *Pb; 4961 // Rm = *Pm; 4962 // Rn = *Pn; 4963 ldr(Ra, Address(Pa_base)); 4964 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 4965 ldr(Rm, Address(Pm_base)); 4966 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4967 lea(Pa, Address(Pa_base)); 4968 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 4969 lea(Pm, Address(Pm_base)); 4970 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4971 4972 // Zero the m*n result. 4973 mov(Rhi_mn, zr); 4974 mov(Rlo_mn, zr); 4975 } 4976 4977 // The core multiply-accumulate step of a Montgomery 4978 // multiplication. The idea is to schedule operations as a 4979 // pipeline so that instructions with long latencies (loads and 4980 // multiplies) have time to complete before their results are 4981 // used. This most benefits in-order implementations of the 4982 // architecture but out-of-order ones also benefit. 4983 void step() { 4984 block_comment("step"); 4985 // MACC(Ra, Rb, t0, t1, t2); 4986 // Ra = *++Pa; 4987 // Rb = *--Pb; 4988 umulh(Rhi_ab, Ra, Rb); 4989 mul(Rlo_ab, Ra, Rb); 4990 ldr(Ra, pre(Pa, wordSize)); 4991 ldr(Rb, pre(Pb, -wordSize)); 4992 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 4993 // previous iteration.
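// Note that the previous iteration's m*n product is only folded into the
// accumulator here, after this iteration's a*b multiplies and the Ra/Rb
// loads have been issued; this is the pipelining described in the comment
// above step().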
4994 // MACC(Rm, Rn, t0, t1, t2); 4995 // Rm = *++Pm; 4996 // Rn = *--Pn; 4997 umulh(Rhi_mn, Rm, Rn); 4998 mul(Rlo_mn, Rm, Rn); 4999 ldr(Rm, pre(Pm, wordSize)); 5000 ldr(Rn, pre(Pn, -wordSize)); 5001 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 5002 } 5003 5004 void post1() { 5005 block_comment("post1"); 5006 5007 // MACC(Ra, Rb, t0, t1, t2); 5008 // Ra = *++Pa; 5009 // Rb = *--Pb; 5010 umulh(Rhi_ab, Ra, Rb); 5011 mul(Rlo_ab, Ra, Rb); 5012 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 5013 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 5014 5015 // *Pm = Rm = t0 * inv; 5016 mul(Rm, t0, inv); 5017 str(Rm, Address(Pm)); 5018 5019 // MACC(Rm, Rn, t0, t1, t2); 5020 // t0 = t1; t1 = t2; t2 = 0; 5021 umulh(Rhi_mn, Rm, Rn); 5022 5023 #ifndef PRODUCT 5024 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 5025 { 5026 mul(Rlo_mn, Rm, Rn); 5027 add(Rlo_mn, t0, Rlo_mn); 5028 Label ok; 5029 cbz(Rlo_mn, ok); { 5030 stop("broken Montgomery multiply"); 5031 } bind(ok); 5032 } 5033 #endif 5034 // We have very carefully set things up so that 5035 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 5036 // the lower half of Rm * Rn because we know the result already: 5037 // it must be -t0. t0 + (-t0) must generate a carry iff 5038 // t0 != 0. So, rather than do a mul and an adds we just set 5039 // the carry flag iff t0 is nonzero. 5040 // 5041 // mul(Rlo_mn, Rm, Rn); 5042 // adds(zr, t0, Rlo_mn); 5043 subs(zr, t0, 1); // Set carry iff t0 is nonzero 5044 adcs(t0, t1, Rhi_mn); 5045 adc(t1, t2, zr); 5046 mov(t2, zr); 5047 } 5048 5049 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 5050 block_comment("pre2"); 5051 // Pa = Pa_base + i-len; 5052 // Pb = Pb_base + len; 5053 // Pm = Pm_base + i-len; 5054 // Pn = Pn_base + len; 5055 5056 if (i.is_register()) { 5057 sub(Rj, i.as_register(), len); 5058 } else { 5059 mov(Rj, i.as_constant()); 5060 sub(Rj, Rj, len); 5061 } 5062 // Rj == i-len 5063 5064 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 5065 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 5066 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 5067 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 5068 5069 // Ra = *++Pa; 5070 // Rb = *--Pb; 5071 // Rm = *++Pm; 5072 // Rn = *--Pn; 5073 ldr(Ra, pre(Pa, wordSize)); 5074 ldr(Rb, pre(Pb, -wordSize)); 5075 ldr(Rm, pre(Pm, wordSize)); 5076 ldr(Rn, pre(Pn, -wordSize)); 5077 5078 mov(Rhi_mn, zr); 5079 mov(Rlo_mn, zr); 5080 } 5081 5082 void post2(RegisterOrConstant i, RegisterOrConstant len) { 5083 block_comment("post2"); 5084 if (i.is_constant()) { 5085 mov(Rj, i.as_constant()-len.as_constant()); 5086 } else { 5087 sub(Rj, i.as_register(), len); 5088 } 5089 5090 adds(t0, t0, Rlo_mn); // The pending m*n, low part 5091 5092 // As soon as we know the least significant digit of our result, 5093 // store it. 5094 // Pm_base[i-len] = t0; 5095 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 5096 5097 // t0 = t1; t1 = t2; t2 = 0; 5098 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 5099 adc(t1, t2, zr); 5100 mov(t2, zr); 5101 } 5102 5103 // A carry in t0 after Montgomery multiplication means that we 5104 // should subtract multiples of n from our result in m. We'll 5105 // keep doing that until there is no carry. 
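  // In C, roughly, the conditional subtraction performed by normalize()
  // below (the 'sub' helper referred to in its pseudocode) amounts to the
  // following sketch; 'borrow' stands for the inverted AArch64 carry flag
  // which the generated code keeps live in PSTATE via sbcs/sbc:
  //
  //   while (t0) {
  //     unsigned long borrow = 0;
  //     for (int i = 0; i < len; i++) {
  //       unsigned long d = Pm_base[i] - Pn_base[i] - borrow;
  //       borrow = (Pm_base[i] < Pn_base[i])
  //                || (borrow && Pm_base[i] == Pn_base[i]);
  //       Pm_base[i] = d;
  //     }
  //     t0 -= borrow;
  //   }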
5106 void normalize(RegisterOrConstant len) { 5107 block_comment("normalize"); 5108 // while (t0) 5109 // t0 = sub(Pm_base, Pn_base, t0, len); 5110 Label loop, post, again; 5111 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 5112 cbz(t0, post); { 5113 bind(again); { 5114 mov(i, zr); 5115 mov(cnt, len); 5116 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 5117 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 5118 subs(zr, zr, zr); // set carry flag, i.e. no borrow 5119 align(16); 5120 bind(loop); { 5121 sbcs(Rm, Rm, Rn); 5122 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 5123 add(i, i, 1); 5124 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 5125 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 5126 sub(cnt, cnt, 1); 5127 } cbnz(cnt, loop); 5128 sbc(t0, t0, zr); 5129 } cbnz(t0, again); 5130 } bind(post); 5131 } 5132 5133 // Move memory at s to d, reversing words. 5134 // Increments d to end of copied memory 5135 // Destroys tmp1, tmp2 5136 // Preserves len 5137 // Leaves s pointing to the address which was in d at start 5138 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 5139 assert(tmp1 < r19 && tmp2 < r19, "register corruption"); 5140 5141 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 5142 mov(tmp1, len); 5143 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 5144 sub(s, d, len, ext::uxtw, LogBytesPerWord); 5145 } 5146 // where 5147 void reverse1(Register d, Register s, Register tmp) { 5148 ldr(tmp, pre(s, -wordSize)); 5149 ror(tmp, tmp, 32); 5150 str(tmp, post(d, wordSize)); 5151 } 5152 5153 void step_squaring() { 5154 // An extra ACC 5155 step(); 5156 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 5157 } 5158 5159 void last_squaring(RegisterOrConstant i) { 5160 Label dont; 5161 // if ((i & 1) == 0) { 5162 tbnz(i.as_register(), 0, dont); { 5163 // MACC(Ra, Rb, t0, t1, t2); 5164 // Ra = *++Pa; 5165 // Rb = *--Pb; 5166 umulh(Rhi_ab, Ra, Rb); 5167 mul(Rlo_ab, Ra, Rb); 5168 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 5169 } bind(dont); 5170 } 5171 5172 void extra_step_squaring() { 5173 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 5174 5175 // MACC(Rm, Rn, t0, t1, t2); 5176 // Rm = *++Pm; 5177 // Rn = *--Pn; 5178 umulh(Rhi_mn, Rm, Rn); 5179 mul(Rlo_mn, Rm, Rn); 5180 ldr(Rm, pre(Pm, wordSize)); 5181 ldr(Rn, pre(Pn, -wordSize)); 5182 } 5183 5184 void post1_squaring() { 5185 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 5186 5187 // *Pm = Rm = t0 * inv; 5188 mul(Rm, t0, inv); 5189 str(Rm, Address(Pm)); 5190 5191 // MACC(Rm, Rn, t0, t1, t2); 5192 // t0 = t1; t1 = t2; t2 = 0; 5193 umulh(Rhi_mn, Rm, Rn); 5194 5195 #ifndef PRODUCT 5196 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 5197 { 5198 mul(Rlo_mn, Rm, Rn); 5199 add(Rlo_mn, t0, Rlo_mn); 5200 Label ok; 5201 cbz(Rlo_mn, ok); { 5202 stop("broken Montgomery multiply"); 5203 } bind(ok); 5204 } 5205 #endif 5206 // We have very carefully set things up so that 5207 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 5208 // the lower half of Rm * Rn because we know the result already: 5209 // it must be -t0. t0 + (-t0) must generate a carry iff 5210 // t0 != 0. So, rather than do a mul and an adds we just set 5211 // the carry flag iff t0 is nonzero. 
5212 // 5213 // mul(Rlo_mn, Rm, Rn); 5214 // adds(zr, t0, Rlo_mn); 5215 subs(zr, t0, 1); // Set carry iff t0 is nonzero 5216 adcs(t0, t1, Rhi_mn); 5217 adc(t1, t2, zr); 5218 mov(t2, zr); 5219 } 5220 5221 void acc(Register Rhi, Register Rlo, 5222 Register t0, Register t1, Register t2) { 5223 adds(t0, t0, Rlo); 5224 adcs(t1, t1, Rhi); 5225 adc(t2, t2, zr); 5226 } 5227 5228 public: 5229 /** 5230 * Fast Montgomery multiplication. The derivation of the 5231 * algorithm is in A Cryptographic Library for the Motorola 5232 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 5233 * 5234 * Arguments: 5235 * 5236 * Inputs for multiplication: 5237 * c_rarg0 - int array elements a 5238 * c_rarg1 - int array elements b 5239 * c_rarg2 - int array elements n (the modulus) 5240 * c_rarg3 - int length 5241 * c_rarg4 - int inv 5242 * c_rarg5 - int array elements m (the result) 5243 * 5244 * Inputs for squaring: 5245 * c_rarg0 - int array elements a 5246 * c_rarg1 - int array elements n (the modulus) 5247 * c_rarg2 - int length 5248 * c_rarg3 - int inv 5249 * c_rarg4 - int array elements m (the result) 5250 * 5251 */ 5252 address generate_multiply() { 5253 Label argh, nothing; 5254 bind(argh); 5255 stop("MontgomeryMultiply total_allocation must be <= 8192"); 5256 5257 align(CodeEntryAlignment); 5258 address entry = pc(); 5259 5260 cbzw(Rlen, nothing); 5261 5262 enter(); 5263 5264 // Make room. 5265 cmpw(Rlen, 512); 5266 br(Assembler::HI, argh); 5267 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 5268 andr(sp, Ra, -2 * wordSize); 5269 5270 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 5271 5272 { 5273 // Copy input args, reversing as we go. We use Ra as a 5274 // temporary variable. 5275 reverse(Ra, Pa_base, Rlen, t0, t1); 5276 if (!_squaring) 5277 reverse(Ra, Pb_base, Rlen, t0, t1); 5278 reverse(Ra, Pn_base, Rlen, t0, t1); 5279 } 5280 5281 // Push all call-saved registers and also Pm_base which we'll need 5282 // at the end. 
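    // (_toSave was set up in the constructor as RegSet::range(r19, <last
    // allocated register>) plus Pm_base, so the single push/pop pair in
    // save_regs()/restore_regs() covers every callee-saved register this
    // generator touches, plus Pm_base which is needed again at the end.)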
5283 save_regs(); 5284 5285 #ifndef PRODUCT 5286 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 5287 { 5288 ldr(Rn, Address(Pn_base, 0)); 5289 mul(Rlo_mn, Rn, inv); 5290 subs(zr, Rlo_mn, -1); 5291 Label ok; 5292 br(EQ, ok); { 5293 stop("broken inverse in Montgomery multiply"); 5294 } bind(ok); 5295 } 5296 #endif 5297 5298 mov(Pm_base, Ra); 5299 5300 mov(t0, zr); 5301 mov(t1, zr); 5302 mov(t2, zr); 5303 5304 block_comment("for (int i = 0; i < len; i++) {"); 5305 mov(Ri, zr); { 5306 Label loop, end; 5307 cmpw(Ri, Rlen); 5308 br(Assembler::GE, end); 5309 5310 bind(loop); 5311 pre1(Ri); 5312 5313 block_comment(" for (j = i; j; j--) {"); { 5314 movw(Rj, Ri); 5315 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 5316 } block_comment(" } // j"); 5317 5318 post1(); 5319 addw(Ri, Ri, 1); 5320 cmpw(Ri, Rlen); 5321 br(Assembler::LT, loop); 5322 bind(end); 5323 block_comment("} // i"); 5324 } 5325 5326 block_comment("for (int i = len; i < 2*len; i++) {"); 5327 mov(Ri, Rlen); { 5328 Label loop, end; 5329 cmpw(Ri, Rlen, Assembler::LSL, 1); 5330 br(Assembler::GE, end); 5331 5332 bind(loop); 5333 pre2(Ri, Rlen); 5334 5335 block_comment(" for (j = len*2-i-1; j; j--) {"); { 5336 lslw(Rj, Rlen, 1); 5337 subw(Rj, Rj, Ri); 5338 subw(Rj, Rj, 1); 5339 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 5340 } block_comment(" } // j"); 5341 5342 post2(Ri, Rlen); 5343 addw(Ri, Ri, 1); 5344 cmpw(Ri, Rlen, Assembler::LSL, 1); 5345 br(Assembler::LT, loop); 5346 bind(end); 5347 } 5348 block_comment("} // i"); 5349 5350 normalize(Rlen); 5351 5352 mov(Ra, Pm_base); // Save Pm_base in Ra 5353 restore_regs(); // Restore caller's Pm_base 5354 5355 // Copy our result into caller's Pm_base 5356 reverse(Pm_base, Ra, Rlen, t0, t1); 5357 5358 leave(); 5359 bind(nothing); 5360 ret(lr); 5361 5362 return entry; 5363 } 5364 // In C, approximately: 5365 5366 // void 5367 // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[], 5368 // unsigned long Pn_base[], unsigned long Pm_base[], 5369 // unsigned long inv, int len) { 5370 // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 5371 // unsigned long *Pa, *Pb, *Pn, *Pm; 5372 // unsigned long Ra, Rb, Rn, Rm; 5373 5374 // int i; 5375 5376 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 5377 5378 // for (i = 0; i < len; i++) { 5379 // int j; 5380 5381 // Pa = Pa_base; 5382 // Pb = Pb_base + i; 5383 // Pm = Pm_base; 5384 // Pn = Pn_base + i; 5385 5386 // Ra = *Pa; 5387 // Rb = *Pb; 5388 // Rm = *Pm; 5389 // Rn = *Pn; 5390 5391 // int iters = i; 5392 // for (j = 0; iters--; j++) { 5393 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 5394 // MACC(Ra, Rb, t0, t1, t2); 5395 // Ra = *++Pa; 5396 // Rb = *--Pb; 5397 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5398 // MACC(Rm, Rn, t0, t1, t2); 5399 // Rm = *++Pm; 5400 // Rn = *--Pn; 5401 // } 5402 5403 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 5404 // MACC(Ra, Rb, t0, t1, t2); 5405 // *Pm = Rm = t0 * inv; 5406 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 5407 // MACC(Rm, Rn, t0, t1, t2); 5408 5409 // assert(t0 == 0, "broken Montgomery multiply"); 5410 5411 // t0 = t1; t1 = t2; t2 = 0; 5412 // } 5413 5414 // for (i = len; i < 2*len; i++) { 5415 // int j; 5416 5417 // Pa = Pa_base + i-len; 5418 // Pb = Pb_base + len; 5419 // Pm = Pm_base + i-len; 5420 // Pn = Pn_base + len; 5421 5422 // Ra = *++Pa; 5423 // Rb = *--Pb; 5424 // Rm = *++Pm; 5425 // Rn = *--Pn; 5426 5427 // int iters = len*2-i-1; 
5428 // for (j = i-len+1; iters--; j++) { 5429 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 5430 // MACC(Ra, Rb, t0, t1, t2); 5431 // Ra = *++Pa; 5432 // Rb = *--Pb; 5433 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5434 // MACC(Rm, Rn, t0, t1, t2); 5435 // Rm = *++Pm; 5436 // Rn = *--Pn; 5437 // } 5438 5439 // Pm_base[i-len] = t0; 5440 // t0 = t1; t1 = t2; t2 = 0; 5441 // } 5442 5443 // while (t0) 5444 // t0 = sub(Pm_base, Pn_base, t0, len); 5445 // } 5446 5447 /** 5448 * Fast Montgomery squaring. This uses asymptotically 25% fewer 5449 * multiplies than Montgomery multiplication so it should be up to 5450 * 25% faster. However, its loop control is more complex and it 5451 * may actually run slower on some machines. 5452 * 5453 * Arguments: 5454 * 5455 * Inputs: 5456 * c_rarg0 - int array elements a 5457 * c_rarg1 - int array elements n (the modulus) 5458 * c_rarg2 - int length 5459 * c_rarg3 - int inv 5460 * c_rarg4 - int array elements m (the result) 5461 * 5462 */ 5463 address generate_square() { 5464 Label argh; 5465 bind(argh); 5466 stop("MontgomeryMultiply total_allocation must be <= 8192"); 5467 5468 align(CodeEntryAlignment); 5469 address entry = pc(); 5470 5471 enter(); 5472 5473 // Make room. 5474 cmpw(Rlen, 512); 5475 br(Assembler::HI, argh); 5476 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 5477 andr(sp, Ra, -2 * wordSize); 5478 5479 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 5480 5481 { 5482 // Copy input args, reversing as we go. We use Ra as a 5483 // temporary variable. 5484 reverse(Ra, Pa_base, Rlen, t0, t1); 5485 reverse(Ra, Pn_base, Rlen, t0, t1); 5486 } 5487 5488 // Push all call-saved registers and also Pm_base which we'll need 5489 // at the end. 5490 save_regs(); 5491 5492 mov(Pm_base, Ra); 5493 5494 mov(t0, zr); 5495 mov(t1, zr); 5496 mov(t2, zr); 5497 5498 block_comment("for (int i = 0; i < len; i++) {"); 5499 mov(Ri, zr); { 5500 Label loop, end; 5501 bind(loop); 5502 cmp(Ri, Rlen); 5503 br(Assembler::GE, end); 5504 5505 pre1(Ri); 5506 5507 block_comment("for (j = (i+1)/2; j; j--) {"); { 5508 add(Rj, Ri, 1); 5509 lsr(Rj, Rj, 1); 5510 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 5511 } block_comment(" } // j"); 5512 5513 last_squaring(Ri); 5514 5515 block_comment(" for (j = i/2; j; j--) {"); { 5516 lsr(Rj, Ri, 1); 5517 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 5518 } block_comment(" } // j"); 5519 5520 post1_squaring(); 5521 add(Ri, Ri, 1); 5522 cmp(Ri, Rlen); 5523 br(Assembler::LT, loop); 5524 5525 bind(end); 5526 block_comment("} // i"); 5527 } 5528 5529 block_comment("for (int i = len; i < 2*len; i++) {"); 5530 mov(Ri, Rlen); { 5531 Label loop, end; 5532 bind(loop); 5533 cmp(Ri, Rlen, Assembler::LSL, 1); 5534 br(Assembler::GE, end); 5535 5536 pre2(Ri, Rlen); 5537 5538 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 5539 lsl(Rj, Rlen, 1); 5540 sub(Rj, Rj, Ri); 5541 sub(Rj, Rj, 1); 5542 lsr(Rj, Rj, 1); 5543 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 5544 } block_comment(" } // j"); 5545 5546 last_squaring(Ri); 5547 5548 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 5549 lsl(Rj, Rlen, 1); 5550 sub(Rj, Rj, Ri); 5551 lsr(Rj, Rj, 1); 5552 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 5553 } block_comment(" } // j"); 5554 5555 post2(Ri, Rlen); 5556 add(Ri, Ri, 1); 5557 cmp(Ri, Rlen, Assembler::LSL, 1); 5558 5559 br(Assembler::LT, loop); 5560 bind(end); 5561 block_comment("} // i"); 5562 } 5563 5564 normalize(Rlen); 5565 5566 
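    // At this point Pm_base still points at the temporary result buffer on
    // the stack (in the internal, reversed word layout); save that pointer
    // in Ra, restore the caller's registers (which brings back the caller's
    // Pm_base), then reverse-copy the result into the caller's array.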
mov(Ra, Pm_base); // Save Pm_base in Ra 5567 restore_regs(); // Restore caller's Pm_base 5568 5569 // Copy our result into caller's Pm_base 5570 reverse(Pm_base, Ra, Rlen, t0, t1); 5571 5572 leave(); 5573 ret(lr); 5574 5575 return entry; 5576 } 5577 // In C, approximately: 5578 5579 // void 5580 // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[], 5581 // unsigned long Pm_base[], unsigned long inv, int len) { 5582 // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 5583 // unsigned long *Pa, *Pb, *Pn, *Pm; 5584 // unsigned long Ra, Rb, Rn, Rm; 5585 5586 // int i; 5587 5588 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 5589 5590 // for (i = 0; i < len; i++) { 5591 // int j; 5592 5593 // Pa = Pa_base; 5594 // Pb = Pa_base + i; 5595 // Pm = Pm_base; 5596 // Pn = Pn_base + i; 5597 5598 // Ra = *Pa; 5599 // Rb = *Pb; 5600 // Rm = *Pm; 5601 // Rn = *Pn; 5602 5603 // int iters = (i+1)/2; 5604 // for (j = 0; iters--; j++) { 5605 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 5606 // MACC2(Ra, Rb, t0, t1, t2); 5607 // Ra = *++Pa; 5608 // Rb = *--Pb; 5609 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5610 // MACC(Rm, Rn, t0, t1, t2); 5611 // Rm = *++Pm; 5612 // Rn = *--Pn; 5613 // } 5614 // if ((i & 1) == 0) { 5615 // assert(Ra == Pa_base[j], "must be"); 5616 // MACC(Ra, Ra, t0, t1, t2); 5617 // } 5618 // iters = i/2; 5619 // assert(iters == i-j, "must be"); 5620 // for (; iters--; j++) { 5621 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5622 // MACC(Rm, Rn, t0, t1, t2); 5623 // Rm = *++Pm; 5624 // Rn = *--Pn; 5625 // } 5626 5627 // *Pm = Rm = t0 * inv; 5628 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 5629 // MACC(Rm, Rn, t0, t1, t2); 5630 5631 // assert(t0 == 0, "broken Montgomery multiply"); 5632 5633 // t0 = t1; t1 = t2; t2 = 0; 5634 // } 5635 5636 // for (i = len; i < 2*len; i++) { 5637 // int start = i-len+1; 5638 // int end = start + (len - start)/2; 5639 // int j; 5640 5641 // Pa = Pa_base + i-len; 5642 // Pb = Pa_base + len; 5643 // Pm = Pm_base + i-len; 5644 // Pn = Pn_base + len; 5645 5646 // Ra = *++Pa; 5647 // Rb = *--Pb; 5648 // Rm = *++Pm; 5649 // Rn = *--Pn; 5650 5651 // int iters = (2*len-i-1)/2; 5652 // assert(iters == end-start, "must be"); 5653 // for (j = start; iters--; j++) { 5654 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 5655 // MACC2(Ra, Rb, t0, t1, t2); 5656 // Ra = *++Pa; 5657 // Rb = *--Pb; 5658 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5659 // MACC(Rm, Rn, t0, t1, t2); 5660 // Rm = *++Pm; 5661 // Rn = *--Pn; 5662 // } 5663 // if ((i & 1) == 0) { 5664 // assert(Ra == Pa_base[j], "must be"); 5665 // MACC(Ra, Ra, t0, t1, t2); 5666 // } 5667 // iters = (2*len-i)/2; 5668 // assert(iters == len-j, "must be"); 5669 // for (; iters--; j++) { 5670 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5671 // MACC(Rm, Rn, t0, t1, t2); 5672 // Rm = *++Pm; 5673 // Rn = *--Pn; 5674 // } 5675 // Pm_base[i-len] = t0; 5676 // t0 = t1; t1 = t2; t2 = 0; 5677 // } 5678 5679 // while (t0) 5680 // t0 = sub(Pm_base, Pn_base, t0, len); 5681 // } 5682 }; 5683 5684 5685 // Initialization 5686 void generate_initial() { 5687 // Generate initial stubs and initializes the entry points 5688 5689 // entry points that exist in all platforms Note: This is code 5690 // that could be shared among different platforms - however the 5691 // benefit seems to be smaller than the disadvantage of having a 5692 // much more 
complicated generator structure. See also comment in 5693 // stubRoutines.hpp. 5694 5695 StubRoutines::_forward_exception_entry = generate_forward_exception(); 5696 5697 StubRoutines::_call_stub_entry = 5698 generate_call_stub(StubRoutines::_call_stub_return_address); 5699 5700 // is referenced by megamorphic call 5701 StubRoutines::_catch_exception_entry = generate_catch_exception(); 5702 5703 // Build this early so it's available for the interpreter. 5704 StubRoutines::_throw_StackOverflowError_entry = 5705 generate_throw_exception("StackOverflowError throw_exception", 5706 CAST_FROM_FN_PTR(address, 5707 SharedRuntime::throw_StackOverflowError)); 5708 StubRoutines::_throw_delayed_StackOverflowError_entry = 5709 generate_throw_exception("delayed StackOverflowError throw_exception", 5710 CAST_FROM_FN_PTR(address, 5711 SharedRuntime::throw_delayed_StackOverflowError)); 5712 if (UseCRC32Intrinsics) { 5713 // set table address before stub generation which use it 5714 StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table; 5715 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32(); 5716 } 5717 5718 if (UseCRC32CIntrinsics) { 5719 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(); 5720 } 5721 5722 // Disabled until JDK-8210858 is fixed 5723 // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) { 5724 // StubRoutines::_dlog = generate_dlog(); 5725 // } 5726 5727 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) { 5728 StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false); 5729 } 5730 5731 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) { 5732 StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true); 5733 } 5734 } 5735 5736 void generate_all() { 5737 // support for verify_oop (must happen after universe_init) 5738 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); 5739 StubRoutines::_throw_AbstractMethodError_entry = 5740 generate_throw_exception("AbstractMethodError throw_exception", 5741 CAST_FROM_FN_PTR(address, 5742 SharedRuntime:: 5743 throw_AbstractMethodError)); 5744 5745 StubRoutines::_throw_IncompatibleClassChangeError_entry = 5746 generate_throw_exception("IncompatibleClassChangeError throw_exception", 5747 CAST_FROM_FN_PTR(address, 5748 SharedRuntime:: 5749 throw_IncompatibleClassChangeError)); 5750 5751 StubRoutines::_throw_NullPointerException_at_call_entry = 5752 generate_throw_exception("NullPointerException at call throw_exception", 5753 CAST_FROM_FN_PTR(address, 5754 SharedRuntime:: 5755 throw_NullPointerException_at_call)); 5756 5757 // arraycopy stubs used by compilers 5758 generate_arraycopy_stubs(); 5759 5760 // has negatives stub for large arrays. 5761 StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long); 5762 5763 // array equals stub for large arrays. 5764 if (!UseSimpleArrayEquals) { 5765 StubRoutines::aarch64::_large_array_equals = generate_large_array_equals(); 5766 } 5767 5768 generate_compare_long_strings(); 5769 5770 generate_string_indexof_stubs(); 5771 5772 // byte_array_inflate stub for large arrays. 
5773 StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate(); 5774 5775 #ifdef COMPILER2 5776 if (UseMultiplyToLenIntrinsic) { 5777 StubRoutines::_multiplyToLen = generate_multiplyToLen(); 5778 } 5779 5780 if (UseSquareToLenIntrinsic) { 5781 StubRoutines::_squareToLen = generate_squareToLen(); 5782 } 5783 5784 if (UseMulAddIntrinsic) { 5785 StubRoutines::_mulAdd = generate_mulAdd(); 5786 } 5787 5788 if (UseMontgomeryMultiplyIntrinsic) { 5789 StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply"); 5790 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false); 5791 StubRoutines::_montgomeryMultiply = g.generate_multiply(); 5792 } 5793 5794 if (UseMontgomerySquareIntrinsic) { 5795 StubCodeMark mark(this, "StubRoutines", "montgomerySquare"); 5796 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true); 5797 // We use generate_multiply() rather than generate_square() 5798 // because it's faster for the sizes of modulus we care about. 5799 StubRoutines::_montgomerySquare = g.generate_multiply(); 5800 } 5801 #endif // COMPILER2 5802 5803 #ifndef BUILTIN_SIM 5804 // generate GHASH intrinsics code 5805 if (UseGHASHIntrinsics) { 5806 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); 5807 } 5808 5809 if (UseAESIntrinsics) { 5810 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 5811 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 5812 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 5813 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 5814 } 5815 5816 if (UseSHA1Intrinsics) { 5817 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress"); 5818 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB"); 5819 } 5820 if (UseSHA256Intrinsics) { 5821 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress"); 5822 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB"); 5823 } 5824 5825 // generate Adler32 intrinsics code 5826 if (UseAdler32Intrinsics) { 5827 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); 5828 } 5829 5830 // Safefetch stubs. 5831 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry, 5832 &StubRoutines::_safefetch32_fault_pc, 5833 &StubRoutines::_safefetch32_continuation_pc); 5834 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry, 5835 &StubRoutines::_safefetchN_fault_pc, 5836 &StubRoutines::_safefetchN_continuation_pc); 5837 #endif 5838 StubRoutines::aarch64::set_completed(); 5839 } 5840 5841 public: 5842 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) { 5843 if (all) { 5844 generate_all(); 5845 } else { 5846 generate_initial(); 5847 } 5848 } 5849 }; // end class declaration 5850 5851 void StubGenerator_generate(CodeBuffer* code, bool all) { 5852 StubGenerator g(code, all); 5853 }
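// Note that all of the code emission happens as a side effect of constructing
// the StubGenerator: depending on 'all', the constructor runs either
// generate_initial() (the early stubs, such as the call stub and the
// StackOverflowError throw stubs that the interpreter needs) or
// generate_all() (the remaining stubs, including the compiler intrinsics).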