1 /* 2 * Copyright (c) 2003, 2019, Oracle and/or its affiliates. All rights reserved. 3 * Copyright (c) 2014, 2019, Red Hat Inc. All rights reserved. 4 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 5 * 6 * This code is free software; you can redistribute it and/or modify it 7 * under the terms of the GNU General Public License version 2 only, as 8 * published by the Free Software Foundation. 9 * 10 * This code is distributed in the hope that it will be useful, but WITHOUT 11 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or 12 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License 13 * version 2 for more details (a copy is included in the LICENSE file that 14 * accompanied this code). 15 * 16 * You should have received a copy of the GNU General Public License version 17 * 2 along with this work; if not, write to the Free Software Foundation, 18 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. 19 * 20 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA 21 * or visit www.oracle.com if you need additional information or have any 22 * questions. 23 * 24 */ 25 26 #include "precompiled.hpp" 27 #include "asm/macroAssembler.hpp" 28 #include "asm/macroAssembler.inline.hpp" 29 #include "gc/shared/barrierSet.hpp" 30 #include "gc/shared/barrierSetAssembler.hpp" 31 #include "interpreter/interpreter.hpp" 32 #include "nativeInst_aarch64.hpp" 33 #include "oops/instanceOop.hpp" 34 #include "oops/method.hpp" 35 #include "oops/objArrayKlass.hpp" 36 #include "oops/oop.inline.hpp" 37 #include "prims/methodHandles.hpp" 38 #include "runtime/frame.inline.hpp" 39 #include "runtime/handles.inline.hpp" 40 #include "runtime/sharedRuntime.hpp" 41 #include "runtime/stubCodeGenerator.hpp" 42 #include "runtime/stubRoutines.hpp" 43 #include "runtime/thread.inline.hpp" 44 #include "utilities/align.hpp" 45 #ifdef COMPILER2 46 #include "opto/runtime.hpp" 47 #endif 48 49 #ifdef BUILTIN_SIM 50 #include "../../../../../../simulator/simulator.hpp" 51 #endif 52 53 // Declaration and definition of StubGenerator (no .hpp file). 54 // For a more detailed description of the stub routine structure 55 // see the comment in stubRoutines.hpp 56 57 #undef __ 58 #define __ _masm-> 59 #define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 
4 : 8)) 60 61 #ifdef PRODUCT 62 #define BLOCK_COMMENT(str) /* nothing */ 63 #else 64 #define BLOCK_COMMENT(str) __ block_comment(str) 65 #endif 66 67 #define BIND(label) bind(label); BLOCK_COMMENT(#label ":") 68 69 // Stub Code definitions 70 71 class StubGenerator: public StubCodeGenerator { 72 private: 73 74 #ifdef PRODUCT 75 #define inc_counter_np(counter) ((void)0) 76 #else 77 void inc_counter_np_(int& counter) { 78 __ lea(rscratch2, ExternalAddress((address)&counter)); 79 __ ldrw(rscratch1, Address(rscratch2)); 80 __ addw(rscratch1, rscratch1, 1); 81 __ strw(rscratch1, Address(rscratch2)); 82 } 83 #define inc_counter_np(counter) \ 84 BLOCK_COMMENT("inc_counter " #counter); \ 85 inc_counter_np_(counter); 86 #endif 87 88 // Call stubs are used to call Java from C 89 // 90 // Arguments: 91 // c_rarg0: call wrapper address address 92 // c_rarg1: result address 93 // c_rarg2: result type BasicType 94 // c_rarg3: method Method* 95 // c_rarg4: (interpreter) entry point address 96 // c_rarg5: parameters intptr_t* 97 // c_rarg6: parameter size (in words) int 98 // c_rarg7: thread Thread* 99 // 100 // There is no return from the stub itself as any Java result 101 // is written to result 102 // 103 // we save r30 (lr) as the return PC at the base of the frame and 104 // link r29 (fp) below it as the frame pointer installing sp (r31) 105 // into fp. 106 // 107 // we save r0-r7, which accounts for all the c arguments. 108 // 109 // TODO: strictly do we need to save them all? they are treated as 110 // volatile by C so could we omit saving the ones we are going to 111 // place in global registers (thread? method?) or those we only use 112 // during setup of the Java call? 113 // 114 // we don't need to save r8 which C uses as an indirect result location 115 // return register. 116 // 117 // we don't need to save r9-r15 which both C and Java treat as 118 // volatile 119 // 120 // we don't need to save r16-18 because Java does not use them 121 // 122 // we save r19-r28 which Java uses as scratch registers and C 123 // expects to be callee-save 124 // 125 // we save the bottom 64 bits of each value stored in v8-v15; it is 126 // the responsibility of the caller to preserve larger values. 127 // 128 // so the stub frame looks like this when we enter Java code 129 // 130 // [ return_from_Java ] <--- sp 131 // [ argument word n ] 132 // ... 
133 // -27 [ argument word 1 ] 134 // -26 [ saved v15 ] <--- sp_after_call 135 // -25 [ saved v14 ] 136 // -24 [ saved v13 ] 137 // -23 [ saved v12 ] 138 // -22 [ saved v11 ] 139 // -21 [ saved v10 ] 140 // -20 [ saved v9 ] 141 // -19 [ saved v8 ] 142 // -18 [ saved r28 ] 143 // -17 [ saved r27 ] 144 // -16 [ saved r26 ] 145 // -15 [ saved r25 ] 146 // -14 [ saved r24 ] 147 // -13 [ saved r23 ] 148 // -12 [ saved r22 ] 149 // -11 [ saved r21 ] 150 // -10 [ saved r20 ] 151 // -9 [ saved r19 ] 152 // -8 [ call wrapper (r0) ] 153 // -7 [ result (r1) ] 154 // -6 [ result type (r2) ] 155 // -5 [ method (r3) ] 156 // -4 [ entry point (r4) ] 157 // -3 [ parameters (r5) ] 158 // -2 [ parameter size (r6) ] 159 // -1 [ thread (r7) ] 160 // 0 [ saved fp (r29) ] <--- fp == saved sp (r31) 161 // 1 [ saved lr (r30) ] 162 163 // Call stub stack layout word offsets from fp 164 enum call_stub_layout { 165 sp_after_call_off = -26, 166 167 d15_off = -26, 168 d13_off = -24, 169 d11_off = -22, 170 d9_off = -20, 171 172 r28_off = -18, 173 r26_off = -16, 174 r24_off = -14, 175 r22_off = -12, 176 r20_off = -10, 177 call_wrapper_off = -8, 178 result_off = -7, 179 result_type_off = -6, 180 method_off = -5, 181 entry_point_off = -4, 182 parameter_size_off = -2, 183 thread_off = -1, 184 fp_f = 0, 185 retaddr_off = 1, 186 }; 187 188 address generate_call_stub(address& return_address) { 189 assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 && 190 (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off, 191 "adjust this code"); 192 193 StubCodeMark mark(this, "StubRoutines", "call_stub"); 194 address start = __ pc(); 195 196 const Address sp_after_call(rfp, sp_after_call_off * wordSize); 197 198 const Address call_wrapper (rfp, call_wrapper_off * wordSize); 199 const Address result (rfp, result_off * wordSize); 200 const Address result_type (rfp, result_type_off * wordSize); 201 const Address method (rfp, method_off * wordSize); 202 const Address entry_point (rfp, entry_point_off * wordSize); 203 const Address parameter_size(rfp, parameter_size_off * wordSize); 204 205 const Address thread (rfp, thread_off * wordSize); 206 207 const Address d15_save (rfp, d15_off * wordSize); 208 const Address d13_save (rfp, d13_off * wordSize); 209 const Address d11_save (rfp, d11_off * wordSize); 210 const Address d9_save (rfp, d9_off * wordSize); 211 212 const Address r28_save (rfp, r28_off * wordSize); 213 const Address r26_save (rfp, r26_off * wordSize); 214 const Address r24_save (rfp, r24_off * wordSize); 215 const Address r22_save (rfp, r22_off * wordSize); 216 const Address r20_save (rfp, r20_off * wordSize); 217 218 // stub code 219 220 // we need a C prolog to bootstrap the x86 caller into the sim 221 __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void); 222 223 address aarch64_entry = __ pc(); 224 225 #ifdef BUILTIN_SIM 226 // Save sender's SP for stack traces. 227 __ mov(rscratch1, sp); 228 __ str(rscratch1, Address(__ pre(sp, -2 * wordSize))); 229 #endif 230 // set up frame and move sp to end of save area 231 __ enter(); 232 __ sub(sp, rfp, -sp_after_call_off * wordSize); 233 234 // save register parameters and Java scratch/global registers 235 // n.b. 
we save thread even though it gets installed in 236 // rthread because we want to sanity check rthread later 237 __ str(c_rarg7, thread); 238 __ strw(c_rarg6, parameter_size); 239 __ stp(c_rarg4, c_rarg5, entry_point); 240 __ stp(c_rarg2, c_rarg3, result_type); 241 __ stp(c_rarg0, c_rarg1, call_wrapper); 242 243 __ stp(r20, r19, r20_save); 244 __ stp(r22, r21, r22_save); 245 __ stp(r24, r23, r24_save); 246 __ stp(r26, r25, r26_save); 247 __ stp(r28, r27, r28_save); 248 249 __ stpd(v9, v8, d9_save); 250 __ stpd(v11, v10, d11_save); 251 __ stpd(v13, v12, d13_save); 252 __ stpd(v15, v14, d15_save); 253 254 // install Java thread in global register now we have saved 255 // whatever value it held 256 __ mov(rthread, c_rarg7); 257 // And method 258 __ mov(rmethod, c_rarg3); 259 260 // set up the heapbase register 261 __ reinit_heapbase(); 262 263 #ifdef ASSERT 264 // make sure we have no pending exceptions 265 { 266 Label L; 267 __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset()))); 268 __ cmp(rscratch1, (u1)NULL_WORD); 269 __ br(Assembler::EQ, L); 270 __ stop("StubRoutines::call_stub: entered with pending exception"); 271 __ BIND(L); 272 } 273 #endif 274 // pass parameters if any 275 __ mov(esp, sp); 276 __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way 277 __ andr(sp, rscratch1, -2 * wordSize); 278 279 BLOCK_COMMENT("pass parameters if any"); 280 Label parameters_done; 281 // parameter count is still in c_rarg6 282 // and parameter pointer identifying param 1 is in c_rarg5 283 __ cbzw(c_rarg6, parameters_done); 284 285 address loop = __ pc(); 286 __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize))); 287 __ subsw(c_rarg6, c_rarg6, 1); 288 __ push(rscratch1); 289 __ br(Assembler::GT, loop); 290 291 __ BIND(parameters_done); 292 293 // call Java entry -- passing methdoOop, and current sp 294 // rmethod: Method* 295 // r13: sender sp 296 BLOCK_COMMENT("call Java function"); 297 __ mov(r13, sp); 298 __ blr(c_rarg4); 299 300 // tell the simulator we have returned to the stub 301 302 // we do this here because the notify will already have been done 303 // if we get to the next instruction via an exception 304 // 305 // n.b. adding this instruction here affects the calculation of 306 // whether or not a routine returns to the call stub (used when 307 // doing stack walks) since the normal test is to check the return 308 // pc against the address saved below. so we may need to allow for 309 // this extra instruction in the check. 310 311 if (NotifySimulator) { 312 __ notify(Assembler::method_reentry); 313 } 314 // save current address for use by exception handling code 315 316 return_address = __ pc(); 317 318 // store result depending on type (everything that is not 319 // T_OBJECT, T_VALUETYPE, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT) 320 // n.b. 
this assumes Java returns an integral result in r0 321 // and a floating result in j_farg0 322 __ ldr(j_rarg2, result); 323 Label is_long, is_float, is_double, exit; 324 __ ldr(j_rarg1, result_type); 325 __ cmp(j_rarg1, (u1)T_OBJECT); 326 __ br(Assembler::EQ, is_long); 327 __ cmp(j_rarg1, (u1)T_VALUETYPE); 328 __ br(Assembler::EQ, is_long); 329 __ cmp(j_rarg1, (u1)T_LONG); 330 __ br(Assembler::EQ, is_long); 331 __ cmp(j_rarg1, (u1)T_FLOAT); 332 __ br(Assembler::EQ, is_float); 333 __ cmp(j_rarg1, (u1)T_DOUBLE); 334 __ br(Assembler::EQ, is_double); 335 336 // handle T_INT case 337 __ strw(r0, Address(j_rarg2)); 338 339 __ BIND(exit); 340 341 // pop parameters 342 __ sub(esp, rfp, -sp_after_call_off * wordSize); 343 344 #ifdef ASSERT 345 // verify that threads correspond 346 { 347 Label L, S; 348 __ ldr(rscratch1, thread); 349 __ cmp(rthread, rscratch1); 350 __ br(Assembler::NE, S); 351 __ get_thread(rscratch1); 352 __ cmp(rthread, rscratch1); 353 __ br(Assembler::EQ, L); 354 __ BIND(S); 355 __ stop("StubRoutines::call_stub: threads must correspond"); 356 __ BIND(L); 357 } 358 #endif 359 360 // restore callee-save registers 361 __ ldpd(v15, v14, d15_save); 362 __ ldpd(v13, v12, d13_save); 363 __ ldpd(v11, v10, d11_save); 364 __ ldpd(v9, v8, d9_save); 365 366 __ ldp(r28, r27, r28_save); 367 __ ldp(r26, r25, r26_save); 368 __ ldp(r24, r23, r24_save); 369 __ ldp(r22, r21, r22_save); 370 __ ldp(r20, r19, r20_save); 371 372 __ ldp(c_rarg0, c_rarg1, call_wrapper); 373 __ ldrw(c_rarg2, result_type); 374 __ ldr(c_rarg3, method); 375 __ ldp(c_rarg4, c_rarg5, entry_point); 376 __ ldp(c_rarg6, c_rarg7, parameter_size); 377 378 #ifndef PRODUCT 379 // tell the simulator we are about to end Java execution 380 if (NotifySimulator) { 381 __ notify(Assembler::method_exit); 382 } 383 #endif 384 // leave frame and return to caller 385 __ leave(); 386 __ ret(lr); 387 388 // handle return types different from T_INT 389 390 __ BIND(is_long); 391 __ str(r0, Address(j_rarg2, 0)); 392 __ br(Assembler::AL, exit); 393 394 __ BIND(is_float); 395 __ strs(j_farg0, Address(j_rarg2, 0)); 396 __ br(Assembler::AL, exit); 397 398 __ BIND(is_double); 399 __ strd(j_farg0, Address(j_rarg2, 0)); 400 __ br(Assembler::AL, exit); 401 402 return start; 403 } 404 405 // Return point for a Java call if there's an exception thrown in 406 // Java code. The exception is caught and transformed into a 407 // pending exception stored in JavaThread that can be tested from 408 // within the VM. 409 // 410 // Note: Usually the parameters are removed by the callee. In case 411 // of an exception crossing an activation frame boundary, that is 412 // not the case if the callee is compiled code => need to setup the 413 // rsp. 414 // 415 // r0: exception oop 416 417 // NOTE: this is used as a target from the signal handler so it 418 // needs an x86 prolog which returns into the current simulator 419 // executing the generated catch_exception code. so the prolog 420 // needs to install rax in a sim register and adjust the sim's 421 // restart pc to enter the generated code at the start position 422 // then return from native to simulated execution. 
423 424 address generate_catch_exception() { 425 StubCodeMark mark(this, "StubRoutines", "catch_exception"); 426 address start = __ pc(); 427 428 // same as in generate_call_stub(): 429 const Address sp_after_call(rfp, sp_after_call_off * wordSize); 430 const Address thread (rfp, thread_off * wordSize); 431 432 #ifdef ASSERT 433 // verify that threads correspond 434 { 435 Label L, S; 436 __ ldr(rscratch1, thread); 437 __ cmp(rthread, rscratch1); 438 __ br(Assembler::NE, S); 439 __ get_thread(rscratch1); 440 __ cmp(rthread, rscratch1); 441 __ br(Assembler::EQ, L); 442 __ bind(S); 443 __ stop("StubRoutines::catch_exception: threads must correspond"); 444 __ bind(L); 445 } 446 #endif 447 448 // set pending exception 449 __ verify_oop(r0); 450 451 __ str(r0, Address(rthread, Thread::pending_exception_offset())); 452 __ mov(rscratch1, (address)__FILE__); 453 __ str(rscratch1, Address(rthread, Thread::exception_file_offset())); 454 __ movw(rscratch1, (int)__LINE__); 455 __ strw(rscratch1, Address(rthread, Thread::exception_line_offset())); 456 457 // complete return to VM 458 assert(StubRoutines::_call_stub_return_address != NULL, 459 "_call_stub_return_address must have been generated before"); 460 __ b(StubRoutines::_call_stub_return_address); 461 462 return start; 463 } 464 465 // Continuation point for runtime calls returning with a pending 466 // exception. The pending exception check happened in the runtime 467 // or native call stub. The pending exception in Thread is 468 // converted into a Java-level exception. 469 // 470 // Contract with Java-level exception handlers: 471 // r0: exception 472 // r3: throwing pc 473 // 474 // NOTE: At entry of this stub, exception-pc must be in LR !! 475 476 // NOTE: this is always used as a jump target within generated code 477 // so it just needs to be generated code wiht no x86 prolog 478 479 address generate_forward_exception() { 480 StubCodeMark mark(this, "StubRoutines", "forward exception"); 481 address start = __ pc(); 482 483 // Upon entry, LR points to the return address returning into 484 // Java (interpreted or compiled) code; i.e., the return address 485 // becomes the throwing pc. 486 // 487 // Arguments pushed before the runtime call are still on the stack 488 // but the exception handler will reset the stack pointer -> 489 // ignore them. A potential result in registers can be ignored as 490 // well. 491 492 #ifdef ASSERT 493 // make sure this code is only executed if there is a pending exception 494 { 495 Label L; 496 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 497 __ cbnz(rscratch1, L); 498 __ stop("StubRoutines::forward exception: no pending exception (1)"); 499 __ bind(L); 500 } 501 #endif 502 503 // compute exception handler into r19 504 505 // call the VM to find the handler address associated with the 506 // caller address. pass thread in r0 and caller pc (ret address) 507 // in r1. n.b. the caller pc is in lr, unlike x86 where it is on 508 // the stack. 509 __ mov(c_rarg1, lr); 510 // lr will be trashed by the VM call so we move it to R19 511 // (callee-saved) because we also need to pass it to the handler 512 // returned by this call. 513 __ mov(r19, lr); 514 BLOCK_COMMENT("call exception_handler_for_return_address"); 515 __ call_VM_leaf(CAST_FROM_FN_PTR(address, 516 SharedRuntime::exception_handler_for_return_address), 517 rthread, c_rarg1); 518 // we should not really care that lr is no longer the callee 519 // address. 
we saved the value the handler needs in r19 so we can 520 // just copy it to r3. however, the C2 handler will push its own 521 // frame and then calls into the VM and the VM code asserts that 522 // the PC for the frame above the handler belongs to a compiled 523 // Java method. So, we restore lr here to satisfy that assert. 524 __ mov(lr, r19); 525 // setup r0 & r3 & clear pending exception 526 __ mov(r3, r19); 527 __ mov(r19, r0); 528 __ ldr(r0, Address(rthread, Thread::pending_exception_offset())); 529 __ str(zr, Address(rthread, Thread::pending_exception_offset())); 530 531 #ifdef ASSERT 532 // make sure exception is set 533 { 534 Label L; 535 __ cbnz(r0, L); 536 __ stop("StubRoutines::forward exception: no pending exception (2)"); 537 __ bind(L); 538 } 539 #endif 540 541 // continue at exception handler 542 // r0: exception 543 // r3: throwing pc 544 // r19: exception handler 545 __ verify_oop(r0); 546 __ br(r19); 547 548 return start; 549 } 550 551 // Non-destructive plausibility checks for oops 552 // 553 // Arguments: 554 // r0: oop to verify 555 // rscratch1: error message 556 // 557 // Stack after saving c_rarg3: 558 // [tos + 0]: saved c_rarg3 559 // [tos + 1]: saved c_rarg2 560 // [tos + 2]: saved lr 561 // [tos + 3]: saved rscratch2 562 // [tos + 4]: saved r0 563 // [tos + 5]: saved rscratch1 564 address generate_verify_oop() { 565 566 StubCodeMark mark(this, "StubRoutines", "verify_oop"); 567 address start = __ pc(); 568 569 Label exit, error; 570 571 // save c_rarg2 and c_rarg3 572 __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16))); 573 574 // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 575 __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr())); 576 __ ldr(c_rarg3, Address(c_rarg2)); 577 __ add(c_rarg3, c_rarg3, 1); 578 __ str(c_rarg3, Address(c_rarg2)); 579 580 // object is in r0 581 // make sure object is 'reasonable' 582 __ cbz(r0, exit); // if obj is NULL it is OK 583 584 // Check if the oop is in the right area of memory 585 __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask()); 586 __ andr(c_rarg2, r0, c_rarg3); 587 __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits()); 588 589 // Compare c_rarg2 and c_rarg3. We don't use a compare 590 // instruction here because the flags register is live. 591 __ eor(c_rarg2, c_rarg2, c_rarg3); 592 __ cbnz(c_rarg2, error); 593 594 // make sure klass is 'reasonable', which is not zero. 595 __ load_klass(r0, r0); // get klass 596 __ cbz(r0, error); // if klass is NULL it is broken 597 598 // return if everything seems ok 599 __ bind(exit); 600 601 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 602 __ ret(lr); 603 604 // handle errors 605 __ bind(error); 606 __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16))); 607 608 __ push(RegSet::range(r0, r29), sp); 609 // debug(char* msg, int64_t pc, int64_t regs[]) 610 __ mov(c_rarg0, rscratch1); // pass address of error message 611 __ mov(c_rarg1, lr); // pass return address 612 __ mov(c_rarg2, sp); // pass address of regs on stack 613 #ifndef PRODUCT 614 assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area"); 615 #endif 616 BLOCK_COMMENT("call MacroAssembler::debug"); 617 __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64)); 618 __ blrt(rscratch1, 3, 0, 1); 619 620 return start; 621 } 622 623 void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); } 624 625 // The inner part of zero_words(). 
This is the bulk operation, 626 // zeroing words in blocks, possibly using DC ZVA to do it. The 627 // caller is responsible for zeroing the last few words. 628 // 629 // Inputs: 630 // r10: the HeapWord-aligned base address of an array to zero. 631 // r11: the count in HeapWords, r11 > 0. 632 // 633 // Returns r10 and r11, adjusted for the caller to clear. 634 // r10: the base address of the tail of words left to clear. 635 // r11: the number of words in the tail. 636 // r11 < MacroAssembler::zero_words_block_size. 637 638 address generate_zero_blocks() { 639 Label done; 640 Label base_aligned; 641 642 Register base = r10, cnt = r11; 643 644 __ align(CodeEntryAlignment); 645 StubCodeMark mark(this, "StubRoutines", "zero_blocks"); 646 address start = __ pc(); 647 648 if (UseBlockZeroing) { 649 int zva_length = VM_Version::zva_length(); 650 651 // Ensure ZVA length can be divided by 16. This is required by 652 // the subsequent operations. 653 assert (zva_length % 16 == 0, "Unexpected ZVA Length"); 654 655 __ tbz(base, 3, base_aligned); 656 __ str(zr, Address(__ post(base, 8))); 657 __ sub(cnt, cnt, 1); 658 __ bind(base_aligned); 659 660 // Ensure count >= zva_length * 2 so that it still deserves a zva after 661 // alignment. 662 Label small; 663 int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit); 664 __ subs(rscratch1, cnt, low_limit >> 3); 665 __ br(Assembler::LT, small); 666 __ zero_dcache_blocks(base, cnt); 667 __ bind(small); 668 } 669 670 { 671 // Number of stp instructions we'll unroll 672 const int unroll = 673 MacroAssembler::zero_words_block_size / 2; 674 // Clear the remaining blocks. 675 Label loop; 676 __ subs(cnt, cnt, unroll * 2); 677 __ br(Assembler::LT, done); 678 __ bind(loop); 679 for (int i = 0; i < unroll; i++) 680 __ stp(zr, zr, __ post(base, 16)); 681 __ subs(cnt, cnt, unroll * 2); 682 __ br(Assembler::GE, loop); 683 __ bind(done); 684 __ add(cnt, cnt, unroll * 2); 685 } 686 687 __ ret(lr); 688 689 return start; 690 } 691 692 693 typedef enum { 694 copy_forwards = 1, 695 copy_backwards = -1 696 } copy_direction; 697 698 // Bulk copy of blocks of 8 words. 699 // 700 // count is a count of words. 701 // 702 // Precondition: count >= 8 703 // 704 // Postconditions: 705 // 706 // The least significant bit of count contains the remaining count 707 // of words to copy. The rest of count is trash. 708 // 709 // s and d are adjusted to point to the remaining words to copy 710 // 711 void generate_copy_longs(Label &start, Register s, Register d, Register count, 712 copy_direction direction) { 713 int unit = wordSize * direction; 714 int bias = (UseSIMDForMemoryOps ? 
4:2) * wordSize; 715 716 int offset; 717 const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6, 718 t4 = r7, t5 = r10, t6 = r11, t7 = r12; 719 const Register stride = r13; 720 721 assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7); 722 assert_different_registers(s, d, count, rscratch1); 723 724 Label again, drain; 725 const char *stub_name; 726 if (direction == copy_forwards) 727 stub_name = "forward_copy_longs"; 728 else 729 stub_name = "backward_copy_longs"; 730 731 __ align(CodeEntryAlignment); 732 733 StubCodeMark mark(this, "StubRoutines", stub_name); 734 735 __ bind(start); 736 737 Label unaligned_copy_long; 738 if (AvoidUnalignedAccesses) { 739 __ tbnz(d, 3, unaligned_copy_long); 740 } 741 742 if (direction == copy_forwards) { 743 __ sub(s, s, bias); 744 __ sub(d, d, bias); 745 } 746 747 #ifdef ASSERT 748 // Make sure we are never given < 8 words 749 { 750 Label L; 751 __ cmp(count, (u1)8); 752 __ br(Assembler::GE, L); 753 __ stop("genrate_copy_longs called with < 8 words"); 754 __ bind(L); 755 } 756 #endif 757 758 // Fill 8 registers 759 if (UseSIMDForMemoryOps) { 760 __ ldpq(v0, v1, Address(s, 4 * unit)); 761 __ ldpq(v2, v3, Address(__ pre(s, 8 * unit))); 762 } else { 763 __ ldp(t0, t1, Address(s, 2 * unit)); 764 __ ldp(t2, t3, Address(s, 4 * unit)); 765 __ ldp(t4, t5, Address(s, 6 * unit)); 766 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 767 } 768 769 __ subs(count, count, 16); 770 __ br(Assembler::LO, drain); 771 772 int prefetch = PrefetchCopyIntervalInBytes; 773 bool use_stride = false; 774 if (direction == copy_backwards) { 775 use_stride = prefetch > 256; 776 prefetch = -prefetch; 777 if (use_stride) __ mov(stride, prefetch); 778 } 779 780 __ bind(again); 781 782 if (PrefetchCopyIntervalInBytes > 0) 783 __ prfm(use_stride ? 
Address(s, stride) : Address(s, prefetch), PLDL1KEEP); 784 785 if (UseSIMDForMemoryOps) { 786 __ stpq(v0, v1, Address(d, 4 * unit)); 787 __ ldpq(v0, v1, Address(s, 4 * unit)); 788 __ stpq(v2, v3, Address(__ pre(d, 8 * unit))); 789 __ ldpq(v2, v3, Address(__ pre(s, 8 * unit))); 790 } else { 791 __ stp(t0, t1, Address(d, 2 * unit)); 792 __ ldp(t0, t1, Address(s, 2 * unit)); 793 __ stp(t2, t3, Address(d, 4 * unit)); 794 __ ldp(t2, t3, Address(s, 4 * unit)); 795 __ stp(t4, t5, Address(d, 6 * unit)); 796 __ ldp(t4, t5, Address(s, 6 * unit)); 797 __ stp(t6, t7, Address(__ pre(d, 8 * unit))); 798 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 799 } 800 801 __ subs(count, count, 8); 802 __ br(Assembler::HS, again); 803 804 // Drain 805 __ bind(drain); 806 if (UseSIMDForMemoryOps) { 807 __ stpq(v0, v1, Address(d, 4 * unit)); 808 __ stpq(v2, v3, Address(__ pre(d, 8 * unit))); 809 } else { 810 __ stp(t0, t1, Address(d, 2 * unit)); 811 __ stp(t2, t3, Address(d, 4 * unit)); 812 __ stp(t4, t5, Address(d, 6 * unit)); 813 __ stp(t6, t7, Address(__ pre(d, 8 * unit))); 814 } 815 816 { 817 Label L1, L2; 818 __ tbz(count, exact_log2(4), L1); 819 if (UseSIMDForMemoryOps) { 820 __ ldpq(v0, v1, Address(__ pre(s, 4 * unit))); 821 __ stpq(v0, v1, Address(__ pre(d, 4 * unit))); 822 } else { 823 __ ldp(t0, t1, Address(s, 2 * unit)); 824 __ ldp(t2, t3, Address(__ pre(s, 4 * unit))); 825 __ stp(t0, t1, Address(d, 2 * unit)); 826 __ stp(t2, t3, Address(__ pre(d, 4 * unit))); 827 } 828 __ bind(L1); 829 830 if (direction == copy_forwards) { 831 __ add(s, s, bias); 832 __ add(d, d, bias); 833 } 834 835 __ tbz(count, 1, L2); 836 __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards))); 837 __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards))); 838 __ bind(L2); 839 } 840 841 __ ret(lr); 842 843 if (AvoidUnalignedAccesses) { 844 Label drain, again; 845 // Register order for storing. Order is different for backward copy. 846 847 __ bind(unaligned_copy_long); 848 849 // source address is even aligned, target odd aligned 850 // 851 // when forward copying word pairs we read long pairs at offsets 852 // {0, 2, 4, 6} (in long words). when backwards copying we read 853 // long pairs at offsets {-2, -4, -6, -8}. We adjust the source 854 // address by -2 in the forwards case so we can compute the 855 // source offsets for both as {2, 4, 6, 8} * unit where unit = 1 856 // or -1. 857 // 858 // when forward copying we need to store 1 word, 3 pairs and 859 // then 1 word at offsets {0, 1, 3, 5, 7}. Rather thna use a 860 // zero offset We adjust the destination by -1 which means we 861 // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores. 862 // 863 // When backwards copyng we need to store 1 word, 3 pairs and 864 // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use 865 // offsets {1, 3, 5, 7, 8} * unit. 
866 867 if (direction == copy_forwards) { 868 __ sub(s, s, 16); 869 __ sub(d, d, 8); 870 } 871 872 // Fill 8 registers 873 // 874 // for forwards copy s was offset by -16 from the original input 875 // value of s so the register contents are at these offsets 876 // relative to the 64 bit block addressed by that original input 877 // and so on for each successive 64 byte block when s is updated 878 // 879 // t0 at offset 0, t1 at offset 8 880 // t2 at offset 16, t3 at offset 24 881 // t4 at offset 32, t5 at offset 40 882 // t6 at offset 48, t7 at offset 56 883 884 // for backwards copy s was not offset so the register contents 885 // are at these offsets into the preceding 64 byte block 886 // relative to that original input and so on for each successive 887 // preceding 64 byte block when s is updated. this explains the 888 // slightly counter-intuitive looking pattern of register usage 889 // in the stp instructions for backwards copy. 890 // 891 // t0 at offset -16, t1 at offset -8 892 // t2 at offset -32, t3 at offset -24 893 // t4 at offset -48, t5 at offset -40 894 // t6 at offset -64, t7 at offset -56 895 896 __ ldp(t0, t1, Address(s, 2 * unit)); 897 __ ldp(t2, t3, Address(s, 4 * unit)); 898 __ ldp(t4, t5, Address(s, 6 * unit)); 899 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 900 901 __ subs(count, count, 16); 902 __ br(Assembler::LO, drain); 903 904 int prefetch = PrefetchCopyIntervalInBytes; 905 bool use_stride = false; 906 if (direction == copy_backwards) { 907 use_stride = prefetch > 256; 908 prefetch = -prefetch; 909 if (use_stride) __ mov(stride, prefetch); 910 } 911 912 __ bind(again); 913 914 if (PrefetchCopyIntervalInBytes > 0) 915 __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP); 916 917 if (direction == copy_forwards) { 918 // allowing for the offset of -8 the store instructions place 919 // registers into the target 64 bit block at the following 920 // offsets 921 // 922 // t0 at offset 0 923 // t1 at offset 8, t2 at offset 16 924 // t3 at offset 24, t4 at offset 32 925 // t5 at offset 40, t6 at offset 48 926 // t7 at offset 56 927 928 __ str(t0, Address(d, 1 * unit)); 929 __ stp(t1, t2, Address(d, 2 * unit)); 930 __ ldp(t0, t1, Address(s, 2 * unit)); 931 __ stp(t3, t4, Address(d, 4 * unit)); 932 __ ldp(t2, t3, Address(s, 4 * unit)); 933 __ stp(t5, t6, Address(d, 6 * unit)); 934 __ ldp(t4, t5, Address(s, 6 * unit)); 935 __ str(t7, Address(__ pre(d, 8 * unit))); 936 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 937 } else { 938 // d was not offset when we started so the registers are 939 // written into the 64 bit block preceding d with the following 940 // offsets 941 // 942 // t1 at offset -8 943 // t3 at offset -24, t0 at offset -16 944 // t5 at offset -48, t2 at offset -32 945 // t7 at offset -56, t4 at offset -48 946 // t6 at offset -64 947 // 948 // note that this matches the offsets previously noted for the 949 // loads 950 951 __ str(t1, Address(d, 1 * unit)); 952 __ stp(t3, t0, Address(d, 3 * unit)); 953 __ ldp(t0, t1, Address(s, 2 * unit)); 954 __ stp(t5, t2, Address(d, 5 * unit)); 955 __ ldp(t2, t3, Address(s, 4 * unit)); 956 __ stp(t7, t4, Address(d, 7 * unit)); 957 __ ldp(t4, t5, Address(s, 6 * unit)); 958 __ str(t6, Address(__ pre(d, 8 * unit))); 959 __ ldp(t6, t7, Address(__ pre(s, 8 * unit))); 960 } 961 962 __ subs(count, count, 8); 963 __ br(Assembler::HS, again); 964 965 // Drain 966 // 967 // this uses the same pattern of offsets and register arguments 968 // as above 969 __ bind(drain); 970 if (direction == copy_forwards) { 
971 __ str(t0, Address(d, 1 * unit)); 972 __ stp(t1, t2, Address(d, 2 * unit)); 973 __ stp(t3, t4, Address(d, 4 * unit)); 974 __ stp(t5, t6, Address(d, 6 * unit)); 975 __ str(t7, Address(__ pre(d, 8 * unit))); 976 } else { 977 __ str(t1, Address(d, 1 * unit)); 978 __ stp(t3, t0, Address(d, 3 * unit)); 979 __ stp(t5, t2, Address(d, 5 * unit)); 980 __ stp(t7, t4, Address(d, 7 * unit)); 981 __ str(t6, Address(__ pre(d, 8 * unit))); 982 } 983 // now we need to copy any remaining part block which may 984 // include a 4 word block subblock and/or a 2 word subblock. 985 // bits 2 and 1 in the count are the tell-tale for whetehr we 986 // have each such subblock 987 { 988 Label L1, L2; 989 __ tbz(count, exact_log2(4), L1); 990 // this is the same as above but copying only 4 longs hence 991 // with ony one intervening stp between the str instructions 992 // but note that the offsets and registers still follow the 993 // same pattern 994 __ ldp(t0, t1, Address(s, 2 * unit)); 995 __ ldp(t2, t3, Address(__ pre(s, 4 * unit))); 996 if (direction == copy_forwards) { 997 __ str(t0, Address(d, 1 * unit)); 998 __ stp(t1, t2, Address(d, 2 * unit)); 999 __ str(t3, Address(__ pre(d, 4 * unit))); 1000 } else { 1001 __ str(t1, Address(d, 1 * unit)); 1002 __ stp(t3, t0, Address(d, 3 * unit)); 1003 __ str(t2, Address(__ pre(d, 4 * unit))); 1004 } 1005 __ bind(L1); 1006 1007 __ tbz(count, 1, L2); 1008 // this is the same as above but copying only 2 longs hence 1009 // there is no intervening stp between the str instructions 1010 // but note that the offset and register patterns are still 1011 // the same 1012 __ ldp(t0, t1, Address(__ pre(s, 2 * unit))); 1013 if (direction == copy_forwards) { 1014 __ str(t0, Address(d, 1 * unit)); 1015 __ str(t1, Address(__ pre(d, 2 * unit))); 1016 } else { 1017 __ str(t1, Address(d, 1 * unit)); 1018 __ str(t0, Address(__ pre(d, 2 * unit))); 1019 } 1020 __ bind(L2); 1021 1022 // for forwards copy we need to re-adjust the offsets we 1023 // applied so that s and d are follow the last words written 1024 1025 if (direction == copy_forwards) { 1026 __ add(s, s, 16); 1027 __ add(d, d, 8); 1028 } 1029 1030 } 1031 1032 __ ret(lr); 1033 } 1034 } 1035 1036 // Small copy: less than 16 bytes. 1037 // 1038 // NB: Ignores all of the bits of count which represent more than 15 1039 // bytes, so a caller doesn't have to mask them. 1040 1041 void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) { 1042 bool is_backwards = step < 0; 1043 size_t granularity = uabs(step); 1044 int direction = is_backwards ? -1 : 1; 1045 int unit = wordSize * direction; 1046 1047 Label Lword, Lint, Lshort, Lbyte; 1048 1049 assert(granularity 1050 && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small"); 1051 1052 const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6; 1053 1054 // ??? I don't know if this bit-test-and-branch is the right thing 1055 // to do. It does a lot of jumping, resulting in several 1056 // mispredicted branches. It might make more sense to do this 1057 // with something like Duff's device with a single computed branch. 
1058 1059 __ tbz(count, 3 - exact_log2(granularity), Lword); 1060 __ ldr(tmp, Address(__ adjust(s, unit, is_backwards))); 1061 __ str(tmp, Address(__ adjust(d, unit, is_backwards))); 1062 __ bind(Lword); 1063 1064 if (granularity <= sizeof (jint)) { 1065 __ tbz(count, 2 - exact_log2(granularity), Lint); 1066 __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards))); 1067 __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards))); 1068 __ bind(Lint); 1069 } 1070 1071 if (granularity <= sizeof (jshort)) { 1072 __ tbz(count, 1 - exact_log2(granularity), Lshort); 1073 __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards))); 1074 __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards))); 1075 __ bind(Lshort); 1076 } 1077 1078 if (granularity <= sizeof (jbyte)) { 1079 __ tbz(count, 0, Lbyte); 1080 __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards))); 1081 __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards))); 1082 __ bind(Lbyte); 1083 } 1084 } 1085 1086 Label copy_f, copy_b; 1087 1088 // All-singing all-dancing memory copy. 1089 // 1090 // Copy count units of memory from s to d. The size of a unit is 1091 // step, which can be positive or negative depending on the direction 1092 // of copy. If is_aligned is false, we align the source address. 1093 // 1094 1095 void copy_memory(bool is_aligned, Register s, Register d, 1096 Register count, Register tmp, int step) { 1097 copy_direction direction = step < 0 ? copy_backwards : copy_forwards; 1098 bool is_backwards = step < 0; 1099 int granularity = uabs(step); 1100 const Register t0 = r3, t1 = r4; 1101 1102 // <= 96 bytes do inline. Direction doesn't matter because we always 1103 // load all the data before writing anything 1104 Label copy4, copy8, copy16, copy32, copy80, copy_big, finish; 1105 const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8; 1106 const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12; 1107 const Register send = r17, dend = r18; 1108 1109 if (PrefetchCopyIntervalInBytes > 0) 1110 __ prfm(Address(s, 0), PLDL1KEEP); 1111 __ cmp(count, u1((UseSIMDForMemoryOps ? 
96:80)/granularity)); 1112 __ br(Assembler::HI, copy_big); 1113 1114 __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity)))); 1115 __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity)))); 1116 1117 __ cmp(count, u1(16/granularity)); 1118 __ br(Assembler::LS, copy16); 1119 1120 __ cmp(count, u1(64/granularity)); 1121 __ br(Assembler::HI, copy80); 1122 1123 __ cmp(count, u1(32/granularity)); 1124 __ br(Assembler::LS, copy32); 1125 1126 // 33..64 bytes 1127 if (UseSIMDForMemoryOps) { 1128 __ ldpq(v0, v1, Address(s, 0)); 1129 __ ldpq(v2, v3, Address(send, -32)); 1130 __ stpq(v0, v1, Address(d, 0)); 1131 __ stpq(v2, v3, Address(dend, -32)); 1132 } else { 1133 __ ldp(t0, t1, Address(s, 0)); 1134 __ ldp(t2, t3, Address(s, 16)); 1135 __ ldp(t4, t5, Address(send, -32)); 1136 __ ldp(t6, t7, Address(send, -16)); 1137 1138 __ stp(t0, t1, Address(d, 0)); 1139 __ stp(t2, t3, Address(d, 16)); 1140 __ stp(t4, t5, Address(dend, -32)); 1141 __ stp(t6, t7, Address(dend, -16)); 1142 } 1143 __ b(finish); 1144 1145 // 17..32 bytes 1146 __ bind(copy32); 1147 __ ldp(t0, t1, Address(s, 0)); 1148 __ ldp(t2, t3, Address(send, -16)); 1149 __ stp(t0, t1, Address(d, 0)); 1150 __ stp(t2, t3, Address(dend, -16)); 1151 __ b(finish); 1152 1153 // 65..80/96 bytes 1154 // (96 bytes if SIMD because we do 32 byes per instruction) 1155 __ bind(copy80); 1156 if (UseSIMDForMemoryOps) { 1157 __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0)); 1158 __ ldpq(v4, v5, Address(send, -32)); 1159 __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0)); 1160 __ stpq(v4, v5, Address(dend, -32)); 1161 } else { 1162 __ ldp(t0, t1, Address(s, 0)); 1163 __ ldp(t2, t3, Address(s, 16)); 1164 __ ldp(t4, t5, Address(s, 32)); 1165 __ ldp(t6, t7, Address(s, 48)); 1166 __ ldp(t8, t9, Address(send, -16)); 1167 1168 __ stp(t0, t1, Address(d, 0)); 1169 __ stp(t2, t3, Address(d, 16)); 1170 __ stp(t4, t5, Address(d, 32)); 1171 __ stp(t6, t7, Address(d, 48)); 1172 __ stp(t8, t9, Address(dend, -16)); 1173 } 1174 __ b(finish); 1175 1176 // 0..16 bytes 1177 __ bind(copy16); 1178 __ cmp(count, u1(8/granularity)); 1179 __ br(Assembler::LO, copy8); 1180 1181 // 8..16 bytes 1182 __ ldr(t0, Address(s, 0)); 1183 __ ldr(t1, Address(send, -8)); 1184 __ str(t0, Address(d, 0)); 1185 __ str(t1, Address(dend, -8)); 1186 __ b(finish); 1187 1188 if (granularity < 8) { 1189 // 4..7 bytes 1190 __ bind(copy8); 1191 __ tbz(count, 2 - exact_log2(granularity), copy4); 1192 __ ldrw(t0, Address(s, 0)); 1193 __ ldrw(t1, Address(send, -4)); 1194 __ strw(t0, Address(d, 0)); 1195 __ strw(t1, Address(dend, -4)); 1196 __ b(finish); 1197 if (granularity < 4) { 1198 // 0..3 bytes 1199 __ bind(copy4); 1200 __ cbz(count, finish); // get rid of 0 case 1201 if (granularity == 2) { 1202 __ ldrh(t0, Address(s, 0)); 1203 __ strh(t0, Address(d, 0)); 1204 } else { // granularity == 1 1205 // Now 1..3 bytes. Handle the 1 and 2 byte case by copying 1206 // the first and last byte. 1207 // Handle the 3 byte case by loading and storing base + count/2 1208 // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1)) 1209 // This does means in the 1 byte case we load/store the same 1210 // byte 3 times. 
1211 __ lsr(count, count, 1); 1212 __ ldrb(t0, Address(s, 0)); 1213 __ ldrb(t1, Address(send, -1)); 1214 __ ldrb(t2, Address(s, count)); 1215 __ strb(t0, Address(d, 0)); 1216 __ strb(t1, Address(dend, -1)); 1217 __ strb(t2, Address(d, count)); 1218 } 1219 __ b(finish); 1220 } 1221 } 1222 1223 __ bind(copy_big); 1224 if (is_backwards) { 1225 __ lea(s, Address(s, count, Address::lsl(exact_log2(-step)))); 1226 __ lea(d, Address(d, count, Address::lsl(exact_log2(-step)))); 1227 } 1228 1229 // Now we've got the small case out of the way we can align the 1230 // source address on a 2-word boundary. 1231 1232 Label aligned; 1233 1234 if (is_aligned) { 1235 // We may have to adjust by 1 word to get s 2-word-aligned. 1236 __ tbz(s, exact_log2(wordSize), aligned); 1237 __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards))); 1238 __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards))); 1239 __ sub(count, count, wordSize/granularity); 1240 } else { 1241 if (is_backwards) { 1242 __ andr(rscratch2, s, 2 * wordSize - 1); 1243 } else { 1244 __ neg(rscratch2, s); 1245 __ andr(rscratch2, rscratch2, 2 * wordSize - 1); 1246 } 1247 // rscratch2 is the byte adjustment needed to align s. 1248 __ cbz(rscratch2, aligned); 1249 int shift = exact_log2(granularity); 1250 if (shift) __ lsr(rscratch2, rscratch2, shift); 1251 __ sub(count, count, rscratch2); 1252 1253 #if 0 1254 // ?? This code is only correct for a disjoint copy. It may or 1255 // may not make sense to use it in that case. 1256 1257 // Copy the first pair; s and d may not be aligned. 1258 __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0)); 1259 __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0)); 1260 1261 // Align s and d, adjust count 1262 if (is_backwards) { 1263 __ sub(s, s, rscratch2); 1264 __ sub(d, d, rscratch2); 1265 } else { 1266 __ add(s, s, rscratch2); 1267 __ add(d, d, rscratch2); 1268 } 1269 #else 1270 copy_memory_small(s, d, rscratch2, rscratch1, step); 1271 #endif 1272 } 1273 1274 __ bind(aligned); 1275 1276 // s is now 2-word-aligned. 1277 1278 // We have a count of units and some trailing bytes. Adjust the 1279 // count and do a bulk copy of words. 1280 __ lsr(rscratch2, count, exact_log2(wordSize/granularity)); 1281 if (direction == copy_forwards) 1282 __ bl(copy_f); 1283 else 1284 __ bl(copy_b); 1285 1286 // And the tail. 1287 copy_memory_small(s, d, count, tmp, step); 1288 1289 if (granularity >= 8) __ bind(copy8); 1290 if (granularity >= 4) __ bind(copy4); 1291 __ bind(finish); 1292 } 1293 1294 1295 void clobber_registers() { 1296 #ifdef ASSERT 1297 __ mov(rscratch1, (uint64_t)0xdeadbeef); 1298 __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32); 1299 for (Register r = r3; r <= r18; r++) 1300 if (r != rscratch1) __ mov(r, rscratch1); 1301 #endif 1302 } 1303 1304 // Scan over array at a for count oops, verifying each one. 1305 // Preserves a and count, clobbers rscratch1 and rscratch2. 
1306 void verify_oop_array (size_t size, Register a, Register count, Register temp) { 1307 Label loop, end; 1308 __ mov(rscratch1, a); 1309 __ mov(rscratch2, zr); 1310 __ bind(loop); 1311 __ cmp(rscratch2, count); 1312 __ br(Assembler::HS, end); 1313 if (size == (size_t)wordSize) { 1314 __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1315 __ verify_oop(temp); 1316 } else { 1317 __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size)))); 1318 __ decode_heap_oop(temp); // calls verify_oop 1319 } 1320 __ add(rscratch2, rscratch2, size); 1321 __ b(loop); 1322 __ bind(end); 1323 } 1324 1325 // Arguments: 1326 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1327 // ignored 1328 // is_oop - true => oop array, so generate store check code 1329 // name - stub name string 1330 // 1331 // Inputs: 1332 // c_rarg0 - source array address 1333 // c_rarg1 - destination array address 1334 // c_rarg2 - element count, treated as ssize_t, can be zero 1335 // 1336 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1337 // the hardware handle it. The two dwords within qwords that span 1338 // cache line boundaries will still be loaded and stored atomicly. 1339 // 1340 // Side Effects: 1341 // disjoint_int_copy_entry is set to the no-overlap entry point 1342 // used by generate_conjoint_int_oop_copy(). 1343 // 1344 address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry, 1345 const char *name, bool dest_uninitialized = false) { 1346 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1347 RegSet saved_reg = RegSet::of(s, d, count); 1348 __ align(CodeEntryAlignment); 1349 StubCodeMark mark(this, "StubRoutines", name); 1350 address start = __ pc(); 1351 __ enter(); 1352 1353 if (entry != NULL) { 1354 *entry = __ pc(); 1355 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1356 BLOCK_COMMENT("Entry:"); 1357 } 1358 1359 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT; 1360 if (dest_uninitialized) { 1361 decorators |= IS_DEST_UNINITIALIZED; 1362 } 1363 if (aligned) { 1364 decorators |= ARRAYCOPY_ALIGNED; 1365 } 1366 1367 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1368 bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_reg); 1369 1370 if (is_oop) { 1371 // save regs before copy_memory 1372 __ push(RegSet::of(d, count), sp); 1373 } 1374 copy_memory(aligned, s, d, count, rscratch1, size); 1375 1376 if (is_oop) { 1377 __ pop(RegSet::of(d, count), sp); 1378 if (VerifyOops) 1379 verify_oop_array(size, d, count, r16); 1380 __ sub(count, count, 1); // make an inclusive end pointer 1381 __ lea(count, Address(d, count, Address::lsl(exact_log2(size)))); 1382 } 1383 1384 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1385 1386 __ leave(); 1387 __ mov(r0, zr); // return 0 1388 __ ret(lr); 1389 #ifdef BUILTIN_SIM 1390 { 1391 AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck); 1392 sim->notifyCompile(const_cast<char*>(name), start); 1393 } 1394 #endif 1395 return start; 1396 } 1397 1398 // Arguments: 1399 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1400 // ignored 1401 // is_oop - true => oop array, so generate store check code 1402 // name - stub name string 1403 // 1404 // Inputs: 1405 // c_rarg0 - source array address 1406 // c_rarg1 - destination array address 1407 // c_rarg2 - element count, treated as ssize_t, can be zero 1408 // 
1409 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1410 // the hardware handle it. The two dwords within qwords that span 1411 // cache line boundaries will still be loaded and stored atomicly. 1412 // 1413 address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target, 1414 address *entry, const char *name, 1415 bool dest_uninitialized = false) { 1416 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1417 RegSet saved_regs = RegSet::of(s, d, count); 1418 StubCodeMark mark(this, "StubRoutines", name); 1419 address start = __ pc(); 1420 __ enter(); 1421 1422 if (entry != NULL) { 1423 *entry = __ pc(); 1424 // caller can pass a 64-bit byte count here (from Unsafe.copyMemory) 1425 BLOCK_COMMENT("Entry:"); 1426 } 1427 1428 // use fwd copy when (d-s) above_equal (count*size) 1429 __ sub(rscratch1, d, s); 1430 __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size)); 1431 __ br(Assembler::HS, nooverlap_target); 1432 1433 DecoratorSet decorators = IN_HEAP | IS_ARRAY; 1434 if (dest_uninitialized) { 1435 decorators |= IS_DEST_UNINITIALIZED; 1436 } 1437 if (aligned) { 1438 decorators |= ARRAYCOPY_ALIGNED; 1439 } 1440 1441 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1442 bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_regs); 1443 1444 if (is_oop) { 1445 // save regs before copy_memory 1446 __ push(RegSet::of(d, count), sp); 1447 } 1448 copy_memory(aligned, s, d, count, rscratch1, -size); 1449 if (is_oop) { 1450 __ pop(RegSet::of(d, count), sp); 1451 if (VerifyOops) 1452 verify_oop_array(size, d, count, r16); 1453 __ sub(count, count, 1); // make an inclusive end pointer 1454 __ lea(count, Address(d, count, Address::lsl(exact_log2(size)))); 1455 } 1456 bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet()); 1457 __ leave(); 1458 __ mov(r0, zr); // return 0 1459 __ ret(lr); 1460 #ifdef BUILTIN_SIM 1461 { 1462 AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck); 1463 sim->notifyCompile(const_cast<char*>(name), start); 1464 } 1465 #endif 1466 return start; 1467 } 1468 1469 // Arguments: 1470 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1471 // ignored 1472 // name - stub name string 1473 // 1474 // Inputs: 1475 // c_rarg0 - source array address 1476 // c_rarg1 - destination array address 1477 // c_rarg2 - element count, treated as ssize_t, can be zero 1478 // 1479 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1480 // we let the hardware handle it. The one to eight bytes within words, 1481 // dwords or qwords that span cache line boundaries will still be loaded 1482 // and stored atomically. 1483 // 1484 // Side Effects: 1485 // disjoint_byte_copy_entry is set to the no-overlap entry point // 1486 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1487 // we let the hardware handle it. The one to eight bytes within words, 1488 // dwords or qwords that span cache line boundaries will still be loaded 1489 // and stored atomically. 1490 // 1491 // Side Effects: 1492 // disjoint_byte_copy_entry is set to the no-overlap entry point 1493 // used by generate_conjoint_byte_copy(). 
1494 // 1495 address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) { 1496 const bool not_oop = false; 1497 return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name); 1498 } 1499 1500 // Arguments: 1501 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1502 // ignored 1503 // name - stub name string 1504 // 1505 // Inputs: 1506 // c_rarg0 - source array address 1507 // c_rarg1 - destination array address 1508 // c_rarg2 - element count, treated as ssize_t, can be zero 1509 // 1510 // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries, 1511 // we let the hardware handle it. The one to eight bytes within words, 1512 // dwords or qwords that span cache line boundaries will still be loaded 1513 // and stored atomically. 1514 // 1515 address generate_conjoint_byte_copy(bool aligned, address nooverlap_target, 1516 address* entry, const char *name) { 1517 const bool not_oop = false; 1518 return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name); 1519 } 1520 1521 // Arguments: 1522 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1523 // ignored 1524 // name - stub name string 1525 // 1526 // Inputs: 1527 // c_rarg0 - source array address 1528 // c_rarg1 - destination array address 1529 // c_rarg2 - element count, treated as ssize_t, can be zero 1530 // 1531 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1532 // let the hardware handle it. The two or four words within dwords 1533 // or qwords that span cache line boundaries will still be loaded 1534 // and stored atomically. 1535 // 1536 // Side Effects: 1537 // disjoint_short_copy_entry is set to the no-overlap entry point 1538 // used by generate_conjoint_short_copy(). 1539 // 1540 address generate_disjoint_short_copy(bool aligned, 1541 address* entry, const char *name) { 1542 const bool not_oop = false; 1543 return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name); 1544 } 1545 1546 // Arguments: 1547 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1548 // ignored 1549 // name - stub name string 1550 // 1551 // Inputs: 1552 // c_rarg0 - source array address 1553 // c_rarg1 - destination array address 1554 // c_rarg2 - element count, treated as ssize_t, can be zero 1555 // 1556 // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we 1557 // let the hardware handle it. The two or four words within dwords 1558 // or qwords that span cache line boundaries will still be loaded 1559 // and stored atomically. 1560 // 1561 address generate_conjoint_short_copy(bool aligned, address nooverlap_target, 1562 address *entry, const char *name) { 1563 const bool not_oop = false; 1564 return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name); 1565 1566 } 1567 // Arguments: 1568 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1569 // ignored 1570 // name - stub name string 1571 // 1572 // Inputs: 1573 // c_rarg0 - source array address 1574 // c_rarg1 - destination array address 1575 // c_rarg2 - element count, treated as ssize_t, can be zero 1576 // 1577 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1578 // the hardware handle it. The two dwords within qwords that span 1579 // cache line boundaries will still be loaded and stored atomicly. 
1580 // 1581 // Side Effects: 1582 // disjoint_int_copy_entry is set to the no-overlap entry point 1583 // used by generate_conjoint_int_oop_copy(). 1584 // 1585 address generate_disjoint_int_copy(bool aligned, address *entry, 1586 const char *name, bool dest_uninitialized = false) { 1587 const bool not_oop = false; 1588 return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name); 1589 } 1590 1591 // Arguments: 1592 // aligned - true => Input and output aligned on a HeapWord == 8-byte boundary 1593 // ignored 1594 // name - stub name string 1595 // 1596 // Inputs: 1597 // c_rarg0 - source array address 1598 // c_rarg1 - destination array address 1599 // c_rarg2 - element count, treated as ssize_t, can be zero 1600 // 1601 // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let 1602 // the hardware handle it. The two dwords within qwords that span 1603 // cache line boundaries will still be loaded and stored atomicly. 1604 // 1605 address generate_conjoint_int_copy(bool aligned, address nooverlap_target, 1606 address *entry, const char *name, 1607 bool dest_uninitialized = false) { 1608 const bool not_oop = false; 1609 return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name); 1610 } 1611 1612 1613 // Arguments: 1614 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1615 // ignored 1616 // name - stub name string 1617 // 1618 // Inputs: 1619 // c_rarg0 - source array address 1620 // c_rarg1 - destination array address 1621 // c_rarg2 - element count, treated as size_t, can be zero 1622 // 1623 // Side Effects: 1624 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1625 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1626 // 1627 address generate_disjoint_long_copy(bool aligned, address *entry, 1628 const char *name, bool dest_uninitialized = false) { 1629 const bool not_oop = false; 1630 return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name); 1631 } 1632 1633 // Arguments: 1634 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1635 // ignored 1636 // name - stub name string 1637 // 1638 // Inputs: 1639 // c_rarg0 - source array address 1640 // c_rarg1 - destination array address 1641 // c_rarg2 - element count, treated as size_t, can be zero 1642 // 1643 address generate_conjoint_long_copy(bool aligned, 1644 address nooverlap_target, address *entry, 1645 const char *name, bool dest_uninitialized = false) { 1646 const bool not_oop = false; 1647 return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name); 1648 } 1649 1650 // Arguments: 1651 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1652 // ignored 1653 // name - stub name string 1654 // 1655 // Inputs: 1656 // c_rarg0 - source array address 1657 // c_rarg1 - destination array address 1658 // c_rarg2 - element count, treated as size_t, can be zero 1659 // 1660 // Side Effects: 1661 // disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the 1662 // no-overlap entry point used by generate_conjoint_long_oop_copy(). 1663 // 1664 address generate_disjoint_oop_copy(bool aligned, address *entry, 1665 const char *name, bool dest_uninitialized) { 1666 const bool is_oop = true; 1667 const size_t size = UseCompressedOops ? 
sizeof (jint) : sizeof (jlong); 1668 return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized); 1669 } 1670 1671 // Arguments: 1672 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1673 // ignored 1674 // name - stub name string 1675 // 1676 // Inputs: 1677 // c_rarg0 - source array address 1678 // c_rarg1 - destination array address 1679 // c_rarg2 - element count, treated as size_t, can be zero 1680 // 1681 address generate_conjoint_oop_copy(bool aligned, 1682 address nooverlap_target, address *entry, 1683 const char *name, bool dest_uninitialized) { 1684 const bool is_oop = true; 1685 const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1686 return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry, 1687 name, dest_uninitialized); 1688 } 1689 1690 1691 // Helper for generating a dynamic type check. 1692 // Smashes rscratch1, rscratch2. 1693 void generate_type_check(Register sub_klass, 1694 Register super_check_offset, 1695 Register super_klass, 1696 Label& L_success) { 1697 assert_different_registers(sub_klass, super_check_offset, super_klass); 1698 1699 BLOCK_COMMENT("type_check:"); 1700 1701 Label L_miss; 1702 1703 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL, 1704 super_check_offset); 1705 __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL); 1706 1707 // Fall through on failure! 1708 __ BIND(L_miss); 1709 } 1710 1711 // 1712 // Generate checkcasting array copy stub 1713 // 1714 // Input: 1715 // c_rarg0 - source array address 1716 // c_rarg1 - destination array address 1717 // c_rarg2 - element count, treated as ssize_t, can be zero 1718 // c_rarg3 - size_t ckoff (super_check_offset) 1719 // c_rarg4 - oop ckval (super_klass) 1720 // 1721 // Output: 1722 // r0 == 0 - success 1723 // r0 == -1^K - failure, where K is partial transfer count 1724 // 1725 address generate_checkcast_copy(const char *name, address *entry, 1726 bool dest_uninitialized = false) { 1727 1728 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop; 1729 1730 // Input registers (after setup_arg_regs) 1731 const Register from = c_rarg0; // source array address 1732 const Register to = c_rarg1; // destination array address 1733 const Register count = c_rarg2; // elementscount 1734 const Register ckoff = c_rarg3; // super_check_offset 1735 const Register ckval = c_rarg4; // super_klass 1736 1737 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4); 1738 RegSet wb_post_saved_regs = RegSet::of(count); 1739 1740 // Registers used as temps (r18, r19, r20 are save-on-entry) 1741 const Register count_save = r21; // orig elementscount 1742 const Register start_to = r20; // destination array start address 1743 const Register copied_oop = r18; // actual oop copied 1744 const Register r19_klass = r19; // oop._klass 1745 1746 //--------------------------------------------------------------- 1747 // Assembler stub will be used for this call to arraycopy 1748 // if the two arrays are subtypes of Object[] but the 1749 // destination array type is not equal to or a supertype 1750 // of the source type. Each element must be separately 1751 // checked. 
1752 1753 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1754 copied_oop, r19_klass, count_save); 1755 1756 __ align(CodeEntryAlignment); 1757 StubCodeMark mark(this, "StubRoutines", name); 1758 address start = __ pc(); 1759 1760 __ enter(); // required for proper stackwalking of RuntimeStub frame 1761 1762 #ifdef ASSERT 1763 // caller guarantees that the arrays really are different 1764 // otherwise, we would have to make conjoint checks 1765 { Label L; 1766 array_overlap_test(L, TIMES_OOP); 1767 __ stop("checkcast_copy within a single array"); 1768 __ bind(L); 1769 } 1770 #endif //ASSERT 1771 1772 // Caller of this entry point must set up the argument registers. 1773 if (entry != NULL) { 1774 *entry = __ pc(); 1775 BLOCK_COMMENT("Entry:"); 1776 } 1777 1778 // Empty array: Nothing to do. 1779 __ cbz(count, L_done); 1780 1781 __ push(RegSet::of(r18, r19, r20, r21), sp); 1782 1783 #ifdef ASSERT 1784 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1785 // The ckoff and ckval must be mutually consistent, 1786 // even though caller generates both. 1787 { Label L; 1788 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1789 __ ldrw(start_to, Address(ckval, sco_offset)); 1790 __ cmpw(ckoff, start_to); 1791 __ br(Assembler::EQ, L); 1792 __ stop("super_check_offset inconsistent"); 1793 __ bind(L); 1794 } 1795 #endif //ASSERT 1796 1797 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST; 1798 bool is_oop = true; 1799 if (dest_uninitialized) { 1800 decorators |= IS_DEST_UNINITIALIZED; 1801 } 1802 1803 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1804 bs->arraycopy_prologue(_masm, decorators, is_oop, to, count, wb_pre_saved_regs); 1805 1806 // save the original count 1807 __ mov(count_save, count); 1808 1809 // Copy from low to high addresses 1810 __ mov(start_to, to); // Save destination array start address 1811 __ b(L_load_element); 1812 1813 // ======== begin loop ======== 1814 // (Loop is rotated; its entry is L_load_element.) 1815 // Loop control: 1816 // for (; count != 0; count--) { 1817 // copied_oop = load_heap_oop(from++); 1818 // ... generate_type_check ...; 1819 // store_heap_oop(to++, copied_oop); 1820 // } 1821 __ align(OptoLoopAlignment); 1822 1823 __ BIND(L_store_element); 1824 __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW); // store the oop 1825 __ sub(count, count, 1); 1826 __ cbz(count, L_do_card_marks); 1827 1828 // ======== loop entry is here ======== 1829 __ BIND(L_load_element); 1830 __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop 1831 __ cbz(copied_oop, L_store_element); 1832 1833 __ load_klass(r19_klass, copied_oop);// query the object klass 1834 generate_type_check(r19_klass, ckoff, ckval, L_store_element); 1835 // ======== end loop ======== 1836 1837 // It was a real error; we must depend on the caller to finish the job. 1838 // Register count = remaining oops, count_orig = total oops. 1839 // Emit GC store barriers for the oops we have copied and report 1840 // their number to the caller. 
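    // Note: the "-1^K" in the header comment is bitwise, i.e. r0 = ~K.
    // eon(count, count, zr) computes count ^ ~0, so a failure after
    // copying, say, 3 elements reports r0 = ~3 = -4, and the caller
    // recovers K as ~r0.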
1841 1842 __ subs(count, count_save, count); // K = partially copied oop count 1843 __ eon(count, count, zr); // report (-1^K) to caller 1844 __ br(Assembler::EQ, L_done_pop); 1845 1846 __ BIND(L_do_card_marks); 1847 __ add(to, to, -heapOopSize); // make an inclusive end pointer 1848 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, to, rscratch1, wb_post_saved_regs); 1849 1850 __ bind(L_done_pop); 1851 __ pop(RegSet::of(r18, r19, r20, r21), sp); 1852 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 1853 1854 __ bind(L_done); 1855 __ mov(r0, count); 1856 __ leave(); 1857 __ ret(lr); 1858 1859 return start; 1860 } 1861 1862 // Perform range checks on the proposed arraycopy. 1863 // Kills temp, but nothing else. 1864 // Also, clean the sign bits of src_pos and dst_pos. 1865 void arraycopy_range_checks(Register src, // source array oop (c_rarg0) 1866 Register src_pos, // source position (c_rarg1) 1867 Register dst, // destination array oo (c_rarg2) 1868 Register dst_pos, // destination position (c_rarg3) 1869 Register length, 1870 Register temp, 1871 Label& L_failed) { 1872 BLOCK_COMMENT("arraycopy_range_checks:"); 1873 1874 assert_different_registers(rscratch1, temp); 1875 1876 // if (src_pos + length > arrayOop(src)->length()) FAIL; 1877 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes())); 1878 __ addw(temp, length, src_pos); 1879 __ cmpw(temp, rscratch1); 1880 __ br(Assembler::HI, L_failed); 1881 1882 // if (dst_pos + length > arrayOop(dst)->length()) FAIL; 1883 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes())); 1884 __ addw(temp, length, dst_pos); 1885 __ cmpw(temp, rscratch1); 1886 __ br(Assembler::HI, L_failed); 1887 1888 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'. 1889 __ movw(src_pos, src_pos); 1890 __ movw(dst_pos, dst_pos); 1891 1892 BLOCK_COMMENT("arraycopy_range_checks done"); 1893 } 1894 1895 // These stubs get called from some dumb test routine. 1896 // I'll write them properly when they're called from 1897 // something that's actually doing something. 1898 static void fake_arraycopy_stub(address src, address dst, int count) { 1899 assert(count == 0, "huh?"); 1900 } 1901 1902 1903 // 1904 // Generate 'unsafe' array copy stub 1905 // Though just as safe as the other stubs, it takes an unscaled 1906 // size_t argument instead of an element count. 1907 // 1908 // Input: 1909 // c_rarg0 - source array address 1910 // c_rarg1 - destination array address 1911 // c_rarg2 - byte count, treated as ssize_t, can be zero 1912 // 1913 // Examines the alignment of the operands and dispatches 1914 // to a long, int, short, or byte copy loop. 
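  // Roughly, with bits = (uintptr_t)s | (uintptr_t)d | (uintptr_t)count,
  // the dispatch below is:
  //   (bits & 7) == 0  -> long copy stub,  count >> 3 elements
  //   (bits & 3) == 0  -> int copy stub,   count >> 2 elements
  //   (bits & 1) == 0  -> short copy stub, count >> 1 elements
  //   otherwise        -> byte copy stub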
1915 // 1916 address generate_unsafe_copy(const char *name, 1917 address byte_copy_entry, 1918 address short_copy_entry, 1919 address int_copy_entry, 1920 address long_copy_entry) { 1921 Label L_long_aligned, L_int_aligned, L_short_aligned; 1922 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1923 1924 __ align(CodeEntryAlignment); 1925 StubCodeMark mark(this, "StubRoutines", name); 1926 address start = __ pc(); 1927 __ enter(); // required for proper stackwalking of RuntimeStub frame 1928 1929 // bump this on entry, not on exit: 1930 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 1931 1932 __ orr(rscratch1, s, d); 1933 __ orr(rscratch1, rscratch1, count); 1934 1935 __ andr(rscratch1, rscratch1, BytesPerLong-1); 1936 __ cbz(rscratch1, L_long_aligned); 1937 __ andr(rscratch1, rscratch1, BytesPerInt-1); 1938 __ cbz(rscratch1, L_int_aligned); 1939 __ tbz(rscratch1, 0, L_short_aligned); 1940 __ b(RuntimeAddress(byte_copy_entry)); 1941 1942 __ BIND(L_short_aligned); 1943 __ lsr(count, count, LogBytesPerShort); // size => short_count 1944 __ b(RuntimeAddress(short_copy_entry)); 1945 __ BIND(L_int_aligned); 1946 __ lsr(count, count, LogBytesPerInt); // size => int_count 1947 __ b(RuntimeAddress(int_copy_entry)); 1948 __ BIND(L_long_aligned); 1949 __ lsr(count, count, LogBytesPerLong); // size => long_count 1950 __ b(RuntimeAddress(long_copy_entry)); 1951 1952 return start; 1953 } 1954 1955 // 1956 // Generate generic array copy stubs 1957 // 1958 // Input: 1959 // c_rarg0 - src oop 1960 // c_rarg1 - src_pos (32-bits) 1961 // c_rarg2 - dst oop 1962 // c_rarg3 - dst_pos (32-bits) 1963 // c_rarg4 - element count (32-bits) 1964 // 1965 // Output: 1966 // r0 == 0 - success 1967 // r0 == -1^K - failure, where K is partial transfer count 1968 // 1969 address generate_generic_copy(const char *name, 1970 address byte_copy_entry, address short_copy_entry, 1971 address int_copy_entry, address oop_copy_entry, 1972 address long_copy_entry, address checkcast_copy_entry) { 1973 1974 Label L_failed, L_objArray; 1975 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 1976 1977 // Input registers 1978 const Register src = c_rarg0; // source array oop 1979 const Register src_pos = c_rarg1; // source position 1980 const Register dst = c_rarg2; // destination array oop 1981 const Register dst_pos = c_rarg3; // destination position 1982 const Register length = c_rarg4; 1983 1984 1985 // Registers used as temps 1986 const Register dst_klass = c_rarg5; 1987 1988 __ align(CodeEntryAlignment); 1989 1990 StubCodeMark mark(this, "StubRoutines", name); 1991 1992 address start = __ pc(); 1993 1994 __ enter(); // required for proper stackwalking of RuntimeStub frame 1995 1996 // bump this on entry, not on exit: 1997 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 1998 1999 //----------------------------------------------------------------------- 2000 // Assembler stub will be used for this call to arraycopy 2001 // if the following conditions are met: 2002 // 2003 // (1) src and dst must not be null. 2004 // (2) src_pos must not be negative. 2005 // (3) dst_pos must not be negative. 2006 // (4) length must not be negative. 2007 // (5) src klass and dst klass should be the same and not NULL. 2008 // (6) src and dst should be arrays. 2009 // (7) src_pos + length must not exceed length of src. 2010 // (8) dst_pos + length must not exceed length of dst. 
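 //  The NULL checks below use cbz on the 64-bit oop; the "must not be
 //  negative" checks test bit 31 of the 32-bit argument with tbnz, e.g.
 //
 //    if (src == NULL)      return -1;   // cbz  src, L_failed
 //    if ((int)src_pos < 0) return -1;   // tbnz src_pos, 31, L_failed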
2011 // 2012 2013 // if (src == NULL) return -1; 2014 __ cbz(src, L_failed); 2015 2016 // if (src_pos < 0) return -1; 2017 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 2018 2019 // if (dst == NULL) return -1; 2020 __ cbz(dst, L_failed); 2021 2022 // if (dst_pos < 0) return -1; 2023 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2024 2025 // registers used as temp 2026 const Register scratch_length = r16; // elements count to copy 2027 const Register scratch_src_klass = r17; // array klass 2028 const Register lh = r18; // layout helper 2029 2030 // if (length < 0) return -1; 2031 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2032 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2033 2034 __ load_klass(scratch_src_klass, src); 2035 #ifdef ASSERT 2036 // assert(src->klass() != NULL); 2037 { 2038 BLOCK_COMMENT("assert klasses not null {"); 2039 Label L1, L2; 2040 __ cbnz(scratch_src_klass, L2); // it is broken if klass is NULL 2041 __ bind(L1); 2042 __ stop("broken null klass"); 2043 __ bind(L2); 2044 __ load_klass(rscratch1, dst); 2045 __ cbz(rscratch1, L1); // this would be broken also 2046 BLOCK_COMMENT("} assert klasses not null done"); 2047 } 2048 #endif 2049 2050 // Load layout helper (32-bits) 2051 // 2052 // |array_tag| | header_size | element_type | |log2_element_size| 2053 // 32 30 24 16 8 2 0 2054 // 2055 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2056 // 2057 2058 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2059 2060 // Handle objArrays completely differently... 2061 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2062 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2063 __ movw(rscratch1, objArray_lh); 2064 __ eorw(rscratch2, lh, rscratch1); 2065 __ cbzw(rscratch2, L_objArray); 2066 2067 // if (src->klass() != dst->klass()) return -1; 2068 __ load_klass(rscratch2, dst); 2069 __ eor(rscratch2, rscratch2, scratch_src_klass); 2070 __ cbnz(rscratch2, L_failed); 2071 2072 // if (!src->is_Array()) return -1; 2073 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2074 2075 // At this point, it is known to be a typeArray (array_tag 0x3). 
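    // (A negative lh means "some kind of array"; the only other array tag,
    //  objArray == 0x2, was dispatched to L_objArray above, and every
    //  objArray shares the same layout helper value.)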
2076 #ifdef ASSERT 2077 { 2078 BLOCK_COMMENT("assert primitive array {"); 2079 Label L; 2080 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2081 __ cmpw(lh, rscratch2); 2082 __ br(Assembler::GE, L); 2083 __ stop("must be a primitive array"); 2084 __ bind(L); 2085 BLOCK_COMMENT("} assert primitive array done"); 2086 } 2087 #endif 2088 2089 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2090 rscratch2, L_failed); 2091 2092 // TypeArrayKlass 2093 // 2094 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2095 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2096 // 2097 2098 const Register rscratch1_offset = rscratch1; // array offset 2099 const Register r18_elsize = lh; // element size 2100 2101 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2102 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2103 __ add(src, src, rscratch1_offset); // src array offset 2104 __ add(dst, dst, rscratch1_offset); // dst array offset 2105 BLOCK_COMMENT("choose copy loop based on element size"); 2106 2107 // next registers should be set before the jump to corresponding stub 2108 const Register from = c_rarg0; // source array address 2109 const Register to = c_rarg1; // destination array address 2110 const Register count = c_rarg2; // elements count 2111 2112 // 'from', 'to', 'count' registers should be set in such order 2113 // since they are the same as 'src', 'src_pos', 'dst'. 2114 2115 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2116 2117 // The possible values of elsize are 0-3, i.e. exact_log2(element 2118 // size in bytes). We do a simple bitwise binary search. 2119 __ BIND(L_copy_bytes); 2120 __ tbnz(r18_elsize, 1, L_copy_ints); 2121 __ tbnz(r18_elsize, 0, L_copy_shorts); 2122 __ lea(from, Address(src, src_pos));// src_addr 2123 __ lea(to, Address(dst, dst_pos));// dst_addr 2124 __ movw(count, scratch_length); // length 2125 __ b(RuntimeAddress(byte_copy_entry)); 2126 2127 __ BIND(L_copy_shorts); 2128 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2129 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2130 __ movw(count, scratch_length); // length 2131 __ b(RuntimeAddress(short_copy_entry)); 2132 2133 __ BIND(L_copy_ints); 2134 __ tbnz(r18_elsize, 0, L_copy_longs); 2135 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2136 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2137 __ movw(count, scratch_length); // length 2138 __ b(RuntimeAddress(int_copy_entry)); 2139 2140 __ BIND(L_copy_longs); 2141 #ifdef ASSERT 2142 { 2143 BLOCK_COMMENT("assert long copy {"); 2144 Label L; 2145 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize 2146 __ cmpw(r18_elsize, LogBytesPerLong); 2147 __ br(Assembler::EQ, L); 2148 __ stop("must be long copy, but elsize is wrong"); 2149 __ bind(L); 2150 BLOCK_COMMENT("} assert long copy done"); 2151 } 2152 #endif 2153 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2154 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2155 __ movw(count, scratch_length); // length 2156 __ b(RuntimeAddress(long_copy_entry)); 2157 2158 // ObjArrayKlass 2159 __ BIND(L_objArray); 2160 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2161 2162 Label L_plain_copy, L_checkcast_copy; 2163 // test array classes for subtyping 2164 __ load_klass(r18, dst); 2165 __ cmp(scratch_src_klass, r18); // usual case is exact 
equality 2166 __ br(Assembler::NE, L_checkcast_copy); 2167 2168 // Identically typed arrays can be copied without element-wise checks. 2169 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2170 rscratch2, L_failed); 2171 2172 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2173 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2174 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2175 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2176 __ movw(count, scratch_length); // length 2177 __ BIND(L_plain_copy); 2178 __ b(RuntimeAddress(oop_copy_entry)); 2179 2180 __ BIND(L_checkcast_copy); 2181 // live at this point: scratch_src_klass, scratch_length, r18 (dst_klass) 2182 { 2183 // Before looking at dst.length, make sure dst is also an objArray. 2184 __ ldrw(rscratch1, Address(r18, lh_offset)); 2185 __ movw(rscratch2, objArray_lh); 2186 __ eorw(rscratch1, rscratch1, rscratch2); 2187 __ cbnzw(rscratch1, L_failed); 2188 2189 // It is safe to examine both src.length and dst.length. 2190 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2191 r18, L_failed); 2192 2193 __ load_klass(dst_klass, dst); // reload 2194 2195 // Marshal the base address arguments now, freeing registers. 2196 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2197 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2198 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2199 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2200 __ movw(count, length); // length (reloaded) 2201 Register sco_temp = c_rarg3; // this register is free now 2202 assert_different_registers(from, to, count, sco_temp, 2203 dst_klass, scratch_src_klass); 2204 // assert_clean_int(count, sco_temp); 2205 2206 // Generate the type check. 2207 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2208 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2209 2210 // Smashes rscratch1, rscratch2 2211 generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy); 2212 2213 // Fetch destination element klass from the ObjArrayKlass header. 2214 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2215 __ ldr(dst_klass, Address(dst_klass, ek_offset)); 2216 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2217 2218 // the checkcast_copy loop needs two extra arguments: 2219 assert(c_rarg3 == sco_temp, "#3 already in place"); 2220 // Set up arguments for checkcast_copy_entry. 2221 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass 2222 __ b(RuntimeAddress(checkcast_copy_entry)); 2223 } 2224 2225 __ BIND(L_failed); 2226 __ mov(r0, -1); 2227 __ leave(); // required for proper stackwalking of RuntimeStub frame 2228 __ ret(lr); 2229 2230 return start; 2231 } 2232 2233 // 2234 // Generate stub for array fill. If "aligned" is true, the 2235 // "to" address is assumed to be heapword aligned. 
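  // The fill value is widened to 64 bits by repeated bit-field inserts
  // (e.g. a T_BYTE value of 0xAB becomes 0xABAB, then 0xABABABAB, and
  // finally 0xABABABABABABABAB) so that the main loop can store a
  // doubleword at a time; short arrays are filled element by element.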
2236 // 2237 // Arguments for generated stub: 2238 // to: c_rarg0 2239 // value: c_rarg1 2240 // count: c_rarg2 treated as signed 2241 // 2242 address generate_fill(BasicType t, bool aligned, const char *name) { 2243 __ align(CodeEntryAlignment); 2244 StubCodeMark mark(this, "StubRoutines", name); 2245 address start = __ pc(); 2246 2247 BLOCK_COMMENT("Entry:"); 2248 2249 const Register to = c_rarg0; // source array address 2250 const Register value = c_rarg1; // value 2251 const Register count = c_rarg2; // elements count 2252 2253 const Register bz_base = r10; // base for block_zero routine 2254 const Register cnt_words = r11; // temp register 2255 2256 __ enter(); 2257 2258 Label L_fill_elements, L_exit1; 2259 2260 int shift = -1; 2261 switch (t) { 2262 case T_BYTE: 2263 shift = 0; 2264 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2265 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2266 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2267 __ br(Assembler::LO, L_fill_elements); 2268 break; 2269 case T_SHORT: 2270 shift = 1; 2271 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2272 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2273 __ br(Assembler::LO, L_fill_elements); 2274 break; 2275 case T_INT: 2276 shift = 2; 2277 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2278 __ br(Assembler::LO, L_fill_elements); 2279 break; 2280 default: ShouldNotReachHere(); 2281 } 2282 2283 // Align source address at 8 bytes address boundary. 2284 Label L_skip_align1, L_skip_align2, L_skip_align4; 2285 if (!aligned) { 2286 switch (t) { 2287 case T_BYTE: 2288 // One byte misalignment happens only for byte arrays. 2289 __ tbz(to, 0, L_skip_align1); 2290 __ strb(value, Address(__ post(to, 1))); 2291 __ subw(count, count, 1); 2292 __ bind(L_skip_align1); 2293 // Fallthrough 2294 case T_SHORT: 2295 // Two bytes misalignment happens only for byte and short (char) arrays. 2296 __ tbz(to, 1, L_skip_align2); 2297 __ strh(value, Address(__ post(to, 2))); 2298 __ subw(count, count, 2 >> shift); 2299 __ bind(L_skip_align2); 2300 // Fallthrough 2301 case T_INT: 2302 // Align to 8 bytes, we know we are 4 byte aligned to start. 2303 __ tbz(to, 2, L_skip_align4); 2304 __ strw(value, Address(__ post(to, 4))); 2305 __ subw(count, count, 4 >> shift); 2306 __ bind(L_skip_align4); 2307 break; 2308 default: ShouldNotReachHere(); 2309 } 2310 } 2311 2312 // 2313 // Fill large chunks 2314 // 2315 __ lsrw(cnt_words, count, 3 - shift); // number of words 2316 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2317 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2318 if (UseBlockZeroing) { 2319 Label non_block_zeroing, rest; 2320 // If the fill value is zero we can use the fast zero_words(). 2321 __ cbnz(value, non_block_zeroing); 2322 __ mov(bz_base, to); 2323 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2324 __ zero_words(bz_base, cnt_words); 2325 __ b(rest); 2326 __ bind(non_block_zeroing); 2327 __ fill_words(to, cnt_words, value); 2328 __ bind(rest); 2329 } else { 2330 __ fill_words(to, cnt_words, value); 2331 } 2332 2333 // Remaining count is less than 8 bytes. Fill it by a single store. 2334 // Note that the total length is no less than 8 bytes. 
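    // The single str below is placed so that its 8 bytes end exactly at
    // the last element to fill: it may rewrite a few bytes the loop above
    // already stored, which is harmless because they are rewritten with
    // the same replicated value.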
2335 if (t == T_BYTE || t == T_SHORT) { 2336 Label L_exit1; 2337 __ cbzw(count, L_exit1); 2338 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2339 __ str(value, Address(to, -8)); // overwrite some elements 2340 __ bind(L_exit1); 2341 __ leave(); 2342 __ ret(lr); 2343 } 2344 2345 // Handle copies less than 8 bytes. 2346 Label L_fill_2, L_fill_4, L_exit2; 2347 __ bind(L_fill_elements); 2348 switch (t) { 2349 case T_BYTE: 2350 __ tbz(count, 0, L_fill_2); 2351 __ strb(value, Address(__ post(to, 1))); 2352 __ bind(L_fill_2); 2353 __ tbz(count, 1, L_fill_4); 2354 __ strh(value, Address(__ post(to, 2))); 2355 __ bind(L_fill_4); 2356 __ tbz(count, 2, L_exit2); 2357 __ strw(value, Address(to)); 2358 break; 2359 case T_SHORT: 2360 __ tbz(count, 0, L_fill_4); 2361 __ strh(value, Address(__ post(to, 2))); 2362 __ bind(L_fill_4); 2363 __ tbz(count, 1, L_exit2); 2364 __ strw(value, Address(to)); 2365 break; 2366 case T_INT: 2367 __ cbzw(count, L_exit2); 2368 __ strw(value, Address(to)); 2369 break; 2370 default: ShouldNotReachHere(); 2371 } 2372 __ bind(L_exit2); 2373 __ leave(); 2374 __ ret(lr); 2375 return start; 2376 } 2377 2378 void generate_arraycopy_stubs() { 2379 address entry; 2380 address entry_jbyte_arraycopy; 2381 address entry_jshort_arraycopy; 2382 address entry_jint_arraycopy; 2383 address entry_oop_arraycopy; 2384 address entry_jlong_arraycopy; 2385 address entry_checkcast_arraycopy; 2386 2387 generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards); 2388 generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards); 2389 2390 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2391 2392 //*** jbyte 2393 // Always need aligned and unaligned versions 2394 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2395 "jbyte_disjoint_arraycopy"); 2396 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2397 &entry_jbyte_arraycopy, 2398 "jbyte_arraycopy"); 2399 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2400 "arrayof_jbyte_disjoint_arraycopy"); 2401 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL, 2402 "arrayof_jbyte_arraycopy"); 2403 2404 //*** jshort 2405 // Always need aligned and unaligned versions 2406 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2407 "jshort_disjoint_arraycopy"); 2408 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2409 &entry_jshort_arraycopy, 2410 "jshort_arraycopy"); 2411 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, 2412 "arrayof_jshort_disjoint_arraycopy"); 2413 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL, 2414 "arrayof_jshort_arraycopy"); 2415 2416 //*** jint 2417 // Aligned versions 2418 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2419 "arrayof_jint_disjoint_arraycopy"); 2420 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2421 "arrayof_jint_arraycopy"); 2422 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 
2423 // entry_jint_arraycopy always points to the unaligned version 2424 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2425 "jint_disjoint_arraycopy"); 2426 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2427 &entry_jint_arraycopy, 2428 "jint_arraycopy"); 2429 2430 //*** jlong 2431 // It is always aligned 2432 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2433 "arrayof_jlong_disjoint_arraycopy"); 2434 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2435 "arrayof_jlong_arraycopy"); 2436 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2437 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2438 2439 //*** oops 2440 { 2441 // With compressed oops we need unaligned versions; notice that 2442 // we overwrite entry_oop_arraycopy. 2443 bool aligned = !UseCompressedOops; 2444 2445 StubRoutines::_arrayof_oop_disjoint_arraycopy 2446 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2447 /*dest_uninitialized*/false); 2448 StubRoutines::_arrayof_oop_arraycopy 2449 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2450 /*dest_uninitialized*/false); 2451 // Aligned versions without pre-barriers 2452 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2453 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2454 /*dest_uninitialized*/true); 2455 StubRoutines::_arrayof_oop_arraycopy_uninit 2456 = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit", 2457 /*dest_uninitialized*/true); 2458 } 2459 2460 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2461 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2462 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2463 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2464 2465 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2466 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, 2467 /*dest_uninitialized*/true); 2468 2469 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2470 entry_jbyte_arraycopy, 2471 entry_jshort_arraycopy, 2472 entry_jint_arraycopy, 2473 entry_jlong_arraycopy); 2474 2475 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2476 entry_jbyte_arraycopy, 2477 entry_jshort_arraycopy, 2478 entry_jint_arraycopy, 2479 entry_oop_arraycopy, 2480 entry_jlong_arraycopy, 2481 entry_checkcast_arraycopy); 2482 2483 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 2484 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2485 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2486 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2487 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 2488 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2489 } 2490 2491 void generate_math_stubs() { Unimplemented(); } 2492 2493 // Arguments: 2494 // 2495 // Inputs: 2496 // c_rarg0 - source byte array address 2497 // c_rarg1 - destination 
byte array address 2498 // c_rarg2 - K (key) in little endian int array 2499 // 2500 address generate_aescrypt_encryptBlock() { 2501 __ align(CodeEntryAlignment); 2502 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 2503 2504 Label L_doLast; 2505 2506 const Register from = c_rarg0; // source array address 2507 const Register to = c_rarg1; // destination array address 2508 const Register key = c_rarg2; // key array address 2509 const Register keylen = rscratch1; 2510 2511 address start = __ pc(); 2512 __ enter(); 2513 2514 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2515 2516 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2517 2518 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2519 __ rev32(v1, __ T16B, v1); 2520 __ rev32(v2, __ T16B, v2); 2521 __ rev32(v3, __ T16B, v3); 2522 __ rev32(v4, __ T16B, v4); 2523 __ aese(v0, v1); 2524 __ aesmc(v0, v0); 2525 __ aese(v0, v2); 2526 __ aesmc(v0, v0); 2527 __ aese(v0, v3); 2528 __ aesmc(v0, v0); 2529 __ aese(v0, v4); 2530 __ aesmc(v0, v0); 2531 2532 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2533 __ rev32(v1, __ T16B, v1); 2534 __ rev32(v2, __ T16B, v2); 2535 __ rev32(v3, __ T16B, v3); 2536 __ rev32(v4, __ T16B, v4); 2537 __ aese(v0, v1); 2538 __ aesmc(v0, v0); 2539 __ aese(v0, v2); 2540 __ aesmc(v0, v0); 2541 __ aese(v0, v3); 2542 __ aesmc(v0, v0); 2543 __ aese(v0, v4); 2544 __ aesmc(v0, v0); 2545 2546 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2547 __ rev32(v1, __ T16B, v1); 2548 __ rev32(v2, __ T16B, v2); 2549 2550 __ cmpw(keylen, 44); 2551 __ br(Assembler::EQ, L_doLast); 2552 2553 __ aese(v0, v1); 2554 __ aesmc(v0, v0); 2555 __ aese(v0, v2); 2556 __ aesmc(v0, v0); 2557 2558 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2559 __ rev32(v1, __ T16B, v1); 2560 __ rev32(v2, __ T16B, v2); 2561 2562 __ cmpw(keylen, 52); 2563 __ br(Assembler::EQ, L_doLast); 2564 2565 __ aese(v0, v1); 2566 __ aesmc(v0, v0); 2567 __ aese(v0, v2); 2568 __ aesmc(v0, v0); 2569 2570 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2571 __ rev32(v1, __ T16B, v1); 2572 __ rev32(v2, __ T16B, v2); 2573 2574 __ BIND(L_doLast); 2575 2576 __ aese(v0, v1); 2577 __ aesmc(v0, v0); 2578 __ aese(v0, v2); 2579 2580 __ ld1(v1, __ T16B, key); 2581 __ rev32(v1, __ T16B, v1); 2582 __ eor(v0, __ T16B, v0, v1); 2583 2584 __ st1(v0, __ T16B, to); 2585 2586 __ mov(r0, 0); 2587 2588 __ leave(); 2589 __ ret(lr); 2590 2591 return start; 2592 } 2593 2594 // Arguments: 2595 // 2596 // Inputs: 2597 // c_rarg0 - source byte array address 2598 // c_rarg1 - destination byte array address 2599 // c_rarg2 - K (key) in little endian int array 2600 // 2601 address generate_aescrypt_decryptBlock() { 2602 assert(UseAES, "need AES instructions and misaligned SSE support"); 2603 __ align(CodeEntryAlignment); 2604 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 2605 Label L_doLast; 2606 2607 const Register from = c_rarg0; // source array address 2608 const Register to = c_rarg1; // destination array address 2609 const Register key = c_rarg2; // key array address 2610 const Register keylen = rscratch1; 2611 2612 address start = __ pc(); 2613 __ enter(); // required for proper stackwalking of RuntimeStub frame 2614 2615 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2616 2617 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2618 2619 __ ld1(v5, __ T16B, __ post(key, 16)); 2620 __ rev32(v5, __ T16B, v5); 2621 2622 __ ld1(v1, v2, v3, v4, 
__ T16B, __ post(key, 64)); 2623 __ rev32(v1, __ T16B, v1); 2624 __ rev32(v2, __ T16B, v2); 2625 __ rev32(v3, __ T16B, v3); 2626 __ rev32(v4, __ T16B, v4); 2627 __ aesd(v0, v1); 2628 __ aesimc(v0, v0); 2629 __ aesd(v0, v2); 2630 __ aesimc(v0, v0); 2631 __ aesd(v0, v3); 2632 __ aesimc(v0, v0); 2633 __ aesd(v0, v4); 2634 __ aesimc(v0, v0); 2635 2636 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2637 __ rev32(v1, __ T16B, v1); 2638 __ rev32(v2, __ T16B, v2); 2639 __ rev32(v3, __ T16B, v3); 2640 __ rev32(v4, __ T16B, v4); 2641 __ aesd(v0, v1); 2642 __ aesimc(v0, v0); 2643 __ aesd(v0, v2); 2644 __ aesimc(v0, v0); 2645 __ aesd(v0, v3); 2646 __ aesimc(v0, v0); 2647 __ aesd(v0, v4); 2648 __ aesimc(v0, v0); 2649 2650 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2651 __ rev32(v1, __ T16B, v1); 2652 __ rev32(v2, __ T16B, v2); 2653 2654 __ cmpw(keylen, 44); 2655 __ br(Assembler::EQ, L_doLast); 2656 2657 __ aesd(v0, v1); 2658 __ aesimc(v0, v0); 2659 __ aesd(v0, v2); 2660 __ aesimc(v0, v0); 2661 2662 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2663 __ rev32(v1, __ T16B, v1); 2664 __ rev32(v2, __ T16B, v2); 2665 2666 __ cmpw(keylen, 52); 2667 __ br(Assembler::EQ, L_doLast); 2668 2669 __ aesd(v0, v1); 2670 __ aesimc(v0, v0); 2671 __ aesd(v0, v2); 2672 __ aesimc(v0, v0); 2673 2674 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2675 __ rev32(v1, __ T16B, v1); 2676 __ rev32(v2, __ T16B, v2); 2677 2678 __ BIND(L_doLast); 2679 2680 __ aesd(v0, v1); 2681 __ aesimc(v0, v0); 2682 __ aesd(v0, v2); 2683 2684 __ eor(v0, __ T16B, v0, v5); 2685 2686 __ st1(v0, __ T16B, to); 2687 2688 __ mov(r0, 0); 2689 2690 __ leave(); 2691 __ ret(lr); 2692 2693 return start; 2694 } 2695 2696 // Arguments: 2697 // 2698 // Inputs: 2699 // c_rarg0 - source byte array address 2700 // c_rarg1 - destination byte array address 2701 // c_rarg2 - K (key) in little endian int array 2702 // c_rarg3 - r vector byte array address 2703 // c_rarg4 - input length 2704 // 2705 // Output: 2706 // x0 - input length 2707 // 2708 address generate_cipherBlockChaining_encryptAESCrypt() { 2709 assert(UseAES, "need AES instructions and misaligned SSE support"); 2710 __ align(CodeEntryAlignment); 2711 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 2712 2713 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2714 2715 const Register from = c_rarg0; // source array address 2716 const Register to = c_rarg1; // destination array address 2717 const Register key = c_rarg2; // key array address 2718 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2719 // and left with the results of the last encryption block 2720 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2721 const Register keylen = rscratch1; 2722 2723 address start = __ pc(); 2724 2725 __ enter(); 2726 2727 __ movw(rscratch2, len_reg); 2728 2729 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2730 2731 __ ld1(v0, __ T16B, rvec); 2732 2733 __ cmpw(keylen, 52); 2734 __ br(Assembler::CC, L_loadkeys_44); 2735 __ br(Assembler::EQ, L_loadkeys_52); 2736 2737 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2738 __ rev32(v17, __ T16B, v17); 2739 __ rev32(v18, __ T16B, v18); 2740 __ BIND(L_loadkeys_52); 2741 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2742 __ rev32(v19, __ T16B, v19); 2743 __ rev32(v20, __ T16B, v20); 2744 __ BIND(L_loadkeys_44); 2745 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2746 __ rev32(v21, __ 
T16B, v21); 2747 __ rev32(v22, __ T16B, v22); 2748 __ rev32(v23, __ T16B, v23); 2749 __ rev32(v24, __ T16B, v24); 2750 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2751 __ rev32(v25, __ T16B, v25); 2752 __ rev32(v26, __ T16B, v26); 2753 __ rev32(v27, __ T16B, v27); 2754 __ rev32(v28, __ T16B, v28); 2755 __ ld1(v29, v30, v31, __ T16B, key); 2756 __ rev32(v29, __ T16B, v29); 2757 __ rev32(v30, __ T16B, v30); 2758 __ rev32(v31, __ T16B, v31); 2759 2760 __ BIND(L_aes_loop); 2761 __ ld1(v1, __ T16B, __ post(from, 16)); 2762 __ eor(v0, __ T16B, v0, v1); 2763 2764 __ br(Assembler::CC, L_rounds_44); 2765 __ br(Assembler::EQ, L_rounds_52); 2766 2767 __ aese(v0, v17); __ aesmc(v0, v0); 2768 __ aese(v0, v18); __ aesmc(v0, v0); 2769 __ BIND(L_rounds_52); 2770 __ aese(v0, v19); __ aesmc(v0, v0); 2771 __ aese(v0, v20); __ aesmc(v0, v0); 2772 __ BIND(L_rounds_44); 2773 __ aese(v0, v21); __ aesmc(v0, v0); 2774 __ aese(v0, v22); __ aesmc(v0, v0); 2775 __ aese(v0, v23); __ aesmc(v0, v0); 2776 __ aese(v0, v24); __ aesmc(v0, v0); 2777 __ aese(v0, v25); __ aesmc(v0, v0); 2778 __ aese(v0, v26); __ aesmc(v0, v0); 2779 __ aese(v0, v27); __ aesmc(v0, v0); 2780 __ aese(v0, v28); __ aesmc(v0, v0); 2781 __ aese(v0, v29); __ aesmc(v0, v0); 2782 __ aese(v0, v30); 2783 __ eor(v0, __ T16B, v0, v31); 2784 2785 __ st1(v0, __ T16B, __ post(to, 16)); 2786 2787 __ subw(len_reg, len_reg, 16); 2788 __ cbnzw(len_reg, L_aes_loop); 2789 2790 __ st1(v0, __ T16B, rvec); 2791 2792 __ mov(r0, rscratch2); 2793 2794 __ leave(); 2795 __ ret(lr); 2796 2797 return start; 2798 } 2799 2800 // Arguments: 2801 // 2802 // Inputs: 2803 // c_rarg0 - source byte array address 2804 // c_rarg1 - destination byte array address 2805 // c_rarg2 - K (key) in little endian int array 2806 // c_rarg3 - r vector byte array address 2807 // c_rarg4 - input length 2808 // 2809 // Output: 2810 // r0 - input length 2811 // 2812 address generate_cipherBlockChaining_decryptAESCrypt() { 2813 assert(UseAES, "need AES instructions and misaligned SSE support"); 2814 __ align(CodeEntryAlignment); 2815 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 2816 2817 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2818 2819 const Register from = c_rarg0; // source array address 2820 const Register to = c_rarg1; // destination array address 2821 const Register key = c_rarg2; // key array address 2822 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2823 // and left with the results of the last encryption block 2824 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2825 const Register keylen = rscratch1; 2826 2827 address start = __ pc(); 2828 2829 __ enter(); 2830 2831 __ movw(rscratch2, len_reg); 2832 2833 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2834 2835 __ ld1(v2, __ T16B, rvec); 2836 2837 __ ld1(v31, __ T16B, __ post(key, 16)); 2838 __ rev32(v31, __ T16B, v31); 2839 2840 __ cmpw(keylen, 52); 2841 __ br(Assembler::CC, L_loadkeys_44); 2842 __ br(Assembler::EQ, L_loadkeys_52); 2843 2844 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2845 __ rev32(v17, __ T16B, v17); 2846 __ rev32(v18, __ T16B, v18); 2847 __ BIND(L_loadkeys_52); 2848 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2849 __ rev32(v19, __ T16B, v19); 2850 __ rev32(v20, __ T16B, v20); 2851 __ BIND(L_loadkeys_44); 2852 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2853 __ rev32(v21, __ T16B, v21); 2854 
__ rev32(v22, __ T16B, v22); 2855 __ rev32(v23, __ T16B, v23); 2856 __ rev32(v24, __ T16B, v24); 2857 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2858 __ rev32(v25, __ T16B, v25); 2859 __ rev32(v26, __ T16B, v26); 2860 __ rev32(v27, __ T16B, v27); 2861 __ rev32(v28, __ T16B, v28); 2862 __ ld1(v29, v30, __ T16B, key); 2863 __ rev32(v29, __ T16B, v29); 2864 __ rev32(v30, __ T16B, v30); 2865 2866 __ BIND(L_aes_loop); 2867 __ ld1(v0, __ T16B, __ post(from, 16)); 2868 __ orr(v1, __ T16B, v0, v0); 2869 2870 __ br(Assembler::CC, L_rounds_44); 2871 __ br(Assembler::EQ, L_rounds_52); 2872 2873 __ aesd(v0, v17); __ aesimc(v0, v0); 2874 __ aesd(v0, v18); __ aesimc(v0, v0); 2875 __ BIND(L_rounds_52); 2876 __ aesd(v0, v19); __ aesimc(v0, v0); 2877 __ aesd(v0, v20); __ aesimc(v0, v0); 2878 __ BIND(L_rounds_44); 2879 __ aesd(v0, v21); __ aesimc(v0, v0); 2880 __ aesd(v0, v22); __ aesimc(v0, v0); 2881 __ aesd(v0, v23); __ aesimc(v0, v0); 2882 __ aesd(v0, v24); __ aesimc(v0, v0); 2883 __ aesd(v0, v25); __ aesimc(v0, v0); 2884 __ aesd(v0, v26); __ aesimc(v0, v0); 2885 __ aesd(v0, v27); __ aesimc(v0, v0); 2886 __ aesd(v0, v28); __ aesimc(v0, v0); 2887 __ aesd(v0, v29); __ aesimc(v0, v0); 2888 __ aesd(v0, v30); 2889 __ eor(v0, __ T16B, v0, v31); 2890 __ eor(v0, __ T16B, v0, v2); 2891 2892 __ st1(v0, __ T16B, __ post(to, 16)); 2893 __ orr(v2, __ T16B, v1, v1); 2894 2895 __ subw(len_reg, len_reg, 16); 2896 __ cbnzw(len_reg, L_aes_loop); 2897 2898 __ st1(v2, __ T16B, rvec); 2899 2900 __ mov(r0, rscratch2); 2901 2902 __ leave(); 2903 __ ret(lr); 2904 2905 return start; 2906 } 2907 2908 // Arguments: 2909 // 2910 // Inputs: 2911 // c_rarg0 - byte[] source+offset 2912 // c_rarg1 - int[] SHA.state 2913 // c_rarg2 - int offset 2914 // c_rarg3 - int limit 2915 // 2916 address generate_sha1_implCompress(bool multi_block, const char *name) { 2917 __ align(CodeEntryAlignment); 2918 StubCodeMark mark(this, "StubRoutines", name); 2919 address start = __ pc(); 2920 2921 Register buf = c_rarg0; 2922 Register state = c_rarg1; 2923 Register ofs = c_rarg2; 2924 Register limit = c_rarg3; 2925 2926 Label keys; 2927 Label sha1_loop; 2928 2929 // load the keys into v0..v3 2930 __ adr(rscratch1, keys); 2931 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 2932 // load 5 words state into v6, v7 2933 __ ldrq(v6, Address(state, 0)); 2934 __ ldrs(v7, Address(state, 16)); 2935 2936 2937 __ BIND(sha1_loop); 2938 // load 64 bytes of data into v16..v19 2939 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 2940 __ rev32(v16, __ T16B, v16); 2941 __ rev32(v17, __ T16B, v17); 2942 __ rev32(v18, __ T16B, v18); 2943 __ rev32(v19, __ T16B, v19); 2944 2945 // do the sha1 2946 __ addv(v4, __ T4S, v16, v0); 2947 __ orr(v20, __ T16B, v6, v6); 2948 2949 FloatRegister d0 = v16; 2950 FloatRegister d1 = v17; 2951 FloatRegister d2 = v18; 2952 FloatRegister d3 = v19; 2953 2954 for (int round = 0; round < 20; round++) { 2955 FloatRegister tmp1 = (round & 1) ? v4 : v5; 2956 FloatRegister tmp2 = (round & 1) ? v21 : v22; 2957 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 2958 FloatRegister tmp4 = (round & 1) ? v5 : v4; 2959 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? 
v2 : v3)); 2960 2961 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 2962 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 2963 __ sha1h(tmp2, __ T4S, v20); 2964 if (round < 5) 2965 __ sha1c(v20, __ T4S, tmp3, tmp4); 2966 else if (round < 10 || round >= 15) 2967 __ sha1p(v20, __ T4S, tmp3, tmp4); 2968 else 2969 __ sha1m(v20, __ T4S, tmp3, tmp4); 2970 if (round < 16) __ sha1su1(d0, __ T4S, d3); 2971 2972 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 2973 } 2974 2975 __ addv(v7, __ T2S, v7, v21); 2976 __ addv(v6, __ T4S, v6, v20); 2977 2978 if (multi_block) { 2979 __ add(ofs, ofs, 64); 2980 __ cmp(ofs, limit); 2981 __ br(Assembler::LE, sha1_loop); 2982 __ mov(c_rarg0, ofs); // return ofs 2983 } 2984 2985 __ strq(v6, Address(state, 0)); 2986 __ strs(v7, Address(state, 16)); 2987 2988 __ ret(lr); 2989 2990 __ bind(keys); 2991 __ emit_int32(0x5a827999); 2992 __ emit_int32(0x6ed9eba1); 2993 __ emit_int32(0x8f1bbcdc); 2994 __ emit_int32(0xca62c1d6); 2995 2996 return start; 2997 } 2998 2999 3000 // Arguments: 3001 // 3002 // Inputs: 3003 // c_rarg0 - byte[] source+offset 3004 // c_rarg1 - int[] SHA.state 3005 // c_rarg2 - int offset 3006 // c_rarg3 - int limit 3007 // 3008 address generate_sha256_implCompress(bool multi_block, const char *name) { 3009 static const uint32_t round_consts[64] = { 3010 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3011 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3012 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3013 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3014 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3015 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3016 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3017 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3018 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3019 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3020 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3021 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3022 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3023 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3024 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3025 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3026 }; 3027 __ align(CodeEntryAlignment); 3028 StubCodeMark mark(this, "StubRoutines", name); 3029 address start = __ pc(); 3030 3031 Register buf = c_rarg0; 3032 Register state = c_rarg1; 3033 Register ofs = c_rarg2; 3034 Register limit = c_rarg3; 3035 3036 Label sha1_loop; 3037 3038 __ stpd(v8, v9, __ pre(sp, -32)); 3039 __ stpd(v10, v11, Address(sp, 16)); 3040 3041 // dga == v0 3042 // dgb == v1 3043 // dg0 == v2 3044 // dg1 == v3 3045 // dg2 == v4 3046 // t0 == v6 3047 // t1 == v7 3048 3049 // load 16 keys to v16..v31 3050 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3051 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3052 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3053 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3054 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3055 3056 // load 8 words (256 bits) state 3057 __ ldpq(v0, v1, state); 3058 3059 __ BIND(sha1_loop); 3060 // load 64 bytes of data into v8..v11 3061 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 3062 __ rev32(v8, __ T16B, v8); 3063 __ rev32(v9, __ T16B, v9); 3064 __ rev32(v10, __ T16B, v10); 3065 __ rev32(v11, __ T16B, v11); 3066 3067 __ addv(v6, __ T4S, v8, v16); 3068 __ orr(v2, __ T16B, v0, v0); 3069 __ orr(v3, __ T16B, v1, v1); 3070 3071 FloatRegister d0 = v8; 3072 FloatRegister d1 = v9; 3073 FloatRegister d2 = v10; 3074 FloatRegister d3 = v11; 3075 3076 3077 for (int round = 0; round < 16; round++) { 3078 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3079 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3080 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3081 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3082 3083 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3084 __ orr(v4, __ T16B, v2, v2); 3085 if (round < 15) 3086 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3087 __ sha256h(v2, __ T4S, v3, tmp2); 3088 __ sha256h2(v3, __ T4S, v4, tmp2); 3089 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3090 3091 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3092 } 3093 3094 __ addv(v0, __ T4S, v0, v2); 3095 __ addv(v1, __ T4S, v1, v3); 3096 3097 if (multi_block) { 3098 __ add(ofs, ofs, 64); 3099 __ cmp(ofs, limit); 3100 __ br(Assembler::LE, sha1_loop); 3101 __ mov(c_rarg0, ofs); // return ofs 3102 } 3103 3104 __ ldpd(v10, v11, Address(sp, 16)); 3105 __ ldpd(v8, v9, __ post(sp, 32)); 3106 3107 __ stpq(v0, v1, state); 3108 3109 __ ret(lr); 3110 3111 return start; 3112 } 3113 3114 #ifndef BUILTIN_SIM 3115 // Safefetch stubs. 3116 void generate_safefetch(const char* name, int size, address* entry, 3117 address* fault_pc, address* continuation_pc) { 3118 // safefetch signatures: 3119 // int SafeFetch32(int* adr, int errValue); 3120 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue); 3121 // 3122 // arguments: 3123 // c_rarg0 = adr 3124 // c_rarg1 = errValue 3125 // 3126 // result: 3127 // PPC_RET = *adr or errValue 3128 3129 StubCodeMark mark(this, "StubRoutines", name); 3130 3131 // Entry point, pc or function descriptor. 3132 *entry = __ pc(); 3133 3134 // Load *adr into c_rarg1, may fault. 
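    // The address recorded in *fault_pc lets the signal handler recognise
    // a fault raised by this load and resume execution at *continuation_pc,
    // where the errValue still held in c_rarg1 is returned instead.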
3135 *fault_pc = __ pc(); 3136 switch (size) { 3137 case 4: 3138 // int32_t 3139 __ ldrw(c_rarg1, Address(c_rarg0, 0)); 3140 break; 3141 case 8: 3142 // int64_t 3143 __ ldr(c_rarg1, Address(c_rarg0, 0)); 3144 break; 3145 default: 3146 ShouldNotReachHere(); 3147 } 3148 3149 // return errValue or *adr 3150 *continuation_pc = __ pc(); 3151 __ mov(r0, c_rarg1); 3152 __ ret(lr); 3153 } 3154 #endif 3155 3156 /** 3157 * Arguments: 3158 * 3159 * Inputs: 3160 * c_rarg0 - int crc 3161 * c_rarg1 - byte* buf 3162 * c_rarg2 - int length 3163 * 3164 * Ouput: 3165 * rax - int crc result 3166 */ 3167 address generate_updateBytesCRC32() { 3168 assert(UseCRC32Intrinsics, "what are we doing here?"); 3169 3170 __ align(CodeEntryAlignment); 3171 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32"); 3172 3173 address start = __ pc(); 3174 3175 const Register crc = c_rarg0; // crc 3176 const Register buf = c_rarg1; // source java byte array address 3177 const Register len = c_rarg2; // length 3178 const Register table0 = c_rarg3; // crc_table address 3179 const Register table1 = c_rarg4; 3180 const Register table2 = c_rarg5; 3181 const Register table3 = c_rarg6; 3182 const Register tmp3 = c_rarg7; 3183 3184 BLOCK_COMMENT("Entry:"); 3185 __ enter(); // required for proper stackwalking of RuntimeStub frame 3186 3187 __ kernel_crc32(crc, buf, len, 3188 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 3189 3190 __ leave(); // required for proper stackwalking of RuntimeStub frame 3191 __ ret(lr); 3192 3193 return start; 3194 } 3195 3196 /** 3197 * Arguments: 3198 * 3199 * Inputs: 3200 * c_rarg0 - int crc 3201 * c_rarg1 - byte* buf 3202 * c_rarg2 - int length 3203 * c_rarg3 - int* table 3204 * 3205 * Ouput: 3206 * r0 - int crc result 3207 */ 3208 address generate_updateBytesCRC32C() { 3209 assert(UseCRC32CIntrinsics, "what are we doing here?"); 3210 3211 __ align(CodeEntryAlignment); 3212 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C"); 3213 3214 address start = __ pc(); 3215 3216 const Register crc = c_rarg0; // crc 3217 const Register buf = c_rarg1; // source java byte array address 3218 const Register len = c_rarg2; // length 3219 const Register table0 = c_rarg3; // crc_table address 3220 const Register table1 = c_rarg4; 3221 const Register table2 = c_rarg5; 3222 const Register table3 = c_rarg6; 3223 const Register tmp3 = c_rarg7; 3224 3225 BLOCK_COMMENT("Entry:"); 3226 __ enter(); // required for proper stackwalking of RuntimeStub frame 3227 3228 __ kernel_crc32c(crc, buf, len, 3229 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 3230 3231 __ leave(); // required for proper stackwalking of RuntimeStub frame 3232 __ ret(lr); 3233 3234 return start; 3235 } 3236 3237 /*** 3238 * Arguments: 3239 * 3240 * Inputs: 3241 * c_rarg0 - int adler 3242 * c_rarg1 - byte* buff 3243 * c_rarg2 - int len 3244 * 3245 * Output: 3246 * c_rarg0 - int adler result 3247 */ 3248 address generate_updateBytesAdler32() { 3249 __ align(CodeEntryAlignment); 3250 StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32"); 3251 address start = __ pc(); 3252 3253 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 3254 3255 // Aliases 3256 Register adler = c_rarg0; 3257 Register s1 = c_rarg0; 3258 Register s2 = c_rarg3; 3259 Register buff = c_rarg1; 3260 Register len = c_rarg2; 3261 Register nmax = r4; 3262 Register base = r5; 3263 Register count = r6; 3264 Register temp0 = rscratch1; 3265 Register temp1 = rscratch2; 3266 FloatRegister vbytes = 
v0; 3267 FloatRegister vs1acc = v1; 3268 FloatRegister vs2acc = v2; 3269 FloatRegister vtable = v3; 3270 3271 // Max number of bytes we can process before having to take the mod 3272 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 3273 unsigned long BASE = 0xfff1; 3274 unsigned long NMAX = 0x15B0; 3275 3276 __ mov(base, BASE); 3277 __ mov(nmax, NMAX); 3278 3279 // Load accumulation coefficients for the upper 16 bits 3280 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table)); 3281 __ ld1(vtable, __ T16B, Address(temp0)); 3282 3283 // s1 is initialized to the lower 16 bits of adler 3284 // s2 is initialized to the upper 16 bits of adler 3285 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 3286 __ uxth(s1, adler); // s1 = (adler & 0xffff) 3287 3288 // The pipelined loop needs at least 16 elements for 1 iteration 3289 // It does check this, but it is more effective to skip to the cleanup loop 3290 __ cmp(len, (u1)16); 3291 __ br(Assembler::HS, L_nmax); 3292 __ cbz(len, L_combine); 3293 3294 __ bind(L_simple_by1_loop); 3295 __ ldrb(temp0, Address(__ post(buff, 1))); 3296 __ add(s1, s1, temp0); 3297 __ add(s2, s2, s1); 3298 __ subs(len, len, 1); 3299 __ br(Assembler::HI, L_simple_by1_loop); 3300 3301 // s1 = s1 % BASE 3302 __ subs(temp0, s1, base); 3303 __ csel(s1, temp0, s1, Assembler::HS); 3304 3305 // s2 = s2 % BASE 3306 __ lsr(temp0, s2, 16); 3307 __ lsl(temp1, temp0, 4); 3308 __ sub(temp1, temp1, temp0); 3309 __ add(s2, temp1, s2, ext::uxth); 3310 3311 __ subs(temp0, s2, base); 3312 __ csel(s2, temp0, s2, Assembler::HS); 3313 3314 __ b(L_combine); 3315 3316 __ bind(L_nmax); 3317 __ subs(len, len, nmax); 3318 __ sub(count, nmax, 16); 3319 __ br(Assembler::LO, L_by16); 3320 3321 __ bind(L_nmax_loop); 3322 3323 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 3324 vbytes, vs1acc, vs2acc, vtable); 3325 3326 __ subs(count, count, 16); 3327 __ br(Assembler::HS, L_nmax_loop); 3328 3329 // s1 = s1 % BASE 3330 __ lsr(temp0, s1, 16); 3331 __ lsl(temp1, temp0, 4); 3332 __ sub(temp1, temp1, temp0); 3333 __ add(temp1, temp1, s1, ext::uxth); 3334 3335 __ lsr(temp0, temp1, 16); 3336 __ lsl(s1, temp0, 4); 3337 __ sub(s1, s1, temp0); 3338 __ add(s1, s1, temp1, ext:: uxth); 3339 3340 __ subs(temp0, s1, base); 3341 __ csel(s1, temp0, s1, Assembler::HS); 3342 3343 // s2 = s2 % BASE 3344 __ lsr(temp0, s2, 16); 3345 __ lsl(temp1, temp0, 4); 3346 __ sub(temp1, temp1, temp0); 3347 __ add(temp1, temp1, s2, ext::uxth); 3348 3349 __ lsr(temp0, temp1, 16); 3350 __ lsl(s2, temp0, 4); 3351 __ sub(s2, s2, temp0); 3352 __ add(s2, s2, temp1, ext:: uxth); 3353 3354 __ subs(temp0, s2, base); 3355 __ csel(s2, temp0, s2, Assembler::HS); 3356 3357 __ subs(len, len, nmax); 3358 __ sub(count, nmax, 16); 3359 __ br(Assembler::HS, L_nmax_loop); 3360 3361 __ bind(L_by16); 3362 __ adds(len, len, count); 3363 __ br(Assembler::LO, L_by1); 3364 3365 __ bind(L_by16_loop); 3366 3367 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 3368 vbytes, vs1acc, vs2acc, vtable); 3369 3370 __ subs(len, len, 16); 3371 __ br(Assembler::HS, L_by16_loop); 3372 3373 __ bind(L_by1); 3374 __ adds(len, len, 15); 3375 __ br(Assembler::LO, L_do_mod); 3376 3377 __ bind(L_by1_loop); 3378 __ ldrb(temp0, Address(__ post(buff, 1))); 3379 __ add(s1, temp0, s1); 3380 __ add(s2, s2, s1); 3381 __ subs(len, len, 1); 3382 __ br(Assembler::HS, L_by1_loop); 3383 3384 __ bind(L_do_mod); 3385 // s1 = s1 % BASE 3386 __ lsr(temp0, s1, 16); 3387 __ lsl(temp1, temp0, 4); 
3388 __ sub(temp1, temp1, temp0); 3389 __ add(temp1, temp1, s1, ext::uxth); 3390 3391 __ lsr(temp0, temp1, 16); 3392 __ lsl(s1, temp0, 4); 3393 __ sub(s1, s1, temp0); 3394 __ add(s1, s1, temp1, ext:: uxth); 3395 3396 __ subs(temp0, s1, base); 3397 __ csel(s1, temp0, s1, Assembler::HS); 3398 3399 // s2 = s2 % BASE 3400 __ lsr(temp0, s2, 16); 3401 __ lsl(temp1, temp0, 4); 3402 __ sub(temp1, temp1, temp0); 3403 __ add(temp1, temp1, s2, ext::uxth); 3404 3405 __ lsr(temp0, temp1, 16); 3406 __ lsl(s2, temp0, 4); 3407 __ sub(s2, s2, temp0); 3408 __ add(s2, s2, temp1, ext:: uxth); 3409 3410 __ subs(temp0, s2, base); 3411 __ csel(s2, temp0, s2, Assembler::HS); 3412 3413 // Combine lower bits and higher bits 3414 __ bind(L_combine); 3415 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) 3416 3417 __ ret(lr); 3418 3419 return start; 3420 } 3421 3422 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff, 3423 Register temp0, Register temp1, FloatRegister vbytes, 3424 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) { 3425 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes. 3426 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration. 3427 // In non-vectorized code, we update s1 and s2 as: 3428 // s1 <- s1 + b1 3429 // s2 <- s2 + s1 3430 // s1 <- s1 + b2 3431 // s2 <- s2 + b1 3432 // ... 3433 // s1 <- s1 + b16 3434 // s2 <- s2 + s1 3435 // Putting above assignments together, we have: 3436 // s1_new = s1 + b1 + b2 + ... + b16 3437 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16) 3438 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1) 3439 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1) 3440 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16))); 3441 3442 // s2 = s2 + s1 * 16 3443 __ add(s2, s2, s1, Assembler::LSL, 4); 3444 3445 // vs1acc = b1 + b2 + b3 + ... + b16 3446 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... 
+ (b16 * 1) 3447 __ umullv(vs2acc, __ T8B, vtable, vbytes); 3448 __ umlalv(vs2acc, __ T16B, vtable, vbytes); 3449 __ uaddlv(vs1acc, __ T16B, vbytes); 3450 __ uaddlv(vs2acc, __ T8H, vs2acc); 3451 3452 // s1 = s1 + vs1acc, s2 = s2 + vs2acc 3453 __ fmovd(temp0, vs1acc); 3454 __ fmovd(temp1, vs2acc); 3455 __ add(s1, s1, temp0); 3456 __ add(s2, s2, temp1); 3457 } 3458 3459 /** 3460 * Arguments: 3461 * 3462 * Input: 3463 * c_rarg0 - x address 3464 * c_rarg1 - x length 3465 * c_rarg2 - y address 3466 * c_rarg3 - y lenth 3467 * c_rarg4 - z address 3468 * c_rarg5 - z length 3469 */ 3470 address generate_multiplyToLen() { 3471 __ align(CodeEntryAlignment); 3472 StubCodeMark mark(this, "StubRoutines", "multiplyToLen"); 3473 3474 address start = __ pc(); 3475 const Register x = r0; 3476 const Register xlen = r1; 3477 const Register y = r2; 3478 const Register ylen = r3; 3479 const Register z = r4; 3480 const Register zlen = r5; 3481 3482 const Register tmp1 = r10; 3483 const Register tmp2 = r11; 3484 const Register tmp3 = r12; 3485 const Register tmp4 = r13; 3486 const Register tmp5 = r14; 3487 const Register tmp6 = r15; 3488 const Register tmp7 = r16; 3489 3490 BLOCK_COMMENT("Entry:"); 3491 __ enter(); // required for proper stackwalking of RuntimeStub frame 3492 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 3493 __ leave(); // required for proper stackwalking of RuntimeStub frame 3494 __ ret(lr); 3495 3496 return start; 3497 } 3498 3499 address generate_squareToLen() { 3500 // squareToLen algorithm for sizes 1..127 described in java code works 3501 // faster than multiply_to_len on some CPUs and slower on others, but 3502 // multiply_to_len shows a bit better overall results 3503 __ align(CodeEntryAlignment); 3504 StubCodeMark mark(this, "StubRoutines", "squareToLen"); 3505 address start = __ pc(); 3506 3507 const Register x = r0; 3508 const Register xlen = r1; 3509 const Register z = r2; 3510 const Register zlen = r3; 3511 const Register y = r4; // == x 3512 const Register ylen = r5; // == xlen 3513 3514 const Register tmp1 = r10; 3515 const Register tmp2 = r11; 3516 const Register tmp3 = r12; 3517 const Register tmp4 = r13; 3518 const Register tmp5 = r14; 3519 const Register tmp6 = r15; 3520 const Register tmp7 = r16; 3521 3522 RegSet spilled_regs = RegSet::of(y, ylen); 3523 BLOCK_COMMENT("Entry:"); 3524 __ enter(); 3525 __ push(spilled_regs, sp); 3526 __ mov(y, x); 3527 __ mov(ylen, xlen); 3528 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 3529 __ pop(spilled_regs, sp); 3530 __ leave(); 3531 __ ret(lr); 3532 return start; 3533 } 3534 3535 address generate_mulAdd() { 3536 __ align(CodeEntryAlignment); 3537 StubCodeMark mark(this, "StubRoutines", "mulAdd"); 3538 3539 address start = __ pc(); 3540 3541 const Register out = r0; 3542 const Register in = r1; 3543 const Register offset = r2; 3544 const Register len = r3; 3545 const Register k = r4; 3546 3547 BLOCK_COMMENT("Entry:"); 3548 __ enter(); 3549 __ mul_add(out, in, offset, len, k); 3550 __ leave(); 3551 __ ret(lr); 3552 3553 return start; 3554 } 3555 3556 void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi, 3557 FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0, 3558 FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) { 3559 // Karatsuba multiplication performs a 128*128 -> 256-bit 3560 // multiplication in three 128-bit multiplications and a few 3561 // additions. 
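    // (A schoolbook 128x128 product would need four 64x64->128 pmull
    //  operations; Karatsuba gets away with three at the cost of the
    //  extra eor instructions below -- in this carry-less arithmetic the
    //  "+" in the formulas below is XOR.)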
3562 // 3563 // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1) 3564 // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0 3565 // 3566 // Inputs: 3567 // 3568 // A0 in a.d[0] (subkey) 3569 // A1 in a.d[1] 3570 // (A1+A0) in a1_xor_a0.d[0] 3571 // 3572 // B0 in b.d[0] (state) 3573 // B1 in b.d[1] 3574 3575 __ ext(tmp1, __ T16B, b, b, 0x08); 3576 __ pmull2(result_hi, __ T1Q, b, a, __ T2D); // A1*B1 3577 __ eor(tmp1, __ T16B, tmp1, b); // (B1+B0) 3578 __ pmull(result_lo, __ T1Q, b, a, __ T1D); // A0*B0 3579 __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0) 3580 3581 __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08); 3582 __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0 3583 __ eor(tmp2, __ T16B, tmp2, tmp4); 3584 __ eor(tmp2, __ T16B, tmp2, tmp3); 3585 3586 // Register pair <result_hi:result_lo> holds the result of carry-less multiplication 3587 __ ins(result_hi, __ D, tmp2, 0, 1); 3588 __ ins(result_lo, __ D, tmp2, 1, 0); 3589 } 3590 3591 void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi, 3592 FloatRegister p, FloatRegister z, FloatRegister t1) { 3593 const FloatRegister t0 = result; 3594 3595 // The GCM field polynomial f is z^128 + p(z), where p = 3596 // z^7+z^2+z+1. 3597 // 3598 // z^128 === -p(z) (mod (z^128 + p(z))) 3599 // 3600 // so, given that the product we're reducing is 3601 // a == lo + hi * z^128 3602 // substituting, 3603 // === lo - hi * p(z) (mod (z^128 + p(z))) 3604 // 3605 // we reduce by multiplying hi by p(z) and subtracting the result 3606 // from (i.e. XORing it with) lo. Because p has no nonzero high 3607 // bits we can do this with two 64-bit multiplications, lo*p and 3608 // hi*p. 3609 3610 __ pmull2(t0, __ T1Q, hi, p, __ T2D); 3611 __ ext(t1, __ T16B, t0, z, 8); 3612 __ eor(hi, __ T16B, hi, t1); 3613 __ ext(t1, __ T16B, z, t0, 8); 3614 __ eor(lo, __ T16B, lo, t1); 3615 __ pmull(t0, __ T1Q, hi, p, __ T1D); 3616 __ eor(result, __ T16B, lo, t0); 3617 } 3618 3619 address generate_has_negatives(address &has_negatives_long) { 3620 const u1 large_loop_size = 64; 3621 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 3622 int dcache_line = VM_Version::dcache_line_size(); 3623 3624 Register ary1 = r1, len = r2, result = r0; 3625 3626 __ align(CodeEntryAlignment); 3627 3628 StubCodeMark mark(this, "StubRoutines", "has_negatives"); 3629 3630 address entry = __ pc(); 3631 3632 __ enter(); 3633 3634 Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE, 3635 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 3636 3637 __ cmp(len, (u1)15); 3638 __ br(Assembler::GT, LEN_OVER_15); 3639 // The only case when execution falls into this code is when pointer is near 3640 // the end of memory page and we have to avoid reading next page 3641 __ add(ary1, ary1, len); 3642 __ subs(len, len, 8); 3643 __ br(Assembler::GT, LEN_OVER_8); 3644 __ ldr(rscratch2, Address(ary1, -8)); 3645 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 
3646 __ lsrv(rscratch2, rscratch2, rscratch1); 3647 __ tst(rscratch2, UPPER_BIT_MASK); 3648 __ cset(result, Assembler::NE); 3649 __ leave(); 3650 __ ret(lr); 3651 __ bind(LEN_OVER_8); 3652 __ ldp(rscratch1, rscratch2, Address(ary1, -16)); 3653 __ sub(len, len, 8); // no data dep., then sub can be executed while loading 3654 __ tst(rscratch2, UPPER_BIT_MASK); 3655 __ br(Assembler::NE, RET_TRUE_NO_POP); 3656 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes 3657 __ lsrv(rscratch1, rscratch1, rscratch2); 3658 __ tst(rscratch1, UPPER_BIT_MASK); 3659 __ cset(result, Assembler::NE); 3660 __ leave(); 3661 __ ret(lr); 3662 3663 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10; 3664 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6; 3665 3666 has_negatives_long = __ pc(); // 2nd entry point 3667 3668 __ enter(); 3669 3670 __ bind(LEN_OVER_15); 3671 __ push(spilled_regs, sp); 3672 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment 3673 __ cbz(rscratch2, ALIGNED); 3674 __ ldp(tmp6, tmp1, Address(ary1)); 3675 __ mov(tmp5, 16); 3676 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address 3677 __ add(ary1, ary1, rscratch1); 3678 __ sub(len, len, rscratch1); 3679 __ orr(tmp6, tmp6, tmp1); 3680 __ tst(tmp6, UPPER_BIT_MASK); 3681 __ br(Assembler::NE, RET_TRUE); 3682 3683 __ bind(ALIGNED); 3684 __ cmp(len, large_loop_size); 3685 __ br(Assembler::LT, CHECK_16); 3686 // Perform 16-byte load as early return in pre-loop to handle situation 3687 // when initially aligned large array has negative values at starting bytes, 3688 // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is 3689 // slower. Cases with negative bytes further ahead won't be affected that 3690 // much. In fact, it'll be faster due to early loads, less instructions and 3691 // less branches in LARGE_LOOP. 3692 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16))); 3693 __ sub(len, len, 16); 3694 __ orr(tmp6, tmp6, tmp1); 3695 __ tst(tmp6, UPPER_BIT_MASK); 3696 __ br(Assembler::NE, RET_TRUE); 3697 __ cmp(len, large_loop_size); 3698 __ br(Assembler::LT, CHECK_16); 3699 3700 if (SoftwarePrefetchHintDistance >= 0 3701 && SoftwarePrefetchHintDistance >= dcache_line) { 3702 // initial prefetch 3703 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line)); 3704 } 3705 __ bind(LARGE_LOOP); 3706 if (SoftwarePrefetchHintDistance >= 0) { 3707 __ prfm(Address(ary1, SoftwarePrefetchHintDistance)); 3708 } 3709 // Issue load instructions first, since it can save few CPU/MEM cycles, also 3710 // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp) 3711 // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3 3712 // instructions per cycle and have less branches, but this approach disables 3713 // early return, thus, all 64 bytes are loaded and checked every time. 
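// For orientation, a rough C model of what one LARGE_LOOP iteration below
// checks (illustrative sketch only, not the emitted code; the helper name is
// hypothetical, and it assumes <stdint.h> types plus an 8-byte aligned
// pointer with at least 64 bytes remaining):
//
//   static bool block_has_negatives(const uint64_t* p) {
//     uint64_t acc = 0;
//     for (int i = 0; i < 8; i++) {   // 8 x 8 bytes = 64 bytes per iteration
//       acc |= p[i];                  // fold all words together
//     }
//     return (acc & UINT64_C(0x8080808080808080)) != 0;  // any byte sign bit set?
//   }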
3714 __ ldp(tmp2, tmp3, Address(ary1)); 3715 __ ldp(tmp4, tmp5, Address(ary1, 16)); 3716 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 3717 __ ldp(tmp6, tmp1, Address(ary1, 48)); 3718 __ add(ary1, ary1, large_loop_size); 3719 __ sub(len, len, large_loop_size); 3720 __ orr(tmp2, tmp2, tmp3); 3721 __ orr(tmp4, tmp4, tmp5); 3722 __ orr(rscratch1, rscratch1, rscratch2); 3723 __ orr(tmp6, tmp6, tmp1); 3724 __ orr(tmp2, tmp2, tmp4); 3725 __ orr(rscratch1, rscratch1, tmp6); 3726 __ orr(tmp2, tmp2, rscratch1); 3727 __ tst(tmp2, UPPER_BIT_MASK); 3728 __ br(Assembler::NE, RET_TRUE); 3729 __ cmp(len, large_loop_size); 3730 __ br(Assembler::GE, LARGE_LOOP); 3731 3732 __ bind(CHECK_16); // small 16-byte load pre-loop 3733 __ cmp(len, (u1)16); 3734 __ br(Assembler::LT, POST_LOOP16); 3735 3736 __ bind(LOOP16); // small 16-byte load loop 3737 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 3738 __ sub(len, len, 16); 3739 __ orr(tmp2, tmp2, tmp3); 3740 __ tst(tmp2, UPPER_BIT_MASK); 3741 __ br(Assembler::NE, RET_TRUE); 3742 __ cmp(len, (u1)16); 3743 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 3744 3745 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 3746 __ cmp(len, (u1)8); 3747 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 3748 __ ldr(tmp3, Address(__ post(ary1, 8))); 3749 __ sub(len, len, 8); 3750 __ tst(tmp3, UPPER_BIT_MASK); 3751 __ br(Assembler::NE, RET_TRUE); 3752 3753 __ bind(POST_LOOP16_LOAD_TAIL); 3754 __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0 3755 __ ldr(tmp1, Address(ary1)); 3756 __ mov(tmp2, 64); 3757 __ sub(tmp4, tmp2, len, __ LSL, 3); 3758 __ lslv(tmp1, tmp1, tmp4); 3759 __ tst(tmp1, UPPER_BIT_MASK); 3760 __ br(Assembler::NE, RET_TRUE); 3761 // Fallthrough 3762 3763 __ bind(RET_FALSE); 3764 __ pop(spilled_regs, sp); 3765 __ leave(); 3766 __ mov(result, zr); 3767 __ ret(lr); 3768 3769 __ bind(RET_TRUE); 3770 __ pop(spilled_regs, sp); 3771 __ bind(RET_TRUE_NO_POP); 3772 __ leave(); 3773 __ mov(result, 1); 3774 __ ret(lr); 3775 3776 __ bind(DONE); 3777 __ pop(spilled_regs, sp); 3778 __ leave(); 3779 __ ret(lr); 3780 return entry; 3781 } 3782 3783 void generate_large_array_equals_loop_nonsimd(int loopThreshold, 3784 bool usePrefetch, Label &NOT_EQUAL) { 3785 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 3786 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 3787 tmp7 = r12, tmp8 = r13; 3788 Label LOOP; 3789 3790 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3791 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3792 __ bind(LOOP); 3793 if (usePrefetch) { 3794 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 3795 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 3796 } 3797 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 3798 __ eor(tmp1, tmp1, tmp2); 3799 __ eor(tmp3, tmp3, tmp4); 3800 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 3801 __ orr(tmp1, tmp1, tmp3); 3802 __ cbnz(tmp1, NOT_EQUAL); 3803 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3804 __ eor(tmp5, tmp5, tmp6); 3805 __ eor(tmp7, tmp7, tmp8); 3806 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3807 __ orr(tmp5, tmp5, tmp7); 3808 __ cbnz(tmp5, NOT_EQUAL); 3809 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 3810 __ eor(tmp1, tmp1, tmp2); 3811 __ eor(tmp3, tmp3, tmp4); 3812 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 3813 __ orr(tmp1, tmp1, tmp3); 3814 __ cbnz(tmp1, NOT_EQUAL); 3815 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3816 __ eor(tmp5, tmp5, tmp6); 
3817 __ sub(cnt1, cnt1, 8 * wordSize); 3818 __ eor(tmp7, tmp7, tmp8); 3819 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3820 // tmp6 is not used. MacroAssembler::subs is used here (rather than 3821 // cmp) because subs allows an unlimited range of immediate operands. 3822 __ subs(tmp6, cnt1, loopThreshold); 3823 __ orr(tmp5, tmp5, tmp7); 3824 __ cbnz(tmp5, NOT_EQUAL); 3825 __ br(__ GE, LOOP); 3826 // post-loop 3827 __ eor(tmp1, tmp1, tmp2); 3828 __ eor(tmp3, tmp3, tmp4); 3829 __ orr(tmp1, tmp1, tmp3); 3830 __ sub(cnt1, cnt1, 2 * wordSize); 3831 __ cbnz(tmp1, NOT_EQUAL); 3832 } 3833 3834 void generate_large_array_equals_loop_simd(int loopThreshold, 3835 bool usePrefetch, Label &NOT_EQUAL) { 3836 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 3837 tmp2 = rscratch2; 3838 Label LOOP; 3839 3840 __ bind(LOOP); 3841 if (usePrefetch) { 3842 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 3843 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 3844 } 3845 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize))); 3846 __ sub(cnt1, cnt1, 8 * wordSize); 3847 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize))); 3848 __ subs(tmp1, cnt1, loopThreshold); 3849 __ eor(v0, __ T16B, v0, v4); 3850 __ eor(v1, __ T16B, v1, v5); 3851 __ eor(v2, __ T16B, v2, v6); 3852 __ eor(v3, __ T16B, v3, v7); 3853 __ orr(v0, __ T16B, v0, v1); 3854 __ orr(v1, __ T16B, v2, v3); 3855 __ orr(v0, __ T16B, v0, v1); 3856 __ umov(tmp1, v0, __ D, 0); 3857 __ umov(tmp2, v0, __ D, 1); 3858 __ orr(tmp1, tmp1, tmp2); 3859 __ cbnz(tmp1, NOT_EQUAL); 3860 __ br(__ GE, LOOP); 3861 } 3862 3863 // a1 = r1 - array1 address 3864 // a2 = r2 - array2 address 3865 // result = r0 - return value. Already contains "false" 3866 // cnt1 = r10 - number of elements left to check, reduced by wordSize 3867 // r3-r5 are reserved temporary registers 3868 address generate_large_array_equals() { 3869 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 3870 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 3871 tmp7 = r12, tmp8 = r13; 3872 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP, 3873 SMALL_LOOP, POST_LOOP; 3874 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16; 3875 // calculate if at least 32 prefetched bytes are used 3876 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32; 3877 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE); 3878 RegSet spilled_regs = RegSet::range(tmp6, tmp8); 3879 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4, 3880 tmp5, tmp6, tmp7, tmp8); 3881 3882 __ align(CodeEntryAlignment); 3883 3884 StubCodeMark mark(this, "StubRoutines", "large_array_equals"); 3885 3886 address entry = __ pc(); 3887 __ enter(); 3888 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub 3889 // also advance pointers to use post-increment instead of pre-increment 3890 __ add(a1, a1, wordSize); 3891 __ add(a2, a2, wordSize); 3892 if (AvoidUnalignedAccesses) { 3893 // both implementations (SIMD/non-SIMD) use relatively large load 3894 // instructions (ld1/ldp), which carry a huge penalty (up to 2x execution 3895 // time) on some CPUs when the address is not at least 16-byte aligned. 3896 // Arrays are currently 8-byte aligned, so we can do an additional 8-byte 3897 // load if needed for the 1st address to make it 16-byte aligned.
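// In C-like terms the peeling step below does roughly the following
// (illustrative sketch only; a1/a2 are the byte pointers and cnt1 the
// remaining byte count, as in the stub):
//
//   if (((uintptr_t)a1 & 8) != 0) {          // 8- but not 16-byte aligned
//     uint64_t w1 = *(const uint64_t*)a1;    // compare one extra word so that
//     uint64_t w2 = *(const uint64_t*)a2;    // the main loop sees a1 16-byte
//     a1 += 8; a2 += 8; cnt1 -= 8;           // aligned
//     if (w1 != w2) return false;
//   }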
3898 Label ALIGNED16; 3899 __ tbz(a1, 3, ALIGNED16); 3900 __ ldr(tmp1, Address(__ post(a1, wordSize))); 3901 __ ldr(tmp2, Address(__ post(a2, wordSize))); 3902 __ sub(cnt1, cnt1, wordSize); 3903 __ eor(tmp1, tmp1, tmp2); 3904 __ cbnz(tmp1, NOT_EQUAL_NO_POP); 3905 __ bind(ALIGNED16); 3906 } 3907 if (UseSIMDForArrayEquals) { 3908 if (SoftwarePrefetchHintDistance >= 0) { 3909 __ subs(tmp1, cnt1, prefetchLoopThreshold); 3910 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 3911 generate_large_array_equals_loop_simd(prefetchLoopThreshold, 3912 /* prfm = */ true, NOT_EQUAL); 3913 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 3914 __ br(__ LT, TAIL); 3915 } 3916 __ bind(NO_PREFETCH_LARGE_LOOP); 3917 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold, 3918 /* prfm = */ false, NOT_EQUAL); 3919 } else { 3920 __ push(spilled_regs, sp); 3921 if (SoftwarePrefetchHintDistance >= 0) { 3922 __ subs(tmp1, cnt1, prefetchLoopThreshold); 3923 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 3924 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold, 3925 /* prfm = */ true, NOT_EQUAL); 3926 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 3927 __ br(__ LT, TAIL); 3928 } 3929 __ bind(NO_PREFETCH_LARGE_LOOP); 3930 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold, 3931 /* prfm = */ false, NOT_EQUAL); 3932 } 3933 __ bind(TAIL); 3934 __ cbz(cnt1, EQUAL); 3935 __ subs(cnt1, cnt1, wordSize); 3936 __ br(__ LE, POST_LOOP); 3937 __ bind(SMALL_LOOP); 3938 __ ldr(tmp1, Address(__ post(a1, wordSize))); 3939 __ ldr(tmp2, Address(__ post(a2, wordSize))); 3940 __ subs(cnt1, cnt1, wordSize); 3941 __ eor(tmp1, tmp1, tmp2); 3942 __ cbnz(tmp1, NOT_EQUAL); 3943 __ br(__ GT, SMALL_LOOP); 3944 __ bind(POST_LOOP); 3945 __ ldr(tmp1, Address(a1, cnt1)); 3946 __ ldr(tmp2, Address(a2, cnt1)); 3947 __ eor(tmp1, tmp1, tmp2); 3948 __ cbnz(tmp1, NOT_EQUAL); 3949 __ bind(EQUAL); 3950 __ mov(result, true); 3951 __ bind(NOT_EQUAL); 3952 if (!UseSIMDForArrayEquals) { 3953 __ pop(spilled_regs, sp); 3954 } 3955 __ bind(NOT_EQUAL_NO_POP); 3956 __ leave(); 3957 __ ret(lr); 3958 return entry; 3959 } 3960 3961 address generate_dsin_dcos(bool isCos) { 3962 __ align(CodeEntryAlignment); 3963 StubCodeMark mark(this, "StubRoutines", isCos ? 
"libmDcos" : "libmDsin"); 3964 address start = __ pc(); 3965 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw, 3966 (address)StubRoutines::aarch64::_two_over_pi, 3967 (address)StubRoutines::aarch64::_pio2, 3968 (address)StubRoutines::aarch64::_dsin_coef, 3969 (address)StubRoutines::aarch64::_dcos_coef); 3970 return start; 3971 } 3972 3973 address generate_dlog() { 3974 __ align(CodeEntryAlignment); 3975 StubCodeMark mark(this, "StubRoutines", "dlog"); 3976 address entry = __ pc(); 3977 FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4, 3978 vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19; 3979 Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4; 3980 __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3, 3981 tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5); 3982 return entry; 3983 } 3984 3985 // code for comparing 16 bytes of strings with same encoding 3986 void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) { 3987 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11; 3988 __ ldr(rscratch1, Address(__ post(str1, 8))); 3989 __ eor(rscratch2, tmp1, tmp2); 3990 __ ldr(cnt1, Address(__ post(str2, 8))); 3991 __ cbnz(rscratch2, DIFF1); 3992 __ ldr(tmp1, Address(__ post(str1, 8))); 3993 __ eor(rscratch2, rscratch1, cnt1); 3994 __ ldr(tmp2, Address(__ post(str2, 8))); 3995 __ cbnz(rscratch2, DIFF2); 3996 } 3997 3998 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding 3999 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 4000 Label &DIFF2) { 4001 Register cnt1 = r2, tmp1 = r10, tmp2 = r11, tmp3 = r12; 4002 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; 4003 4004 __ ldrq(vtmp, Address(__ post(tmp2, 16))); 4005 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4006 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 4007 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 4008 4009 __ fmovd(tmpL, vtmp3); 4010 __ eor(rscratch2, tmp3, tmpL); 4011 __ cbnz(rscratch2, DIFF2); 4012 4013 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4014 __ umov(tmpL, vtmp3, __ D, 1); 4015 __ eor(rscratch2, tmpU, tmpL); 4016 __ cbnz(rscratch2, DIFF1); 4017 4018 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 4019 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4020 __ fmovd(tmpL, vtmp); 4021 __ eor(rscratch2, tmp3, tmpL); 4022 __ cbnz(rscratch2, DIFF2); 4023 4024 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4025 __ umov(tmpL, vtmp, __ D, 1); 4026 __ eor(rscratch2, tmpU, tmpL); 4027 __ cbnz(rscratch2, DIFF1); 4028 } 4029 4030 // r0 = result 4031 // r1 = str1 4032 // r2 = cnt1 4033 // r3 = str2 4034 // r4 = cnt2 4035 // r10 = tmp1 4036 // r11 = tmp2 4037 address generate_compare_long_string_different_encoding(bool isLU) { 4038 __ align(CodeEntryAlignment); 4039 StubCodeMark mark(this, "StubRoutines", isLU 4040 ? 
"compare_long_string_different_encoding LU" 4041 : "compare_long_string_different_encoding UL"); 4042 address entry = __ pc(); 4043 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, 4044 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, SMALL_LOOP_ENTER, 4045 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; 4046 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 4047 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; 4048 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; 4049 RegSet spilled_regs = RegSet::of(tmp3, tmp4); 4050 4051 int prefetchLoopExitCondition = MAX(32, SoftwarePrefetchHintDistance/2); 4052 4053 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); 4054 // cnt2 == amount of characters left to compare 4055 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL)) 4056 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 4057 __ add(str1, str1, isLU ? wordSize/2 : wordSize); 4058 __ add(str2, str2, isLU ? wordSize : wordSize/2); 4059 __ fmovd(isLU ? tmp1 : tmp2, vtmp); 4060 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case. 4061 __ add(str1, str1, cnt2, __ LSL, isLU ? 0 : 1); 4062 __ eor(rscratch2, tmp1, tmp2); 4063 __ add(str2, str2, cnt2, __ LSL, isLU ? 1 : 0); 4064 __ mov(rscratch1, tmp2); 4065 __ cbnz(rscratch2, CALCULATE_DIFFERENCE); 4066 Register strU = isLU ? str2 : str1, 4067 strL = isLU ? str1 : str2, 4068 tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison 4069 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison 4070 __ push(spilled_regs, sp); 4071 __ sub(tmp2, strL, cnt2); // strL pointer to load from 4072 __ sub(cnt1, strU, cnt2, __ LSL, 1); // strU pointer to load from 4073 4074 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4075 4076 if (SoftwarePrefetchHintDistance >= 0) { 4077 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 4078 __ br(__ LT, SMALL_LOOP); 4079 __ bind(LARGE_LOOP_PREFETCH); 4080 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance)); 4081 __ mov(tmp4, 2); 4082 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 4083 __ bind(LARGE_LOOP_PREFETCH_REPEAT1); 4084 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4085 __ subs(tmp4, tmp4, 1); 4086 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1); 4087 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 4088 __ mov(tmp4, 2); 4089 __ bind(LARGE_LOOP_PREFETCH_REPEAT2); 4090 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4091 __ subs(tmp4, tmp4, 1); 4092 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2); 4093 __ sub(cnt2, cnt2, 64); 4094 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 4095 __ br(__ GE, LARGE_LOOP_PREFETCH); 4096 } 4097 __ cbz(cnt2, LOAD_LAST); // no characters left except last load 4098 __ subs(cnt2, cnt2, 16); 4099 __ br(__ LT, TAIL); 4100 __ b(SMALL_LOOP_ENTER); 4101 __ bind(SMALL_LOOP); // smaller loop 4102 __ subs(cnt2, cnt2, 16); 4103 __ bind(SMALL_LOOP_ENTER); 4104 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4105 __ br(__ GE, SMALL_LOOP); 4106 __ cbz(cnt2, LOAD_LAST); 4107 __ bind(TAIL); // 1..15 characters left 4108 __ subs(zr, cnt2, -8); 4109 __ br(__ GT, TAIL_LOAD_16); 4110 __ ldrd(vtmp, Address(tmp2)); 4111 __ zip1(vtmp3, __ T8B, vtmp, vtmpZ); 4112 4113 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4114 __ fmovd(tmpL, vtmp3); 4115 __ eor(rscratch2, tmp3, tmpL); 4116 __ cbnz(rscratch2, DIFF2); 4117 __ umov(tmpL, vtmp3, __ D, 1); 4118 __ eor(rscratch2, tmpU, tmpL); 4119 __ cbnz(rscratch2, DIFF1); 4120 __ b(LOAD_LAST); 4121 __ bind(TAIL_LOAD_16); 4122 __ ldrq(vtmp, Address(tmp2)); 4123 __ ldr(tmpU, Address(__ post(cnt1, 
8))); 4124 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 4125 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 4126 __ fmovd(tmpL, vtmp3); 4127 __ eor(rscratch2, tmp3, tmpL); 4128 __ cbnz(rscratch2, DIFF2); 4129 4130 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4131 __ umov(tmpL, vtmp3, __ D, 1); 4132 __ eor(rscratch2, tmpU, tmpL); 4133 __ cbnz(rscratch2, DIFF1); 4134 4135 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4136 __ fmovd(tmpL, vtmp); 4137 __ eor(rscratch2, tmp3, tmpL); 4138 __ cbnz(rscratch2, DIFF2); 4139 4140 __ umov(tmpL, vtmp, __ D, 1); 4141 __ eor(rscratch2, tmpU, tmpL); 4142 __ cbnz(rscratch2, DIFF1); 4143 __ b(LOAD_LAST); 4144 __ bind(DIFF2); 4145 __ mov(tmpU, tmp3); 4146 __ bind(DIFF1); 4147 __ pop(spilled_regs, sp); 4148 __ b(CALCULATE_DIFFERENCE); 4149 __ bind(LOAD_LAST); 4150 __ pop(spilled_regs, sp); 4151 4152 __ ldrs(vtmp, Address(strL)); 4153 __ ldr(tmpU, Address(strU)); 4154 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 4155 __ fmovd(tmpL, vtmp); 4156 4157 __ eor(rscratch2, tmpU, tmpL); 4158 __ cbz(rscratch2, DONE); 4159 4160 // Find the first different characters in the longwords and 4161 // compute their difference. 4162 __ bind(CALCULATE_DIFFERENCE); 4163 __ rev(rscratch2, rscratch2); 4164 __ clz(rscratch2, rscratch2); 4165 __ andr(rscratch2, rscratch2, -16); 4166 __ lsrv(tmp1, tmp1, rscratch2); 4167 __ uxthw(tmp1, tmp1); 4168 __ lsrv(rscratch1, rscratch1, rscratch2); 4169 __ uxthw(rscratch1, rscratch1); 4170 __ subw(result, tmp1, rscratch1); 4171 __ bind(DONE); 4172 __ ret(lr); 4173 return entry; 4174 } 4175 4176 // r0 = result 4177 // r1 = str1 4178 // r2 = cnt1 4179 // r3 = str2 4180 // r4 = cnt2 4181 // r10 = tmp1 4182 // r11 = tmp2 4183 address generate_compare_long_string_same_encoding(bool isLL) { 4184 __ align(CodeEntryAlignment); 4185 StubCodeMark mark(this, "StubRoutines", isLL 4186 ? "compare_long_string_same_encoding LL" 4187 : "compare_long_string_same_encoding UU"); 4188 address entry = __ pc(); 4189 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 4190 tmp1 = r10, tmp2 = r11; 4191 Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL, 4192 LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF, 4193 DIFF_LAST_POSITION, DIFF_LAST_POSITION2; 4194 // exit from large loop when less than 64 bytes left to read or we're about 4195 // to prefetch memory behind array border 4196 int largeLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); 4197 // cnt1/cnt2 contains amount of characters to compare. cnt1 can be re-used 4198 // update cnt2 counter with already loaded 8 bytes 4199 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2)); 4200 // update pointers, because of previous read 4201 __ add(str1, str1, wordSize); 4202 __ add(str2, str2, wordSize); 4203 if (SoftwarePrefetchHintDistance >= 0) { 4204 __ bind(LARGE_LOOP_PREFETCH); 4205 __ prfm(Address(str1, SoftwarePrefetchHintDistance)); 4206 __ prfm(Address(str2, SoftwarePrefetchHintDistance)); 4207 compare_string_16_bytes_same(DIFF, DIFF2); 4208 compare_string_16_bytes_same(DIFF, DIFF2); 4209 __ sub(cnt2, cnt2, isLL ? 64 : 32); 4210 compare_string_16_bytes_same(DIFF, DIFF2); 4211 __ subs(rscratch2, cnt2, largeLoopExitCondition); 4212 compare_string_16_bytes_same(DIFF, DIFF2); 4213 __ br(__ GT, LARGE_LOOP_PREFETCH); 4214 __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left? 4215 // less than 16 bytes left? 4216 __ subs(cnt2, cnt2, isLL ? 16 : 8); 4217 __ br(__ LT, TAIL); 4218 } 4219 __ bind(SMALL_LOOP); 4220 compare_string_16_bytes_same(DIFF, DIFF2); 4221 __ subs(cnt2, cnt2, isLL ? 
16 : 8); 4222 __ br(__ GE, SMALL_LOOP); 4223 __ bind(TAIL); 4224 __ adds(cnt2, cnt2, isLL ? 16 : 8); 4225 __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF); 4226 __ subs(cnt2, cnt2, isLL ? 8 : 4); 4227 __ br(__ LE, CHECK_LAST); 4228 __ eor(rscratch2, tmp1, tmp2); 4229 __ cbnz(rscratch2, DIFF); 4230 __ ldr(tmp1, Address(__ post(str1, 8))); 4231 __ ldr(tmp2, Address(__ post(str2, 8))); 4232 __ sub(cnt2, cnt2, isLL ? 8 : 4); 4233 __ bind(CHECK_LAST); 4234 if (!isLL) { 4235 __ add(cnt2, cnt2, cnt2); // now in bytes 4236 } 4237 __ eor(rscratch2, tmp1, tmp2); 4238 __ cbnz(rscratch2, DIFF); 4239 __ ldr(rscratch1, Address(str1, cnt2)); 4240 __ ldr(cnt1, Address(str2, cnt2)); 4241 __ eor(rscratch2, rscratch1, cnt1); 4242 __ cbz(rscratch2, LENGTH_DIFF); 4243 // Find the first different characters in the longwords and 4244 // compute their difference. 4245 __ bind(DIFF2); 4246 __ rev(rscratch2, rscratch2); 4247 __ clz(rscratch2, rscratch2); 4248 __ andr(rscratch2, rscratch2, isLL ? -8 : -16); 4249 __ lsrv(rscratch1, rscratch1, rscratch2); 4250 if (isLL) { 4251 __ lsrv(cnt1, cnt1, rscratch2); 4252 __ uxtbw(rscratch1, rscratch1); 4253 __ uxtbw(cnt1, cnt1); 4254 } else { 4255 __ lsrv(cnt1, cnt1, rscratch2); 4256 __ uxthw(rscratch1, rscratch1); 4257 __ uxthw(cnt1, cnt1); 4258 } 4259 __ subw(result, rscratch1, cnt1); 4260 __ b(LENGTH_DIFF); 4261 __ bind(DIFF); 4262 __ rev(rscratch2, rscratch2); 4263 __ clz(rscratch2, rscratch2); 4264 __ andr(rscratch2, rscratch2, isLL ? -8 : -16); 4265 __ lsrv(tmp1, tmp1, rscratch2); 4266 if (isLL) { 4267 __ lsrv(tmp2, tmp2, rscratch2); 4268 __ uxtbw(tmp1, tmp1); 4269 __ uxtbw(tmp2, tmp2); 4270 } else { 4271 __ lsrv(tmp2, tmp2, rscratch2); 4272 __ uxthw(tmp1, tmp1); 4273 __ uxthw(tmp2, tmp2); 4274 } 4275 __ subw(result, tmp1, tmp2); 4276 __ b(LENGTH_DIFF); 4277 __ bind(LAST_CHECK_AND_LENGTH_DIFF); 4278 __ eor(rscratch2, tmp1, tmp2); 4279 __ cbnz(rscratch2, DIFF); 4280 __ bind(LENGTH_DIFF); 4281 __ ret(lr); 4282 return entry; 4283 } 4284 4285 void generate_compare_long_strings() { 4286 StubRoutines::aarch64::_compare_long_string_LL 4287 = generate_compare_long_string_same_encoding(true); 4288 StubRoutines::aarch64::_compare_long_string_UU 4289 = generate_compare_long_string_same_encoding(false); 4290 StubRoutines::aarch64::_compare_long_string_LU 4291 = generate_compare_long_string_different_encoding(true); 4292 StubRoutines::aarch64::_compare_long_string_UL 4293 = generate_compare_long_string_different_encoding(false); 4294 } 4295 4296 // R0 = result 4297 // R1 = str2 4298 // R2 = cnt1 4299 // R3 = str1 4300 // R4 = cnt2 4301 // This generic linear code uses a few additional ideas, which make it faster: 4302 // 1) we can safely keep at least the 1st register of the pattern (since length >= 8) 4303 // in order to skip the initial load (helps on systems with 1 ld pipeline) 4304 // 2) we can use the "fast" single-character search algorithm to find the 4305 // first symbol with fewer branches (1 branch per loaded register instead 4306 // of a branch per symbol); this is where constants like 4307 // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from 4308 // 3) after loading and analyzing the 1st register of the source string, it can be 4309 // reused to search for every 1st-character entry, saving a few loads in 4310 // comparison with a "simpler-but-slower" implementation 4311 // 4) in order to avoid lots of push/pop operations, the code below heavily 4312 // re-uses/re-initializes/compresses register values, which makes the code 4313 // larger and a bit less readable; however,
most of extra operations are 4314 // issued during loads or branches, so, penalty is minimal 4315 address generate_string_indexof_linear(bool str1_isL, bool str2_isL) { 4316 const char* stubName = str1_isL 4317 ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul") 4318 : "indexof_linear_uu"; 4319 __ align(CodeEntryAlignment); 4320 StubCodeMark mark(this, "StubRoutines", stubName); 4321 address entry = __ pc(); 4322 4323 int str1_chr_size = str1_isL ? 1 : 2; 4324 int str2_chr_size = str2_isL ? 1 : 2; 4325 int str1_chr_shift = str1_isL ? 0 : 1; 4326 int str2_chr_shift = str2_isL ? 0 : 1; 4327 bool isL = str1_isL && str2_isL; 4328 // parameters 4329 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4; 4330 // temporary registers 4331 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23; 4332 RegSet spilled_regs = RegSet::range(tmp1, tmp4); 4333 // redefinitions 4334 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3; 4335 4336 __ push(spilled_regs, sp); 4337 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, 4338 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 4339 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 4340 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 4341 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 4342 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 4343 // Read whole register from str1. It is safe, because length >=8 here 4344 __ ldr(ch1, Address(str1)); 4345 // Read whole register from str2. It is safe, because length >=8 here 4346 __ ldr(ch2, Address(str2)); 4347 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF); 4348 if (str1_isL != str2_isL) { 4349 __ eor(v0, __ T16B, v0, v0); 4350 } 4351 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 4352 __ mul(first, first, tmp1); 4353 // check if we have less than 1 register to check 4354 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1); 4355 if (str1_isL != str2_isL) { 4356 __ fmovd(v1, ch1); 4357 } 4358 __ br(__ LE, L_SMALL); 4359 __ eor(ch2, first, ch2); 4360 if (str1_isL != str2_isL) { 4361 __ zip1(v1, __ T16B, v1, v0); 4362 } 4363 __ sub(tmp2, ch2, tmp1); 4364 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4365 __ bics(tmp2, tmp2, ch2); 4366 if (str1_isL != str2_isL) { 4367 __ fmovd(ch1, v1); 4368 } 4369 __ br(__ NE, L_HAS_ZERO); 4370 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 4371 __ add(result, result, wordSize/str2_chr_size); 4372 __ add(str2, str2, wordSize); 4373 __ br(__ LT, L_POST_LOOP); 4374 __ BIND(L_LOOP); 4375 __ ldr(ch2, Address(str2)); 4376 __ eor(ch2, first, ch2); 4377 __ sub(tmp2, ch2, tmp1); 4378 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4379 __ bics(tmp2, tmp2, ch2); 4380 __ br(__ NE, L_HAS_ZERO); 4381 __ BIND(L_LOOP_PROCEED); 4382 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 4383 __ add(str2, str2, wordSize); 4384 __ add(result, result, wordSize/str2_chr_size); 4385 __ br(__ GE, L_LOOP); 4386 __ BIND(L_POST_LOOP); 4387 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check 4388 __ br(__ LE, NOMATCH); 4389 __ ldr(ch2, Address(str2)); 4390 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 4391 __ eor(ch2, first, ch2); 4392 __ sub(tmp2, ch2, tmp1); 4393 __ orr(ch2, ch2, str2_isL ? 
0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4394 __ mov(tmp4, -1); // all bits set 4395 __ b(L_SMALL_PROCEED); 4396 __ align(OptoLoopAlignment); 4397 __ BIND(L_SMALL); 4398 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 4399 __ eor(ch2, first, ch2); 4400 if (str1_isL != str2_isL) { 4401 __ zip1(v1, __ T16B, v1, v0); 4402 } 4403 __ sub(tmp2, ch2, tmp1); 4404 __ mov(tmp4, -1); // all bits set 4405 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4406 if (str1_isL != str2_isL) { 4407 __ fmovd(ch1, v1); // move converted 4 symbols 4408 } 4409 __ BIND(L_SMALL_PROCEED); 4410 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits. 4411 __ bic(tmp2, tmp2, ch2); 4412 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check 4413 __ rbit(tmp2, tmp2); 4414 __ br(__ EQ, NOMATCH); 4415 __ BIND(L_SMALL_HAS_ZERO_LOOP); 4416 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's 4417 __ cmp(cnt1, u1(wordSize/str2_chr_size)); 4418 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2); 4419 if (str2_isL) { // LL 4420 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 4421 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 4422 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 4423 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 4424 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4425 } else { 4426 __ mov(ch2, 0xE); // all bits in byte set except last one 4427 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4428 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4429 __ lslv(tmp2, tmp2, tmp4); 4430 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4431 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4432 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4433 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4434 } 4435 __ cmp(ch1, ch2); 4436 __ mov(tmp4, wordSize/str2_chr_size); 4437 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4438 __ BIND(L_SMALL_CMP_LOOP); 4439 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 4440 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 4441 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 4442 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 4443 __ add(tmp4, tmp4, 1); 4444 __ cmp(tmp4, cnt1); 4445 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP); 4446 __ cmp(first, ch2); 4447 __ br(__ EQ, L_SMALL_CMP_LOOP); 4448 __ BIND(L_SMALL_CMP_LOOP_NOMATCH); 4449 __ cbz(tmp2, NOMATCH); // no more matches. exit 4450 __ clz(tmp4, tmp2); 4451 __ add(result, result, 1); // advance index 4452 __ add(str2, str2, str2_chr_size); // advance pointer 4453 __ b(L_SMALL_HAS_ZERO_LOOP); 4454 __ align(OptoLoopAlignment); 4455 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP); 4456 __ cmp(first, ch2); 4457 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4458 __ b(DONE); 4459 __ align(OptoLoopAlignment); 4460 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2); 4461 if (str2_isL) { // LL 4462 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 4463 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 
4464 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 4465 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 4466 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4467 } else { 4468 __ mov(ch2, 0xE); // all bits in byte set except last one 4469 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4470 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4471 __ lslv(tmp2, tmp2, tmp4); 4472 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4473 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4474 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4475 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4476 } 4477 __ cmp(ch1, ch2); 4478 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4479 __ b(DONE); 4480 __ align(OptoLoopAlignment); 4481 __ BIND(L_HAS_ZERO); 4482 __ rbit(tmp2, tmp2); 4483 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's 4484 // Now, perform compression of counters(cnt2 and cnt1) into one register. 4485 // It's fine because both counters are 32bit and are not changed in this 4486 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop. 4487 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2); 4488 __ sub(result, result, 1); 4489 __ BIND(L_HAS_ZERO_LOOP); 4490 __ mov(cnt1, wordSize/str2_chr_size); 4491 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2); 4492 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare 4493 if (str2_isL) { 4494 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 4495 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4496 __ lslv(tmp2, tmp2, tmp4); 4497 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4498 __ add(tmp4, tmp4, 1); 4499 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4500 __ lsl(tmp2, tmp2, 1); 4501 __ mov(tmp4, wordSize/str2_chr_size); 4502 } else { 4503 __ mov(ch2, 0xE); 4504 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4505 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4506 __ lslv(tmp2, tmp2, tmp4); 4507 __ add(tmp4, tmp4, 1); 4508 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4509 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 4510 __ lsl(tmp2, tmp2, 1); 4511 __ mov(tmp4, wordSize/str2_chr_size); 4512 __ sub(str2, str2, str2_chr_size); 4513 } 4514 __ cmp(ch1, ch2); 4515 __ mov(tmp4, wordSize/str2_chr_size); 4516 __ br(__ NE, L_CMP_LOOP_NOMATCH); 4517 __ BIND(L_CMP_LOOP); 4518 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 4519 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 4520 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 4521 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 4522 __ add(tmp4, tmp4, 1); 4523 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2); 4524 __ br(__ GE, L_CMP_LOOP_LAST_CMP); 4525 __ cmp(cnt1, ch2); 4526 __ br(__ EQ, L_CMP_LOOP); 4527 __ BIND(L_CMP_LOOP_NOMATCH); 4528 // here we're not matched 4529 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. 
Proceed to main loop 4530 __ clz(tmp4, tmp2); 4531 __ add(str2, str2, str2_chr_size); // advance pointer 4532 __ b(L_HAS_ZERO_LOOP); 4533 __ align(OptoLoopAlignment); 4534 __ BIND(L_CMP_LOOP_LAST_CMP); 4535 __ cmp(cnt1, ch2); 4536 __ br(__ NE, L_CMP_LOOP_NOMATCH); 4537 __ b(DONE); 4538 __ align(OptoLoopAlignment); 4539 __ BIND(L_CMP_LOOP_LAST_CMP2); 4540 if (str2_isL) { 4541 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 4542 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4543 __ lslv(tmp2, tmp2, tmp4); 4544 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4545 __ add(tmp4, tmp4, 1); 4546 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4547 __ lsl(tmp2, tmp2, 1); 4548 } else { 4549 __ mov(ch2, 0xE); 4550 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4551 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4552 __ lslv(tmp2, tmp2, tmp4); 4553 __ add(tmp4, tmp4, 1); 4554 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4555 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 4556 __ lsl(tmp2, tmp2, 1); 4557 __ sub(str2, str2, str2_chr_size); 4558 } 4559 __ cmp(ch1, ch2); 4560 __ br(__ NE, L_CMP_LOOP_NOMATCH); 4561 __ b(DONE); 4562 __ align(OptoLoopAlignment); 4563 __ BIND(L_HAS_ZERO_LOOP_NOMATCH); 4564 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until 4565 // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP, 4566 // so, result was increased at max by wordSize/str2_chr_size - 1, so, 4567 // respective high bit wasn't changed. L_LOOP_PROCEED will increase 4568 // result by analyzed characters value, so, we can just reset lower bits 4569 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL 4570 // 2) restore cnt1 and cnt2 values from "compressed" cnt2 4571 // 3) advance str2 value to represent next str2 octet. result & 7/3 is 4572 // index of last analyzed substring inside current octet. So, str2 in at 4573 // respective start address. 
We need to advance it to next octet 4574 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed 4575 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2); 4576 __ bfm(result, zr, 0, 2 - str2_chr_shift); 4577 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2 4578 __ movw(cnt2, cnt2); 4579 __ b(L_LOOP_PROCEED); 4580 __ align(OptoLoopAlignment); 4581 __ BIND(NOMATCH); 4582 __ mov(result, -1); 4583 __ BIND(DONE); 4584 __ pop(spilled_regs, sp); 4585 __ ret(lr); 4586 return entry; 4587 } 4588 4589 void generate_string_indexof_stubs() { 4590 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true); 4591 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false); 4592 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); 4593 } 4594 4595 void inflate_and_store_2_fp_registers(bool generatePrfm, 4596 FloatRegister src1, FloatRegister src2) { 4597 Register dst = r1; 4598 __ zip1(v1, __ T16B, src1, v0); 4599 __ zip2(v2, __ T16B, src1, v0); 4600 if (generatePrfm) { 4601 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM); 4602 } 4603 __ zip1(v3, __ T16B, src2, v0); 4604 __ zip2(v4, __ T16B, src2, v0); 4605 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64))); 4606 } 4607 4608 // R0 = src 4609 // R1 = dst 4610 // R2 = len 4611 // R3 = len >> 3 4612 // V0 = 0 4613 // v1 = loaded 8 bytes 4614 address generate_large_byte_array_inflate() { 4615 __ align(CodeEntryAlignment); 4616 StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate"); 4617 address entry = __ pc(); 4618 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE; 4619 Register src = r0, dst = r1, len = r2, octetCounter = r3; 4620 const int large_loop_threshold = MAX(64, SoftwarePrefetchHintDistance)/8 + 4; 4621 4622 // do one more 8-byte read to have address 16-byte aligned in most cases 4623 // also use single store instruction 4624 __ ldrd(v2, __ post(src, 8)); 4625 __ sub(octetCounter, octetCounter, 2); 4626 __ zip1(v1, __ T16B, v1, v0); 4627 __ zip1(v2, __ T16B, v2, v0); 4628 __ st1(v1, v2, __ T16B, __ post(dst, 32)); 4629 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 4630 __ subs(rscratch1, octetCounter, large_loop_threshold); 4631 __ br(__ LE, LOOP_START); 4632 __ b(LOOP_PRFM_START); 4633 __ bind(LOOP_PRFM); 4634 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 4635 __ bind(LOOP_PRFM_START); 4636 __ prfm(Address(src, SoftwarePrefetchHintDistance)); 4637 __ sub(octetCounter, octetCounter, 8); 4638 __ subs(rscratch1, octetCounter, large_loop_threshold); 4639 inflate_and_store_2_fp_registers(true, v3, v4); 4640 inflate_and_store_2_fp_registers(true, v5, v6); 4641 __ br(__ GT, LOOP_PRFM); 4642 __ cmp(octetCounter, (u1)8); 4643 __ br(__ LT, DONE); 4644 __ bind(LOOP); 4645 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 4646 __ bind(LOOP_START); 4647 __ sub(octetCounter, octetCounter, 8); 4648 __ cmp(octetCounter, (u1)8); 4649 inflate_and_store_2_fp_registers(false, v3, v4); 4650 inflate_and_store_2_fp_registers(false, v5, v6); 4651 __ br(__ GE, LOOP); 4652 __ bind(DONE); 4653 __ ret(lr); 4654 return entry; 4655 } 4656 4657 /** 4658 * Arguments: 4659 * 4660 * Input: 4661 * c_rarg0 - current state address 4662 * c_rarg1 - H key address 4663 * c_rarg2 - data address 4664 * c_rarg3 - number of blocks 4665 * 4666 * Output: 4667 * Updated state at c_rarg0 4668 */ 4669 address generate_ghash_processBlocks() { 4670 // Bafflingly, GCM uses 
little-endian for the byte order, but 4671 // big-endian for the bit order. For example, the polynomial 1 is 4672 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 4673 // 4674 // So, we must either reverse the bytes in each word and do 4675 // everything big-endian or reverse the bits in each byte and do 4676 // it little-endian. On AArch64 it's more idiomatic to reverse 4677 // the bits in each byte (we have an instruction, RBIT, to do 4678 // that) and keep the data in little-endian bit order throught the 4679 // calculation, bit-reversing the inputs and outputs. 4680 4681 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks"); 4682 __ align(wordSize * 2); 4683 address p = __ pc(); 4684 __ emit_int64(0x87); // The low-order bits of the field 4685 // polynomial (i.e. p = z^7+z^2+z+1) 4686 // repeated in the low and high parts of a 4687 // 128-bit vector 4688 __ emit_int64(0x87); 4689 4690 __ align(CodeEntryAlignment); 4691 address start = __ pc(); 4692 4693 Register state = c_rarg0; 4694 Register subkeyH = c_rarg1; 4695 Register data = c_rarg2; 4696 Register blocks = c_rarg3; 4697 4698 FloatRegister vzr = v30; 4699 __ eor(vzr, __ T16B, vzr, vzr); // zero register 4700 4701 __ ldrq(v0, Address(state)); 4702 __ ldrq(v1, Address(subkeyH)); 4703 4704 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 4705 __ rbit(v0, __ T16B, v0); 4706 __ rev64(v1, __ T16B, v1); 4707 __ rbit(v1, __ T16B, v1); 4708 4709 __ ldrq(v26, p); 4710 4711 __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 4712 __ eor(v16, __ T16B, v16, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 4713 4714 { 4715 Label L_ghash_loop; 4716 __ bind(L_ghash_loop); 4717 4718 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 4719 // reversing each byte 4720 __ rbit(v2, __ T16B, v2); 4721 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 4722 4723 // Multiply state in v2 by subkey in v1 4724 ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 4725 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16, 4726 /*temps*/v6, v20, v18, v21); 4727 // Reduce v7:v5 by the field polynomial 4728 ghash_reduce(v0, v5, v7, v26, vzr, v20); 4729 4730 __ sub(blocks, blocks, 1); 4731 __ cbnz(blocks, L_ghash_loop); 4732 } 4733 4734 // The bit-reversed result is at this point in v0 4735 __ rev64(v1, __ T16B, v0); 4736 __ rbit(v1, __ T16B, v1); 4737 4738 __ st1(v1, __ T16B, state); 4739 __ ret(lr); 4740 4741 return start; 4742 } 4743 4744 // Continuation point for throwing of implicit exceptions that are 4745 // not handled in the current activation. Fabricates an exception 4746 // oop and initiates normal exception dispatching in this 4747 // frame. Since we need to preserve callee-saved values (currently 4748 // only for C2, but done for C1 as well) we need a callee-saved oop 4749 // map and therefore have to make these stubs into RuntimeStubs 4750 // rather than BufferBlobs. If the compiler needs all registers to 4751 // be preserved between the fault point and the exception handler 4752 // then it must assume responsibility for that in 4753 // AbstractCompiler::continuation_for_implicit_null_exception or 4754 // continuation_for_implicit_division_by_zero_exception. All other 4755 // implicit exceptions (e.g., NullPointerException or 4756 // AbstractMethodError on entry) are either at call sites or 4757 // otherwise assume that stack unwinding will be initiated, so 4758 // caller saved registers were assumed volatile in the compiler. 
4759 4760 #undef __ 4761 #define __ masm-> 4762 4763 address generate_throw_exception(const char* name, 4764 address runtime_entry, 4765 Register arg1 = noreg, 4766 Register arg2 = noreg) { 4767 // Information about frame layout at time of blocking runtime call. 4768 // Note that we only have to preserve callee-saved registers since 4769 // the compilers are responsible for supplying a continuation point 4770 // if they expect all registers to be preserved. 4771 // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0 4772 enum layout { 4773 rfp_off = 0, 4774 rfp_off2, 4775 return_off, 4776 return_off2, 4777 framesize // inclusive of return address 4778 }; 4779 4780 int insts_size = 512; 4781 int locs_size = 64; 4782 4783 CodeBuffer code(name, insts_size, locs_size); 4784 OopMapSet* oop_maps = new OopMapSet(); 4785 MacroAssembler* masm = new MacroAssembler(&code); 4786 4787 address start = __ pc(); 4788 4789 // This is an inlined and slightly modified version of call_VM 4790 // which has the ability to fetch the return PC out of 4791 // thread-local storage and also sets up last_Java_sp slightly 4792 // differently than the real call_VM 4793 4794 __ enter(); // Save FP and LR before call 4795 4796 assert(is_even(framesize/2), "sp not 16-byte aligned"); 4797 4798 // lr and fp are already in place 4799 __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog 4800 4801 int frame_complete = __ pc() - start; 4802 4803 // Set up last_Java_sp and last_Java_fp 4804 address the_pc = __ pc(); 4805 __ set_last_Java_frame(sp, rfp, the_pc, rscratch1); 4806 4807 // Call runtime 4808 if (arg1 != noreg) { 4809 assert(arg2 != c_rarg1, "clobbered"); 4810 __ mov(c_rarg1, arg1); 4811 } 4812 if (arg2 != noreg) { 4813 __ mov(c_rarg2, arg2); 4814 } 4815 __ mov(c_rarg0, rthread); 4816 BLOCK_COMMENT("call runtime_entry"); 4817 __ mov(rscratch1, runtime_entry); 4818 __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1); 4819 4820 // Generate oop map 4821 OopMap* map = new OopMap(framesize, 0); 4822 4823 oop_maps->add_gc_map(the_pc - start, map); 4824 4825 __ reset_last_Java_frame(true); 4826 __ maybe_isb(); 4827 4828 __ leave(); 4829 4830 // check for pending exceptions 4831 #ifdef ASSERT 4832 Label L; 4833 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 4834 __ cbnz(rscratch1, L); 4835 __ should_not_reach_here(); 4836 __ bind(L); 4837 #endif // ASSERT 4838 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 4839 4840 4841 // codeBlob framesize is in words (not VMRegImpl::slot_size) 4842 RuntimeStub* stub = 4843 RuntimeStub::new_runtime_stub(name, 4844 &code, 4845 frame_complete, 4846 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 4847 oop_maps, false); 4848 return stub->entry_point(); 4849 } 4850 4851 class MontgomeryMultiplyGenerator : public MacroAssembler { 4852 4853 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 4854 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 4855 4856 RegSet _toSave; 4857 bool _squaring; 4858 4859 public: 4860 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 4861 : MacroAssembler(as->code()), _squaring(squaring) { 4862 4863 // Register allocation 4864 4865 Register reg = c_rarg0; 4866 Pa_base = reg; // Argument registers 4867 if (squaring) 4868 Pb_base = Pa_base; 4869 else 4870 Pb_base = ++reg; 4871 Pn_base = ++reg; 4872 Rlen= ++reg; 4873 inv = ++reg; 4874 Pm_base = ++reg; 4875 4876 // Working registers: 4877 Ra = ++reg; // The current digit of a, b, n, and m. 
4878 Rb = ++reg; 4879 Rm = ++reg; 4880 Rn = ++reg; 4881 4882 Pa = ++reg; // Pointers to the current/next digit of a, b, n, and m. 4883 Pb = ++reg; 4884 Pm = ++reg; 4885 Pn = ++reg; 4886 4887 t0 = ++reg; // Three registers which form a 4888 t1 = ++reg; // triple-precision accumuator. 4889 t2 = ++reg; 4890 4891 Ri = ++reg; // Inner and outer loop indexes. 4892 Rj = ++reg; 4893 4894 Rhi_ab = ++reg; // Product registers: low and high parts 4895 Rlo_ab = ++reg; // of a*b and m*n. 4896 Rhi_mn = ++reg; 4897 Rlo_mn = ++reg; 4898 4899 // r19 and up are callee-saved. 4900 _toSave = RegSet::range(r19, reg) + Pm_base; 4901 } 4902 4903 private: 4904 void save_regs() { 4905 push(_toSave, sp); 4906 } 4907 4908 void restore_regs() { 4909 pop(_toSave, sp); 4910 } 4911 4912 template <typename T> 4913 void unroll_2(Register count, T block) { 4914 Label loop, end, odd; 4915 tbnz(count, 0, odd); 4916 cbz(count, end); 4917 align(16); 4918 bind(loop); 4919 (this->*block)(); 4920 bind(odd); 4921 (this->*block)(); 4922 subs(count, count, 2); 4923 br(Assembler::GT, loop); 4924 bind(end); 4925 } 4926 4927 template <typename T> 4928 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 4929 Label loop, end, odd; 4930 tbnz(count, 0, odd); 4931 cbz(count, end); 4932 align(16); 4933 bind(loop); 4934 (this->*block)(d, s, tmp); 4935 bind(odd); 4936 (this->*block)(d, s, tmp); 4937 subs(count, count, 2); 4938 br(Assembler::GT, loop); 4939 bind(end); 4940 } 4941 4942 void pre1(RegisterOrConstant i) { 4943 block_comment("pre1"); 4944 // Pa = Pa_base; 4945 // Pb = Pb_base + i; 4946 // Pm = Pm_base; 4947 // Pn = Pn_base + i; 4948 // Ra = *Pa; 4949 // Rb = *Pb; 4950 // Rm = *Pm; 4951 // Rn = *Pn; 4952 ldr(Ra, Address(Pa_base)); 4953 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 4954 ldr(Rm, Address(Pm_base)); 4955 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4956 lea(Pa, Address(Pa_base)); 4957 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 4958 lea(Pm, Address(Pm_base)); 4959 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4960 4961 // Zero the m*n result. 4962 mov(Rhi_mn, zr); 4963 mov(Rlo_mn, zr); 4964 } 4965 4966 // The core multiply-accumulate step of a Montgomery 4967 // multiplication. The idea is to schedule operations as a 4968 // pipeline so that instructions with long latencies (loads and 4969 // multiplies) have time to complete before their results are 4970 // used. This most benefits in-order implementations of the 4971 // architecture but out-of-order ones also benefit. 4972 void step() { 4973 block_comment("step"); 4974 // MACC(Ra, Rb, t0, t1, t2); 4975 // Ra = *++Pa; 4976 // Rb = *--Pb; 4977 umulh(Rhi_ab, Ra, Rb); 4978 mul(Rlo_ab, Ra, Rb); 4979 ldr(Ra, pre(Pa, wordSize)); 4980 ldr(Rb, pre(Pb, -wordSize)); 4981 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 4982 // previous iteration. 
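// For reference, MACC(x, y, t0, t1, t2) in the comments denotes a 64x64->128
// multiply accumulated into the triple-precision (192-bit) accumulator
// t2:t1:t0. A C sketch (illustrative only; assumes <stdint.h> types and a
// compiler-provided unsigned __int128):
//
//   static void MACC(uint64_t x, uint64_t y,
//                    uint64_t& t0, uint64_t& t1, uint64_t& t2) {
//     unsigned __int128 prod = (unsigned __int128)x * y;
//     unsigned __int128 lo   = (unsigned __int128)t0 + (uint64_t)prod;
//     t0 = (uint64_t)lo;                               // low word + carry out
//     unsigned __int128 hi   = (unsigned __int128)t1 + (uint64_t)(prod >> 64)
//                              + (uint64_t)(lo >> 64);
//     t1 = (uint64_t)hi;                               // high word + carry out
//     t2 += (uint64_t)(hi >> 64);                      // top word absorbs carry
//   }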
4983 // MACC(Rm, Rn, t0, t1, t2); 4984 // Rm = *++Pm; 4985 // Rn = *--Pn; 4986 umulh(Rhi_mn, Rm, Rn); 4987 mul(Rlo_mn, Rm, Rn); 4988 ldr(Rm, pre(Pm, wordSize)); 4989 ldr(Rn, pre(Pn, -wordSize)); 4990 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4991 } 4992 4993 void post1() { 4994 block_comment("post1"); 4995 4996 // MACC(Ra, Rb, t0, t1, t2); 4997 // Ra = *++Pa; 4998 // Rb = *--Pb; 4999 umulh(Rhi_ab, Ra, Rb); 5000 mul(Rlo_ab, Ra, Rb); 5001 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 5002 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 5003 5004 // *Pm = Rm = t0 * inv; 5005 mul(Rm, t0, inv); 5006 str(Rm, Address(Pm)); 5007 5008 // MACC(Rm, Rn, t0, t1, t2); 5009 // t0 = t1; t1 = t2; t2 = 0; 5010 umulh(Rhi_mn, Rm, Rn); 5011 5012 #ifndef PRODUCT 5013 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 5014 { 5015 mul(Rlo_mn, Rm, Rn); 5016 add(Rlo_mn, t0, Rlo_mn); 5017 Label ok; 5018 cbz(Rlo_mn, ok); { 5019 stop("broken Montgomery multiply"); 5020 } bind(ok); 5021 } 5022 #endif 5023 // We have very carefully set things up so that 5024 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 5025 // the lower half of Rm * Rn because we know the result already: 5026 // it must be -t0. t0 + (-t0) must generate a carry iff 5027 // t0 != 0. So, rather than do a mul and an adds we just set 5028 // the carry flag iff t0 is nonzero. 5029 // 5030 // mul(Rlo_mn, Rm, Rn); 5031 // adds(zr, t0, Rlo_mn); 5032 subs(zr, t0, 1); // Set carry iff t0 is nonzero 5033 adcs(t0, t1, Rhi_mn); 5034 adc(t1, t2, zr); 5035 mov(t2, zr); 5036 } 5037 5038 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 5039 block_comment("pre2"); 5040 // Pa = Pa_base + i-len; 5041 // Pb = Pb_base + len; 5042 // Pm = Pm_base + i-len; 5043 // Pn = Pn_base + len; 5044 5045 if (i.is_register()) { 5046 sub(Rj, i.as_register(), len); 5047 } else { 5048 mov(Rj, i.as_constant()); 5049 sub(Rj, Rj, len); 5050 } 5051 // Rj == i-len 5052 5053 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 5054 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 5055 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 5056 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 5057 5058 // Ra = *++Pa; 5059 // Rb = *--Pb; 5060 // Rm = *++Pm; 5061 // Rn = *--Pn; 5062 ldr(Ra, pre(Pa, wordSize)); 5063 ldr(Rb, pre(Pb, -wordSize)); 5064 ldr(Rm, pre(Pm, wordSize)); 5065 ldr(Rn, pre(Pn, -wordSize)); 5066 5067 mov(Rhi_mn, zr); 5068 mov(Rlo_mn, zr); 5069 } 5070 5071 void post2(RegisterOrConstant i, RegisterOrConstant len) { 5072 block_comment("post2"); 5073 if (i.is_constant()) { 5074 mov(Rj, i.as_constant()-len.as_constant()); 5075 } else { 5076 sub(Rj, i.as_register(), len); 5077 } 5078 5079 adds(t0, t0, Rlo_mn); // The pending m*n, low part 5080 5081 // As soon as we know the least significant digit of our result, 5082 // store it. 5083 // Pm_base[i-len] = t0; 5084 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 5085 5086 // t0 = t1; t1 = t2; t2 = 0; 5087 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 5088 adc(t1, t2, zr); 5089 mov(t2, zr); 5090 } 5091 5092 // A carry in t0 after Montgomery multiplication means that we 5093 // should subtract multiples of n from our result in m. We'll 5094 // keep doing that until there is no carry. 
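// In C terms the loop emitted by normalize() is roughly (illustrative sketch
// only; Pm/Pn are the little-endian word arrays of m and n, len their length
// in words, t0 the carry word above m, assuming <stdint.h> types):
//
//   while (t0 != 0) {
//     uint64_t borrow = 0;
//     for (size_t i = 0; i < len; i++) {      // m -= n, word by word
//       uint64_t mi = Pm[i], ni = Pn[i];
//       uint64_t d  = mi - ni - borrow;
//       borrow = (mi < ni) || (borrow && mi == ni);
//       Pm[i] = d;
//     }
//     t0 -= borrow;                           // propagate the borrow into t0
//   }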
5095 void normalize(RegisterOrConstant len) { 5096 block_comment("normalize"); 5097 // while (t0) 5098 // t0 = sub(Pm_base, Pn_base, t0, len); 5099 Label loop, post, again; 5100 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 5101 cbz(t0, post); { 5102 bind(again); { 5103 mov(i, zr); 5104 mov(cnt, len); 5105 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 5106 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 5107 subs(zr, zr, zr); // set carry flag, i.e. no borrow 5108 align(16); 5109 bind(loop); { 5110 sbcs(Rm, Rm, Rn); 5111 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 5112 add(i, i, 1); 5113 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 5114 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 5115 sub(cnt, cnt, 1); 5116 } cbnz(cnt, loop); 5117 sbc(t0, t0, zr); 5118 } cbnz(t0, again); 5119 } bind(post); 5120 } 5121 5122 // Move memory at s to d, reversing words. 5123 // Increments d to end of copied memory 5124 // Destroys tmp1, tmp2 5125 // Preserves len 5126 // Leaves s pointing to the address which was in d at start 5127 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 5128 assert(tmp1 < r19 && tmp2 < r19, "register corruption"); 5129 5130 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 5131 mov(tmp1, len); 5132 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 5133 sub(s, d, len, ext::uxtw, LogBytesPerWord); 5134 } 5135 // where 5136 void reverse1(Register d, Register s, Register tmp) { 5137 ldr(tmp, pre(s, -wordSize)); 5138 ror(tmp, tmp, 32); 5139 str(tmp, post(d, wordSize)); 5140 } 5141 5142 void step_squaring() { 5143 // An extra ACC 5144 step(); 5145 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 5146 } 5147 5148 void last_squaring(RegisterOrConstant i) { 5149 Label dont; 5150 // if ((i & 1) == 0) { 5151 tbnz(i.as_register(), 0, dont); { 5152 // MACC(Ra, Rb, t0, t1, t2); 5153 // Ra = *++Pa; 5154 // Rb = *--Pb; 5155 umulh(Rhi_ab, Ra, Rb); 5156 mul(Rlo_ab, Ra, Rb); 5157 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 5158 } bind(dont); 5159 } 5160 5161 void extra_step_squaring() { 5162 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 5163 5164 // MACC(Rm, Rn, t0, t1, t2); 5165 // Rm = *++Pm; 5166 // Rn = *--Pn; 5167 umulh(Rhi_mn, Rm, Rn); 5168 mul(Rlo_mn, Rm, Rn); 5169 ldr(Rm, pre(Pm, wordSize)); 5170 ldr(Rn, pre(Pn, -wordSize)); 5171 } 5172 5173 void post1_squaring() { 5174 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 5175 5176 // *Pm = Rm = t0 * inv; 5177 mul(Rm, t0, inv); 5178 str(Rm, Address(Pm)); 5179 5180 // MACC(Rm, Rn, t0, t1, t2); 5181 // t0 = t1; t1 = t2; t2 = 0; 5182 umulh(Rhi_mn, Rm, Rn); 5183 5184 #ifndef PRODUCT 5185 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 5186 { 5187 mul(Rlo_mn, Rm, Rn); 5188 add(Rlo_mn, t0, Rlo_mn); 5189 Label ok; 5190 cbz(Rlo_mn, ok); { 5191 stop("broken Montgomery multiply"); 5192 } bind(ok); 5193 } 5194 #endif 5195 // We have very carefully set things up so that 5196 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 5197 // the lower half of Rm * Rn because we know the result already: 5198 // it must be -t0. t0 + (-t0) must generate a carry iff 5199 // t0 != 0. So, rather than do a mul and an adds we just set 5200 // the carry flag iff t0 is nonzero. 
5201 // 5202 // mul(Rlo_mn, Rm, Rn); 5203 // adds(zr, t0, Rlo_mn); 5204 subs(zr, t0, 1); // Set carry iff t0 is nonzero 5205 adcs(t0, t1, Rhi_mn); 5206 adc(t1, t2, zr); 5207 mov(t2, zr); 5208 } 5209 5210 void acc(Register Rhi, Register Rlo, 5211 Register t0, Register t1, Register t2) { 5212 adds(t0, t0, Rlo); 5213 adcs(t1, t1, Rhi); 5214 adc(t2, t2, zr); 5215 } 5216 5217 public: 5218 /** 5219 * Fast Montgomery multiplication. The derivation of the 5220 * algorithm is in A Cryptographic Library for the Motorola 5221 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 5222 * 5223 * Arguments: 5224 * 5225 * Inputs for multiplication: 5226 * c_rarg0 - int array elements a 5227 * c_rarg1 - int array elements b 5228 * c_rarg2 - int array elements n (the modulus) 5229 * c_rarg3 - int length 5230 * c_rarg4 - int inv 5231 * c_rarg5 - int array elements m (the result) 5232 * 5233 * Inputs for squaring: 5234 * c_rarg0 - int array elements a 5235 * c_rarg1 - int array elements n (the modulus) 5236 * c_rarg2 - int length 5237 * c_rarg3 - int inv 5238 * c_rarg4 - int array elements m (the result) 5239 * 5240 */ 5241 address generate_multiply() { 5242 Label argh, nothing; 5243 bind(argh); 5244 stop("MontgomeryMultiply total_allocation must be <= 8192"); 5245 5246 align(CodeEntryAlignment); 5247 address entry = pc(); 5248 5249 cbzw(Rlen, nothing); 5250 5251 enter(); 5252 5253 // Make room. 5254 cmpw(Rlen, 512); 5255 br(Assembler::HI, argh); 5256 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 5257 andr(sp, Ra, -2 * wordSize); 5258 5259 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 5260 5261 { 5262 // Copy input args, reversing as we go. We use Ra as a 5263 // temporary variable. 5264 reverse(Ra, Pa_base, Rlen, t0, t1); 5265 if (!_squaring) 5266 reverse(Ra, Pb_base, Rlen, t0, t1); 5267 reverse(Ra, Pn_base, Rlen, t0, t1); 5268 } 5269 5270 // Push all call-saved registers and also Pm_base which we'll need 5271 // at the end. 
5272 save_regs(); 5273 5274 #ifndef PRODUCT 5275 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 5276 { 5277 ldr(Rn, Address(Pn_base, 0)); 5278 mul(Rlo_mn, Rn, inv); 5279 subs(zr, Rlo_mn, -1); 5280 Label ok; 5281 br(EQ, ok); { 5282 stop("broken inverse in Montgomery multiply"); 5283 } bind(ok); 5284 } 5285 #endif 5286 5287 mov(Pm_base, Ra); 5288 5289 mov(t0, zr); 5290 mov(t1, zr); 5291 mov(t2, zr); 5292 5293 block_comment("for (int i = 0; i < len; i++) {"); 5294 mov(Ri, zr); { 5295 Label loop, end; 5296 cmpw(Ri, Rlen); 5297 br(Assembler::GE, end); 5298 5299 bind(loop); 5300 pre1(Ri); 5301 5302 block_comment(" for (j = i; j; j--) {"); { 5303 movw(Rj, Ri); 5304 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 5305 } block_comment(" } // j"); 5306 5307 post1(); 5308 addw(Ri, Ri, 1); 5309 cmpw(Ri, Rlen); 5310 br(Assembler::LT, loop); 5311 bind(end); 5312 block_comment("} // i"); 5313 } 5314 5315 block_comment("for (int i = len; i < 2*len; i++) {"); 5316 mov(Ri, Rlen); { 5317 Label loop, end; 5318 cmpw(Ri, Rlen, Assembler::LSL, 1); 5319 br(Assembler::GE, end); 5320 5321 bind(loop); 5322 pre2(Ri, Rlen); 5323 5324 block_comment(" for (j = len*2-i-1; j; j--) {"); { 5325 lslw(Rj, Rlen, 1); 5326 subw(Rj, Rj, Ri); 5327 subw(Rj, Rj, 1); 5328 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 5329 } block_comment(" } // j"); 5330 5331 post2(Ri, Rlen); 5332 addw(Ri, Ri, 1); 5333 cmpw(Ri, Rlen, Assembler::LSL, 1); 5334 br(Assembler::LT, loop); 5335 bind(end); 5336 } 5337 block_comment("} // i"); 5338 5339 normalize(Rlen); 5340 5341 mov(Ra, Pm_base); // Save Pm_base in Ra 5342 restore_regs(); // Restore caller's Pm_base 5343 5344 // Copy our result into caller's Pm_base 5345 reverse(Pm_base, Ra, Rlen, t0, t1); 5346 5347 leave(); 5348 bind(nothing); 5349 ret(lr); 5350 5351 return entry; 5352 } 5353 // In C, approximately: 5354 5355 // void 5356 // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[], 5357 // unsigned long Pn_base[], unsigned long Pm_base[], 5358 // unsigned long inv, int len) { 5359 // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 5360 // unsigned long *Pa, *Pb, *Pn, *Pm; 5361 // unsigned long Ra, Rb, Rn, Rm; 5362 5363 // int i; 5364 5365 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 5366 5367 // for (i = 0; i < len; i++) { 5368 // int j; 5369 5370 // Pa = Pa_base; 5371 // Pb = Pb_base + i; 5372 // Pm = Pm_base; 5373 // Pn = Pn_base + i; 5374 5375 // Ra = *Pa; 5376 // Rb = *Pb; 5377 // Rm = *Pm; 5378 // Rn = *Pn; 5379 5380 // int iters = i; 5381 // for (j = 0; iters--; j++) { 5382 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 5383 // MACC(Ra, Rb, t0, t1, t2); 5384 // Ra = *++Pa; 5385 // Rb = *--Pb; 5386 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5387 // MACC(Rm, Rn, t0, t1, t2); 5388 // Rm = *++Pm; 5389 // Rn = *--Pn; 5390 // } 5391 5392 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 5393 // MACC(Ra, Rb, t0, t1, t2); 5394 // *Pm = Rm = t0 * inv; 5395 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 5396 // MACC(Rm, Rn, t0, t1, t2); 5397 5398 // assert(t0 == 0, "broken Montgomery multiply"); 5399 5400 // t0 = t1; t1 = t2; t2 = 0; 5401 // } 5402 5403 // for (i = len; i < 2*len; i++) { 5404 // int j; 5405 5406 // Pa = Pa_base + i-len; 5407 // Pb = Pb_base + len; 5408 // Pm = Pm_base + i-len; 5409 // Pn = Pn_base + len; 5410 5411 // Ra = *++Pa; 5412 // Rb = *--Pb; 5413 // Rm = *++Pm; 5414 // Rn = *--Pn; 5415 5416 // int iters = len*2-i-1; 
5417 // for (j = i-len+1; iters--; j++) { 5418 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 5419 // MACC(Ra, Rb, t0, t1, t2); 5420 // Ra = *++Pa; 5421 // Rb = *--Pb; 5422 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5423 // MACC(Rm, Rn, t0, t1, t2); 5424 // Rm = *++Pm; 5425 // Rn = *--Pn; 5426 // } 5427 5428 // Pm_base[i-len] = t0; 5429 // t0 = t1; t1 = t2; t2 = 0; 5430 // } 5431 5432 // while (t0) 5433 // t0 = sub(Pm_base, Pn_base, t0, len); 5434 // } 5435 5436 /** 5437 * Fast Montgomery squaring. This uses asymptotically 25% fewer 5438 * multiplies than Montgomery multiplication so it should be up to 5439 * 25% faster. However, its loop control is more complex and it 5440 * may actually run slower on some machines. 5441 * 5442 * Arguments: 5443 * 5444 * Inputs: 5445 * c_rarg0 - int array elements a 5446 * c_rarg1 - int array elements n (the modulus) 5447 * c_rarg2 - int length 5448 * c_rarg3 - int inv 5449 * c_rarg4 - int array elements m (the result) 5450 * 5451 */ 5452 address generate_square() { 5453 Label argh; 5454 bind(argh); 5455 stop("MontgomeryMultiply total_allocation must be <= 8192"); 5456 5457 align(CodeEntryAlignment); 5458 address entry = pc(); 5459 5460 enter(); 5461 5462 // Make room. 5463 cmpw(Rlen, 512); 5464 br(Assembler::HI, argh); 5465 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 5466 andr(sp, Ra, -2 * wordSize); 5467 5468 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 5469 5470 { 5471 // Copy input args, reversing as we go. We use Ra as a 5472 // temporary variable. 5473 reverse(Ra, Pa_base, Rlen, t0, t1); 5474 reverse(Ra, Pn_base, Rlen, t0, t1); 5475 } 5476 5477 // Push all call-saved registers and also Pm_base which we'll need 5478 // at the end. 5479 save_regs(); 5480 5481 mov(Pm_base, Ra); 5482 5483 mov(t0, zr); 5484 mov(t1, zr); 5485 mov(t2, zr); 5486 5487 block_comment("for (int i = 0; i < len; i++) {"); 5488 mov(Ri, zr); { 5489 Label loop, end; 5490 bind(loop); 5491 cmp(Ri, Rlen); 5492 br(Assembler::GE, end); 5493 5494 pre1(Ri); 5495 5496 block_comment("for (j = (i+1)/2; j; j--) {"); { 5497 add(Rj, Ri, 1); 5498 lsr(Rj, Rj, 1); 5499 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 5500 } block_comment(" } // j"); 5501 5502 last_squaring(Ri); 5503 5504 block_comment(" for (j = i/2; j; j--) {"); { 5505 lsr(Rj, Ri, 1); 5506 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 5507 } block_comment(" } // j"); 5508 5509 post1_squaring(); 5510 add(Ri, Ri, 1); 5511 cmp(Ri, Rlen); 5512 br(Assembler::LT, loop); 5513 5514 bind(end); 5515 block_comment("} // i"); 5516 } 5517 5518 block_comment("for (int i = len; i < 2*len; i++) {"); 5519 mov(Ri, Rlen); { 5520 Label loop, end; 5521 bind(loop); 5522 cmp(Ri, Rlen, Assembler::LSL, 1); 5523 br(Assembler::GE, end); 5524 5525 pre2(Ri, Rlen); 5526 5527 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 5528 lsl(Rj, Rlen, 1); 5529 sub(Rj, Rj, Ri); 5530 sub(Rj, Rj, 1); 5531 lsr(Rj, Rj, 1); 5532 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 5533 } block_comment(" } // j"); 5534 5535 last_squaring(Ri); 5536 5537 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 5538 lsl(Rj, Rlen, 1); 5539 sub(Rj, Rj, Ri); 5540 lsr(Rj, Rj, 1); 5541 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 5542 } block_comment(" } // j"); 5543 5544 post2(Ri, Rlen); 5545 add(Ri, Ri, 1); 5546 cmp(Ri, Rlen, Assembler::LSL, 1); 5547 5548 br(Assembler::LT, loop); 5549 bind(end); 5550 block_comment("} // i"); 5551 } 5552 5553 normalize(Rlen); 5554 5555 
mov(Ra, Pm_base); // Save Pm_base in Ra 5556 restore_regs(); // Restore caller's Pm_base 5557 5558 // Copy our result into caller's Pm_base 5559 reverse(Pm_base, Ra, Rlen, t0, t1); 5560 5561 leave(); 5562 ret(lr); 5563 5564 return entry; 5565 } 5566 // In C, approximately: 5567 5568 // void 5569 // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[], 5570 // unsigned long Pm_base[], unsigned long inv, int len) { 5571 // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 5572 // unsigned long *Pa, *Pb, *Pn, *Pm; 5573 // unsigned long Ra, Rb, Rn, Rm; 5574 5575 // int i; 5576 5577 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 5578 5579 // for (i = 0; i < len; i++) { 5580 // int j; 5581 5582 // Pa = Pa_base; 5583 // Pb = Pa_base + i; 5584 // Pm = Pm_base; 5585 // Pn = Pn_base + i; 5586 5587 // Ra = *Pa; 5588 // Rb = *Pb; 5589 // Rm = *Pm; 5590 // Rn = *Pn; 5591 5592 // int iters = (i+1)/2; 5593 // for (j = 0; iters--; j++) { 5594 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 5595 // MACC2(Ra, Rb, t0, t1, t2); 5596 // Ra = *++Pa; 5597 // Rb = *--Pb; 5598 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5599 // MACC(Rm, Rn, t0, t1, t2); 5600 // Rm = *++Pm; 5601 // Rn = *--Pn; 5602 // } 5603 // if ((i & 1) == 0) { 5604 // assert(Ra == Pa_base[j], "must be"); 5605 // MACC(Ra, Ra, t0, t1, t2); 5606 // } 5607 // iters = i/2; 5608 // assert(iters == i-j, "must be"); 5609 // for (; iters--; j++) { 5610 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5611 // MACC(Rm, Rn, t0, t1, t2); 5612 // Rm = *++Pm; 5613 // Rn = *--Pn; 5614 // } 5615 5616 // *Pm = Rm = t0 * inv; 5617 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 5618 // MACC(Rm, Rn, t0, t1, t2); 5619 5620 // assert(t0 == 0, "broken Montgomery multiply"); 5621 5622 // t0 = t1; t1 = t2; t2 = 0; 5623 // } 5624 5625 // for (i = len; i < 2*len; i++) { 5626 // int start = i-len+1; 5627 // int end = start + (len - start)/2; 5628 // int j; 5629 5630 // Pa = Pa_base + i-len; 5631 // Pb = Pa_base + len; 5632 // Pm = Pm_base + i-len; 5633 // Pn = Pn_base + len; 5634 5635 // Ra = *++Pa; 5636 // Rb = *--Pb; 5637 // Rm = *++Pm; 5638 // Rn = *--Pn; 5639 5640 // int iters = (2*len-i-1)/2; 5641 // assert(iters == end-start, "must be"); 5642 // for (j = start; iters--; j++) { 5643 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 5644 // MACC2(Ra, Rb, t0, t1, t2); 5645 // Ra = *++Pa; 5646 // Rb = *--Pb; 5647 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5648 // MACC(Rm, Rn, t0, t1, t2); 5649 // Rm = *++Pm; 5650 // Rn = *--Pn; 5651 // } 5652 // if ((i & 1) == 0) { 5653 // assert(Ra == Pa_base[j], "must be"); 5654 // MACC(Ra, Ra, t0, t1, t2); 5655 // } 5656 // iters = (2*len-i)/2; 5657 // assert(iters == len-j, "must be"); 5658 // for (; iters--; j++) { 5659 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5660 // MACC(Rm, Rn, t0, t1, t2); 5661 // Rm = *++Pm; 5662 // Rn = *--Pn; 5663 // } 5664 // Pm_base[i-len] = t0; 5665 // t0 = t1; t1 = t2; t2 = 0; 5666 // } 5667 5668 // while (t0) 5669 // t0 = sub(Pm_base, Pn_base, t0, len); 5670 // } 5671 }; 5672 5673 5674 // Call here from the interpreter or compiled code to either load 5675 // multiple returned values from the value type instance being 5676 // returned to registers or to store returned values to a newly 5677 // allocated value type instance. 
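  // In C, approximately (a sketch of the stub generated below; the helper
  // names are illustrative, not real VM entry points):
  //
  //   save_java_arg_registers();            // j_rarg0..7 and j_farg0..7
  //   set_last_Java_frame(sp, fp, pc);
  //   destination(current_thread, r0);      // r0 holds the value being returned
  //   reset_last_Java_frame();
  //   restore_java_arg_registers();
  //   if (current_thread->has_pending_exception()) {
  //     r0 = current_thread->pending_exception();
  //     goto StubRoutines::forward_exception_entry();   // far_jump
  //   }
  //   if (has_res)
  //     r0 = current_thread->vm_result();   // result oop left by the runtime call
  //   return;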
5678 address generate_return_value_stub(address destination, const char* name, bool has_res) { 5679 5680 // Information about frame layout at time of blocking runtime call. 5681 // Note that we only have to preserve callee-saved registers since 5682 // the compilers are responsible for supplying a continuation point 5683 // if they expect all registers to be preserved. 5684 // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0 5685 enum layout { 5686 rfp_off = 0, rfp_off2, 5687 5688 j_rarg7_off, j_rarg7_2, 5689 j_rarg6_off, j_rarg6_2, 5690 j_rarg5_off, j_rarg5_2, 5691 j_rarg4_off, j_rarg4_2, 5692 j_rarg3_off, j_rarg3_2, 5693 j_rarg2_off, j_rarg2_2, 5694 j_rarg1_off, j_rarg1_2, 5695 j_rarg0_off, j_rarg0_2, 5696 5697 j_farg0_off, j_farg0_2, 5698 j_farg1_off, j_farg1_2, 5699 j_farg2_off, j_farg2_2, 5700 j_farg3_off, j_farg3_2, 5701 j_farg4_off, j_farg4_2, 5702 j_farg5_off, j_farg5_2, 5703 j_farg6_off, j_farg6_2, 5704 j_farg7_off, j_farg7_2, 5705 5706 return_off, return_off2, 5707 framesize // inclusive of return address 5708 }; 5709 5710 int insts_size = 512; 5711 int locs_size = 64; 5712 5713 CodeBuffer code(name, insts_size, locs_size); 5714 OopMapSet* oop_maps = new OopMapSet(); 5715 MacroAssembler* masm = new MacroAssembler(&code); 5716 5717 address start = __ pc(); 5718 5719 const Address f7_save (rfp, j_farg7_off * wordSize); 5720 const Address f6_save (rfp, j_farg6_off * wordSize); 5721 const Address f5_save (rfp, j_farg5_off * wordSize); 5722 const Address f4_save (rfp, j_farg4_off * wordSize); 5723 const Address f3_save (rfp, j_farg3_off * wordSize); 5724 const Address f2_save (rfp, j_farg2_off * wordSize); 5725 const Address f1_save (rfp, j_farg1_off * wordSize); 5726 const Address f0_save (rfp, j_farg0_off * wordSize); 5727 5728 const Address r0_save (rfp, j_rarg0_off * wordSize); 5729 const Address r1_save (rfp, j_rarg1_off * wordSize); 5730 const Address r2_save (rfp, j_rarg2_off * wordSize); 5731 const Address r3_save (rfp, j_rarg3_off * wordSize); 5732 const Address r4_save (rfp, j_rarg4_off * wordSize); 5733 const Address r5_save (rfp, j_rarg5_off * wordSize); 5734 const Address r6_save (rfp, j_rarg6_off * wordSize); 5735 const Address r7_save (rfp, j_rarg7_off * wordSize); 5736 5737 // Generate oop map 5738 OopMap* map = new OopMap(framesize, 0); 5739 5740 map->set_callee_saved(VMRegImpl::stack2reg(rfp_off), rfp->as_VMReg()); 5741 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg7_off), j_rarg7->as_VMReg()); 5742 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg6_off), j_rarg6->as_VMReg()); 5743 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg5_off), j_rarg5->as_VMReg()); 5744 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg4_off), j_rarg4->as_VMReg()); 5745 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg3_off), j_rarg3->as_VMReg()); 5746 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg2_off), j_rarg2->as_VMReg()); 5747 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg1_off), j_rarg1->as_VMReg()); 5748 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg0_off), j_rarg0->as_VMReg()); 5749 5750 map->set_callee_saved(VMRegImpl::stack2reg(j_farg0_off), j_farg0->as_VMReg()); 5751 map->set_callee_saved(VMRegImpl::stack2reg(j_farg1_off), j_farg1->as_VMReg()); 5752 map->set_callee_saved(VMRegImpl::stack2reg(j_farg2_off), j_farg2->as_VMReg()); 5753 map->set_callee_saved(VMRegImpl::stack2reg(j_farg3_off), j_farg3->as_VMReg()); 5754 map->set_callee_saved(VMRegImpl::stack2reg(j_farg4_off), j_farg4->as_VMReg()); 5755 map->set_callee_saved(VMRegImpl::stack2reg(j_farg5_off), 
j_farg5->as_VMReg()); 5756 map->set_callee_saved(VMRegImpl::stack2reg(j_farg6_off), j_farg6->as_VMReg()); 5757 map->set_callee_saved(VMRegImpl::stack2reg(j_farg7_off), j_farg7->as_VMReg()); 5758 5759 // This is an inlined and slightly modified version of call_VM 5760 // which has the ability to fetch the return PC out of 5761 // thread-local storage and also sets up last_Java_sp slightly 5762 // differently than the real call_VM 5763 5764 __ enter(); // Save FP and LR before call 5765 5766 assert(is_even(framesize/2), "sp not 16-byte aligned"); 5767 5768 // lr and fp are already in place 5769 __ sub(sp, rfp, ((unsigned)framesize - 4) << LogBytesPerInt); // prolog 5770 5771 __ strd(j_farg7, f7_save); 5772 __ strd(j_farg6, f6_save); 5773 __ strd(j_farg5, f5_save); 5774 __ strd(j_farg4, f4_save); 5775 __ strd(j_farg3, f3_save); 5776 __ strd(j_farg2, f2_save); 5777 __ strd(j_farg1, f1_save); 5778 __ strd(j_farg0, f0_save); 5779 5780 __ str(j_rarg0, r0_save); 5781 __ str(j_rarg1, r1_save); 5782 __ str(j_rarg2, r2_save); 5783 __ str(j_rarg3, r3_save); 5784 __ str(j_rarg4, r4_save); 5785 __ str(j_rarg5, r5_save); 5786 __ str(j_rarg6, r6_save); 5787 __ str(j_rarg7, r7_save); 5788 5789 int frame_complete = __ pc() - start; 5790 5791 // Set up last_Java_sp and last_Java_fp 5792 address the_pc = __ pc(); 5793 __ set_last_Java_frame(sp, rfp, the_pc, rscratch1); 5794 5795 // Call runtime 5796 __ mov(c_rarg0, rthread); 5797 __ mov(c_rarg1, r0); 5798 5799 BLOCK_COMMENT("call runtime_entry"); 5800 __ mov(rscratch1, destination); 5801 __ blrt(rscratch1, 2 /* number_of_arguments */, 0, 1); 5802 5803 oop_maps->add_gc_map(the_pc - start, map); 5804 5805 __ reset_last_Java_frame(false); 5806 __ maybe_isb(); 5807 5808 __ ldrd(j_farg7, f7_save); 5809 __ ldrd(j_farg6, f6_save); 5810 __ ldrd(j_farg5, f5_save); 5811 __ ldrd(j_farg4, f4_save); 5812 __ ldrd(j_farg3, f3_save); 5813 __ ldrd(j_farg2, f2_save); 5814 __ ldrd(j_farg1, f1_save); 5815 __ ldrd(j_farg0, f0_save); 5816 5817 __ ldr(j_rarg0, r0_save); 5818 __ ldr(j_rarg1, r1_save); 5819 __ ldr(j_rarg2, r2_save); 5820 __ ldr(j_rarg3, r3_save); 5821 __ ldr(j_rarg4, r4_save); 5822 __ ldr(j_rarg5, r5_save); 5823 __ ldr(j_rarg6, r6_save); 5824 __ ldr(j_rarg7, r7_save); 5825 5826 __ leave(); 5827 5828 // check for pending exceptions 5829 Label pending; 5830 __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset()))); 5831 __ cmp(rscratch1, (u1)NULL_WORD); 5832 __ br(Assembler::NE, pending); 5833 5834 if (has_res) { 5835 __ get_vm_result(r0, rthread); 5836 } 5837 __ ret(lr); 5838 5839 __ bind(pending); 5840 __ ldr(r0, Address(rthread, in_bytes(Thread::pending_exception_offset()))); 5841 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 5842 5843 5844 // codeBlob framesize is in words (not VMRegImpl::slot_size) 5845 int frame_size_in_words = (framesize >> (LogBytesPerWord - LogBytesPerInt)); 5846 RuntimeStub* stub = 5847 RuntimeStub::new_runtime_stub(name, &code, frame_complete, frame_size_in_words, oop_maps, false); 5848 5849 return stub->entry_point(); 5850 } 5851 5852 // Initialization 5853 void generate_initial() { 5854 // Generate initial stubs and initialize the entry points 5855 5856 // entry points that exist in all platforms. Note: This is code 5857 // that could be shared among different platforms - however the 5858 // benefit seems to be smaller than the disadvantage of having a 5859 // much more complicated generator structure. See also comment in 5860 // stubRoutines.hpp.
5861 5862 StubRoutines::_forward_exception_entry = generate_forward_exception(); 5863 5864 StubRoutines::_call_stub_entry = 5865 generate_call_stub(StubRoutines::_call_stub_return_address); 5866 5867 // is referenced by megamorphic call 5868 StubRoutines::_catch_exception_entry = generate_catch_exception(); 5869 5870 // Build this early so it's available for the interpreter. 5871 StubRoutines::_throw_StackOverflowError_entry = 5872 generate_throw_exception("StackOverflowError throw_exception", 5873 CAST_FROM_FN_PTR(address, 5874 SharedRuntime::throw_StackOverflowError)); 5875 StubRoutines::_throw_delayed_StackOverflowError_entry = 5876 generate_throw_exception("delayed StackOverflowError throw_exception", 5877 CAST_FROM_FN_PTR(address, 5878 SharedRuntime::throw_delayed_StackOverflowError)); 5879 if (UseCRC32Intrinsics) { 5880 // set table address before stub generation which uses it 5881 StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table; 5882 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32(); 5883 } 5884 5885 if (UseCRC32CIntrinsics) { 5886 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(); 5887 } 5888 5889 // Disabled until JDK-8210858 is fixed 5890 // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) { 5891 // StubRoutines::_dlog = generate_dlog(); 5892 // } 5893 5894 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) { 5895 StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false); 5896 } 5897 5898 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) { 5899 StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true); 5900 } 5901 5902 5903 StubRoutines::_load_value_type_fields_in_regs = 5904 generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::load_value_type_fields_in_regs), "load_value_type_fields_in_regs", false); 5905 StubRoutines::_store_value_type_fields_to_buf = 5906 generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::store_value_type_fields_to_buf), "store_value_type_fields_to_buf", true); 5907 } 5908 5909 void generate_all() { 5910 // support for verify_oop (must happen after universe_init) 5911 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); 5912 StubRoutines::_throw_AbstractMethodError_entry = 5913 generate_throw_exception("AbstractMethodError throw_exception", 5914 CAST_FROM_FN_PTR(address, 5915 SharedRuntime:: 5916 throw_AbstractMethodError)); 5917 5918 StubRoutines::_throw_IncompatibleClassChangeError_entry = 5919 generate_throw_exception("IncompatibleClassChangeError throw_exception", 5920 CAST_FROM_FN_PTR(address, 5921 SharedRuntime:: 5922 throw_IncompatibleClassChangeError)); 5923 5924 StubRoutines::_throw_NullPointerException_at_call_entry = 5925 generate_throw_exception("NullPointerException at call throw_exception", 5926 CAST_FROM_FN_PTR(address, 5927 SharedRuntime:: 5928 throw_NullPointerException_at_call)); 5929 5930 // arraycopy stubs used by compilers 5931 generate_arraycopy_stubs(); 5932 5933 // has negatives stub for large arrays. 5934 StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long); 5935 5936 // array equals stub for large arrays. 5937 if (!UseSimpleArrayEquals) { 5938 StubRoutines::aarch64::_large_array_equals = generate_large_array_equals(); 5939 } 5940 5941 generate_compare_long_strings(); 5942 5943 generate_string_indexof_stubs(); 5944 5945 // byte_array_inflate stub for large arrays.
5946 StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate(); 5947 5948 #ifdef COMPILER2 5949 if (UseMultiplyToLenIntrinsic) { 5950 StubRoutines::_multiplyToLen = generate_multiplyToLen(); 5951 } 5952 5953 if (UseSquareToLenIntrinsic) { 5954 StubRoutines::_squareToLen = generate_squareToLen(); 5955 } 5956 5957 if (UseMulAddIntrinsic) { 5958 StubRoutines::_mulAdd = generate_mulAdd(); 5959 } 5960 5961 if (UseMontgomeryMultiplyIntrinsic) { 5962 StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply"); 5963 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false); 5964 StubRoutines::_montgomeryMultiply = g.generate_multiply(); 5965 } 5966 5967 if (UseMontgomerySquareIntrinsic) { 5968 StubCodeMark mark(this, "StubRoutines", "montgomerySquare"); 5969 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true); 5970 // We use generate_multiply() rather than generate_square() 5971 // because it's faster for the sizes of modulus we care about. 5972 StubRoutines::_montgomerySquare = g.generate_multiply(); 5973 } 5974 #endif // COMPILER2 5975 5976 #ifndef BUILTIN_SIM 5977 // generate GHASH intrinsics code 5978 if (UseGHASHIntrinsics) { 5979 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); 5980 } 5981 5982 if (UseAESIntrinsics) { 5983 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 5984 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 5985 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 5986 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 5987 } 5988 5989 if (UseSHA1Intrinsics) { 5990 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress"); 5991 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB"); 5992 } 5993 if (UseSHA256Intrinsics) { 5994 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress"); 5995 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB"); 5996 } 5997 5998 // generate Adler32 intrinsics code 5999 if (UseAdler32Intrinsics) { 6000 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); 6001 } 6002 6003 // Safefetch stubs. 6004 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry, 6005 &StubRoutines::_safefetch32_fault_pc, 6006 &StubRoutines::_safefetch32_continuation_pc); 6007 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry, 6008 &StubRoutines::_safefetchN_fault_pc, 6009 &StubRoutines::_safefetchN_continuation_pc); 6010 #endif 6011 StubRoutines::aarch64::set_completed(); 6012 } 6013 6014 public: 6015 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) { 6016 if (all) { 6017 generate_all(); 6018 } else { 6019 generate_initial(); 6020 } 6021 } 6022 }; // end class declaration 6023 6024 void StubGenerator_generate(CodeBuffer* code, bool all) { 6025 StubGenerator g(code, all); 6026 }