/*
 * Copyright (c) 2003, 2018, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2015, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "utilities/align.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif

#ifdef BUILTIN_SIM
#include "../../../../../../simulator/simulator.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->
#define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(int& counter) {
    __ lea(rscratch2, ExternalAddress((address)&counter));
    __ ldrw(rscratch1, Address(rscratch2));
    __ addw(rscratch1, rscratch1, 1);
    __ strw(rscratch1, Address(rscratch2));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    c_rarg6:   parameter size (in words)              int
  //    c_rarg7:   thread                                 Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the c arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -27 [ argument word 1      ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper   (r0)  ]
  //  -7 [ result         (r1)  ]
  //  -6 [ result type    (r2)  ]
  //  -5 [ method         (r3)  ]
  //  -4 [ entry point    (r4)  ]
  //  -3 [ parameters     (r5)  ]
  //  -2 [ parameter size (r6)  ]
  //  -1 [ thread         (r7)  ]
  //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  //   1 [ saved lr       (r30) ]

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off = -26,

    d15_off            = -26,
    d13_off            = -24,
    d11_off            = -22,
    d9_off             = -20,

    r28_off            = -18,
    r26_off            = -16,
    r24_off            = -14,
    r22_off            = -12,
    r20_off            = -10,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameter_size_off =  -2,
    thread_off         =  -1,
    rfp_off            =   0,
    retaddr_off        =   1,
  };
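
  // For reference, a minimal sketch of the C-side view of this stub: the
  // generated code is reached through a function pointer whose parameter
  // list matches c_rarg0..c_rarg7 above. The exact typedef lives in
  // stubRoutines.hpp; what follows is an illustrative approximation, not
  // the authoritative declaration.
  //
  //   typedef void (*CallStub)(address   link,              // call wrapper
  //                            intptr_t* result,
  //                            BasicType result_type,
  //                            Method*   method,
  //                            address   entry_point,
  //                            intptr_t* parameters,
  //                            int       size_of_parameters,
  //                            Thread*   thread);
  //
  //   ((CallStub)StubRoutines::call_stub())(wrapper, &result, T_INT, m,
  //                                         entry, params, nwords, thread);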

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    const Address sp_after_call(rfp, sp_after_call_off * wordSize);

    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off  * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);

    // stub code

    // we need a C prolog to bootstrap the x86 caller into the sim
    __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);

    address aarch64_entry = __ pc();

#ifdef BUILTIN_SIM
    // Save sender's SP for stack traces.
    __ mov(rscratch1, sp);
    __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
#endif
    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5,  entry_point);
    __ stp(c_rarg2, c_rarg3,  result_type);
    __ stp(c_rarg0, c_rarg1,  call_wrapper);

    __ stp(r20, r19,   r20_save);
    __ stp(r22, r21,   r22_save);
    __ stp(r24, r23,   r24_save);
    __ stp(r26, r25,   r26_save);
    __ stp(r28, r27,   r28_save);

    __ stpd(v9,  v8,   d9_save);
    __ stpd(v11, v10,  d11_save);
    __ stpd(v13, v12,  d13_save);
    __ stpd(v15, v14,  d15_save);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (u1)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing methodOop, and current sp
    //      rmethod: Method*
    //      r13: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r13, sp);
    __ blr(c_rarg4);

    // tell the simulator we have returned to the stub

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    if (NotifySimulator) {
      __ notify(Assembler::method_reentry);
    }
    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_VALUETYPE, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    __ ldr(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ldr(j_rarg1, result_type);
    __ cmp(j_rarg1, (u1)T_OBJECT);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, (u1)T_VALUETYPE);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, (u1)T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, (u1)T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(j_rarg1, (u1)T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    // restore callee-save registers
    __ ldpd(v15, v14,   d15_save);
    __ ldpd(v13, v12,   d13_save);
    __ ldpd(v11, v10,   d11_save);
    __ ldpd(v9,  v8,    d9_save);

    __ ldp(r28, r27,   r28_save);
    __ ldp(r26, r25,   r26_save);
    __ ldp(r24, r23,   r24_save);
    __ ldp(r22, r21,   r22_save);
    __ ldp(r20, r19,   r20_save);

    __ ldp(c_rarg0, c_rarg1,  call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3,  method);
    __ ldp(c_rarg4, c_rarg5,  entry_point);
    __ ldp(c_rarg6, c_rarg7,  parameter_size);

#ifndef PRODUCT
    // tell the simulator we are about to end Java execution
    if (NotifySimulator) {
      __ notify(Assembler::method_exit);
    }
#endif
    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT

    __ BIND(is_long);
    __ str(r0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    return start;
  }
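
  // Hedged sketch of the result hand-off above, in C terms: the callee
  // leaves its value in r0 (integral/reference) or j_farg0 (floating), and
  // the stub stores it through the `result` pointer with a width chosen by
  // `result_type`. Roughly (illustrative only, not generated code):
  //
  //   switch (result_type) {
  //     case T_OBJECT: case T_VALUETYPE:
  //     case T_LONG:   *(jlong*)result   = r0;      break; // 64-bit store
  //     case T_FLOAT:  *(jfloat*)result  = j_farg0; break; // 32-bit FP store
  //     case T_DOUBLE: *(jdouble*)result = j_farg0; break; // 64-bit FP store
  //     default:       *(jint*)result    = (jint)r0;       // everything else
  //   }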

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // rsp.
  //
  // r0: exception oop

  // NOTE: this is used as a target from the signal handler so it
  // needs an x86 prolog which returns into the current simulator
  // executing the generated catch_exception code. so the prolog
  // needs to install rax in a sim register and adjust the sim's
  // restart pc to enter the generated code at the start position
  // then return from native to simulated execution.

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread        (rfp, thread_off         * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then calls into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);
    __ mov(r19, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {

    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is NULL it is OK

    // Check if the oop is in the right area of memory
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
    __ andr(c_rarg2, r0, c_rarg3);
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());

    // Compare c_rarg2 and c_rarg3.  We don't use a compare
    // instruction here because the flags register is live.
    __ eor(c_rarg2, c_rarg2, c_rarg3);
    __ cbnz(c_rarg2, error);

    // make sure klass is 'reasonable', which is not zero.
    __ load_klass(r0, r0); // get klass
    __ cbz(r0, error);     // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blrt(rscratch1, 3, 0, 1);

    return start;
  }
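
  // The mask/bits test above is, in C terms, roughly the following -- a
  // sketch for clarity only; the real predicate values come from
  // Universe::verify_oop_mask() and Universe::verify_oop_bits():
  //
  //   bool plausible_oop(uintptr_t obj) {
  //     if (obj == 0) return true;                       // NULL is OK
  //     if ((obj & verify_oop_mask) != verify_oop_bits)
  //       return false;                                  // outside expected heap range
  //     return klass_of(obj) != NULL;                    // klass must be set
  //   }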

  void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }

  // The inner part of zero_words().  This is the bulk operation,
  // zeroing words in blocks, possibly using DC ZVA to do it.  The
  // caller is responsible for zeroing the last few words.
  //
  // Inputs:
  // r10: the HeapWord-aligned base address of an array to zero.
  // r11: the count in HeapWords, r11 > 0.
  //
  // Returns r10 and r11, adjusted for the caller to clear.
  // r10: the base address of the tail of words left to clear.
  // r11: the number of words in the tail.
  //      r11 < MacroAssembler::zero_words_block_size.

  address generate_zero_blocks() {
    Label done;
    Label base_aligned;

    Register base = r10, cnt = r11;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "zero_blocks");
    address start = __ pc();

    if (UseBlockZeroing) {
      int zva_length = VM_Version::zva_length();

      // Ensure ZVA length can be divided by 16. This is required by
      // the subsequent operations.
      assert (zva_length % 16 == 0, "Unexpected ZVA Length");

      __ tbz(base, 3, base_aligned);
      __ str(zr, Address(__ post(base, 8)));
      __ sub(cnt, cnt, 1);
      __ bind(base_aligned);

      // Ensure count >= zva_length * 2 so that it still deserves a zva after
      // alignment.
      Label small;
      int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
      __ subs(rscratch1, cnt, low_limit >> 3);
      __ br(Assembler::LT, small);
      __ zero_dcache_blocks(base, cnt);
      __ bind(small);
    }

    {
      // Number of stp instructions we'll unroll
      const int unroll =
        MacroAssembler::zero_words_block_size / 2;
      // Clear the remaining blocks.
      Label loop;
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::LT, done);
      __ bind(loop);
      for (int i = 0; i < unroll; i++)
        __ stp(zr, zr, __ post(base, 16));
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::GE, loop);
      __ bind(done);
      __ add(cnt, cnt, unroll * 2);
    }

    __ ret(lr);

    return start;
  }
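
  // Caller's-eye view of the zero_blocks contract, as a hedged C sketch
  // (names are illustrative; the real caller is MacroAssembler::zero_words):
  //
  //   // base/count enter in r10/r11; the stub zeroes some prefix in whole
  //   // blocks and hands back the unfinished tail in the same registers.
  //   void zero_words(HeapWord* base, size_t count) {
  //     tail = zero_blocks(base, count); // tail.count < zero_words_block_size
  //     for (size_t i = 0; i < tail.count; i++)
  //       tail.base[i] = 0;              // caller finishes the last few words
  //   }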


  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(Label &start, Register s, Register d, Register count,
                           copy_direction direction) {
    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4 : 2) * wordSize;

    int offset;
    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r10, t6 = r11, t7 = r12;
    const Register stride = r13;

    assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1);

    Label again, drain;
    const char *stub_name;
    if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
    else
      stub_name = "backward_copy_longs";

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", stub_name);

    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, (u1)8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
        __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
      } else {
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        __ stp(t0, t1, Address(d, 2 * unit));
        __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets {1, 2, 4, 6, 8} * unit for the stores.
      //
      // when backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.

      if (direction == copy_forwards) {
        __ sub(s, s, 16);
        __ sub(d, d, 8);
      }

      // Fill 8 registers
      //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 bit block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
      //
      // t0 at offset 0,  t1 at offset 8
      // t2 at offset 16, t3 at offset 24
      // t4 at offset 32, t5 at offset 40
      // t6 at offset 48, t7 at offset 56

      // for backwards copy s was not offset so the register contents
      // are at these offsets into the preceding 64 byte block
      // relative to that original input and so on for each successive
      // preceding 64 byte block when s is updated. this explains the
      // slightly counter-intuitive looking pattern of register usage
      // in the stp instructions for backwards copy.
      //
      // t0 at offset -16, t1 at offset -8
      // t2 at offset -32, t3 at offset -24
      // t4 at offset -48, t5 at offset -40
      // t6 at offset -64, t7 at offset -56

      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));

      __ subs(count, count, 16);
      __ br(Assembler::LO, drain);

      int prefetch = PrefetchCopyIntervalInBytes;
      bool use_stride = false;
      if (direction == copy_backwards) {
        use_stride = prefetch > 256;
        prefetch = -prefetch;
        if (use_stride) __ mov(stride, prefetch);
      }

      __ bind(again);

      if (PrefetchCopyIntervalInBytes > 0)
        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
        // allowing for the offset of -8 the store instructions place
        // registers into the target 64 bit block at the following
        // offsets
        //
        // t0 at offset 0
        // t1 at offset 8,  t2 at offset 16
        // t3 at offset 24, t4 at offset 32
        // t5 at offset 40, t6 at offset 48
        // t7 at offset 56

        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
        // d was not offset when we started so the registers are
        // written into the 64 bit block preceding d with the following
        // offsets
        //
        // t1 at offset -8
        // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
        // t7 at offset -56, t4 at offset -48
        // t6 at offset -64
        //
        // note that this matches the offsets previously noted for the
        // loads

        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
      } else {
        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
      }
      // now we need to copy any remaining part block which may
      // include a 4 word block subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ stp(t1, t2, Address(d, 2 * unit));
          __ str(t3, Address(__ pre(d, 4 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ stp(t3, t0, Address(d, 3 * unit));
          __ str(t2, Address(__ pre(d, 4 * unit)));
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ str(t1, Address(__ pre(d, 2 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ str(t0, Address(__ pre(d, 2 * unit)));
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }
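
  // The main loop above is software-pipelined: each iteration stores the
  // eight registers loaded by the previous iteration while loading the
  // next eight, so loads and stores overlap. A hedged C-like sketch of
  // the shape (forward direction, names illustrative only):
  //
  //   load_8_words(regs, s); s += 8;          // prime the pipeline
  //   count -= 16;                            // reserve 8 words for the drain
  //   while (count >= 0) {                    // the "again" loop
  //     store_8_words(regs, d); load_8_words(regs, s);
  //     s += 8; d += 8; count -= 8;
  //   }
  //   store_8_words(regs, d); d += 8;         // "drain": flush the last load
  //   // bits 2 and 1 of count then select the 4-word and 2-word tails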

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;
    int unit = wordSize * direction;

    Label Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.

    __ tbz(count, 3 - exact_log2(granularity), Lword);
    __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
    __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }
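
  // In C terms the bit-test sequence above peels off one chunk per set bit
  // of the residual count. A hedged sketch for the forward, byte-granularity
  // case (illustrative only; other granularities shift the bit tests):
  //
  //   if (count & 8) { *(uint64_t*)d = *(uint64_t*)s; s += 8; d += 8; }
  //   if (count & 4) { *(uint32_t*)d = *(uint32_t*)s; s += 4; d += 4; }
  //   if (count & 2) { *(uint16_t*)d = *(uint16_t*)s; s += 2; d += 2; }
  //   if (count & 1) { *(uint8_t*)d  = *(uint8_t*)s; }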

  Label copy_f, copy_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //

  void copy_memory(bool is_aligned, Register s, Register d,
                   Register count, Register tmp, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    // <= 96 bytes do inline. Direction doesn't matter because we always
    // load all the data before writing anything
    Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
    const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
    const Register send = r17, dend = r18;

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
    __ cmp(count, u1((UseSIMDForMemoryOps ? 96 : 80)/granularity));
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16/granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64/granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32/granularity));
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 0));
      __ ldpq(v2, v3, Address(send, -32));
      __ stpq(v0, v1, Address(d, 0));
      __ stpq(v2, v3, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(send, -32));
      __ ldp(t6, t7, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(dend, -32));
      __ stp(t6, t7, Address(dend, -16));
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    __ ldp(t0, t1, Address(s, 0));
    __ ldp(t2, t3, Address(send, -16));
    __ stp(t0, t1, Address(d, 0));
    __ stp(t2, t3, Address(dend, -16));
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
      __ ldpq(v4, v5, Address(send, -32));
      __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
      __ stpq(v4, v5, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(s, 32));
      __ ldp(t6, t7, Address(s, 48));
      __ ldp(t8, t9, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(d, 32));
      __ stp(t6, t7, Address(d, 48));
      __ stp(t8, t9, Address(dend, -16));
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, u1(8/granularity));
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    __ ldr(t0, Address(s, 0));
    __ ldr(t1, Address(send, -8));
    __ str(t0, Address(d, 0));
    __ str(t1, Address(dend, -8));
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean in the 1 byte case we load/store the same
          // byte 3 times.
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }

    __ bind(copy_big);
    if (is_backwards) {
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now we've got the small case out of the way we can align the
    // source address on a 2-word boundary.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
      __ tbz(s, exact_log2(wordSize), aligned);
      __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
      __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
      __ sub(count, count, wordSize/granularity);
    } else {
      if (is_backwards) {
        __ andr(rscratch2, s, 2 * wordSize - 1);
      } else {
        __ neg(rscratch2, s);
        __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
      }
      // rscratch2 is the byte adjustment needed to align s.
      __ cbz(rscratch2, aligned);
      int shift = exact_log2(granularity);
      if (shift)  __ lsr(rscratch2, rscratch2, shift);
      __ sub(count, count, rscratch2);

#if 0
      // ?? This code is only correct for a disjoint copy.  It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, rscratch2);
        __ sub(d, d, rscratch2);
      } else {
        __ add(s, s, rscratch2);
        __ add(d, d, rscratch2);
      }
#else
      copy_memory_small(s, d, rscratch2, rscratch1, step);
#endif
    }

    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes.  Adjust the
    // count and do a bulk copy of words.
    __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
    if (direction == copy_forwards)
      __ bl(copy_f);
    else
      __ bl(copy_b);

    // And the tail.
    copy_memory_small(s, d, count, tmp, step);

    if (granularity >= 8) __ bind(copy8);
    if (granularity >= 4) __ bind(copy4);
    __ bind(finish);
  }
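
  // Shape of the dispatch above, as a hedged C sketch (byte counts shown;
  // the generated code compares in units of `granularity`):
  //
  //   size_t bytes = count * granularity;
  //   if (bytes <= (UseSIMDForMemoryOps ? 96 : 80)) {
  //     // small cases: load everything first, then store, using
  //     // possibly-overlapping pairs anchored at both ends (s/send and
  //     // d/dend), so the copy direction never matters here
  //   } else {
  //     // big case: align s to 2 words via copy_memory_small, bulk-copy
  //     // 8-word blocks through copy_f/copy_b, then copy the tail
  //   }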


  void clobber_registers() {
#ifdef ASSERT
    __ mov(rscratch1, (uint64_t)0xdeadbeef);
    __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
    for (Register r = r3; r <= r18; r++)
      if (r != rscratch1) __ mov(r, rscratch1);
#endif
  }

  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers rscratch1 and rscratch2.
  void verify_oop_array (size_t size, Register a, Register count, Register temp) {
    Label loop, end;
    __ mov(rscratch1, a);
    __ mov(rscratch2, zr);
    __ bind(loop);
    __ cmp(rscratch2, count);
    __ br(Assembler::HS, end);
    if (size == (size_t)wordSize) {
      __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ verify_oop(temp);
    } else {
      // load the narrow oop into temp itself so that decode_heap_oop
      // decodes (and verifies) the value we just loaded
      __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ decode_heap_oop(temp); // calls verify_oop
    }
    __ add(rscratch2, rscratch2, size);
    __ b(loop);
    __ bind(end);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
                                 const char *name, bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_reg = RegSet::of(s, d, count);
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_reg);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    copy_memory(aligned, s, d, count, rscratch1, size);

    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
    }

    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());

    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
                                 address *entry, const char *name,
                                 bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_regs = RegSet::of(s, d, count);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // use fwd copy when (d-s) above_equal (count*size)
    __ sub(rscratch1, d, s);
    __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
    __ br(Assembler::HS, nooverlap_target);

    DecoratorSet decorators = IN_HEAP | IS_ARRAY;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_regs);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    copy_memory(aligned, s, d, count, rscratch1, -size);
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
      __ sub(count, count, 1); // make an inclusive end pointer
      __ lea(count, Address(d, count, Address::lsl(exact_log2(size))));
    }
    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }
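
  // The overlap test above relies on unsigned wrap-around: one
  // subtract-and-compare covers both "d below s" and "regions disjoint".
  // Hedged C equivalent (illustrative):
  //
  //   if ((uintptr_t)(d - s) >= count * size) {
  //     // no harmful overlap: d precedes s or the ranges are disjoint,
  //     // so the forward (disjoint) copy is safe
  //     goto nooverlap_target;
  //   }
  //   // otherwise copy backwards (step = -size)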

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
  //
  address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                      address* entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_short_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_short_copy().
  //
  address generate_disjoint_short_copy(bool aligned,
                                       address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
                                       address *entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);

  }
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_int_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
                                     address *entry, const char *name,
                                     bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
  }


  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_long_copy(bool aligned, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_long_copy(bool aligned,
                                      address nooverlap_target, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_oop_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_oop_copy(bool aligned,
                                     address nooverlap_target, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
                                  name, dest_uninitialized);
  }


  // Helper for generating a dynamic type check.
  // Smashes rscratch1.
  void generate_type_check(Register sub_klass,
                           Register super_check_offset,
                           Register super_klass,
                           Label& L_success) {
    assert_different_registers(sub_klass, super_check_offset, super_klass);

    BLOCK_COMMENT("type_check:");

    Label L_miss;

    __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL,
                                     super_check_offset);
    __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);

    // Fall through on failure!
    __ BIND(L_miss);
  }
1752 1753 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1754 copied_oop, r19_klass, count_save); 1755 1756 __ align(CodeEntryAlignment); 1757 StubCodeMark mark(this, "StubRoutines", name); 1758 address start = __ pc(); 1759 1760 __ enter(); // required for proper stackwalking of RuntimeStub frame 1761 1762 #ifdef ASSERT 1763 // caller guarantees that the arrays really are different 1764 // otherwise, we would have to make conjoint checks 1765 { Label L; 1766 array_overlap_test(L, TIMES_OOP); 1767 __ stop("checkcast_copy within a single array"); 1768 __ bind(L); 1769 } 1770 #endif //ASSERT 1771 1772 // Caller of this entry point must set up the argument registers. 1773 if (entry != NULL) { 1774 *entry = __ pc(); 1775 BLOCK_COMMENT("Entry:"); 1776 } 1777 1778 // Empty array: Nothing to do. 1779 __ cbz(count, L_done); 1780 1781 __ push(RegSet::of(r18, r19, r20, r21), sp); 1782 1783 #ifdef ASSERT 1784 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1785 // The ckoff and ckval must be mutually consistent, 1786 // even though caller generates both. 1787 { Label L; 1788 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1789 __ ldrw(start_to, Address(ckval, sco_offset)); 1790 __ cmpw(ckoff, start_to); 1791 __ br(Assembler::EQ, L); 1792 __ stop("super_check_offset inconsistent"); 1793 __ bind(L); 1794 } 1795 #endif //ASSERT 1796 1797 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST; 1798 bool is_oop = true; 1799 if (dest_uninitialized) { 1800 decorators |= IS_DEST_UNINITIALIZED; 1801 } 1802 1803 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1804 bs->arraycopy_prologue(_masm, decorators, is_oop, to, count, wb_pre_saved_regs); 1805 1806 // save the original count 1807 __ mov(count_save, count); 1808 1809 // Copy from low to high addresses 1810 __ mov(start_to, to); // Save destination array start address 1811 __ b(L_load_element); 1812 1813 // ======== begin loop ======== 1814 // (Loop is rotated; its entry is L_load_element.) 1815 // Loop control: 1816 // for (; count != 0; count--) { 1817 // copied_oop = load_heap_oop(from++); 1818 // ... generate_type_check ...; 1819 // store_heap_oop(to++, copied_oop); 1820 // } 1821 __ align(OptoLoopAlignment); 1822 1823 __ BIND(L_store_element); 1824 __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW); // store the oop 1825 __ sub(count, count, 1); 1826 __ cbz(count, L_do_card_marks); 1827 1828 // ======== loop entry is here ======== 1829 __ BIND(L_load_element); 1830 __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop 1831 __ cbz(copied_oop, L_store_element); 1832 1833 __ load_klass(r19_klass, copied_oop);// query the object klass 1834 generate_type_check(r19_klass, ckoff, ckval, L_store_element); 1835 // ======== end loop ======== 1836 1837 // It was a real error; we must depend on the caller to finish the job. 1838 // Register count = remaining oops, count_orig = total oops. 1839 // Emit GC store barriers for the oops we have copied and report 1840 // their number to the caller. 
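    // In C terms, the failure value computed below is (sketch only):
    //
    //   size_t K = count_save - count;  // oops successfully copied
    //   return ~K;                      // == -1 ^ K; the caller recovers K as ~r0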
1841
1842     __ subs(count, count_save, count);     // K = partially copied oop count
1843     __ eon(count, count, zr);              // report (-1^K) to caller
1844     __ br(Assembler::EQ, L_done_pop);
1845
1846     __ BIND(L_do_card_marks);
1847     __ add(to, to, -heapOopSize);          // make an inclusive end pointer
1848     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, to, rscratch1, wb_post_saved_regs);
1849
1850     __ bind(L_done_pop);
1851     __ pop(RegSet::of(r18, r19, r20, r21), sp);
1852     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1853
1854     __ bind(L_done);
1855     __ mov(r0, count);
1856     __ leave();
1857     __ ret(lr);
1858
1859     return start;
1860   }
1861
1862   // Perform range checks on the proposed arraycopy.
1863   // Kills temp, but nothing else.
1864   // Also, clean the sign bits of src_pos and dst_pos.
1865   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1866                               Register src_pos, // source position (c_rarg1)
1867                               Register dst,     // destination array oop (c_rarg2)
1868                               Register dst_pos, // destination position (c_rarg3)
1869                               Register length,
1870                               Register temp,
1871                               Label& L_failed) {
1872     BLOCK_COMMENT("arraycopy_range_checks:");
1873
1874     assert_different_registers(rscratch1, temp);
1875
1876     //  if (src_pos + length > arrayOop(src)->length())  FAIL;
1877     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1878     __ addw(temp, length, src_pos);
1879     __ cmpw(temp, rscratch1);
1880     __ br(Assembler::HI, L_failed);
1881
1882     //  if (dst_pos + length > arrayOop(dst)->length())  FAIL;
1883     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1884     __ addw(temp, length, dst_pos);
1885     __ cmpw(temp, rscratch1);
1886     __ br(Assembler::HI, L_failed);
1887
1888     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1889     __ movw(src_pos, src_pos);
1890     __ movw(dst_pos, dst_pos);
1891
1892     BLOCK_COMMENT("arraycopy_range_checks done");
1893   }
1894
1895   // These stubs get called from some dumb test routine.
1896   // I'll write them properly when they're called from
1897   // something that's actually doing something.
1898   static void fake_arraycopy_stub(address src, address dst, int count) {
1899     assert(count == 0, "huh?");
1900   }
1901
1902
1903   //
1904   // Generate 'unsafe' array copy stub
1905   // Though just as safe as the other stubs, it takes an unscaled
1906   // size_t argument instead of an element count.
1907   //
1908   //  Input:
1909   //    c_rarg0   - source array address
1910   //    c_rarg1   - destination array address
1911   //    c_rarg2   - byte count, treated as ssize_t, can be zero
1912   //
1913   // Examines the alignment of the operands and dispatches
1914   // to a long, int, short, or byte copy loop.
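  //
  // A sketch of that dispatch in C (illustrative only; the *_copy calls
  // stand in for the copy entry points passed to this generator):
  //
  //   void unsafe_copy(char* s, char* d, size_t count) {
  //     uintptr_t bits = (uintptr_t)s | (uintptr_t)d | count;
  //     if      ((bits & (BytesPerLong-1)) == 0) long_copy (s, d, count >> LogBytesPerLong);
  //     else if ((bits & (BytesPerInt-1))  == 0) int_copy  (s, d, count >> LogBytesPerInt);
  //     else if ((bits & 1)                == 0) short_copy(s, d, count >> LogBytesPerShort);
  //     else                                     byte_copy (s, d, count);
  //   }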
1915 // 1916 address generate_unsafe_copy(const char *name, 1917 address byte_copy_entry, 1918 address short_copy_entry, 1919 address int_copy_entry, 1920 address long_copy_entry) { 1921 Label L_long_aligned, L_int_aligned, L_short_aligned; 1922 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1923 1924 __ align(CodeEntryAlignment); 1925 StubCodeMark mark(this, "StubRoutines", name); 1926 address start = __ pc(); 1927 __ enter(); // required for proper stackwalking of RuntimeStub frame 1928 1929 // bump this on entry, not on exit: 1930 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 1931 1932 __ orr(rscratch1, s, d); 1933 __ orr(rscratch1, rscratch1, count); 1934 1935 __ andr(rscratch1, rscratch1, BytesPerLong-1); 1936 __ cbz(rscratch1, L_long_aligned); 1937 __ andr(rscratch1, rscratch1, BytesPerInt-1); 1938 __ cbz(rscratch1, L_int_aligned); 1939 __ tbz(rscratch1, 0, L_short_aligned); 1940 __ b(RuntimeAddress(byte_copy_entry)); 1941 1942 __ BIND(L_short_aligned); 1943 __ lsr(count, count, LogBytesPerShort); // size => short_count 1944 __ b(RuntimeAddress(short_copy_entry)); 1945 __ BIND(L_int_aligned); 1946 __ lsr(count, count, LogBytesPerInt); // size => int_count 1947 __ b(RuntimeAddress(int_copy_entry)); 1948 __ BIND(L_long_aligned); 1949 __ lsr(count, count, LogBytesPerLong); // size => long_count 1950 __ b(RuntimeAddress(long_copy_entry)); 1951 1952 return start; 1953 } 1954 1955 // 1956 // Generate generic array copy stubs 1957 // 1958 // Input: 1959 // c_rarg0 - src oop 1960 // c_rarg1 - src_pos (32-bits) 1961 // c_rarg2 - dst oop 1962 // c_rarg3 - dst_pos (32-bits) 1963 // c_rarg4 - element count (32-bits) 1964 // 1965 // Output: 1966 // r0 == 0 - success 1967 // r0 == -1^K - failure, where K is partial transfer count 1968 // 1969 address generate_generic_copy(const char *name, 1970 address byte_copy_entry, address short_copy_entry, 1971 address int_copy_entry, address oop_copy_entry, 1972 address long_copy_entry, address checkcast_copy_entry) { 1973 1974 Label L_failed, L_objArray; 1975 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 1976 1977 // Input registers 1978 const Register src = c_rarg0; // source array oop 1979 const Register src_pos = c_rarg1; // source position 1980 const Register dst = c_rarg2; // destination array oop 1981 const Register dst_pos = c_rarg3; // destination position 1982 const Register length = c_rarg4; 1983 1984 __ align(CodeEntryAlignment); 1985 1986 StubCodeMark mark(this, "StubRoutines", name); 1987 1988 address start = __ pc(); 1989 1990 __ enter(); // required for proper stackwalking of RuntimeStub frame 1991 1992 // bump this on entry, not on exit: 1993 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 1994 1995 //----------------------------------------------------------------------- 1996 // Assembler stub will be used for this call to arraycopy 1997 // if the following conditions are met: 1998 // 1999 // (1) src and dst must not be null. 2000 // (2) src_pos must not be negative. 2001 // (3) dst_pos must not be negative. 2002 // (4) length must not be negative. 2003 // (5) src klass and dst klass should be the same and not NULL. 2004 // (6) src and dst should be arrays. 2005 // (7) src_pos + length must not exceed length of src. 2006 // (8) dst_pos + length must not exceed length of dst. 2007 // 2008 2009 // if (src == NULL) return -1; 2010 __ cbz(src, L_failed); 2011 2012 // if (src_pos < 0) return -1; 2013 __ tbnz(src_pos, 31, L_failed); // i.e. 
sign bit set 2014 2015 // if (dst == NULL) return -1; 2016 __ cbz(dst, L_failed); 2017 2018 // if (dst_pos < 0) return -1; 2019 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2020 2021 // registers used as temp 2022 const Register scratch_length = r16; // elements count to copy 2023 const Register scratch_src_klass = r17; // array klass 2024 const Register lh = r18; // layout helper 2025 2026 // if (length < 0) return -1; 2027 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2028 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2029 2030 __ load_klass(scratch_src_klass, src); 2031 #ifdef ASSERT 2032 // assert(src->klass() != NULL); 2033 { 2034 BLOCK_COMMENT("assert klasses not null {"); 2035 Label L1, L2; 2036 __ cbnz(scratch_src_klass, L2); // it is broken if klass is NULL 2037 __ bind(L1); 2038 __ stop("broken null klass"); 2039 __ bind(L2); 2040 __ load_klass(rscratch1, dst); 2041 __ cbz(rscratch1, L1); // this would be broken also 2042 BLOCK_COMMENT("} assert klasses not null done"); 2043 } 2044 #endif 2045 2046 // Load layout helper (32-bits) 2047 // 2048 // |array_tag| | header_size | element_type | |log2_element_size| 2049 // 32 30 24 16 8 2 0 2050 // 2051 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2052 // 2053 2054 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2055 2056 // Handle objArrays completely differently... 2057 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2058 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2059 __ movw(rscratch1, objArray_lh); 2060 __ eorw(rscratch2, lh, rscratch1); 2061 __ cbzw(rscratch2, L_objArray); 2062 2063 // if (src->klass() != dst->klass()) return -1; 2064 __ load_klass(rscratch2, dst); 2065 __ eor(rscratch2, rscratch2, scratch_src_klass); 2066 __ cbnz(rscratch2, L_failed); 2067 2068 // if (!src->is_Array()) return -1; 2069 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2070 2071 // At this point, it is known to be a typeArray (array_tag 0x3). 2072 #ifdef ASSERT 2073 { 2074 BLOCK_COMMENT("assert primitive array {"); 2075 Label L; 2076 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2077 __ cmpw(lh, rscratch2); 2078 __ br(Assembler::GE, L); 2079 __ stop("must be a primitive array"); 2080 __ bind(L); 2081 BLOCK_COMMENT("} assert primitive array done"); 2082 } 2083 #endif 2084 2085 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2086 rscratch2, L_failed); 2087 2088 // TypeArrayKlass 2089 // 2090 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2091 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2092 // 2093 2094 const Register rscratch1_offset = rscratch1; // array offset 2095 const Register r18_elsize = lh; // element size 2096 2097 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2098 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2099 __ add(src, src, rscratch1_offset); // src array offset 2100 __ add(dst, dst, rscratch1_offset); // dst array offset 2101 BLOCK_COMMENT("choose copy loop based on element size"); 2102 2103 // next registers should be set before the jump to corresponding stub 2104 const Register from = c_rarg0; // source array address 2105 const Register to = c_rarg1; // destination array address 2106 const Register count = c_rarg2; // elements count 2107 2108 // 'from', 'to', 'count' registers should be set in such order 2109 // since they are the same as 'src', 'src_pos', 'dst'. 
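    // A sketch of the address computation being prepared here, using the
    // layout helper fields from the diagram above (illustrative C++ only;
    // the shift/mask names are the usual Klass::_lh_* constants):
    //
    //   int header = (lh >> _lh_header_size_shift) & _lh_header_size_mask;
    //   int log2es =  lh & _lh_log2_element_size_mask;   // 0..3
    //   from = (char*)src + header + (src_pos << log2es);
    //   to   = (char*)dst + header + (dst_pos << log2es);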
2110 2111 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2112 2113 // The possible values of elsize are 0-3, i.e. exact_log2(element 2114 // size in bytes). We do a simple bitwise binary search. 2115 __ BIND(L_copy_bytes); 2116 __ tbnz(r18_elsize, 1, L_copy_ints); 2117 __ tbnz(r18_elsize, 0, L_copy_shorts); 2118 __ lea(from, Address(src, src_pos));// src_addr 2119 __ lea(to, Address(dst, dst_pos));// dst_addr 2120 __ movw(count, scratch_length); // length 2121 __ b(RuntimeAddress(byte_copy_entry)); 2122 2123 __ BIND(L_copy_shorts); 2124 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2125 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2126 __ movw(count, scratch_length); // length 2127 __ b(RuntimeAddress(short_copy_entry)); 2128 2129 __ BIND(L_copy_ints); 2130 __ tbnz(r18_elsize, 0, L_copy_longs); 2131 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2132 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2133 __ movw(count, scratch_length); // length 2134 __ b(RuntimeAddress(int_copy_entry)); 2135 2136 __ BIND(L_copy_longs); 2137 #ifdef ASSERT 2138 { 2139 BLOCK_COMMENT("assert long copy {"); 2140 Label L; 2141 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize 2142 __ cmpw(r18_elsize, LogBytesPerLong); 2143 __ br(Assembler::EQ, L); 2144 __ stop("must be long copy, but elsize is wrong"); 2145 __ bind(L); 2146 BLOCK_COMMENT("} assert long copy done"); 2147 } 2148 #endif 2149 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2150 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2151 __ movw(count, scratch_length); // length 2152 __ b(RuntimeAddress(long_copy_entry)); 2153 2154 // ObjArrayKlass 2155 __ BIND(L_objArray); 2156 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2157 2158 Label L_plain_copy, L_checkcast_copy; 2159 // test array classes for subtyping 2160 __ load_klass(r18, dst); 2161 __ cmp(scratch_src_klass, r18); // usual case is exact equality 2162 __ br(Assembler::NE, L_checkcast_copy); 2163 2164 // Identically typed arrays can be copied without element-wise checks. 2165 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2166 rscratch2, L_failed); 2167 2168 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2169 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2170 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2171 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2172 __ movw(count, scratch_length); // length 2173 __ BIND(L_plain_copy); 2174 __ b(RuntimeAddress(oop_copy_entry)); 2175 2176 __ BIND(L_checkcast_copy); 2177 // live at this point: scratch_src_klass, scratch_length, r18 (dst_klass) 2178 { 2179 // Before looking at dst.length, make sure dst is also an objArray. 2180 __ ldrw(rscratch1, Address(r18, lh_offset)); 2181 __ movw(rscratch2, objArray_lh); 2182 __ eorw(rscratch1, rscratch1, rscratch2); 2183 __ cbnzw(rscratch1, L_failed); 2184 2185 // It is safe to examine both src.length and dst.length. 2186 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2187 r18, L_failed); 2188 2189 const Register rscratch2_dst_klass = rscratch2; 2190 __ load_klass(rscratch2_dst_klass, dst); // reload 2191 2192 // Marshal the base address arguments now, freeing registers. 
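      // In C terms (illustrative only; base_offset stands in for
      // arrayOopDesc::base_offset_in_bytes(T_OBJECT)):
      //   from  = (char*)src + (src_pos << LogBytesPerHeapOop) + base_offset;
      //   to    = (char*)dst + (dst_pos << LogBytesPerHeapOop) + base_offset;
      //   count = length;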
2193       __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop)));
2194       __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2195       __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop)));
2196       __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT));
2197       __ movw(count, length);           // length (reloaded)
2198       Register sco_temp = c_rarg3;      // this register is free now
2199       assert_different_registers(from, to, count, sco_temp,
2200                                  rscratch2_dst_klass, scratch_src_klass);
2201       // assert_clean_int(count, sco_temp);
2202
2203       // Generate the type check.
2204       const int sco_offset = in_bytes(Klass::super_check_offset_offset());
2205       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2206       // assert_clean_int(sco_temp, r18);
2207       generate_type_check(scratch_src_klass, sco_temp, rscratch2_dst_klass, L_plain_copy);
2208
2209       // Fetch destination element klass from the ObjArrayKlass header.
2210       int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset());
2211       __ ldr(rscratch2_dst_klass, Address(rscratch2_dst_klass, ek_offset));
2212       __ ldrw(sco_temp, Address(rscratch2_dst_klass, sco_offset));
2213
2214       // the checkcast_copy loop needs two extra arguments:
2215       assert(c_rarg3 == sco_temp, "#3 already in place");
2216       // Set up arguments for checkcast_copy_entry.
2217       __ mov(c_rarg4, rscratch2_dst_klass);  // dst.klass.element_klass
2218       __ b(RuntimeAddress(checkcast_copy_entry));
2219     }
2220
2221     __ BIND(L_failed);
2222     __ mov(r0, -1);
2223     __ leave();   // required for proper stackwalking of RuntimeStub frame
2224     __ ret(lr);
2225
2226     return start;
2227   }
2228
2229   //
2230   // Generate stub for array fill. If "aligned" is true, the
2231   // "to" address is assumed to be heapword aligned.
2232   //
2233   // Arguments for generated stub:
2234   //   to:    c_rarg0
2235   //   value: c_rarg1
2236   //   count: c_rarg2 treated as signed
2237   //
2238   address generate_fill(BasicType t, bool aligned, const char *name) {
2239     __ align(CodeEntryAlignment);
2240     StubCodeMark mark(this, "StubRoutines", name);
2241     address start = __ pc();
2242
2243     BLOCK_COMMENT("Entry:");
2244
2245     const Register to        = c_rarg0;  // destination array address
2246     const Register value     = c_rarg1;  // value
2247     const Register count     = c_rarg2;  // elements count
2248
2249     const Register bz_base = r10;        // base for block_zero routine
2250     const Register cnt_words = r11;      // temp register
2251
2252     __ enter();
2253
2254     Label L_fill_elements, L_exit1;
2255
2256     int shift = -1;
2257     switch (t) {
2258       case T_BYTE:
2259         shift = 0;
2260         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2261         __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
2262         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2263         __ br(Assembler::LO, L_fill_elements);
2264         break;
2265       case T_SHORT:
2266         shift = 1;
2267         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2268         __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
2269         __ br(Assembler::LO, L_fill_elements);
2270         break;
2271       case T_INT:
2272         shift = 2;
2273         __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
2274         __ br(Assembler::LO, L_fill_elements);
2275         break;
2276       default: ShouldNotReachHere();
2277     }
2278
2279     // Align destination address at an 8-byte boundary.
2280     Label L_skip_align1, L_skip_align2, L_skip_align4;
2281     if (!aligned) {
2282       switch (t) {
2283         case T_BYTE:
2284           // One byte misalignment happens only for byte arrays.
2285 __ tbz(to, 0, L_skip_align1); 2286 __ strb(value, Address(__ post(to, 1))); 2287 __ subw(count, count, 1); 2288 __ bind(L_skip_align1); 2289 // Fallthrough 2290 case T_SHORT: 2291 // Two bytes misalignment happens only for byte and short (char) arrays. 2292 __ tbz(to, 1, L_skip_align2); 2293 __ strh(value, Address(__ post(to, 2))); 2294 __ subw(count, count, 2 >> shift); 2295 __ bind(L_skip_align2); 2296 // Fallthrough 2297 case T_INT: 2298 // Align to 8 bytes, we know we are 4 byte aligned to start. 2299 __ tbz(to, 2, L_skip_align4); 2300 __ strw(value, Address(__ post(to, 4))); 2301 __ subw(count, count, 4 >> shift); 2302 __ bind(L_skip_align4); 2303 break; 2304 default: ShouldNotReachHere(); 2305 } 2306 } 2307 2308 // 2309 // Fill large chunks 2310 // 2311 __ lsrw(cnt_words, count, 3 - shift); // number of words 2312 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2313 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2314 if (UseBlockZeroing) { 2315 Label non_block_zeroing, rest; 2316 // If the fill value is zero we can use the fast zero_words(). 2317 __ cbnz(value, non_block_zeroing); 2318 __ mov(bz_base, to); 2319 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2320 __ zero_words(bz_base, cnt_words); 2321 __ b(rest); 2322 __ bind(non_block_zeroing); 2323 __ fill_words(to, cnt_words, value); 2324 __ bind(rest); 2325 } else { 2326 __ fill_words(to, cnt_words, value); 2327 } 2328 2329 // Remaining count is less than 8 bytes. Fill it by a single store. 2330 // Note that the total length is no less than 8 bytes. 2331 if (t == T_BYTE || t == T_SHORT) { 2332 Label L_exit1; 2333 __ cbzw(count, L_exit1); 2334 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2335 __ str(value, Address(to, -8)); // overwrite some elements 2336 __ bind(L_exit1); 2337 __ leave(); 2338 __ ret(lr); 2339 } 2340 2341 // Handle copies less than 8 bytes. 
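    // The tail is filled by testing the low bits of 'count'; for T_BYTE the
    // equivalent C is roughly as follows (sketch only; 'value' was already
    // replicated into all lanes by the bfi instructions above):
    //
    //   if (count & 1) { *(uint8_t*) p = value; p += 1; }
    //   if (count & 2) { *(uint16_t*)p = value; p += 2; }
    //   if (count & 4) { *(uint32_t*)p = value; }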
2342 Label L_fill_2, L_fill_4, L_exit2; 2343 __ bind(L_fill_elements); 2344 switch (t) { 2345 case T_BYTE: 2346 __ tbz(count, 0, L_fill_2); 2347 __ strb(value, Address(__ post(to, 1))); 2348 __ bind(L_fill_2); 2349 __ tbz(count, 1, L_fill_4); 2350 __ strh(value, Address(__ post(to, 2))); 2351 __ bind(L_fill_4); 2352 __ tbz(count, 2, L_exit2); 2353 __ strw(value, Address(to)); 2354 break; 2355 case T_SHORT: 2356 __ tbz(count, 0, L_fill_4); 2357 __ strh(value, Address(__ post(to, 2))); 2358 __ bind(L_fill_4); 2359 __ tbz(count, 1, L_exit2); 2360 __ strw(value, Address(to)); 2361 break; 2362 case T_INT: 2363 __ cbzw(count, L_exit2); 2364 __ strw(value, Address(to)); 2365 break; 2366 default: ShouldNotReachHere(); 2367 } 2368 __ bind(L_exit2); 2369 __ leave(); 2370 __ ret(lr); 2371 return start; 2372 } 2373 2374 void generate_arraycopy_stubs() { 2375 address entry; 2376 address entry_jbyte_arraycopy; 2377 address entry_jshort_arraycopy; 2378 address entry_jint_arraycopy; 2379 address entry_oop_arraycopy; 2380 address entry_jlong_arraycopy; 2381 address entry_checkcast_arraycopy; 2382 2383 generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards); 2384 generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards); 2385 2386 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2387 2388 //*** jbyte 2389 // Always need aligned and unaligned versions 2390 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2391 "jbyte_disjoint_arraycopy"); 2392 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2393 &entry_jbyte_arraycopy, 2394 "jbyte_arraycopy"); 2395 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2396 "arrayof_jbyte_disjoint_arraycopy"); 2397 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL, 2398 "arrayof_jbyte_arraycopy"); 2399 2400 //*** jshort 2401 // Always need aligned and unaligned versions 2402 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2403 "jshort_disjoint_arraycopy"); 2404 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2405 &entry_jshort_arraycopy, 2406 "jshort_arraycopy"); 2407 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, 2408 "arrayof_jshort_disjoint_arraycopy"); 2409 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL, 2410 "arrayof_jshort_arraycopy"); 2411 2412 //*** jint 2413 // Aligned versions 2414 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2415 "arrayof_jint_disjoint_arraycopy"); 2416 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2417 "arrayof_jint_arraycopy"); 2418 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 
2419 // entry_jint_arraycopy always points to the unaligned version 2420 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2421 "jint_disjoint_arraycopy"); 2422 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2423 &entry_jint_arraycopy, 2424 "jint_arraycopy"); 2425 2426 //*** jlong 2427 // It is always aligned 2428 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2429 "arrayof_jlong_disjoint_arraycopy"); 2430 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2431 "arrayof_jlong_arraycopy"); 2432 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2433 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2434 2435 //*** oops 2436 { 2437 // With compressed oops we need unaligned versions; notice that 2438 // we overwrite entry_oop_arraycopy. 2439 bool aligned = !UseCompressedOops; 2440 2441 StubRoutines::_arrayof_oop_disjoint_arraycopy 2442 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2443 /*dest_uninitialized*/false); 2444 StubRoutines::_arrayof_oop_arraycopy 2445 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2446 /*dest_uninitialized*/false); 2447 // Aligned versions without pre-barriers 2448 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2449 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2450 /*dest_uninitialized*/true); 2451 StubRoutines::_arrayof_oop_arraycopy_uninit 2452 = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit", 2453 /*dest_uninitialized*/true); 2454 } 2455 2456 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2457 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2458 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2459 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2460 2461 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2462 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, 2463 /*dest_uninitialized*/true); 2464 2465 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2466 entry_jbyte_arraycopy, 2467 entry_jshort_arraycopy, 2468 entry_jint_arraycopy, 2469 entry_jlong_arraycopy); 2470 2471 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2472 entry_jbyte_arraycopy, 2473 entry_jshort_arraycopy, 2474 entry_jint_arraycopy, 2475 entry_oop_arraycopy, 2476 entry_jlong_arraycopy, 2477 entry_checkcast_arraycopy); 2478 2479 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 2480 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2481 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2482 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2483 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 2484 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2485 } 2486 2487 void generate_math_stubs() { Unimplemented(); } 2488 2489 // Arguments: 2490 // 2491 // Inputs: 2492 // c_rarg0 - source byte array address 2493 // c_rarg1 - destination 
byte array address 2494 // c_rarg2 - K (key) in little endian int array 2495 // 2496 address generate_aescrypt_encryptBlock() { 2497 __ align(CodeEntryAlignment); 2498 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 2499 2500 Label L_doLast; 2501 2502 const Register from = c_rarg0; // source array address 2503 const Register to = c_rarg1; // destination array address 2504 const Register key = c_rarg2; // key array address 2505 const Register keylen = rscratch1; 2506 2507 address start = __ pc(); 2508 __ enter(); 2509 2510 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2511 2512 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2513 2514 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2515 __ rev32(v1, __ T16B, v1); 2516 __ rev32(v2, __ T16B, v2); 2517 __ rev32(v3, __ T16B, v3); 2518 __ rev32(v4, __ T16B, v4); 2519 __ aese(v0, v1); 2520 __ aesmc(v0, v0); 2521 __ aese(v0, v2); 2522 __ aesmc(v0, v0); 2523 __ aese(v0, v3); 2524 __ aesmc(v0, v0); 2525 __ aese(v0, v4); 2526 __ aesmc(v0, v0); 2527 2528 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2529 __ rev32(v1, __ T16B, v1); 2530 __ rev32(v2, __ T16B, v2); 2531 __ rev32(v3, __ T16B, v3); 2532 __ rev32(v4, __ T16B, v4); 2533 __ aese(v0, v1); 2534 __ aesmc(v0, v0); 2535 __ aese(v0, v2); 2536 __ aesmc(v0, v0); 2537 __ aese(v0, v3); 2538 __ aesmc(v0, v0); 2539 __ aese(v0, v4); 2540 __ aesmc(v0, v0); 2541 2542 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2543 __ rev32(v1, __ T16B, v1); 2544 __ rev32(v2, __ T16B, v2); 2545 2546 __ cmpw(keylen, 44); 2547 __ br(Assembler::EQ, L_doLast); 2548 2549 __ aese(v0, v1); 2550 __ aesmc(v0, v0); 2551 __ aese(v0, v2); 2552 __ aesmc(v0, v0); 2553 2554 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2555 __ rev32(v1, __ T16B, v1); 2556 __ rev32(v2, __ T16B, v2); 2557 2558 __ cmpw(keylen, 52); 2559 __ br(Assembler::EQ, L_doLast); 2560 2561 __ aese(v0, v1); 2562 __ aesmc(v0, v0); 2563 __ aese(v0, v2); 2564 __ aesmc(v0, v0); 2565 2566 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2567 __ rev32(v1, __ T16B, v1); 2568 __ rev32(v2, __ T16B, v2); 2569 2570 __ BIND(L_doLast); 2571 2572 __ aese(v0, v1); 2573 __ aesmc(v0, v0); 2574 __ aese(v0, v2); 2575 2576 __ ld1(v1, __ T16B, key); 2577 __ rev32(v1, __ T16B, v1); 2578 __ eor(v0, __ T16B, v0, v1); 2579 2580 __ st1(v0, __ T16B, to); 2581 2582 __ mov(r0, 0); 2583 2584 __ leave(); 2585 __ ret(lr); 2586 2587 return start; 2588 } 2589 2590 // Arguments: 2591 // 2592 // Inputs: 2593 // c_rarg0 - source byte array address 2594 // c_rarg1 - destination byte array address 2595 // c_rarg2 - K (key) in little endian int array 2596 // 2597 address generate_aescrypt_decryptBlock() { 2598 assert(UseAES, "need AES instructions and misaligned SSE support"); 2599 __ align(CodeEntryAlignment); 2600 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 2601 Label L_doLast; 2602 2603 const Register from = c_rarg0; // source array address 2604 const Register to = c_rarg1; // destination array address 2605 const Register key = c_rarg2; // key array address 2606 const Register keylen = rscratch1; 2607 2608 address start = __ pc(); 2609 __ enter(); // required for proper stackwalking of RuntimeStub frame 2610 2611 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2612 2613 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2614 2615 __ ld1(v5, __ T16B, __ post(key, 16)); 2616 __ rev32(v5, __ T16B, v5); 2617 2618 __ ld1(v1, v2, v3, v4, 
__ T16B, __ post(key, 64)); 2619 __ rev32(v1, __ T16B, v1); 2620 __ rev32(v2, __ T16B, v2); 2621 __ rev32(v3, __ T16B, v3); 2622 __ rev32(v4, __ T16B, v4); 2623 __ aesd(v0, v1); 2624 __ aesimc(v0, v0); 2625 __ aesd(v0, v2); 2626 __ aesimc(v0, v0); 2627 __ aesd(v0, v3); 2628 __ aesimc(v0, v0); 2629 __ aesd(v0, v4); 2630 __ aesimc(v0, v0); 2631 2632 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2633 __ rev32(v1, __ T16B, v1); 2634 __ rev32(v2, __ T16B, v2); 2635 __ rev32(v3, __ T16B, v3); 2636 __ rev32(v4, __ T16B, v4); 2637 __ aesd(v0, v1); 2638 __ aesimc(v0, v0); 2639 __ aesd(v0, v2); 2640 __ aesimc(v0, v0); 2641 __ aesd(v0, v3); 2642 __ aesimc(v0, v0); 2643 __ aesd(v0, v4); 2644 __ aesimc(v0, v0); 2645 2646 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2647 __ rev32(v1, __ T16B, v1); 2648 __ rev32(v2, __ T16B, v2); 2649 2650 __ cmpw(keylen, 44); 2651 __ br(Assembler::EQ, L_doLast); 2652 2653 __ aesd(v0, v1); 2654 __ aesimc(v0, v0); 2655 __ aesd(v0, v2); 2656 __ aesimc(v0, v0); 2657 2658 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2659 __ rev32(v1, __ T16B, v1); 2660 __ rev32(v2, __ T16B, v2); 2661 2662 __ cmpw(keylen, 52); 2663 __ br(Assembler::EQ, L_doLast); 2664 2665 __ aesd(v0, v1); 2666 __ aesimc(v0, v0); 2667 __ aesd(v0, v2); 2668 __ aesimc(v0, v0); 2669 2670 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2671 __ rev32(v1, __ T16B, v1); 2672 __ rev32(v2, __ T16B, v2); 2673 2674 __ BIND(L_doLast); 2675 2676 __ aesd(v0, v1); 2677 __ aesimc(v0, v0); 2678 __ aesd(v0, v2); 2679 2680 __ eor(v0, __ T16B, v0, v5); 2681 2682 __ st1(v0, __ T16B, to); 2683 2684 __ mov(r0, 0); 2685 2686 __ leave(); 2687 __ ret(lr); 2688 2689 return start; 2690 } 2691 2692 // Arguments: 2693 // 2694 // Inputs: 2695 // c_rarg0 - source byte array address 2696 // c_rarg1 - destination byte array address 2697 // c_rarg2 - K (key) in little endian int array 2698 // c_rarg3 - r vector byte array address 2699 // c_rarg4 - input length 2700 // 2701 // Output: 2702 // x0 - input length 2703 // 2704 address generate_cipherBlockChaining_encryptAESCrypt() { 2705 assert(UseAES, "need AES instructions and misaligned SSE support"); 2706 __ align(CodeEntryAlignment); 2707 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 2708 2709 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2710 2711 const Register from = c_rarg0; // source array address 2712 const Register to = c_rarg1; // destination array address 2713 const Register key = c_rarg2; // key array address 2714 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2715 // and left with the results of the last encryption block 2716 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2717 const Register keylen = rscratch1; 2718 2719 address start = __ pc(); 2720 2721 __ enter(); 2722 2723 __ movw(rscratch2, len_reg); 2724 2725 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2726 2727 __ ld1(v0, __ T16B, rvec); 2728 2729 __ cmpw(keylen, 52); 2730 __ br(Assembler::CC, L_loadkeys_44); 2731 __ br(Assembler::EQ, L_loadkeys_52); 2732 2733 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2734 __ rev32(v17, __ T16B, v17); 2735 __ rev32(v18, __ T16B, v18); 2736 __ BIND(L_loadkeys_52); 2737 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2738 __ rev32(v19, __ T16B, v19); 2739 __ rev32(v20, __ T16B, v20); 2740 __ BIND(L_loadkeys_44); 2741 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2742 __ rev32(v21, __ 
T16B, v21); 2743 __ rev32(v22, __ T16B, v22); 2744 __ rev32(v23, __ T16B, v23); 2745 __ rev32(v24, __ T16B, v24); 2746 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2747 __ rev32(v25, __ T16B, v25); 2748 __ rev32(v26, __ T16B, v26); 2749 __ rev32(v27, __ T16B, v27); 2750 __ rev32(v28, __ T16B, v28); 2751 __ ld1(v29, v30, v31, __ T16B, key); 2752 __ rev32(v29, __ T16B, v29); 2753 __ rev32(v30, __ T16B, v30); 2754 __ rev32(v31, __ T16B, v31); 2755 2756 __ BIND(L_aes_loop); 2757 __ ld1(v1, __ T16B, __ post(from, 16)); 2758 __ eor(v0, __ T16B, v0, v1); 2759 2760 __ br(Assembler::CC, L_rounds_44); 2761 __ br(Assembler::EQ, L_rounds_52); 2762 2763 __ aese(v0, v17); __ aesmc(v0, v0); 2764 __ aese(v0, v18); __ aesmc(v0, v0); 2765 __ BIND(L_rounds_52); 2766 __ aese(v0, v19); __ aesmc(v0, v0); 2767 __ aese(v0, v20); __ aesmc(v0, v0); 2768 __ BIND(L_rounds_44); 2769 __ aese(v0, v21); __ aesmc(v0, v0); 2770 __ aese(v0, v22); __ aesmc(v0, v0); 2771 __ aese(v0, v23); __ aesmc(v0, v0); 2772 __ aese(v0, v24); __ aesmc(v0, v0); 2773 __ aese(v0, v25); __ aesmc(v0, v0); 2774 __ aese(v0, v26); __ aesmc(v0, v0); 2775 __ aese(v0, v27); __ aesmc(v0, v0); 2776 __ aese(v0, v28); __ aesmc(v0, v0); 2777 __ aese(v0, v29); __ aesmc(v0, v0); 2778 __ aese(v0, v30); 2779 __ eor(v0, __ T16B, v0, v31); 2780 2781 __ st1(v0, __ T16B, __ post(to, 16)); 2782 2783 __ subw(len_reg, len_reg, 16); 2784 __ cbnzw(len_reg, L_aes_loop); 2785 2786 __ st1(v0, __ T16B, rvec); 2787 2788 __ mov(r0, rscratch2); 2789 2790 __ leave(); 2791 __ ret(lr); 2792 2793 return start; 2794 } 2795 2796 // Arguments: 2797 // 2798 // Inputs: 2799 // c_rarg0 - source byte array address 2800 // c_rarg1 - destination byte array address 2801 // c_rarg2 - K (key) in little endian int array 2802 // c_rarg3 - r vector byte array address 2803 // c_rarg4 - input length 2804 // 2805 // Output: 2806 // r0 - input length 2807 // 2808 address generate_cipherBlockChaining_decryptAESCrypt() { 2809 assert(UseAES, "need AES instructions and misaligned SSE support"); 2810 __ align(CodeEntryAlignment); 2811 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 2812 2813 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2814 2815 const Register from = c_rarg0; // source array address 2816 const Register to = c_rarg1; // destination array address 2817 const Register key = c_rarg2; // key array address 2818 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2819 // and left with the results of the last encryption block 2820 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2821 const Register keylen = rscratch1; 2822 2823 address start = __ pc(); 2824 2825 __ enter(); 2826 2827 __ movw(rscratch2, len_reg); 2828 2829 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2830 2831 __ ld1(v2, __ T16B, rvec); 2832 2833 __ ld1(v31, __ T16B, __ post(key, 16)); 2834 __ rev32(v31, __ T16B, v31); 2835 2836 __ cmpw(keylen, 52); 2837 __ br(Assembler::CC, L_loadkeys_44); 2838 __ br(Assembler::EQ, L_loadkeys_52); 2839 2840 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2841 __ rev32(v17, __ T16B, v17); 2842 __ rev32(v18, __ T16B, v18); 2843 __ BIND(L_loadkeys_52); 2844 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2845 __ rev32(v19, __ T16B, v19); 2846 __ rev32(v20, __ T16B, v20); 2847 __ BIND(L_loadkeys_44); 2848 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2849 __ rev32(v21, __ T16B, v21); 2850 
__ rev32(v22, __ T16B, v22); 2851 __ rev32(v23, __ T16B, v23); 2852 __ rev32(v24, __ T16B, v24); 2853 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2854 __ rev32(v25, __ T16B, v25); 2855 __ rev32(v26, __ T16B, v26); 2856 __ rev32(v27, __ T16B, v27); 2857 __ rev32(v28, __ T16B, v28); 2858 __ ld1(v29, v30, __ T16B, key); 2859 __ rev32(v29, __ T16B, v29); 2860 __ rev32(v30, __ T16B, v30); 2861 2862 __ BIND(L_aes_loop); 2863 __ ld1(v0, __ T16B, __ post(from, 16)); 2864 __ orr(v1, __ T16B, v0, v0); 2865 2866 __ br(Assembler::CC, L_rounds_44); 2867 __ br(Assembler::EQ, L_rounds_52); 2868 2869 __ aesd(v0, v17); __ aesimc(v0, v0); 2870 __ aesd(v0, v18); __ aesimc(v0, v0); 2871 __ BIND(L_rounds_52); 2872 __ aesd(v0, v19); __ aesimc(v0, v0); 2873 __ aesd(v0, v20); __ aesimc(v0, v0); 2874 __ BIND(L_rounds_44); 2875 __ aesd(v0, v21); __ aesimc(v0, v0); 2876 __ aesd(v0, v22); __ aesimc(v0, v0); 2877 __ aesd(v0, v23); __ aesimc(v0, v0); 2878 __ aesd(v0, v24); __ aesimc(v0, v0); 2879 __ aesd(v0, v25); __ aesimc(v0, v0); 2880 __ aesd(v0, v26); __ aesimc(v0, v0); 2881 __ aesd(v0, v27); __ aesimc(v0, v0); 2882 __ aesd(v0, v28); __ aesimc(v0, v0); 2883 __ aesd(v0, v29); __ aesimc(v0, v0); 2884 __ aesd(v0, v30); 2885 __ eor(v0, __ T16B, v0, v31); 2886 __ eor(v0, __ T16B, v0, v2); 2887 2888 __ st1(v0, __ T16B, __ post(to, 16)); 2889 __ orr(v2, __ T16B, v1, v1); 2890 2891 __ subw(len_reg, len_reg, 16); 2892 __ cbnzw(len_reg, L_aes_loop); 2893 2894 __ st1(v2, __ T16B, rvec); 2895 2896 __ mov(r0, rscratch2); 2897 2898 __ leave(); 2899 __ ret(lr); 2900 2901 return start; 2902 } 2903 2904 // Arguments: 2905 // 2906 // Inputs: 2907 // c_rarg0 - byte[] source+offset 2908 // c_rarg1 - int[] SHA.state 2909 // c_rarg2 - int offset 2910 // c_rarg3 - int limit 2911 // 2912 address generate_sha1_implCompress(bool multi_block, const char *name) { 2913 __ align(CodeEntryAlignment); 2914 StubCodeMark mark(this, "StubRoutines", name); 2915 address start = __ pc(); 2916 2917 Register buf = c_rarg0; 2918 Register state = c_rarg1; 2919 Register ofs = c_rarg2; 2920 Register limit = c_rarg3; 2921 2922 Label keys; 2923 Label sha1_loop; 2924 2925 // load the keys into v0..v3 2926 __ adr(rscratch1, keys); 2927 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 2928 // load 5 words state into v6, v7 2929 __ ldrq(v6, Address(state, 0)); 2930 __ ldrs(v7, Address(state, 16)); 2931 2932 2933 __ BIND(sha1_loop); 2934 // load 64 bytes of data into v16..v19 2935 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 2936 __ rev32(v16, __ T16B, v16); 2937 __ rev32(v17, __ T16B, v17); 2938 __ rev32(v18, __ T16B, v18); 2939 __ rev32(v19, __ T16B, v19); 2940 2941 // do the sha1 2942 __ addv(v4, __ T4S, v16, v0); 2943 __ orr(v20, __ T16B, v6, v6); 2944 2945 FloatRegister d0 = v16; 2946 FloatRegister d1 = v17; 2947 FloatRegister d2 = v18; 2948 FloatRegister d3 = v19; 2949 2950 for (int round = 0; round < 20; round++) { 2951 FloatRegister tmp1 = (round & 1) ? v4 : v5; 2952 FloatRegister tmp2 = (round & 1) ? v21 : v22; 2953 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 2954 FloatRegister tmp4 = (round & 1) ? v5 : v4; 2955 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? 
v2 : v3)); 2956 2957 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 2958 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 2959 __ sha1h(tmp2, __ T4S, v20); 2960 if (round < 5) 2961 __ sha1c(v20, __ T4S, tmp3, tmp4); 2962 else if (round < 10 || round >= 15) 2963 __ sha1p(v20, __ T4S, tmp3, tmp4); 2964 else 2965 __ sha1m(v20, __ T4S, tmp3, tmp4); 2966 if (round < 16) __ sha1su1(d0, __ T4S, d3); 2967 2968 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 2969 } 2970 2971 __ addv(v7, __ T2S, v7, v21); 2972 __ addv(v6, __ T4S, v6, v20); 2973 2974 if (multi_block) { 2975 __ add(ofs, ofs, 64); 2976 __ cmp(ofs, limit); 2977 __ br(Assembler::LE, sha1_loop); 2978 __ mov(c_rarg0, ofs); // return ofs 2979 } 2980 2981 __ strq(v6, Address(state, 0)); 2982 __ strs(v7, Address(state, 16)); 2983 2984 __ ret(lr); 2985 2986 __ bind(keys); 2987 __ emit_int32(0x5a827999); 2988 __ emit_int32(0x6ed9eba1); 2989 __ emit_int32(0x8f1bbcdc); 2990 __ emit_int32(0xca62c1d6); 2991 2992 return start; 2993 } 2994 2995 2996 // Arguments: 2997 // 2998 // Inputs: 2999 // c_rarg0 - byte[] source+offset 3000 // c_rarg1 - int[] SHA.state 3001 // c_rarg2 - int offset 3002 // c_rarg3 - int limit 3003 // 3004 address generate_sha256_implCompress(bool multi_block, const char *name) { 3005 static const uint32_t round_consts[64] = { 3006 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3007 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3008 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3009 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3010 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3011 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3012 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3013 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3014 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3015 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3016 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3017 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3018 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3019 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3020 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3021 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3022 }; 3023 __ align(CodeEntryAlignment); 3024 StubCodeMark mark(this, "StubRoutines", name); 3025 address start = __ pc(); 3026 3027 Register buf = c_rarg0; 3028 Register state = c_rarg1; 3029 Register ofs = c_rarg2; 3030 Register limit = c_rarg3; 3031 3032 Label sha1_loop; 3033 3034 __ stpd(v8, v9, __ pre(sp, -32)); 3035 __ stpd(v10, v11, Address(sp, 16)); 3036 3037 // dga == v0 3038 // dgb == v1 3039 // dg0 == v2 3040 // dg1 == v3 3041 // dg2 == v4 3042 // t0 == v6 3043 // t1 == v7 3044 3045 // load 16 keys to v16..v31 3046 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3047 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3048 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3049 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3050 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3051 3052 // load 8 words (256 bits) state 3053 __ ldpq(v0, v1, state); 3054 3055 __ BIND(sha1_loop); 3056 // load 64 bytes of data into v8..v11 3057 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 3058 __ rev32(v8, __ T16B, v8); 3059 __ rev32(v9, __ T16B, v9); 3060 __ rev32(v10, __ T16B, v10); 3061 __ rev32(v11, __ T16B, v11); 3062 3063 __ addv(v6, __ T4S, v8, v16); 3064 __ orr(v2, __ T16B, v0, v0); 3065 __ orr(v3, __ T16B, v1, v1); 3066 3067 FloatRegister d0 = v8; 3068 FloatRegister d1 = v9; 3069 FloatRegister d2 = v10; 3070 FloatRegister d3 = v11; 3071 3072 3073 for (int round = 0; round < 16; round++) { 3074 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3075 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3076 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3077 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3078 3079 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3080 __ orr(v4, __ T16B, v2, v2); 3081 if (round < 15) 3082 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3083 __ sha256h(v2, __ T4S, v3, tmp2); 3084 __ sha256h2(v3, __ T4S, v4, tmp2); 3085 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3086 3087 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3088 } 3089 3090 __ addv(v0, __ T4S, v0, v2); 3091 __ addv(v1, __ T4S, v1, v3); 3092 3093 if (multi_block) { 3094 __ add(ofs, ofs, 64); 3095 __ cmp(ofs, limit); 3096 __ br(Assembler::LE, sha1_loop); 3097 __ mov(c_rarg0, ofs); // return ofs 3098 } 3099 3100 __ ldpd(v10, v11, Address(sp, 16)); 3101 __ ldpd(v8, v9, __ post(sp, 32)); 3102 3103 __ stpq(v0, v1, state); 3104 3105 __ ret(lr); 3106 3107 return start; 3108 } 3109 3110 #ifndef BUILTIN_SIM 3111 // Safefetch stubs. 3112 void generate_safefetch(const char* name, int size, address* entry, 3113 address* fault_pc, address* continuation_pc) { 3114 // safefetch signatures: 3115 // int SafeFetch32(int* adr, int errValue); 3116 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue); 3117 // 3118 // arguments: 3119 // c_rarg0 = adr 3120 // c_rarg1 = errValue 3121 // 3122 // result: 3123 // PPC_RET = *adr or errValue 3124 3125 StubCodeMark mark(this, "StubRoutines", name); 3126 3127 // Entry point, pc or function descriptor. 3128 *entry = __ pc(); 3129 3130 // Load *adr into c_rarg1, may fault. 
3131     *fault_pc = __ pc();
3132     switch (size) {
3133       case 4:
3134         // int32_t
3135         __ ldrw(c_rarg1, Address(c_rarg0, 0));
3136         break;
3137       case 8:
3138         // int64_t
3139         __ ldr(c_rarg1, Address(c_rarg0, 0));
3140         break;
3141       default:
3142         ShouldNotReachHere();
3143     }
3144
3145     // return errValue or *adr
3146     *continuation_pc = __ pc();
3147     __ mov(r0, c_rarg1);
3148     __ ret(lr);
3149   }
3150 #endif
3151
3152   /**
3153    *  Arguments:
3154    *
3155    * Inputs:
3156    *   c_rarg0   - int crc
3157    *   c_rarg1   - byte* buf
3158    *   c_rarg2   - int length
3159    *
3160    * Output:
3161    *       r0   - int crc result
3162    */
3163   address generate_updateBytesCRC32() {
3164     assert(UseCRC32Intrinsics, "what are we doing here?");
3165
3166     __ align(CodeEntryAlignment);
3167     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3168
3169     address start = __ pc();
3170
3171     const Register crc   = c_rarg0;  // crc
3172     const Register buf   = c_rarg1;  // source java byte array address
3173     const Register len   = c_rarg2;  // length
3174     const Register table0 = c_rarg3; // crc_table address
3175     const Register table1 = c_rarg4;
3176     const Register table2 = c_rarg5;
3177     const Register table3 = c_rarg6;
3178     const Register tmp3 = c_rarg7;
3179
3180     BLOCK_COMMENT("Entry:");
3181     __ enter(); // required for proper stackwalking of RuntimeStub frame
3182
3183     __ kernel_crc32(crc, buf, len,
3184               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3185
3186     __ leave(); // required for proper stackwalking of RuntimeStub frame
3187     __ ret(lr);
3188
3189     return start;
3190   }
3191
3192   /**
3193    *  Arguments:
3194    *
3195    * Inputs:
3196    *   c_rarg0   - int crc
3197    *   c_rarg1   - byte* buf
3198    *   c_rarg2   - int length
3199    *   c_rarg3   - int* table
3200    *
3201    * Output:
3202    *       r0   - int crc result
3203    */
3204   address generate_updateBytesCRC32C() {
3205     assert(UseCRC32CIntrinsics, "what are we doing here?");
3206
3207     __ align(CodeEntryAlignment);
3208     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3209
3210     address start = __ pc();
3211
3212     const Register crc   = c_rarg0;  // crc
3213     const Register buf   = c_rarg1;  // source java byte array address
3214     const Register len   = c_rarg2;  // length
3215     const Register table0 = c_rarg3; // crc_table address
3216     const Register table1 = c_rarg4;
3217     const Register table2 = c_rarg5;
3218     const Register table3 = c_rarg6;
3219     const Register tmp3 = c_rarg7;
3220
3221     BLOCK_COMMENT("Entry:");
3222     __ enter(); // required for proper stackwalking of RuntimeStub frame
3223
3224     __ kernel_crc32c(crc, buf, len,
3225               table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3226
3227     __ leave(); // required for proper stackwalking of RuntimeStub frame
3228     __ ret(lr);
3229
3230     return start;
3231   }
3232
3233   /**
3234    *  Arguments:
3235    *
3236    *  Inputs:
3237    *   c_rarg0   - int   adler
3238    *   c_rarg1   - byte* buff
3239    *   c_rarg2   - int   len
3240    *
3241    * Output:
3242    *   c_rarg0   - int adler result
3243    */
3244   address generate_updateBytesAdler32() {
3245     __ align(CodeEntryAlignment);
3246     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3247     address start = __ pc();
3248
3249     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3250
3251     // Aliases
3252     Register adler  = c_rarg0;
3253     Register s1     = c_rarg0;
3254     Register s2     = c_rarg3;
3255     Register buff   = c_rarg1;
3256     Register len    = c_rarg2;
3257     Register nmax  = r4;
3258     Register base  = r5;
3259     Register count = r6;
3260     Register temp0 = rscratch1;
3261     Register temp1 = rscratch2;
3262     Register temp2 = r7;
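    // For reference, a plain C sketch of the Adler-32 update this stub
    // unrolls and strength-reduces (illustrative only):
    //
    //   uint32_t s1 = adler & 0xffff, s2 = adler >> 16;
    //   for (int i = 0; i < len; i++) {
    //     s1 = (s1 + buff[i]) % 65521;   // 65521 == BASE below
    //     s2 = (s2 + s1)      % 65521;
    //   }
    //   return (s2 << 16) | s1;
    //
    // The code below defers the expensive % BASE reductions: up to NMAX
    // bytes are accumulated first, which is safe because (per the bound
    // quoted below) the partial sums cannot overflow an unsigned 32-bit
    // accumulator within one NMAX-sized block.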
3263 3264 // Max number of bytes we can process before having to take the mod 3265 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 3266 unsigned long BASE = 0xfff1; 3267 unsigned long NMAX = 0x15B0; 3268 3269 __ mov(base, BASE); 3270 __ mov(nmax, NMAX); 3271 3272 // s1 is initialized to the lower 16 bits of adler 3273 // s2 is initialized to the upper 16 bits of adler 3274 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 3275 __ uxth(s1, adler); // s1 = (adler & 0xffff) 3276 3277 // The pipelined loop needs at least 16 elements for 1 iteration 3278 // It does check this, but it is more effective to skip to the cleanup loop 3279 __ cmp(len, (u1)16); 3280 __ br(Assembler::HS, L_nmax); 3281 __ cbz(len, L_combine); 3282 3283 __ bind(L_simple_by1_loop); 3284 __ ldrb(temp0, Address(__ post(buff, 1))); 3285 __ add(s1, s1, temp0); 3286 __ add(s2, s2, s1); 3287 __ subs(len, len, 1); 3288 __ br(Assembler::HI, L_simple_by1_loop); 3289 3290 // s1 = s1 % BASE 3291 __ subs(temp0, s1, base); 3292 __ csel(s1, temp0, s1, Assembler::HS); 3293 3294 // s2 = s2 % BASE 3295 __ lsr(temp0, s2, 16); 3296 __ lsl(temp1, temp0, 4); 3297 __ sub(temp1, temp1, temp0); 3298 __ add(s2, temp1, s2, ext::uxth); 3299 3300 __ subs(temp0, s2, base); 3301 __ csel(s2, temp0, s2, Assembler::HS); 3302 3303 __ b(L_combine); 3304 3305 __ bind(L_nmax); 3306 __ subs(len, len, nmax); 3307 __ sub(count, nmax, 16); 3308 __ br(Assembler::LO, L_by16); 3309 3310 __ bind(L_nmax_loop); 3311 3312 __ ldp(temp0, temp1, Address(__ post(buff, 16))); 3313 3314 __ add(s1, s1, temp0, ext::uxtb); 3315 __ ubfx(temp2, temp0, 8, 8); 3316 __ add(s2, s2, s1); 3317 __ add(s1, s1, temp2); 3318 __ ubfx(temp2, temp0, 16, 8); 3319 __ add(s2, s2, s1); 3320 __ add(s1, s1, temp2); 3321 __ ubfx(temp2, temp0, 24, 8); 3322 __ add(s2, s2, s1); 3323 __ add(s1, s1, temp2); 3324 __ ubfx(temp2, temp0, 32, 8); 3325 __ add(s2, s2, s1); 3326 __ add(s1, s1, temp2); 3327 __ ubfx(temp2, temp0, 40, 8); 3328 __ add(s2, s2, s1); 3329 __ add(s1, s1, temp2); 3330 __ ubfx(temp2, temp0, 48, 8); 3331 __ add(s2, s2, s1); 3332 __ add(s1, s1, temp2); 3333 __ add(s2, s2, s1); 3334 __ add(s1, s1, temp0, Assembler::LSR, 56); 3335 __ add(s2, s2, s1); 3336 3337 __ add(s1, s1, temp1, ext::uxtb); 3338 __ ubfx(temp2, temp1, 8, 8); 3339 __ add(s2, s2, s1); 3340 __ add(s1, s1, temp2); 3341 __ ubfx(temp2, temp1, 16, 8); 3342 __ add(s2, s2, s1); 3343 __ add(s1, s1, temp2); 3344 __ ubfx(temp2, temp1, 24, 8); 3345 __ add(s2, s2, s1); 3346 __ add(s1, s1, temp2); 3347 __ ubfx(temp2, temp1, 32, 8); 3348 __ add(s2, s2, s1); 3349 __ add(s1, s1, temp2); 3350 __ ubfx(temp2, temp1, 40, 8); 3351 __ add(s2, s2, s1); 3352 __ add(s1, s1, temp2); 3353 __ ubfx(temp2, temp1, 48, 8); 3354 __ add(s2, s2, s1); 3355 __ add(s1, s1, temp2); 3356 __ add(s2, s2, s1); 3357 __ add(s1, s1, temp1, Assembler::LSR, 56); 3358 __ add(s2, s2, s1); 3359 3360 __ subs(count, count, 16); 3361 __ br(Assembler::HS, L_nmax_loop); 3362 3363 // s1 = s1 % BASE 3364 __ lsr(temp0, s1, 16); 3365 __ lsl(temp1, temp0, 4); 3366 __ sub(temp1, temp1, temp0); 3367 __ add(temp1, temp1, s1, ext::uxth); 3368 3369 __ lsr(temp0, temp1, 16); 3370 __ lsl(s1, temp0, 4); 3371 __ sub(s1, s1, temp0); 3372 __ add(s1, s1, temp1, ext:: uxth); 3373 3374 __ subs(temp0, s1, base); 3375 __ csel(s1, temp0, s1, Assembler::HS); 3376 3377 // s2 = s2 % BASE 3378 __ lsr(temp0, s2, 16); 3379 __ lsl(temp1, temp0, 4); 3380 __ sub(temp1, temp1, temp0); 3381 __ add(temp1, temp1, s2, ext::uxth); 3382 3383 __ lsr(temp0, temp1, 
16); 3384 __ lsl(s2, temp0, 4); 3385 __ sub(s2, s2, temp0); 3386 __ add(s2, s2, temp1, ext:: uxth); 3387 3388 __ subs(temp0, s2, base); 3389 __ csel(s2, temp0, s2, Assembler::HS); 3390 3391 __ subs(len, len, nmax); 3392 __ sub(count, nmax, 16); 3393 __ br(Assembler::HS, L_nmax_loop); 3394 3395 __ bind(L_by16); 3396 __ adds(len, len, count); 3397 __ br(Assembler::LO, L_by1); 3398 3399 __ bind(L_by16_loop); 3400 3401 __ ldp(temp0, temp1, Address(__ post(buff, 16))); 3402 3403 __ add(s1, s1, temp0, ext::uxtb); 3404 __ ubfx(temp2, temp0, 8, 8); 3405 __ add(s2, s2, s1); 3406 __ add(s1, s1, temp2); 3407 __ ubfx(temp2, temp0, 16, 8); 3408 __ add(s2, s2, s1); 3409 __ add(s1, s1, temp2); 3410 __ ubfx(temp2, temp0, 24, 8); 3411 __ add(s2, s2, s1); 3412 __ add(s1, s1, temp2); 3413 __ ubfx(temp2, temp0, 32, 8); 3414 __ add(s2, s2, s1); 3415 __ add(s1, s1, temp2); 3416 __ ubfx(temp2, temp0, 40, 8); 3417 __ add(s2, s2, s1); 3418 __ add(s1, s1, temp2); 3419 __ ubfx(temp2, temp0, 48, 8); 3420 __ add(s2, s2, s1); 3421 __ add(s1, s1, temp2); 3422 __ add(s2, s2, s1); 3423 __ add(s1, s1, temp0, Assembler::LSR, 56); 3424 __ add(s2, s2, s1); 3425 3426 __ add(s1, s1, temp1, ext::uxtb); 3427 __ ubfx(temp2, temp1, 8, 8); 3428 __ add(s2, s2, s1); 3429 __ add(s1, s1, temp2); 3430 __ ubfx(temp2, temp1, 16, 8); 3431 __ add(s2, s2, s1); 3432 __ add(s1, s1, temp2); 3433 __ ubfx(temp2, temp1, 24, 8); 3434 __ add(s2, s2, s1); 3435 __ add(s1, s1, temp2); 3436 __ ubfx(temp2, temp1, 32, 8); 3437 __ add(s2, s2, s1); 3438 __ add(s1, s1, temp2); 3439 __ ubfx(temp2, temp1, 40, 8); 3440 __ add(s2, s2, s1); 3441 __ add(s1, s1, temp2); 3442 __ ubfx(temp2, temp1, 48, 8); 3443 __ add(s2, s2, s1); 3444 __ add(s1, s1, temp2); 3445 __ add(s2, s2, s1); 3446 __ add(s1, s1, temp1, Assembler::LSR, 56); 3447 __ add(s2, s2, s1); 3448 3449 __ subs(len, len, 16); 3450 __ br(Assembler::HS, L_by16_loop); 3451 3452 __ bind(L_by1); 3453 __ adds(len, len, 15); 3454 __ br(Assembler::LO, L_do_mod); 3455 3456 __ bind(L_by1_loop); 3457 __ ldrb(temp0, Address(__ post(buff, 1))); 3458 __ add(s1, temp0, s1); 3459 __ add(s2, s2, s1); 3460 __ subs(len, len, 1); 3461 __ br(Assembler::HS, L_by1_loop); 3462 3463 __ bind(L_do_mod); 3464 // s1 = s1 % BASE 3465 __ lsr(temp0, s1, 16); 3466 __ lsl(temp1, temp0, 4); 3467 __ sub(temp1, temp1, temp0); 3468 __ add(temp1, temp1, s1, ext::uxth); 3469 3470 __ lsr(temp0, temp1, 16); 3471 __ lsl(s1, temp0, 4); 3472 __ sub(s1, s1, temp0); 3473 __ add(s1, s1, temp1, ext:: uxth); 3474 3475 __ subs(temp0, s1, base); 3476 __ csel(s1, temp0, s1, Assembler::HS); 3477 3478 // s2 = s2 % BASE 3479 __ lsr(temp0, s2, 16); 3480 __ lsl(temp1, temp0, 4); 3481 __ sub(temp1, temp1, temp0); 3482 __ add(temp1, temp1, s2, ext::uxth); 3483 3484 __ lsr(temp0, temp1, 16); 3485 __ lsl(s2, temp0, 4); 3486 __ sub(s2, s2, temp0); 3487 __ add(s2, s2, temp1, ext:: uxth); 3488 3489 __ subs(temp0, s2, base); 3490 __ csel(s2, temp0, s2, Assembler::HS); 3491 3492 // Combine lower bits and higher bits 3493 __ bind(L_combine); 3494 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) 3495 3496 __ ret(lr); 3497 3498 return start; 3499 } 3500 3501 /** 3502 * Arguments: 3503 * 3504 * Input: 3505 * c_rarg0 - x address 3506 * c_rarg1 - x length 3507 * c_rarg2 - y address 3508 * c_rarg3 - y lenth 3509 * c_rarg4 - z address 3510 * c_rarg5 - z length 3511 */ 3512 address generate_multiplyToLen() { 3513 __ align(CodeEntryAlignment); 3514 StubCodeMark mark(this, "StubRoutines", "multiplyToLen"); 3515 3516 address start = __ pc(); 3517 const Register x = 
r0; 3518 const Register xlen = r1; 3519 const Register y = r2; 3520 const Register ylen = r3; 3521 const Register z = r4; 3522 const Register zlen = r5; 3523 3524 const Register tmp1 = r10; 3525 const Register tmp2 = r11; 3526 const Register tmp3 = r12; 3527 const Register tmp4 = r13; 3528 const Register tmp5 = r14; 3529 const Register tmp6 = r15; 3530 const Register tmp7 = r16; 3531 3532 BLOCK_COMMENT("Entry:"); 3533 __ enter(); // required for proper stackwalking of RuntimeStub frame 3534 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 3535 __ leave(); // required for proper stackwalking of RuntimeStub frame 3536 __ ret(lr); 3537 3538 return start; 3539 } 3540 3541 address generate_squareToLen() { 3542 // squareToLen algorithm for sizes 1..127 described in java code works 3543 // faster than multiply_to_len on some CPUs and slower on others, but 3544 // multiply_to_len shows a bit better overall results 3545 __ align(CodeEntryAlignment); 3546 StubCodeMark mark(this, "StubRoutines", "squareToLen"); 3547 address start = __ pc(); 3548 3549 const Register x = r0; 3550 const Register xlen = r1; 3551 const Register z = r2; 3552 const Register zlen = r3; 3553 const Register y = r4; // == x 3554 const Register ylen = r5; // == xlen 3555 3556 const Register tmp1 = r10; 3557 const Register tmp2 = r11; 3558 const Register tmp3 = r12; 3559 const Register tmp4 = r13; 3560 const Register tmp5 = r14; 3561 const Register tmp6 = r15; 3562 const Register tmp7 = r16; 3563 3564 RegSet spilled_regs = RegSet::of(y, ylen); 3565 BLOCK_COMMENT("Entry:"); 3566 __ enter(); 3567 __ push(spilled_regs, sp); 3568 __ mov(y, x); 3569 __ mov(ylen, xlen); 3570 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 3571 __ pop(spilled_regs, sp); 3572 __ leave(); 3573 __ ret(lr); 3574 return start; 3575 } 3576 3577 address generate_mulAdd() { 3578 __ align(CodeEntryAlignment); 3579 StubCodeMark mark(this, "StubRoutines", "mulAdd"); 3580 3581 address start = __ pc(); 3582 3583 const Register out = r0; 3584 const Register in = r1; 3585 const Register offset = r2; 3586 const Register len = r3; 3587 const Register k = r4; 3588 3589 BLOCK_COMMENT("Entry:"); 3590 __ enter(); 3591 __ mul_add(out, in, offset, len, k); 3592 __ leave(); 3593 __ ret(lr); 3594 3595 return start; 3596 } 3597 3598 void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi, 3599 FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0, 3600 FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) { 3601 // Karatsuba multiplication performs a 128*128 -> 256-bit 3602 // multiplication in three 128-bit multiplications and a few 3603 // additions. 
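// (For reference: with A = A1:A0 and B = B1:B0 split into 64-bit halves,
// the middle 128 bits of the product are A1*B0 + A0*B1, which Karatsuba
// rewrites as (A1+A0)*(B1+B0) + A1*B1 + A0*B0. The multiplication here
// is carry-less, over GF(2)[x], so every '+' above is an XOR.)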
3604 // 3605 // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1) 3606 // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0 3607 // 3608 // Inputs: 3609 // 3610 // A0 in a.d[0] (subkey) 3611 // A1 in a.d[1] 3612 // (A1+A0) in a1_xor_a0.d[0] 3613 // 3614 // B0 in b.d[0] (state) 3615 // B1 in b.d[1] 3616 3617 __ ext(tmp1, __ T16B, b, b, 0x08); 3618 __ pmull2(result_hi, __ T1Q, b, a, __ T2D); // A1*B1 3619 __ eor(tmp1, __ T16B, tmp1, b); // (B1+B0) 3620 __ pmull(result_lo, __ T1Q, b, a, __ T1D); // A0*B0 3621 __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0) 3622 3623 __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08); 3624 __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0 3625 __ eor(tmp2, __ T16B, tmp2, tmp4); 3626 __ eor(tmp2, __ T16B, tmp2, tmp3); 3627 3628 // Register pair <result_hi:result_lo> holds the result of carry-less multiplication 3629 __ ins(result_hi, __ D, tmp2, 0, 1); 3630 __ ins(result_lo, __ D, tmp2, 1, 0); 3631 } 3632 3633 void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi, 3634 FloatRegister p, FloatRegister z, FloatRegister t1) { 3635 const FloatRegister t0 = result; 3636 3637 // The GCM field polynomial f is z^128 + p(z), where p = 3638 // z^7+z^2+z+1. 3639 // 3640 // z^128 === -p(z) (mod (z^128 + p(z))) 3641 // 3642 // so, given that the product we're reducing is 3643 // a == lo + hi * z^128 3644 // substituting, 3645 // === lo - hi * p(z) (mod (z^128 + p(z))) 3646 // 3647 // we reduce by multiplying hi by p(z) and subtracting the result 3648 // from (i.e. XORing it with) lo. Because p has no nonzero high 3649 // bits we can do this with two 64-bit multiplications, lo*p and 3650 // hi*p. 3651 3652 __ pmull2(t0, __ T1Q, hi, p, __ T2D); 3653 __ ext(t1, __ T16B, t0, z, 8); 3654 __ eor(hi, __ T16B, hi, t1); 3655 __ ext(t1, __ T16B, z, t0, 8); 3656 __ eor(lo, __ T16B, lo, t1); 3657 __ pmull(t0, __ T1Q, hi, p, __ T1D); 3658 __ eor(result, __ T16B, lo, t0); 3659 } 3660 3661 address generate_has_negatives(address &has_negatives_long) { 3662 const u1 large_loop_size = 64; 3663 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 3664 int dcache_line = VM_Version::dcache_line_size(); 3665 3666 Register ary1 = r1, len = r2, result = r0; 3667 3668 __ align(CodeEntryAlignment); 3669 3670 StubCodeMark mark(this, "StubRoutines", "has_negatives"); 3671 3672 address entry = __ pc(); 3673 3674 __ enter(); 3675 3676 Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE, 3677 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 3678 3679 __ cmp(len, (u1)15); 3680 __ br(Assembler::GT, LEN_OVER_15); 3681 // The only case when execution falls into this code is when pointer is near 3682 // the end of memory page and we have to avoid reading next page 3683 __ add(ary1, ary1, len); 3684 __ subs(len, len, 8); 3685 __ br(Assembler::GT, LEN_OVER_8); 3686 __ ldr(rscratch2, Address(ary1, -8)); 3687 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 
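// The variable right-shift below discards the low-order bytes of
// rscratch2 that lie before the requested range: rscratch2 holds the 8
// bytes ending at the last byte of the buffer, and rscratch1 is
// 8 * (8 - len) here (len is the original length, at most 8 on this
// path). A rough C sketch of this tail check (names are illustrative):
//
//   uint64_t w = *(const uint64_t *)(buf + len - 8); // last 8 bytes
//   w >>= 8 * (8 - len);                             // drop bytes before buf
//   return (w & 0x8080808080808080) != 0;            // any byte negative?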
3688 __ lsrv(rscratch2, rscratch2, rscratch1);
3689 __ tst(rscratch2, UPPER_BIT_MASK);
3690 __ cset(result, Assembler::NE);
3691 __ leave();
3692 __ ret(lr);
3693 __ bind(LEN_OVER_8);
3694 __ ldp(rscratch1, rscratch2, Address(ary1, -16));
3695 __ sub(len, len, 8); // no data dependency, so the sub can execute while the load is in flight
3696 __ tst(rscratch2, UPPER_BIT_MASK);
3697 __ br(Assembler::NE, RET_TRUE_NO_POP);
3698 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
3699 __ lsrv(rscratch1, rscratch1, rscratch2);
3700 __ tst(rscratch1, UPPER_BIT_MASK);
3701 __ cset(result, Assembler::NE);
3702 __ leave();
3703 __ ret(lr);
3704
3705 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
3706 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
3707
3708 has_negatives_long = __ pc(); // 2nd entry point
3709
3710 __ enter();
3711
3712 __ bind(LEN_OVER_15);
3713 __ push(spilled_regs, sp);
3714 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
3715 __ cbz(rscratch2, ALIGNED);
3716 __ ldp(tmp6, tmp1, Address(ary1));
3717 __ mov(tmp5, 16);
3718 __ sub(rscratch1, tmp5, rscratch2); // number of bytes until the aligned address
3719 __ add(ary1, ary1, rscratch1);
3720 __ sub(len, len, rscratch1);
3721 __ orr(tmp6, tmp6, tmp1);
3722 __ tst(tmp6, UPPER_BIT_MASK);
3723 __ br(Assembler::NE, RET_TRUE);
3724
3725 __ bind(ALIGNED);
3726 __ cmp(len, large_loop_size);
3727 __ br(Assembler::LT, CHECK_16);
3728 // Perform a 16-byte load here as an early return for the case where an
3729 // initially aligned large array has negative values in its first bytes;
3730 // without it, LARGE_LOOP would do 4 reads instead of 1 (in the worst
3731 // case) before returning. Cases with negative bytes further ahead are
3732 // barely affected; in fact they get faster thanks to the early loads
3733 // and the fewer instructions and branches in LARGE_LOOP.
3734 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
3735 __ sub(len, len, 16);
3736 __ orr(tmp6, tmp6, tmp1);
3737 __ tst(tmp6, UPPER_BIT_MASK);
3738 __ br(Assembler::NE, RET_TRUE);
3739 __ cmp(len, large_loop_size);
3740 __ br(Assembler::LT, CHECK_16);
3741
3742 if (SoftwarePrefetchHintDistance >= 0
3743 && SoftwarePrefetchHintDistance >= dcache_line) {
3744 // initial prefetch
3745 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
3746 }
3747 __ bind(LARGE_LOOP);
3748 if (SoftwarePrefetchHintDistance >= 0) {
3749 __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
3750 }
3751 // Issue all load instructions first, since that can save a few CPU/memory
3752 // cycles. Also, instead of 4 "orr(...); tst(...); br(...);" triples (one
3753 // per ldp), generate 7 orr(...) plus 1 tst(...) and 1 br(...), which saves
3754 // 3 instructions and uses fewer branches. The trade-off is that this
3755 // disables the early return, so all 64 bytes are loaded and checked every time.
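// Net effect of one LARGE_LOOP iteration, as a C sketch (v[0..7] stand
// for the eight 64-bit words loaded by the four ldp instructions below):
//
//   uint64_t t = v[0] | v[1] | v[2] | v[3] | v[4] | v[5] | v[6] | v[7];
//   if (t & UPPER_BIT_MASK) return true;   // some byte has its top bit set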
3756 __ ldp(tmp2, tmp3, Address(ary1)); 3757 __ ldp(tmp4, tmp5, Address(ary1, 16)); 3758 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 3759 __ ldp(tmp6, tmp1, Address(ary1, 48)); 3760 __ add(ary1, ary1, large_loop_size); 3761 __ sub(len, len, large_loop_size); 3762 __ orr(tmp2, tmp2, tmp3); 3763 __ orr(tmp4, tmp4, tmp5); 3764 __ orr(rscratch1, rscratch1, rscratch2); 3765 __ orr(tmp6, tmp6, tmp1); 3766 __ orr(tmp2, tmp2, tmp4); 3767 __ orr(rscratch1, rscratch1, tmp6); 3768 __ orr(tmp2, tmp2, rscratch1); 3769 __ tst(tmp2, UPPER_BIT_MASK); 3770 __ br(Assembler::NE, RET_TRUE); 3771 __ cmp(len, large_loop_size); 3772 __ br(Assembler::GE, LARGE_LOOP); 3773 3774 __ bind(CHECK_16); // small 16-byte load pre-loop 3775 __ cmp(len, (u1)16); 3776 __ br(Assembler::LT, POST_LOOP16); 3777 3778 __ bind(LOOP16); // small 16-byte load loop 3779 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 3780 __ sub(len, len, 16); 3781 __ orr(tmp2, tmp2, tmp3); 3782 __ tst(tmp2, UPPER_BIT_MASK); 3783 __ br(Assembler::NE, RET_TRUE); 3784 __ cmp(len, (u1)16); 3785 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 3786 3787 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 3788 __ cmp(len, (u1)8); 3789 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 3790 __ ldr(tmp3, Address(__ post(ary1, 8))); 3791 __ sub(len, len, 8); 3792 __ tst(tmp3, UPPER_BIT_MASK); 3793 __ br(Assembler::NE, RET_TRUE); 3794 3795 __ bind(POST_LOOP16_LOAD_TAIL); 3796 __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0 3797 __ ldr(tmp1, Address(ary1)); 3798 __ mov(tmp2, 64); 3799 __ sub(tmp4, tmp2, len, __ LSL, 3); 3800 __ lslv(tmp1, tmp1, tmp4); 3801 __ tst(tmp1, UPPER_BIT_MASK); 3802 __ br(Assembler::NE, RET_TRUE); 3803 // Fallthrough 3804 3805 __ bind(RET_FALSE); 3806 __ pop(spilled_regs, sp); 3807 __ leave(); 3808 __ mov(result, zr); 3809 __ ret(lr); 3810 3811 __ bind(RET_TRUE); 3812 __ pop(spilled_regs, sp); 3813 __ bind(RET_TRUE_NO_POP); 3814 __ leave(); 3815 __ mov(result, 1); 3816 __ ret(lr); 3817 3818 __ bind(DONE); 3819 __ pop(spilled_regs, sp); 3820 __ leave(); 3821 __ ret(lr); 3822 return entry; 3823 } 3824 3825 void generate_large_array_equals_loop_nonsimd(int loopThreshold, 3826 bool usePrefetch, Label &NOT_EQUAL) { 3827 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 3828 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 3829 tmp7 = r12, tmp8 = r13; 3830 Label LOOP; 3831 3832 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3833 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3834 __ bind(LOOP); 3835 if (usePrefetch) { 3836 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 3837 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 3838 } 3839 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 3840 __ eor(tmp1, tmp1, tmp2); 3841 __ eor(tmp3, tmp3, tmp4); 3842 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 3843 __ orr(tmp1, tmp1, tmp3); 3844 __ cbnz(tmp1, NOT_EQUAL); 3845 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3846 __ eor(tmp5, tmp5, tmp6); 3847 __ eor(tmp7, tmp7, tmp8); 3848 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3849 __ orr(tmp5, tmp5, tmp7); 3850 __ cbnz(tmp5, NOT_EQUAL); 3851 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 3852 __ eor(tmp1, tmp1, tmp2); 3853 __ eor(tmp3, tmp3, tmp4); 3854 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 3855 __ orr(tmp1, tmp1, tmp3); 3856 __ cbnz(tmp1, NOT_EQUAL); 3857 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3858 __ eor(tmp5, tmp5, tmp6); 
3859 __ sub(cnt1, cnt1, 8 * wordSize);
3860 __ eor(tmp7, tmp7, tmp8);
3861 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3862 // tmp6 is not used. MacroAssembler::subs is used here (rather than
3863 // cmp) because subs allows an unlimited range of immediate operands.
3864 __ subs(tmp6, cnt1, loopThreshold);
3865 __ orr(tmp5, tmp5, tmp7);
3866 __ cbnz(tmp5, NOT_EQUAL);
3867 __ br(__ GE, LOOP);
3868 // post-loop
3869 __ eor(tmp1, tmp1, tmp2);
3870 __ eor(tmp3, tmp3, tmp4);
3871 __ orr(tmp1, tmp1, tmp3);
3872 __ sub(cnt1, cnt1, 2 * wordSize);
3873 __ cbnz(tmp1, NOT_EQUAL);
3874 }
3875
3876 void generate_large_array_equals_loop_simd(int loopThreshold,
3877 bool usePrefetch, Label &NOT_EQUAL) {
3878 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3879 tmp2 = rscratch2;
3880 Label LOOP;
3881
3882 __ bind(LOOP);
3883 if (usePrefetch) {
3884 __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3885 __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3886 }
3887 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
3888 __ sub(cnt1, cnt1, 8 * wordSize);
3889 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
3890 __ subs(tmp1, cnt1, loopThreshold);
3891 __ eor(v0, __ T16B, v0, v4);
3892 __ eor(v1, __ T16B, v1, v5);
3893 __ eor(v2, __ T16B, v2, v6);
3894 __ eor(v3, __ T16B, v3, v7);
3895 __ orr(v0, __ T16B, v0, v1);
3896 __ orr(v1, __ T16B, v2, v3);
3897 __ orr(v0, __ T16B, v0, v1);
3898 __ umov(tmp1, v0, __ D, 0);
3899 __ umov(tmp2, v0, __ D, 1);
3900 __ orr(tmp1, tmp1, tmp2);
3901 __ cbnz(tmp1, NOT_EQUAL);
3902 __ br(__ GE, LOOP);
3903 }
3904
3905 // a1 = r1 - array1 address
3906 // a2 = r2 - array2 address
3907 // result = r0 - return value. Already contains "false"
3908 // cnt1 = r10 - number of elements left to check, already reduced by wordSize
3909 // r3-r5 are reserved temporary registers
3910 address generate_large_array_equals() {
3911 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3912 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3913 tmp7 = r12, tmp8 = r13;
3914 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
3915 SMALL_LOOP, POST_LOOP;
3916 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
3917 // exit the prefetch loop early enough that at least 32 prefetched bytes are still used
3918 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
3919 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
3920 RegSet spilled_regs = RegSet::range(tmp6, tmp8);
3921 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
3922 tmp5, tmp6, tmp7, tmp8);
3923
3924 __ align(CodeEntryAlignment);
3925
3926 StubCodeMark mark(this, "StubRoutines", "large_array_equals");
3927
3928 address entry = __ pc();
3929 __ enter();
3930 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub
3931 // also advance pointers to use post-increment instead of pre-increment
3932 __ add(a1, a1, wordSize);
3933 __ add(a2, a2, wordSize);
3934 if (AvoidUnalignedAccesses) {
3935 // Both implementations (SIMD and non-SIMD) use relatively wide load
3936 // instructions (ld1/ldp), which carry a large penalty (up to 2x
3937 // execution time) on some CPUs when the address is not at least
3938 // 16-byte aligned. Arrays are currently 8-byte aligned, so if needed
3939 // we do one extra 8-byte load to make at least the first array's address 16-byte aligned.
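// The alignment fix-up below, roughly, in C (a sketch; names are
// illustrative only):
//
//   if ((uintptr_t)a1 & 8) {              // 8- but not 16-byte aligned
//     if (*a1++ != *a2++) return false;   // compare one word, advance both
//     cnt1 -= 8;
//   }
//   // ...subsequent 16-byte loads from a1 are now 16-byte aligned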
3940 Label ALIGNED16; 3941 __ tbz(a1, 3, ALIGNED16); 3942 __ ldr(tmp1, Address(__ post(a1, wordSize))); 3943 __ ldr(tmp2, Address(__ post(a2, wordSize))); 3944 __ sub(cnt1, cnt1, wordSize); 3945 __ eor(tmp1, tmp1, tmp2); 3946 __ cbnz(tmp1, NOT_EQUAL_NO_POP); 3947 __ bind(ALIGNED16); 3948 } 3949 if (UseSIMDForArrayEquals) { 3950 if (SoftwarePrefetchHintDistance >= 0) { 3951 __ subs(tmp1, cnt1, prefetchLoopThreshold); 3952 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 3953 generate_large_array_equals_loop_simd(prefetchLoopThreshold, 3954 /* prfm = */ true, NOT_EQUAL); 3955 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 3956 __ br(__ LT, TAIL); 3957 } 3958 __ bind(NO_PREFETCH_LARGE_LOOP); 3959 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold, 3960 /* prfm = */ false, NOT_EQUAL); 3961 } else { 3962 __ push(spilled_regs, sp); 3963 if (SoftwarePrefetchHintDistance >= 0) { 3964 __ subs(tmp1, cnt1, prefetchLoopThreshold); 3965 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 3966 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold, 3967 /* prfm = */ true, NOT_EQUAL); 3968 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 3969 __ br(__ LT, TAIL); 3970 } 3971 __ bind(NO_PREFETCH_LARGE_LOOP); 3972 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold, 3973 /* prfm = */ false, NOT_EQUAL); 3974 } 3975 __ bind(TAIL); 3976 __ cbz(cnt1, EQUAL); 3977 __ subs(cnt1, cnt1, wordSize); 3978 __ br(__ LE, POST_LOOP); 3979 __ bind(SMALL_LOOP); 3980 __ ldr(tmp1, Address(__ post(a1, wordSize))); 3981 __ ldr(tmp2, Address(__ post(a2, wordSize))); 3982 __ subs(cnt1, cnt1, wordSize); 3983 __ eor(tmp1, tmp1, tmp2); 3984 __ cbnz(tmp1, NOT_EQUAL); 3985 __ br(__ GT, SMALL_LOOP); 3986 __ bind(POST_LOOP); 3987 __ ldr(tmp1, Address(a1, cnt1)); 3988 __ ldr(tmp2, Address(a2, cnt1)); 3989 __ eor(tmp1, tmp1, tmp2); 3990 __ cbnz(tmp1, NOT_EQUAL); 3991 __ bind(EQUAL); 3992 __ mov(result, true); 3993 __ bind(NOT_EQUAL); 3994 if (!UseSIMDForArrayEquals) { 3995 __ pop(spilled_regs, sp); 3996 } 3997 __ bind(NOT_EQUAL_NO_POP); 3998 __ leave(); 3999 __ ret(lr); 4000 return entry; 4001 } 4002 4003 address generate_dsin_dcos(bool isCos) { 4004 __ align(CodeEntryAlignment); 4005 StubCodeMark mark(this, "StubRoutines", isCos ? 
"libmDcos" : "libmDsin"); 4006 address start = __ pc(); 4007 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw, 4008 (address)StubRoutines::aarch64::_two_over_pi, 4009 (address)StubRoutines::aarch64::_pio2, 4010 (address)StubRoutines::aarch64::_dsin_coef, 4011 (address)StubRoutines::aarch64::_dcos_coef); 4012 return start; 4013 } 4014 4015 address generate_dlog() { 4016 __ align(CodeEntryAlignment); 4017 StubCodeMark mark(this, "StubRoutines", "dlog"); 4018 address entry = __ pc(); 4019 FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4, 4020 vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19; 4021 Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4; 4022 __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3, 4023 tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5); 4024 return entry; 4025 } 4026 4027 // code for comparing 16 bytes of strings with same encoding 4028 void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) { 4029 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11; 4030 __ ldr(rscratch1, Address(__ post(str1, 8))); 4031 __ eor(rscratch2, tmp1, tmp2); 4032 __ ldr(cnt1, Address(__ post(str2, 8))); 4033 __ cbnz(rscratch2, DIFF1); 4034 __ ldr(tmp1, Address(__ post(str1, 8))); 4035 __ eor(rscratch2, rscratch1, cnt1); 4036 __ ldr(tmp2, Address(__ post(str2, 8))); 4037 __ cbnz(rscratch2, DIFF2); 4038 } 4039 4040 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding 4041 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 4042 Label &DIFF2) { 4043 Register cnt1 = r2, tmp1 = r10, tmp2 = r11, tmp3 = r12; 4044 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; 4045 4046 __ ldrq(vtmp, Address(__ post(tmp2, 16))); 4047 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4048 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 4049 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 4050 4051 __ fmovd(tmpL, vtmp3); 4052 __ eor(rscratch2, tmp3, tmpL); 4053 __ cbnz(rscratch2, DIFF2); 4054 4055 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4056 __ umov(tmpL, vtmp3, __ D, 1); 4057 __ eor(rscratch2, tmpU, tmpL); 4058 __ cbnz(rscratch2, DIFF1); 4059 4060 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 4061 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4062 __ fmovd(tmpL, vtmp); 4063 __ eor(rscratch2, tmp3, tmpL); 4064 __ cbnz(rscratch2, DIFF2); 4065 4066 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4067 __ umov(tmpL, vtmp, __ D, 1); 4068 __ eor(rscratch2, tmpU, tmpL); 4069 __ cbnz(rscratch2, DIFF1); 4070 } 4071 4072 // r0 = result 4073 // r1 = str1 4074 // r2 = cnt1 4075 // r3 = str2 4076 // r4 = cnt2 4077 // r10 = tmp1 4078 // r11 = tmp2 4079 address generate_compare_long_string_different_encoding(bool isLU) { 4080 __ align(CodeEntryAlignment); 4081 StubCodeMark mark(this, "StubRoutines", isLU 4082 ? 
"compare_long_string_different_encoding LU" 4083 : "compare_long_string_different_encoding UL"); 4084 address entry = __ pc(); 4085 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, 4086 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, SMALL_LOOP_ENTER, 4087 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; 4088 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 4089 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; 4090 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; 4091 RegSet spilled_regs = RegSet::of(tmp3, tmp4); 4092 4093 int prefetchLoopExitCondition = MAX(32, SoftwarePrefetchHintDistance/2); 4094 4095 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); 4096 // cnt2 == amount of characters left to compare 4097 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL)) 4098 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 4099 __ add(str1, str1, isLU ? wordSize/2 : wordSize); 4100 __ add(str2, str2, isLU ? wordSize : wordSize/2); 4101 __ fmovd(isLU ? tmp1 : tmp2, vtmp); 4102 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case. 4103 __ add(str1, str1, cnt2, __ LSL, isLU ? 0 : 1); 4104 __ eor(rscratch2, tmp1, tmp2); 4105 __ add(str2, str2, cnt2, __ LSL, isLU ? 1 : 0); 4106 __ mov(rscratch1, tmp2); 4107 __ cbnz(rscratch2, CALCULATE_DIFFERENCE); 4108 Register strU = isLU ? str2 : str1, 4109 strL = isLU ? str1 : str2, 4110 tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison 4111 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison 4112 __ push(spilled_regs, sp); 4113 __ sub(tmp2, strL, cnt2); // strL pointer to load from 4114 __ sub(cnt1, strU, cnt2, __ LSL, 1); // strU pointer to load from 4115 4116 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4117 4118 if (SoftwarePrefetchHintDistance >= 0) { 4119 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 4120 __ br(__ LT, SMALL_LOOP); 4121 __ bind(LARGE_LOOP_PREFETCH); 4122 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance)); 4123 __ mov(tmp4, 2); 4124 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 4125 __ bind(LARGE_LOOP_PREFETCH_REPEAT1); 4126 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4127 __ subs(tmp4, tmp4, 1); 4128 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1); 4129 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 4130 __ mov(tmp4, 2); 4131 __ bind(LARGE_LOOP_PREFETCH_REPEAT2); 4132 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4133 __ subs(tmp4, tmp4, 1); 4134 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2); 4135 __ sub(cnt2, cnt2, 64); 4136 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 4137 __ br(__ GE, LARGE_LOOP_PREFETCH); 4138 } 4139 __ cbz(cnt2, LOAD_LAST); // no characters left except last load 4140 __ subs(cnt2, cnt2, 16); 4141 __ br(__ LT, TAIL); 4142 __ b(SMALL_LOOP_ENTER); 4143 __ bind(SMALL_LOOP); // smaller loop 4144 __ subs(cnt2, cnt2, 16); 4145 __ bind(SMALL_LOOP_ENTER); 4146 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4147 __ br(__ GE, SMALL_LOOP); 4148 __ cbz(cnt2, LOAD_LAST); 4149 __ bind(TAIL); // 1..15 characters left 4150 __ subs(zr, cnt2, -8); 4151 __ br(__ GT, TAIL_LOAD_16); 4152 __ ldrd(vtmp, Address(tmp2)); 4153 __ zip1(vtmp3, __ T8B, vtmp, vtmpZ); 4154 4155 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4156 __ fmovd(tmpL, vtmp3); 4157 __ eor(rscratch2, tmp3, tmpL); 4158 __ cbnz(rscratch2, DIFF2); 4159 __ umov(tmpL, vtmp3, __ D, 1); 4160 __ eor(rscratch2, tmpU, tmpL); 4161 __ cbnz(rscratch2, DIFF1); 4162 __ b(LOAD_LAST); 4163 __ bind(TAIL_LOAD_16); 4164 __ ldrq(vtmp, Address(tmp2)); 4165 __ ldr(tmpU, Address(__ post(cnt1, 
8))); 4166 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 4167 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 4168 __ fmovd(tmpL, vtmp3); 4169 __ eor(rscratch2, tmp3, tmpL); 4170 __ cbnz(rscratch2, DIFF2); 4171 4172 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4173 __ umov(tmpL, vtmp3, __ D, 1); 4174 __ eor(rscratch2, tmpU, tmpL); 4175 __ cbnz(rscratch2, DIFF1); 4176 4177 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4178 __ fmovd(tmpL, vtmp); 4179 __ eor(rscratch2, tmp3, tmpL); 4180 __ cbnz(rscratch2, DIFF2); 4181 4182 __ umov(tmpL, vtmp, __ D, 1); 4183 __ eor(rscratch2, tmpU, tmpL); 4184 __ cbnz(rscratch2, DIFF1); 4185 __ b(LOAD_LAST); 4186 __ bind(DIFF2); 4187 __ mov(tmpU, tmp3); 4188 __ bind(DIFF1); 4189 __ pop(spilled_regs, sp); 4190 __ b(CALCULATE_DIFFERENCE); 4191 __ bind(LOAD_LAST); 4192 __ pop(spilled_regs, sp); 4193 4194 __ ldrs(vtmp, Address(strL)); 4195 __ ldr(tmpU, Address(strU)); 4196 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 4197 __ fmovd(tmpL, vtmp); 4198 4199 __ eor(rscratch2, tmpU, tmpL); 4200 __ cbz(rscratch2, DONE); 4201 4202 // Find the first different characters in the longwords and 4203 // compute their difference. 4204 __ bind(CALCULATE_DIFFERENCE); 4205 __ rev(rscratch2, rscratch2); 4206 __ clz(rscratch2, rscratch2); 4207 __ andr(rscratch2, rscratch2, -16); 4208 __ lsrv(tmp1, tmp1, rscratch2); 4209 __ uxthw(tmp1, tmp1); 4210 __ lsrv(rscratch1, rscratch1, rscratch2); 4211 __ uxthw(rscratch1, rscratch1); 4212 __ subw(result, tmp1, rscratch1); 4213 __ bind(DONE); 4214 __ ret(lr); 4215 return entry; 4216 } 4217 4218 // r0 = result 4219 // r1 = str1 4220 // r2 = cnt1 4221 // r3 = str2 4222 // r4 = cnt2 4223 // r10 = tmp1 4224 // r11 = tmp2 4225 address generate_compare_long_string_same_encoding(bool isLL) { 4226 __ align(CodeEntryAlignment); 4227 StubCodeMark mark(this, "StubRoutines", isLL 4228 ? "compare_long_string_same_encoding LL" 4229 : "compare_long_string_same_encoding UU"); 4230 address entry = __ pc(); 4231 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 4232 tmp1 = r10, tmp2 = r11; 4233 Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL, 4234 LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF, 4235 DIFF_LAST_POSITION, DIFF_LAST_POSITION2; 4236 // exit from large loop when less than 64 bytes left to read or we're about 4237 // to prefetch memory behind array border 4238 int largeLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); 4239 // cnt1/cnt2 contains amount of characters to compare. cnt1 can be re-used 4240 // update cnt2 counter with already loaded 8 bytes 4241 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2)); 4242 // update pointers, because of previous read 4243 __ add(str1, str1, wordSize); 4244 __ add(str2, str2, wordSize); 4245 if (SoftwarePrefetchHintDistance >= 0) { 4246 __ bind(LARGE_LOOP_PREFETCH); 4247 __ prfm(Address(str1, SoftwarePrefetchHintDistance)); 4248 __ prfm(Address(str2, SoftwarePrefetchHintDistance)); 4249 compare_string_16_bytes_same(DIFF, DIFF2); 4250 compare_string_16_bytes_same(DIFF, DIFF2); 4251 __ sub(cnt2, cnt2, isLL ? 64 : 32); 4252 compare_string_16_bytes_same(DIFF, DIFF2); 4253 __ subs(rscratch2, cnt2, largeLoopExitCondition); 4254 compare_string_16_bytes_same(DIFF, DIFF2); 4255 __ br(__ GT, LARGE_LOOP_PREFETCH); 4256 __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left? 4257 // less than 16 bytes left? 4258 __ subs(cnt2, cnt2, isLL ? 16 : 8); 4259 __ br(__ LT, TAIL); 4260 } 4261 __ bind(SMALL_LOOP); 4262 compare_string_16_bytes_same(DIFF, DIFF2); 4263 __ subs(cnt2, cnt2, isLL ? 
16 : 8);
4264 __ br(__ GE, SMALL_LOOP);
4265 __ bind(TAIL);
4266 __ adds(cnt2, cnt2, isLL ? 16 : 8);
4267 __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF);
4268 __ subs(cnt2, cnt2, isLL ? 8 : 4);
4269 __ br(__ LE, CHECK_LAST);
4270 __ eor(rscratch2, tmp1, tmp2);
4271 __ cbnz(rscratch2, DIFF);
4272 __ ldr(tmp1, Address(__ post(str1, 8)));
4273 __ ldr(tmp2, Address(__ post(str2, 8)));
4274 __ sub(cnt2, cnt2, isLL ? 8 : 4);
4275 __ bind(CHECK_LAST);
4276 if (!isLL) {
4277 __ add(cnt2, cnt2, cnt2); // now in bytes
4278 }
4279 __ eor(rscratch2, tmp1, tmp2);
4280 __ cbnz(rscratch2, DIFF);
4281 __ ldr(rscratch1, Address(str1, cnt2));
4282 __ ldr(cnt1, Address(str2, cnt2));
4283 __ eor(rscratch2, rscratch1, cnt1);
4284 __ cbz(rscratch2, LENGTH_DIFF);
4285 // Find the first different characters in the longwords and
4286 // compute their difference.
4287 __ bind(DIFF2);
4288 __ rev(rscratch2, rscratch2);
4289 __ clz(rscratch2, rscratch2);
4290 __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4291 __ lsrv(rscratch1, rscratch1, rscratch2);
4292 if (isLL) {
4293 __ lsrv(cnt1, cnt1, rscratch2);
4294 __ uxtbw(rscratch1, rscratch1);
4295 __ uxtbw(cnt1, cnt1);
4296 } else {
4297 __ lsrv(cnt1, cnt1, rscratch2);
4298 __ uxthw(rscratch1, rscratch1);
4299 __ uxthw(cnt1, cnt1);
4300 }
4301 __ subw(result, rscratch1, cnt1);
4302 __ b(LENGTH_DIFF);
4303 __ bind(DIFF);
4304 __ rev(rscratch2, rscratch2);
4305 __ clz(rscratch2, rscratch2);
4306 __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4307 __ lsrv(tmp1, tmp1, rscratch2);
4308 if (isLL) {
4309 __ lsrv(tmp2, tmp2, rscratch2);
4310 __ uxtbw(tmp1, tmp1);
4311 __ uxtbw(tmp2, tmp2);
4312 } else {
4313 __ lsrv(tmp2, tmp2, rscratch2);
4314 __ uxthw(tmp1, tmp1);
4315 __ uxthw(tmp2, tmp2);
4316 }
4317 __ subw(result, tmp1, tmp2);
4318 __ b(LENGTH_DIFF);
4319 __ bind(LAST_CHECK_AND_LENGTH_DIFF);
4320 __ eor(rscratch2, tmp1, tmp2);
4321 __ cbnz(rscratch2, DIFF);
4322 __ bind(LENGTH_DIFF);
4323 __ ret(lr);
4324 return entry;
4325 }
4326
4327 void generate_compare_long_strings() {
4328 StubRoutines::aarch64::_compare_long_string_LL
4329 = generate_compare_long_string_same_encoding(true);
4330 StubRoutines::aarch64::_compare_long_string_UU
4331 = generate_compare_long_string_same_encoding(false);
4332 StubRoutines::aarch64::_compare_long_string_LU
4333 = generate_compare_long_string_different_encoding(true);
4334 StubRoutines::aarch64::_compare_long_string_UL
4335 = generate_compare_long_string_different_encoding(false);
4336 }
4337
4338 // R0 = result
4339 // R1 = str2
4340 // R2 = cnt1
4341 // R3 = str1
4342 // R4 = cnt2
4343 // This generic linear code uses a few additional ideas that make it faster:
4344 // 1) since the length is >= 8, we can safely keep at least the first
4345 // register of the pattern resident and skip reloading it (this helps on
4346 // systems with a single load pipeline)
4347 // 2) we can use the "fast" single-character search algorithm to find the
4348 // first symbol with fewer branches (one branch per loaded register instead
4349 // of one per symbol); this is where constants like
4350 // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
4351 // 3) after loading and analyzing the first register of the source string,
4352 // it can be reused to search for every occurrence of the first character,
4353 // saving a few loads compared with a simpler-but-slower implementation
4354 // 4) to avoid lots of push/pop operations, the code below heavily reuses,
4355 // re-initializes and compresses register values, which makes the code
4356 // larger and a bit less readable; however, most of the extra operations issue during loads or branches, so the penalty is minimal
4357 address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
4358 const char* stubName = str1_isL
4359 ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul")
4360 : "indexof_linear_uu";
4361 __ align(CodeEntryAlignment);
4362 StubCodeMark mark(this, "StubRoutines", stubName);
4363 address entry = __ pc();
4364
4365 int str1_chr_size = str1_isL ? 1 : 2;
4366 int str2_chr_size = str2_isL ? 1 : 2;
4367 int str1_chr_shift = str1_isL ? 0 : 1;
4368 int str2_chr_shift = str2_isL ? 0 : 1;
4369 bool isL = str1_isL && str2_isL;
4370 // parameters
4371 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4;
4372 // temporary registers
4373 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23;
4374 RegSet spilled_regs = RegSet::range(tmp1, tmp4);
4375 // redefinitions
4376 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3;
4377
4378 __ push(spilled_regs, sp);
4379 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO,
4380 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED,
4381 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP,
4382 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH,
4383 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2,
4384 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH;
4385 // Read a whole register from str1. This is safe because length >= 8 here
4386 __ ldr(ch1, Address(str1));
4387 // Read a whole register from str2. This is safe because length >= 8 here
4388 __ ldr(ch2, Address(str2));
4389 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF);
4390 if (str1_isL != str2_isL) {
4391 __ eor(v0, __ T16B, v0, v0);
4392 }
4393 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001);
4394 __ mul(first, first, tmp1);
4395 // check if we have less than one register's worth of characters to check
4396 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1);
4397 if (str1_isL != str2_isL) {
4398 __ fmovd(v1, ch1);
4399 }
4400 __ br(__ LE, L_SMALL);
4401 __ eor(ch2, first, ch2);
4402 if (str1_isL != str2_isL) {
4403 __ zip1(v1, __ T16B, v1, v0);
4404 }
4405 __ sub(tmp2, ch2, tmp1);
4406 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4407 __ bics(tmp2, tmp2, ch2);
4408 if (str1_isL != str2_isL) {
4409 __ fmovd(ch1, v1);
4410 }
4411 __ br(__ NE, L_HAS_ZERO);
4412 __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4413 __ add(result, result, wordSize/str2_chr_size);
4414 __ add(str2, str2, wordSize);
4415 __ br(__ LT, L_POST_LOOP);
4416 __ BIND(L_LOOP);
4417 __ ldr(ch2, Address(str2));
4418 __ eor(ch2, first, ch2);
4419 __ sub(tmp2, ch2, tmp1);
4420 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff);
4421 __ bics(tmp2, tmp2, ch2);
4422 __ br(__ NE, L_HAS_ZERO);
4423 __ BIND(L_LOOP_PROCEED);
4424 __ subs(cnt2, cnt2, wordSize/str2_chr_size);
4425 __ add(str2, str2, wordSize);
4426 __ add(result, result, wordSize/str2_chr_size);
4427 __ br(__ GE, L_LOOP);
4428 __ BIND(L_POST_LOOP);
4429 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check
4430 __ br(__ LE, NOMATCH);
4431 __ ldr(ch2, Address(str2));
4432 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift);
4433 __ eor(ch2, first, ch2);
4434 __ sub(tmp2, ch2, tmp1);
4435 __ orr(ch2, ch2, str2_isL ?
0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4436 __ mov(tmp4, -1); // all bits set 4437 __ b(L_SMALL_PROCEED); 4438 __ align(OptoLoopAlignment); 4439 __ BIND(L_SMALL); 4440 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 4441 __ eor(ch2, first, ch2); 4442 if (str1_isL != str2_isL) { 4443 __ zip1(v1, __ T16B, v1, v0); 4444 } 4445 __ sub(tmp2, ch2, tmp1); 4446 __ mov(tmp4, -1); // all bits set 4447 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4448 if (str1_isL != str2_isL) { 4449 __ fmovd(ch1, v1); // move converted 4 symbols 4450 } 4451 __ BIND(L_SMALL_PROCEED); 4452 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits. 4453 __ bic(tmp2, tmp2, ch2); 4454 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check 4455 __ rbit(tmp2, tmp2); 4456 __ br(__ EQ, NOMATCH); 4457 __ BIND(L_SMALL_HAS_ZERO_LOOP); 4458 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's 4459 __ cmp(cnt1, u1(wordSize/str2_chr_size)); 4460 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2); 4461 if (str2_isL) { // LL 4462 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 4463 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 4464 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 4465 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 4466 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4467 } else { 4468 __ mov(ch2, 0xE); // all bits in byte set except last one 4469 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4470 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4471 __ lslv(tmp2, tmp2, tmp4); 4472 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4473 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4474 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4475 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4476 } 4477 __ cmp(ch1, ch2); 4478 __ mov(tmp4, wordSize/str2_chr_size); 4479 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4480 __ BIND(L_SMALL_CMP_LOOP); 4481 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 4482 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 4483 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 4484 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 4485 __ add(tmp4, tmp4, 1); 4486 __ cmp(tmp4, cnt1); 4487 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP); 4488 __ cmp(first, ch2); 4489 __ br(__ EQ, L_SMALL_CMP_LOOP); 4490 __ BIND(L_SMALL_CMP_LOOP_NOMATCH); 4491 __ cbz(tmp2, NOMATCH); // no more matches. exit 4492 __ clz(tmp4, tmp2); 4493 __ add(result, result, 1); // advance index 4494 __ add(str2, str2, str2_chr_size); // advance pointer 4495 __ b(L_SMALL_HAS_ZERO_LOOP); 4496 __ align(OptoLoopAlignment); 4497 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP); 4498 __ cmp(first, ch2); 4499 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4500 __ b(DONE); 4501 __ align(OptoLoopAlignment); 4502 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2); 4503 if (str2_isL) { // LL 4504 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 4505 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 
4506 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 4507 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 4508 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4509 } else { 4510 __ mov(ch2, 0xE); // all bits in byte set except last one 4511 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4512 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4513 __ lslv(tmp2, tmp2, tmp4); 4514 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4515 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4516 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4517 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4518 } 4519 __ cmp(ch1, ch2); 4520 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4521 __ b(DONE); 4522 __ align(OptoLoopAlignment); 4523 __ BIND(L_HAS_ZERO); 4524 __ rbit(tmp2, tmp2); 4525 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's 4526 // Now, perform compression of counters(cnt2 and cnt1) into one register. 4527 // It's fine because both counters are 32bit and are not changed in this 4528 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop. 4529 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2); 4530 __ sub(result, result, 1); 4531 __ BIND(L_HAS_ZERO_LOOP); 4532 __ mov(cnt1, wordSize/str2_chr_size); 4533 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2); 4534 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare 4535 if (str2_isL) { 4536 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 4537 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4538 __ lslv(tmp2, tmp2, tmp4); 4539 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4540 __ add(tmp4, tmp4, 1); 4541 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4542 __ lsl(tmp2, tmp2, 1); 4543 __ mov(tmp4, wordSize/str2_chr_size); 4544 } else { 4545 __ mov(ch2, 0xE); 4546 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4547 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4548 __ lslv(tmp2, tmp2, tmp4); 4549 __ add(tmp4, tmp4, 1); 4550 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4551 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 4552 __ lsl(tmp2, tmp2, 1); 4553 __ mov(tmp4, wordSize/str2_chr_size); 4554 __ sub(str2, str2, str2_chr_size); 4555 } 4556 __ cmp(ch1, ch2); 4557 __ mov(tmp4, wordSize/str2_chr_size); 4558 __ br(__ NE, L_CMP_LOOP_NOMATCH); 4559 __ BIND(L_CMP_LOOP); 4560 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 4561 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 4562 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 4563 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 4564 __ add(tmp4, tmp4, 1); 4565 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2); 4566 __ br(__ GE, L_CMP_LOOP_LAST_CMP); 4567 __ cmp(cnt1, ch2); 4568 __ br(__ EQ, L_CMP_LOOP); 4569 __ BIND(L_CMP_LOOP_NOMATCH); 4570 // here we're not matched 4571 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. 
Proceed to main loop
4572 __ clz(tmp4, tmp2);
4573 __ add(str2, str2, str2_chr_size); // advance pointer
4574 __ b(L_HAS_ZERO_LOOP);
4575 __ align(OptoLoopAlignment);
4576 __ BIND(L_CMP_LOOP_LAST_CMP);
4577 __ cmp(cnt1, ch2);
4578 __ br(__ NE, L_CMP_LOOP_NOMATCH);
4579 __ b(DONE);
4580 __ align(OptoLoopAlignment);
4581 __ BIND(L_CMP_LOOP_LAST_CMP2);
4582 if (str2_isL) {
4583 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4584 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4585 __ lslv(tmp2, tmp2, tmp4);
4586 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4587 __ add(tmp4, tmp4, 1);
4588 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4589 __ lsl(tmp2, tmp2, 1);
4590 } else {
4591 __ mov(ch2, 0xE);
4592 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4593 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4594 __ lslv(tmp2, tmp2, tmp4);
4595 __ add(tmp4, tmp4, 1);
4596 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4597 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4598 __ lsl(tmp2, tmp2, 1);
4599 __ sub(str2, str2, str2_chr_size);
4600 }
4601 __ cmp(ch1, ch2);
4602 __ br(__ NE, L_CMP_LOOP_NOMATCH);
4603 __ b(DONE);
4604 __ align(OptoLoopAlignment);
4605 __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
4606 // 1) Restore the "result" index. Up to the L_HAS_ZERO block the index
4607 // was a multiple of wordSize/str2_chr_size. The byte octet was analyzed
4608 // in L_HAS_ZERO_LOOP, so result was increased by at most
4609 // wordSize/str2_chr_size - 1 and the respective high bits are unchanged.
4610 // L_LOOP_PROCEED will increase result by the number of analyzed
4611 // characters, so we can simply reset the lower bits of result here
4612 // (clear the 2 lower bits for UU/UL, 3 bits for LL).
4613 // 2) Restore the cnt1 and cnt2 values from the "compressed" cnt2.
4614 // 3) Advance str2 to the next str2 octet. result & 7 (or & 3) is the index
4615 // of the last analyzed substring inside the current octet, so str2 is at the respective start address. 
We need to advance it to next octet 4616 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed 4617 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2); 4618 __ bfm(result, zr, 0, 2 - str2_chr_shift); 4619 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2 4620 __ movw(cnt2, cnt2); 4621 __ b(L_LOOP_PROCEED); 4622 __ align(OptoLoopAlignment); 4623 __ BIND(NOMATCH); 4624 __ mov(result, -1); 4625 __ BIND(DONE); 4626 __ pop(spilled_regs, sp); 4627 __ ret(lr); 4628 return entry; 4629 } 4630 4631 void generate_string_indexof_stubs() { 4632 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true); 4633 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false); 4634 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); 4635 } 4636 4637 void inflate_and_store_2_fp_registers(bool generatePrfm, 4638 FloatRegister src1, FloatRegister src2) { 4639 Register dst = r1; 4640 __ zip1(v1, __ T16B, src1, v0); 4641 __ zip2(v2, __ T16B, src1, v0); 4642 if (generatePrfm) { 4643 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM); 4644 } 4645 __ zip1(v3, __ T16B, src2, v0); 4646 __ zip2(v4, __ T16B, src2, v0); 4647 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64))); 4648 } 4649 4650 // R0 = src 4651 // R1 = dst 4652 // R2 = len 4653 // R3 = len >> 3 4654 // V0 = 0 4655 // v1 = loaded 8 bytes 4656 address generate_large_byte_array_inflate() { 4657 __ align(CodeEntryAlignment); 4658 StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate"); 4659 address entry = __ pc(); 4660 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE; 4661 Register src = r0, dst = r1, len = r2, octetCounter = r3; 4662 const int large_loop_threshold = MAX(64, SoftwarePrefetchHintDistance)/8 + 4; 4663 4664 // do one more 8-byte read to have address 16-byte aligned in most cases 4665 // also use single store instruction 4666 __ ldrd(v2, __ post(src, 8)); 4667 __ sub(octetCounter, octetCounter, 2); 4668 __ zip1(v1, __ T16B, v1, v0); 4669 __ zip1(v2, __ T16B, v2, v0); 4670 __ st1(v1, v2, __ T16B, __ post(dst, 32)); 4671 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 4672 __ subs(rscratch1, octetCounter, large_loop_threshold); 4673 __ br(__ LE, LOOP_START); 4674 __ b(LOOP_PRFM_START); 4675 __ bind(LOOP_PRFM); 4676 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 4677 __ bind(LOOP_PRFM_START); 4678 __ prfm(Address(src, SoftwarePrefetchHintDistance)); 4679 __ sub(octetCounter, octetCounter, 8); 4680 __ subs(rscratch1, octetCounter, large_loop_threshold); 4681 inflate_and_store_2_fp_registers(true, v3, v4); 4682 inflate_and_store_2_fp_registers(true, v5, v6); 4683 __ br(__ GT, LOOP_PRFM); 4684 __ cmp(octetCounter, (u1)8); 4685 __ br(__ LT, DONE); 4686 __ bind(LOOP); 4687 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 4688 __ bind(LOOP_START); 4689 __ sub(octetCounter, octetCounter, 8); 4690 __ cmp(octetCounter, (u1)8); 4691 inflate_and_store_2_fp_registers(false, v3, v4); 4692 inflate_and_store_2_fp_registers(false, v5, v6); 4693 __ br(__ GE, LOOP); 4694 __ bind(DONE); 4695 __ ret(lr); 4696 return entry; 4697 } 4698 4699 /** 4700 * Arguments: 4701 * 4702 * Input: 4703 * c_rarg0 - current state address 4704 * c_rarg1 - H key address 4705 * c_rarg2 - data address 4706 * c_rarg3 - number of blocks 4707 * 4708 * Output: 4709 * Updated state at c_rarg0 4710 */ 4711 address generate_ghash_processBlocks() { 4712 // Bafflingly, GCM uses 
little-endian for the byte order, but
4713 // big-endian for the bit order. For example, the polynomial 1 is
4714 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
4715 //
4716 // So, we must either reverse the bytes in each word and do
4717 // everything big-endian or reverse the bits in each byte and do
4718 // it little-endian. On AArch64 it's more idiomatic to reverse
4719 // the bits in each byte (we have an instruction, RBIT, to do
4720 // that) and keep the data in little-endian bit order throughout the
4721 // calculation, bit-reversing the inputs and outputs.
4722
4723 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4724 __ align(wordSize * 2);
4725 address p = __ pc();
4726 __ emit_int64(0x87); // The low-order bits of the field
4727 // polynomial (i.e. p = z^7+z^2+z+1)
4728 // repeated in the low and high parts of a
4729 // 128-bit vector
4730 __ emit_int64(0x87);
4731
4732 __ align(CodeEntryAlignment);
4733 address start = __ pc();
4734
4735 Register state = c_rarg0;
4736 Register subkeyH = c_rarg1;
4737 Register data = c_rarg2;
4738 Register blocks = c_rarg3;
4739
4740 FloatRegister vzr = v30;
4741 __ eor(vzr, __ T16B, vzr, vzr); // zero register
4742
4743 __ ldrq(v0, Address(state));
4744 __ ldrq(v1, Address(subkeyH));
4745
4746 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH
4747 __ rbit(v0, __ T16B, v0);
4748 __ rev64(v1, __ T16B, v1);
4749 __ rbit(v1, __ T16B, v1);
4750
4751 __ ldrq(v26, p);
4752
4753 __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH (in v1) into v16
4754 __ eor(v16, __ T16B, v16, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
4755
4756 {
4757 Label L_ghash_loop;
4758 __ bind(L_ghash_loop);
4759
4760 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
4761 // reversing each byte
4762 __ rbit(v2, __ T16B, v2);
4763 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state
4764
4765 // Multiply state in v2 by subkey in v1
4766 ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
4767 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
4768 /*temps*/v6, v20, v18, v21);
4769 // Reduce v7:v5 by the field polynomial
4770 ghash_reduce(v0, v5, v7, v26, vzr, v20);
4771
4772 __ sub(blocks, blocks, 1);
4773 __ cbnz(blocks, L_ghash_loop);
4774 }
4775
4776 // The bit-reversed result is at this point in v0
4777 __ rev64(v1, __ T16B, v0);
4778 __ rbit(v1, __ T16B, v1);
4779
4780 __ st1(v1, __ T16B, state);
4781 __ ret(lr);
4782
4783 return start;
4784 }
4785
4786 // Continuation point for throwing of implicit exceptions that are
4787 // not handled in the current activation. Fabricates an exception
4788 // oop and initiates normal exception dispatching in this
4789 // frame. Since we need to preserve callee-saved values (currently
4790 // only for C2, but done for C1 as well) we need a callee-saved oop
4791 // map and therefore have to make these stubs into RuntimeStubs
4792 // rather than BufferBlobs. If the compiler needs all registers to
4793 // be preserved between the fault point and the exception handler
4794 // then it must assume responsibility for that in
4795 // AbstractCompiler::continuation_for_implicit_null_exception or
4796 // continuation_for_implicit_division_by_zero_exception. All other
4797 // implicit exceptions (e.g., NullPointerException or
4798 // AbstractMethodError on entry) are either at call sites or
4799 // otherwise assume that stack unwinding will be initiated, so
4800 // caller-saved registers were assumed volatile in the compiler.
4801 4802 #undef __ 4803 #define __ masm-> 4804 4805 address generate_throw_exception(const char* name, 4806 address runtime_entry, 4807 Register arg1 = noreg, 4808 Register arg2 = noreg) { 4809 // Information about frame layout at time of blocking runtime call. 4810 // Note that we only have to preserve callee-saved registers since 4811 // the compilers are responsible for supplying a continuation point 4812 // if they expect all registers to be preserved. 4813 // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0 4814 enum layout { 4815 rfp_off = 0, 4816 rfp_off2, 4817 return_off, 4818 return_off2, 4819 framesize // inclusive of return address 4820 }; 4821 4822 int insts_size = 512; 4823 int locs_size = 64; 4824 4825 CodeBuffer code(name, insts_size, locs_size); 4826 OopMapSet* oop_maps = new OopMapSet(); 4827 MacroAssembler* masm = new MacroAssembler(&code); 4828 4829 address start = __ pc(); 4830 4831 // This is an inlined and slightly modified version of call_VM 4832 // which has the ability to fetch the return PC out of 4833 // thread-local storage and also sets up last_Java_sp slightly 4834 // differently than the real call_VM 4835 4836 __ enter(); // Save FP and LR before call 4837 4838 assert(is_even(framesize/2), "sp not 16-byte aligned"); 4839 4840 // lr and fp are already in place 4841 __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog 4842 4843 int frame_complete = __ pc() - start; 4844 4845 // Set up last_Java_sp and last_Java_fp 4846 address the_pc = __ pc(); 4847 __ set_last_Java_frame(sp, rfp, (address)NULL, rscratch1); 4848 4849 // Call runtime 4850 if (arg1 != noreg) { 4851 assert(arg2 != c_rarg1, "clobbered"); 4852 __ mov(c_rarg1, arg1); 4853 } 4854 if (arg2 != noreg) { 4855 __ mov(c_rarg2, arg2); 4856 } 4857 __ mov(c_rarg0, rthread); 4858 BLOCK_COMMENT("call runtime_entry"); 4859 __ mov(rscratch1, runtime_entry); 4860 __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1); 4861 4862 // Generate oop map 4863 OopMap* map = new OopMap(framesize, 0); 4864 4865 oop_maps->add_gc_map(the_pc - start, map); 4866 4867 __ reset_last_Java_frame(true); 4868 __ maybe_isb(); 4869 4870 __ leave(); 4871 4872 // check for pending exceptions 4873 #ifdef ASSERT 4874 Label L; 4875 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 4876 __ cbnz(rscratch1, L); 4877 __ should_not_reach_here(); 4878 __ bind(L); 4879 #endif // ASSERT 4880 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 4881 4882 4883 // codeBlob framesize is in words (not VMRegImpl::slot_size) 4884 RuntimeStub* stub = 4885 RuntimeStub::new_runtime_stub(name, 4886 &code, 4887 frame_complete, 4888 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 4889 oop_maps, false); 4890 return stub->entry_point(); 4891 } 4892 4893 class MontgomeryMultiplyGenerator : public MacroAssembler { 4894 4895 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 4896 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 4897 4898 RegSet _toSave; 4899 bool _squaring; 4900 4901 public: 4902 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 4903 : MacroAssembler(as->code()), _squaring(squaring) { 4904 4905 // Register allocation 4906 4907 Register reg = c_rarg0; 4908 Pa_base = reg; // Argument registers 4909 if (squaring) 4910 Pb_base = Pa_base; 4911 else 4912 Pb_base = ++reg; 4913 Pn_base = ++reg; 4914 Rlen= ++reg; 4915 inv = ++reg; 4916 Pm_base = ++reg; 4917 4918 // Working registers: 4919 Ra = ++reg; // The current digit of a, b, n, and m. 
4920 Rb = ++reg; 4921 Rm = ++reg; 4922 Rn = ++reg; 4923 4924 Pa = ++reg; // Pointers to the current/next digit of a, b, n, and m. 4925 Pb = ++reg; 4926 Pm = ++reg; 4927 Pn = ++reg; 4928 4929 t0 = ++reg; // Three registers which form a 4930 t1 = ++reg; // triple-precision accumuator. 4931 t2 = ++reg; 4932 4933 Ri = ++reg; // Inner and outer loop indexes. 4934 Rj = ++reg; 4935 4936 Rhi_ab = ++reg; // Product registers: low and high parts 4937 Rlo_ab = ++reg; // of a*b and m*n. 4938 Rhi_mn = ++reg; 4939 Rlo_mn = ++reg; 4940 4941 // r19 and up are callee-saved. 4942 _toSave = RegSet::range(r19, reg) + Pm_base; 4943 } 4944 4945 private: 4946 void save_regs() { 4947 push(_toSave, sp); 4948 } 4949 4950 void restore_regs() { 4951 pop(_toSave, sp); 4952 } 4953 4954 template <typename T> 4955 void unroll_2(Register count, T block) { 4956 Label loop, end, odd; 4957 tbnz(count, 0, odd); 4958 cbz(count, end); 4959 align(16); 4960 bind(loop); 4961 (this->*block)(); 4962 bind(odd); 4963 (this->*block)(); 4964 subs(count, count, 2); 4965 br(Assembler::GT, loop); 4966 bind(end); 4967 } 4968 4969 template <typename T> 4970 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 4971 Label loop, end, odd; 4972 tbnz(count, 0, odd); 4973 cbz(count, end); 4974 align(16); 4975 bind(loop); 4976 (this->*block)(d, s, tmp); 4977 bind(odd); 4978 (this->*block)(d, s, tmp); 4979 subs(count, count, 2); 4980 br(Assembler::GT, loop); 4981 bind(end); 4982 } 4983 4984 void pre1(RegisterOrConstant i) { 4985 block_comment("pre1"); 4986 // Pa = Pa_base; 4987 // Pb = Pb_base + i; 4988 // Pm = Pm_base; 4989 // Pn = Pn_base + i; 4990 // Ra = *Pa; 4991 // Rb = *Pb; 4992 // Rm = *Pm; 4993 // Rn = *Pn; 4994 ldr(Ra, Address(Pa_base)); 4995 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 4996 ldr(Rm, Address(Pm_base)); 4997 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4998 lea(Pa, Address(Pa_base)); 4999 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 5000 lea(Pm, Address(Pm_base)); 5001 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 5002 5003 // Zero the m*n result. 5004 mov(Rhi_mn, zr); 5005 mov(Rlo_mn, zr); 5006 } 5007 5008 // The core multiply-accumulate step of a Montgomery 5009 // multiplication. The idea is to schedule operations as a 5010 // pipeline so that instructions with long latencies (loads and 5011 // multiplies) have time to complete before their results are 5012 // used. This most benefits in-order implementations of the 5013 // architecture but out-of-order ones also benefit. 5014 void step() { 5015 block_comment("step"); 5016 // MACC(Ra, Rb, t0, t1, t2); 5017 // Ra = *++Pa; 5018 // Rb = *--Pb; 5019 umulh(Rhi_ab, Ra, Rb); 5020 mul(Rlo_ab, Ra, Rb); 5021 ldr(Ra, pre(Pa, wordSize)); 5022 ldr(Rb, pre(Pb, -wordSize)); 5023 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 5024 // previous iteration. 
5025 // MACC(Rm, Rn, t0, t1, t2); 5026 // Rm = *++Pm; 5027 // Rn = *--Pn; 5028 umulh(Rhi_mn, Rm, Rn); 5029 mul(Rlo_mn, Rm, Rn); 5030 ldr(Rm, pre(Pm, wordSize)); 5031 ldr(Rn, pre(Pn, -wordSize)); 5032 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 5033 } 5034 5035 void post1() { 5036 block_comment("post1"); 5037 5038 // MACC(Ra, Rb, t0, t1, t2); 5039 // Ra = *++Pa; 5040 // Rb = *--Pb; 5041 umulh(Rhi_ab, Ra, Rb); 5042 mul(Rlo_ab, Ra, Rb); 5043 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 5044 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 5045 5046 // *Pm = Rm = t0 * inv; 5047 mul(Rm, t0, inv); 5048 str(Rm, Address(Pm)); 5049 5050 // MACC(Rm, Rn, t0, t1, t2); 5051 // t0 = t1; t1 = t2; t2 = 0; 5052 umulh(Rhi_mn, Rm, Rn); 5053 5054 #ifndef PRODUCT 5055 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 5056 { 5057 mul(Rlo_mn, Rm, Rn); 5058 add(Rlo_mn, t0, Rlo_mn); 5059 Label ok; 5060 cbz(Rlo_mn, ok); { 5061 stop("broken Montgomery multiply"); 5062 } bind(ok); 5063 } 5064 #endif 5065 // We have very carefully set things up so that 5066 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 5067 // the lower half of Rm * Rn because we know the result already: 5068 // it must be -t0. t0 + (-t0) must generate a carry iff 5069 // t0 != 0. So, rather than do a mul and an adds we just set 5070 // the carry flag iff t0 is nonzero. 5071 // 5072 // mul(Rlo_mn, Rm, Rn); 5073 // adds(zr, t0, Rlo_mn); 5074 subs(zr, t0, 1); // Set carry iff t0 is nonzero 5075 adcs(t0, t1, Rhi_mn); 5076 adc(t1, t2, zr); 5077 mov(t2, zr); 5078 } 5079 5080 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 5081 block_comment("pre2"); 5082 // Pa = Pa_base + i-len; 5083 // Pb = Pb_base + len; 5084 // Pm = Pm_base + i-len; 5085 // Pn = Pn_base + len; 5086 5087 if (i.is_register()) { 5088 sub(Rj, i.as_register(), len); 5089 } else { 5090 mov(Rj, i.as_constant()); 5091 sub(Rj, Rj, len); 5092 } 5093 // Rj == i-len 5094 5095 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 5096 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 5097 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 5098 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 5099 5100 // Ra = *++Pa; 5101 // Rb = *--Pb; 5102 // Rm = *++Pm; 5103 // Rn = *--Pn; 5104 ldr(Ra, pre(Pa, wordSize)); 5105 ldr(Rb, pre(Pb, -wordSize)); 5106 ldr(Rm, pre(Pm, wordSize)); 5107 ldr(Rn, pre(Pn, -wordSize)); 5108 5109 mov(Rhi_mn, zr); 5110 mov(Rlo_mn, zr); 5111 } 5112 5113 void post2(RegisterOrConstant i, RegisterOrConstant len) { 5114 block_comment("post2"); 5115 if (i.is_constant()) { 5116 mov(Rj, i.as_constant()-len.as_constant()); 5117 } else { 5118 sub(Rj, i.as_register(), len); 5119 } 5120 5121 adds(t0, t0, Rlo_mn); // The pending m*n, low part 5122 5123 // As soon as we know the least significant digit of our result, 5124 // store it. 5125 // Pm_base[i-len] = t0; 5126 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 5127 5128 // t0 = t1; t1 = t2; t2 = 0; 5129 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 5130 adc(t1, t2, zr); 5131 mov(t2, zr); 5132 } 5133 5134 // A carry in t0 after Montgomery multiplication means that we 5135 // should subtract multiples of n from our result in m. We'll 5136 // keep doing that until there is no carry. 
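// (The accumulated result may exceed the modulus by a small multiple;
// the excess shows up as the carry in t0. A rough C sketch of the loop
// below -- subtract() is illustrative, not a real helper in this file:
//
//   while (t0 != 0) {
//     t0 -= subtract(m, n, len);  // m -= n; returns the final borrow
//   }
// )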

  void pre2(RegisterOrConstant i, RegisterOrConstant len) {
    block_comment("pre2");
    // Pa = Pa_base + i-len;
    // Pb = Pb_base + len;
    // Pm = Pm_base + i-len;
    // Pn = Pn_base + len;

    if (i.is_register()) {
      sub(Rj, i.as_register(), len);
    } else {
      mov(Rj, i.as_constant());
      sub(Rj, Rj, len);
    }
    // Rj == i-len

    lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord)));
    lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord)));
    lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));
    lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord)));

    // Ra = *++Pa;
    // Rb = *--Pb;
    // Rm = *++Pm;
    // Rn = *--Pn;
    ldr(Ra, pre(Pa, wordSize));
    ldr(Rb, pre(Pb, -wordSize));
    ldr(Rm, pre(Pm, wordSize));
    ldr(Rn, pre(Pn, -wordSize));

    mov(Rhi_mn, zr);
    mov(Rlo_mn, zr);
  }

  void post2(RegisterOrConstant i, RegisterOrConstant len) {
    block_comment("post2");
    if (i.is_constant()) {
      mov(Rj, i.as_constant()-len.as_constant());
    } else {
      sub(Rj, i.as_register(), len);
    }

    adds(t0, t0, Rlo_mn); // The pending m*n, low part

    // As soon as we know the least significant digit of our result,
    // store it.
    // Pm_base[i-len] = t0;
    str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord)));

    // t0 = t1; t1 = t2; t2 = 0;
    adcs(t0, t1, Rhi_mn); // The pending m*n, high part
    adc(t1, t2, zr);
    mov(t2, zr);
  }

  // A carry in t0 after Montgomery multiplication means that we
  // should subtract multiples of n from our result in m.  We'll
  // keep doing that until there is no carry.
  void normalize(RegisterOrConstant len) {
    block_comment("normalize");
    // while (t0)
    //   t0 = sub(Pm_base, Pn_base, t0, len);
    Label loop, post, again;
    Register cnt = t1, i = t2; // Re-use registers; we're done with them now
    cbz(t0, post); {
      bind(again); {
        mov(i, zr);
        mov(cnt, len);
        ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
        ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
        subs(zr, zr, zr); // set carry flag, i.e. no borrow
        align(16);
        bind(loop); {
          sbcs(Rm, Rm, Rn);
          str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
          add(i, i, 1);
          ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
          ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
          sub(cnt, cnt, 1);
        } cbnz(cnt, loop);
        sbc(t0, t0, zr);
      } cbnz(t0, again);
    } bind(post);
  }
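
  // For reference, the sub() helper in the comment above (and in the
  // C models further down) is not defined in this file.  It subtracts
  // n from m with a borrow chain and folds the final borrow into t0,
  // mirroring the sbcs loop.  A sketch of a hypothetical helper,
  // under the same __int128 assumption as the MACC sketch earlier:
  //
  // static unsigned long sub(unsigned long Pm_base[], unsigned long Pn_base[],
  //                          unsigned long t0, int len) {
  //   unsigned long borrow = 0;
  //   for (int i = 0; i < len; i++) {
  //     unsigned __int128 d = (unsigned __int128)Pm_base[i] - Pn_base[i] - borrow;
  //     Pm_base[i] = (unsigned long)d;
  //     borrow = (unsigned long)(d >> 64) & 1; // 1 iff the subtraction wrapped
  //   }
  //   return t0 - borrow;
  // }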

  // Move memory at s to d, reversing words.
  //    Increments d to end of copied memory
  //    Destroys tmp1, tmp2
  //    Preserves len
  //    Leaves s pointing to the address which was in d at start
  // Together with the ror-by-32 in reverse1 below, this reverses the
  // order of the 32-bit digits: our callers pass int arrays with the
  // most significant int first, whereas the Montgomery loops want the
  // least significant word first.
  void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
    assert(tmp1 < r19 && tmp2 < r19, "register corruption");

    lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
    mov(tmp1, len);
    unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
    sub(s, d, len, ext::uxtw, LogBytesPerWord);
  }
  // where
  void reverse1(Register d, Register s, Register tmp) {
    ldr(tmp, pre(s, -wordSize));
    ror(tmp, tmp, 32);
    str(tmp, post(d, wordSize));
  }

  void step_squaring() {
    // An extra ACC: each a*b cross product counts twice (it is the
    // MACC2 of the C model below).
    step();
    acc(Rhi_ab, Rlo_ab, t0, t1, t2);
  }

  void last_squaring(RegisterOrConstant i) {
    Label dont;
    // if ((i & 1) == 0) {
    tbnz(i.as_register(), 0, dont); {
      // MACC(Ra, Rb, t0, t1, t2);
      // Ra = *++Pa;
      // Rb = *--Pb;
      umulh(Rhi_ab, Ra, Rb);
      mul(Rlo_ab, Ra, Rb);
      acc(Rhi_ab, Rlo_ab, t0, t1, t2);
    } bind(dont);
  }

  void extra_step_squaring() {
    acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n

    // MACC(Rm, Rn, t0, t1, t2);
    // Rm = *++Pm;
    // Rn = *--Pn;
    umulh(Rhi_mn, Rm, Rn);
    mul(Rlo_mn, Rm, Rn);
    ldr(Rm, pre(Pm, wordSize));
    ldr(Rn, pre(Pn, -wordSize));
  }

  void post1_squaring() {
    acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n

    // *Pm = Rm = t0 * inv;
    mul(Rm, t0, inv);
    str(Rm, Address(Pm));

    // MACC(Rm, Rn, t0, t1, t2);
    // t0 = t1; t1 = t2; t2 = 0;
    umulh(Rhi_mn, Rm, Rn);

#ifndef PRODUCT
    // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
    {
      mul(Rlo_mn, Rm, Rn);
      add(Rlo_mn, t0, Rlo_mn);
      Label ok;
      cbz(Rlo_mn, ok); {
        stop("broken Montgomery multiply");
      } bind(ok);
    }
#endif
    // We have very carefully set things up so that
    // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
    // the lower half of Rm * Rn because we know the result already:
    // it must be -t0.  t0 + (-t0) must generate a carry iff
    // t0 != 0.  So, rather than do a mul and an adds we just set
    // the carry flag iff t0 is nonzero.  (See the worked example
    // after post1() above.)
    //
    // mul(Rlo_mn, Rm, Rn);
    // adds(zr, t0, Rlo_mn);
    subs(zr, t0, 1); // Set carry iff t0 is nonzero
    adcs(t0, t1, Rhi_mn);
    adc(t1, t2, zr);
    mov(t2, zr);
  }
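
  // The squaring C model below writes the doubled accumulation done
  // by step_squaring() as MACC2(A, B, t0, t1, t2), i.e.
  // t2:t1:t0 += 2 * A * B, exploiting the symmetry
  // a[i]*a[j] == a[j]*a[i] so that each cross product is computed
  // only once.  A sketch, under the same assumptions as the MACC
  // sketch earlier:
  //
  // static inline void MACC2(unsigned long A, unsigned long B,
  //                          unsigned long &t0, unsigned long &t1,
  //                          unsigned long &t2) {
  //   MACC(A, B, t0, t1, t2); // accumulate the product ...
  //   MACC(A, B, t0, t1, t2); // ... twice
  // }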

  void acc(Register Rhi, Register Rlo,
           Register t0, Register t1, Register t2) {
    adds(t0, t0, Rlo);
    adcs(t1, t1, Rhi);
    adc(t2, t2, zr);
  }

public:
  /**
   * Fast Montgomery multiplication.  The derivation of the
   * algorithm is in A Cryptographic Library for the Motorola
   * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
   *
   * Arguments:
   *
   * Inputs for multiplication:
   *   c_rarg0   - int array elements a
   *   c_rarg1   - int array elements b
   *   c_rarg2   - int array elements n (the modulus)
   *   c_rarg3   - int length
   *   c_rarg4   - int inv
   *   c_rarg5   - int array elements m (the result)
   *
   * Inputs for squaring:
   *   c_rarg0   - int array elements a
   *   c_rarg1   - int array elements n (the modulus)
   *   c_rarg2   - int length
   *   c_rarg3   - int inv
   *   c_rarg4   - int array elements m (the result)
   *
   */
  address generate_multiply() {
    Label argh, nothing;
    bind(argh);
    stop("MontgomeryMultiply total_allocation must be <= 8192");

    align(CodeEntryAlignment);
    address entry = pc();

    cbzw(Rlen, nothing);

    enter();

    // Make room on the stack for the reversed copies of the inputs
    // and the result: four arrays of Rlen ints.  Rlen is at most 512,
    // so the total allocation is at most 4 * 512 * sizeof(jint) ==
    // 8192 bytes; anything larger takes the argh exit above.
    cmpw(Rlen, 512);
    br(Assembler::HI, argh);
    sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
    andr(sp, Ra, -2 * wordSize);

    lsrw(Rlen, Rlen, 1); // length in longwords = len/2

    {
      // Copy input args, reversing as we go.  We use Ra as a
      // temporary variable.
      reverse(Ra, Pa_base, Rlen, t0, t1);
      if (!_squaring)
        reverse(Ra, Pb_base, Rlen, t0, t1);
      reverse(Ra, Pn_base, Rlen, t0, t1);
    }

    // Push all call-saved registers and also Pm_base which we'll need
    // at the end.
    save_regs();

#ifndef PRODUCT
    // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
    {
      ldr(Rn, Address(Pn_base, 0));
      mul(Rlo_mn, Rn, inv);
      subs(zr, Rlo_mn, -1);
      Label ok;
      br(EQ, ok); {
        stop("broken inverse in Montgomery multiply");
      } bind(ok);
    }
#endif

    mov(Pm_base, Ra);

    mov(t0, zr);
    mov(t1, zr);
    mov(t2, zr);

    block_comment("for (int i = 0; i < len; i++) {");
    mov(Ri, zr); {
      Label loop, end;
      cmpw(Ri, Rlen);
      br(Assembler::GE, end);

      bind(loop);
      pre1(Ri);

      block_comment("  for (j = i; j; j--) {"); {
        movw(Rj, Ri);
        unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
      } block_comment("  } // j");

      post1();
      addw(Ri, Ri, 1);
      cmpw(Ri, Rlen);
      br(Assembler::LT, loop);
      bind(end);
      block_comment("} // i");
    }

    block_comment("for (int i = len; i < 2*len; i++) {");
    mov(Ri, Rlen); {
      Label loop, end;
      cmpw(Ri, Rlen, Assembler::LSL, 1);
      br(Assembler::GE, end);

      bind(loop);
      pre2(Ri, Rlen);

      block_comment("  for (j = len*2-i-1; j; j--) {"); {
        lslw(Rj, Rlen, 1);
        subw(Rj, Rj, Ri);
        subw(Rj, Rj, 1);
        unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
      } block_comment("  } // j");

      post2(Ri, Rlen);
      addw(Ri, Ri, 1);
      cmpw(Ri, Rlen, Assembler::LSL, 1);
      br(Assembler::LT, loop);
      bind(end);
    }
    block_comment("} // i");

    normalize(Rlen);

    mov(Ra, Pm_base);  // Save Pm_base in Ra
    restore_regs();    // Restore caller's Pm_base

    // Copy our result into caller's Pm_base
    reverse(Pm_base, Ra, Rlen, t0, t1);

    leave();
    bind(nothing);
    ret(lr);

    return entry;
  }
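
  // In mathematical terms -- an explanatory note, not stated
  // elsewhere in this file: with word size w == 2^64, R == w^len
  // (len in 64-bit words), n odd, and inv == -n[0]^-1 (mod w), which
  // is what the non-product assertion above checks, the code above
  // computes m congruent to a * b * R^-1 (mod n).  This is the
  // standard Montgomery reduction of the Dusse-Kaliski reference.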
  // In C, approximately:

  // void
  // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
  //                     unsigned long Pn_base[], unsigned long Pm_base[],
  //                     unsigned long inv, int len) {
  //   unsigned long t0 = 0, t1 = 0, t2 = 0;  // Triple-precision accumulator
  //   unsigned long *Pa, *Pb, *Pn, *Pm;
  //   unsigned long Ra, Rb, Rn, Rm;

  //   int i;

  //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");

  //   for (i = 0; i < len; i++) {
  //     int j;

  //     Pa = Pa_base;
  //     Pb = Pb_base + i;
  //     Pm = Pm_base;
  //     Pn = Pn_base + i;

  //     Ra = *Pa;
  //     Rb = *Pb;
  //     Rm = *Pm;
  //     Rn = *Pn;

  //     int iters = i;
  //     for (j = 0; iters--; j++) {
  //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
  //       MACC(Ra, Rb, t0, t1, t2);
  //       Ra = *++Pa;
  //       Rb = *--Pb;
  //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
  //       MACC(Rm, Rn, t0, t1, t2);
  //       Rm = *++Pm;
  //       Rn = *--Pn;
  //     }

  //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
  //     MACC(Ra, Rb, t0, t1, t2);
  //     *Pm = Rm = t0 * inv;
  //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
  //     MACC(Rm, Rn, t0, t1, t2);

  //     assert(t0 == 0, "broken Montgomery multiply");

  //     t0 = t1; t1 = t2; t2 = 0;
  //   }

  //   for (i = len; i < 2*len; i++) {
  //     int j;

  //     Pa = Pa_base + i-len;
  //     Pb = Pb_base + len;
  //     Pm = Pm_base + i-len;
  //     Pn = Pn_base + len;

  //     Ra = *++Pa;
  //     Rb = *--Pb;
  //     Rm = *++Pm;
  //     Rn = *--Pn;

  //     int iters = len*2-i-1;
  //     for (j = i-len+1; iters--; j++) {
  //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
  //       MACC(Ra, Rb, t0, t1, t2);
  //       Ra = *++Pa;
  //       Rb = *--Pb;
  //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
  //       MACC(Rm, Rn, t0, t1, t2);
  //       Rm = *++Pm;
  //       Rn = *--Pn;
  //     }

  //     Pm_base[i-len] = t0;
  //     t0 = t1; t1 = t2; t2 = 0;
  //   }

  //   while (t0)
  //     t0 = sub(Pm_base, Pn_base, t0, len);
  // }
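
  // Why the squaring variant below saves work: a full Montgomery
  // multiply does about 2*len*len word multiplies (len*len for a*b
  // and len*len for m*n).  Squaring computes each a[i]*a[j] cross
  // product once and doubles it (MACC2), so the a-side costs about
  // len*len/2 multiplies while the m*n side is unchanged -- roughly
  // 1.5*len*len in total, which is where the asymptotic 25% figure
  // in the comment below comes from.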

  /**
   * Fast Montgomery squaring.  This uses asymptotically 25% fewer
   * multiplies than Montgomery multiplication so it should be up to
   * 25% faster.  However, its loop control is more complex and it
   * may actually run slower on some machines.
   *
   * Arguments:
   *
   * Inputs:
   *   c_rarg0   - int array elements a
   *   c_rarg1   - int array elements n (the modulus)
   *   c_rarg2   - int length
   *   c_rarg3   - int inv
   *   c_rarg4   - int array elements m (the result)
   *
   */
  address generate_square() {
    Label argh;
    bind(argh);
    stop("MontgomeryMultiply total_allocation must be <= 8192");

    align(CodeEntryAlignment);
    address entry = pc();

    enter();

    // Make room on the stack, as in generate_multiply() above:
    // at most 4 * 512 * sizeof(jint) == 8192 bytes.
    cmpw(Rlen, 512);
    br(Assembler::HI, argh);
    sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
    andr(sp, Ra, -2 * wordSize);

    lsrw(Rlen, Rlen, 1); // length in longwords = len/2

    {
      // Copy input args, reversing as we go.  We use Ra as a
      // temporary variable.
      reverse(Ra, Pa_base, Rlen, t0, t1);
      reverse(Ra, Pn_base, Rlen, t0, t1);
    }

    // Push all call-saved registers and also Pm_base which we'll need
    // at the end.
    save_regs();

    mov(Pm_base, Ra);

    mov(t0, zr);
    mov(t1, zr);
    mov(t2, zr);

    block_comment("for (int i = 0; i < len; i++) {");
    mov(Ri, zr); {
      Label loop, end;
      bind(loop);
      cmp(Ri, Rlen);
      br(Assembler::GE, end);

      pre1(Ri);

      block_comment("  for (j = (i+1)/2; j; j--) {"); {
        add(Rj, Ri, 1);
        lsr(Rj, Rj, 1);
        unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
      } block_comment("  } // j");

      last_squaring(Ri);

      block_comment("  for (j = i/2; j; j--) {"); {
        lsr(Rj, Ri, 1);
        unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
      } block_comment("  } // j");

      post1_squaring();
      add(Ri, Ri, 1);
      cmp(Ri, Rlen);
      br(Assembler::LT, loop);

      bind(end);
      block_comment("} // i");
    }

    block_comment("for (int i = len; i < 2*len; i++) {");
    mov(Ri, Rlen); {
      Label loop, end;
      bind(loop);
      cmp(Ri, Rlen, Assembler::LSL, 1);
      br(Assembler::GE, end);

      pre2(Ri, Rlen);

      block_comment("  for (j = (2*len-i-1)/2; j; j--) {"); {
        lsl(Rj, Rlen, 1);
        sub(Rj, Rj, Ri);
        sub(Rj, Rj, 1);
        lsr(Rj, Rj, 1);
        unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
      } block_comment("  } // j");

      last_squaring(Ri);

      block_comment("  for (j = (2*len-i)/2; j; j--) {"); {
        lsl(Rj, Rlen, 1);
        sub(Rj, Rj, Ri);
        lsr(Rj, Rj, 1);
        unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
      } block_comment("  } // j");

      post2(Ri, Rlen);
      add(Ri, Ri, 1);
      cmp(Ri, Rlen, Assembler::LSL, 1);

      br(Assembler::LT, loop);
      bind(end);
      block_comment("} // i");
    }

    normalize(Rlen);

    mov(Ra, Pm_base);  // Save Pm_base in Ra
    restore_regs();    // Restore caller's Pm_base

    // Copy our result into caller's Pm_base
    reverse(Pm_base, Ra, Rlen, t0, t1);

    leave();
    ret(lr);

    return entry;
  }
  // In C, approximately:

  // void
  // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
  //                   unsigned long Pm_base[], unsigned long inv, int len) {
  //   unsigned long t0 = 0, t1 = 0, t2 = 0;  // Triple-precision accumulator
  //   unsigned long *Pa, *Pb, *Pn, *Pm;
  //   unsigned long Ra, Rb, Rn, Rm;

  //   int i;

  //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");

  //   for (i = 0; i < len; i++) {
  //     int j;

  //     Pa = Pa_base;
  //     Pb = Pa_base + i;
  //     Pm = Pm_base;
  //     Pn = Pn_base + i;

  //     Ra = *Pa;
  //     Rb = *Pb;
  //     Rm = *Pm;
  //     Rn = *Pn;

  //     int iters = (i+1)/2;
  //     for (j = 0; iters--; j++) {
  //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
  //       MACC2(Ra, Rb, t0, t1, t2);
  //       Ra = *++Pa;
  //       Rb = *--Pb;
  //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
  //       MACC(Rm, Rn, t0, t1, t2);
  //       Rm = *++Pm;
  //       Rn = *--Pn;
  //     }
  //     if ((i & 1) == 0) {
  //       assert(Ra == Pa_base[j], "must be");
  //       MACC(Ra, Ra, t0, t1, t2);
  //     }
  //     iters = i/2;
  //     assert(iters == i-j, "must be");
  //     for (; iters--; j++) {
  //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
  //       MACC(Rm, Rn, t0, t1, t2);
  //       Rm = *++Pm;
  //       Rn = *--Pn;
  //     }

  //     *Pm = Rm = t0 * inv;
  //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
  //     MACC(Rm, Rn, t0, t1, t2);

  //     assert(t0 == 0, "broken Montgomery multiply");

  //     t0 = t1; t1 = t2; t2 = 0;
  //   }

  //   for (i = len; i < 2*len; i++) {
  //     int start = i-len+1;
  //     int end = start + (len - start)/2;
  //     int j;

  //     Pa = Pa_base + i-len;
  //     Pb = Pa_base + len;
  //     Pm = Pm_base + i-len;
  //     Pn = Pn_base + len;

  //     Ra = *++Pa;
  //     Rb = *--Pb;
  //     Rm = *++Pm;
  //     Rn = *--Pn;

  //     int iters = (2*len-i-1)/2;
  //     assert(iters == end-start, "must be");
  //     for (j = start; iters--; j++) {
  //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
  //       MACC2(Ra, Rb, t0, t1, t2);
  //       Ra = *++Pa;
  //       Rb = *--Pb;
  //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
  //       MACC(Rm, Rn, t0, t1, t2);
  //       Rm = *++Pm;
  //       Rn = *--Pn;
  //     }
  //     if ((i & 1) == 0) {
  //       assert(Ra == Pa_base[j], "must be");
  //       MACC(Ra, Ra, t0, t1, t2);
  //     }
  //     iters = (2*len-i)/2;
  //     assert(iters == len-j, "must be");
  //     for (; iters--; j++) {
  //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
  //       MACC(Rm, Rn, t0, t1, t2);
  //       Rm = *++Pm;
  //       Rn = *--Pn;
  //     }
  //     Pm_base[i-len] = t0;
  //     t0 = t1; t1 = t2; t2 = 0;
  //   }

  //   while (t0)
  //     t0 = sub(Pm_base, Pn_base, t0, len);
  // }
};

// Initialization
void generate_initial() {
  // Generate initial stubs and initialize the entry points

  // Entry points that exist in all platforms.  Note: this is code
  // that could be shared among different platforms - however the
  // benefit seems to be smaller than the disadvantage of having a
  // much more complicated generator structure.  See also the comment
  // in stubRoutines.hpp.

  StubRoutines::_forward_exception_entry = generate_forward_exception();

  StubRoutines::_call_stub_entry =
    generate_call_stub(StubRoutines::_call_stub_return_address);

  // is referenced by megamorphic call
  StubRoutines::_catch_exception_entry = generate_catch_exception();

  // Build this early so it's available for the interpreter.
  StubRoutines::_throw_StackOverflowError_entry =
    generate_throw_exception("StackOverflowError throw_exception",
                             CAST_FROM_FN_PTR(address,
                                              SharedRuntime::throw_StackOverflowError));
  StubRoutines::_throw_delayed_StackOverflowError_entry =
    generate_throw_exception("delayed StackOverflowError throw_exception",
                             CAST_FROM_FN_PTR(address,
                                              SharedRuntime::throw_delayed_StackOverflowError));
  if (UseCRC32Intrinsics) {
    // Set the table address before generating the stubs that use it.
    StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
    StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
  }

  if (UseCRC32CIntrinsics) {
    StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
  }

  // Disabled until JDK-8210858 is fixed
  // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
  //   StubRoutines::_dlog = generate_dlog();
  // }

  if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
    StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
  }

  if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
    StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
  }
}

void generate_all() {
  // support for verify_oop (must happen after universe_init)
  StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
  StubRoutines::_throw_AbstractMethodError_entry =
    generate_throw_exception("AbstractMethodError throw_exception",
                             CAST_FROM_FN_PTR(address,
                                              SharedRuntime::throw_AbstractMethodError));

  StubRoutines::_throw_IncompatibleClassChangeError_entry =
    generate_throw_exception("IncompatibleClassChangeError throw_exception",
                             CAST_FROM_FN_PTR(address,
                                              SharedRuntime::throw_IncompatibleClassChangeError));

  StubRoutines::_throw_NullPointerException_at_call_entry =
    generate_throw_exception("NullPointerException at call throw_exception",
                             CAST_FROM_FN_PTR(address,
                                              SharedRuntime::throw_NullPointerException_at_call));

  // arraycopy stubs used by compilers
  generate_arraycopy_stubs();

  // has_negatives stub for large arrays.
  StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);

  // array_equals stub for large arrays.
  if (!UseSimpleArrayEquals) {
    StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
  }

  generate_compare_long_strings();

  generate_string_indexof_stubs();

  // byte_array_inflate stub for large arrays.
  StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();

#ifdef COMPILER2
  if (UseMultiplyToLenIntrinsic) {
    StubRoutines::_multiplyToLen = generate_multiplyToLen();
  }

  if (UseSquareToLenIntrinsic) {
    StubRoutines::_squareToLen = generate_squareToLen();
  }

  if (UseMulAddIntrinsic) {
    StubRoutines::_mulAdd = generate_mulAdd();
  }

  if (UseMontgomeryMultiplyIntrinsic) {
    StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
    MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
    StubRoutines::_montgomeryMultiply = g.generate_multiply();
  }

  if (UseMontgomerySquareIntrinsic) {
    StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
    MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
    // We use generate_multiply() rather than generate_square()
    // because it's faster for the sizes of modulus we care about.
    StubRoutines::_montgomerySquare = g.generate_multiply();
  }
#endif // COMPILER2

#ifndef BUILTIN_SIM
  // generate GHASH intrinsics code
  if (UseGHASHIntrinsics) {
    StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
  }

  if (UseAESIntrinsics) {
    StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
    StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
    StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
    StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
  }

  if (UseSHA1Intrinsics) {
    StubRoutines::_sha1_implCompress   = generate_sha1_implCompress(false, "sha1_implCompress");
    StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true,  "sha1_implCompressMB");
  }
  if (UseSHA256Intrinsics) {
    StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
    StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
  }

  // generate Adler32 intrinsics code
  if (UseAdler32Intrinsics) {
    StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
  }

  // Safefetch stubs.
  generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
                     &StubRoutines::_safefetch32_fault_pc,
                     &StubRoutines::_safefetch32_continuation_pc);
  generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
                     &StubRoutines::_safefetchN_fault_pc,
                     &StubRoutines::_safefetchN_continuation_pc);
#endif
  StubRoutines::aarch64::set_completed();
}

public:
  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
    if (all) {
      generate_all();
    } else {
      generate_initial();
    }
  }
}; // end class declaration

void StubGenerator_generate(CodeBuffer* code, bool all) {
  StubGenerator g(code, all);
}