/*
 * Copyright (c) 2003, 2019, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2019, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "utilities/align.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif

#ifdef BUILTIN_SIM
#include "../../../../../../simulator/simulator.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->
#define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
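// n.b. TIMES_OOP scales an index register by the in-heap oop element size:
// 4 bytes with compressed oops, 8 bytes otherwise, expressed as a
// sign-extended-word (sxtw) scaled addressing mode.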

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(int& counter) {
    __ lea(rscratch2, ExternalAddress((address)&counter));
    __ ldrw(rscratch1, Address(rscratch2));
    __ addw(rscratch1, rscratch1, 1);
    __ strw(rscratch1, Address(rscratch2));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    c_rarg6:   parameter size (in words)              int
  //    c_rarg7:   thread                                 Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the c arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -27 [ argument word 1      ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper    (r0) ]
  //  -7 [ result          (r1) ]
  //  -6 [ result type     (r2) ]
  //  -5 [ method          (r3) ]
  //  -4 [ entry point     (r4) ]
  //  -3 [ parameters      (r5) ]
  //  -2 [ parameter size  (r6) ]
  //  -1 [ thread          (r7) ]
  //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  //   1 [ saved lr       (r30) ]

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off  = -26,

    d15_off            = -26,
    d13_off            = -24,
    d11_off            = -22,
    d9_off             = -20,

    r28_off            = -18,
    r26_off            = -16,
    r24_off            = -14,
    r22_off            = -12,
    r20_off            = -10,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0,
    retaddr_off        =   1,
  };

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    const Address sp_after_call(rfp, sp_after_call_off * wordSize);

    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off  * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);

    // stub code

    // we need a C prolog to bootstrap the x86 caller into the sim
    __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);

    address aarch64_entry = __ pc();

#ifdef BUILTIN_SIM
    // Save sender's SP for stack traces.
    __ mov(rscratch1, sp);
    __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
#endif
    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);
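
    // n.b. sp_after_call_off is -26, so sp now sits 26 words below rfp,
    // just past the register save area laid out in the frame diagram above.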

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5,  entry_point);
    __ stp(c_rarg2, c_rarg3,  result_type);
    __ stp(c_rarg0, c_rarg1,  call_wrapper);

    __ stp(r20, r19,   r20_save);
    __ stp(r22, r21,   r22_save);
    __ stp(r24, r23,   r24_save);
    __ stp(r26, r25,   r26_save);
    __ stp(r28, r27,   r28_save);

    __ stpd(v9,  v8,   d9_save);
    __ stpd(v11, v10,  d11_save);
    __ stpd(v13, v12,  d13_save);
    __ stpd(v15, v14,  d15_save);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (u1)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing Method*, and current sp
    //      rmethod: Method*
    //      r13: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r13, sp);
    __ blr(c_rarg4);

    // tell the simulator we have returned to the stub

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    if (NotifySimulator) {
      __ notify(Assembler::method_reentry);
    }
    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    __ ldr(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ldr(j_rarg1, result_type);
    __ cmp(j_rarg1, (u1)T_OBJECT);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, (u1)T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, (u1)T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(j_rarg1, (u1)T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    // restore callee-save registers
    __ ldpd(v15, v14,   d15_save);
    __ ldpd(v13, v12,   d13_save);
    __ ldpd(v11, v10,   d11_save);
    __ ldpd(v9,  v8,    d9_save);

    __ ldp(r28, r27,   r28_save);
    __ ldp(r26, r25,   r26_save);
    __ ldp(r24, r23,   r24_save);
    __ ldp(r22, r21,   r22_save);
    __ ldp(r20, r19,   r20_save);

    __ ldp(c_rarg0, c_rarg1,  call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3, method);
    __ ldp(c_rarg4, c_rarg5,  entry_point);
    __ ldp(c_rarg6, c_rarg7,  parameter_size);

#ifndef PRODUCT
    // tell the simulator we are about to end Java execution
    if (NotifySimulator) {
      __ notify(Assembler::method_exit);
    }
#endif
    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT

    __ BIND(is_long);
    __ str(r0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // rsp.
  //
  // r0: exception oop

  // NOTE: this is used as a target from the signal handler so it
  // needs an x86 prolog which returns into the current simulator
  // executing the generated catch_exception code. so the prolog
  // needs to install rax in a sim register and adjust the sim's
  // restart pc to enter the generated code at the start position
  // then return from native to simulated execution.

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread        (rfp, thread_off         * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);
    __ mov(r19, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {

    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is NULL it is OK

    // Check if the oop is in the right area of memory
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
    __ andr(c_rarg2, r0, c_rarg3);
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());

    // Compare c_rarg2 and c_rarg3.  We don't use a compare
    // instruction here because the flags register is live.
    __ eor(c_rarg2, c_rarg2, c_rarg3);
    __ cbnz(c_rarg2, error);

    // make sure klass is 'reasonable', which is not zero.
    __ load_klass(r0, r0); // get klass
    __ cbz(r0, error);     // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blrt(rscratch1, 3, 0, 1);

    return start;
  }

  void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }
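
  // n.b. on AArch64 this helper emits no real overlap check: it simply
  // branches to the no-overlap label. The conjoint copy stubs choose
  // forwards vs. backwards themselves by comparing (d - s) with the copy
  // size, so in this file the helper only backs the ASSERT-time sanity
  // check in generate_checkcast_copy below.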

  // The inner part of zero_words().  This is the bulk operation,
  // zeroing words in blocks, possibly using DC ZVA to do it.  The
  // caller is responsible for zeroing the last few words.
  //
  // Inputs:
  // r10: the HeapWord-aligned base address of an array to zero.
  // r11: the count in HeapWords, r11 > 0.
  //
  // Returns r10 and r11, adjusted for the caller to clear.
  // r10: the base address of the tail of words left to clear.
  // r11: the number of words in the tail.
  //      r11 < MacroAssembler::zero_words_block_size.

  address generate_zero_blocks() {
    Label done;
    Label base_aligned;

    Register base = r10, cnt = r11;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "zero_blocks");
    address start = __ pc();

    if (UseBlockZeroing) {
      int zva_length = VM_Version::zva_length();

      // Ensure ZVA length can be divided by 16. This is required by
      // the subsequent operations.
      assert (zva_length % 16 == 0, "Unexpected ZVA Length");

      __ tbz(base, 3, base_aligned);
      __ str(zr, Address(__ post(base, 8)));
      __ sub(cnt, cnt, 1);
      __ bind(base_aligned);

      // Ensure count >= zva_length * 2 so that it still deserves a zva after
      // alignment.
      Label small;
      int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
      __ subs(rscratch1, cnt, low_limit >> 3);
      __ br(Assembler::LT, small);
      __ zero_dcache_blocks(base, cnt);
      __ bind(small);
    }

    {
      // Number of stp instructions we'll unroll
      const int unroll =
        MacroAssembler::zero_words_block_size / 2;
      // Clear the remaining blocks.
      Label loop;
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::LT, done);
      __ bind(loop);
      for (int i = 0; i < unroll; i++)
        __ stp(zr, zr, __ post(base, 16));
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::GE, loop);
      __ bind(done);
      __ add(cnt, cnt, unroll * 2);
    }

    __ ret(lr);

    return start;
  }


  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(Label &start, Register s, Register d, Register count,
                           copy_direction direction) {
    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4 : 2) * wordSize;
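    // n.b. bias pre-adjusts s and d for forward copies so that the first
    // ldp/stp of each 64-byte block (at offset 2 * unit, or 4 * unit when
    // using ldpq/stpq) lands at offset 0, while the final pre-indexed
    // access advances the pointer by the full 8 words.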

    int offset;
    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r10, t6 = r11, t7 = r12;
    const Register stride = r13;

    assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1);

    Label again, drain;
    const char *stub_name;
    if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
    else
      stub_name = "backward_copy_longs";

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", stub_name);

    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, (u1)8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
       use_stride = prefetch > 256;
       prefetch = -prefetch;
       if (use_stride) __ mov(stride, prefetch);
    }
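
    // The main loop below is software pipelined: each iteration stores the
    // 8 words loaded on the previous iteration (or by the initial fill
    // above) while loading the next 8. count was biased by 16 above to
    // account for the 8 words already in registers plus the 8 words each
    // loop trip loads ahead; the drain block flushes the final 8 words.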

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
        __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
      } else {
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        __ stp(t0, t1, Address(d, 2 * unit));
        __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.

      if (direction == copy_forwards) {
        __ sub(s, s, 16);
        __ sub(d, d, 8);
      }

      // Fill 8 registers
      //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 bit block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
      //
      // t0 at offset 0,  t1 at offset 8
      // t2 at offset 16, t3 at offset 24
      // t4 at offset 32, t5 at offset 40
      // t6 at offset 48, t7 at offset 56

      // for backwards copy s was not offset so the register contents
      // are at these offsets into the preceding 64 byte block
      // relative to that original input and so on for each successive
      // preceding 64 byte block when s is updated. this explains the
      // slightly counter-intuitive looking pattern of register usage
      // in the stp instructions for backwards copy.
      //
      // t0 at offset -16, t1 at offset -8
      // t2 at offset -32, t3 at offset -24
      // t4 at offset -48, t5 at offset -40
      // t6 at offset -64, t7 at offset -56

      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));

      __ subs(count, count, 16);
      __ br(Assembler::LO, drain);

      int prefetch = PrefetchCopyIntervalInBytes;
      bool use_stride = false;
      if (direction == copy_backwards) {
         use_stride = prefetch > 256;
         prefetch = -prefetch;
         if (use_stride) __ mov(stride, prefetch);
      }

      __ bind(again);

      if (PrefetchCopyIntervalInBytes > 0)
        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
        // allowing for the offset of -8 the store instructions place
        // registers into the target 64 bit block at the following
        // offsets
        //
        // t0 at offset 0
        // t1 at offset 8,  t2 at offset 16
        // t3 at offset 24, t4 at offset 32
        // t5 at offset 40, t6 at offset 48
        // t7 at offset 56

        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
        // d was not offset when we started so the registers are
        // written into the 64 bit block preceding d with the following
        // offsets
        //
        // t1 at offset -8
        // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
        // t7 at offset -56, t4 at offset -48
        // t6 at offset -64
        //
        // note that this matches the offsets previously noted for the
        // loads

        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
      } else {
        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
      }
      // now we need to copy any remaining part block which may
      // include a 4 word block subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ stp(t1, t2, Address(d, 2 * unit));
          __ str(t3, Address(__ pre(d, 4 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ stp(t3, t0, Address(d, 3 * unit));
          __ str(t2, Address(__ pre(d, 4 * unit)));
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ str(t1, Address(__ pre(d, 2 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ str(t0, Address(__ pre(d, 2 * unit)));
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;
    int unit = wordSize * direction;

    Label Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.
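
    // The low bits of count select which residual chunks remain: testing
    // bit (3 - log2(granularity)) asks whether 8 bytes are left, the next
    // lower bit whether 4 bytes are left, and so on down to a single byte.
    // Each test copies its chunk and falls through to the next smaller one.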

    __ tbz(count, 3 - exact_log2(granularity), Lword);
    __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
    __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }

  Label copy_f, copy_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //

  void copy_memory(bool is_aligned, Register s, Register d,
                   Register count, Register tmp, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    // <= 96 bytes do inline. Direction doesn't matter because we always
    // load all the data before writing anything
    Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
    const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
    const Register send = r17, dend = r18;

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
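
    // Small copies are dispatched to the size buckets below. send and dend
    // are set to one past the end of the source and destination so that
    // each bucket can cover its whole size range with a pair of possibly
    // overlapping accesses: one anchored at the start, one at a negative
    // offset from the end. This avoids branching on the exact count.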
    __ cmp(count, u1((UseSIMDForMemoryOps ? 96 : 80) / granularity));
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16/granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64/granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32/granularity));
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 0));
      __ ldpq(v2, v3, Address(send, -32));
      __ stpq(v0, v1, Address(d, 0));
      __ stpq(v2, v3, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(send, -32));
      __ ldp(t6, t7, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(dend, -32));
      __ stp(t6, t7, Address(dend, -16));
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    __ ldp(t0, t1, Address(s, 0));
    __ ldp(t2, t3, Address(send, -16));
    __ stp(t0, t1, Address(d, 0));
    __ stp(t2, t3, Address(dend, -16));
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
      __ ldpq(v4, v5, Address(send, -32));
      __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
      __ stpq(v4, v5, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(s, 32));
      __ ldp(t6, t7, Address(s, 48));
      __ ldp(t8, t9, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(d, 32));
      __ stp(t6, t7, Address(d, 48));
      __ stp(t8, t9, Address(dend, -16));
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, u1(8/granularity));
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    __ ldr(t0, Address(s, 0));
    __ ldr(t1, Address(send, -8));
    __ str(t0, Address(d, 0));
    __ str(t1, Address(dend, -8));
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean in the 1 byte case we load/store the same
          // byte 3 times.
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }

    __ bind(copy_big);
    if (is_backwards) {
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now we've got the small case out of the way we can align the
    // source address on a 2-word boundary.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
      __ tbz(s, exact_log2(wordSize), aligned);
      __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
      __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
      __ sub(count, count, wordSize/granularity);
    } else {
      if (is_backwards) {
        __ andr(rscratch2, s, 2 * wordSize - 1);
      } else {
        __ neg(rscratch2, s);
        __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
      }
      // rscratch2 is the byte adjustment needed to align s.
      __ cbz(rscratch2, aligned);
      int shift = exact_log2(granularity);
      if (shift)  __ lsr(rscratch2, rscratch2, shift);
      __ sub(count, count, rscratch2);

#if 0
      // ?? This code is only correct for a disjoint copy.  It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, rscratch2);
        __ sub(d, d, rscratch2);
      } else {
        __ add(s, s, rscratch2);
        __ add(d, d, rscratch2);
      }
#else
      copy_memory_small(s, d, rscratch2, rscratch1, step);
#endif
    }

    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes.  Adjust the
    // count and do a bulk copy of words.
    __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
    if (direction == copy_forwards)
      __ bl(copy_f);
    else
      __ bl(copy_b);

    // And the tail.
    copy_memory_small(s, d, count, tmp, step);

    if (granularity >= 8) __ bind(copy8);
    if (granularity >= 4) __ bind(copy4);
    __ bind(finish);
  }


  void clobber_registers() {
#ifdef ASSERT
    __ mov(rscratch1, (uint64_t)0xdeadbeef);
    __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
    for (Register r = r3; r <= r18; r++)
      if (r != rscratch1) __ mov(r, rscratch1);
#endif
  }

  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers rscratch1 and rscratch2.
  void verify_oop_array (size_t size, Register a, Register count, Register temp) {
    Label loop, end;
    __ mov(rscratch1, a);
    __ mov(rscratch2, zr);
    __ bind(loop);
    __ cmp(rscratch2, count);
    __ br(Assembler::HS, end);
    if (size == (size_t)wordSize) {
      __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ verify_oop(temp);
    } else {
      __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ decode_heap_oop(temp); // calls verify_oop
    }
    __ add(rscratch2, rscratch2, size);
    __ b(loop);
    __ bind(end);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
                                 const char *name, bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_reg = RegSet::of(s, d, count);
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_reg);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    copy_memory(aligned, s, d, count, rscratch1, size);

    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
    }

    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());

    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
                                 address *entry, const char *name,
                                 bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_regs = RegSet::of(s, d, count);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // use fwd copy when (d-s) above_equal (count*size)
    __ sub(rscratch1, d, s);
    __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
    __ br(Assembler::HS, nooverlap_target);

    DecoratorSet decorators = IN_HEAP | IS_ARRAY;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_regs);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    copy_memory(aligned, s, d, count, rscratch1, -size);
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
    }
    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
  //
  address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                      address* entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_short_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_short_copy().
  //
  address generate_disjoint_short_copy(bool aligned,
                                       address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
                                       address *entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);

  }
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_int_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
                                     address *entry, const char *name,
                                     bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
  }


  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_long_copy(bool aligned, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_long_copy(bool aligned,
                                      address nooverlap_target, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_oop_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_oop_copy(bool aligned,
                                     address nooverlap_target, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
                                  name, dest_uninitialized);
  }


  // Helper for generating a dynamic type check.
  // Smashes rscratch1, rscratch2.
  void generate_type_check(Register sub_klass,
                           Register super_check_offset,
                           Register super_klass,
                           Label& L_success) {
    assert_different_registers(sub_klass, super_check_offset, super_klass);

    BLOCK_COMMENT("type_check:");

    Label L_miss;

    __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,        &L_success, &L_miss, NULL,
                                     super_check_offset);
    __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);

    // Fall through on failure!
    __ BIND(L_miss);
  }

  //
  //  Generate checkcasting array copy stub
  //
  //  Input:
  //    c_rarg0   - source array address
  //    c_rarg1   - destination array address
  //    c_rarg2   - element count, treated as ssize_t, can be zero
  //    c_rarg3   - size_t ckoff (super_check_offset)
  //    c_rarg4   - oop ckval (super_klass)
  //
  //  Output:
  //    r0 ==  0  -  success
  //    r0 == -1^K - failure, where K is partial transfer count
  //
  address generate_checkcast_copy(const char *name, address *entry,
                                  bool dest_uninitialized = false) {

    Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;

    // Input registers (after setup_arg_regs)
    const Register from        = c_rarg0;   // source array address
    const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // elements count
    const Register ckoff       = c_rarg3;   // super_check_offset
    const Register ckval       = c_rarg4;   // super_klass

    RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
    RegSet wb_post_saved_regs = RegSet::of(count);

    // Registers used as temps (r18, r19, r20 are save-on-entry)
    const Register count_save  = r21;       // orig elements count
    const Register start_to    = r20;       // destination array start address
    const Register copied_oop  = r18;       // actual oop copied
    const Register r19_klass   = r19;       // oop._klass

    //---------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the two arrays are subtypes of Object[] but the
    // destination array type is not equal to or a supertype
    // of the source type.  Each element must be separately
    // checked.
1746 1747 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1748 copied_oop, r19_klass, count_save); 1749 1750 __ align(CodeEntryAlignment); 1751 StubCodeMark mark(this, "StubRoutines", name); 1752 address start = __ pc(); 1753 1754 __ enter(); // required for proper stackwalking of RuntimeStub frame 1755 1756 #ifdef ASSERT 1757 // caller guarantees that the arrays really are different 1758 // otherwise, we would have to make conjoint checks 1759 { Label L; 1760 array_overlap_test(L, TIMES_OOP); 1761 __ stop("checkcast_copy within a single array"); 1762 __ bind(L); 1763 } 1764 #endif //ASSERT 1765 1766 // Caller of this entry point must set up the argument registers. 1767 if (entry != NULL) { 1768 *entry = __ pc(); 1769 BLOCK_COMMENT("Entry:"); 1770 } 1771 1772 // Empty array: Nothing to do. 1773 __ cbz(count, L_done); 1774 1775 __ push(RegSet::of(r18, r19, r20, r21), sp); 1776 1777 #ifdef ASSERT 1778 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1779 // The ckoff and ckval must be mutually consistent, 1780 // even though caller generates both. 1781 { Label L; 1782 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1783 __ ldrw(start_to, Address(ckval, sco_offset)); 1784 __ cmpw(ckoff, start_to); 1785 __ br(Assembler::EQ, L); 1786 __ stop("super_check_offset inconsistent"); 1787 __ bind(L); 1788 } 1789 #endif //ASSERT 1790 1791 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 1792 bool is_oop = true; 1793 if (dest_uninitialized) { 1794 decorators |= IS_DEST_UNINITIALIZED; 1795 } 1796 1797 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1798 bs->arraycopy_prologue(_masm, decorators, is_oop, to, count, wb_pre_saved_regs); 1799 1800 // save the original count 1801 __ mov(count_save, count); 1802 1803 // Copy from low to high addresses 1804 __ mov(start_to, to); // Save destination array start address 1805 __ b(L_load_element); 1806 1807 // ======== begin loop ======== 1808 // (Loop is rotated; its entry is L_load_element.) 1809 // Loop control: 1810 // for (; count != 0; count--) { 1811 // copied_oop = load_heap_oop(from++); 1812 // ... generate_type_check ...; 1813 // store_heap_oop(to++, copied_oop); 1814 // } 1815 __ align(OptoLoopAlignment); 1816 1817 __ BIND(L_store_element); 1818 __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW); // store the oop 1819 __ sub(count, count, 1); 1820 __ cbz(count, L_do_card_marks); 1821 1822 // ======== loop entry is here ======== 1823 __ BIND(L_load_element); 1824 __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop 1825 __ cbz(copied_oop, L_store_element); 1826 1827 __ load_klass(r19_klass, copied_oop);// query the object klass 1828 generate_type_check(r19_klass, ckoff, ckval, L_store_element); 1829 // ======== end loop ======== 1830 1831 // It was a real error; we must depend on the caller to finish the job. 1832 // Register count = remaining oops, count_orig = total oops. 1833 // Emit GC store barriers for the oops we have copied and report 1834 // their number to the caller. 
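    // The failure return value encodes the number of elements actually
    // copied, K, as -1 ^ K (i.e. ~K): a copy that fails after 3 elements
    // returns -4 in r0, while a fully successful copy leaves count == 0
    // and returns 0 through L_done. The caller can recover K as ~r0.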
1835 1836 __ subs(count, count_save, count); // K = partially copied oop count 1837 __ eon(count, count, zr); // report (-1^K) to caller 1838 __ br(Assembler::EQ, L_done_pop); 1839 1840 __ BIND(L_do_card_marks); 1841 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs); 1842 1843 __ bind(L_done_pop); 1844 __ pop(RegSet::of(r18, r19, r20, r21), sp); 1845 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 1846 1847 __ bind(L_done); 1848 __ mov(r0, count); 1849 __ leave(); 1850 __ ret(lr); 1851 1852 return start; 1853 } 1854 1855 // Perform range checks on the proposed arraycopy. 1856 // Kills temp, but nothing else. 1857 // Also, clean the sign bits of src_pos and dst_pos. 1858 void arraycopy_range_checks(Register src, // source array oop (c_rarg0) 1859 Register src_pos, // source position (c_rarg1) 1860 Register dst, // destination array oo (c_rarg2) 1861 Register dst_pos, // destination position (c_rarg3) 1862 Register length, 1863 Register temp, 1864 Label& L_failed) { 1865 BLOCK_COMMENT("arraycopy_range_checks:"); 1866 1867 assert_different_registers(rscratch1, temp); 1868 1869 // if (src_pos + length > arrayOop(src)->length()) FAIL; 1870 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes())); 1871 __ addw(temp, length, src_pos); 1872 __ cmpw(temp, rscratch1); 1873 __ br(Assembler::HI, L_failed); 1874 1875 // if (dst_pos + length > arrayOop(dst)->length()) FAIL; 1876 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes())); 1877 __ addw(temp, length, dst_pos); 1878 __ cmpw(temp, rscratch1); 1879 __ br(Assembler::HI, L_failed); 1880 1881 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'. 1882 __ movw(src_pos, src_pos); 1883 __ movw(dst_pos, dst_pos); 1884 1885 BLOCK_COMMENT("arraycopy_range_checks done"); 1886 } 1887 1888 // These stubs get called from some dumb test routine. 1889 // I'll write them properly when they're called from 1890 // something that's actually doing something. 1891 static void fake_arraycopy_stub(address src, address dst, int count) { 1892 assert(count == 0, "huh?"); 1893 } 1894 1895 1896 // 1897 // Generate 'unsafe' array copy stub 1898 // Though just as safe as the other stubs, it takes an unscaled 1899 // size_t argument instead of an element count. 1900 // 1901 // Input: 1902 // c_rarg0 - source array address 1903 // c_rarg1 - destination array address 1904 // c_rarg2 - byte count, treated as ssize_t, can be zero 1905 // 1906 // Examines the alignment of the operands and dispatches 1907 // to a long, int, short, or byte copy loop. 
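  // For example, a 64-byte request whose source, destination and byte
  // count are all multiples of 8 branches to the long copy entry with
  // count = 64 / 8 = 8, while a request where any of the three values
  // is odd falls through to the byte copy entry with the count unchanged.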
1908 // 1909 address generate_unsafe_copy(const char *name, 1910 address byte_copy_entry, 1911 address short_copy_entry, 1912 address int_copy_entry, 1913 address long_copy_entry) { 1914 Label L_long_aligned, L_int_aligned, L_short_aligned; 1915 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1916 1917 __ align(CodeEntryAlignment); 1918 StubCodeMark mark(this, "StubRoutines", name); 1919 address start = __ pc(); 1920 __ enter(); // required for proper stackwalking of RuntimeStub frame 1921 1922 // bump this on entry, not on exit: 1923 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 1924 1925 __ orr(rscratch1, s, d); 1926 __ orr(rscratch1, rscratch1, count); 1927 1928 __ andr(rscratch1, rscratch1, BytesPerLong-1); 1929 __ cbz(rscratch1, L_long_aligned); 1930 __ andr(rscratch1, rscratch1, BytesPerInt-1); 1931 __ cbz(rscratch1, L_int_aligned); 1932 __ tbz(rscratch1, 0, L_short_aligned); 1933 __ b(RuntimeAddress(byte_copy_entry)); 1934 1935 __ BIND(L_short_aligned); 1936 __ lsr(count, count, LogBytesPerShort); // size => short_count 1937 __ b(RuntimeAddress(short_copy_entry)); 1938 __ BIND(L_int_aligned); 1939 __ lsr(count, count, LogBytesPerInt); // size => int_count 1940 __ b(RuntimeAddress(int_copy_entry)); 1941 __ BIND(L_long_aligned); 1942 __ lsr(count, count, LogBytesPerLong); // size => long_count 1943 __ b(RuntimeAddress(long_copy_entry)); 1944 1945 return start; 1946 } 1947 1948 // 1949 // Generate generic array copy stubs 1950 // 1951 // Input: 1952 // c_rarg0 - src oop 1953 // c_rarg1 - src_pos (32-bits) 1954 // c_rarg2 - dst oop 1955 // c_rarg3 - dst_pos (32-bits) 1956 // c_rarg4 - element count (32-bits) 1957 // 1958 // Output: 1959 // r0 == 0 - success 1960 // r0 == -1^K - failure, where K is partial transfer count 1961 // 1962 address generate_generic_copy(const char *name, 1963 address byte_copy_entry, address short_copy_entry, 1964 address int_copy_entry, address oop_copy_entry, 1965 address long_copy_entry, address checkcast_copy_entry) { 1966 1967 Label L_failed, L_objArray; 1968 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 1969 1970 // Input registers 1971 const Register src = c_rarg0; // source array oop 1972 const Register src_pos = c_rarg1; // source position 1973 const Register dst = c_rarg2; // destination array oop 1974 const Register dst_pos = c_rarg3; // destination position 1975 const Register length = c_rarg4; 1976 1977 1978 // Registers used as temps 1979 const Register dst_klass = c_rarg5; 1980 1981 __ align(CodeEntryAlignment); 1982 1983 StubCodeMark mark(this, "StubRoutines", name); 1984 1985 address start = __ pc(); 1986 1987 __ enter(); // required for proper stackwalking of RuntimeStub frame 1988 1989 // bump this on entry, not on exit: 1990 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 1991 1992 //----------------------------------------------------------------------- 1993 // Assembler stub will be used for this call to arraycopy 1994 // if the following conditions are met: 1995 // 1996 // (1) src and dst must not be null. 1997 // (2) src_pos must not be negative. 1998 // (3) dst_pos must not be negative. 1999 // (4) length must not be negative. 2000 // (5) src klass and dst klass should be the same and not NULL. 2001 // (6) src and dst should be arrays. 2002 // (7) src_pos + length must not exceed length of src. 2003 // (8) dst_pos + length must not exceed length of dst. 
2004 // 2005 2006 // if (src == NULL) return -1; 2007 __ cbz(src, L_failed); 2008 2009 // if (src_pos < 0) return -1; 2010 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 2011 2012 // if (dst == NULL) return -1; 2013 __ cbz(dst, L_failed); 2014 2015 // if (dst_pos < 0) return -1; 2016 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2017 2018 // registers used as temp 2019 const Register scratch_length = r16; // elements count to copy 2020 const Register scratch_src_klass = r17; // array klass 2021 const Register lh = r18; // layout helper 2022 2023 // if (length < 0) return -1; 2024 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2025 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2026 2027 __ load_klass(scratch_src_klass, src); 2028 #ifdef ASSERT 2029 // assert(src->klass() != NULL); 2030 { 2031 BLOCK_COMMENT("assert klasses not null {"); 2032 Label L1, L2; 2033 __ cbnz(scratch_src_klass, L2); // it is broken if klass is NULL 2034 __ bind(L1); 2035 __ stop("broken null klass"); 2036 __ bind(L2); 2037 __ load_klass(rscratch1, dst); 2038 __ cbz(rscratch1, L1); // this would be broken also 2039 BLOCK_COMMENT("} assert klasses not null done"); 2040 } 2041 #endif 2042 2043 // Load layout helper (32-bits) 2044 // 2045 // |array_tag| | header_size | element_type | |log2_element_size| 2046 // 32 30 24 16 8 2 0 2047 // 2048 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2049 // 2050 2051 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2052 2053 // Handle objArrays completely differently... 2054 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2055 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2056 __ movw(rscratch1, objArray_lh); 2057 __ eorw(rscratch2, lh, rscratch1); 2058 __ cbzw(rscratch2, L_objArray); 2059 2060 // if (src->klass() != dst->klass()) return -1; 2061 __ load_klass(rscratch2, dst); 2062 __ eor(rscratch2, rscratch2, scratch_src_klass); 2063 __ cbnz(rscratch2, L_failed); 2064 2065 // if (!src->is_Array()) return -1; 2066 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2067 2068 // At this point, it is known to be a typeArray (array_tag 0x3). 
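    // From here the layout helper supplies everything needed for the copy:
    // its header-size field gives the array base offset added to src/dst
    // below, and its low log2_element_size bits (0..3) drive the bitwise
    // binary search that selects the byte, short, int or long copy loop.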
2069 #ifdef ASSERT 2070 { 2071 BLOCK_COMMENT("assert primitive array {"); 2072 Label L; 2073 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2074 __ cmpw(lh, rscratch2); 2075 __ br(Assembler::GE, L); 2076 __ stop("must be a primitive array"); 2077 __ bind(L); 2078 BLOCK_COMMENT("} assert primitive array done"); 2079 } 2080 #endif 2081 2082 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2083 rscratch2, L_failed); 2084 2085 // TypeArrayKlass 2086 // 2087 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2088 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2089 // 2090 2091 const Register rscratch1_offset = rscratch1; // array offset 2092 const Register r18_elsize = lh; // element size 2093 2094 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2095 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2096 __ add(src, src, rscratch1_offset); // src array offset 2097 __ add(dst, dst, rscratch1_offset); // dst array offset 2098 BLOCK_COMMENT("choose copy loop based on element size"); 2099 2100 // next registers should be set before the jump to corresponding stub 2101 const Register from = c_rarg0; // source array address 2102 const Register to = c_rarg1; // destination array address 2103 const Register count = c_rarg2; // elements count 2104 2105 // 'from', 'to', 'count' registers should be set in such order 2106 // since they are the same as 'src', 'src_pos', 'dst'. 2107 2108 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2109 2110 // The possible values of elsize are 0-3, i.e. exact_log2(element 2111 // size in bytes). We do a simple bitwise binary search. 2112 __ BIND(L_copy_bytes); 2113 __ tbnz(r18_elsize, 1, L_copy_ints); 2114 __ tbnz(r18_elsize, 0, L_copy_shorts); 2115 __ lea(from, Address(src, src_pos));// src_addr 2116 __ lea(to, Address(dst, dst_pos));// dst_addr 2117 __ movw(count, scratch_length); // length 2118 __ b(RuntimeAddress(byte_copy_entry)); 2119 2120 __ BIND(L_copy_shorts); 2121 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2122 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2123 __ movw(count, scratch_length); // length 2124 __ b(RuntimeAddress(short_copy_entry)); 2125 2126 __ BIND(L_copy_ints); 2127 __ tbnz(r18_elsize, 0, L_copy_longs); 2128 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2129 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2130 __ movw(count, scratch_length); // length 2131 __ b(RuntimeAddress(int_copy_entry)); 2132 2133 __ BIND(L_copy_longs); 2134 #ifdef ASSERT 2135 { 2136 BLOCK_COMMENT("assert long copy {"); 2137 Label L; 2138 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize 2139 __ cmpw(r18_elsize, LogBytesPerLong); 2140 __ br(Assembler::EQ, L); 2141 __ stop("must be long copy, but elsize is wrong"); 2142 __ bind(L); 2143 BLOCK_COMMENT("} assert long copy done"); 2144 } 2145 #endif 2146 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2147 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2148 __ movw(count, scratch_length); // length 2149 __ b(RuntimeAddress(long_copy_entry)); 2150 2151 // ObjArrayKlass 2152 __ BIND(L_objArray); 2153 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2154 2155 Label L_plain_copy, L_checkcast_copy; 2156 // test array classes for subtyping 2157 __ load_klass(r18, dst); 2158 __ cmp(scratch_src_klass, r18); // usual case is exact 
equality 2159 __ br(Assembler::NE, L_checkcast_copy); 2160 2161 // Identically typed arrays can be copied without element-wise checks. 2162 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2163 rscratch2, L_failed); 2164 2165 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2166 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2167 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2168 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2169 __ movw(count, scratch_length); // length 2170 __ BIND(L_plain_copy); 2171 __ b(RuntimeAddress(oop_copy_entry)); 2172 2173 __ BIND(L_checkcast_copy); 2174 // live at this point: scratch_src_klass, scratch_length, r18 (dst_klass) 2175 { 2176 // Before looking at dst.length, make sure dst is also an objArray. 2177 __ ldrw(rscratch1, Address(r18, lh_offset)); 2178 __ movw(rscratch2, objArray_lh); 2179 __ eorw(rscratch1, rscratch1, rscratch2); 2180 __ cbnzw(rscratch1, L_failed); 2181 2182 // It is safe to examine both src.length and dst.length. 2183 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2184 r18, L_failed); 2185 2186 __ load_klass(dst_klass, dst); // reload 2187 2188 // Marshal the base address arguments now, freeing registers. 2189 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2190 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2191 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2192 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2193 __ movw(count, length); // length (reloaded) 2194 Register sco_temp = c_rarg3; // this register is free now 2195 assert_different_registers(from, to, count, sco_temp, 2196 dst_klass, scratch_src_klass); 2197 // assert_clean_int(count, sco_temp); 2198 2199 // Generate the type check. 2200 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2201 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2202 2203 // Smashes rscratch1, rscratch2 2204 generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy); 2205 2206 // Fetch destination element klass from the ObjArrayKlass header. 2207 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2208 __ ldr(dst_klass, Address(dst_klass, ek_offset)); 2209 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2210 2211 // the checkcast_copy loop needs two extra arguments: 2212 assert(c_rarg3 == sco_temp, "#3 already in place"); 2213 // Set up arguments for checkcast_copy_entry. 2214 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass 2215 __ b(RuntimeAddress(checkcast_copy_entry)); 2216 } 2217 2218 __ BIND(L_failed); 2219 __ mov(r0, -1); 2220 __ leave(); // required for proper stackwalking of RuntimeStub frame 2221 __ ret(lr); 2222 2223 return start; 2224 } 2225 2226 // 2227 // Generate stub for array fill. If "aligned" is true, the 2228 // "to" address is assumed to be heapword aligned. 
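  // The value is replicated to 64 bits and written a word at a time;
  // unaligned heads and sub-word tails are handled by element-sized
  // stores, as laid out below.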
2229 // 2230 // Arguments for generated stub: 2231 // to: c_rarg0 2232 // value: c_rarg1 2233 // count: c_rarg2 treated as signed 2234 // 2235 address generate_fill(BasicType t, bool aligned, const char *name) { 2236 __ align(CodeEntryAlignment); 2237 StubCodeMark mark(this, "StubRoutines", name); 2238 address start = __ pc(); 2239 2240 BLOCK_COMMENT("Entry:"); 2241 2242 const Register to = c_rarg0; // source array address 2243 const Register value = c_rarg1; // value 2244 const Register count = c_rarg2; // elements count 2245 2246 const Register bz_base = r10; // base for block_zero routine 2247 const Register cnt_words = r11; // temp register 2248 2249 __ enter(); 2250 2251 Label L_fill_elements, L_exit1; 2252 2253 int shift = -1; 2254 switch (t) { 2255 case T_BYTE: 2256 shift = 0; 2257 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2258 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2259 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2260 __ br(Assembler::LO, L_fill_elements); 2261 break; 2262 case T_SHORT: 2263 shift = 1; 2264 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2265 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2266 __ br(Assembler::LO, L_fill_elements); 2267 break; 2268 case T_INT: 2269 shift = 2; 2270 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2271 __ br(Assembler::LO, L_fill_elements); 2272 break; 2273 default: ShouldNotReachHere(); 2274 } 2275 2276 // Align source address at 8 bytes address boundary. 2277 Label L_skip_align1, L_skip_align2, L_skip_align4; 2278 if (!aligned) { 2279 switch (t) { 2280 case T_BYTE: 2281 // One byte misalignment happens only for byte arrays. 2282 __ tbz(to, 0, L_skip_align1); 2283 __ strb(value, Address(__ post(to, 1))); 2284 __ subw(count, count, 1); 2285 __ bind(L_skip_align1); 2286 // Fallthrough 2287 case T_SHORT: 2288 // Two bytes misalignment happens only for byte and short (char) arrays. 2289 __ tbz(to, 1, L_skip_align2); 2290 __ strh(value, Address(__ post(to, 2))); 2291 __ subw(count, count, 2 >> shift); 2292 __ bind(L_skip_align2); 2293 // Fallthrough 2294 case T_INT: 2295 // Align to 8 bytes, we know we are 4 byte aligned to start. 2296 __ tbz(to, 2, L_skip_align4); 2297 __ strw(value, Address(__ post(to, 4))); 2298 __ subw(count, count, 4 >> shift); 2299 __ bind(L_skip_align4); 2300 break; 2301 default: ShouldNotReachHere(); 2302 } 2303 } 2304 2305 // 2306 // Fill large chunks 2307 // 2308 __ lsrw(cnt_words, count, 3 - shift); // number of words 2309 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2310 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2311 if (UseBlockZeroing) { 2312 Label non_block_zeroing, rest; 2313 // If the fill value is zero we can use the fast zero_words(). 2314 __ cbnz(value, non_block_zeroing); 2315 __ mov(bz_base, to); 2316 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2317 __ zero_words(bz_base, cnt_words); 2318 __ b(rest); 2319 __ bind(non_block_zeroing); 2320 __ fill_words(to, cnt_words, value); 2321 __ bind(rest); 2322 } else { 2323 __ fill_words(to, cnt_words, value); 2324 } 2325 2326 // Remaining count is less than 8 bytes. Fill it by a single store. 2327 // Note that the total length is no less than 8 bytes. 
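    // For example, a 13-byte T_BYTE fill with an 8-byte aligned 'to' writes
    // 8 bytes in fill_words above and covers the remaining 5 with a single
    // unaligned 8-byte store backed up to end at the last element,
    // harmlessly re-writing 3 already-filled bytes with the same value.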
2328 if (t == T_BYTE || t == T_SHORT) { 2329 Label L_exit1; 2330 __ cbzw(count, L_exit1); 2331 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2332 __ str(value, Address(to, -8)); // overwrite some elements 2333 __ bind(L_exit1); 2334 __ leave(); 2335 __ ret(lr); 2336 } 2337 2338 // Handle copies less than 8 bytes. 2339 Label L_fill_2, L_fill_4, L_exit2; 2340 __ bind(L_fill_elements); 2341 switch (t) { 2342 case T_BYTE: 2343 __ tbz(count, 0, L_fill_2); 2344 __ strb(value, Address(__ post(to, 1))); 2345 __ bind(L_fill_2); 2346 __ tbz(count, 1, L_fill_4); 2347 __ strh(value, Address(__ post(to, 2))); 2348 __ bind(L_fill_4); 2349 __ tbz(count, 2, L_exit2); 2350 __ strw(value, Address(to)); 2351 break; 2352 case T_SHORT: 2353 __ tbz(count, 0, L_fill_4); 2354 __ strh(value, Address(__ post(to, 2))); 2355 __ bind(L_fill_4); 2356 __ tbz(count, 1, L_exit2); 2357 __ strw(value, Address(to)); 2358 break; 2359 case T_INT: 2360 __ cbzw(count, L_exit2); 2361 __ strw(value, Address(to)); 2362 break; 2363 default: ShouldNotReachHere(); 2364 } 2365 __ bind(L_exit2); 2366 __ leave(); 2367 __ ret(lr); 2368 return start; 2369 } 2370 2371 void generate_arraycopy_stubs() { 2372 address entry; 2373 address entry_jbyte_arraycopy; 2374 address entry_jshort_arraycopy; 2375 address entry_jint_arraycopy; 2376 address entry_oop_arraycopy; 2377 address entry_jlong_arraycopy; 2378 address entry_checkcast_arraycopy; 2379 2380 generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards); 2381 generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards); 2382 2383 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2384 2385 //*** jbyte 2386 // Always need aligned and unaligned versions 2387 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2388 "jbyte_disjoint_arraycopy"); 2389 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2390 &entry_jbyte_arraycopy, 2391 "jbyte_arraycopy"); 2392 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2393 "arrayof_jbyte_disjoint_arraycopy"); 2394 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL, 2395 "arrayof_jbyte_arraycopy"); 2396 2397 //*** jshort 2398 // Always need aligned and unaligned versions 2399 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2400 "jshort_disjoint_arraycopy"); 2401 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2402 &entry_jshort_arraycopy, 2403 "jshort_arraycopy"); 2404 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, 2405 "arrayof_jshort_disjoint_arraycopy"); 2406 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL, 2407 "arrayof_jshort_arraycopy"); 2408 2409 //*** jint 2410 // Aligned versions 2411 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2412 "arrayof_jint_disjoint_arraycopy"); 2413 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2414 "arrayof_jint_arraycopy"); 2415 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 
2416 // entry_jint_arraycopy always points to the unaligned version 2417 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2418 "jint_disjoint_arraycopy"); 2419 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2420 &entry_jint_arraycopy, 2421 "jint_arraycopy"); 2422 2423 //*** jlong 2424 // It is always aligned 2425 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2426 "arrayof_jlong_disjoint_arraycopy"); 2427 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2428 "arrayof_jlong_arraycopy"); 2429 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2430 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2431 2432 //*** oops 2433 { 2434 // With compressed oops we need unaligned versions; notice that 2435 // we overwrite entry_oop_arraycopy. 2436 bool aligned = !UseCompressedOops; 2437 2438 StubRoutines::_arrayof_oop_disjoint_arraycopy 2439 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2440 /*dest_uninitialized*/false); 2441 StubRoutines::_arrayof_oop_arraycopy 2442 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2443 /*dest_uninitialized*/false); 2444 // Aligned versions without pre-barriers 2445 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2446 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2447 /*dest_uninitialized*/true); 2448 StubRoutines::_arrayof_oop_arraycopy_uninit 2449 = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit", 2450 /*dest_uninitialized*/true); 2451 } 2452 2453 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2454 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2455 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2456 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2457 2458 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2459 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, 2460 /*dest_uninitialized*/true); 2461 2462 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2463 entry_jbyte_arraycopy, 2464 entry_jshort_arraycopy, 2465 entry_jint_arraycopy, 2466 entry_jlong_arraycopy); 2467 2468 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2469 entry_jbyte_arraycopy, 2470 entry_jshort_arraycopy, 2471 entry_jint_arraycopy, 2472 entry_oop_arraycopy, 2473 entry_jlong_arraycopy, 2474 entry_checkcast_arraycopy); 2475 2476 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 2477 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2478 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2479 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2480 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 2481 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2482 } 2483 2484 void generate_math_stubs() { Unimplemented(); } 2485 2486 // Arguments: 2487 // 2488 // Inputs: 2489 // c_rarg0 - source byte array address 2490 // c_rarg1 - destination 
byte array address 2491 // c_rarg2 - K (key) in little endian int array 2492 // 2493 address generate_aescrypt_encryptBlock() { 2494 __ align(CodeEntryAlignment); 2495 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 2496 2497 Label L_doLast; 2498 2499 const Register from = c_rarg0; // source array address 2500 const Register to = c_rarg1; // destination array address 2501 const Register key = c_rarg2; // key array address 2502 const Register keylen = rscratch1; 2503 2504 address start = __ pc(); 2505 __ enter(); 2506 2507 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2508 2509 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2510 2511 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2512 __ rev32(v1, __ T16B, v1); 2513 __ rev32(v2, __ T16B, v2); 2514 __ rev32(v3, __ T16B, v3); 2515 __ rev32(v4, __ T16B, v4); 2516 __ aese(v0, v1); 2517 __ aesmc(v0, v0); 2518 __ aese(v0, v2); 2519 __ aesmc(v0, v0); 2520 __ aese(v0, v3); 2521 __ aesmc(v0, v0); 2522 __ aese(v0, v4); 2523 __ aesmc(v0, v0); 2524 2525 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2526 __ rev32(v1, __ T16B, v1); 2527 __ rev32(v2, __ T16B, v2); 2528 __ rev32(v3, __ T16B, v3); 2529 __ rev32(v4, __ T16B, v4); 2530 __ aese(v0, v1); 2531 __ aesmc(v0, v0); 2532 __ aese(v0, v2); 2533 __ aesmc(v0, v0); 2534 __ aese(v0, v3); 2535 __ aesmc(v0, v0); 2536 __ aese(v0, v4); 2537 __ aesmc(v0, v0); 2538 2539 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2540 __ rev32(v1, __ T16B, v1); 2541 __ rev32(v2, __ T16B, v2); 2542 2543 __ cmpw(keylen, 44); 2544 __ br(Assembler::EQ, L_doLast); 2545 2546 __ aese(v0, v1); 2547 __ aesmc(v0, v0); 2548 __ aese(v0, v2); 2549 __ aesmc(v0, v0); 2550 2551 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2552 __ rev32(v1, __ T16B, v1); 2553 __ rev32(v2, __ T16B, v2); 2554 2555 __ cmpw(keylen, 52); 2556 __ br(Assembler::EQ, L_doLast); 2557 2558 __ aese(v0, v1); 2559 __ aesmc(v0, v0); 2560 __ aese(v0, v2); 2561 __ aesmc(v0, v0); 2562 2563 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2564 __ rev32(v1, __ T16B, v1); 2565 __ rev32(v2, __ T16B, v2); 2566 2567 __ BIND(L_doLast); 2568 2569 __ aese(v0, v1); 2570 __ aesmc(v0, v0); 2571 __ aese(v0, v2); 2572 2573 __ ld1(v1, __ T16B, key); 2574 __ rev32(v1, __ T16B, v1); 2575 __ eor(v0, __ T16B, v0, v1); 2576 2577 __ st1(v0, __ T16B, to); 2578 2579 __ mov(r0, 0); 2580 2581 __ leave(); 2582 __ ret(lr); 2583 2584 return start; 2585 } 2586 2587 // Arguments: 2588 // 2589 // Inputs: 2590 // c_rarg0 - source byte array address 2591 // c_rarg1 - destination byte array address 2592 // c_rarg2 - K (key) in little endian int array 2593 // 2594 address generate_aescrypt_decryptBlock() { 2595 assert(UseAES, "need AES instructions and misaligned SSE support"); 2596 __ align(CodeEntryAlignment); 2597 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 2598 Label L_doLast; 2599 2600 const Register from = c_rarg0; // source array address 2601 const Register to = c_rarg1; // destination array address 2602 const Register key = c_rarg2; // key array address 2603 const Register keylen = rscratch1; 2604 2605 address start = __ pc(); 2606 __ enter(); // required for proper stackwalking of RuntimeStub frame 2607 2608 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2609 2610 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2611 2612 __ ld1(v5, __ T16B, __ post(key, 16)); 2613 __ rev32(v5, __ T16B, v5); 2614 2615 __ ld1(v1, v2, v3, v4, 
__ T16B, __ post(key, 64)); 2616 __ rev32(v1, __ T16B, v1); 2617 __ rev32(v2, __ T16B, v2); 2618 __ rev32(v3, __ T16B, v3); 2619 __ rev32(v4, __ T16B, v4); 2620 __ aesd(v0, v1); 2621 __ aesimc(v0, v0); 2622 __ aesd(v0, v2); 2623 __ aesimc(v0, v0); 2624 __ aesd(v0, v3); 2625 __ aesimc(v0, v0); 2626 __ aesd(v0, v4); 2627 __ aesimc(v0, v0); 2628 2629 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2630 __ rev32(v1, __ T16B, v1); 2631 __ rev32(v2, __ T16B, v2); 2632 __ rev32(v3, __ T16B, v3); 2633 __ rev32(v4, __ T16B, v4); 2634 __ aesd(v0, v1); 2635 __ aesimc(v0, v0); 2636 __ aesd(v0, v2); 2637 __ aesimc(v0, v0); 2638 __ aesd(v0, v3); 2639 __ aesimc(v0, v0); 2640 __ aesd(v0, v4); 2641 __ aesimc(v0, v0); 2642 2643 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2644 __ rev32(v1, __ T16B, v1); 2645 __ rev32(v2, __ T16B, v2); 2646 2647 __ cmpw(keylen, 44); 2648 __ br(Assembler::EQ, L_doLast); 2649 2650 __ aesd(v0, v1); 2651 __ aesimc(v0, v0); 2652 __ aesd(v0, v2); 2653 __ aesimc(v0, v0); 2654 2655 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2656 __ rev32(v1, __ T16B, v1); 2657 __ rev32(v2, __ T16B, v2); 2658 2659 __ cmpw(keylen, 52); 2660 __ br(Assembler::EQ, L_doLast); 2661 2662 __ aesd(v0, v1); 2663 __ aesimc(v0, v0); 2664 __ aesd(v0, v2); 2665 __ aesimc(v0, v0); 2666 2667 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2668 __ rev32(v1, __ T16B, v1); 2669 __ rev32(v2, __ T16B, v2); 2670 2671 __ BIND(L_doLast); 2672 2673 __ aesd(v0, v1); 2674 __ aesimc(v0, v0); 2675 __ aesd(v0, v2); 2676 2677 __ eor(v0, __ T16B, v0, v5); 2678 2679 __ st1(v0, __ T16B, to); 2680 2681 __ mov(r0, 0); 2682 2683 __ leave(); 2684 __ ret(lr); 2685 2686 return start; 2687 } 2688 2689 // Arguments: 2690 // 2691 // Inputs: 2692 // c_rarg0 - source byte array address 2693 // c_rarg1 - destination byte array address 2694 // c_rarg2 - K (key) in little endian int array 2695 // c_rarg3 - r vector byte array address 2696 // c_rarg4 - input length 2697 // 2698 // Output: 2699 // x0 - input length 2700 // 2701 address generate_cipherBlockChaining_encryptAESCrypt() { 2702 assert(UseAES, "need AES instructions and misaligned SSE support"); 2703 __ align(CodeEntryAlignment); 2704 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 2705 2706 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2707 2708 const Register from = c_rarg0; // source array address 2709 const Register to = c_rarg1; // destination array address 2710 const Register key = c_rarg2; // key array address 2711 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2712 // and left with the results of the last encryption block 2713 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2714 const Register keylen = rscratch1; 2715 2716 address start = __ pc(); 2717 2718 __ enter(); 2719 2720 __ movw(rscratch2, len_reg); 2721 2722 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2723 2724 __ ld1(v0, __ T16B, rvec); 2725 2726 __ cmpw(keylen, 52); 2727 __ br(Assembler::CC, L_loadkeys_44); 2728 __ br(Assembler::EQ, L_loadkeys_52); 2729 2730 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2731 __ rev32(v17, __ T16B, v17); 2732 __ rev32(v18, __ T16B, v18); 2733 __ BIND(L_loadkeys_52); 2734 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2735 __ rev32(v19, __ T16B, v19); 2736 __ rev32(v20, __ T16B, v20); 2737 __ BIND(L_loadkeys_44); 2738 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2739 __ rev32(v21, __ 
T16B, v21); 2740 __ rev32(v22, __ T16B, v22); 2741 __ rev32(v23, __ T16B, v23); 2742 __ rev32(v24, __ T16B, v24); 2743 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2744 __ rev32(v25, __ T16B, v25); 2745 __ rev32(v26, __ T16B, v26); 2746 __ rev32(v27, __ T16B, v27); 2747 __ rev32(v28, __ T16B, v28); 2748 __ ld1(v29, v30, v31, __ T16B, key); 2749 __ rev32(v29, __ T16B, v29); 2750 __ rev32(v30, __ T16B, v30); 2751 __ rev32(v31, __ T16B, v31); 2752 2753 __ BIND(L_aes_loop); 2754 __ ld1(v1, __ T16B, __ post(from, 16)); 2755 __ eor(v0, __ T16B, v0, v1); 2756 2757 __ br(Assembler::CC, L_rounds_44); 2758 __ br(Assembler::EQ, L_rounds_52); 2759 2760 __ aese(v0, v17); __ aesmc(v0, v0); 2761 __ aese(v0, v18); __ aesmc(v0, v0); 2762 __ BIND(L_rounds_52); 2763 __ aese(v0, v19); __ aesmc(v0, v0); 2764 __ aese(v0, v20); __ aesmc(v0, v0); 2765 __ BIND(L_rounds_44); 2766 __ aese(v0, v21); __ aesmc(v0, v0); 2767 __ aese(v0, v22); __ aesmc(v0, v0); 2768 __ aese(v0, v23); __ aesmc(v0, v0); 2769 __ aese(v0, v24); __ aesmc(v0, v0); 2770 __ aese(v0, v25); __ aesmc(v0, v0); 2771 __ aese(v0, v26); __ aesmc(v0, v0); 2772 __ aese(v0, v27); __ aesmc(v0, v0); 2773 __ aese(v0, v28); __ aesmc(v0, v0); 2774 __ aese(v0, v29); __ aesmc(v0, v0); 2775 __ aese(v0, v30); 2776 __ eor(v0, __ T16B, v0, v31); 2777 2778 __ st1(v0, __ T16B, __ post(to, 16)); 2779 2780 __ subw(len_reg, len_reg, 16); 2781 __ cbnzw(len_reg, L_aes_loop); 2782 2783 __ st1(v0, __ T16B, rvec); 2784 2785 __ mov(r0, rscratch2); 2786 2787 __ leave(); 2788 __ ret(lr); 2789 2790 return start; 2791 } 2792 2793 // Arguments: 2794 // 2795 // Inputs: 2796 // c_rarg0 - source byte array address 2797 // c_rarg1 - destination byte array address 2798 // c_rarg2 - K (key) in little endian int array 2799 // c_rarg3 - r vector byte array address 2800 // c_rarg4 - input length 2801 // 2802 // Output: 2803 // r0 - input length 2804 // 2805 address generate_cipherBlockChaining_decryptAESCrypt() { 2806 assert(UseAES, "need AES instructions and misaligned SSE support"); 2807 __ align(CodeEntryAlignment); 2808 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 2809 2810 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2811 2812 const Register from = c_rarg0; // source array address 2813 const Register to = c_rarg1; // destination array address 2814 const Register key = c_rarg2; // key array address 2815 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2816 // and left with the results of the last encryption block 2817 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2818 const Register keylen = rscratch1; 2819 2820 address start = __ pc(); 2821 2822 __ enter(); 2823 2824 __ movw(rscratch2, len_reg); 2825 2826 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2827 2828 __ ld1(v2, __ T16B, rvec); 2829 2830 __ ld1(v31, __ T16B, __ post(key, 16)); 2831 __ rev32(v31, __ T16B, v31); 2832 2833 __ cmpw(keylen, 52); 2834 __ br(Assembler::CC, L_loadkeys_44); 2835 __ br(Assembler::EQ, L_loadkeys_52); 2836 2837 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2838 __ rev32(v17, __ T16B, v17); 2839 __ rev32(v18, __ T16B, v18); 2840 __ BIND(L_loadkeys_52); 2841 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2842 __ rev32(v19, __ T16B, v19); 2843 __ rev32(v20, __ T16B, v20); 2844 __ BIND(L_loadkeys_44); 2845 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2846 __ rev32(v21, __ T16B, v21); 2847 
__ rev32(v22, __ T16B, v22); 2848 __ rev32(v23, __ T16B, v23); 2849 __ rev32(v24, __ T16B, v24); 2850 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2851 __ rev32(v25, __ T16B, v25); 2852 __ rev32(v26, __ T16B, v26); 2853 __ rev32(v27, __ T16B, v27); 2854 __ rev32(v28, __ T16B, v28); 2855 __ ld1(v29, v30, __ T16B, key); 2856 __ rev32(v29, __ T16B, v29); 2857 __ rev32(v30, __ T16B, v30); 2858 2859 __ BIND(L_aes_loop); 2860 __ ld1(v0, __ T16B, __ post(from, 16)); 2861 __ orr(v1, __ T16B, v0, v0); 2862 2863 __ br(Assembler::CC, L_rounds_44); 2864 __ br(Assembler::EQ, L_rounds_52); 2865 2866 __ aesd(v0, v17); __ aesimc(v0, v0); 2867 __ aesd(v0, v18); __ aesimc(v0, v0); 2868 __ BIND(L_rounds_52); 2869 __ aesd(v0, v19); __ aesimc(v0, v0); 2870 __ aesd(v0, v20); __ aesimc(v0, v0); 2871 __ BIND(L_rounds_44); 2872 __ aesd(v0, v21); __ aesimc(v0, v0); 2873 __ aesd(v0, v22); __ aesimc(v0, v0); 2874 __ aesd(v0, v23); __ aesimc(v0, v0); 2875 __ aesd(v0, v24); __ aesimc(v0, v0); 2876 __ aesd(v0, v25); __ aesimc(v0, v0); 2877 __ aesd(v0, v26); __ aesimc(v0, v0); 2878 __ aesd(v0, v27); __ aesimc(v0, v0); 2879 __ aesd(v0, v28); __ aesimc(v0, v0); 2880 __ aesd(v0, v29); __ aesimc(v0, v0); 2881 __ aesd(v0, v30); 2882 __ eor(v0, __ T16B, v0, v31); 2883 __ eor(v0, __ T16B, v0, v2); 2884 2885 __ st1(v0, __ T16B, __ post(to, 16)); 2886 __ orr(v2, __ T16B, v1, v1); 2887 2888 __ subw(len_reg, len_reg, 16); 2889 __ cbnzw(len_reg, L_aes_loop); 2890 2891 __ st1(v2, __ T16B, rvec); 2892 2893 __ mov(r0, rscratch2); 2894 2895 __ leave(); 2896 __ ret(lr); 2897 2898 return start; 2899 } 2900 2901 // Arguments: 2902 // 2903 // Inputs: 2904 // c_rarg0 - byte[] source+offset 2905 // c_rarg1 - int[] SHA.state 2906 // c_rarg2 - int offset 2907 // c_rarg3 - int limit 2908 // 2909 address generate_sha1_implCompress(bool multi_block, const char *name) { 2910 __ align(CodeEntryAlignment); 2911 StubCodeMark mark(this, "StubRoutines", name); 2912 address start = __ pc(); 2913 2914 Register buf = c_rarg0; 2915 Register state = c_rarg1; 2916 Register ofs = c_rarg2; 2917 Register limit = c_rarg3; 2918 2919 Label keys; 2920 Label sha1_loop; 2921 2922 // load the keys into v0..v3 2923 __ adr(rscratch1, keys); 2924 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 2925 // load 5 words state into v6, v7 2926 __ ldrq(v6, Address(state, 0)); 2927 __ ldrs(v7, Address(state, 16)); 2928 2929 2930 __ BIND(sha1_loop); 2931 // load 64 bytes of data into v16..v19 2932 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 2933 __ rev32(v16, __ T16B, v16); 2934 __ rev32(v17, __ T16B, v17); 2935 __ rev32(v18, __ T16B, v18); 2936 __ rev32(v19, __ T16B, v19); 2937 2938 // do the sha1 2939 __ addv(v4, __ T4S, v16, v0); 2940 __ orr(v20, __ T16B, v6, v6); 2941 2942 FloatRegister d0 = v16; 2943 FloatRegister d1 = v17; 2944 FloatRegister d2 = v18; 2945 FloatRegister d3 = v19; 2946 2947 for (int round = 0; round < 20; round++) { 2948 FloatRegister tmp1 = (round & 1) ? v4 : v5; 2949 FloatRegister tmp2 = (round & 1) ? v21 : v22; 2950 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 2951 FloatRegister tmp4 = (round & 1) ? v5 : v4; 2952 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? 
v2 : v3)); 2953 2954 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 2955 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 2956 __ sha1h(tmp2, __ T4S, v20); 2957 if (round < 5) 2958 __ sha1c(v20, __ T4S, tmp3, tmp4); 2959 else if (round < 10 || round >= 15) 2960 __ sha1p(v20, __ T4S, tmp3, tmp4); 2961 else 2962 __ sha1m(v20, __ T4S, tmp3, tmp4); 2963 if (round < 16) __ sha1su1(d0, __ T4S, d3); 2964 2965 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 2966 } 2967 2968 __ addv(v7, __ T2S, v7, v21); 2969 __ addv(v6, __ T4S, v6, v20); 2970 2971 if (multi_block) { 2972 __ add(ofs, ofs, 64); 2973 __ cmp(ofs, limit); 2974 __ br(Assembler::LE, sha1_loop); 2975 __ mov(c_rarg0, ofs); // return ofs 2976 } 2977 2978 __ strq(v6, Address(state, 0)); 2979 __ strs(v7, Address(state, 16)); 2980 2981 __ ret(lr); 2982 2983 __ bind(keys); 2984 __ emit_int32(0x5a827999); 2985 __ emit_int32(0x6ed9eba1); 2986 __ emit_int32(0x8f1bbcdc); 2987 __ emit_int32(0xca62c1d6); 2988 2989 return start; 2990 } 2991 2992 2993 // Arguments: 2994 // 2995 // Inputs: 2996 // c_rarg0 - byte[] source+offset 2997 // c_rarg1 - int[] SHA.state 2998 // c_rarg2 - int offset 2999 // c_rarg3 - int limit 3000 // 3001 address generate_sha256_implCompress(bool multi_block, const char *name) { 3002 static const uint32_t round_consts[64] = { 3003 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3004 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3005 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3006 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3007 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3008 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3009 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3010 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3011 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3012 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3013 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3014 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3015 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3016 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3017 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3018 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3019 }; 3020 __ align(CodeEntryAlignment); 3021 StubCodeMark mark(this, "StubRoutines", name); 3022 address start = __ pc(); 3023 3024 Register buf = c_rarg0; 3025 Register state = c_rarg1; 3026 Register ofs = c_rarg2; 3027 Register limit = c_rarg3; 3028 3029 Label sha1_loop; 3030 3031 __ stpd(v8, v9, __ pre(sp, -32)); 3032 __ stpd(v10, v11, Address(sp, 16)); 3033 3034 // dga == v0 3035 // dgb == v1 3036 // dg0 == v2 3037 // dg1 == v3 3038 // dg2 == v4 3039 // t0 == v6 3040 // t1 == v7 3041 3042 // load 16 keys to v16..v31 3043 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3044 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3045 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3046 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3047 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3048 3049 // load 8 words (256 bits) state 3050 __ ldpq(v0, v1, state); 3051 3052 __ BIND(sha1_loop); 3053 // load 64 bytes of data into v8..v11 3054 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 3055 __ rev32(v8, __ T16B, v8); 3056 __ rev32(v9, __ T16B, v9); 3057 __ rev32(v10, __ T16B, v10); 3058 __ rev32(v11, __ T16B, v11); 3059 3060 __ addv(v6, __ T4S, v8, v16); 3061 __ orr(v2, __ T16B, v0, v0); 3062 __ orr(v3, __ T16B, v1, v1); 3063 3064 FloatRegister d0 = v8; 3065 FloatRegister d1 = v9; 3066 FloatRegister d2 = v10; 3067 FloatRegister d3 = v11; 3068 3069 3070 for (int round = 0; round < 16; round++) { 3071 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3072 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3073 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3074 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3075 3076 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3077 __ orr(v4, __ T16B, v2, v2); 3078 if (round < 15) 3079 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3080 __ sha256h(v2, __ T4S, v3, tmp2); 3081 __ sha256h2(v3, __ T4S, v4, tmp2); 3082 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3083 3084 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3085 } 3086 3087 __ addv(v0, __ T4S, v0, v2); 3088 __ addv(v1, __ T4S, v1, v3); 3089 3090 if (multi_block) { 3091 __ add(ofs, ofs, 64); 3092 __ cmp(ofs, limit); 3093 __ br(Assembler::LE, sha1_loop); 3094 __ mov(c_rarg0, ofs); // return ofs 3095 } 3096 3097 __ ldpd(v10, v11, Address(sp, 16)); 3098 __ ldpd(v8, v9, __ post(sp, 32)); 3099 3100 __ stpq(v0, v1, state); 3101 3102 __ ret(lr); 3103 3104 return start; 3105 } 3106 3107 #ifndef BUILTIN_SIM 3108 // Safefetch stubs. 3109 void generate_safefetch(const char* name, int size, address* entry, 3110 address* fault_pc, address* continuation_pc) { 3111 // safefetch signatures: 3112 // int SafeFetch32(int* adr, int errValue); 3113 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue); 3114 // 3115 // arguments: 3116 // c_rarg0 = adr 3117 // c_rarg1 = errValue 3118 // 3119 // result: 3120 // PPC_RET = *adr or errValue 3121 3122 StubCodeMark mark(this, "StubRoutines", name); 3123 3124 // Entry point, pc or function descriptor. 3125 *entry = __ pc(); 3126 3127 // Load *adr into c_rarg1, may fault. 
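    // If the load below faults, the VM's signal handler recognizes the
    // faulting pc as *fault_pc and resumes execution at *continuation_pc,
    // where c_rarg1 still holds errValue, so the caller sees errValue
    // rather than a crash.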
3128 *fault_pc = __ pc(); 3129 switch (size) { 3130 case 4: 3131 // int32_t 3132 __ ldrw(c_rarg1, Address(c_rarg0, 0)); 3133 break; 3134 case 8: 3135 // int64_t 3136 __ ldr(c_rarg1, Address(c_rarg0, 0)); 3137 break; 3138 default: 3139 ShouldNotReachHere(); 3140 } 3141 3142 // return errValue or *adr 3143 *continuation_pc = __ pc(); 3144 __ mov(r0, c_rarg1); 3145 __ ret(lr); 3146 } 3147 #endif 3148 3149 /** 3150 * Arguments: 3151 * 3152 * Inputs: 3153 * c_rarg0 - int crc 3154 * c_rarg1 - byte* buf 3155 * c_rarg2 - int length 3156 * 3157 * Ouput: 3158 * rax - int crc result 3159 */ 3160 address generate_updateBytesCRC32() { 3161 assert(UseCRC32Intrinsics, "what are we doing here?"); 3162 3163 __ align(CodeEntryAlignment); 3164 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32"); 3165 3166 address start = __ pc(); 3167 3168 const Register crc = c_rarg0; // crc 3169 const Register buf = c_rarg1; // source java byte array address 3170 const Register len = c_rarg2; // length 3171 const Register table0 = c_rarg3; // crc_table address 3172 const Register table1 = c_rarg4; 3173 const Register table2 = c_rarg5; 3174 const Register table3 = c_rarg6; 3175 const Register tmp3 = c_rarg7; 3176 3177 BLOCK_COMMENT("Entry:"); 3178 __ enter(); // required for proper stackwalking of RuntimeStub frame 3179 3180 __ kernel_crc32(crc, buf, len, 3181 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 3182 3183 __ leave(); // required for proper stackwalking of RuntimeStub frame 3184 __ ret(lr); 3185 3186 return start; 3187 } 3188 3189 /** 3190 * Arguments: 3191 * 3192 * Inputs: 3193 * c_rarg0 - int crc 3194 * c_rarg1 - byte* buf 3195 * c_rarg2 - int length 3196 * c_rarg3 - int* table 3197 * 3198 * Ouput: 3199 * r0 - int crc result 3200 */ 3201 address generate_updateBytesCRC32C() { 3202 assert(UseCRC32CIntrinsics, "what are we doing here?"); 3203 3204 __ align(CodeEntryAlignment); 3205 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C"); 3206 3207 address start = __ pc(); 3208 3209 const Register crc = c_rarg0; // crc 3210 const Register buf = c_rarg1; // source java byte array address 3211 const Register len = c_rarg2; // length 3212 const Register table0 = c_rarg3; // crc_table address 3213 const Register table1 = c_rarg4; 3214 const Register table2 = c_rarg5; 3215 const Register table3 = c_rarg6; 3216 const Register tmp3 = c_rarg7; 3217 3218 BLOCK_COMMENT("Entry:"); 3219 __ enter(); // required for proper stackwalking of RuntimeStub frame 3220 3221 __ kernel_crc32c(crc, buf, len, 3222 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 3223 3224 __ leave(); // required for proper stackwalking of RuntimeStub frame 3225 __ ret(lr); 3226 3227 return start; 3228 } 3229 3230 /*** 3231 * Arguments: 3232 * 3233 * Inputs: 3234 * c_rarg0 - int adler 3235 * c_rarg1 - byte* buff 3236 * c_rarg2 - int len 3237 * 3238 * Output: 3239 * c_rarg0 - int adler result 3240 */ 3241 address generate_updateBytesAdler32() { 3242 __ align(CodeEntryAlignment); 3243 StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32"); 3244 address start = __ pc(); 3245 3246 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 3247 3248 // Aliases 3249 Register adler = c_rarg0; 3250 Register s1 = c_rarg0; 3251 Register s2 = c_rarg3; 3252 Register buff = c_rarg1; 3253 Register len = c_rarg2; 3254 Register nmax = r4; 3255 Register base = r5; 3256 Register count = r6; 3257 Register temp0 = rscratch1; 3258 Register temp1 = rscratch2; 3259 FloatRegister vbytes = 
v0; 3260 FloatRegister vs1acc = v1; 3261 FloatRegister vs2acc = v2; 3262 FloatRegister vtable = v3; 3263 3264 // Max number of bytes we can process before having to take the mod 3265 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 3266 unsigned long BASE = 0xfff1; 3267 unsigned long NMAX = 0x15B0; 3268 3269 __ mov(base, BASE); 3270 __ mov(nmax, NMAX); 3271 3272 // Load accumulation coefficients for the upper 16 bits 3273 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table)); 3274 __ ld1(vtable, __ T16B, Address(temp0)); 3275 3276 // s1 is initialized to the lower 16 bits of adler 3277 // s2 is initialized to the upper 16 bits of adler 3278 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 3279 __ uxth(s1, adler); // s1 = (adler & 0xffff) 3280 3281 // The pipelined loop needs at least 16 elements for 1 iteration 3282 // It does check this, but it is more effective to skip to the cleanup loop 3283 __ cmp(len, (u1)16); 3284 __ br(Assembler::HS, L_nmax); 3285 __ cbz(len, L_combine); 3286 3287 __ bind(L_simple_by1_loop); 3288 __ ldrb(temp0, Address(__ post(buff, 1))); 3289 __ add(s1, s1, temp0); 3290 __ add(s2, s2, s1); 3291 __ subs(len, len, 1); 3292 __ br(Assembler::HI, L_simple_by1_loop); 3293 3294 // s1 = s1 % BASE 3295 __ subs(temp0, s1, base); 3296 __ csel(s1, temp0, s1, Assembler::HS); 3297 3298 // s2 = s2 % BASE 3299 __ lsr(temp0, s2, 16); 3300 __ lsl(temp1, temp0, 4); 3301 __ sub(temp1, temp1, temp0); 3302 __ add(s2, temp1, s2, ext::uxth); 3303 3304 __ subs(temp0, s2, base); 3305 __ csel(s2, temp0, s2, Assembler::HS); 3306 3307 __ b(L_combine); 3308 3309 __ bind(L_nmax); 3310 __ subs(len, len, nmax); 3311 __ sub(count, nmax, 16); 3312 __ br(Assembler::LO, L_by16); 3313 3314 __ bind(L_nmax_loop); 3315 3316 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 3317 vbytes, vs1acc, vs2acc, vtable); 3318 3319 __ subs(count, count, 16); 3320 __ br(Assembler::HS, L_nmax_loop); 3321 3322 // s1 = s1 % BASE 3323 __ lsr(temp0, s1, 16); 3324 __ lsl(temp1, temp0, 4); 3325 __ sub(temp1, temp1, temp0); 3326 __ add(temp1, temp1, s1, ext::uxth); 3327 3328 __ lsr(temp0, temp1, 16); 3329 __ lsl(s1, temp0, 4); 3330 __ sub(s1, s1, temp0); 3331 __ add(s1, s1, temp1, ext:: uxth); 3332 3333 __ subs(temp0, s1, base); 3334 __ csel(s1, temp0, s1, Assembler::HS); 3335 3336 // s2 = s2 % BASE 3337 __ lsr(temp0, s2, 16); 3338 __ lsl(temp1, temp0, 4); 3339 __ sub(temp1, temp1, temp0); 3340 __ add(temp1, temp1, s2, ext::uxth); 3341 3342 __ lsr(temp0, temp1, 16); 3343 __ lsl(s2, temp0, 4); 3344 __ sub(s2, s2, temp0); 3345 __ add(s2, s2, temp1, ext:: uxth); 3346 3347 __ subs(temp0, s2, base); 3348 __ csel(s2, temp0, s2, Assembler::HS); 3349 3350 __ subs(len, len, nmax); 3351 __ sub(count, nmax, 16); 3352 __ br(Assembler::HS, L_nmax_loop); 3353 3354 __ bind(L_by16); 3355 __ adds(len, len, count); 3356 __ br(Assembler::LO, L_by1); 3357 3358 __ bind(L_by16_loop); 3359 3360 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 3361 vbytes, vs1acc, vs2acc, vtable); 3362 3363 __ subs(len, len, 16); 3364 __ br(Assembler::HS, L_by16_loop); 3365 3366 __ bind(L_by1); 3367 __ adds(len, len, 15); 3368 __ br(Assembler::LO, L_do_mod); 3369 3370 __ bind(L_by1_loop); 3371 __ ldrb(temp0, Address(__ post(buff, 1))); 3372 __ add(s1, temp0, s1); 3373 __ add(s2, s2, s1); 3374 __ subs(len, len, 1); 3375 __ br(Assembler::HS, L_by1_loop); 3376 3377 __ bind(L_do_mod); 3378 // s1 = s1 % BASE 3379 __ lsr(temp0, s1, 16); 3380 __ lsl(temp1, temp0, 4); 
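    // (Reduction trick: 2^16 == 15 (mod BASE), so
    //    x == (x >> 16) * 15 + (x & 0xffff)  (mod 65521);
    //  each round computes (x >> 16) * 16 - (x >> 16) + (x & 0xffff),
    //  two rounds bring the value under 2 * BASE, and the trailing csel
    //  subtracts BASE once more if needed.)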
3381 __ sub(temp1, temp1, temp0); 3382 __ add(temp1, temp1, s1, ext::uxth); 3383 3384 __ lsr(temp0, temp1, 16); 3385 __ lsl(s1, temp0, 4); 3386 __ sub(s1, s1, temp0); 3387 __ add(s1, s1, temp1, ext:: uxth); 3388 3389 __ subs(temp0, s1, base); 3390 __ csel(s1, temp0, s1, Assembler::HS); 3391 3392 // s2 = s2 % BASE 3393 __ lsr(temp0, s2, 16); 3394 __ lsl(temp1, temp0, 4); 3395 __ sub(temp1, temp1, temp0); 3396 __ add(temp1, temp1, s2, ext::uxth); 3397 3398 __ lsr(temp0, temp1, 16); 3399 __ lsl(s2, temp0, 4); 3400 __ sub(s2, s2, temp0); 3401 __ add(s2, s2, temp1, ext:: uxth); 3402 3403 __ subs(temp0, s2, base); 3404 __ csel(s2, temp0, s2, Assembler::HS); 3405 3406 // Combine lower bits and higher bits 3407 __ bind(L_combine); 3408 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) 3409 3410 __ ret(lr); 3411 3412 return start; 3413 } 3414 3415 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff, 3416 Register temp0, Register temp1, FloatRegister vbytes, 3417 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) { 3418 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes. 3419 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration. 3420 // In non-vectorized code, we update s1 and s2 as: 3421 // s1 <- s1 + b1 3422 // s2 <- s2 + s1 3423 // s1 <- s1 + b2 3424 // s2 <- s2 + b1 3425 // ... 3426 // s1 <- s1 + b16 3427 // s2 <- s2 + s1 3428 // Putting above assignments together, we have: 3429 // s1_new = s1 + b1 + b2 + ... + b16 3430 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16) 3431 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1) 3432 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1) 3433 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16))); 3434 3435 // s2 = s2 + s1 * 16 3436 __ add(s2, s2, s1, Assembler::LSL, 4); 3437 3438 // vs1acc = b1 + b2 + b3 + ... + b16 3439 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... 
+ (b16 * 1) 3440 __ umullv(vs2acc, __ T8B, vtable, vbytes); 3441 __ umlalv(vs2acc, __ T16B, vtable, vbytes); 3442 __ uaddlv(vs1acc, __ T16B, vbytes); 3443 __ uaddlv(vs2acc, __ T8H, vs2acc); 3444 3445 // s1 = s1 + vs1acc, s2 = s2 + vs2acc 3446 __ fmovd(temp0, vs1acc); 3447 __ fmovd(temp1, vs2acc); 3448 __ add(s1, s1, temp0); 3449 __ add(s2, s2, temp1); 3450 } 3451 3452 /** 3453 * Arguments: 3454 * 3455 * Input: 3456 * c_rarg0 - x address 3457 * c_rarg1 - x length 3458 * c_rarg2 - y address 3459 * c_rarg3 - y lenth 3460 * c_rarg4 - z address 3461 * c_rarg5 - z length 3462 */ 3463 address generate_multiplyToLen() { 3464 __ align(CodeEntryAlignment); 3465 StubCodeMark mark(this, "StubRoutines", "multiplyToLen"); 3466 3467 address start = __ pc(); 3468 const Register x = r0; 3469 const Register xlen = r1; 3470 const Register y = r2; 3471 const Register ylen = r3; 3472 const Register z = r4; 3473 const Register zlen = r5; 3474 3475 const Register tmp1 = r10; 3476 const Register tmp2 = r11; 3477 const Register tmp3 = r12; 3478 const Register tmp4 = r13; 3479 const Register tmp5 = r14; 3480 const Register tmp6 = r15; 3481 const Register tmp7 = r16; 3482 3483 BLOCK_COMMENT("Entry:"); 3484 __ enter(); // required for proper stackwalking of RuntimeStub frame 3485 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 3486 __ leave(); // required for proper stackwalking of RuntimeStub frame 3487 __ ret(lr); 3488 3489 return start; 3490 } 3491 3492 address generate_squareToLen() { 3493 // squareToLen algorithm for sizes 1..127 described in java code works 3494 // faster than multiply_to_len on some CPUs and slower on others, but 3495 // multiply_to_len shows a bit better overall results 3496 __ align(CodeEntryAlignment); 3497 StubCodeMark mark(this, "StubRoutines", "squareToLen"); 3498 address start = __ pc(); 3499 3500 const Register x = r0; 3501 const Register xlen = r1; 3502 const Register z = r2; 3503 const Register zlen = r3; 3504 const Register y = r4; // == x 3505 const Register ylen = r5; // == xlen 3506 3507 const Register tmp1 = r10; 3508 const Register tmp2 = r11; 3509 const Register tmp3 = r12; 3510 const Register tmp4 = r13; 3511 const Register tmp5 = r14; 3512 const Register tmp6 = r15; 3513 const Register tmp7 = r16; 3514 3515 RegSet spilled_regs = RegSet::of(y, ylen); 3516 BLOCK_COMMENT("Entry:"); 3517 __ enter(); 3518 __ push(spilled_regs, sp); 3519 __ mov(y, x); 3520 __ mov(ylen, xlen); 3521 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 3522 __ pop(spilled_regs, sp); 3523 __ leave(); 3524 __ ret(lr); 3525 return start; 3526 } 3527 3528 address generate_mulAdd() { 3529 __ align(CodeEntryAlignment); 3530 StubCodeMark mark(this, "StubRoutines", "mulAdd"); 3531 3532 address start = __ pc(); 3533 3534 const Register out = r0; 3535 const Register in = r1; 3536 const Register offset = r2; 3537 const Register len = r3; 3538 const Register k = r4; 3539 3540 BLOCK_COMMENT("Entry:"); 3541 __ enter(); 3542 __ mul_add(out, in, offset, len, k); 3543 __ leave(); 3544 __ ret(lr); 3545 3546 return start; 3547 } 3548 3549 void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi, 3550 FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0, 3551 FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) { 3552 // Karatsuba multiplication performs a 128*128 -> 256-bit 3553 // multiplication in three 128-bit multiplications and a few 3554 // additions. 
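    // In carry-less (GF(2)[z]) arithmetic the additions are XORs, so for
    // A = A1:A0 and B = B1:B0:
    //   A*B = A1*B1*z^128 + ((A1+A0)*(B1+B0) + A1*B1 + A0*B0)*z^64 + A0*B0
    // which is the C/D/E decomposition written out below.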
3555 // 3556 // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1) 3557 // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0 3558 // 3559 // Inputs: 3560 // 3561 // A0 in a.d[0] (subkey) 3562 // A1 in a.d[1] 3563 // (A1+A0) in a1_xor_a0.d[0] 3564 // 3565 // B0 in b.d[0] (state) 3566 // B1 in b.d[1] 3567 3568 __ ext(tmp1, __ T16B, b, b, 0x08); 3569 __ pmull2(result_hi, __ T1Q, b, a, __ T2D); // A1*B1 3570 __ eor(tmp1, __ T16B, tmp1, b); // (B1+B0) 3571 __ pmull(result_lo, __ T1Q, b, a, __ T1D); // A0*B0 3572 __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0) 3573 3574 __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08); 3575 __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0 3576 __ eor(tmp2, __ T16B, tmp2, tmp4); 3577 __ eor(tmp2, __ T16B, tmp2, tmp3); 3578 3579 // Register pair <result_hi:result_lo> holds the result of carry-less multiplication 3580 __ ins(result_hi, __ D, tmp2, 0, 1); 3581 __ ins(result_lo, __ D, tmp2, 1, 0); 3582 } 3583 3584 void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi, 3585 FloatRegister p, FloatRegister z, FloatRegister t1) { 3586 const FloatRegister t0 = result; 3587 3588 // The GCM field polynomial f is z^128 + p(z), where p = 3589 // z^7+z^2+z+1. 3590 // 3591 // z^128 === -p(z) (mod (z^128 + p(z))) 3592 // 3593 // so, given that the product we're reducing is 3594 // a == lo + hi * z^128 3595 // substituting, 3596 // === lo - hi * p(z) (mod (z^128 + p(z))) 3597 // 3598 // we reduce by multiplying hi by p(z) and subtracting the result 3599 // from (i.e. XORing it with) lo. Because p has no nonzero high 3600 // bits we can do this with two 64-bit multiplications, lo*p and 3601 // hi*p. 3602 3603 __ pmull2(t0, __ T1Q, hi, p, __ T2D); 3604 __ ext(t1, __ T16B, t0, z, 8); 3605 __ eor(hi, __ T16B, hi, t1); 3606 __ ext(t1, __ T16B, z, t0, 8); 3607 __ eor(lo, __ T16B, lo, t1); 3608 __ pmull(t0, __ T1Q, hi, p, __ T1D); 3609 __ eor(result, __ T16B, lo, t0); 3610 } 3611 3612 address generate_has_negatives(address &has_negatives_long) { 3613 const u1 large_loop_size = 64; 3614 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 3615 int dcache_line = VM_Version::dcache_line_size(); 3616 3617 Register ary1 = r1, len = r2, result = r0; 3618 3619 __ align(CodeEntryAlignment); 3620 3621 StubCodeMark mark(this, "StubRoutines", "has_negatives"); 3622 3623 address entry = __ pc(); 3624 3625 __ enter(); 3626 3627 Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE, 3628 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 3629 3630 __ cmp(len, (u1)15); 3631 __ br(Assembler::GT, LEN_OVER_15); 3632 // The only case when execution falls into this code is when pointer is near 3633 // the end of memory page and we have to avoid reading next page 3634 __ add(ary1, ary1, len); 3635 __ subs(len, len, 8); 3636 __ br(Assembler::GT, LEN_OVER_8); 3637 __ ldr(rscratch2, Address(ary1, -8)); 3638 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 
3639 __ lsrv(rscratch2, rscratch2, rscratch1); 3640 __ tst(rscratch2, UPPER_BIT_MASK); 3641 __ cset(result, Assembler::NE); 3642 __ leave(); 3643 __ ret(lr); 3644 __ bind(LEN_OVER_8); 3645 __ ldp(rscratch1, rscratch2, Address(ary1, -16)); 3646 __ sub(len, len, 8); // no data dep., then sub can be executed while loading 3647 __ tst(rscratch2, UPPER_BIT_MASK); 3648 __ br(Assembler::NE, RET_TRUE_NO_POP); 3649 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes 3650 __ lsrv(rscratch1, rscratch1, rscratch2); 3651 __ tst(rscratch1, UPPER_BIT_MASK); 3652 __ cset(result, Assembler::NE); 3653 __ leave(); 3654 __ ret(lr); 3655 3656 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10; 3657 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6; 3658 3659 has_negatives_long = __ pc(); // 2nd entry point 3660 3661 __ enter(); 3662 3663 __ bind(LEN_OVER_15); 3664 __ push(spilled_regs, sp); 3665 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment 3666 __ cbz(rscratch2, ALIGNED); 3667 __ ldp(tmp6, tmp1, Address(ary1)); 3668 __ mov(tmp5, 16); 3669 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address 3670 __ add(ary1, ary1, rscratch1); 3671 __ sub(len, len, rscratch1); 3672 __ orr(tmp6, tmp6, tmp1); 3673 __ tst(tmp6, UPPER_BIT_MASK); 3674 __ br(Assembler::NE, RET_TRUE); 3675 3676 __ bind(ALIGNED); 3677 __ cmp(len, large_loop_size); 3678 __ br(Assembler::LT, CHECK_16); 3679 // Perform 16-byte load as early return in pre-loop to handle situation 3680 // when initially aligned large array has negative values at starting bytes, 3681 // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is 3682 // slower. Cases with negative bytes further ahead won't be affected that 3683 // much. In fact, it'll be faster due to early loads, less instructions and 3684 // less branches in LARGE_LOOP. 3685 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16))); 3686 __ sub(len, len, 16); 3687 __ orr(tmp6, tmp6, tmp1); 3688 __ tst(tmp6, UPPER_BIT_MASK); 3689 __ br(Assembler::NE, RET_TRUE); 3690 __ cmp(len, large_loop_size); 3691 __ br(Assembler::LT, CHECK_16); 3692 3693 if (SoftwarePrefetchHintDistance >= 0 3694 && SoftwarePrefetchHintDistance >= dcache_line) { 3695 // initial prefetch 3696 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line)); 3697 } 3698 __ bind(LARGE_LOOP); 3699 if (SoftwarePrefetchHintDistance >= 0) { 3700 __ prfm(Address(ary1, SoftwarePrefetchHintDistance)); 3701 } 3702 // Issue load instructions first, since it can save few CPU/MEM cycles, also 3703 // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp) 3704 // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3 3705 // instructions per cycle and have less branches, but this approach disables 3706 // early return, thus, all 64 bytes are loaded and checked every time. 
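    // For reference, each iteration of LARGE_LOOP below is equivalent to the
    // following scalar sketch (the names p, v and acc are informal; the
    // generated code keeps everything in registers and folds the ORs as a
    // small tree):
    //
    //   uint64_t v[8];
    //   memcpy(v, p, 64);              // p = current position in ary1
    //   uint64_t acc = 0;
    //   for (int i = 0; i < 8; i++) {
    //     acc |= v[i];
    //   }
    //   if (acc & UPPER_BIT_MASK) {    // 0x8080...80: some byte has bit 7 set,
    //     return true;                 // i.e. a negative byte was found
    //   }
    //   p += 64; len -= 64;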
3707 __ ldp(tmp2, tmp3, Address(ary1)); 3708 __ ldp(tmp4, tmp5, Address(ary1, 16)); 3709 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 3710 __ ldp(tmp6, tmp1, Address(ary1, 48)); 3711 __ add(ary1, ary1, large_loop_size); 3712 __ sub(len, len, large_loop_size); 3713 __ orr(tmp2, tmp2, tmp3); 3714 __ orr(tmp4, tmp4, tmp5); 3715 __ orr(rscratch1, rscratch1, rscratch2); 3716 __ orr(tmp6, tmp6, tmp1); 3717 __ orr(tmp2, tmp2, tmp4); 3718 __ orr(rscratch1, rscratch1, tmp6); 3719 __ orr(tmp2, tmp2, rscratch1); 3720 __ tst(tmp2, UPPER_BIT_MASK); 3721 __ br(Assembler::NE, RET_TRUE); 3722 __ cmp(len, large_loop_size); 3723 __ br(Assembler::GE, LARGE_LOOP); 3724 3725 __ bind(CHECK_16); // small 16-byte load pre-loop 3726 __ cmp(len, (u1)16); 3727 __ br(Assembler::LT, POST_LOOP16); 3728 3729 __ bind(LOOP16); // small 16-byte load loop 3730 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 3731 __ sub(len, len, 16); 3732 __ orr(tmp2, tmp2, tmp3); 3733 __ tst(tmp2, UPPER_BIT_MASK); 3734 __ br(Assembler::NE, RET_TRUE); 3735 __ cmp(len, (u1)16); 3736 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 3737 3738 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 3739 __ cmp(len, (u1)8); 3740 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 3741 __ ldr(tmp3, Address(__ post(ary1, 8))); 3742 __ sub(len, len, 8); 3743 __ tst(tmp3, UPPER_BIT_MASK); 3744 __ br(Assembler::NE, RET_TRUE); 3745 3746 __ bind(POST_LOOP16_LOAD_TAIL); 3747 __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0 3748 __ ldr(tmp1, Address(ary1)); 3749 __ mov(tmp2, 64); 3750 __ sub(tmp4, tmp2, len, __ LSL, 3); 3751 __ lslv(tmp1, tmp1, tmp4); 3752 __ tst(tmp1, UPPER_BIT_MASK); 3753 __ br(Assembler::NE, RET_TRUE); 3754 // Fallthrough 3755 3756 __ bind(RET_FALSE); 3757 __ pop(spilled_regs, sp); 3758 __ leave(); 3759 __ mov(result, zr); 3760 __ ret(lr); 3761 3762 __ bind(RET_TRUE); 3763 __ pop(spilled_regs, sp); 3764 __ bind(RET_TRUE_NO_POP); 3765 __ leave(); 3766 __ mov(result, 1); 3767 __ ret(lr); 3768 3769 __ bind(DONE); 3770 __ pop(spilled_regs, sp); 3771 __ leave(); 3772 __ ret(lr); 3773 return entry; 3774 } 3775 3776 void generate_large_array_equals_loop_nonsimd(int loopThreshold, 3777 bool usePrefetch, Label &NOT_EQUAL) { 3778 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 3779 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 3780 tmp7 = r12, tmp8 = r13; 3781 Label LOOP; 3782 3783 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3784 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3785 __ bind(LOOP); 3786 if (usePrefetch) { 3787 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 3788 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 3789 } 3790 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 3791 __ eor(tmp1, tmp1, tmp2); 3792 __ eor(tmp3, tmp3, tmp4); 3793 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 3794 __ orr(tmp1, tmp1, tmp3); 3795 __ cbnz(tmp1, NOT_EQUAL); 3796 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3797 __ eor(tmp5, tmp5, tmp6); 3798 __ eor(tmp7, tmp7, tmp8); 3799 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3800 __ orr(tmp5, tmp5, tmp7); 3801 __ cbnz(tmp5, NOT_EQUAL); 3802 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 3803 __ eor(tmp1, tmp1, tmp2); 3804 __ eor(tmp3, tmp3, tmp4); 3805 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 3806 __ orr(tmp1, tmp1, tmp3); 3807 __ cbnz(tmp1, NOT_EQUAL); 3808 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3809 __ eor(tmp5, tmp5, tmp6); 
3810 __ sub(cnt1, cnt1, 8 * wordSize); 3811 __ eor(tmp7, tmp7, tmp8); 3812 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3813 // tmp6 is not used. MacroAssembler::subs is used here (rather than 3814 // cmp) because subs allows an unlimited range of immediate operand. 3815 __ subs(tmp6, cnt1, loopThreshold); 3816 __ orr(tmp5, tmp5, tmp7); 3817 __ cbnz(tmp5, NOT_EQUAL); 3818 __ br(__ GE, LOOP); 3819 // post-loop 3820 __ eor(tmp1, tmp1, tmp2); 3821 __ eor(tmp3, tmp3, tmp4); 3822 __ orr(tmp1, tmp1, tmp3); 3823 __ sub(cnt1, cnt1, 2 * wordSize); 3824 __ cbnz(tmp1, NOT_EQUAL); 3825 } 3826 3827 void generate_large_array_equals_loop_simd(int loopThreshold, 3828 bool usePrefetch, Label &NOT_EQUAL) { 3829 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 3830 tmp2 = rscratch2; 3831 Label LOOP; 3832 3833 __ bind(LOOP); 3834 if (usePrefetch) { 3835 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 3836 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 3837 } 3838 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize))); 3839 __ sub(cnt1, cnt1, 8 * wordSize); 3840 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize))); 3841 __ subs(tmp1, cnt1, loopThreshold); 3842 __ eor(v0, __ T16B, v0, v4); 3843 __ eor(v1, __ T16B, v1, v5); 3844 __ eor(v2, __ T16B, v2, v6); 3845 __ eor(v3, __ T16B, v3, v7); 3846 __ orr(v0, __ T16B, v0, v1); 3847 __ orr(v1, __ T16B, v2, v3); 3848 __ orr(v0, __ T16B, v0, v1); 3849 __ umov(tmp1, v0, __ D, 0); 3850 __ umov(tmp2, v0, __ D, 1); 3851 __ orr(tmp1, tmp1, tmp2); 3852 __ cbnz(tmp1, NOT_EQUAL); 3853 __ br(__ GE, LOOP); 3854 } 3855 3856 // a1 = r1 - array1 address 3857 // a2 = r2 - array2 address 3858 // result = r0 - return value. Already contains "false" 3859 // cnt1 = r10 - amount of elements left to check, reduced by wordSize 3860 // r3-r5 are reserved temporary registers 3861 address generate_large_array_equals() { 3862 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 3863 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 3864 tmp7 = r12, tmp8 = r13; 3865 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP, 3866 SMALL_LOOP, POST_LOOP; 3867 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16; 3868 // calculate if at least 32 prefetched bytes are used 3869 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32; 3870 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE); 3871 RegSet spilled_regs = RegSet::range(tmp6, tmp8); 3872 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4, 3873 tmp5, tmp6, tmp7, tmp8); 3874 3875 __ align(CodeEntryAlignment); 3876 3877 StubCodeMark mark(this, "StubRoutines", "large_array_equals"); 3878 3879 address entry = __ pc(); 3880 __ enter(); 3881 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub 3882 // also advance pointers to use post-increment instead of pre-increment 3883 __ add(a1, a1, wordSize); 3884 __ add(a2, a2, wordSize); 3885 if (AvoidUnalignedAccesses) { 3886 // both implementations (SIMD/nonSIMD) are using relatively large load 3887 // instructions (ld1/ldp), which has huge penalty (up to x2 exec time) 3888 // on some CPUs in case of address is not at least 16-byte aligned. 3889 // Arrays are 8-byte aligned currently, so, we can make additional 8-byte 3890 // load if needed at least for 1st address and make if 16-byte aligned. 
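      // In C terms the adjustment below is roughly (sketch only; names are informal):
      //
      //   if (((uintptr_t)a1 & 8) != 0) {   // a1 is 8- but not 16-byte aligned
      //     uint64_t w1, w2;
      //     memcpy(&w1, a1, 8);  a1 += 8;
      //     memcpy(&w2, a2, 8);  a2 += 8;
      //     cnt -= 8;
      //     if (w1 != w2) return false;
      //   }
      //   // from here on a1 is 16-byte aligned; a2 may still be only 8-byte aligned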
3891 Label ALIGNED16; 3892 __ tbz(a1, 3, ALIGNED16); 3893 __ ldr(tmp1, Address(__ post(a1, wordSize))); 3894 __ ldr(tmp2, Address(__ post(a2, wordSize))); 3895 __ sub(cnt1, cnt1, wordSize); 3896 __ eor(tmp1, tmp1, tmp2); 3897 __ cbnz(tmp1, NOT_EQUAL_NO_POP); 3898 __ bind(ALIGNED16); 3899 } 3900 if (UseSIMDForArrayEquals) { 3901 if (SoftwarePrefetchHintDistance >= 0) { 3902 __ subs(tmp1, cnt1, prefetchLoopThreshold); 3903 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 3904 generate_large_array_equals_loop_simd(prefetchLoopThreshold, 3905 /* prfm = */ true, NOT_EQUAL); 3906 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 3907 __ br(__ LT, TAIL); 3908 } 3909 __ bind(NO_PREFETCH_LARGE_LOOP); 3910 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold, 3911 /* prfm = */ false, NOT_EQUAL); 3912 } else { 3913 __ push(spilled_regs, sp); 3914 if (SoftwarePrefetchHintDistance >= 0) { 3915 __ subs(tmp1, cnt1, prefetchLoopThreshold); 3916 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 3917 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold, 3918 /* prfm = */ true, NOT_EQUAL); 3919 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 3920 __ br(__ LT, TAIL); 3921 } 3922 __ bind(NO_PREFETCH_LARGE_LOOP); 3923 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold, 3924 /* prfm = */ false, NOT_EQUAL); 3925 } 3926 __ bind(TAIL); 3927 __ cbz(cnt1, EQUAL); 3928 __ subs(cnt1, cnt1, wordSize); 3929 __ br(__ LE, POST_LOOP); 3930 __ bind(SMALL_LOOP); 3931 __ ldr(tmp1, Address(__ post(a1, wordSize))); 3932 __ ldr(tmp2, Address(__ post(a2, wordSize))); 3933 __ subs(cnt1, cnt1, wordSize); 3934 __ eor(tmp1, tmp1, tmp2); 3935 __ cbnz(tmp1, NOT_EQUAL); 3936 __ br(__ GT, SMALL_LOOP); 3937 __ bind(POST_LOOP); 3938 __ ldr(tmp1, Address(a1, cnt1)); 3939 __ ldr(tmp2, Address(a2, cnt1)); 3940 __ eor(tmp1, tmp1, tmp2); 3941 __ cbnz(tmp1, NOT_EQUAL); 3942 __ bind(EQUAL); 3943 __ mov(result, true); 3944 __ bind(NOT_EQUAL); 3945 if (!UseSIMDForArrayEquals) { 3946 __ pop(spilled_regs, sp); 3947 } 3948 __ bind(NOT_EQUAL_NO_POP); 3949 __ leave(); 3950 __ ret(lr); 3951 return entry; 3952 } 3953 3954 address generate_dsin_dcos(bool isCos) { 3955 __ align(CodeEntryAlignment); 3956 StubCodeMark mark(this, "StubRoutines", isCos ? 
"libmDcos" : "libmDsin"); 3957 address start = __ pc(); 3958 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw, 3959 (address)StubRoutines::aarch64::_two_over_pi, 3960 (address)StubRoutines::aarch64::_pio2, 3961 (address)StubRoutines::aarch64::_dsin_coef, 3962 (address)StubRoutines::aarch64::_dcos_coef); 3963 return start; 3964 } 3965 3966 address generate_dlog() { 3967 __ align(CodeEntryAlignment); 3968 StubCodeMark mark(this, "StubRoutines", "dlog"); 3969 address entry = __ pc(); 3970 FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4, 3971 vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19; 3972 Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4; 3973 __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3, 3974 tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5); 3975 return entry; 3976 } 3977 3978 // code for comparing 16 bytes of strings with same encoding 3979 void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) { 3980 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11; 3981 __ ldr(rscratch1, Address(__ post(str1, 8))); 3982 __ eor(rscratch2, tmp1, tmp2); 3983 __ ldr(cnt1, Address(__ post(str2, 8))); 3984 __ cbnz(rscratch2, DIFF1); 3985 __ ldr(tmp1, Address(__ post(str1, 8))); 3986 __ eor(rscratch2, rscratch1, cnt1); 3987 __ ldr(tmp2, Address(__ post(str2, 8))); 3988 __ cbnz(rscratch2, DIFF2); 3989 } 3990 3991 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding 3992 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 3993 Label &DIFF2) { 3994 Register cnt1 = r2, tmp1 = r10, tmp2 = r11, tmp3 = r12; 3995 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; 3996 3997 __ ldrq(vtmp, Address(__ post(tmp2, 16))); 3998 __ ldr(tmpU, Address(__ post(cnt1, 8))); 3999 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 4000 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 4001 4002 __ fmovd(tmpL, vtmp3); 4003 __ eor(rscratch2, tmp3, tmpL); 4004 __ cbnz(rscratch2, DIFF2); 4005 4006 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4007 __ umov(tmpL, vtmp3, __ D, 1); 4008 __ eor(rscratch2, tmpU, tmpL); 4009 __ cbnz(rscratch2, DIFF1); 4010 4011 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 4012 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4013 __ fmovd(tmpL, vtmp); 4014 __ eor(rscratch2, tmp3, tmpL); 4015 __ cbnz(rscratch2, DIFF2); 4016 4017 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4018 __ umov(tmpL, vtmp, __ D, 1); 4019 __ eor(rscratch2, tmpU, tmpL); 4020 __ cbnz(rscratch2, DIFF1); 4021 } 4022 4023 // r0 = result 4024 // r1 = str1 4025 // r2 = cnt1 4026 // r3 = str2 4027 // r4 = cnt2 4028 // r10 = tmp1 4029 // r11 = tmp2 4030 address generate_compare_long_string_different_encoding(bool isLU) { 4031 __ align(CodeEntryAlignment); 4032 StubCodeMark mark(this, "StubRoutines", isLU 4033 ? 
"compare_long_string_different_encoding LU" 4034 : "compare_long_string_different_encoding UL"); 4035 address entry = __ pc(); 4036 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, 4037 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, SMALL_LOOP_ENTER, 4038 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; 4039 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 4040 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; 4041 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; 4042 RegSet spilled_regs = RegSet::of(tmp3, tmp4); 4043 4044 int prefetchLoopExitCondition = MAX(32, SoftwarePrefetchHintDistance/2); 4045 4046 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); 4047 // cnt2 == amount of characters left to compare 4048 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL)) 4049 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 4050 __ add(str1, str1, isLU ? wordSize/2 : wordSize); 4051 __ add(str2, str2, isLU ? wordSize : wordSize/2); 4052 __ fmovd(isLU ? tmp1 : tmp2, vtmp); 4053 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case. 4054 __ add(str1, str1, cnt2, __ LSL, isLU ? 0 : 1); 4055 __ eor(rscratch2, tmp1, tmp2); 4056 __ add(str2, str2, cnt2, __ LSL, isLU ? 1 : 0); 4057 __ mov(rscratch1, tmp2); 4058 __ cbnz(rscratch2, CALCULATE_DIFFERENCE); 4059 Register strU = isLU ? str2 : str1, 4060 strL = isLU ? str1 : str2, 4061 tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison 4062 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison 4063 __ push(spilled_regs, sp); 4064 __ sub(tmp2, strL, cnt2); // strL pointer to load from 4065 __ sub(cnt1, strU, cnt2, __ LSL, 1); // strU pointer to load from 4066 4067 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4068 4069 if (SoftwarePrefetchHintDistance >= 0) { 4070 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 4071 __ br(__ LT, SMALL_LOOP); 4072 __ bind(LARGE_LOOP_PREFETCH); 4073 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance)); 4074 __ mov(tmp4, 2); 4075 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 4076 __ bind(LARGE_LOOP_PREFETCH_REPEAT1); 4077 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4078 __ subs(tmp4, tmp4, 1); 4079 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1); 4080 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 4081 __ mov(tmp4, 2); 4082 __ bind(LARGE_LOOP_PREFETCH_REPEAT2); 4083 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4084 __ subs(tmp4, tmp4, 1); 4085 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2); 4086 __ sub(cnt2, cnt2, 64); 4087 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 4088 __ br(__ GE, LARGE_LOOP_PREFETCH); 4089 } 4090 __ cbz(cnt2, LOAD_LAST); // no characters left except last load 4091 __ subs(cnt2, cnt2, 16); 4092 __ br(__ LT, TAIL); 4093 __ b(SMALL_LOOP_ENTER); 4094 __ bind(SMALL_LOOP); // smaller loop 4095 __ subs(cnt2, cnt2, 16); 4096 __ bind(SMALL_LOOP_ENTER); 4097 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4098 __ br(__ GE, SMALL_LOOP); 4099 __ cbz(cnt2, LOAD_LAST); 4100 __ bind(TAIL); // 1..15 characters left 4101 __ subs(zr, cnt2, -8); 4102 __ br(__ GT, TAIL_LOAD_16); 4103 __ ldrd(vtmp, Address(tmp2)); 4104 __ zip1(vtmp3, __ T8B, vtmp, vtmpZ); 4105 4106 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4107 __ fmovd(tmpL, vtmp3); 4108 __ eor(rscratch2, tmp3, tmpL); 4109 __ cbnz(rscratch2, DIFF2); 4110 __ umov(tmpL, vtmp3, __ D, 1); 4111 __ eor(rscratch2, tmpU, tmpL); 4112 __ cbnz(rscratch2, DIFF1); 4113 __ b(LOAD_LAST); 4114 __ bind(TAIL_LOAD_16); 4115 __ ldrq(vtmp, Address(tmp2)); 4116 __ ldr(tmpU, Address(__ post(cnt1, 
8))); 4117 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 4118 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 4119 __ fmovd(tmpL, vtmp3); 4120 __ eor(rscratch2, tmp3, tmpL); 4121 __ cbnz(rscratch2, DIFF2); 4122 4123 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4124 __ umov(tmpL, vtmp3, __ D, 1); 4125 __ eor(rscratch2, tmpU, tmpL); 4126 __ cbnz(rscratch2, DIFF1); 4127 4128 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4129 __ fmovd(tmpL, vtmp); 4130 __ eor(rscratch2, tmp3, tmpL); 4131 __ cbnz(rscratch2, DIFF2); 4132 4133 __ umov(tmpL, vtmp, __ D, 1); 4134 __ eor(rscratch2, tmpU, tmpL); 4135 __ cbnz(rscratch2, DIFF1); 4136 __ b(LOAD_LAST); 4137 __ bind(DIFF2); 4138 __ mov(tmpU, tmp3); 4139 __ bind(DIFF1); 4140 __ pop(spilled_regs, sp); 4141 __ b(CALCULATE_DIFFERENCE); 4142 __ bind(LOAD_LAST); 4143 __ pop(spilled_regs, sp); 4144 4145 __ ldrs(vtmp, Address(strL)); 4146 __ ldr(tmpU, Address(strU)); 4147 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 4148 __ fmovd(tmpL, vtmp); 4149 4150 __ eor(rscratch2, tmpU, tmpL); 4151 __ cbz(rscratch2, DONE); 4152 4153 // Find the first different characters in the longwords and 4154 // compute their difference. 4155 __ bind(CALCULATE_DIFFERENCE); 4156 __ rev(rscratch2, rscratch2); 4157 __ clz(rscratch2, rscratch2); 4158 __ andr(rscratch2, rscratch2, -16); 4159 __ lsrv(tmp1, tmp1, rscratch2); 4160 __ uxthw(tmp1, tmp1); 4161 __ lsrv(rscratch1, rscratch1, rscratch2); 4162 __ uxthw(rscratch1, rscratch1); 4163 __ subw(result, tmp1, rscratch1); 4164 __ bind(DONE); 4165 __ ret(lr); 4166 return entry; 4167 } 4168 4169 // r0 = result 4170 // r1 = str1 4171 // r2 = cnt1 4172 // r3 = str2 4173 // r4 = cnt2 4174 // r10 = tmp1 4175 // r11 = tmp2 4176 address generate_compare_long_string_same_encoding(bool isLL) { 4177 __ align(CodeEntryAlignment); 4178 StubCodeMark mark(this, "StubRoutines", isLL 4179 ? "compare_long_string_same_encoding LL" 4180 : "compare_long_string_same_encoding UU"); 4181 address entry = __ pc(); 4182 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 4183 tmp1 = r10, tmp2 = r11; 4184 Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL, 4185 LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF, 4186 DIFF_LAST_POSITION, DIFF_LAST_POSITION2; 4187 // exit from large loop when less than 64 bytes left to read or we're about 4188 // to prefetch memory behind array border 4189 int largeLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); 4190 // cnt1/cnt2 contains amount of characters to compare. cnt1 can be re-used 4191 // update cnt2 counter with already loaded 8 bytes 4192 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2)); 4193 // update pointers, because of previous read 4194 __ add(str1, str1, wordSize); 4195 __ add(str2, str2, wordSize); 4196 if (SoftwarePrefetchHintDistance >= 0) { 4197 __ bind(LARGE_LOOP_PREFETCH); 4198 __ prfm(Address(str1, SoftwarePrefetchHintDistance)); 4199 __ prfm(Address(str2, SoftwarePrefetchHintDistance)); 4200 compare_string_16_bytes_same(DIFF, DIFF2); 4201 compare_string_16_bytes_same(DIFF, DIFF2); 4202 __ sub(cnt2, cnt2, isLL ? 64 : 32); 4203 compare_string_16_bytes_same(DIFF, DIFF2); 4204 __ subs(rscratch2, cnt2, largeLoopExitCondition); 4205 compare_string_16_bytes_same(DIFF, DIFF2); 4206 __ br(__ GT, LARGE_LOOP_PREFETCH); 4207 __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left? 4208 // less than 16 bytes left? 4209 __ subs(cnt2, cnt2, isLL ? 16 : 8); 4210 __ br(__ LT, TAIL); 4211 } 4212 __ bind(SMALL_LOOP); 4213 compare_string_16_bytes_same(DIFF, DIFF2); 4214 __ subs(cnt2, cnt2, isLL ? 
16 : 8); 4215 __ br(__ GE, SMALL_LOOP); 4216 __ bind(TAIL); 4217 __ adds(cnt2, cnt2, isLL ? 16 : 8); 4218 __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF); 4219 __ subs(cnt2, cnt2, isLL ? 8 : 4); 4220 __ br(__ LE, CHECK_LAST); 4221 __ eor(rscratch2, tmp1, tmp2); 4222 __ cbnz(rscratch2, DIFF); 4223 __ ldr(tmp1, Address(__ post(str1, 8))); 4224 __ ldr(tmp2, Address(__ post(str2, 8))); 4225 __ sub(cnt2, cnt2, isLL ? 8 : 4); 4226 __ bind(CHECK_LAST); 4227 if (!isLL) { 4228 __ add(cnt2, cnt2, cnt2); // now in bytes 4229 } 4230 __ eor(rscratch2, tmp1, tmp2); 4231 __ cbnz(rscratch2, DIFF); 4232 __ ldr(rscratch1, Address(str1, cnt2)); 4233 __ ldr(cnt1, Address(str2, cnt2)); 4234 __ eor(rscratch2, rscratch1, cnt1); 4235 __ cbz(rscratch2, LENGTH_DIFF); 4236 // Find the first different characters in the longwords and 4237 // compute their difference. 4238 __ bind(DIFF2); 4239 __ rev(rscratch2, rscratch2); 4240 __ clz(rscratch2, rscratch2); 4241 __ andr(rscratch2, rscratch2, isLL ? -8 : -16); 4242 __ lsrv(rscratch1, rscratch1, rscratch2); 4243 if (isLL) { 4244 __ lsrv(cnt1, cnt1, rscratch2); 4245 __ uxtbw(rscratch1, rscratch1); 4246 __ uxtbw(cnt1, cnt1); 4247 } else { 4248 __ lsrv(cnt1, cnt1, rscratch2); 4249 __ uxthw(rscratch1, rscratch1); 4250 __ uxthw(cnt1, cnt1); 4251 } 4252 __ subw(result, rscratch1, cnt1); 4253 __ b(LENGTH_DIFF); 4254 __ bind(DIFF); 4255 __ rev(rscratch2, rscratch2); 4256 __ clz(rscratch2, rscratch2); 4257 __ andr(rscratch2, rscratch2, isLL ? -8 : -16); 4258 __ lsrv(tmp1, tmp1, rscratch2); 4259 if (isLL) { 4260 __ lsrv(tmp2, tmp2, rscratch2); 4261 __ uxtbw(tmp1, tmp1); 4262 __ uxtbw(tmp2, tmp2); 4263 } else { 4264 __ lsrv(tmp2, tmp2, rscratch2); 4265 __ uxthw(tmp1, tmp1); 4266 __ uxthw(tmp2, tmp2); 4267 } 4268 __ subw(result, tmp1, tmp2); 4269 __ b(LENGTH_DIFF); 4270 __ bind(LAST_CHECK_AND_LENGTH_DIFF); 4271 __ eor(rscratch2, tmp1, tmp2); 4272 __ cbnz(rscratch2, DIFF); 4273 __ bind(LENGTH_DIFF); 4274 __ ret(lr); 4275 return entry; 4276 } 4277 4278 void generate_compare_long_strings() { 4279 StubRoutines::aarch64::_compare_long_string_LL 4280 = generate_compare_long_string_same_encoding(true); 4281 StubRoutines::aarch64::_compare_long_string_UU 4282 = generate_compare_long_string_same_encoding(false); 4283 StubRoutines::aarch64::_compare_long_string_LU 4284 = generate_compare_long_string_different_encoding(true); 4285 StubRoutines::aarch64::_compare_long_string_UL 4286 = generate_compare_long_string_different_encoding(false); 4287 } 4288 4289 // R0 = result 4290 // R1 = str2 4291 // R2 = cnt1 4292 // R3 = str1 4293 // R4 = cnt2 4294 // This generic linear code use few additional ideas, which makes it faster: 4295 // 1) we can safely keep at least 1st register of pattern(since length >= 8) 4296 // in order to skip initial loading(help in systems with 1 ld pipeline) 4297 // 2) we can use "fast" algorithm of finding single character to search for 4298 // first symbol with less branches(1 branch per each loaded register instead 4299 // of branch for each symbol), so, this is where constants like 4300 // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff comes from 4301 // 3) after loading and analyzing 1st register of source string, it can be 4302 // used to search for every 1st character entry, saving few loads in 4303 // comparison with "simplier-but-slower" implementation 4304 // 4) in order to avoid lots of push/pop operations, code below is heavily 4305 // re-using/re-initializing/compressing register values, which makes code 4306 // larger and a bit less readable, however, 
most of extra operations are 4307 // issued during loads or branches, so, penalty is minimal 4308 address generate_string_indexof_linear(bool str1_isL, bool str2_isL) { 4309 const char* stubName = str1_isL 4310 ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul") 4311 : "indexof_linear_uu"; 4312 __ align(CodeEntryAlignment); 4313 StubCodeMark mark(this, "StubRoutines", stubName); 4314 address entry = __ pc(); 4315 4316 int str1_chr_size = str1_isL ? 1 : 2; 4317 int str2_chr_size = str2_isL ? 1 : 2; 4318 int str1_chr_shift = str1_isL ? 0 : 1; 4319 int str2_chr_shift = str2_isL ? 0 : 1; 4320 bool isL = str1_isL && str2_isL; 4321 // parameters 4322 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4; 4323 // temporary registers 4324 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23; 4325 RegSet spilled_regs = RegSet::range(tmp1, tmp4); 4326 // redefinitions 4327 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3; 4328 4329 __ push(spilled_regs, sp); 4330 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, 4331 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 4332 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 4333 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 4334 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 4335 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 4336 // Read whole register from str1. It is safe, because length >=8 here 4337 __ ldr(ch1, Address(str1)); 4338 // Read whole register from str2. It is safe, because length >=8 here 4339 __ ldr(ch2, Address(str2)); 4340 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF); 4341 if (str1_isL != str2_isL) { 4342 __ eor(v0, __ T16B, v0, v0); 4343 } 4344 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 4345 __ mul(first, first, tmp1); 4346 // check if we have less than 1 register to check 4347 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1); 4348 if (str1_isL != str2_isL) { 4349 __ fmovd(v1, ch1); 4350 } 4351 __ br(__ LE, L_SMALL); 4352 __ eor(ch2, first, ch2); 4353 if (str1_isL != str2_isL) { 4354 __ zip1(v1, __ T16B, v1, v0); 4355 } 4356 __ sub(tmp2, ch2, tmp1); 4357 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4358 __ bics(tmp2, tmp2, ch2); 4359 if (str1_isL != str2_isL) { 4360 __ fmovd(ch1, v1); 4361 } 4362 __ br(__ NE, L_HAS_ZERO); 4363 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 4364 __ add(result, result, wordSize/str2_chr_size); 4365 __ add(str2, str2, wordSize); 4366 __ br(__ LT, L_POST_LOOP); 4367 __ BIND(L_LOOP); 4368 __ ldr(ch2, Address(str2)); 4369 __ eor(ch2, first, ch2); 4370 __ sub(tmp2, ch2, tmp1); 4371 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4372 __ bics(tmp2, tmp2, ch2); 4373 __ br(__ NE, L_HAS_ZERO); 4374 __ BIND(L_LOOP_PROCEED); 4375 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 4376 __ add(str2, str2, wordSize); 4377 __ add(result, result, wordSize/str2_chr_size); 4378 __ br(__ GE, L_LOOP); 4379 __ BIND(L_POST_LOOP); 4380 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check 4381 __ br(__ LE, NOMATCH); 4382 __ ldr(ch2, Address(str2)); 4383 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 4384 __ eor(ch2, first, ch2); 4385 __ sub(tmp2, ch2, tmp1); 4386 __ orr(ch2, ch2, str2_isL ? 
0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4387 __ mov(tmp4, -1); // all bits set 4388 __ b(L_SMALL_PROCEED); 4389 __ align(OptoLoopAlignment); 4390 __ BIND(L_SMALL); 4391 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 4392 __ eor(ch2, first, ch2); 4393 if (str1_isL != str2_isL) { 4394 __ zip1(v1, __ T16B, v1, v0); 4395 } 4396 __ sub(tmp2, ch2, tmp1); 4397 __ mov(tmp4, -1); // all bits set 4398 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4399 if (str1_isL != str2_isL) { 4400 __ fmovd(ch1, v1); // move converted 4 symbols 4401 } 4402 __ BIND(L_SMALL_PROCEED); 4403 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits. 4404 __ bic(tmp2, tmp2, ch2); 4405 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check 4406 __ rbit(tmp2, tmp2); 4407 __ br(__ EQ, NOMATCH); 4408 __ BIND(L_SMALL_HAS_ZERO_LOOP); 4409 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's 4410 __ cmp(cnt1, u1(wordSize/str2_chr_size)); 4411 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2); 4412 if (str2_isL) { // LL 4413 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 4414 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 4415 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 4416 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 4417 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4418 } else { 4419 __ mov(ch2, 0xE); // all bits in byte set except last one 4420 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4421 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4422 __ lslv(tmp2, tmp2, tmp4); 4423 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4424 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4425 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4426 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4427 } 4428 __ cmp(ch1, ch2); 4429 __ mov(tmp4, wordSize/str2_chr_size); 4430 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4431 __ BIND(L_SMALL_CMP_LOOP); 4432 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 4433 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 4434 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 4435 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 4436 __ add(tmp4, tmp4, 1); 4437 __ cmp(tmp4, cnt1); 4438 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP); 4439 __ cmp(first, ch2); 4440 __ br(__ EQ, L_SMALL_CMP_LOOP); 4441 __ BIND(L_SMALL_CMP_LOOP_NOMATCH); 4442 __ cbz(tmp2, NOMATCH); // no more matches. exit 4443 __ clz(tmp4, tmp2); 4444 __ add(result, result, 1); // advance index 4445 __ add(str2, str2, str2_chr_size); // advance pointer 4446 __ b(L_SMALL_HAS_ZERO_LOOP); 4447 __ align(OptoLoopAlignment); 4448 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP); 4449 __ cmp(first, ch2); 4450 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4451 __ b(DONE); 4452 __ align(OptoLoopAlignment); 4453 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2); 4454 if (str2_isL) { // LL 4455 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 4456 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 
4457 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 4458 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 4459 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4460 } else { 4461 __ mov(ch2, 0xE); // all bits in byte set except last one 4462 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4463 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4464 __ lslv(tmp2, tmp2, tmp4); 4465 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4466 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4467 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4468 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4469 } 4470 __ cmp(ch1, ch2); 4471 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4472 __ b(DONE); 4473 __ align(OptoLoopAlignment); 4474 __ BIND(L_HAS_ZERO); 4475 __ rbit(tmp2, tmp2); 4476 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's 4477 // Now, perform compression of counters(cnt2 and cnt1) into one register. 4478 // It's fine because both counters are 32bit and are not changed in this 4479 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop. 4480 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2); 4481 __ sub(result, result, 1); 4482 __ BIND(L_HAS_ZERO_LOOP); 4483 __ mov(cnt1, wordSize/str2_chr_size); 4484 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2); 4485 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare 4486 if (str2_isL) { 4487 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 4488 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4489 __ lslv(tmp2, tmp2, tmp4); 4490 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4491 __ add(tmp4, tmp4, 1); 4492 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4493 __ lsl(tmp2, tmp2, 1); 4494 __ mov(tmp4, wordSize/str2_chr_size); 4495 } else { 4496 __ mov(ch2, 0xE); 4497 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4498 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4499 __ lslv(tmp2, tmp2, tmp4); 4500 __ add(tmp4, tmp4, 1); 4501 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4502 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 4503 __ lsl(tmp2, tmp2, 1); 4504 __ mov(tmp4, wordSize/str2_chr_size); 4505 __ sub(str2, str2, str2_chr_size); 4506 } 4507 __ cmp(ch1, ch2); 4508 __ mov(tmp4, wordSize/str2_chr_size); 4509 __ br(__ NE, L_CMP_LOOP_NOMATCH); 4510 __ BIND(L_CMP_LOOP); 4511 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 4512 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 4513 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 4514 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 4515 __ add(tmp4, tmp4, 1); 4516 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2); 4517 __ br(__ GE, L_CMP_LOOP_LAST_CMP); 4518 __ cmp(cnt1, ch2); 4519 __ br(__ EQ, L_CMP_LOOP); 4520 __ BIND(L_CMP_LOOP_NOMATCH); 4521 // here we're not matched 4522 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. 
Proceed to main loop 4523 __ clz(tmp4, tmp2); 4524 __ add(str2, str2, str2_chr_size); // advance pointer 4525 __ b(L_HAS_ZERO_LOOP); 4526 __ align(OptoLoopAlignment); 4527 __ BIND(L_CMP_LOOP_LAST_CMP); 4528 __ cmp(cnt1, ch2); 4529 __ br(__ NE, L_CMP_LOOP_NOMATCH); 4530 __ b(DONE); 4531 __ align(OptoLoopAlignment); 4532 __ BIND(L_CMP_LOOP_LAST_CMP2); 4533 if (str2_isL) { 4534 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 4535 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4536 __ lslv(tmp2, tmp2, tmp4); 4537 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4538 __ add(tmp4, tmp4, 1); 4539 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4540 __ lsl(tmp2, tmp2, 1); 4541 } else { 4542 __ mov(ch2, 0xE); 4543 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4544 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4545 __ lslv(tmp2, tmp2, tmp4); 4546 __ add(tmp4, tmp4, 1); 4547 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4548 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 4549 __ lsl(tmp2, tmp2, 1); 4550 __ sub(str2, str2, str2_chr_size); 4551 } 4552 __ cmp(ch1, ch2); 4553 __ br(__ NE, L_CMP_LOOP_NOMATCH); 4554 __ b(DONE); 4555 __ align(OptoLoopAlignment); 4556 __ BIND(L_HAS_ZERO_LOOP_NOMATCH); 4557 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until 4558 // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP, 4559 // so result was increased by at most wordSize/str2_chr_size - 1, so the 4560 // respective high bit wasn't changed. L_LOOP_PROCEED will increase 4561 // result by the number of characters analyzed, so we can just reset the 4562 // lower bits in result here. Clear 2 lower bits for UU/UL and 3 bits for LL 4563 // 2) restore cnt1 and cnt2 values from "compressed" cnt2 4564 // 3) advance str2 to point at the next str2 octet. result & 7/3 is the 4565 // index of the last analyzed substring inside the current octet, so str2 is at 4566 // the respective start address.
We need to advance it to next octet 4567 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed 4568 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2); 4569 __ bfm(result, zr, 0, 2 - str2_chr_shift); 4570 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2 4571 __ movw(cnt2, cnt2); 4572 __ b(L_LOOP_PROCEED); 4573 __ align(OptoLoopAlignment); 4574 __ BIND(NOMATCH); 4575 __ mov(result, -1); 4576 __ BIND(DONE); 4577 __ pop(spilled_regs, sp); 4578 __ ret(lr); 4579 return entry; 4580 } 4581 4582 void generate_string_indexof_stubs() { 4583 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true); 4584 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false); 4585 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); 4586 } 4587 4588 void inflate_and_store_2_fp_registers(bool generatePrfm, 4589 FloatRegister src1, FloatRegister src2) { 4590 Register dst = r1; 4591 __ zip1(v1, __ T16B, src1, v0); 4592 __ zip2(v2, __ T16B, src1, v0); 4593 if (generatePrfm) { 4594 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM); 4595 } 4596 __ zip1(v3, __ T16B, src2, v0); 4597 __ zip2(v4, __ T16B, src2, v0); 4598 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64))); 4599 } 4600 4601 // R0 = src 4602 // R1 = dst 4603 // R2 = len 4604 // R3 = len >> 3 4605 // V0 = 0 4606 // v1 = loaded 8 bytes 4607 address generate_large_byte_array_inflate() { 4608 __ align(CodeEntryAlignment); 4609 StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate"); 4610 address entry = __ pc(); 4611 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE; 4612 Register src = r0, dst = r1, len = r2, octetCounter = r3; 4613 const int large_loop_threshold = MAX(64, SoftwarePrefetchHintDistance)/8 + 4; 4614 4615 // do one more 8-byte read to have address 16-byte aligned in most cases 4616 // also use single store instruction 4617 __ ldrd(v2, __ post(src, 8)); 4618 __ sub(octetCounter, octetCounter, 2); 4619 __ zip1(v1, __ T16B, v1, v0); 4620 __ zip1(v2, __ T16B, v2, v0); 4621 __ st1(v1, v2, __ T16B, __ post(dst, 32)); 4622 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 4623 __ subs(rscratch1, octetCounter, large_loop_threshold); 4624 __ br(__ LE, LOOP_START); 4625 __ b(LOOP_PRFM_START); 4626 __ bind(LOOP_PRFM); 4627 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 4628 __ bind(LOOP_PRFM_START); 4629 __ prfm(Address(src, SoftwarePrefetchHintDistance)); 4630 __ sub(octetCounter, octetCounter, 8); 4631 __ subs(rscratch1, octetCounter, large_loop_threshold); 4632 inflate_and_store_2_fp_registers(true, v3, v4); 4633 inflate_and_store_2_fp_registers(true, v5, v6); 4634 __ br(__ GT, LOOP_PRFM); 4635 __ cmp(octetCounter, (u1)8); 4636 __ br(__ LT, DONE); 4637 __ bind(LOOP); 4638 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 4639 __ bind(LOOP_START); 4640 __ sub(octetCounter, octetCounter, 8); 4641 __ cmp(octetCounter, (u1)8); 4642 inflate_and_store_2_fp_registers(false, v3, v4); 4643 inflate_and_store_2_fp_registers(false, v5, v6); 4644 __ br(__ GE, LOOP); 4645 __ bind(DONE); 4646 __ ret(lr); 4647 return entry; 4648 } 4649 4650 /** 4651 * Arguments: 4652 * 4653 * Input: 4654 * c_rarg0 - current state address 4655 * c_rarg1 - H key address 4656 * c_rarg2 - data address 4657 * c_rarg3 - number of blocks 4658 * 4659 * Output: 4660 * Updated state at c_rarg0 4661 */ 4662 address generate_ghash_processBlocks() { 4663 // Bafflingly, GCM uses 
little-endian for the byte order, but 4664 // big-endian for the bit order. For example, the polynomial 1 is 4665 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 4666 // 4667 // So, we must either reverse the bytes in each word and do 4668 // everything big-endian or reverse the bits in each byte and do 4669 // it little-endian. On AArch64 it's more idiomatic to reverse 4670 // the bits in each byte (we have an instruction, RBIT, to do 4671 // that) and keep the data in little-endian bit order throught the 4672 // calculation, bit-reversing the inputs and outputs. 4673 4674 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks"); 4675 __ align(wordSize * 2); 4676 address p = __ pc(); 4677 __ emit_int64(0x87); // The low-order bits of the field 4678 // polynomial (i.e. p = z^7+z^2+z+1) 4679 // repeated in the low and high parts of a 4680 // 128-bit vector 4681 __ emit_int64(0x87); 4682 4683 __ align(CodeEntryAlignment); 4684 address start = __ pc(); 4685 4686 Register state = c_rarg0; 4687 Register subkeyH = c_rarg1; 4688 Register data = c_rarg2; 4689 Register blocks = c_rarg3; 4690 4691 FloatRegister vzr = v30; 4692 __ eor(vzr, __ T16B, vzr, vzr); // zero register 4693 4694 __ ldrq(v0, Address(state)); 4695 __ ldrq(v1, Address(subkeyH)); 4696 4697 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 4698 __ rbit(v0, __ T16B, v0); 4699 __ rev64(v1, __ T16B, v1); 4700 __ rbit(v1, __ T16B, v1); 4701 4702 __ ldrq(v26, p); 4703 4704 __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 4705 __ eor(v16, __ T16B, v16, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 4706 4707 { 4708 Label L_ghash_loop; 4709 __ bind(L_ghash_loop); 4710 4711 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 4712 // reversing each byte 4713 __ rbit(v2, __ T16B, v2); 4714 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 4715 4716 // Multiply state in v2 by subkey in v1 4717 ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 4718 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16, 4719 /*temps*/v6, v20, v18, v21); 4720 // Reduce v7:v5 by the field polynomial 4721 ghash_reduce(v0, v5, v7, v26, vzr, v20); 4722 4723 __ sub(blocks, blocks, 1); 4724 __ cbnz(blocks, L_ghash_loop); 4725 } 4726 4727 // The bit-reversed result is at this point in v0 4728 __ rev64(v1, __ T16B, v0); 4729 __ rbit(v1, __ T16B, v1); 4730 4731 __ st1(v1, __ T16B, state); 4732 __ ret(lr); 4733 4734 return start; 4735 } 4736 4737 // Continuation point for throwing of implicit exceptions that are 4738 // not handled in the current activation. Fabricates an exception 4739 // oop and initiates normal exception dispatching in this 4740 // frame. Since we need to preserve callee-saved values (currently 4741 // only for C2, but done for C1 as well) we need a callee-saved oop 4742 // map and therefore have to make these stubs into RuntimeStubs 4743 // rather than BufferBlobs. If the compiler needs all registers to 4744 // be preserved between the fault point and the exception handler 4745 // then it must assume responsibility for that in 4746 // AbstractCompiler::continuation_for_implicit_null_exception or 4747 // continuation_for_implicit_division_by_zero_exception. All other 4748 // implicit exceptions (e.g., NullPointerException or 4749 // AbstractMethodError on entry) are either at call sites or 4750 // otherwise assume that stack unwinding will be initiated, so 4751 // caller saved registers were assumed volatile in the compiler. 
4752 4753 #undef __ 4754 #define __ masm-> 4755 4756 address generate_throw_exception(const char* name, 4757 address runtime_entry, 4758 Register arg1 = noreg, 4759 Register arg2 = noreg) { 4760 // Information about frame layout at time of blocking runtime call. 4761 // Note that we only have to preserve callee-saved registers since 4762 // the compilers are responsible for supplying a continuation point 4763 // if they expect all registers to be preserved. 4764 // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0 4765 enum layout { 4766 rfp_off = 0, 4767 rfp_off2, 4768 return_off, 4769 return_off2, 4770 framesize // inclusive of return address 4771 }; 4772 4773 int insts_size = 512; 4774 int locs_size = 64; 4775 4776 CodeBuffer code(name, insts_size, locs_size); 4777 OopMapSet* oop_maps = new OopMapSet(); 4778 MacroAssembler* masm = new MacroAssembler(&code); 4779 4780 address start = __ pc(); 4781 4782 // This is an inlined and slightly modified version of call_VM 4783 // which has the ability to fetch the return PC out of 4784 // thread-local storage and also sets up last_Java_sp slightly 4785 // differently than the real call_VM 4786 4787 __ enter(); // Save FP and LR before call 4788 4789 assert(is_even(framesize/2), "sp not 16-byte aligned"); 4790 4791 // lr and fp are already in place 4792 __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog 4793 4794 int frame_complete = __ pc() - start; 4795 4796 // Set up last_Java_sp and last_Java_fp 4797 address the_pc = __ pc(); 4798 __ set_last_Java_frame(sp, rfp, the_pc, rscratch1); 4799 4800 // Call runtime 4801 if (arg1 != noreg) { 4802 assert(arg2 != c_rarg1, "clobbered"); 4803 __ mov(c_rarg1, arg1); 4804 } 4805 if (arg2 != noreg) { 4806 __ mov(c_rarg2, arg2); 4807 } 4808 __ mov(c_rarg0, rthread); 4809 BLOCK_COMMENT("call runtime_entry"); 4810 __ mov(rscratch1, runtime_entry); 4811 __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1); 4812 4813 // Generate oop map 4814 OopMap* map = new OopMap(framesize, 0); 4815 4816 oop_maps->add_gc_map(the_pc - start, map); 4817 4818 __ reset_last_Java_frame(true); 4819 __ maybe_isb(); 4820 4821 __ leave(); 4822 4823 // check for pending exceptions 4824 #ifdef ASSERT 4825 Label L; 4826 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 4827 __ cbnz(rscratch1, L); 4828 __ should_not_reach_here(); 4829 __ bind(L); 4830 #endif // ASSERT 4831 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 4832 4833 4834 // codeBlob framesize is in words (not VMRegImpl::slot_size) 4835 RuntimeStub* stub = 4836 RuntimeStub::new_runtime_stub(name, 4837 &code, 4838 frame_complete, 4839 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 4840 oop_maps, false); 4841 return stub->entry_point(); 4842 } 4843 4844 class MontgomeryMultiplyGenerator : public MacroAssembler { 4845 4846 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 4847 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 4848 4849 RegSet _toSave; 4850 bool _squaring; 4851 4852 public: 4853 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 4854 : MacroAssembler(as->code()), _squaring(squaring) { 4855 4856 // Register allocation 4857 4858 Register reg = c_rarg0; 4859 Pa_base = reg; // Argument registers 4860 if (squaring) 4861 Pb_base = Pa_base; 4862 else 4863 Pb_base = ++reg; 4864 Pn_base = ++reg; 4865 Rlen= ++reg; 4866 inv = ++reg; 4867 Pm_base = ++reg; 4868 4869 // Working registers: 4870 Ra = ++reg; // The current digit of a, b, n, and m. 
4871 Rb = ++reg; 4872 Rm = ++reg; 4873 Rn = ++reg; 4874 4875 Pa = ++reg; // Pointers to the current/next digit of a, b, n, and m. 4876 Pb = ++reg; 4877 Pm = ++reg; 4878 Pn = ++reg; 4879 4880 t0 = ++reg; // Three registers which form a 4881 t1 = ++reg; // triple-precision accumuator. 4882 t2 = ++reg; 4883 4884 Ri = ++reg; // Inner and outer loop indexes. 4885 Rj = ++reg; 4886 4887 Rhi_ab = ++reg; // Product registers: low and high parts 4888 Rlo_ab = ++reg; // of a*b and m*n. 4889 Rhi_mn = ++reg; 4890 Rlo_mn = ++reg; 4891 4892 // r19 and up are callee-saved. 4893 _toSave = RegSet::range(r19, reg) + Pm_base; 4894 } 4895 4896 private: 4897 void save_regs() { 4898 push(_toSave, sp); 4899 } 4900 4901 void restore_regs() { 4902 pop(_toSave, sp); 4903 } 4904 4905 template <typename T> 4906 void unroll_2(Register count, T block) { 4907 Label loop, end, odd; 4908 tbnz(count, 0, odd); 4909 cbz(count, end); 4910 align(16); 4911 bind(loop); 4912 (this->*block)(); 4913 bind(odd); 4914 (this->*block)(); 4915 subs(count, count, 2); 4916 br(Assembler::GT, loop); 4917 bind(end); 4918 } 4919 4920 template <typename T> 4921 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 4922 Label loop, end, odd; 4923 tbnz(count, 0, odd); 4924 cbz(count, end); 4925 align(16); 4926 bind(loop); 4927 (this->*block)(d, s, tmp); 4928 bind(odd); 4929 (this->*block)(d, s, tmp); 4930 subs(count, count, 2); 4931 br(Assembler::GT, loop); 4932 bind(end); 4933 } 4934 4935 void pre1(RegisterOrConstant i) { 4936 block_comment("pre1"); 4937 // Pa = Pa_base; 4938 // Pb = Pb_base + i; 4939 // Pm = Pm_base; 4940 // Pn = Pn_base + i; 4941 // Ra = *Pa; 4942 // Rb = *Pb; 4943 // Rm = *Pm; 4944 // Rn = *Pn; 4945 ldr(Ra, Address(Pa_base)); 4946 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 4947 ldr(Rm, Address(Pm_base)); 4948 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4949 lea(Pa, Address(Pa_base)); 4950 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 4951 lea(Pm, Address(Pm_base)); 4952 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4953 4954 // Zero the m*n result. 4955 mov(Rhi_mn, zr); 4956 mov(Rlo_mn, zr); 4957 } 4958 4959 // The core multiply-accumulate step of a Montgomery 4960 // multiplication. The idea is to schedule operations as a 4961 // pipeline so that instructions with long latencies (loads and 4962 // multiplies) have time to complete before their results are 4963 // used. This most benefits in-order implementations of the 4964 // architecture but out-of-order ones also benefit. 4965 void step() { 4966 block_comment("step"); 4967 // MACC(Ra, Rb, t0, t1, t2); 4968 // Ra = *++Pa; 4969 // Rb = *--Pb; 4970 umulh(Rhi_ab, Ra, Rb); 4971 mul(Rlo_ab, Ra, Rb); 4972 ldr(Ra, pre(Pa, wordSize)); 4973 ldr(Rb, pre(Pb, -wordSize)); 4974 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 4975 // previous iteration. 
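      // For reference, MACC(x, y, t0, t1, t2) in the pseudocode comments is the
      // 3-word multiply-accumulate sketched below (t2:t1:t0 acts as a 192-bit
      // accumulator); acc() adds an already-computed high/low product pair the
      // same way:
      //
      //   unsigned __int128 p = (unsigned __int128)x * y;
      //   unsigned __int128 s = (unsigned __int128)t0 + (uint64_t)p;
      //   t0  = (uint64_t)s;
      //   s   = (unsigned __int128)t1 + (uint64_t)(p >> 64) + (uint64_t)(s >> 64);
      //   t1  = (uint64_t)s;
      //   t2 += (uint64_t)(s >> 64);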
4976 // MACC(Rm, Rn, t0, t1, t2); 4977 // Rm = *++Pm; 4978 // Rn = *--Pn; 4979 umulh(Rhi_mn, Rm, Rn); 4980 mul(Rlo_mn, Rm, Rn); 4981 ldr(Rm, pre(Pm, wordSize)); 4982 ldr(Rn, pre(Pn, -wordSize)); 4983 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4984 } 4985 4986 void post1() { 4987 block_comment("post1"); 4988 4989 // MACC(Ra, Rb, t0, t1, t2); 4990 // Ra = *++Pa; 4991 // Rb = *--Pb; 4992 umulh(Rhi_ab, Ra, Rb); 4993 mul(Rlo_ab, Ra, Rb); 4994 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 4995 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4996 4997 // *Pm = Rm = t0 * inv; 4998 mul(Rm, t0, inv); 4999 str(Rm, Address(Pm)); 5000 5001 // MACC(Rm, Rn, t0, t1, t2); 5002 // t0 = t1; t1 = t2; t2 = 0; 5003 umulh(Rhi_mn, Rm, Rn); 5004 5005 #ifndef PRODUCT 5006 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 5007 { 5008 mul(Rlo_mn, Rm, Rn); 5009 add(Rlo_mn, t0, Rlo_mn); 5010 Label ok; 5011 cbz(Rlo_mn, ok); { 5012 stop("broken Montgomery multiply"); 5013 } bind(ok); 5014 } 5015 #endif 5016 // We have very carefully set things up so that 5017 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 5018 // the lower half of Rm * Rn because we know the result already: 5019 // it must be -t0. t0 + (-t0) must generate a carry iff 5020 // t0 != 0. So, rather than do a mul and an adds we just set 5021 // the carry flag iff t0 is nonzero. 5022 // 5023 // mul(Rlo_mn, Rm, Rn); 5024 // adds(zr, t0, Rlo_mn); 5025 subs(zr, t0, 1); // Set carry iff t0 is nonzero 5026 adcs(t0, t1, Rhi_mn); 5027 adc(t1, t2, zr); 5028 mov(t2, zr); 5029 } 5030 5031 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 5032 block_comment("pre2"); 5033 // Pa = Pa_base + i-len; 5034 // Pb = Pb_base + len; 5035 // Pm = Pm_base + i-len; 5036 // Pn = Pn_base + len; 5037 5038 if (i.is_register()) { 5039 sub(Rj, i.as_register(), len); 5040 } else { 5041 mov(Rj, i.as_constant()); 5042 sub(Rj, Rj, len); 5043 } 5044 // Rj == i-len 5045 5046 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 5047 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 5048 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 5049 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 5050 5051 // Ra = *++Pa; 5052 // Rb = *--Pb; 5053 // Rm = *++Pm; 5054 // Rn = *--Pn; 5055 ldr(Ra, pre(Pa, wordSize)); 5056 ldr(Rb, pre(Pb, -wordSize)); 5057 ldr(Rm, pre(Pm, wordSize)); 5058 ldr(Rn, pre(Pn, -wordSize)); 5059 5060 mov(Rhi_mn, zr); 5061 mov(Rlo_mn, zr); 5062 } 5063 5064 void post2(RegisterOrConstant i, RegisterOrConstant len) { 5065 block_comment("post2"); 5066 if (i.is_constant()) { 5067 mov(Rj, i.as_constant()-len.as_constant()); 5068 } else { 5069 sub(Rj, i.as_register(), len); 5070 } 5071 5072 adds(t0, t0, Rlo_mn); // The pending m*n, low part 5073 5074 // As soon as we know the least significant digit of our result, 5075 // store it. 5076 // Pm_base[i-len] = t0; 5077 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 5078 5079 // t0 = t1; t1 = t2; t2 = 0; 5080 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 5081 adc(t1, t2, zr); 5082 mov(t2, zr); 5083 } 5084 5085 // A carry in t0 after Montgomery multiplication means that we 5086 // should subtract multiples of n from our result in m. We'll 5087 // keep doing that until there is no carry. 
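  // In C terms the loop generated below is roughly (sketch only; m and n stand
  // for the word arrays at Pm_base and Pn_base, len is the length in 64-bit words):
  //
  //   while (t0 != 0) {
  //     uint64_t borrow = 0;
  //     for (size_t i = 0; i < len; i++) {   // m -= n with borrow propagation
  //       uint64_t mi = m[i], ni = n[i];
  //       m[i] = mi - ni - borrow;
  //       borrow = (mi < ni) || (mi == ni && borrow);
  //     }
  //     t0 -= borrow;
  //   }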
5088 void normalize(RegisterOrConstant len) { 5089 block_comment("normalize"); 5090 // while (t0) 5091 // t0 = sub(Pm_base, Pn_base, t0, len); 5092 Label loop, post, again; 5093 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 5094 cbz(t0, post); { 5095 bind(again); { 5096 mov(i, zr); 5097 mov(cnt, len); 5098 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 5099 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 5100 subs(zr, zr, zr); // set carry flag, i.e. no borrow 5101 align(16); 5102 bind(loop); { 5103 sbcs(Rm, Rm, Rn); 5104 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 5105 add(i, i, 1); 5106 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 5107 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 5108 sub(cnt, cnt, 1); 5109 } cbnz(cnt, loop); 5110 sbc(t0, t0, zr); 5111 } cbnz(t0, again); 5112 } bind(post); 5113 } 5114 5115 // Move memory at s to d, reversing words. 5116 // Increments d to end of copied memory 5117 // Destroys tmp1, tmp2 5118 // Preserves len 5119 // Leaves s pointing to the address which was in d at start 5120 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 5121 assert(tmp1 < r19 && tmp2 < r19, "register corruption"); 5122 5123 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 5124 mov(tmp1, len); 5125 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 5126 sub(s, d, len, ext::uxtw, LogBytesPerWord); 5127 } 5128 // where 5129 void reverse1(Register d, Register s, Register tmp) { 5130 ldr(tmp, pre(s, -wordSize)); 5131 ror(tmp, tmp, 32); 5132 str(tmp, post(d, wordSize)); 5133 } 5134 5135 void step_squaring() { 5136 // An extra ACC 5137 step(); 5138 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 5139 } 5140 5141 void last_squaring(RegisterOrConstant i) { 5142 Label dont; 5143 // if ((i & 1) == 0) { 5144 tbnz(i.as_register(), 0, dont); { 5145 // MACC(Ra, Rb, t0, t1, t2); 5146 // Ra = *++Pa; 5147 // Rb = *--Pb; 5148 umulh(Rhi_ab, Ra, Rb); 5149 mul(Rlo_ab, Ra, Rb); 5150 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 5151 } bind(dont); 5152 } 5153 5154 void extra_step_squaring() { 5155 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 5156 5157 // MACC(Rm, Rn, t0, t1, t2); 5158 // Rm = *++Pm; 5159 // Rn = *--Pn; 5160 umulh(Rhi_mn, Rm, Rn); 5161 mul(Rlo_mn, Rm, Rn); 5162 ldr(Rm, pre(Pm, wordSize)); 5163 ldr(Rn, pre(Pn, -wordSize)); 5164 } 5165 5166 void post1_squaring() { 5167 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 5168 5169 // *Pm = Rm = t0 * inv; 5170 mul(Rm, t0, inv); 5171 str(Rm, Address(Pm)); 5172 5173 // MACC(Rm, Rn, t0, t1, t2); 5174 // t0 = t1; t1 = t2; t2 = 0; 5175 umulh(Rhi_mn, Rm, Rn); 5176 5177 #ifndef PRODUCT 5178 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 5179 { 5180 mul(Rlo_mn, Rm, Rn); 5181 add(Rlo_mn, t0, Rlo_mn); 5182 Label ok; 5183 cbz(Rlo_mn, ok); { 5184 stop("broken Montgomery multiply"); 5185 } bind(ok); 5186 } 5187 #endif 5188 // We have very carefully set things up so that 5189 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 5190 // the lower half of Rm * Rn because we know the result already: 5191 // it must be -t0. t0 + (-t0) must generate a carry iff 5192 // t0 != 0. So, rather than do a mul and an adds we just set 5193 // the carry flag iff t0 is nonzero. 
5194 // 5195 // mul(Rlo_mn, Rm, Rn); 5196 // adds(zr, t0, Rlo_mn); 5197 subs(zr, t0, 1); // Set carry iff t0 is nonzero 5198 adcs(t0, t1, Rhi_mn); 5199 adc(t1, t2, zr); 5200 mov(t2, zr); 5201 } 5202 5203 void acc(Register Rhi, Register Rlo, 5204 Register t0, Register t1, Register t2) { 5205 adds(t0, t0, Rlo); 5206 adcs(t1, t1, Rhi); 5207 adc(t2, t2, zr); 5208 } 5209 5210 public: 5211 /** 5212 * Fast Montgomery multiplication. The derivation of the 5213 * algorithm is in A Cryptographic Library for the Motorola 5214 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 5215 * 5216 * Arguments: 5217 * 5218 * Inputs for multiplication: 5219 * c_rarg0 - int array elements a 5220 * c_rarg1 - int array elements b 5221 * c_rarg2 - int array elements n (the modulus) 5222 * c_rarg3 - int length 5223 * c_rarg4 - int inv 5224 * c_rarg5 - int array elements m (the result) 5225 * 5226 * Inputs for squaring: 5227 * c_rarg0 - int array elements a 5228 * c_rarg1 - int array elements n (the modulus) 5229 * c_rarg2 - int length 5230 * c_rarg3 - int inv 5231 * c_rarg4 - int array elements m (the result) 5232 * 5233 */ 5234 address generate_multiply() { 5235 Label argh, nothing; 5236 bind(argh); 5237 stop("MontgomeryMultiply total_allocation must be <= 8192"); 5238 5239 align(CodeEntryAlignment); 5240 address entry = pc(); 5241 5242 cbzw(Rlen, nothing); 5243 5244 enter(); 5245 5246 // Make room. 5247 cmpw(Rlen, 512); 5248 br(Assembler::HI, argh); 5249 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 5250 andr(sp, Ra, -2 * wordSize); 5251 5252 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 5253 5254 { 5255 // Copy input args, reversing as we go. We use Ra as a 5256 // temporary variable. 5257 reverse(Ra, Pa_base, Rlen, t0, t1); 5258 if (!_squaring) 5259 reverse(Ra, Pb_base, Rlen, t0, t1); 5260 reverse(Ra, Pn_base, Rlen, t0, t1); 5261 } 5262 5263 // Push all call-saved registers and also Pm_base which we'll need 5264 // at the end. 
5265 save_regs(); 5266 5267 #ifndef PRODUCT 5268 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 5269 { 5270 ldr(Rn, Address(Pn_base, 0)); 5271 mul(Rlo_mn, Rn, inv); 5272 subs(zr, Rlo_mn, -1); 5273 Label ok; 5274 br(EQ, ok); { 5275 stop("broken inverse in Montgomery multiply"); 5276 } bind(ok); 5277 } 5278 #endif 5279 5280 mov(Pm_base, Ra); 5281 5282 mov(t0, zr); 5283 mov(t1, zr); 5284 mov(t2, zr); 5285 5286 block_comment("for (int i = 0; i < len; i++) {"); 5287 mov(Ri, zr); { 5288 Label loop, end; 5289 cmpw(Ri, Rlen); 5290 br(Assembler::GE, end); 5291 5292 bind(loop); 5293 pre1(Ri); 5294 5295 block_comment(" for (j = i; j; j--) {"); { 5296 movw(Rj, Ri); 5297 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 5298 } block_comment(" } // j"); 5299 5300 post1(); 5301 addw(Ri, Ri, 1); 5302 cmpw(Ri, Rlen); 5303 br(Assembler::LT, loop); 5304 bind(end); 5305 block_comment("} // i"); 5306 } 5307 5308 block_comment("for (int i = len; i < 2*len; i++) {"); 5309 mov(Ri, Rlen); { 5310 Label loop, end; 5311 cmpw(Ri, Rlen, Assembler::LSL, 1); 5312 br(Assembler::GE, end); 5313 5314 bind(loop); 5315 pre2(Ri, Rlen); 5316 5317 block_comment(" for (j = len*2-i-1; j; j--) {"); { 5318 lslw(Rj, Rlen, 1); 5319 subw(Rj, Rj, Ri); 5320 subw(Rj, Rj, 1); 5321 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 5322 } block_comment(" } // j"); 5323 5324 post2(Ri, Rlen); 5325 addw(Ri, Ri, 1); 5326 cmpw(Ri, Rlen, Assembler::LSL, 1); 5327 br(Assembler::LT, loop); 5328 bind(end); 5329 } 5330 block_comment("} // i"); 5331 5332 normalize(Rlen); 5333 5334 mov(Ra, Pm_base); // Save Pm_base in Ra 5335 restore_regs(); // Restore caller's Pm_base 5336 5337 // Copy our result into caller's Pm_base 5338 reverse(Pm_base, Ra, Rlen, t0, t1); 5339 5340 leave(); 5341 bind(nothing); 5342 ret(lr); 5343 5344 return entry; 5345 } 5346 // In C, approximately: 5347 5348 // void 5349 // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[], 5350 // unsigned long Pn_base[], unsigned long Pm_base[], 5351 // unsigned long inv, int len) { 5352 // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 5353 // unsigned long *Pa, *Pb, *Pn, *Pm; 5354 // unsigned long Ra, Rb, Rn, Rm; 5355 5356 // int i; 5357 5358 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 5359 5360 // for (i = 0; i < len; i++) { 5361 // int j; 5362 5363 // Pa = Pa_base; 5364 // Pb = Pb_base + i; 5365 // Pm = Pm_base; 5366 // Pn = Pn_base + i; 5367 5368 // Ra = *Pa; 5369 // Rb = *Pb; 5370 // Rm = *Pm; 5371 // Rn = *Pn; 5372 5373 // int iters = i; 5374 // for (j = 0; iters--; j++) { 5375 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 5376 // MACC(Ra, Rb, t0, t1, t2); 5377 // Ra = *++Pa; 5378 // Rb = *--Pb; 5379 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5380 // MACC(Rm, Rn, t0, t1, t2); 5381 // Rm = *++Pm; 5382 // Rn = *--Pn; 5383 // } 5384 5385 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 5386 // MACC(Ra, Rb, t0, t1, t2); 5387 // *Pm = Rm = t0 * inv; 5388 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 5389 // MACC(Rm, Rn, t0, t1, t2); 5390 5391 // assert(t0 == 0, "broken Montgomery multiply"); 5392 5393 // t0 = t1; t1 = t2; t2 = 0; 5394 // } 5395 5396 // for (i = len; i < 2*len; i++) { 5397 // int j; 5398 5399 // Pa = Pa_base + i-len; 5400 // Pb = Pb_base + len; 5401 // Pm = Pm_base + i-len; 5402 // Pn = Pn_base + len; 5403 5404 // Ra = *++Pa; 5405 // Rb = *--Pb; 5406 // Rm = *++Pm; 5407 // Rn = *--Pn; 5408 5409 // int iters = len*2-i-1; 
5410 // for (j = i-len+1; iters--; j++) { 5411 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 5412 // MACC(Ra, Rb, t0, t1, t2); 5413 // Ra = *++Pa; 5414 // Rb = *--Pb; 5415 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5416 // MACC(Rm, Rn, t0, t1, t2); 5417 // Rm = *++Pm; 5418 // Rn = *--Pn; 5419 // } 5420 5421 // Pm_base[i-len] = t0; 5422 // t0 = t1; t1 = t2; t2 = 0; 5423 // } 5424 5425 // while (t0) 5426 // t0 = sub(Pm_base, Pn_base, t0, len); 5427 // } 5428 5429 /** 5430 * Fast Montgomery squaring. This uses asymptotically 25% fewer 5431 * multiplies than Montgomery multiplication so it should be up to 5432 * 25% faster. However, its loop control is more complex and it 5433 * may actually run slower on some machines. 5434 * 5435 * Arguments: 5436 * 5437 * Inputs: 5438 * c_rarg0 - int array elements a 5439 * c_rarg1 - int array elements n (the modulus) 5440 * c_rarg2 - int length 5441 * c_rarg3 - int inv 5442 * c_rarg4 - int array elements m (the result) 5443 * 5444 */ 5445 address generate_square() { 5446 Label argh; 5447 bind(argh); 5448 stop("MontgomeryMultiply total_allocation must be <= 8192"); 5449 5450 align(CodeEntryAlignment); 5451 address entry = pc(); 5452 5453 enter(); 5454 5455 // Make room. 5456 cmpw(Rlen, 512); 5457 br(Assembler::HI, argh); 5458 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 5459 andr(sp, Ra, -2 * wordSize); 5460 5461 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 5462 5463 { 5464 // Copy input args, reversing as we go. We use Ra as a 5465 // temporary variable. 5466 reverse(Ra, Pa_base, Rlen, t0, t1); 5467 reverse(Ra, Pn_base, Rlen, t0, t1); 5468 } 5469 5470 // Push all call-saved registers and also Pm_base which we'll need 5471 // at the end. 5472 save_regs(); 5473 5474 mov(Pm_base, Ra); 5475 5476 mov(t0, zr); 5477 mov(t1, zr); 5478 mov(t2, zr); 5479 5480 block_comment("for (int i = 0; i < len; i++) {"); 5481 mov(Ri, zr); { 5482 Label loop, end; 5483 bind(loop); 5484 cmp(Ri, Rlen); 5485 br(Assembler::GE, end); 5486 5487 pre1(Ri); 5488 5489 block_comment("for (j = (i+1)/2; j; j--) {"); { 5490 add(Rj, Ri, 1); 5491 lsr(Rj, Rj, 1); 5492 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 5493 } block_comment(" } // j"); 5494 5495 last_squaring(Ri); 5496 5497 block_comment(" for (j = i/2; j; j--) {"); { 5498 lsr(Rj, Ri, 1); 5499 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 5500 } block_comment(" } // j"); 5501 5502 post1_squaring(); 5503 add(Ri, Ri, 1); 5504 cmp(Ri, Rlen); 5505 br(Assembler::LT, loop); 5506 5507 bind(end); 5508 block_comment("} // i"); 5509 } 5510 5511 block_comment("for (int i = len; i < 2*len; i++) {"); 5512 mov(Ri, Rlen); { 5513 Label loop, end; 5514 bind(loop); 5515 cmp(Ri, Rlen, Assembler::LSL, 1); 5516 br(Assembler::GE, end); 5517 5518 pre2(Ri, Rlen); 5519 5520 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 5521 lsl(Rj, Rlen, 1); 5522 sub(Rj, Rj, Ri); 5523 sub(Rj, Rj, 1); 5524 lsr(Rj, Rj, 1); 5525 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 5526 } block_comment(" } // j"); 5527 5528 last_squaring(Ri); 5529 5530 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 5531 lsl(Rj, Rlen, 1); 5532 sub(Rj, Rj, Ri); 5533 lsr(Rj, Rj, 1); 5534 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 5535 } block_comment(" } // j"); 5536 5537 post2(Ri, Rlen); 5538 add(Ri, Ri, 1); 5539 cmp(Ri, Rlen, Assembler::LSL, 1); 5540 5541 br(Assembler::LT, loop); 5542 bind(end); 5543 block_comment("} // i"); 5544 } 5545 5546 normalize(Rlen); 5547 5548 
     mov(Ra, Pm_base);  // Save Pm_base in Ra
5549     restore_regs();    // Restore caller's Pm_base
5550
5551     // Copy our result into caller's Pm_base
5552     reverse(Pm_base, Ra, Rlen, t0, t1);
5553
5554     leave();
5555     ret(lr);
5556
5557     return entry;
5558   }
5559   // In C, approximately:
5560
5561   // void
5562   // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
5563   //                   unsigned long Pm_base[], unsigned long inv, int len) {
5564   //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
5565   //   unsigned long *Pa, *Pb, *Pn, *Pm;
5566   //   unsigned long Ra, Rb, Rn, Rm;
5567
5568   //   int i;
5569
5570   //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");
5571
5572   //   for (i = 0; i < len; i++) {
5573   //     int j;
5574
5575   //     Pa = Pa_base;
5576   //     Pb = Pa_base + i;
5577   //     Pm = Pm_base;
5578   //     Pn = Pn_base + i;
5579
5580   //     Ra = *Pa;
5581   //     Rb = *Pb;
5582   //     Rm = *Pm;
5583   //     Rn = *Pn;
5584
5585   //     int iters = (i+1)/2;
5586   //     for (j = 0; iters--; j++) {
5587   //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5588   //       MACC2(Ra, Rb, t0, t1, t2);
5589   //       Ra = *++Pa;
5590   //       Rb = *--Pb;
5591   //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5592   //       MACC(Rm, Rn, t0, t1, t2);
5593   //       Rm = *++Pm;
5594   //       Rn = *--Pn;
5595   //     }
5596   //     if ((i & 1) == 0) {
5597   //       assert(Ra == Pa_base[j], "must be");
5598   //       MACC(Ra, Ra, t0, t1, t2);
5599   //     }
5600   //     iters = i/2;
5601   //     assert(iters == i-j, "must be");
5602   //     for (; iters--; j++) {
5603   //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5604   //       MACC(Rm, Rn, t0, t1, t2);
5605   //       Rm = *++Pm;
5606   //       Rn = *--Pn;
5607   //     }
5608
5609   //     *Pm = Rm = t0 * inv;
5610   //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
5611   //     MACC(Rm, Rn, t0, t1, t2);
5612
5613   //     assert(t0 == 0, "broken Montgomery multiply");
5614
5615   //     t0 = t1; t1 = t2; t2 = 0;
5616   //   }
5617
5618   //   for (i = len; i < 2*len; i++) {
5619   //     int start = i-len+1;
5620   //     int end = start + (len - start)/2;
5621   //     int j;
5622
5623   //     Pa = Pa_base + i-len;
5624   //     Pb = Pa_base + len;
5625   //     Pm = Pm_base + i-len;
5626   //     Pn = Pn_base + len;
5627
5628   //     Ra = *++Pa;
5629   //     Rb = *--Pb;
5630   //     Rm = *++Pm;
5631   //     Rn = *--Pn;
5632
5633   //     int iters = (2*len-i-1)/2;
5634   //     assert(iters == end-start, "must be");
5635   //     for (j = start; iters--; j++) {
5636   //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
5637   //       MACC2(Ra, Rb, t0, t1, t2);
5638   //       Ra = *++Pa;
5639   //       Rb = *--Pb;
5640   //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5641   //       MACC(Rm, Rn, t0, t1, t2);
5642   //       Rm = *++Pm;
5643   //       Rn = *--Pn;
5644   //     }
5645   //     if ((i & 1) == 0) {
5646   //       assert(Ra == Pa_base[j], "must be");
5647   //       MACC(Ra, Ra, t0, t1, t2);
5648   //     }
5649   //     iters = (2*len-i)/2;
5650   //     assert(iters == len-j, "must be");
5651   //     for (; iters--; j++) {
5652   //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
5653   //       MACC(Rm, Rn, t0, t1, t2);
5654   //       Rm = *++Pm;
5655   //       Rn = *--Pn;
5656   //     }
5657   //     Pm_base[i-len] = t0;
5658   //     t0 = t1; t1 = t2; t2 = 0;
5659   //   }
5660
5661   //   while (t0)
5662   //     t0 = sub(Pm_base, Pn_base, t0, len);
5663   // }
5664 };
5665
5666
5667   // Initialization
5668   void generate_initial() {
5669     // Generates the initial stubs and initializes the entry points
5670
5671     // Entry points that exist on all platforms. Note: this is code
5672     // that could be shared among different platforms - however the
5673     // benefit seems to be smaller than the disadvantage of having a
5674     // much more complicated generator structure. See also the comment in
5675     // stubRoutines.hpp.
5676
5677     StubRoutines::_forward_exception_entry = generate_forward_exception();
5678
5679     StubRoutines::_call_stub_entry =
5680       generate_call_stub(StubRoutines::_call_stub_return_address);
5681
5682     // is referenced by megamorphic call
5683     StubRoutines::_catch_exception_entry = generate_catch_exception();
5684
5685     // Build this early so it's available for the interpreter.
5686     StubRoutines::_throw_StackOverflowError_entry =
5687       generate_throw_exception("StackOverflowError throw_exception",
5688                                CAST_FROM_FN_PTR(address,
5689                                                 SharedRuntime::throw_StackOverflowError));
5690     StubRoutines::_throw_delayed_StackOverflowError_entry =
5691       generate_throw_exception("delayed StackOverflowError throw_exception",
5692                                CAST_FROM_FN_PTR(address,
5693                                                 SharedRuntime::throw_delayed_StackOverflowError));
5694     if (UseCRC32Intrinsics) {
5695       // Set the CRC table address before generating the stubs that use it.
5696       StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
5697       StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
5698     }
5699
5700     if (UseCRC32CIntrinsics) {
5701       StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
5702     }
5703
5704     // Disabled until JDK-8210858 is fixed
5705     // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
5706     //   StubRoutines::_dlog = generate_dlog();
5707     // }
5708
5709     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
5710       StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
5711     }
5712
5713     if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
5714       StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
5715     }
5716   }
5717
5718   void generate_all() {
5719     // support for verify_oop (must happen after universe_init)
5720     StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
5721     StubRoutines::_throw_AbstractMethodError_entry =
5722       generate_throw_exception("AbstractMethodError throw_exception",
5723                                CAST_FROM_FN_PTR(address,
5724                                                 SharedRuntime::
5725                                                 throw_AbstractMethodError));
5726
5727     StubRoutines::_throw_IncompatibleClassChangeError_entry =
5728       generate_throw_exception("IncompatibleClassChangeError throw_exception",
5729                                CAST_FROM_FN_PTR(address,
5730                                                 SharedRuntime::
5731                                                 throw_IncompatibleClassChangeError));
5732
5733     StubRoutines::_throw_NullPointerException_at_call_entry =
5734       generate_throw_exception("NullPointerException at call throw_exception",
5735                                CAST_FROM_FN_PTR(address,
5736                                                 SharedRuntime::
5737                                                 throw_NullPointerException_at_call));
5738
5739     // arraycopy stubs used by compilers
5740     generate_arraycopy_stubs();
5741
5742     // has_negatives stub for large arrays.
5743     StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);
5744
5745     // array equals stub for large arrays.
5746     if (!UseSimpleArrayEquals) {
5747       StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
5748     }
5749
5750     generate_compare_long_strings();
5751
5752     generate_string_indexof_stubs();
5753
5754     // byte_array_inflate stub for large arrays.
5755 StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate(); 5756 5757 #ifdef COMPILER2 5758 if (UseMultiplyToLenIntrinsic) { 5759 StubRoutines::_multiplyToLen = generate_multiplyToLen(); 5760 } 5761 5762 if (UseSquareToLenIntrinsic) { 5763 StubRoutines::_squareToLen = generate_squareToLen(); 5764 } 5765 5766 if (UseMulAddIntrinsic) { 5767 StubRoutines::_mulAdd = generate_mulAdd(); 5768 } 5769 5770 if (UseMontgomeryMultiplyIntrinsic) { 5771 StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply"); 5772 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false); 5773 StubRoutines::_montgomeryMultiply = g.generate_multiply(); 5774 } 5775 5776 if (UseMontgomerySquareIntrinsic) { 5777 StubCodeMark mark(this, "StubRoutines", "montgomerySquare"); 5778 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true); 5779 // We use generate_multiply() rather than generate_square() 5780 // because it's faster for the sizes of modulus we care about. 5781 StubRoutines::_montgomerySquare = g.generate_multiply(); 5782 } 5783 #endif // COMPILER2 5784 5785 #ifndef BUILTIN_SIM 5786 // generate GHASH intrinsics code 5787 if (UseGHASHIntrinsics) { 5788 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); 5789 } 5790 5791 if (UseAESIntrinsics) { 5792 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 5793 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 5794 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 5795 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 5796 } 5797 5798 if (UseSHA1Intrinsics) { 5799 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress"); 5800 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB"); 5801 } 5802 if (UseSHA256Intrinsics) { 5803 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress"); 5804 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB"); 5805 } 5806 5807 // generate Adler32 intrinsics code 5808 if (UseAdler32Intrinsics) { 5809 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); 5810 } 5811 5812 // Safefetch stubs. 5813 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry, 5814 &StubRoutines::_safefetch32_fault_pc, 5815 &StubRoutines::_safefetch32_continuation_pc); 5816 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry, 5817 &StubRoutines::_safefetchN_fault_pc, 5818 &StubRoutines::_safefetchN_continuation_pc); 5819 #endif 5820 StubRoutines::aarch64::set_completed(); 5821 } 5822 5823 public: 5824 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) { 5825 if (all) { 5826 generate_all(); 5827 } else { 5828 generate_initial(); 5829 } 5830 } 5831 }; // end class declaration 5832 5833 void StubGenerator_generate(CodeBuffer* code, bool all) { 5834 StubGenerator g(code, all); 5835 }
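// Editor's note (hedged, based on the shared StubRoutines code rather than
// anything in this file): the split between generate_initial() and
// generate_all() mirrors how the VM drives this entry point in two phases
// during startup, roughly:
//
//   // Phase 1 (early in VM startup): call stub, exception handling, CRC32, ...
//   StubGenerator_generate(&code_buffer_1, false);   // -> generate_initial()
//
//   // Phase 2 (after universe_init): arraycopy, string, crypto and
//   // math intrinsic stubs, ...
//   StubGenerator_generate(&code_buffer_2, true);     // -> generate_all()
//
// code_buffer_1/code_buffer_2 are illustrative names only; the actual buffers
// are set up by the StubRoutines initialization code in stubRoutines.cpp.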