/*
 * Copyright (c) 2003, 2019, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2019, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "utilities/align.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif

#ifdef BUILTIN_SIM
#include "../../../../../../simulator/simulator.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->
#define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(int& counter) {
    __ lea(rscratch2, ExternalAddress((address)&counter));
    __ ldrw(rscratch1, Address(rscratch2));
    __ addw(rscratch1, rscratch1, 1);
    __ strw(rscratch1, Address(rscratch2));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address              address
  //    c_rarg1:   result                            address
  //    c_rarg2:   result type                       BasicType
  //    c_rarg3:   method                            Method*
  //    c_rarg4:   (interpreter) entry point         address
  //    c_rarg5:   parameters                        intptr_t*
  //    c_rarg6:   parameter size (in words)         int
  //    c_rarg7:   thread                            Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the c arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-r18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -27 [ argument word 1      ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper   (r0)  ]
  //  -7 [ result         (r1)  ]
  //  -6 [ result type    (r2)  ]
  //  -5 [ method         (r3)  ]
  //  -4 [ entry point    (r4)  ]
  //  -3 [ parameters     (r5)  ]
  //  -2 [ parameter size (r6)  ]
  //  -1 [ thread         (r7)  ]
  //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  //   1 [ saved lr       (r30) ]
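  //
  // n.b. the numbers in the diagram are word offsets from fp, so with
  // wordSize == 8 the thread slot lives at rfp - 8 (thread_off == -1)
  // and the save area ends at rfp - 26 * 8 (sp_after_call_off == -26),
  // which is exactly what the enum below and generate_call_stub
  // compute as Address(rfp, off * wordSize).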

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off = -26,

    d15_off            = -26,
    d13_off            = -24,
    d11_off            = -22,
    d9_off             = -20,

    r28_off            = -18,
    r26_off            = -16,
    r24_off            = -14,
    r22_off            = -12,
    r20_off            = -10,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0,
    retaddr_off        =   1,
  };

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    const Address sp_after_call(rfp, sp_after_call_off * wordSize);

    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);
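
    // The eight incoming C arguments saved below correspond to the
    // CallStub typedef through which the VM invokes this stub (see
    // stubRoutines.hpp), which is roughly:
    //
    //   typedef void (*CallStub)(address   link,
    //                            intptr_t* result,
    //                            BasicType result_type,
    //                            Method*   method,
    //                            address   entry_point,
    //                            intptr_t* parameters,
    //                            int       size_of_parameters,
    //                            TRAPS);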

    // stub code

    // we need a C prolog to bootstrap the x86 caller into the sim
    __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);

    address aarch64_entry = __ pc();

#ifdef BUILTIN_SIM
    // Save sender's SP for stack traces.
    __ mov(rscratch1, sp);
    __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
#endif
    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5,  entry_point);
    __ stp(c_rarg2, c_rarg3,  result_type);
    __ stp(c_rarg0, c_rarg1,  call_wrapper);

    __ stp(r20, r19,   r20_save);
    __ stp(r22, r21,   r22_save);
    __ stp(r24, r23,   r24_save);
    __ stp(r26, r25,   r26_save);
    __ stp(r28, r27,   r28_save);

    __ stpd(v9,  v8,   d9_save);
    __ stpd(v11, v10,  d11_save);
    __ stpd(v13, v12,  d13_save);
    __ stpd(v15, v14,  d15_save);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (u1)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);
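    // n.b. a worked example of the two instructions above: for a
    // 3-word parameter list sp is dropped by 3 * 8 == 24 bytes and the
    // andr with -2 * wordSize == -16 then rounds it down to the next
    // 16-byte boundary, keeping sp aligned as AArch64 requires.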

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing Method*, and current sp
    //      rmethod: Method*
    //      r13: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r13, sp);
    __ blr(c_rarg4);

    // tell the simulator we have returned to the stub

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    if (NotifySimulator) {
      __ notify(Assembler::method_reentry);
    }
    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    __ ldr(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ldr(j_rarg1, result_type);
    __ cmp(j_rarg1, (u1)T_OBJECT);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, (u1)T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, (u1)T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(j_rarg1, (u1)T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    // restore callee-save registers
    __ ldpd(v15, v14,   d15_save);
    __ ldpd(v13, v12,   d13_save);
    __ ldpd(v11, v10,   d11_save);
    __ ldpd(v9,  v8,    d9_save);

    __ ldp(r28, r27,   r28_save);
    __ ldp(r26, r25,   r26_save);
    __ ldp(r24, r23,   r24_save);
    __ ldp(r22, r21,   r22_save);
    __ ldp(r20, r19,   r20_save);

    __ ldp(c_rarg0, c_rarg1,  call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3,  method);
    __ ldp(c_rarg4, c_rarg5,  entry_point);
    __ ldp(c_rarg6, c_rarg7,  parameter_size);

#ifndef PRODUCT
    // tell the simulator we are about to end Java execution
    if (NotifySimulator) {
      __ notify(Assembler::method_exit);
    }
#endif
    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT

    __ BIND(is_long);
    __ str(r0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    return start;
  }
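
  // For reference, JavaCalls::call_helper() reaches the stub above
  // through StubRoutines::call_stub(); the call site looks roughly
  // like this (a sketch of javaCalls.cpp, not code from this file):
  //
  //   StubRoutines::call_stub()(
  //     (address)&link,              // call wrapper
  //     result_val_address,          // result
  //     result_type,
  //     method(),
  //     entry_point,
  //     args->parameters(),
  //     args->size_of_parameters(),
  //     CHECK                        // current thread
  //   );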

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // rsp.
  //
  // r0: exception oop

  // NOTE: this is used as a target from the signal handler so it
  // needs an x86 prolog which returns into the current simulator
  // executing the generated catch_exception code. so the prolog
  // needs to install rax in a sim register and adjust the sim's
  // restart pc to enter the generated code at the start position
  // then return from native to simulated execution.

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread        (rfp, thread_off         * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);
    __ mov(r19, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {

    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is NULL it is OK

    // Check if the oop is in the right area of memory
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
    __ andr(c_rarg2, r0, c_rarg3);
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());

    // Compare c_rarg2 and c_rarg3.  We don't use a compare
    // instruction here because the flags register is live.
    __ eor(c_rarg2, c_rarg2, c_rarg3);
    __ cbnz(c_rarg2, error);
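    // n.b. the oop passes this check exactly when
    // (obj & verify_oop_mask()) == verify_oop_bits(): the eor above
    // yields zero in that case and nonzero (branch to error) otherwise.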

    // make sure klass is 'reasonable', which is not zero.
    __ load_klass(r0, r0); // get klass
    __ cbz(r0, error);     // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blrt(rscratch1, 3, 0, 1);

    return start;
  }

  void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }

  // The inner part of zero_words().  This is the bulk operation,
  // zeroing words in blocks, possibly using DC ZVA to do it.  The
  // caller is responsible for zeroing the last few words.
  //
  // Inputs:
  // r10: the HeapWord-aligned base address of an array to zero.
  // r11: the count in HeapWords, r11 > 0.
  //
  // Returns r10 and r11, adjusted for the caller to clear.
  // r10: the base address of the tail of words left to clear.
  // r11: the number of words in the tail.
  //      r11 < MacroAssembler::zero_words_block_size.

  address generate_zero_blocks() {
    Label done;
    Label base_aligned;

    Register base = r10, cnt = r11;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "zero_blocks");
    address start = __ pc();

    if (UseBlockZeroing) {
      int zva_length = VM_Version::zva_length();

      // Ensure ZVA length can be divided by 16. This is required by
      // the subsequent operations.
      assert (zva_length % 16 == 0, "Unexpected ZVA Length");

      __ tbz(base, 3, base_aligned);
      __ str(zr, Address(__ post(base, 8)));
      __ sub(cnt, cnt, 1);
      __ bind(base_aligned);

      // Ensure count >= zva_length * 2 so that it still deserves a zva after
      // alignment.
      Label small;
      int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
      __ subs(rscratch1, cnt, low_limit >> 3);
      __ br(Assembler::LT, small);
      __ zero_dcache_blocks(base, cnt);
      __ bind(small);
    }

    {
      // Number of stp instructions we'll unroll
      const int unroll =
        MacroAssembler::zero_words_block_size / 2;
      // Clear the remaining blocks.
      Label loop;
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::LT, done);
      __ bind(loop);
      for (int i = 0; i < unroll; i++)
        __ stp(zr, zr, __ post(base, 16));
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::GE, loop);
      __ bind(done);
      __ add(cnt, cnt, unroll * 2);
    }

    __ ret(lr);

    return start;
  }
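
  // n.b. a worked example of the ZVA limit above: with a 64-byte ZVA
  // line, low_limit is MAX2(128, BlockZeroingLowLimit) bytes and the
  // subs compares cnt against low_limit >> 3, i.e. 16 words for the
  // 128-byte case, so buffers smaller than two ZVA lines skip
  // zero_dcache_blocks entirely.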


  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(Label &start, Register s, Register d, Register count,
                           copy_direction direction) {
    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4 : 2) * wordSize;

    int offset;
    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r10, t6 = r11, t7 = r12;
    const Register stride = r13;

    assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1);

    Label again, drain;
    const char *stub_name;
    if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
    else
      stub_name = "backward_copy_longs";

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", stub_name);

    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, (u1)8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }
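    // n.b. the register-based stride is only needed backwards: a
    // negative prefetch offset beyond -256 cannot be encoded as a prfm
    // immediate, so larger distances are kept in a register instead.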

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
        __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
      } else {
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        __ stp(t0, t1, Address(d, 2 * unit));
        __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.

      if (direction == copy_forwards) {
        __ sub(s, s, 16);
        __ sub(d, d, 8);
      }

      // Fill 8 registers
      //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 bit block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
      //
      // t0 at offset 0,  t1 at offset 8
      // t2 at offset 16, t3 at offset 24
      // t4 at offset 32, t5 at offset 40
      // t6 at offset 48, t7 at offset 56

      // for backwards copy s was not offset so the register contents
      // are at these offsets into the preceding 64 byte block
      // relative to that original input and so on for each successive
      // preceding 64 byte block when s is updated. this explains the
      // slightly counter-intuitive looking pattern of register usage
      // in the stp instructions for backwards copy.
      //
      // t0 at offset -16, t1 at offset -8
      // t2 at offset -32, t3 at offset -24
      // t4 at offset -48, t5 at offset -40
      // t6 at offset -64, t7 at offset -56

      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));

      __ subs(count, count, 16);
      __ br(Assembler::LO, drain);

      int prefetch = PrefetchCopyIntervalInBytes;
      bool use_stride = false;
      if (direction == copy_backwards) {
        use_stride = prefetch > 256;
        prefetch = -prefetch;
        if (use_stride) __ mov(stride, prefetch);
      }

      __ bind(again);

      if (PrefetchCopyIntervalInBytes > 0)
        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
        // allowing for the offset of -8 the store instructions place
        // registers into the target 64 bit block at the following
        // offsets
        //
        // t0 at offset 0
        // t1 at offset 8,  t2 at offset 16
        // t3 at offset 24, t4 at offset 32
        // t5 at offset 40, t6 at offset 48
        // t7 at offset 56

        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
        // d was not offset when we started so the registers are
        // written into the 64 bit block preceding d with the following
        // offsets
        //
        // t1 at offset -8
        // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
        // t7 at offset -56, t4 at offset -48
        // t6 at offset -64
        //
        // note that this matches the offsets previously noted for the
        // loads

        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
      } else {
        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
      }
      // now we need to copy any remaining part block which may
      // include a 4 word block subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ stp(t1, t2, Address(d, 2 * unit));
          __ str(t3, Address(__ pre(d, 4 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ stp(t3, t0, Address(d, 3 * unit));
          __ str(t2, Address(__ pre(d, 4 * unit)));
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ str(t1, Address(__ pre(d, 2 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ str(t0, Address(__ pre(d, 2 * unit)));
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }
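
  // n.b. a worked example of the count postcondition of the stub
  // above: for count == 13 it copies one 8-word block plus the 4-word
  // subblock (bit 2 set), leaving bit 0 of count set -- the single
  // remaining word is the caller's responsibility (see copy_memory).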

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;
    int unit = wordSize * direction;

    Label Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.

    __ tbz(count, 3 - exact_log2(granularity), Lword);
    __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
    __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }

  Label copy_f, copy_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //
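  // n.b. step is the signed element size, so e.g. a conjoint int copy
  // runs with step == -4 (granularity 4, copying backwards) while a
  // disjoint long copy runs with step == 8 (see generate_disjoint_copy
  // and generate_conjoint_copy below, which pass size and -size).
  //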

  void copy_memory(bool is_aligned, Register s, Register d,
                   Register count, Register tmp, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    // <= 96 bytes do inline. Direction doesn't matter because we always
    // load all the data before writing anything
    Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
    const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
    const Register send = r17, dend = r18;

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
    __ cmp(count, u1((UseSIMDForMemoryOps ? 96 : 80) / granularity));
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16 / granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64 / granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32 / granularity));
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 0));
      __ ldpq(v2, v3, Address(send, -32));
      __ stpq(v0, v1, Address(d, 0));
      __ stpq(v2, v3, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(send, -32));
      __ ldp(t6, t7, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(dend, -32));
      __ stp(t6, t7, Address(dend, -16));
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    __ ldp(t0, t1, Address(s, 0));
    __ ldp(t2, t3, Address(send, -16));
    __ stp(t0, t1, Address(d, 0));
    __ stp(t2, t3, Address(dend, -16));
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
      __ ldpq(v4, v5, Address(send, -32));
      __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
      __ stpq(v4, v5, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(s, 32));
      __ ldp(t6, t7, Address(s, 48));
      __ ldp(t8, t9, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(d, 32));
      __ stp(t6, t7, Address(d, 48));
      __ stp(t8, t9, Address(dend, -16));
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, u1(8 / granularity));
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    __ ldr(t0, Address(s, 0));
    __ ldr(t1, Address(send, -8));
    __ str(t0, Address(d, 0));
    __ str(t1, Address(dend, -8));
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean in the 1 byte case we load/store the same
          // byte 3 times.
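          // n.b. a worked example for count == 3: after the lsr below
          // count/2 == 1, so the three strb instructions write d+0,
          // dend-1 == d+2 and d+1, covering all three bytes with no
          // branches.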
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }

    __ bind(copy_big);
    if (is_backwards) {
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now we've got the small case out of the way we can align the
    // source address on a 2-word boundary.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
      __ tbz(s, exact_log2(wordSize), aligned);
      __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
      __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
      __ sub(count, count, wordSize / granularity);
    } else {
      if (is_backwards) {
        __ andr(rscratch2, s, 2 * wordSize - 1);
      } else {
        __ neg(rscratch2, s);
        __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
      }
      // rscratch2 is the byte adjustment needed to align s.
      __ cbz(rscratch2, aligned);
      int shift = exact_log2(granularity);
      if (shift)  __ lsr(rscratch2, rscratch2, shift);
      __ sub(count, count, rscratch2);

#if 0
      // ?? This code is only correct for a disjoint copy.  It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, rscratch2);
        __ sub(d, d, rscratch2);
      } else {
        __ add(s, s, rscratch2);
        __ add(d, d, rscratch2);
      }
#else
      copy_memory_small(s, d, rscratch2, rscratch1, step);
#endif
    }

    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes.  Adjust the
    // count and do a bulk copy of words.
    __ lsr(rscratch2, count, exact_log2(wordSize / granularity));
    if (direction == copy_forwards)
      __ bl(copy_f);
    else
      __ bl(copy_b);

    // And the tail.
    copy_memory_small(s, d, count, tmp, step);

    if (granularity >= 8) __ bind(copy8);
    if (granularity >= 4) __ bind(copy4);
    __ bind(finish);
  }


  void clobber_registers() {
#ifdef ASSERT
    __ mov(rscratch1, (uint64_t)0xdeadbeef);
    __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
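    // rscratch1 now holds the 64-bit fill pattern 0xdeadbeefdeadbeef
    // (the 32-bit value or'ed with itself shifted left 32 bits).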
    for (Register r = r3; r <= r18; r++)
      if (r != rscratch1) __ mov(r, rscratch1);
#endif
  }

  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers rscratch1 and rscratch2.
  void verify_oop_array (size_t size, Register a, Register count, Register temp) {
    Label loop, end;
    __ mov(rscratch1, a);
    __ mov(rscratch2, zr);
    __ bind(loop);
    __ cmp(rscratch2, count);
    __ br(Assembler::HS, end);
    if (size == (size_t)wordSize) {
      __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ verify_oop(temp);
    } else {
      __ ldrw(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ decode_heap_oop(temp); // calls verify_oop
    }
    __ add(rscratch2, rscratch2, size);
    __ b(loop);
    __ bind(end);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
                                 const char *name, bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_reg = RegSet::of(s, d, count);
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_reg);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    copy_memory(aligned, s, d, count, rscratch1, size);

    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
    }

    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());

    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }
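
  // The generators above and below are not called directly at
  // runtime; generate_arraycopy_stubs() (later in this file) wires
  // their entry points into StubRoutines, roughly:
  //
  //   StubRoutines::_jbyte_disjoint_arraycopy =
  //       generate_disjoint_byte_copy(false, &entry, "jbyte_disjoint_arraycopy");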

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
                                 address *entry, const char *name,
                                 bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_regs = RegSet::of(s, d, count);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // use fwd copy when (d-s) above_equal (count*size)
    __ sub(rscratch1, d, s);
    __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
    __ br(Assembler::HS, nooverlap_target);
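    // n.b. the comparison is unsigned, so it also dispatches to the
    // forward copy when d is below s (d - s wraps to a huge value);
    // in both cases a forward copy reads every source word before it
    // can be overwritten.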

    DecoratorSet decorators = IN_HEAP | IS_ARRAY;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, d, count, saved_regs);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    copy_memory(aligned, s, d, count, rscratch1, -size);
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
    }
    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
  //
  address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                      address* entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_short_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_short_copy().
  //
  address generate_disjoint_short_copy(bool aligned,
                                       address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
                                       address *entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_int_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
                                     address *entry, const char *name,
                                     bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
  }


  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_long_copy(bool aligned, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_long_copy(bool aligned,
                                      address nooverlap_target, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_oop_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_oop_copy(bool aligned,
                                     address nooverlap_target, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
                                  name, dest_uninitialized);
  }


  // Helper for generating a dynamic type check.
  // Smashes rscratch1, rscratch2.
  void generate_type_check(Register sub_klass,
                           Register super_check_offset,
                           Register super_klass,
                           Label& L_success) {
    assert_different_registers(sub_klass, super_check_offset, super_klass);

    BLOCK_COMMENT("type_check:");

    Label L_miss;

    __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,  &L_success, &L_miss, NULL,
                                     super_check_offset);
    __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);

    // Fall through on failure!
    __ BIND(L_miss);
  }
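
  // n.b. the fast path above covers the common cases (trivial equality
  // and a hit at super_check_offset); when it cannot decide it falls
  // through to the slow path, which scans the secondary supers.
  // Either routine may branch straight to L_success; failure falls
  // through to L_miss and thus out of generate_type_check.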
1746 1747 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1748 copied_oop, r19_klass, count_save); 1749 1750 __ align(CodeEntryAlignment); 1751 StubCodeMark mark(this, "StubRoutines", name); 1752 address start = __ pc(); 1753 1754 __ enter(); // required for proper stackwalking of RuntimeStub frame 1755 1756 #ifdef ASSERT 1757 // caller guarantees that the arrays really are different 1758 // otherwise, we would have to make conjoint checks 1759 { Label L; 1760 array_overlap_test(L, TIMES_OOP); 1761 __ stop("checkcast_copy within a single array"); 1762 __ bind(L); 1763 } 1764 #endif //ASSERT 1765 1766 // Caller of this entry point must set up the argument registers. 1767 if (entry != NULL) { 1768 *entry = __ pc(); 1769 BLOCK_COMMENT("Entry:"); 1770 } 1771 1772 // Empty array: Nothing to do. 1773 __ cbz(count, L_done); 1774 1775 __ push(RegSet::of(r18, r19, r20, r21), sp); 1776 1777 #ifdef ASSERT 1778 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1779 // The ckoff and ckval must be mutually consistent, 1780 // even though caller generates both. 1781 { Label L; 1782 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1783 __ ldrw(start_to, Address(ckval, sco_offset)); 1784 __ cmpw(ckoff, start_to); 1785 __ br(Assembler::EQ, L); 1786 __ stop("super_check_offset inconsistent"); 1787 __ bind(L); 1788 } 1789 #endif //ASSERT 1790 1791 // Note: checkcast arraycopy is always disjoint. If it were not, then we wouldn't 1792 // need to checkcast. 1793 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 1794 bool is_oop = true; 1795 if (dest_uninitialized) { 1796 decorators |= IS_DEST_UNINITIALIZED; 1797 } 1798 1799 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1800 bs->arraycopy_prologue(_masm, decorators, is_oop, to, count, wb_pre_saved_regs); 1801 1802 // save the original count 1803 __ mov(count_save, count); 1804 1805 // Copy from low to high addresses 1806 __ mov(start_to, to); // Save destination array start address 1807 __ b(L_load_element); 1808 1809 // ======== begin loop ======== 1810 // (Loop is rotated; its entry is L_load_element.) 1811 // Loop control: 1812 // for (; count != 0; count--) { 1813 // copied_oop = load_heap_oop(from++); 1814 // ... generate_type_check ...; 1815 // store_heap_oop(to++, copied_oop); 1816 // } 1817 __ align(OptoLoopAlignment); 1818 1819 __ BIND(L_store_element); 1820 __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW); // store the oop 1821 __ sub(count, count, 1); 1822 __ cbz(count, L_do_card_marks); 1823 1824 // ======== loop entry is here ======== 1825 __ BIND(L_load_element); 1826 __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop 1827 __ cbz(copied_oop, L_store_element); 1828 1829 __ load_klass(r19_klass, copied_oop);// query the object klass 1830 generate_type_check(r19_klass, ckoff, ckval, L_store_element); 1831 // ======== end loop ======== 1832 1833 // It was a real error; we must depend on the caller to finish the job. 1834 // Register count = remaining oops, count_orig = total oops. 1835 // Emit GC store barriers for the oops we have copied and report 1836 // their number to the caller. 
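    // Added note (illustration): the eon below computes r0 = -1 ^ K, i.e. ~K,
    // so a caller can recover the partial transfer count as
    //
    //   size_t K = (size_t)~r0;   // == -1 - r0; r0 == 0 still means full success
    //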
1837
1838     __ subs(count, count_save, count);     // K = partially copied oop count
1839     __ eon(count, count, zr);              // report (-1^K) to caller
1840     __ br(Assembler::EQ, L_done_pop);
1841
1842     __ BIND(L_do_card_marks);
1843     bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);
1844
1845     __ bind(L_done_pop);
1846     __ pop(RegSet::of(r18, r19, r20, r21), sp);
1847     inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);
1848
1849     __ bind(L_done);
1850     __ mov(r0, count);
1851     __ leave();
1852     __ ret(lr);
1853
1854     return start;
1855   }
1856
1857   // Perform range checks on the proposed arraycopy.
1858   // Kills temp, but nothing else.
1859   // Also, clean the sign bits of src_pos and dst_pos.
1860   void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
1861                               Register src_pos, // source position (c_rarg1)
1862                               Register dst,     // destination array oop (c_rarg2)
1863                               Register dst_pos, // destination position (c_rarg3)
1864                               Register length,
1865                               Register temp,
1866                               Label& L_failed) {
1867     BLOCK_COMMENT("arraycopy_range_checks:");
1868
1869     assert_different_registers(rscratch1, temp);
1870
1871     // if (src_pos + length > arrayOop(src)->length()) FAIL;
1872     __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
1873     __ addw(temp, length, src_pos);
1874     __ cmpw(temp, rscratch1);
1875     __ br(Assembler::HI, L_failed);
1876
1877     // if (dst_pos + length > arrayOop(dst)->length()) FAIL;
1878     __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
1879     __ addw(temp, length, dst_pos);
1880     __ cmpw(temp, rscratch1);
1881     __ br(Assembler::HI, L_failed);
1882
1883     // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
1884     __ movw(src_pos, src_pos);
1885     __ movw(dst_pos, dst_pos);
1886
1887     BLOCK_COMMENT("arraycopy_range_checks done");
1888   }
1889
1890   // These stubs get called from some dumb test routine.
1891   // I'll write them properly when they're called from
1892   // something that's actually doing something.
1893   static void fake_arraycopy_stub(address src, address dst, int count) {
1894     assert(count == 0, "huh?");
1895   }
1896
1897
1898   //
1899   // Generate 'unsafe' array copy stub
1900   // Though just as safe as the other stubs, it takes an unscaled
1901   // size_t argument instead of an element count.
1902   //
1903   // Input:
1904   //   c_rarg0 - source array address
1905   //   c_rarg1 - destination array address
1906   //   c_rarg2 - byte count, treated as ssize_t, can be zero
1907   //
1908   // Examines the alignment of the operands and dispatches
1909   // to a long, int, short, or byte copy loop.
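  // An equivalent C sketch of the dispatch below (added illustration only;
  // the copy entries behave like memcpy variants taking an element count):
  //
  //   size_t bits = (size_t)s | (size_t)d | count;
  //   if      ((bits & (BytesPerLong - 1)) == 0) long_copy (s, d, count >> 3);
  //   else if ((bits & (BytesPerInt  - 1)) == 0) int_copy  (s, d, count >> 2);
  //   else if ((bits & 1) == 0)                  short_copy(s, d, count >> 1);
  //   else                                       byte_copy (s, d, count);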
1910 // 1911 address generate_unsafe_copy(const char *name, 1912 address byte_copy_entry, 1913 address short_copy_entry, 1914 address int_copy_entry, 1915 address long_copy_entry) { 1916 Label L_long_aligned, L_int_aligned, L_short_aligned; 1917 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1918 1919 __ align(CodeEntryAlignment); 1920 StubCodeMark mark(this, "StubRoutines", name); 1921 address start = __ pc(); 1922 __ enter(); // required for proper stackwalking of RuntimeStub frame 1923 1924 // bump this on entry, not on exit: 1925 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 1926 1927 __ orr(rscratch1, s, d); 1928 __ orr(rscratch1, rscratch1, count); 1929 1930 __ andr(rscratch1, rscratch1, BytesPerLong-1); 1931 __ cbz(rscratch1, L_long_aligned); 1932 __ andr(rscratch1, rscratch1, BytesPerInt-1); 1933 __ cbz(rscratch1, L_int_aligned); 1934 __ tbz(rscratch1, 0, L_short_aligned); 1935 __ b(RuntimeAddress(byte_copy_entry)); 1936 1937 __ BIND(L_short_aligned); 1938 __ lsr(count, count, LogBytesPerShort); // size => short_count 1939 __ b(RuntimeAddress(short_copy_entry)); 1940 __ BIND(L_int_aligned); 1941 __ lsr(count, count, LogBytesPerInt); // size => int_count 1942 __ b(RuntimeAddress(int_copy_entry)); 1943 __ BIND(L_long_aligned); 1944 __ lsr(count, count, LogBytesPerLong); // size => long_count 1945 __ b(RuntimeAddress(long_copy_entry)); 1946 1947 return start; 1948 } 1949 1950 // 1951 // Generate generic array copy stubs 1952 // 1953 // Input: 1954 // c_rarg0 - src oop 1955 // c_rarg1 - src_pos (32-bits) 1956 // c_rarg2 - dst oop 1957 // c_rarg3 - dst_pos (32-bits) 1958 // c_rarg4 - element count (32-bits) 1959 // 1960 // Output: 1961 // r0 == 0 - success 1962 // r0 == -1^K - failure, where K is partial transfer count 1963 // 1964 address generate_generic_copy(const char *name, 1965 address byte_copy_entry, address short_copy_entry, 1966 address int_copy_entry, address oop_copy_entry, 1967 address long_copy_entry, address checkcast_copy_entry) { 1968 1969 Label L_failed, L_objArray; 1970 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 1971 1972 // Input registers 1973 const Register src = c_rarg0; // source array oop 1974 const Register src_pos = c_rarg1; // source position 1975 const Register dst = c_rarg2; // destination array oop 1976 const Register dst_pos = c_rarg3; // destination position 1977 const Register length = c_rarg4; 1978 1979 1980 // Registers used as temps 1981 const Register dst_klass = c_rarg5; 1982 1983 __ align(CodeEntryAlignment); 1984 1985 StubCodeMark mark(this, "StubRoutines", name); 1986 1987 address start = __ pc(); 1988 1989 __ enter(); // required for proper stackwalking of RuntimeStub frame 1990 1991 // bump this on entry, not on exit: 1992 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 1993 1994 //----------------------------------------------------------------------- 1995 // Assembler stub will be used for this call to arraycopy 1996 // if the following conditions are met: 1997 // 1998 // (1) src and dst must not be null. 1999 // (2) src_pos must not be negative. 2000 // (3) dst_pos must not be negative. 2001 // (4) length must not be negative. 2002 // (5) src klass and dst klass should be the same and not NULL. 2003 // (6) src and dst should be arrays. 2004 // (7) src_pos + length must not exceed length of src. 2005 // (8) dst_pos + length must not exceed length of dst. 
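  //
  // Added outline (illustrative, not the emitted control flow): the code
  // below validates these conditions and then dispatches:
  //
  //   if (any of (1)..(8) fails)                 return -1;
  //   if (same primitive typeArray klass)        copy by log2(element size);
  //   else if (both objArrays of the same klass) plain oop arraycopy;
  //   else                                       checkcast arraycopy using
  //                                              dst's element klass;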
2006     //
2007
2008     // if (src == NULL) return -1;
2009     __ cbz(src, L_failed);
2010
2011     // if (src_pos < 0) return -1;
2012     __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set
2013
2014     // if (dst == NULL) return -1;
2015     __ cbz(dst, L_failed);
2016
2017     // if (dst_pos < 0) return -1;
2018     __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set
2019
2020     // registers used as temp
2021     const Register scratch_length    = r16; // elements count to copy
2022     const Register scratch_src_klass = r17; // array klass
2023     const Register lh                = r18; // layout helper
2024
2025     // if (length < 0) return -1;
2026     __ movw(scratch_length, length);        // length (elements count, 32-bits value)
2027     __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set
2028
2029     __ load_klass(scratch_src_klass, src);
2030 #ifdef ASSERT
2031     // assert(src->klass() != NULL);
2032     {
2033       BLOCK_COMMENT("assert klasses not null {");
2034       Label L1, L2;
2035       __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
2036       __ bind(L1);
2037       __ stop("broken null klass");
2038       __ bind(L2);
2039       __ load_klass(rscratch1, dst);
2040       __ cbz(rscratch1, L1);     // this would be broken also
2041       BLOCK_COMMENT("} assert klasses not null done");
2042     }
2043 #endif
2044
2045     // Load layout helper (32-bits)
2046     //
2047     //  |array_tag|     | header_size | element_type |     |log2_element_size|
2048     // 32        30    24            16              8     2                 0
2049     //
2050     //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
2051     //
2052
2053     const int lh_offset = in_bytes(Klass::layout_helper_offset());
2054
2055     // Handle objArrays completely differently...
2056     const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
2057     __ ldrw(lh, Address(scratch_src_klass, lh_offset));
2058     __ movw(rscratch1, objArray_lh);
2059     __ eorw(rscratch2, lh, rscratch1);
2060     __ cbzw(rscratch2, L_objArray);
2061
2062     // if (src->klass() != dst->klass()) return -1;
2063     __ load_klass(rscratch2, dst);
2064     __ eor(rscratch2, rscratch2, scratch_src_klass);
2065     __ cbnz(rscratch2, L_failed);
2066
2067     // if (!src->is_Array()) return -1;
2068     __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)
2069
2070     // At this point, it is known to be a typeArray (array_tag 0x3).
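    // Illustrative decoding of the layout helper for the address arithmetic
    // that follows (added sketch; shift/mask constants as declared in klass.hpp):
    //
    //   int hsize = (lh >> _lh_header_size_shift) & _lh_header_size_mask;  // header bytes
    //   int l2es  = lh & _lh_log2_element_size_mask;                       // log2(element size)
    //   src_addr  = src + hsize + (src_pos << l2es);
    //   dst_addr  = dst + hsize + (dst_pos << l2es);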
2071 #ifdef ASSERT 2072 { 2073 BLOCK_COMMENT("assert primitive array {"); 2074 Label L; 2075 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2076 __ cmpw(lh, rscratch2); 2077 __ br(Assembler::GE, L); 2078 __ stop("must be a primitive array"); 2079 __ bind(L); 2080 BLOCK_COMMENT("} assert primitive array done"); 2081 } 2082 #endif 2083 2084 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2085 rscratch2, L_failed); 2086 2087 // TypeArrayKlass 2088 // 2089 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2090 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2091 // 2092 2093 const Register rscratch1_offset = rscratch1; // array offset 2094 const Register r18_elsize = lh; // element size 2095 2096 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2097 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2098 __ add(src, src, rscratch1_offset); // src array offset 2099 __ add(dst, dst, rscratch1_offset); // dst array offset 2100 BLOCK_COMMENT("choose copy loop based on element size"); 2101 2102 // next registers should be set before the jump to corresponding stub 2103 const Register from = c_rarg0; // source array address 2104 const Register to = c_rarg1; // destination array address 2105 const Register count = c_rarg2; // elements count 2106 2107 // 'from', 'to', 'count' registers should be set in such order 2108 // since they are the same as 'src', 'src_pos', 'dst'. 2109 2110 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2111 2112 // The possible values of elsize are 0-3, i.e. exact_log2(element 2113 // size in bytes). We do a simple bitwise binary search. 2114 __ BIND(L_copy_bytes); 2115 __ tbnz(r18_elsize, 1, L_copy_ints); 2116 __ tbnz(r18_elsize, 0, L_copy_shorts); 2117 __ lea(from, Address(src, src_pos));// src_addr 2118 __ lea(to, Address(dst, dst_pos));// dst_addr 2119 __ movw(count, scratch_length); // length 2120 __ b(RuntimeAddress(byte_copy_entry)); 2121 2122 __ BIND(L_copy_shorts); 2123 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2124 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2125 __ movw(count, scratch_length); // length 2126 __ b(RuntimeAddress(short_copy_entry)); 2127 2128 __ BIND(L_copy_ints); 2129 __ tbnz(r18_elsize, 0, L_copy_longs); 2130 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2131 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2132 __ movw(count, scratch_length); // length 2133 __ b(RuntimeAddress(int_copy_entry)); 2134 2135 __ BIND(L_copy_longs); 2136 #ifdef ASSERT 2137 { 2138 BLOCK_COMMENT("assert long copy {"); 2139 Label L; 2140 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize 2141 __ cmpw(r18_elsize, LogBytesPerLong); 2142 __ br(Assembler::EQ, L); 2143 __ stop("must be long copy, but elsize is wrong"); 2144 __ bind(L); 2145 BLOCK_COMMENT("} assert long copy done"); 2146 } 2147 #endif 2148 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2149 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2150 __ movw(count, scratch_length); // length 2151 __ b(RuntimeAddress(long_copy_entry)); 2152 2153 // ObjArrayKlass 2154 __ BIND(L_objArray); 2155 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2156 2157 Label L_plain_copy, L_checkcast_copy; 2158 // test array classes for subtyping 2159 __ load_klass(r18, dst); 2160 __ cmp(scratch_src_klass, r18); // usual case is exact 
equality 2161 __ br(Assembler::NE, L_checkcast_copy); 2162 2163 // Identically typed arrays can be copied without element-wise checks. 2164 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2165 rscratch2, L_failed); 2166 2167 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2168 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2169 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2170 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2171 __ movw(count, scratch_length); // length 2172 __ BIND(L_plain_copy); 2173 __ b(RuntimeAddress(oop_copy_entry)); 2174 2175 __ BIND(L_checkcast_copy); 2176 // live at this point: scratch_src_klass, scratch_length, r18 (dst_klass) 2177 { 2178 // Before looking at dst.length, make sure dst is also an objArray. 2179 __ ldrw(rscratch1, Address(r18, lh_offset)); 2180 __ movw(rscratch2, objArray_lh); 2181 __ eorw(rscratch1, rscratch1, rscratch2); 2182 __ cbnzw(rscratch1, L_failed); 2183 2184 // It is safe to examine both src.length and dst.length. 2185 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2186 r18, L_failed); 2187 2188 __ load_klass(dst_klass, dst); // reload 2189 2190 // Marshal the base address arguments now, freeing registers. 2191 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2192 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2193 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2194 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2195 __ movw(count, length); // length (reloaded) 2196 Register sco_temp = c_rarg3; // this register is free now 2197 assert_different_registers(from, to, count, sco_temp, 2198 dst_klass, scratch_src_klass); 2199 // assert_clean_int(count, sco_temp); 2200 2201 // Generate the type check. 2202 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2203 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2204 2205 // Smashes rscratch1, rscratch2 2206 generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy); 2207 2208 // Fetch destination element klass from the ObjArrayKlass header. 2209 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2210 __ ldr(dst_klass, Address(dst_klass, ek_offset)); 2211 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2212 2213 // the checkcast_copy loop needs two extra arguments: 2214 assert(c_rarg3 == sco_temp, "#3 already in place"); 2215 // Set up arguments for checkcast_copy_entry. 2216 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass 2217 __ b(RuntimeAddress(checkcast_copy_entry)); 2218 } 2219 2220 __ BIND(L_failed); 2221 __ mov(r0, -1); 2222 __ leave(); // required for proper stackwalking of RuntimeStub frame 2223 __ ret(lr); 2224 2225 return start; 2226 } 2227 2228 // 2229 // Generate stub for array fill. If "aligned" is true, the 2230 // "to" address is assumed to be heapword aligned. 
2231 // 2232 // Arguments for generated stub: 2233 // to: c_rarg0 2234 // value: c_rarg1 2235 // count: c_rarg2 treated as signed 2236 // 2237 address generate_fill(BasicType t, bool aligned, const char *name) { 2238 __ align(CodeEntryAlignment); 2239 StubCodeMark mark(this, "StubRoutines", name); 2240 address start = __ pc(); 2241 2242 BLOCK_COMMENT("Entry:"); 2243 2244 const Register to = c_rarg0; // source array address 2245 const Register value = c_rarg1; // value 2246 const Register count = c_rarg2; // elements count 2247 2248 const Register bz_base = r10; // base for block_zero routine 2249 const Register cnt_words = r11; // temp register 2250 2251 __ enter(); 2252 2253 Label L_fill_elements, L_exit1; 2254 2255 int shift = -1; 2256 switch (t) { 2257 case T_BYTE: 2258 shift = 0; 2259 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2260 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2261 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2262 __ br(Assembler::LO, L_fill_elements); 2263 break; 2264 case T_SHORT: 2265 shift = 1; 2266 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2267 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2268 __ br(Assembler::LO, L_fill_elements); 2269 break; 2270 case T_INT: 2271 shift = 2; 2272 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2273 __ br(Assembler::LO, L_fill_elements); 2274 break; 2275 default: ShouldNotReachHere(); 2276 } 2277 2278 // Align source address at 8 bytes address boundary. 2279 Label L_skip_align1, L_skip_align2, L_skip_align4; 2280 if (!aligned) { 2281 switch (t) { 2282 case T_BYTE: 2283 // One byte misalignment happens only for byte arrays. 2284 __ tbz(to, 0, L_skip_align1); 2285 __ strb(value, Address(__ post(to, 1))); 2286 __ subw(count, count, 1); 2287 __ bind(L_skip_align1); 2288 // Fallthrough 2289 case T_SHORT: 2290 // Two bytes misalignment happens only for byte and short (char) arrays. 2291 __ tbz(to, 1, L_skip_align2); 2292 __ strh(value, Address(__ post(to, 2))); 2293 __ subw(count, count, 2 >> shift); 2294 __ bind(L_skip_align2); 2295 // Fallthrough 2296 case T_INT: 2297 // Align to 8 bytes, we know we are 4 byte aligned to start. 2298 __ tbz(to, 2, L_skip_align4); 2299 __ strw(value, Address(__ post(to, 4))); 2300 __ subw(count, count, 4 >> shift); 2301 __ bind(L_skip_align4); 2302 break; 2303 default: ShouldNotReachHere(); 2304 } 2305 } 2306 2307 // 2308 // Fill large chunks 2309 // 2310 __ lsrw(cnt_words, count, 3 - shift); // number of words 2311 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2312 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2313 if (UseBlockZeroing) { 2314 Label non_block_zeroing, rest; 2315 // If the fill value is zero we can use the fast zero_words(). 2316 __ cbnz(value, non_block_zeroing); 2317 __ mov(bz_base, to); 2318 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2319 __ zero_words(bz_base, cnt_words); 2320 __ b(rest); 2321 __ bind(non_block_zeroing); 2322 __ fill_words(to, cnt_words, value); 2323 __ bind(rest); 2324 } else { 2325 __ fill_words(to, cnt_words, value); 2326 } 2327 2328 // Remaining count is less than 8 bytes. Fill it by a single store. 2329 // Note that the total length is no less than 8 bytes. 
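    // Sketch of the tail trick used below (added illustration): the remaining
    // count << shift bytes are fewer than 8, and at least 8 bytes were
    // requested in total, so one unaligned 64-bit store at the end covers the
    // tail, harmlessly rewriting up to 7 already-filled bytes:
    //
    //   char* end = to + (count << shift);
    //   *(uint64_t*)(end - 8) = value64;   // value replicated to 64 bits earlier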
2330 if (t == T_BYTE || t == T_SHORT) { 2331 Label L_exit1; 2332 __ cbzw(count, L_exit1); 2333 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2334 __ str(value, Address(to, -8)); // overwrite some elements 2335 __ bind(L_exit1); 2336 __ leave(); 2337 __ ret(lr); 2338 } 2339 2340 // Handle copies less than 8 bytes. 2341 Label L_fill_2, L_fill_4, L_exit2; 2342 __ bind(L_fill_elements); 2343 switch (t) { 2344 case T_BYTE: 2345 __ tbz(count, 0, L_fill_2); 2346 __ strb(value, Address(__ post(to, 1))); 2347 __ bind(L_fill_2); 2348 __ tbz(count, 1, L_fill_4); 2349 __ strh(value, Address(__ post(to, 2))); 2350 __ bind(L_fill_4); 2351 __ tbz(count, 2, L_exit2); 2352 __ strw(value, Address(to)); 2353 break; 2354 case T_SHORT: 2355 __ tbz(count, 0, L_fill_4); 2356 __ strh(value, Address(__ post(to, 2))); 2357 __ bind(L_fill_4); 2358 __ tbz(count, 1, L_exit2); 2359 __ strw(value, Address(to)); 2360 break; 2361 case T_INT: 2362 __ cbzw(count, L_exit2); 2363 __ strw(value, Address(to)); 2364 break; 2365 default: ShouldNotReachHere(); 2366 } 2367 __ bind(L_exit2); 2368 __ leave(); 2369 __ ret(lr); 2370 return start; 2371 } 2372 2373 void generate_arraycopy_stubs() { 2374 address entry; 2375 address entry_jbyte_arraycopy; 2376 address entry_jshort_arraycopy; 2377 address entry_jint_arraycopy; 2378 address entry_oop_arraycopy; 2379 address entry_jlong_arraycopy; 2380 address entry_checkcast_arraycopy; 2381 2382 generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards); 2383 generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards); 2384 2385 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2386 2387 //*** jbyte 2388 // Always need aligned and unaligned versions 2389 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2390 "jbyte_disjoint_arraycopy"); 2391 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2392 &entry_jbyte_arraycopy, 2393 "jbyte_arraycopy"); 2394 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2395 "arrayof_jbyte_disjoint_arraycopy"); 2396 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL, 2397 "arrayof_jbyte_arraycopy"); 2398 2399 //*** jshort 2400 // Always need aligned and unaligned versions 2401 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2402 "jshort_disjoint_arraycopy"); 2403 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2404 &entry_jshort_arraycopy, 2405 "jshort_arraycopy"); 2406 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, 2407 "arrayof_jshort_disjoint_arraycopy"); 2408 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL, 2409 "arrayof_jshort_arraycopy"); 2410 2411 //*** jint 2412 // Aligned versions 2413 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2414 "arrayof_jint_disjoint_arraycopy"); 2415 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2416 "arrayof_jint_arraycopy"); 2417 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 
2418 // entry_jint_arraycopy always points to the unaligned version 2419 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2420 "jint_disjoint_arraycopy"); 2421 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2422 &entry_jint_arraycopy, 2423 "jint_arraycopy"); 2424 2425 //*** jlong 2426 // It is always aligned 2427 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2428 "arrayof_jlong_disjoint_arraycopy"); 2429 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2430 "arrayof_jlong_arraycopy"); 2431 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2432 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2433 2434 //*** oops 2435 { 2436 // With compressed oops we need unaligned versions; notice that 2437 // we overwrite entry_oop_arraycopy. 2438 bool aligned = !UseCompressedOops; 2439 2440 StubRoutines::_arrayof_oop_disjoint_arraycopy 2441 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2442 /*dest_uninitialized*/false); 2443 StubRoutines::_arrayof_oop_arraycopy 2444 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2445 /*dest_uninitialized*/false); 2446 // Aligned versions without pre-barriers 2447 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2448 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2449 /*dest_uninitialized*/true); 2450 StubRoutines::_arrayof_oop_arraycopy_uninit 2451 = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit", 2452 /*dest_uninitialized*/true); 2453 } 2454 2455 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2456 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2457 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2458 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2459 2460 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2461 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, 2462 /*dest_uninitialized*/true); 2463 2464 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2465 entry_jbyte_arraycopy, 2466 entry_jshort_arraycopy, 2467 entry_jint_arraycopy, 2468 entry_jlong_arraycopy); 2469 2470 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2471 entry_jbyte_arraycopy, 2472 entry_jshort_arraycopy, 2473 entry_jint_arraycopy, 2474 entry_oop_arraycopy, 2475 entry_jlong_arraycopy, 2476 entry_checkcast_arraycopy); 2477 2478 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 2479 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2480 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2481 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2482 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 2483 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2484 } 2485 2486 void generate_math_stubs() { Unimplemented(); } 2487 2488 // Arguments: 2489 // 2490 // Inputs: 2491 // c_rarg0 - source byte array address 2492 // c_rarg1 - destination 
byte array address 2493 // c_rarg2 - K (key) in little endian int array 2494 // 2495 address generate_aescrypt_encryptBlock() { 2496 __ align(CodeEntryAlignment); 2497 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 2498 2499 Label L_doLast; 2500 2501 const Register from = c_rarg0; // source array address 2502 const Register to = c_rarg1; // destination array address 2503 const Register key = c_rarg2; // key array address 2504 const Register keylen = rscratch1; 2505 2506 address start = __ pc(); 2507 __ enter(); 2508 2509 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2510 2511 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2512 2513 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2514 __ rev32(v1, __ T16B, v1); 2515 __ rev32(v2, __ T16B, v2); 2516 __ rev32(v3, __ T16B, v3); 2517 __ rev32(v4, __ T16B, v4); 2518 __ aese(v0, v1); 2519 __ aesmc(v0, v0); 2520 __ aese(v0, v2); 2521 __ aesmc(v0, v0); 2522 __ aese(v0, v3); 2523 __ aesmc(v0, v0); 2524 __ aese(v0, v4); 2525 __ aesmc(v0, v0); 2526 2527 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2528 __ rev32(v1, __ T16B, v1); 2529 __ rev32(v2, __ T16B, v2); 2530 __ rev32(v3, __ T16B, v3); 2531 __ rev32(v4, __ T16B, v4); 2532 __ aese(v0, v1); 2533 __ aesmc(v0, v0); 2534 __ aese(v0, v2); 2535 __ aesmc(v0, v0); 2536 __ aese(v0, v3); 2537 __ aesmc(v0, v0); 2538 __ aese(v0, v4); 2539 __ aesmc(v0, v0); 2540 2541 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2542 __ rev32(v1, __ T16B, v1); 2543 __ rev32(v2, __ T16B, v2); 2544 2545 __ cmpw(keylen, 44); 2546 __ br(Assembler::EQ, L_doLast); 2547 2548 __ aese(v0, v1); 2549 __ aesmc(v0, v0); 2550 __ aese(v0, v2); 2551 __ aesmc(v0, v0); 2552 2553 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2554 __ rev32(v1, __ T16B, v1); 2555 __ rev32(v2, __ T16B, v2); 2556 2557 __ cmpw(keylen, 52); 2558 __ br(Assembler::EQ, L_doLast); 2559 2560 __ aese(v0, v1); 2561 __ aesmc(v0, v0); 2562 __ aese(v0, v2); 2563 __ aesmc(v0, v0); 2564 2565 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2566 __ rev32(v1, __ T16B, v1); 2567 __ rev32(v2, __ T16B, v2); 2568 2569 __ BIND(L_doLast); 2570 2571 __ aese(v0, v1); 2572 __ aesmc(v0, v0); 2573 __ aese(v0, v2); 2574 2575 __ ld1(v1, __ T16B, key); 2576 __ rev32(v1, __ T16B, v1); 2577 __ eor(v0, __ T16B, v0, v1); 2578 2579 __ st1(v0, __ T16B, to); 2580 2581 __ mov(r0, 0); 2582 2583 __ leave(); 2584 __ ret(lr); 2585 2586 return start; 2587 } 2588 2589 // Arguments: 2590 // 2591 // Inputs: 2592 // c_rarg0 - source byte array address 2593 // c_rarg1 - destination byte array address 2594 // c_rarg2 - K (key) in little endian int array 2595 // 2596 address generate_aescrypt_decryptBlock() { 2597 assert(UseAES, "need AES instructions and misaligned SSE support"); 2598 __ align(CodeEntryAlignment); 2599 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 2600 Label L_doLast; 2601 2602 const Register from = c_rarg0; // source array address 2603 const Register to = c_rarg1; // destination array address 2604 const Register key = c_rarg2; // key array address 2605 const Register keylen = rscratch1; 2606 2607 address start = __ pc(); 2608 __ enter(); // required for proper stackwalking of RuntimeStub frame 2609 2610 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2611 2612 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2613 2614 __ ld1(v5, __ T16B, __ post(key, 16)); 2615 __ rev32(v5, __ T16B, v5); 2616 2617 __ ld1(v1, v2, v3, v4, 
__ T16B, __ post(key, 64)); 2618 __ rev32(v1, __ T16B, v1); 2619 __ rev32(v2, __ T16B, v2); 2620 __ rev32(v3, __ T16B, v3); 2621 __ rev32(v4, __ T16B, v4); 2622 __ aesd(v0, v1); 2623 __ aesimc(v0, v0); 2624 __ aesd(v0, v2); 2625 __ aesimc(v0, v0); 2626 __ aesd(v0, v3); 2627 __ aesimc(v0, v0); 2628 __ aesd(v0, v4); 2629 __ aesimc(v0, v0); 2630 2631 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2632 __ rev32(v1, __ T16B, v1); 2633 __ rev32(v2, __ T16B, v2); 2634 __ rev32(v3, __ T16B, v3); 2635 __ rev32(v4, __ T16B, v4); 2636 __ aesd(v0, v1); 2637 __ aesimc(v0, v0); 2638 __ aesd(v0, v2); 2639 __ aesimc(v0, v0); 2640 __ aesd(v0, v3); 2641 __ aesimc(v0, v0); 2642 __ aesd(v0, v4); 2643 __ aesimc(v0, v0); 2644 2645 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2646 __ rev32(v1, __ T16B, v1); 2647 __ rev32(v2, __ T16B, v2); 2648 2649 __ cmpw(keylen, 44); 2650 __ br(Assembler::EQ, L_doLast); 2651 2652 __ aesd(v0, v1); 2653 __ aesimc(v0, v0); 2654 __ aesd(v0, v2); 2655 __ aesimc(v0, v0); 2656 2657 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2658 __ rev32(v1, __ T16B, v1); 2659 __ rev32(v2, __ T16B, v2); 2660 2661 __ cmpw(keylen, 52); 2662 __ br(Assembler::EQ, L_doLast); 2663 2664 __ aesd(v0, v1); 2665 __ aesimc(v0, v0); 2666 __ aesd(v0, v2); 2667 __ aesimc(v0, v0); 2668 2669 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2670 __ rev32(v1, __ T16B, v1); 2671 __ rev32(v2, __ T16B, v2); 2672 2673 __ BIND(L_doLast); 2674 2675 __ aesd(v0, v1); 2676 __ aesimc(v0, v0); 2677 __ aesd(v0, v2); 2678 2679 __ eor(v0, __ T16B, v0, v5); 2680 2681 __ st1(v0, __ T16B, to); 2682 2683 __ mov(r0, 0); 2684 2685 __ leave(); 2686 __ ret(lr); 2687 2688 return start; 2689 } 2690 2691 // Arguments: 2692 // 2693 // Inputs: 2694 // c_rarg0 - source byte array address 2695 // c_rarg1 - destination byte array address 2696 // c_rarg2 - K (key) in little endian int array 2697 // c_rarg3 - r vector byte array address 2698 // c_rarg4 - input length 2699 // 2700 // Output: 2701 // x0 - input length 2702 // 2703 address generate_cipherBlockChaining_encryptAESCrypt() { 2704 assert(UseAES, "need AES instructions and misaligned SSE support"); 2705 __ align(CodeEntryAlignment); 2706 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 2707 2708 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2709 2710 const Register from = c_rarg0; // source array address 2711 const Register to = c_rarg1; // destination array address 2712 const Register key = c_rarg2; // key array address 2713 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2714 // and left with the results of the last encryption block 2715 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2716 const Register keylen = rscratch1; 2717 2718 address start = __ pc(); 2719 2720 __ enter(); 2721 2722 __ movw(rscratch2, len_reg); 2723 2724 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2725 2726 __ ld1(v0, __ T16B, rvec); 2727 2728 __ cmpw(keylen, 52); 2729 __ br(Assembler::CC, L_loadkeys_44); 2730 __ br(Assembler::EQ, L_loadkeys_52); 2731 2732 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2733 __ rev32(v17, __ T16B, v17); 2734 __ rev32(v18, __ T16B, v18); 2735 __ BIND(L_loadkeys_52); 2736 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2737 __ rev32(v19, __ T16B, v19); 2738 __ rev32(v20, __ T16B, v20); 2739 __ BIND(L_loadkeys_44); 2740 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2741 __ rev32(v21, __ 
T16B, v21); 2742 __ rev32(v22, __ T16B, v22); 2743 __ rev32(v23, __ T16B, v23); 2744 __ rev32(v24, __ T16B, v24); 2745 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2746 __ rev32(v25, __ T16B, v25); 2747 __ rev32(v26, __ T16B, v26); 2748 __ rev32(v27, __ T16B, v27); 2749 __ rev32(v28, __ T16B, v28); 2750 __ ld1(v29, v30, v31, __ T16B, key); 2751 __ rev32(v29, __ T16B, v29); 2752 __ rev32(v30, __ T16B, v30); 2753 __ rev32(v31, __ T16B, v31); 2754 2755 __ BIND(L_aes_loop); 2756 __ ld1(v1, __ T16B, __ post(from, 16)); 2757 __ eor(v0, __ T16B, v0, v1); 2758 2759 __ br(Assembler::CC, L_rounds_44); 2760 __ br(Assembler::EQ, L_rounds_52); 2761 2762 __ aese(v0, v17); __ aesmc(v0, v0); 2763 __ aese(v0, v18); __ aesmc(v0, v0); 2764 __ BIND(L_rounds_52); 2765 __ aese(v0, v19); __ aesmc(v0, v0); 2766 __ aese(v0, v20); __ aesmc(v0, v0); 2767 __ BIND(L_rounds_44); 2768 __ aese(v0, v21); __ aesmc(v0, v0); 2769 __ aese(v0, v22); __ aesmc(v0, v0); 2770 __ aese(v0, v23); __ aesmc(v0, v0); 2771 __ aese(v0, v24); __ aesmc(v0, v0); 2772 __ aese(v0, v25); __ aesmc(v0, v0); 2773 __ aese(v0, v26); __ aesmc(v0, v0); 2774 __ aese(v0, v27); __ aesmc(v0, v0); 2775 __ aese(v0, v28); __ aesmc(v0, v0); 2776 __ aese(v0, v29); __ aesmc(v0, v0); 2777 __ aese(v0, v30); 2778 __ eor(v0, __ T16B, v0, v31); 2779 2780 __ st1(v0, __ T16B, __ post(to, 16)); 2781 2782 __ subw(len_reg, len_reg, 16); 2783 __ cbnzw(len_reg, L_aes_loop); 2784 2785 __ st1(v0, __ T16B, rvec); 2786 2787 __ mov(r0, rscratch2); 2788 2789 __ leave(); 2790 __ ret(lr); 2791 2792 return start; 2793 } 2794 2795 // Arguments: 2796 // 2797 // Inputs: 2798 // c_rarg0 - source byte array address 2799 // c_rarg1 - destination byte array address 2800 // c_rarg2 - K (key) in little endian int array 2801 // c_rarg3 - r vector byte array address 2802 // c_rarg4 - input length 2803 // 2804 // Output: 2805 // r0 - input length 2806 // 2807 address generate_cipherBlockChaining_decryptAESCrypt() { 2808 assert(UseAES, "need AES instructions and misaligned SSE support"); 2809 __ align(CodeEntryAlignment); 2810 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 2811 2812 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2813 2814 const Register from = c_rarg0; // source array address 2815 const Register to = c_rarg1; // destination array address 2816 const Register key = c_rarg2; // key array address 2817 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2818 // and left with the results of the last encryption block 2819 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2820 const Register keylen = rscratch1; 2821 2822 address start = __ pc(); 2823 2824 __ enter(); 2825 2826 __ movw(rscratch2, len_reg); 2827 2828 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2829 2830 __ ld1(v2, __ T16B, rvec); 2831 2832 __ ld1(v31, __ T16B, __ post(key, 16)); 2833 __ rev32(v31, __ T16B, v31); 2834 2835 __ cmpw(keylen, 52); 2836 __ br(Assembler::CC, L_loadkeys_44); 2837 __ br(Assembler::EQ, L_loadkeys_52); 2838 2839 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2840 __ rev32(v17, __ T16B, v17); 2841 __ rev32(v18, __ T16B, v18); 2842 __ BIND(L_loadkeys_52); 2843 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2844 __ rev32(v19, __ T16B, v19); 2845 __ rev32(v20, __ T16B, v20); 2846 __ BIND(L_loadkeys_44); 2847 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2848 __ rev32(v21, __ T16B, v21); 2849 
__ rev32(v22, __ T16B, v22); 2850 __ rev32(v23, __ T16B, v23); 2851 __ rev32(v24, __ T16B, v24); 2852 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2853 __ rev32(v25, __ T16B, v25); 2854 __ rev32(v26, __ T16B, v26); 2855 __ rev32(v27, __ T16B, v27); 2856 __ rev32(v28, __ T16B, v28); 2857 __ ld1(v29, v30, __ T16B, key); 2858 __ rev32(v29, __ T16B, v29); 2859 __ rev32(v30, __ T16B, v30); 2860 2861 __ BIND(L_aes_loop); 2862 __ ld1(v0, __ T16B, __ post(from, 16)); 2863 __ orr(v1, __ T16B, v0, v0); 2864 2865 __ br(Assembler::CC, L_rounds_44); 2866 __ br(Assembler::EQ, L_rounds_52); 2867 2868 __ aesd(v0, v17); __ aesimc(v0, v0); 2869 __ aesd(v0, v18); __ aesimc(v0, v0); 2870 __ BIND(L_rounds_52); 2871 __ aesd(v0, v19); __ aesimc(v0, v0); 2872 __ aesd(v0, v20); __ aesimc(v0, v0); 2873 __ BIND(L_rounds_44); 2874 __ aesd(v0, v21); __ aesimc(v0, v0); 2875 __ aesd(v0, v22); __ aesimc(v0, v0); 2876 __ aesd(v0, v23); __ aesimc(v0, v0); 2877 __ aesd(v0, v24); __ aesimc(v0, v0); 2878 __ aesd(v0, v25); __ aesimc(v0, v0); 2879 __ aesd(v0, v26); __ aesimc(v0, v0); 2880 __ aesd(v0, v27); __ aesimc(v0, v0); 2881 __ aesd(v0, v28); __ aesimc(v0, v0); 2882 __ aesd(v0, v29); __ aesimc(v0, v0); 2883 __ aesd(v0, v30); 2884 __ eor(v0, __ T16B, v0, v31); 2885 __ eor(v0, __ T16B, v0, v2); 2886 2887 __ st1(v0, __ T16B, __ post(to, 16)); 2888 __ orr(v2, __ T16B, v1, v1); 2889 2890 __ subw(len_reg, len_reg, 16); 2891 __ cbnzw(len_reg, L_aes_loop); 2892 2893 __ st1(v2, __ T16B, rvec); 2894 2895 __ mov(r0, rscratch2); 2896 2897 __ leave(); 2898 __ ret(lr); 2899 2900 return start; 2901 } 2902 2903 // Arguments: 2904 // 2905 // Inputs: 2906 // c_rarg0 - byte[] source+offset 2907 // c_rarg1 - int[] SHA.state 2908 // c_rarg2 - int offset 2909 // c_rarg3 - int limit 2910 // 2911 address generate_sha1_implCompress(bool multi_block, const char *name) { 2912 __ align(CodeEntryAlignment); 2913 StubCodeMark mark(this, "StubRoutines", name); 2914 address start = __ pc(); 2915 2916 Register buf = c_rarg0; 2917 Register state = c_rarg1; 2918 Register ofs = c_rarg2; 2919 Register limit = c_rarg3; 2920 2921 Label keys; 2922 Label sha1_loop; 2923 2924 // load the keys into v0..v3 2925 __ adr(rscratch1, keys); 2926 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 2927 // load 5 words state into v6, v7 2928 __ ldrq(v6, Address(state, 0)); 2929 __ ldrs(v7, Address(state, 16)); 2930 2931 2932 __ BIND(sha1_loop); 2933 // load 64 bytes of data into v16..v19 2934 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 2935 __ rev32(v16, __ T16B, v16); 2936 __ rev32(v17, __ T16B, v17); 2937 __ rev32(v18, __ T16B, v18); 2938 __ rev32(v19, __ T16B, v19); 2939 2940 // do the sha1 2941 __ addv(v4, __ T4S, v16, v0); 2942 __ orr(v20, __ T16B, v6, v6); 2943 2944 FloatRegister d0 = v16; 2945 FloatRegister d1 = v17; 2946 FloatRegister d2 = v18; 2947 FloatRegister d3 = v19; 2948 2949 for (int round = 0; round < 20; round++) { 2950 FloatRegister tmp1 = (round & 1) ? v4 : v5; 2951 FloatRegister tmp2 = (round & 1) ? v21 : v22; 2952 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 2953 FloatRegister tmp4 = (round & 1) ? v5 : v4; 2954 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? 
v2 : v3)); 2955 2956 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 2957 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 2958 __ sha1h(tmp2, __ T4S, v20); 2959 if (round < 5) 2960 __ sha1c(v20, __ T4S, tmp3, tmp4); 2961 else if (round < 10 || round >= 15) 2962 __ sha1p(v20, __ T4S, tmp3, tmp4); 2963 else 2964 __ sha1m(v20, __ T4S, tmp3, tmp4); 2965 if (round < 16) __ sha1su1(d0, __ T4S, d3); 2966 2967 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 2968 } 2969 2970 __ addv(v7, __ T2S, v7, v21); 2971 __ addv(v6, __ T4S, v6, v20); 2972 2973 if (multi_block) { 2974 __ add(ofs, ofs, 64); 2975 __ cmp(ofs, limit); 2976 __ br(Assembler::LE, sha1_loop); 2977 __ mov(c_rarg0, ofs); // return ofs 2978 } 2979 2980 __ strq(v6, Address(state, 0)); 2981 __ strs(v7, Address(state, 16)); 2982 2983 __ ret(lr); 2984 2985 __ bind(keys); 2986 __ emit_int32(0x5a827999); 2987 __ emit_int32(0x6ed9eba1); 2988 __ emit_int32(0x8f1bbcdc); 2989 __ emit_int32(0xca62c1d6); 2990 2991 return start; 2992 } 2993 2994 2995 // Arguments: 2996 // 2997 // Inputs: 2998 // c_rarg0 - byte[] source+offset 2999 // c_rarg1 - int[] SHA.state 3000 // c_rarg2 - int offset 3001 // c_rarg3 - int limit 3002 // 3003 address generate_sha256_implCompress(bool multi_block, const char *name) { 3004 static const uint32_t round_consts[64] = { 3005 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3006 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3007 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3008 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3009 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3010 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3011 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3012 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3013 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3014 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3015 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3016 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3017 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3018 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3019 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3020 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3021 }; 3022 __ align(CodeEntryAlignment); 3023 StubCodeMark mark(this, "StubRoutines", name); 3024 address start = __ pc(); 3025 3026 Register buf = c_rarg0; 3027 Register state = c_rarg1; 3028 Register ofs = c_rarg2; 3029 Register limit = c_rarg3; 3030 3031 Label sha1_loop; 3032 3033 __ stpd(v8, v9, __ pre(sp, -32)); 3034 __ stpd(v10, v11, Address(sp, 16)); 3035 3036 // dga == v0 3037 // dgb == v1 3038 // dg0 == v2 3039 // dg1 == v3 3040 // dg2 == v4 3041 // t0 == v6 3042 // t1 == v7 3043 3044 // load 16 keys to v16..v31 3045 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3046 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3047 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3048 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3049 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3050 3051 // load 8 words (256 bits) state 3052 __ ldpq(v0, v1, state); 3053 3054 __ BIND(sha1_loop); 3055 // load 64 bytes of data into v8..v11 3056 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 3057 __ rev32(v8, __ T16B, v8); 3058 __ rev32(v9, __ T16B, v9); 3059 __ rev32(v10, __ T16B, v10); 3060 __ rev32(v11, __ T16B, v11); 3061 3062 __ addv(v6, __ T4S, v8, v16); 3063 __ orr(v2, __ T16B, v0, v0); 3064 __ orr(v3, __ T16B, v1, v1); 3065 3066 FloatRegister d0 = v8; 3067 FloatRegister d1 = v9; 3068 FloatRegister d2 = v10; 3069 FloatRegister d3 = v11; 3070 3071 3072 for (int round = 0; round < 16; round++) { 3073 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3074 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3075 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3076 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3077 3078 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3079 __ orr(v4, __ T16B, v2, v2); 3080 if (round < 15) 3081 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3082 __ sha256h(v2, __ T4S, v3, tmp2); 3083 __ sha256h2(v3, __ T4S, v4, tmp2); 3084 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3085 3086 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3087 } 3088 3089 __ addv(v0, __ T4S, v0, v2); 3090 __ addv(v1, __ T4S, v1, v3); 3091 3092 if (multi_block) { 3093 __ add(ofs, ofs, 64); 3094 __ cmp(ofs, limit); 3095 __ br(Assembler::LE, sha1_loop); 3096 __ mov(c_rarg0, ofs); // return ofs 3097 } 3098 3099 __ ldpd(v10, v11, Address(sp, 16)); 3100 __ ldpd(v8, v9, __ post(sp, 32)); 3101 3102 __ stpq(v0, v1, state); 3103 3104 __ ret(lr); 3105 3106 return start; 3107 } 3108 3109 #ifndef BUILTIN_SIM 3110 // Safefetch stubs. 3111 void generate_safefetch(const char* name, int size, address* entry, 3112 address* fault_pc, address* continuation_pc) { 3113 // safefetch signatures: 3114 // int SafeFetch32(int* adr, int errValue); 3115 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue); 3116 // 3117 // arguments: 3118 // c_rarg0 = adr 3119 // c_rarg1 = errValue 3120 // 3121 // result: 3122 // PPC_RET = *adr or errValue 3123 3124 StubCodeMark mark(this, "StubRoutines", name); 3125 3126 // Entry point, pc or function descriptor. 3127 *entry = __ pc(); 3128 3129 // Load *adr into c_rarg1, may fault. 
3130     *fault_pc = __ pc();
3131     switch (size) {
3132       case 4:
3133         // int32_t
3134         __ ldrw(c_rarg1, Address(c_rarg0, 0));
3135         break;
3136       case 8:
3137         // int64_t
3138         __ ldr(c_rarg1, Address(c_rarg0, 0));
3139         break;
3140       default:
3141         ShouldNotReachHere();
3142     }
3143
3144     // return errValue or *adr
3145     *continuation_pc = __ pc();
3146     __ mov(r0, c_rarg1);
3147     __ ret(lr);
3148   }
3149 #endif
3150
3151   /**
3152    *  Arguments:
3153    *
3154    * Inputs:
3155    *   c_rarg0   - int crc
3156    *   c_rarg1   - byte* buf
3157    *   c_rarg2   - int length
3158    *
3159    * Output:
3160    *   r0        - int crc result
3161    */
3162   address generate_updateBytesCRC32() {
3163     assert(UseCRC32Intrinsics, "what are we doing here?");
3164
3165     __ align(CodeEntryAlignment);
3166     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");
3167
3168     address start = __ pc();
3169
3170     const Register crc    = c_rarg0;  // crc
3171     const Register buf    = c_rarg1;  // source java byte array address
3172     const Register len    = c_rarg2;  // length
3173     const Register table0 = c_rarg3;  // crc_table address
3174     const Register table1 = c_rarg4;
3175     const Register table2 = c_rarg5;
3176     const Register table3 = c_rarg6;
3177     const Register tmp3   = c_rarg7;
3178
3179     BLOCK_COMMENT("Entry:");
3180     __ enter(); // required for proper stackwalking of RuntimeStub frame
3181
3182     __ kernel_crc32(crc, buf, len,
3183                     table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3184
3185     __ leave(); // required for proper stackwalking of RuntimeStub frame
3186     __ ret(lr);
3187
3188     return start;
3189   }
3190
3191   /**
3192    *  Arguments:
3193    *
3194    * Inputs:
3195    *   c_rarg0   - int crc
3196    *   c_rarg1   - byte* buf
3197    *   c_rarg2   - int length
3198    *   c_rarg3   - int* table
3199    *
3200    * Output:
3201    *   r0        - int crc result
3202    */
3203   address generate_updateBytesCRC32C() {
3204     assert(UseCRC32CIntrinsics, "what are we doing here?");
3205
3206     __ align(CodeEntryAlignment);
3207     StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");
3208
3209     address start = __ pc();
3210
3211     const Register crc    = c_rarg0;  // crc
3212     const Register buf    = c_rarg1;  // source java byte array address
3213     const Register len    = c_rarg2;  // length
3214     const Register table0 = c_rarg3;  // crc_table address
3215     const Register table1 = c_rarg4;
3216     const Register table2 = c_rarg5;
3217     const Register table3 = c_rarg6;
3218     const Register tmp3   = c_rarg7;
3219
3220     BLOCK_COMMENT("Entry:");
3221     __ enter(); // required for proper stackwalking of RuntimeStub frame
3222
3223     __ kernel_crc32c(crc, buf, len,
3224                      table0, table1, table2, table3, rscratch1, rscratch2, tmp3);
3225
3226     __ leave(); // required for proper stackwalking of RuntimeStub frame
3227     __ ret(lr);
3228
3229     return start;
3230   }
3231
3232   /***
3233    *  Arguments:
3234    *
3235    *  Inputs:
3236    *   c_rarg0   - int   adler
3237    *   c_rarg1   - byte* buff
3238    *   c_rarg2   - int   len
3239    *
3240    * Output:
3241    *   c_rarg0   - int adler result
3242    */
3243   address generate_updateBytesAdler32() {
3244     __ align(CodeEntryAlignment);
3245     StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
3246     address start = __ pc();
3247
3248     Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;
3249
3250     // Aliases
3251     Register adler  = c_rarg0;
3252     Register s1     = c_rarg0;
3253     Register s2     = c_rarg3;
3254     Register buff   = c_rarg1;
3255     Register len    = c_rarg2;
3256     Register nmax   = r4;
3257     Register base   = r5;
3258     Register count  = r6;
3259     Register temp0  = rscratch1;
3260     Register temp1  = rscratch2;
3261     FloatRegister vbytes =
v0; 3262 FloatRegister vs1acc = v1; 3263 FloatRegister vs2acc = v2; 3264 FloatRegister vtable = v3; 3265 3266 // Max number of bytes we can process before having to take the mod 3267 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 3268 unsigned long BASE = 0xfff1; 3269 unsigned long NMAX = 0x15B0; 3270 3271 __ mov(base, BASE); 3272 __ mov(nmax, NMAX); 3273 3274 // Load accumulation coefficients for the upper 16 bits 3275 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table)); 3276 __ ld1(vtable, __ T16B, Address(temp0)); 3277 3278 // s1 is initialized to the lower 16 bits of adler 3279 // s2 is initialized to the upper 16 bits of adler 3280 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 3281 __ uxth(s1, adler); // s1 = (adler & 0xffff) 3282 3283 // The pipelined loop needs at least 16 elements for 1 iteration 3284 // It does check this, but it is more effective to skip to the cleanup loop 3285 __ cmp(len, (u1)16); 3286 __ br(Assembler::HS, L_nmax); 3287 __ cbz(len, L_combine); 3288 3289 __ bind(L_simple_by1_loop); 3290 __ ldrb(temp0, Address(__ post(buff, 1))); 3291 __ add(s1, s1, temp0); 3292 __ add(s2, s2, s1); 3293 __ subs(len, len, 1); 3294 __ br(Assembler::HI, L_simple_by1_loop); 3295 3296 // s1 = s1 % BASE 3297 __ subs(temp0, s1, base); 3298 __ csel(s1, temp0, s1, Assembler::HS); 3299 3300 // s2 = s2 % BASE 3301 __ lsr(temp0, s2, 16); 3302 __ lsl(temp1, temp0, 4); 3303 __ sub(temp1, temp1, temp0); 3304 __ add(s2, temp1, s2, ext::uxth); 3305 3306 __ subs(temp0, s2, base); 3307 __ csel(s2, temp0, s2, Assembler::HS); 3308 3309 __ b(L_combine); 3310 3311 __ bind(L_nmax); 3312 __ subs(len, len, nmax); 3313 __ sub(count, nmax, 16); 3314 __ br(Assembler::LO, L_by16); 3315 3316 __ bind(L_nmax_loop); 3317 3318 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 3319 vbytes, vs1acc, vs2acc, vtable); 3320 3321 __ subs(count, count, 16); 3322 __ br(Assembler::HS, L_nmax_loop); 3323 3324 // s1 = s1 % BASE 3325 __ lsr(temp0, s1, 16); 3326 __ lsl(temp1, temp0, 4); 3327 __ sub(temp1, temp1, temp0); 3328 __ add(temp1, temp1, s1, ext::uxth); 3329 3330 __ lsr(temp0, temp1, 16); 3331 __ lsl(s1, temp0, 4); 3332 __ sub(s1, s1, temp0); 3333 __ add(s1, s1, temp1, ext:: uxth); 3334 3335 __ subs(temp0, s1, base); 3336 __ csel(s1, temp0, s1, Assembler::HS); 3337 3338 // s2 = s2 % BASE 3339 __ lsr(temp0, s2, 16); 3340 __ lsl(temp1, temp0, 4); 3341 __ sub(temp1, temp1, temp0); 3342 __ add(temp1, temp1, s2, ext::uxth); 3343 3344 __ lsr(temp0, temp1, 16); 3345 __ lsl(s2, temp0, 4); 3346 __ sub(s2, s2, temp0); 3347 __ add(s2, s2, temp1, ext:: uxth); 3348 3349 __ subs(temp0, s2, base); 3350 __ csel(s2, temp0, s2, Assembler::HS); 3351 3352 __ subs(len, len, nmax); 3353 __ sub(count, nmax, 16); 3354 __ br(Assembler::HS, L_nmax_loop); 3355 3356 __ bind(L_by16); 3357 __ adds(len, len, count); 3358 __ br(Assembler::LO, L_by1); 3359 3360 __ bind(L_by16_loop); 3361 3362 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 3363 vbytes, vs1acc, vs2acc, vtable); 3364 3365 __ subs(len, len, 16); 3366 __ br(Assembler::HS, L_by16_loop); 3367 3368 __ bind(L_by1); 3369 __ adds(len, len, 15); 3370 __ br(Assembler::LO, L_do_mod); 3371 3372 __ bind(L_by1_loop); 3373 __ ldrb(temp0, Address(__ post(buff, 1))); 3374 __ add(s1, temp0, s1); 3375 __ add(s2, s2, s1); 3376 __ subs(len, len, 1); 3377 __ br(Assembler::HS, L_by1_loop); 3378 3379 __ bind(L_do_mod); 3380 // s1 = s1 % BASE 3381 __ lsr(temp0, s1, 16); 3382 __ lsl(temp1, temp0, 4); 
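    // Added note on the folding done here (BASE == 65521, and
    // 2^16 mod BASE == 15), applied twice before one conditional subtract:
    //
    //   x' = (x >> 16) * 15 + (x & 0xffff);   // x' == x (mod BASE), much smaller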
3383 __ sub(temp1, temp1, temp0); 3384 __ add(temp1, temp1, s1, ext::uxth); 3385 3386 __ lsr(temp0, temp1, 16); 3387 __ lsl(s1, temp0, 4); 3388 __ sub(s1, s1, temp0); 3389 __ add(s1, s1, temp1, ext:: uxth); 3390 3391 __ subs(temp0, s1, base); 3392 __ csel(s1, temp0, s1, Assembler::HS); 3393 3394 // s2 = s2 % BASE 3395 __ lsr(temp0, s2, 16); 3396 __ lsl(temp1, temp0, 4); 3397 __ sub(temp1, temp1, temp0); 3398 __ add(temp1, temp1, s2, ext::uxth); 3399 3400 __ lsr(temp0, temp1, 16); 3401 __ lsl(s2, temp0, 4); 3402 __ sub(s2, s2, temp0); 3403 __ add(s2, s2, temp1, ext:: uxth); 3404 3405 __ subs(temp0, s2, base); 3406 __ csel(s2, temp0, s2, Assembler::HS); 3407 3408 // Combine lower bits and higher bits 3409 __ bind(L_combine); 3410 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) 3411 3412 __ ret(lr); 3413 3414 return start; 3415 } 3416 3417 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff, 3418 Register temp0, Register temp1, FloatRegister vbytes, 3419 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) { 3420 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes. 3421 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration. 3422 // In non-vectorized code, we update s1 and s2 as: 3423 // s1 <- s1 + b1 3424 // s2 <- s2 + s1 3425 // s1 <- s1 + b2 3426 // s2 <- s2 + b1 3427 // ... 3428 // s1 <- s1 + b16 3429 // s2 <- s2 + s1 3430 // Putting above assignments together, we have: 3431 // s1_new = s1 + b1 + b2 + ... + b16 3432 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16) 3433 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1) 3434 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1) 3435 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16))); 3436 3437 // s2 = s2 + s1 * 16 3438 __ add(s2, s2, s1, Assembler::LSL, 4); 3439 3440 // vs1acc = b1 + b2 + b3 + ... + b16 3441 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ... 
+ (b16 * 1)
3442     __ umullv(vs2acc, __ T8B, vtable, vbytes);
3443     __ umlalv(vs2acc, __ T16B, vtable, vbytes);
3444     __ uaddlv(vs1acc, __ T16B, vbytes);
3445     __ uaddlv(vs2acc, __ T8H, vs2acc);
3446
3447     // s1 = s1 + vs1acc, s2 = s2 + vs2acc
3448     __ fmovd(temp0, vs1acc);
3449     __ fmovd(temp1, vs2acc);
3450     __ add(s1, s1, temp0);
3451     __ add(s2, s2, temp1);
3452   }
3453
3454   /**
3455    *  Arguments:
3456    *
3457    *  Input:
3458    *    c_rarg0   - x address
3459    *    c_rarg1   - x length
3460    *    c_rarg2   - y address
3461    *    c_rarg3   - y length
3462    *    c_rarg4   - z address
3463    *    c_rarg5   - z length
3464    */
3465   address generate_multiplyToLen() {
3466     __ align(CodeEntryAlignment);
3467     StubCodeMark mark(this, "StubRoutines", "multiplyToLen");
3468
3469     address start = __ pc();
3470     const Register x     = r0;
3471     const Register xlen  = r1;
3472     const Register y     = r2;
3473     const Register ylen  = r3;
3474     const Register z     = r4;
3475     const Register zlen  = r5;
3476
3477     const Register tmp1  = r10;
3478     const Register tmp2  = r11;
3479     const Register tmp3  = r12;
3480     const Register tmp4  = r13;
3481     const Register tmp5  = r14;
3482     const Register tmp6  = r15;
3483     const Register tmp7  = r16;
3484
3485     BLOCK_COMMENT("Entry:");
3486     __ enter(); // required for proper stackwalking of RuntimeStub frame
3487     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3488     __ leave(); // required for proper stackwalking of RuntimeStub frame
3489     __ ret(lr);
3490
3491     return start;
3492   }
3493
3494   address generate_squareToLen() {
3495     // The squareToLen algorithm for sizes 1..127, described in Java code, works
3496     // faster than multiply_to_len on some CPUs and slower on others, but
3497     // multiply_to_len shows slightly better overall results
3498     __ align(CodeEntryAlignment);
3499     StubCodeMark mark(this, "StubRoutines", "squareToLen");
3500     address start = __ pc();
3501
3502     const Register x     = r0;
3503     const Register xlen  = r1;
3504     const Register z     = r2;
3505     const Register zlen  = r3;
3506     const Register y     = r4; // == x
3507     const Register ylen  = r5; // == xlen
3508
3509     const Register tmp1  = r10;
3510     const Register tmp2  = r11;
3511     const Register tmp3  = r12;
3512     const Register tmp4  = r13;
3513     const Register tmp5  = r14;
3514     const Register tmp6  = r15;
3515     const Register tmp7  = r16;
3516
3517     RegSet spilled_regs = RegSet::of(y, ylen);
3518     BLOCK_COMMENT("Entry:");
3519     __ enter();
3520     __ push(spilled_regs, sp);
3521     __ mov(y, x);
3522     __ mov(ylen, xlen);
3523     __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7);
3524     __ pop(spilled_regs, sp);
3525     __ leave();
3526     __ ret(lr);
3527     return start;
3528   }
3529
3530   address generate_mulAdd() {
3531     __ align(CodeEntryAlignment);
3532     StubCodeMark mark(this, "StubRoutines", "mulAdd");
3533
3534     address start = __ pc();
3535
3536     const Register out     = r0;
3537     const Register in      = r1;
3538     const Register offset  = r2;
3539     const Register len     = r3;
3540     const Register k       = r4;
3541
3542     BLOCK_COMMENT("Entry:");
3543     __ enter();
3544     __ mul_add(out, in, offset, len, k);
3545     __ leave();
3546     __ ret(lr);
3547
3548     return start;
3549   }
3550
3551   void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi,
3552                       FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0,
3553                       FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) {
3554     // Karatsuba multiplication performs a 128*128 -> 256-bit
3555     // multiplication in three 128-bit multiplications and a few
3556     // additions.
3557 // 3558 // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1) 3559 // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0 3560 // 3561 // Inputs: 3562 // 3563 // A0 in a.d[0] (subkey) 3564 // A1 in a.d[1] 3565 // (A1+A0) in a1_xor_a0.d[0] 3566 // 3567 // B0 in b.d[0] (state) 3568 // B1 in b.d[1] 3569 3570 __ ext(tmp1, __ T16B, b, b, 0x08); 3571 __ pmull2(result_hi, __ T1Q, b, a, __ T2D); // A1*B1 3572 __ eor(tmp1, __ T16B, tmp1, b); // (B1+B0) 3573 __ pmull(result_lo, __ T1Q, b, a, __ T1D); // A0*B0 3574 __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0) 3575 3576 __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08); 3577 __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0 3578 __ eor(tmp2, __ T16B, tmp2, tmp4); 3579 __ eor(tmp2, __ T16B, tmp2, tmp3); 3580 3581 // Register pair <result_hi:result_lo> holds the result of carry-less multiplication 3582 __ ins(result_hi, __ D, tmp2, 0, 1); 3583 __ ins(result_lo, __ D, tmp2, 1, 0); 3584 } 3585 3586 void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi, 3587 FloatRegister p, FloatRegister z, FloatRegister t1) { 3588 const FloatRegister t0 = result; 3589 3590 // The GCM field polynomial f is z^128 + p(z), where p = 3591 // z^7+z^2+z+1. 3592 // 3593 // z^128 === -p(z) (mod (z^128 + p(z))) 3594 // 3595 // so, given that the product we're reducing is 3596 // a == lo + hi * z^128 3597 // substituting, 3598 // === lo - hi * p(z) (mod (z^128 + p(z))) 3599 // 3600 // we reduce by multiplying hi by p(z) and subtracting the result 3601 // from (i.e. XORing it with) lo. Because p has no nonzero high 3602 // bits we can do this with two 64-bit multiplications, lo*p and 3603 // hi*p. 3604 3605 __ pmull2(t0, __ T1Q, hi, p, __ T2D); 3606 __ ext(t1, __ T16B, t0, z, 8); 3607 __ eor(hi, __ T16B, hi, t1); 3608 __ ext(t1, __ T16B, z, t0, 8); 3609 __ eor(lo, __ T16B, lo, t1); 3610 __ pmull(t0, __ T1Q, hi, p, __ T1D); 3611 __ eor(result, __ T16B, lo, t0); 3612 } 3613 3614 address generate_has_negatives(address &has_negatives_long) { 3615 const u1 large_loop_size = 64; 3616 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 3617 int dcache_line = VM_Version::dcache_line_size(); 3618 3619 Register ary1 = r1, len = r2, result = r0; 3620 3621 __ align(CodeEntryAlignment); 3622 3623 StubCodeMark mark(this, "StubRoutines", "has_negatives"); 3624 3625 address entry = __ pc(); 3626 3627 __ enter(); 3628 3629 Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE, 3630 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 3631 3632 __ cmp(len, (u1)15); 3633 __ br(Assembler::GT, LEN_OVER_15); 3634 // The only case when execution falls into this code is when pointer is near 3635 // the end of memory page and we have to avoid reading next page 3636 __ add(ary1, ary1, len); 3637 __ subs(len, len, 8); 3638 __ br(Assembler::GT, LEN_OVER_8); 3639 __ ldr(rscratch2, Address(ary1, -8)); 3640 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 
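// At this point len holds length - 8 (<= 0), so rscratch1 == (8 - length) * 8:
// the number of bits in the loaded word that lie before the array start.
// The lsrv below shifts those bytes out so that only the valid bytes are
// tested against the mask.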
3641 __ lsrv(rscratch2, rscratch2, rscratch1);
3642 __ tst(rscratch2, UPPER_BIT_MASK);
3643 __ cset(result, Assembler::NE);
3644 __ leave();
3645 __ ret(lr);
3646 __ bind(LEN_OVER_8);
3647 __ ldp(rscratch1, rscratch2, Address(ary1, -16));
3648 __ sub(len, len, 8); // no data dependency, so the sub can execute while loading
3649 __ tst(rscratch2, UPPER_BIT_MASK);
3650 __ br(Assembler::NE, RET_TRUE_NO_POP);
3651 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
3652 __ lsrv(rscratch1, rscratch1, rscratch2);
3653 __ tst(rscratch1, UPPER_BIT_MASK);
3654 __ cset(result, Assembler::NE);
3655 __ leave();
3656 __ ret(lr);
3657
3658 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
3659 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
3660
3661 has_negatives_long = __ pc(); // 2nd entry point
3662
3663 __ enter();
3664
3665 __ bind(LEN_OVER_15);
3666 __ push(spilled_regs, sp);
3667 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
3668 __ cbz(rscratch2, ALIGNED);
3669 __ ldp(tmp6, tmp1, Address(ary1));
3670 __ mov(tmp5, 16);
3671 __ sub(rscratch1, tmp5, rscratch2); // number of bytes until the aligned address
3672 __ add(ary1, ary1, rscratch1);
3673 __ sub(len, len, rscratch1);
3674 __ orr(tmp6, tmp6, tmp1);
3675 __ tst(tmp6, UPPER_BIT_MASK);
3676 __ br(Assembler::NE, RET_TRUE);
3677
3678 __ bind(ALIGNED);
3679 __ cmp(len, large_loop_size);
3680 __ br(Assembler::LT, CHECK_16);
3681 // Perform a 16-byte load as an early return in the pre-loop to handle the
3682 // situation when an initially aligned large array has negative values in its
3683 // starting bytes, in which case LARGE_LOOP would do 4 reads instead of 1
3684 // (in the worst case), which is slower. Cases with negative bytes further
3685 // ahead won't be affected much; in fact, they'll be faster due to the early
3686 // loads and the fewer instructions and branches in LARGE_LOOP.
3687 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
3688 __ sub(len, len, 16);
3689 __ orr(tmp6, tmp6, tmp1);
3690 __ tst(tmp6, UPPER_BIT_MASK);
3691 __ br(Assembler::NE, RET_TRUE);
3692 __ cmp(len, large_loop_size);
3693 __ br(Assembler::LT, CHECK_16);
3694
3695 if (SoftwarePrefetchHintDistance >= 0
3696 && SoftwarePrefetchHintDistance >= dcache_line) {
3697 // initial prefetch
3698 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
3699 }
3700 __ bind(LARGE_LOOP);
3701 if (SoftwarePrefetchHintDistance >= 0) {
3702 __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
3703 }
3704 // Issue the load instructions first, since that can save a few CPU/MEM
3705 // cycles. Also, instead of 4 "orr(...); tst(...); br(...)" triples (one per
3706 // ldp), it is better to generate 7 orr(...) plus a single tst(...) and
3707 // br(...), which saves instructions and branches; the downside is that this
3708 // disables early return, so all 64 bytes are loaded and checked every time.
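// A C sketch of one LARGE_LOOP iteration under this scheme (illustrative
// names only, not generated code):
//
//   uint64_t w[8];
//   memcpy(w, ary1, 64);                    // the four ldp pairs below
//   uint64_t m = 0;
//   for (int i = 0; i < 8; i++) m |= w[i];  // the orr reduction tree
//   if (m & UPPER_BIT_MASK) return true;    // single tst + br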
3709 __ ldp(tmp2, tmp3, Address(ary1)); 3710 __ ldp(tmp4, tmp5, Address(ary1, 16)); 3711 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 3712 __ ldp(tmp6, tmp1, Address(ary1, 48)); 3713 __ add(ary1, ary1, large_loop_size); 3714 __ sub(len, len, large_loop_size); 3715 __ orr(tmp2, tmp2, tmp3); 3716 __ orr(tmp4, tmp4, tmp5); 3717 __ orr(rscratch1, rscratch1, rscratch2); 3718 __ orr(tmp6, tmp6, tmp1); 3719 __ orr(tmp2, tmp2, tmp4); 3720 __ orr(rscratch1, rscratch1, tmp6); 3721 __ orr(tmp2, tmp2, rscratch1); 3722 __ tst(tmp2, UPPER_BIT_MASK); 3723 __ br(Assembler::NE, RET_TRUE); 3724 __ cmp(len, large_loop_size); 3725 __ br(Assembler::GE, LARGE_LOOP); 3726 3727 __ bind(CHECK_16); // small 16-byte load pre-loop 3728 __ cmp(len, (u1)16); 3729 __ br(Assembler::LT, POST_LOOP16); 3730 3731 __ bind(LOOP16); // small 16-byte load loop 3732 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 3733 __ sub(len, len, 16); 3734 __ orr(tmp2, tmp2, tmp3); 3735 __ tst(tmp2, UPPER_BIT_MASK); 3736 __ br(Assembler::NE, RET_TRUE); 3737 __ cmp(len, (u1)16); 3738 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 3739 3740 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 3741 __ cmp(len, (u1)8); 3742 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 3743 __ ldr(tmp3, Address(__ post(ary1, 8))); 3744 __ sub(len, len, 8); 3745 __ tst(tmp3, UPPER_BIT_MASK); 3746 __ br(Assembler::NE, RET_TRUE); 3747 3748 __ bind(POST_LOOP16_LOAD_TAIL); 3749 __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0 3750 __ ldr(tmp1, Address(ary1)); 3751 __ mov(tmp2, 64); 3752 __ sub(tmp4, tmp2, len, __ LSL, 3); 3753 __ lslv(tmp1, tmp1, tmp4); 3754 __ tst(tmp1, UPPER_BIT_MASK); 3755 __ br(Assembler::NE, RET_TRUE); 3756 // Fallthrough 3757 3758 __ bind(RET_FALSE); 3759 __ pop(spilled_regs, sp); 3760 __ leave(); 3761 __ mov(result, zr); 3762 __ ret(lr); 3763 3764 __ bind(RET_TRUE); 3765 __ pop(spilled_regs, sp); 3766 __ bind(RET_TRUE_NO_POP); 3767 __ leave(); 3768 __ mov(result, 1); 3769 __ ret(lr); 3770 3771 __ bind(DONE); 3772 __ pop(spilled_regs, sp); 3773 __ leave(); 3774 __ ret(lr); 3775 return entry; 3776 } 3777 3778 void generate_large_array_equals_loop_nonsimd(int loopThreshold, 3779 bool usePrefetch, Label &NOT_EQUAL) { 3780 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 3781 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 3782 tmp7 = r12, tmp8 = r13; 3783 Label LOOP; 3784 3785 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3786 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3787 __ bind(LOOP); 3788 if (usePrefetch) { 3789 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 3790 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 3791 } 3792 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 3793 __ eor(tmp1, tmp1, tmp2); 3794 __ eor(tmp3, tmp3, tmp4); 3795 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 3796 __ orr(tmp1, tmp1, tmp3); 3797 __ cbnz(tmp1, NOT_EQUAL); 3798 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3799 __ eor(tmp5, tmp5, tmp6); 3800 __ eor(tmp7, tmp7, tmp8); 3801 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3802 __ orr(tmp5, tmp5, tmp7); 3803 __ cbnz(tmp5, NOT_EQUAL); 3804 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 3805 __ eor(tmp1, tmp1, tmp2); 3806 __ eor(tmp3, tmp3, tmp4); 3807 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 3808 __ orr(tmp1, tmp1, tmp3); 3809 __ cbnz(tmp1, NOT_EQUAL); 3810 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3811 __ eor(tmp5, tmp5, tmp6); 
3812 __ sub(cnt1, cnt1, 8 * wordSize);
3813 __ eor(tmp7, tmp7, tmp8);
3814 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3815 // tmp6 is not used. MacroAssembler::subs is used here (rather than
3816 // cmp) because subs allows an unlimited range of immediate operands.
3817 __ subs(tmp6, cnt1, loopThreshold);
3818 __ orr(tmp5, tmp5, tmp7);
3819 __ cbnz(tmp5, NOT_EQUAL);
3820 __ br(__ GE, LOOP);
3821 // post-loop
3822 __ eor(tmp1, tmp1, tmp2);
3823 __ eor(tmp3, tmp3, tmp4);
3824 __ orr(tmp1, tmp1, tmp3);
3825 __ sub(cnt1, cnt1, 2 * wordSize);
3826 __ cbnz(tmp1, NOT_EQUAL);
3827 }
3828
3829 void generate_large_array_equals_loop_simd(int loopThreshold,
3830 bool usePrefetch, Label &NOT_EQUAL) {
3831 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3832 tmp2 = rscratch2;
3833 Label LOOP;
3834
3835 __ bind(LOOP);
3836 if (usePrefetch) {
3837 __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3838 __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3839 }
3840 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
3841 __ sub(cnt1, cnt1, 8 * wordSize);
3842 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
3843 __ subs(tmp1, cnt1, loopThreshold);
3844 __ eor(v0, __ T16B, v0, v4);
3845 __ eor(v1, __ T16B, v1, v5);
3846 __ eor(v2, __ T16B, v2, v6);
3847 __ eor(v3, __ T16B, v3, v7);
3848 __ orr(v0, __ T16B, v0, v1);
3849 __ orr(v1, __ T16B, v2, v3);
3850 __ orr(v0, __ T16B, v0, v1);
3851 __ umov(tmp1, v0, __ D, 0);
3852 __ umov(tmp2, v0, __ D, 1);
3853 __ orr(tmp1, tmp1, tmp2);
3854 __ cbnz(tmp1, NOT_EQUAL);
3855 __ br(__ GE, LOOP);
3856 }
3857
3858 // a1 = r1 - array1 address
3859 // a2 = r2 - array2 address
3860 // result = r0 - return value. Already contains "false"
3861 // cnt1 = r10 - number of elements left to check, reduced by wordSize
3862 // r3-r5 are reserved temporary registers
3863 address generate_large_array_equals() {
3864 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3865 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3866 tmp7 = r12, tmp8 = r13;
3867 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
3868 SMALL_LOOP, POST_LOOP;
3869 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
3870 // chosen so that at least 32 prefetched bytes will actually be used
3871 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
3872 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
3873 RegSet spilled_regs = RegSet::range(tmp6, tmp8);
3874 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
3875 tmp5, tmp6, tmp7, tmp8);
3876
3877 __ align(CodeEntryAlignment);
3878
3879 StubCodeMark mark(this, "StubRoutines", "large_array_equals");
3880
3881 address entry = __ pc();
3882 __ enter();
3883 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub
3884 // also advance pointers to use post-increment instead of pre-increment
3885 __ add(a1, a1, wordSize);
3886 __ add(a2, a2, wordSize);
3887 if (AvoidUnalignedAccesses) {
3888 // Both implementations (SIMD/non-SIMD) use relatively large load
3889 // instructions (ld1/ldp), which carry a huge penalty (up to 2x execution
3890 // time) on some CPUs when the address is not at least 16-byte aligned.
3891 // Arrays are currently 8-byte aligned, so, if needed, do one extra 8-byte
3892 // load to make at least the first address 16-byte aligned.
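// In outline (sketch only):
//   if (a1 % 16 == 8) {           // the tbz on bit 3 below
//     compare one 8-byte word from a1 and a2, advancing both pointers;
//     a1 is then 16-byte aligned (a2 may still be only 8-byte aligned).
//   }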
3893 Label ALIGNED16; 3894 __ tbz(a1, 3, ALIGNED16); 3895 __ ldr(tmp1, Address(__ post(a1, wordSize))); 3896 __ ldr(tmp2, Address(__ post(a2, wordSize))); 3897 __ sub(cnt1, cnt1, wordSize); 3898 __ eor(tmp1, tmp1, tmp2); 3899 __ cbnz(tmp1, NOT_EQUAL_NO_POP); 3900 __ bind(ALIGNED16); 3901 } 3902 if (UseSIMDForArrayEquals) { 3903 if (SoftwarePrefetchHintDistance >= 0) { 3904 __ subs(tmp1, cnt1, prefetchLoopThreshold); 3905 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 3906 generate_large_array_equals_loop_simd(prefetchLoopThreshold, 3907 /* prfm = */ true, NOT_EQUAL); 3908 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 3909 __ br(__ LT, TAIL); 3910 } 3911 __ bind(NO_PREFETCH_LARGE_LOOP); 3912 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold, 3913 /* prfm = */ false, NOT_EQUAL); 3914 } else { 3915 __ push(spilled_regs, sp); 3916 if (SoftwarePrefetchHintDistance >= 0) { 3917 __ subs(tmp1, cnt1, prefetchLoopThreshold); 3918 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 3919 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold, 3920 /* prfm = */ true, NOT_EQUAL); 3921 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 3922 __ br(__ LT, TAIL); 3923 } 3924 __ bind(NO_PREFETCH_LARGE_LOOP); 3925 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold, 3926 /* prfm = */ false, NOT_EQUAL); 3927 } 3928 __ bind(TAIL); 3929 __ cbz(cnt1, EQUAL); 3930 __ subs(cnt1, cnt1, wordSize); 3931 __ br(__ LE, POST_LOOP); 3932 __ bind(SMALL_LOOP); 3933 __ ldr(tmp1, Address(__ post(a1, wordSize))); 3934 __ ldr(tmp2, Address(__ post(a2, wordSize))); 3935 __ subs(cnt1, cnt1, wordSize); 3936 __ eor(tmp1, tmp1, tmp2); 3937 __ cbnz(tmp1, NOT_EQUAL); 3938 __ br(__ GT, SMALL_LOOP); 3939 __ bind(POST_LOOP); 3940 __ ldr(tmp1, Address(a1, cnt1)); 3941 __ ldr(tmp2, Address(a2, cnt1)); 3942 __ eor(tmp1, tmp1, tmp2); 3943 __ cbnz(tmp1, NOT_EQUAL); 3944 __ bind(EQUAL); 3945 __ mov(result, true); 3946 __ bind(NOT_EQUAL); 3947 if (!UseSIMDForArrayEquals) { 3948 __ pop(spilled_regs, sp); 3949 } 3950 __ bind(NOT_EQUAL_NO_POP); 3951 __ leave(); 3952 __ ret(lr); 3953 return entry; 3954 } 3955 3956 address generate_dsin_dcos(bool isCos) { 3957 __ align(CodeEntryAlignment); 3958 StubCodeMark mark(this, "StubRoutines", isCos ? 
"libmDcos" : "libmDsin"); 3959 address start = __ pc(); 3960 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw, 3961 (address)StubRoutines::aarch64::_two_over_pi, 3962 (address)StubRoutines::aarch64::_pio2, 3963 (address)StubRoutines::aarch64::_dsin_coef, 3964 (address)StubRoutines::aarch64::_dcos_coef); 3965 return start; 3966 } 3967 3968 address generate_dlog() { 3969 __ align(CodeEntryAlignment); 3970 StubCodeMark mark(this, "StubRoutines", "dlog"); 3971 address entry = __ pc(); 3972 FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4, 3973 vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19; 3974 Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4; 3975 __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3, 3976 tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5); 3977 return entry; 3978 } 3979 3980 // code for comparing 16 bytes of strings with same encoding 3981 void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) { 3982 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11; 3983 __ ldr(rscratch1, Address(__ post(str1, 8))); 3984 __ eor(rscratch2, tmp1, tmp2); 3985 __ ldr(cnt1, Address(__ post(str2, 8))); 3986 __ cbnz(rscratch2, DIFF1); 3987 __ ldr(tmp1, Address(__ post(str1, 8))); 3988 __ eor(rscratch2, rscratch1, cnt1); 3989 __ ldr(tmp2, Address(__ post(str2, 8))); 3990 __ cbnz(rscratch2, DIFF2); 3991 } 3992 3993 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding 3994 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 3995 Label &DIFF2) { 3996 Register cnt1 = r2, tmp1 = r10, tmp2 = r11, tmp3 = r12; 3997 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; 3998 3999 __ ldrq(vtmp, Address(__ post(tmp2, 16))); 4000 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4001 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 4002 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 4003 4004 __ fmovd(tmpL, vtmp3); 4005 __ eor(rscratch2, tmp3, tmpL); 4006 __ cbnz(rscratch2, DIFF2); 4007 4008 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4009 __ umov(tmpL, vtmp3, __ D, 1); 4010 __ eor(rscratch2, tmpU, tmpL); 4011 __ cbnz(rscratch2, DIFF1); 4012 4013 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 4014 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4015 __ fmovd(tmpL, vtmp); 4016 __ eor(rscratch2, tmp3, tmpL); 4017 __ cbnz(rscratch2, DIFF2); 4018 4019 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4020 __ umov(tmpL, vtmp, __ D, 1); 4021 __ eor(rscratch2, tmpU, tmpL); 4022 __ cbnz(rscratch2, DIFF1); 4023 } 4024 4025 // r0 = result 4026 // r1 = str1 4027 // r2 = cnt1 4028 // r3 = str2 4029 // r4 = cnt2 4030 // r10 = tmp1 4031 // r11 = tmp2 4032 address generate_compare_long_string_different_encoding(bool isLU) { 4033 __ align(CodeEntryAlignment); 4034 StubCodeMark mark(this, "StubRoutines", isLU 4035 ? 
"compare_long_string_different_encoding LU" 4036 : "compare_long_string_different_encoding UL"); 4037 address entry = __ pc(); 4038 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, 4039 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, SMALL_LOOP_ENTER, 4040 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; 4041 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 4042 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; 4043 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; 4044 RegSet spilled_regs = RegSet::of(tmp3, tmp4); 4045 4046 int prefetchLoopExitCondition = MAX(32, SoftwarePrefetchHintDistance/2); 4047 4048 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); 4049 // cnt2 == amount of characters left to compare 4050 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL)) 4051 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 4052 __ add(str1, str1, isLU ? wordSize/2 : wordSize); 4053 __ add(str2, str2, isLU ? wordSize : wordSize/2); 4054 __ fmovd(isLU ? tmp1 : tmp2, vtmp); 4055 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case. 4056 __ add(str1, str1, cnt2, __ LSL, isLU ? 0 : 1); 4057 __ eor(rscratch2, tmp1, tmp2); 4058 __ add(str2, str2, cnt2, __ LSL, isLU ? 1 : 0); 4059 __ mov(rscratch1, tmp2); 4060 __ cbnz(rscratch2, CALCULATE_DIFFERENCE); 4061 Register strU = isLU ? str2 : str1, 4062 strL = isLU ? str1 : str2, 4063 tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison 4064 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison 4065 __ push(spilled_regs, sp); 4066 __ sub(tmp2, strL, cnt2); // strL pointer to load from 4067 __ sub(cnt1, strU, cnt2, __ LSL, 1); // strU pointer to load from 4068 4069 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4070 4071 if (SoftwarePrefetchHintDistance >= 0) { 4072 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 4073 __ br(__ LT, SMALL_LOOP); 4074 __ bind(LARGE_LOOP_PREFETCH); 4075 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance)); 4076 __ mov(tmp4, 2); 4077 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 4078 __ bind(LARGE_LOOP_PREFETCH_REPEAT1); 4079 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4080 __ subs(tmp4, tmp4, 1); 4081 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1); 4082 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 4083 __ mov(tmp4, 2); 4084 __ bind(LARGE_LOOP_PREFETCH_REPEAT2); 4085 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4086 __ subs(tmp4, tmp4, 1); 4087 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2); 4088 __ sub(cnt2, cnt2, 64); 4089 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 4090 __ br(__ GE, LARGE_LOOP_PREFETCH); 4091 } 4092 __ cbz(cnt2, LOAD_LAST); // no characters left except last load 4093 __ subs(cnt2, cnt2, 16); 4094 __ br(__ LT, TAIL); 4095 __ b(SMALL_LOOP_ENTER); 4096 __ bind(SMALL_LOOP); // smaller loop 4097 __ subs(cnt2, cnt2, 16); 4098 __ bind(SMALL_LOOP_ENTER); 4099 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4100 __ br(__ GE, SMALL_LOOP); 4101 __ cbz(cnt2, LOAD_LAST); 4102 __ bind(TAIL); // 1..15 characters left 4103 __ subs(zr, cnt2, -8); 4104 __ br(__ GT, TAIL_LOAD_16); 4105 __ ldrd(vtmp, Address(tmp2)); 4106 __ zip1(vtmp3, __ T8B, vtmp, vtmpZ); 4107 4108 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4109 __ fmovd(tmpL, vtmp3); 4110 __ eor(rscratch2, tmp3, tmpL); 4111 __ cbnz(rscratch2, DIFF2); 4112 __ umov(tmpL, vtmp3, __ D, 1); 4113 __ eor(rscratch2, tmpU, tmpL); 4114 __ cbnz(rscratch2, DIFF1); 4115 __ b(LOAD_LAST); 4116 __ bind(TAIL_LOAD_16); 4117 __ ldrq(vtmp, Address(tmp2)); 4118 __ ldr(tmpU, Address(__ post(cnt1, 
8))); 4119 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 4120 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 4121 __ fmovd(tmpL, vtmp3); 4122 __ eor(rscratch2, tmp3, tmpL); 4123 __ cbnz(rscratch2, DIFF2); 4124 4125 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4126 __ umov(tmpL, vtmp3, __ D, 1); 4127 __ eor(rscratch2, tmpU, tmpL); 4128 __ cbnz(rscratch2, DIFF1); 4129 4130 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4131 __ fmovd(tmpL, vtmp); 4132 __ eor(rscratch2, tmp3, tmpL); 4133 __ cbnz(rscratch2, DIFF2); 4134 4135 __ umov(tmpL, vtmp, __ D, 1); 4136 __ eor(rscratch2, tmpU, tmpL); 4137 __ cbnz(rscratch2, DIFF1); 4138 __ b(LOAD_LAST); 4139 __ bind(DIFF2); 4140 __ mov(tmpU, tmp3); 4141 __ bind(DIFF1); 4142 __ pop(spilled_regs, sp); 4143 __ b(CALCULATE_DIFFERENCE); 4144 __ bind(LOAD_LAST); 4145 __ pop(spilled_regs, sp); 4146 4147 __ ldrs(vtmp, Address(strL)); 4148 __ ldr(tmpU, Address(strU)); 4149 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 4150 __ fmovd(tmpL, vtmp); 4151 4152 __ eor(rscratch2, tmpU, tmpL); 4153 __ cbz(rscratch2, DONE); 4154 4155 // Find the first different characters in the longwords and 4156 // compute their difference. 4157 __ bind(CALCULATE_DIFFERENCE); 4158 __ rev(rscratch2, rscratch2); 4159 __ clz(rscratch2, rscratch2); 4160 __ andr(rscratch2, rscratch2, -16); 4161 __ lsrv(tmp1, tmp1, rscratch2); 4162 __ uxthw(tmp1, tmp1); 4163 __ lsrv(rscratch1, rscratch1, rscratch2); 4164 __ uxthw(rscratch1, rscratch1); 4165 __ subw(result, tmp1, rscratch1); 4166 __ bind(DONE); 4167 __ ret(lr); 4168 return entry; 4169 } 4170 4171 // r0 = result 4172 // r1 = str1 4173 // r2 = cnt1 4174 // r3 = str2 4175 // r4 = cnt2 4176 // r10 = tmp1 4177 // r11 = tmp2 4178 address generate_compare_long_string_same_encoding(bool isLL) { 4179 __ align(CodeEntryAlignment); 4180 StubCodeMark mark(this, "StubRoutines", isLL 4181 ? "compare_long_string_same_encoding LL" 4182 : "compare_long_string_same_encoding UU"); 4183 address entry = __ pc(); 4184 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 4185 tmp1 = r10, tmp2 = r11; 4186 Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL, 4187 LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF, 4188 DIFF_LAST_POSITION, DIFF_LAST_POSITION2; 4189 // exit from large loop when less than 64 bytes left to read or we're about 4190 // to prefetch memory behind array border 4191 int largeLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); 4192 // cnt1/cnt2 contains amount of characters to compare. cnt1 can be re-used 4193 // update cnt2 counter with already loaded 8 bytes 4194 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2)); 4195 // update pointers, because of previous read 4196 __ add(str1, str1, wordSize); 4197 __ add(str2, str2, wordSize); 4198 if (SoftwarePrefetchHintDistance >= 0) { 4199 __ bind(LARGE_LOOP_PREFETCH); 4200 __ prfm(Address(str1, SoftwarePrefetchHintDistance)); 4201 __ prfm(Address(str2, SoftwarePrefetchHintDistance)); 4202 compare_string_16_bytes_same(DIFF, DIFF2); 4203 compare_string_16_bytes_same(DIFF, DIFF2); 4204 __ sub(cnt2, cnt2, isLL ? 64 : 32); 4205 compare_string_16_bytes_same(DIFF, DIFF2); 4206 __ subs(rscratch2, cnt2, largeLoopExitCondition); 4207 compare_string_16_bytes_same(DIFF, DIFF2); 4208 __ br(__ GT, LARGE_LOOP_PREFETCH); 4209 __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left? 4210 // less than 16 bytes left? 4211 __ subs(cnt2, cnt2, isLL ? 16 : 8); 4212 __ br(__ LT, TAIL); 4213 } 4214 __ bind(SMALL_LOOP); 4215 compare_string_16_bytes_same(DIFF, DIFF2); 4216 __ subs(cnt2, cnt2, isLL ? 
16 : 8);
4217 __ br(__ GE, SMALL_LOOP);
4218 __ bind(TAIL);
4219 __ adds(cnt2, cnt2, isLL ? 16 : 8);
4220 __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF);
4221 __ subs(cnt2, cnt2, isLL ? 8 : 4);
4222 __ br(__ LE, CHECK_LAST);
4223 __ eor(rscratch2, tmp1, tmp2);
4224 __ cbnz(rscratch2, DIFF);
4225 __ ldr(tmp1, Address(__ post(str1, 8)));
4226 __ ldr(tmp2, Address(__ post(str2, 8)));
4227 __ sub(cnt2, cnt2, isLL ? 8 : 4);
4228 __ bind(CHECK_LAST);
4229 if (!isLL) {
4230 __ add(cnt2, cnt2, cnt2); // now in bytes
4231 }
4232 __ eor(rscratch2, tmp1, tmp2);
4233 __ cbnz(rscratch2, DIFF);
4234 __ ldr(rscratch1, Address(str1, cnt2));
4235 __ ldr(cnt1, Address(str2, cnt2));
4236 __ eor(rscratch2, rscratch1, cnt1);
4237 __ cbz(rscratch2, LENGTH_DIFF);
4238 // Find the first different characters in the longwords and
4239 // compute their difference.
4240 __ bind(DIFF2);
4241 __ rev(rscratch2, rscratch2);
4242 __ clz(rscratch2, rscratch2);
4243 __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4244 __ lsrv(rscratch1, rscratch1, rscratch2);
4245 if (isLL) {
4246 __ lsrv(cnt1, cnt1, rscratch2);
4247 __ uxtbw(rscratch1, rscratch1);
4248 __ uxtbw(cnt1, cnt1);
4249 } else {
4250 __ lsrv(cnt1, cnt1, rscratch2);
4251 __ uxthw(rscratch1, rscratch1);
4252 __ uxthw(cnt1, cnt1);
4253 }
4254 __ subw(result, rscratch1, cnt1);
4255 __ b(LENGTH_DIFF);
4256 __ bind(DIFF);
4257 __ rev(rscratch2, rscratch2);
4258 __ clz(rscratch2, rscratch2);
4259 __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4260 __ lsrv(tmp1, tmp1, rscratch2);
4261 if (isLL) {
4262 __ lsrv(tmp2, tmp2, rscratch2);
4263 __ uxtbw(tmp1, tmp1);
4264 __ uxtbw(tmp2, tmp2);
4265 } else {
4266 __ lsrv(tmp2, tmp2, rscratch2);
4267 __ uxthw(tmp1, tmp1);
4268 __ uxthw(tmp2, tmp2);
4269 }
4270 __ subw(result, tmp1, tmp2);
4271 __ b(LENGTH_DIFF);
4272 __ bind(LAST_CHECK_AND_LENGTH_DIFF);
4273 __ eor(rscratch2, tmp1, tmp2);
4274 __ cbnz(rscratch2, DIFF);
4275 __ bind(LENGTH_DIFF);
4276 __ ret(lr);
4277 return entry;
4278 }
4279
4280 void generate_compare_long_strings() {
4281 StubRoutines::aarch64::_compare_long_string_LL
4282 = generate_compare_long_string_same_encoding(true);
4283 StubRoutines::aarch64::_compare_long_string_UU
4284 = generate_compare_long_string_same_encoding(false);
4285 StubRoutines::aarch64::_compare_long_string_LU
4286 = generate_compare_long_string_different_encoding(true);
4287 StubRoutines::aarch64::_compare_long_string_UL
4288 = generate_compare_long_string_different_encoding(false);
4289 }
4290
4291 // R0 = result
4292 // R1 = str2
4293 // R2 = cnt1
4294 // R3 = str1
4295 // R4 = cnt2
4296 // This generic linear code uses a few additional ideas that make it faster:
4297 // 1) we can safely keep at least the 1st register of the pattern (since
4298 // length >= 8), in order to skip the initial load (helps on systems with 1 ld pipeline)
4299 // 2) we can use a "fast" algorithm for finding a single character, searching
4300 // for the first symbol with fewer branches (1 branch per loaded register
4301 // instead of a branch per symbol); this is where constants like
4302 // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff come from
4303 // 3) after loading and analyzing the 1st register of the source string, it
4304 // can be used to search for every 1st-character entry, saving a few loads
4305 // compared with a "simpler-but-slower" implementation
4306 // 4) in order to avoid lots of push/pop operations, the code below heavily
4307 // re-uses/re-initializes/compresses register values, which makes the code
4308 // larger and a bit less readable; however,
most of extra operations are 4309 // issued during loads or branches, so, penalty is minimal 4310 address generate_string_indexof_linear(bool str1_isL, bool str2_isL) { 4311 const char* stubName = str1_isL 4312 ? (str2_isL ? "indexof_linear_ll" : "indexof_linear_ul") 4313 : "indexof_linear_uu"; 4314 __ align(CodeEntryAlignment); 4315 StubCodeMark mark(this, "StubRoutines", stubName); 4316 address entry = __ pc(); 4317 4318 int str1_chr_size = str1_isL ? 1 : 2; 4319 int str2_chr_size = str2_isL ? 1 : 2; 4320 int str1_chr_shift = str1_isL ? 0 : 1; 4321 int str2_chr_shift = str2_isL ? 0 : 1; 4322 bool isL = str1_isL && str2_isL; 4323 // parameters 4324 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4; 4325 // temporary registers 4326 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23; 4327 RegSet spilled_regs = RegSet::range(tmp1, tmp4); 4328 // redefinitions 4329 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3; 4330 4331 __ push(spilled_regs, sp); 4332 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, 4333 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 4334 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 4335 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 4336 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 4337 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 4338 // Read whole register from str1. It is safe, because length >=8 here 4339 __ ldr(ch1, Address(str1)); 4340 // Read whole register from str2. It is safe, because length >=8 here 4341 __ ldr(ch2, Address(str2)); 4342 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF); 4343 if (str1_isL != str2_isL) { 4344 __ eor(v0, __ T16B, v0, v0); 4345 } 4346 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 4347 __ mul(first, first, tmp1); 4348 // check if we have less than 1 register to check 4349 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1); 4350 if (str1_isL != str2_isL) { 4351 __ fmovd(v1, ch1); 4352 } 4353 __ br(__ LE, L_SMALL); 4354 __ eor(ch2, first, ch2); 4355 if (str1_isL != str2_isL) { 4356 __ zip1(v1, __ T16B, v1, v0); 4357 } 4358 __ sub(tmp2, ch2, tmp1); 4359 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4360 __ bics(tmp2, tmp2, ch2); 4361 if (str1_isL != str2_isL) { 4362 __ fmovd(ch1, v1); 4363 } 4364 __ br(__ NE, L_HAS_ZERO); 4365 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 4366 __ add(result, result, wordSize/str2_chr_size); 4367 __ add(str2, str2, wordSize); 4368 __ br(__ LT, L_POST_LOOP); 4369 __ BIND(L_LOOP); 4370 __ ldr(ch2, Address(str2)); 4371 __ eor(ch2, first, ch2); 4372 __ sub(tmp2, ch2, tmp1); 4373 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4374 __ bics(tmp2, tmp2, ch2); 4375 __ br(__ NE, L_HAS_ZERO); 4376 __ BIND(L_LOOP_PROCEED); 4377 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 4378 __ add(str2, str2, wordSize); 4379 __ add(result, result, wordSize/str2_chr_size); 4380 __ br(__ GE, L_LOOP); 4381 __ BIND(L_POST_LOOP); 4382 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check 4383 __ br(__ LE, NOMATCH); 4384 __ ldr(ch2, Address(str2)); 4385 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 4386 __ eor(ch2, first, ch2); 4387 __ sub(tmp2, ch2, tmp1); 4388 __ orr(ch2, ch2, str2_isL ? 
0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4389 __ mov(tmp4, -1); // all bits set 4390 __ b(L_SMALL_PROCEED); 4391 __ align(OptoLoopAlignment); 4392 __ BIND(L_SMALL); 4393 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 4394 __ eor(ch2, first, ch2); 4395 if (str1_isL != str2_isL) { 4396 __ zip1(v1, __ T16B, v1, v0); 4397 } 4398 __ sub(tmp2, ch2, tmp1); 4399 __ mov(tmp4, -1); // all bits set 4400 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4401 if (str1_isL != str2_isL) { 4402 __ fmovd(ch1, v1); // move converted 4 symbols 4403 } 4404 __ BIND(L_SMALL_PROCEED); 4405 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits. 4406 __ bic(tmp2, tmp2, ch2); 4407 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check 4408 __ rbit(tmp2, tmp2); 4409 __ br(__ EQ, NOMATCH); 4410 __ BIND(L_SMALL_HAS_ZERO_LOOP); 4411 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's 4412 __ cmp(cnt1, u1(wordSize/str2_chr_size)); 4413 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2); 4414 if (str2_isL) { // LL 4415 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 4416 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 4417 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 4418 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 4419 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4420 } else { 4421 __ mov(ch2, 0xE); // all bits in byte set except last one 4422 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4423 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4424 __ lslv(tmp2, tmp2, tmp4); 4425 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4426 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4427 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4428 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4429 } 4430 __ cmp(ch1, ch2); 4431 __ mov(tmp4, wordSize/str2_chr_size); 4432 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4433 __ BIND(L_SMALL_CMP_LOOP); 4434 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 4435 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 4436 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 4437 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 4438 __ add(tmp4, tmp4, 1); 4439 __ cmp(tmp4, cnt1); 4440 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP); 4441 __ cmp(first, ch2); 4442 __ br(__ EQ, L_SMALL_CMP_LOOP); 4443 __ BIND(L_SMALL_CMP_LOOP_NOMATCH); 4444 __ cbz(tmp2, NOMATCH); // no more matches. exit 4445 __ clz(tmp4, tmp2); 4446 __ add(result, result, 1); // advance index 4447 __ add(str2, str2, str2_chr_size); // advance pointer 4448 __ b(L_SMALL_HAS_ZERO_LOOP); 4449 __ align(OptoLoopAlignment); 4450 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP); 4451 __ cmp(first, ch2); 4452 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4453 __ b(DONE); 4454 __ align(OptoLoopAlignment); 4455 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2); 4456 if (str2_isL) { // LL 4457 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 4458 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 
4459 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 4460 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 4461 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4462 } else { 4463 __ mov(ch2, 0xE); // all bits in byte set except last one 4464 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4465 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4466 __ lslv(tmp2, tmp2, tmp4); 4467 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4468 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4469 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4470 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4471 } 4472 __ cmp(ch1, ch2); 4473 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4474 __ b(DONE); 4475 __ align(OptoLoopAlignment); 4476 __ BIND(L_HAS_ZERO); 4477 __ rbit(tmp2, tmp2); 4478 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's 4479 // Now, perform compression of counters(cnt2 and cnt1) into one register. 4480 // It's fine because both counters are 32bit and are not changed in this 4481 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop. 4482 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2); 4483 __ sub(result, result, 1); 4484 __ BIND(L_HAS_ZERO_LOOP); 4485 __ mov(cnt1, wordSize/str2_chr_size); 4486 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2); 4487 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare 4488 if (str2_isL) { 4489 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 4490 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4491 __ lslv(tmp2, tmp2, tmp4); 4492 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4493 __ add(tmp4, tmp4, 1); 4494 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4495 __ lsl(tmp2, tmp2, 1); 4496 __ mov(tmp4, wordSize/str2_chr_size); 4497 } else { 4498 __ mov(ch2, 0xE); 4499 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4500 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4501 __ lslv(tmp2, tmp2, tmp4); 4502 __ add(tmp4, tmp4, 1); 4503 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4504 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 4505 __ lsl(tmp2, tmp2, 1); 4506 __ mov(tmp4, wordSize/str2_chr_size); 4507 __ sub(str2, str2, str2_chr_size); 4508 } 4509 __ cmp(ch1, ch2); 4510 __ mov(tmp4, wordSize/str2_chr_size); 4511 __ br(__ NE, L_CMP_LOOP_NOMATCH); 4512 __ BIND(L_CMP_LOOP); 4513 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 4514 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 4515 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 4516 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 4517 __ add(tmp4, tmp4, 1); 4518 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2); 4519 __ br(__ GE, L_CMP_LOOP_LAST_CMP); 4520 __ cmp(cnt1, ch2); 4521 __ br(__ EQ, L_CMP_LOOP); 4522 __ BIND(L_CMP_LOOP_NOMATCH); 4523 // here we're not matched 4524 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. 
Proceed to main loop
4525 __ clz(tmp4, tmp2);
4526 __ add(str2, str2, str2_chr_size); // advance pointer
4527 __ b(L_HAS_ZERO_LOOP);
4528 __ align(OptoLoopAlignment);
4529 __ BIND(L_CMP_LOOP_LAST_CMP);
4530 __ cmp(cnt1, ch2);
4531 __ br(__ NE, L_CMP_LOOP_NOMATCH);
4532 __ b(DONE);
4533 __ align(OptoLoopAlignment);
4534 __ BIND(L_CMP_LOOP_LAST_CMP2);
4535 if (str2_isL) {
4536 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index
4537 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4538 __ lslv(tmp2, tmp2, tmp4);
4539 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4540 __ add(tmp4, tmp4, 1);
4541 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4542 __ lsl(tmp2, tmp2, 1);
4543 } else {
4544 __ mov(ch2, 0xE);
4545 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount
4546 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe.
4547 __ lslv(tmp2, tmp2, tmp4);
4548 __ add(tmp4, tmp4, 1);
4549 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift);
4550 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte);
4551 __ lsl(tmp2, tmp2, 1);
4552 __ sub(str2, str2, str2_chr_size);
4553 }
4554 __ cmp(ch1, ch2);
4555 __ br(__ NE, L_CMP_LOOP_NOMATCH);
4556 __ b(DONE);
4557 __ align(OptoLoopAlignment);
4558 __ BIND(L_HAS_ZERO_LOOP_NOMATCH);
4559 // 1) Restore the "result" index. The index was wordSize/str2_chr_size * N
4560 // until the L_HAS_ZERO block. A byte octet was analyzed in L_HAS_ZERO_LOOP,
4561 // so result was increased by at most wordSize/str2_chr_size - 1 and the
4562 // respective high bit wasn't changed. L_LOOP_PROCEED will increase result
4563 // by the number of analyzed characters, so we can just reset the lower
4564 // bits of result here. Clear 2 lower bits for UU/UL and 3 bits for LL.
4565 // 2) restore cnt1 and cnt2 values from the "compressed" cnt2
4566 // 3) advance str2 to represent the next str2 octet. result & 7/3 is the
4567 // index of the last analyzed substring inside the current octet. So, str2 is
4568 // at the respective start address.
We need to advance it to next octet 4569 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed 4570 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2); 4571 __ bfm(result, zr, 0, 2 - str2_chr_shift); 4572 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2 4573 __ movw(cnt2, cnt2); 4574 __ b(L_LOOP_PROCEED); 4575 __ align(OptoLoopAlignment); 4576 __ BIND(NOMATCH); 4577 __ mov(result, -1); 4578 __ BIND(DONE); 4579 __ pop(spilled_regs, sp); 4580 __ ret(lr); 4581 return entry; 4582 } 4583 4584 void generate_string_indexof_stubs() { 4585 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true); 4586 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false); 4587 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); 4588 } 4589 4590 void inflate_and_store_2_fp_registers(bool generatePrfm, 4591 FloatRegister src1, FloatRegister src2) { 4592 Register dst = r1; 4593 __ zip1(v1, __ T16B, src1, v0); 4594 __ zip2(v2, __ T16B, src1, v0); 4595 if (generatePrfm) { 4596 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM); 4597 } 4598 __ zip1(v3, __ T16B, src2, v0); 4599 __ zip2(v4, __ T16B, src2, v0); 4600 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64))); 4601 } 4602 4603 // R0 = src 4604 // R1 = dst 4605 // R2 = len 4606 // R3 = len >> 3 4607 // V0 = 0 4608 // v1 = loaded 8 bytes 4609 address generate_large_byte_array_inflate() { 4610 __ align(CodeEntryAlignment); 4611 StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate"); 4612 address entry = __ pc(); 4613 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE; 4614 Register src = r0, dst = r1, len = r2, octetCounter = r3; 4615 const int large_loop_threshold = MAX(64, SoftwarePrefetchHintDistance)/8 + 4; 4616 4617 // do one more 8-byte read to have address 16-byte aligned in most cases 4618 // also use single store instruction 4619 __ ldrd(v2, __ post(src, 8)); 4620 __ sub(octetCounter, octetCounter, 2); 4621 __ zip1(v1, __ T16B, v1, v0); 4622 __ zip1(v2, __ T16B, v2, v0); 4623 __ st1(v1, v2, __ T16B, __ post(dst, 32)); 4624 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 4625 __ subs(rscratch1, octetCounter, large_loop_threshold); 4626 __ br(__ LE, LOOP_START); 4627 __ b(LOOP_PRFM_START); 4628 __ bind(LOOP_PRFM); 4629 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 4630 __ bind(LOOP_PRFM_START); 4631 __ prfm(Address(src, SoftwarePrefetchHintDistance)); 4632 __ sub(octetCounter, octetCounter, 8); 4633 __ subs(rscratch1, octetCounter, large_loop_threshold); 4634 inflate_and_store_2_fp_registers(true, v3, v4); 4635 inflate_and_store_2_fp_registers(true, v5, v6); 4636 __ br(__ GT, LOOP_PRFM); 4637 __ cmp(octetCounter, (u1)8); 4638 __ br(__ LT, DONE); 4639 __ bind(LOOP); 4640 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 4641 __ bind(LOOP_START); 4642 __ sub(octetCounter, octetCounter, 8); 4643 __ cmp(octetCounter, (u1)8); 4644 inflate_and_store_2_fp_registers(false, v3, v4); 4645 inflate_and_store_2_fp_registers(false, v5, v6); 4646 __ br(__ GE, LOOP); 4647 __ bind(DONE); 4648 __ ret(lr); 4649 return entry; 4650 } 4651 4652 /** 4653 * Arguments: 4654 * 4655 * Input: 4656 * c_rarg0 - current state address 4657 * c_rarg1 - H key address 4658 * c_rarg2 - data address 4659 * c_rarg3 - number of blocks 4660 * 4661 * Output: 4662 * Updated state at c_rarg0 4663 */ 4664 address generate_ghash_processBlocks() { 4665 // Bafflingly, GCM uses 
little-endian for the byte order, but
4666 // big-endian for the bit order. For example, the polynomial 1 is
4667 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
4668 //
4669 // So, we must either reverse the bytes in each word and do
4670 // everything big-endian or reverse the bits in each byte and do
4671 // it little-endian. On AArch64 it's more idiomatic to reverse
4672 // the bits in each byte (we have an instruction, RBIT, to do
4673 // that) and keep the data in little-endian bit order through the
4674 // calculation, bit-reversing the inputs and outputs.
4675
4676 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4677 __ align(wordSize * 2);
4678 address p = __ pc();
4679 __ emit_int64(0x87); // The low-order bits of the field
4680 // polynomial (i.e. p = z^7+z^2+z+1)
4681 // repeated in the low and high parts of a
4682 // 128-bit vector
4683 __ emit_int64(0x87);
4684
4685 __ align(CodeEntryAlignment);
4686 address start = __ pc();
4687
4688 Register state = c_rarg0;
4689 Register subkeyH = c_rarg1;
4690 Register data = c_rarg2;
4691 Register blocks = c_rarg3;
4692
4693 FloatRegister vzr = v30;
4694 __ eor(vzr, __ T16B, vzr, vzr); // zero register
4695
4696 __ ldrq(v0, Address(state));
4697 __ ldrq(v1, Address(subkeyH));
4698
4699 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH
4700 __ rbit(v0, __ T16B, v0);
4701 __ rev64(v1, __ T16B, v1);
4702 __ rbit(v1, __ T16B, v1);
4703
4704 __ ldrq(v26, p);
4705
4706 __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1
4707 __ eor(v16, __ T16B, v16, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0))
4708
4709 {
4710 Label L_ghash_loop;
4711 __ bind(L_ghash_loop);
4712
4713 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit
4714 // reversing each byte
4715 __ rbit(v2, __ T16B, v2);
4716 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state
4717
4718 // Multiply state in v2 by subkey in v1
4719 ghash_multiply(/*result_lo*/v5, /*result_hi*/v7,
4720 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16,
4721 /*temps*/v6, v20, v18, v21);
4722 // Reduce v7:v5 by the field polynomial
4723 ghash_reduce(v0, v5, v7, v26, vzr, v20);
4724
4725 __ sub(blocks, blocks, 1);
4726 __ cbnz(blocks, L_ghash_loop);
4727 }
4728
4729 // The bit-reversed result is at this point in v0
4730 __ rev64(v1, __ T16B, v0);
4731 __ rbit(v1, __ T16B, v1);
4732
4733 __ st1(v1, __ T16B, state);
4734 __ ret(lr);
4735
4736 return start;
4737 }
4738
4739 // Continuation point for throwing of implicit exceptions that are
4740 // not handled in the current activation. Fabricates an exception
4741 // oop and initiates normal exception dispatching in this
4742 // frame. Since we need to preserve callee-saved values (currently
4743 // only for C2, but done for C1 as well) we need a callee-saved oop
4744 // map and therefore have to make these stubs into RuntimeStubs
4745 // rather than BufferBlobs. If the compiler needs all registers to
4746 // be preserved between the fault point and the exception handler
4747 // then it must assume responsibility for that in
4748 // AbstractCompiler::continuation_for_implicit_null_exception or
4749 // continuation_for_implicit_division_by_zero_exception. All other
4750 // implicit exceptions (e.g., NullPointerException or
4751 // AbstractMethodError on entry) are either at call sites or
4752 // otherwise assume that stack unwinding will be initiated, so
4753 // caller saved registers were assumed volatile in the compiler.
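// In outline, each stub generated below behaves like (a sketch, not the
// generated code itself):
//
//   enter();                               // build the RuntimeStub frame
//   call runtime_entry(thread, ...);       // fabricates the exception oop
//   leave();
//   jump StubRoutines::forward_exception_entry();  // dispatch to the handler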
4754 4755 #undef __ 4756 #define __ masm-> 4757 4758 address generate_throw_exception(const char* name, 4759 address runtime_entry, 4760 Register arg1 = noreg, 4761 Register arg2 = noreg) { 4762 // Information about frame layout at time of blocking runtime call. 4763 // Note that we only have to preserve callee-saved registers since 4764 // the compilers are responsible for supplying a continuation point 4765 // if they expect all registers to be preserved. 4766 // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0 4767 enum layout { 4768 rfp_off = 0, 4769 rfp_off2, 4770 return_off, 4771 return_off2, 4772 framesize // inclusive of return address 4773 }; 4774 4775 int insts_size = 512; 4776 int locs_size = 64; 4777 4778 CodeBuffer code(name, insts_size, locs_size); 4779 OopMapSet* oop_maps = new OopMapSet(); 4780 MacroAssembler* masm = new MacroAssembler(&code); 4781 4782 address start = __ pc(); 4783 4784 // This is an inlined and slightly modified version of call_VM 4785 // which has the ability to fetch the return PC out of 4786 // thread-local storage and also sets up last_Java_sp slightly 4787 // differently than the real call_VM 4788 4789 __ enter(); // Save FP and LR before call 4790 4791 assert(is_even(framesize/2), "sp not 16-byte aligned"); 4792 4793 // lr and fp are already in place 4794 __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog 4795 4796 int frame_complete = __ pc() - start; 4797 4798 // Set up last_Java_sp and last_Java_fp 4799 address the_pc = __ pc(); 4800 __ set_last_Java_frame(sp, rfp, the_pc, rscratch1); 4801 4802 // Call runtime 4803 if (arg1 != noreg) { 4804 assert(arg2 != c_rarg1, "clobbered"); 4805 __ mov(c_rarg1, arg1); 4806 } 4807 if (arg2 != noreg) { 4808 __ mov(c_rarg2, arg2); 4809 } 4810 __ mov(c_rarg0, rthread); 4811 BLOCK_COMMENT("call runtime_entry"); 4812 __ mov(rscratch1, runtime_entry); 4813 __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1); 4814 4815 // Generate oop map 4816 OopMap* map = new OopMap(framesize, 0); 4817 4818 oop_maps->add_gc_map(the_pc - start, map); 4819 4820 __ reset_last_Java_frame(true); 4821 __ maybe_isb(); 4822 4823 __ leave(); 4824 4825 // check for pending exceptions 4826 #ifdef ASSERT 4827 Label L; 4828 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 4829 __ cbnz(rscratch1, L); 4830 __ should_not_reach_here(); 4831 __ bind(L); 4832 #endif // ASSERT 4833 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 4834 4835 4836 // codeBlob framesize is in words (not VMRegImpl::slot_size) 4837 RuntimeStub* stub = 4838 RuntimeStub::new_runtime_stub(name, 4839 &code, 4840 frame_complete, 4841 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 4842 oop_maps, false); 4843 return stub->entry_point(); 4844 } 4845 4846 class MontgomeryMultiplyGenerator : public MacroAssembler { 4847 4848 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 4849 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 4850 4851 RegSet _toSave; 4852 bool _squaring; 4853 4854 public: 4855 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 4856 : MacroAssembler(as->code()), _squaring(squaring) { 4857 4858 // Register allocation 4859 4860 Register reg = c_rarg0; 4861 Pa_base = reg; // Argument registers 4862 if (squaring) 4863 Pb_base = Pa_base; 4864 else 4865 Pb_base = ++reg; 4866 Pn_base = ++reg; 4867 Rlen= ++reg; 4868 inv = ++reg; 4869 Pm_base = ++reg; 4870 4871 // Working registers: 4872 Ra = ++reg; // The current digit of a, b, n, and m. 
4873 Rb = ++reg;
4874 Rm = ++reg;
4875 Rn = ++reg;
4876
4877 Pa = ++reg; // Pointers to the current/next digit of a, b, n, and m.
4878 Pb = ++reg;
4879 Pm = ++reg;
4880 Pn = ++reg;
4881
4882 t0 = ++reg; // Three registers which form a
4883 t1 = ++reg; // triple-precision accumulator.
4884 t2 = ++reg;
4885
4886 Ri = ++reg; // Inner and outer loop indexes.
4887 Rj = ++reg;
4888
4889 Rhi_ab = ++reg; // Product registers: low and high parts
4890 Rlo_ab = ++reg; // of a*b and m*n.
4891 Rhi_mn = ++reg;
4892 Rlo_mn = ++reg;
4893
4894 // r19 and up are callee-saved.
4895 _toSave = RegSet::range(r19, reg) + Pm_base;
4896 }
4897
4898 private:
4899 void save_regs() {
4900 push(_toSave, sp);
4901 }
4902
4903 void restore_regs() {
4904 pop(_toSave, sp);
4905 }
4906
4907 template <typename T>
4908 void unroll_2(Register count, T block) {
4909 Label loop, end, odd;
4910 tbnz(count, 0, odd);
4911 cbz(count, end);
4912 align(16);
4913 bind(loop);
4914 (this->*block)();
4915 bind(odd);
4916 (this->*block)();
4917 subs(count, count, 2);
4918 br(Assembler::GT, loop);
4919 bind(end);
4920 }
4921
4922 template <typename T>
4923 void unroll_2(Register count, T block, Register d, Register s, Register tmp) {
4924 Label loop, end, odd;
4925 tbnz(count, 0, odd);
4926 cbz(count, end);
4927 align(16);
4928 bind(loop);
4929 (this->*block)(d, s, tmp);
4930 bind(odd);
4931 (this->*block)(d, s, tmp);
4932 subs(count, count, 2);
4933 br(Assembler::GT, loop);
4934 bind(end);
4935 }
4936
4937 void pre1(RegisterOrConstant i) {
4938 block_comment("pre1");
4939 // Pa = Pa_base;
4940 // Pb = Pb_base + i;
4941 // Pm = Pm_base;
4942 // Pn = Pn_base + i;
4943 // Ra = *Pa;
4944 // Rb = *Pb;
4945 // Rm = *Pm;
4946 // Rn = *Pn;
4947 ldr(Ra, Address(Pa_base));
4948 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4949 ldr(Rm, Address(Pm_base));
4950 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4951 lea(Pa, Address(Pa_base));
4952 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord)));
4953 lea(Pm, Address(Pm_base));
4954 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
4955
4956 // Zero the m*n result.
4957 mov(Rhi_mn, zr);
4958 mov(Rlo_mn, zr);
4959 }
4960
4961 // The core multiply-accumulate step of a Montgomery
4962 // multiplication. The idea is to schedule operations as a
4963 // pipeline so that instructions with long latencies (loads and
4964 // multiplies) have time to complete before their results are
4965 // used. This most benefits in-order implementations of the
4966 // architecture but out-of-order ones also benefit.
4967 void step() {
4968 block_comment("step");
4969 // MACC(Ra, Rb, t0, t1, t2);
4970 // Ra = *++Pa;
4971 // Rb = *--Pb;
4972 umulh(Rhi_ab, Ra, Rb);
4973 mul(Rlo_ab, Ra, Rb);
4974 ldr(Ra, pre(Pa, wordSize));
4975 ldr(Rb, pre(Pb, -wordSize));
4976 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the
4978 // MACC(Rm, Rn, t0, t1, t2); 4979 // Rm = *++Pm; 4980 // Rn = *--Pn; 4981 umulh(Rhi_mn, Rm, Rn); 4982 mul(Rlo_mn, Rm, Rn); 4983 ldr(Rm, pre(Pm, wordSize)); 4984 ldr(Rn, pre(Pn, -wordSize)); 4985 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4986 } 4987 4988 void post1() { 4989 block_comment("post1"); 4990 4991 // MACC(Ra, Rb, t0, t1, t2); 4992 // Ra = *++Pa; 4993 // Rb = *--Pb; 4994 umulh(Rhi_ab, Ra, Rb); 4995 mul(Rlo_ab, Ra, Rb); 4996 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 4997 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4998 4999 // *Pm = Rm = t0 * inv; 5000 mul(Rm, t0, inv); 5001 str(Rm, Address(Pm)); 5002 5003 // MACC(Rm, Rn, t0, t1, t2); 5004 // t0 = t1; t1 = t2; t2 = 0; 5005 umulh(Rhi_mn, Rm, Rn); 5006 5007 #ifndef PRODUCT 5008 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 5009 { 5010 mul(Rlo_mn, Rm, Rn); 5011 add(Rlo_mn, t0, Rlo_mn); 5012 Label ok; 5013 cbz(Rlo_mn, ok); { 5014 stop("broken Montgomery multiply"); 5015 } bind(ok); 5016 } 5017 #endif 5018 // We have very carefully set things up so that 5019 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 5020 // the lower half of Rm * Rn because we know the result already: 5021 // it must be -t0. t0 + (-t0) must generate a carry iff 5022 // t0 != 0. So, rather than do a mul and an adds we just set 5023 // the carry flag iff t0 is nonzero. 5024 // 5025 // mul(Rlo_mn, Rm, Rn); 5026 // adds(zr, t0, Rlo_mn); 5027 subs(zr, t0, 1); // Set carry iff t0 is nonzero 5028 adcs(t0, t1, Rhi_mn); 5029 adc(t1, t2, zr); 5030 mov(t2, zr); 5031 } 5032 5033 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 5034 block_comment("pre2"); 5035 // Pa = Pa_base + i-len; 5036 // Pb = Pb_base + len; 5037 // Pm = Pm_base + i-len; 5038 // Pn = Pn_base + len; 5039 5040 if (i.is_register()) { 5041 sub(Rj, i.as_register(), len); 5042 } else { 5043 mov(Rj, i.as_constant()); 5044 sub(Rj, Rj, len); 5045 } 5046 // Rj == i-len 5047 5048 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 5049 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 5050 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 5051 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 5052 5053 // Ra = *++Pa; 5054 // Rb = *--Pb; 5055 // Rm = *++Pm; 5056 // Rn = *--Pn; 5057 ldr(Ra, pre(Pa, wordSize)); 5058 ldr(Rb, pre(Pb, -wordSize)); 5059 ldr(Rm, pre(Pm, wordSize)); 5060 ldr(Rn, pre(Pn, -wordSize)); 5061 5062 mov(Rhi_mn, zr); 5063 mov(Rlo_mn, zr); 5064 } 5065 5066 void post2(RegisterOrConstant i, RegisterOrConstant len) { 5067 block_comment("post2"); 5068 if (i.is_constant()) { 5069 mov(Rj, i.as_constant()-len.as_constant()); 5070 } else { 5071 sub(Rj, i.as_register(), len); 5072 } 5073 5074 adds(t0, t0, Rlo_mn); // The pending m*n, low part 5075 5076 // As soon as we know the least significant digit of our result, 5077 // store it. 5078 // Pm_base[i-len] = t0; 5079 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 5080 5081 // t0 = t1; t1 = t2; t2 = 0; 5082 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 5083 adc(t1, t2, zr); 5084 mov(t2, zr); 5085 } 5086 5087 // A carry in t0 after Montgomery multiplication means that we 5088 // should subtract multiples of n from our result in m. We'll 5089 // keep doing that until there is no carry. 
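// A C reference for the sub() used in the comment inside normalize() below
// (a sketch, not generated code):
//
//   static uint64_t sub(uint64_t Pm[], const uint64_t Pn[], uint64_t t0, int len) {
//     uint64_t borrow = 0;
//     for (int i = 0; i < len; i++) {
//       __uint128_t d = (__uint128_t)Pm[i] - Pn[i] - borrow;
//       Pm[i] = (uint64_t)d;
//       borrow = (uint64_t)(d >> 64) & 1;  // 1 iff the subtraction borrowed
//     }
//     return t0 - borrow;                  // matches the final sbc(t0, t0, zr)
//   }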
  // A carry in t0 after Montgomery multiplication means that we
  // should subtract multiples of n from our result in m. We'll
  // keep doing that until there is no carry.
  void normalize(RegisterOrConstant len) {
    block_comment("normalize");
    // while (t0)
    //   t0 = sub(Pm_base, Pn_base, t0, len);
    Label loop, post, again;
    Register cnt = t1, i = t2; // Re-use registers; we're done with them now
    cbz(t0, post); {
      bind(again); {
        mov(i, zr);
        mov(cnt, len);
        ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
        ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
        subs(zr, zr, zr); // set carry flag, i.e. no borrow
        align(16);
        bind(loop); {
          sbcs(Rm, Rm, Rn);
          str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
          add(i, i, 1);
          ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord)));
          ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord)));
          sub(cnt, cnt, 1);
        } cbnz(cnt, loop);
        sbc(t0, t0, zr);
      } cbnz(t0, again);
    } bind(post);
  }

  // Move memory at s to d, reversing words.
  //    Increments d to end of copied memory
  //    Destroys tmp1, tmp2
  //    Preserves len
  //    Leaves s pointing to the address which was in d at start
  void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) {
    assert(tmp1 < r19 && tmp2 < r19, "register corruption");

    lea(s, Address(s, len, Address::uxtw(LogBytesPerWord)));
    mov(tmp1, len);
    unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2);
    sub(s, d, len, ext::uxtw, LogBytesPerWord);
  }
  // where
  void reverse1(Register d, Register s, Register tmp) {
    ldr(tmp, pre(s, -wordSize));
    ror(tmp, tmp, 32);
    str(tmp, post(d, wordSize));
  }
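  // What reverse()/reverse1() compute, in C, approximately (an
  // illustrative sketch that ignores the pointer side effects noted
  // above; s and d are arrays of len 64-bit words): the word order
  // is reversed and each word is rotated by 32 bits, swapping its
  // two 32-bit halves.
  //
  //   for (int i = 0; i < len; i++) {
  //     unsigned long w = s[len - 1 - i];
  //     d[i] = (w << 32) | (w >> 32);
  //   }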
  void step_squaring() {
    // An extra ACC
    step();
    acc(Rhi_ab, Rlo_ab, t0, t1, t2);
  }

  void last_squaring(RegisterOrConstant i) {
    Label dont;
    // if ((i & 1) == 0) {
    tbnz(i.as_register(), 0, dont); {
      // MACC(Ra, Rb, t0, t1, t2);
      // Ra = *++Pa;
      // Rb = *--Pb;
      umulh(Rhi_ab, Ra, Rb);
      mul(Rlo_ab, Ra, Rb);
      acc(Rhi_ab, Rlo_ab, t0, t1, t2);
    } bind(dont);
  }

  void extra_step_squaring() {
    acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n

    // MACC(Rm, Rn, t0, t1, t2);
    // Rm = *++Pm;
    // Rn = *--Pn;
    umulh(Rhi_mn, Rm, Rn);
    mul(Rlo_mn, Rm, Rn);
    ldr(Rm, pre(Pm, wordSize));
    ldr(Rn, pre(Pn, -wordSize));
  }

  void post1_squaring() {
    acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n

    // *Pm = Rm = t0 * inv;
    mul(Rm, t0, inv);
    str(Rm, Address(Pm));

    // MACC(Rm, Rn, t0, t1, t2);
    // t0 = t1; t1 = t2; t2 = 0;
    umulh(Rhi_mn, Rm, Rn);

#ifndef PRODUCT
    // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply");
    {
      mul(Rlo_mn, Rm, Rn);
      add(Rlo_mn, t0, Rlo_mn);
      Label ok;
      cbz(Rlo_mn, ok); {
        stop("broken Montgomery multiply");
      } bind(ok);
    }
#endif
    // We have very carefully set things up so that
    // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate
    // the lower half of Rm * Rn because we know the result already:
    // it must be -t0. t0 + (-t0) must generate a carry iff
    // t0 != 0. So, rather than do a mul and an adds we just set
    // the carry flag iff t0 is nonzero.
    //
    // mul(Rlo_mn, Rm, Rn);
    // adds(zr, t0, Rlo_mn);
    subs(zr, t0, 1); // Set carry iff t0 is nonzero
    adcs(t0, t1, Rhi_mn);
    adc(t1, t2, zr);
    mov(t2, zr);
  }

  void acc(Register Rhi, Register Rlo,
           Register t0, Register t1, Register t2) {
    adds(t0, t0, Rlo);
    adcs(t1, t1, Rhi);
    adc(t2, t2, zr);
  }
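  // A note on the "subs(zr, t0, 1)" idiom in post1() and
  // post1_squaring() above: on AArch64, a subtract sets the carry
  // flag when there is no borrow, so t0 - 1 sets carry exactly when
  // t0 >= 1, i.e. iff t0 is nonzero. That is the same flag state the
  // commented-out "adds(zr, t0, Rlo_mn)" would produce, given that
  // Rlo_mn is known to equal -t0.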
  public:
  /**
   * Fast Montgomery multiplication. The derivation of the
   * algorithm is in A Cryptographic Library for the Motorola
   * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237.
   *
   * Arguments:
   *
   * Inputs for multiplication:
   *   c_rarg0   - int array elements a
   *   c_rarg1   - int array elements b
   *   c_rarg2   - int array elements n (the modulus)
   *   c_rarg3   - int length
   *   c_rarg4   - int inv
   *   c_rarg5   - int array elements m (the result)
   *
   * Inputs for squaring:
   *   c_rarg0   - int array elements a
   *   c_rarg1   - int array elements n (the modulus)
   *   c_rarg2   - int length
   *   c_rarg3   - int inv
   *   c_rarg4   - int array elements m (the result)
   *
   */
  address generate_multiply() {
    Label argh, nothing;
    bind(argh);
    stop("MontgomeryMultiply total_allocation must be <= 8192");

    align(CodeEntryAlignment);
    address entry = pc();

    cbzw(Rlen, nothing);

    enter();

    // Make room.
    cmpw(Rlen, 512);
    br(Assembler::HI, argh);
    sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
    andr(sp, Ra, -2 * wordSize);

    lsrw(Rlen, Rlen, 1); // length in longwords = len/2

    {
      // Copy input args, reversing as we go. We use Ra as a
      // temporary variable.
      reverse(Ra, Pa_base, Rlen, t0, t1);
      if (!_squaring)
        reverse(Ra, Pb_base, Rlen, t0, t1);
      reverse(Ra, Pn_base, Rlen, t0, t1);
    }

    // Push all callee-saved registers and also Pm_base which we'll need
    // at the end.
    save_regs();

#ifndef PRODUCT
    // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply");
    {
      ldr(Rn, Address(Pn_base, 0));
      mul(Rlo_mn, Rn, inv);
      subs(zr, Rlo_mn, -1);
      Label ok;
      br(EQ, ok); {
        stop("broken inverse in Montgomery multiply");
      } bind(ok);
    }
#endif

    mov(Pm_base, Ra);

    mov(t0, zr);
    mov(t1, zr);
    mov(t2, zr);

    block_comment("for (int i = 0; i < len; i++) {");
    mov(Ri, zr); {
      Label loop, end;
      cmpw(Ri, Rlen);
      br(Assembler::GE, end);

      bind(loop);
      pre1(Ri);

      block_comment(" for (j = i; j; j--) {"); {
        movw(Rj, Ri);
        unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
      } block_comment(" } // j");

      post1();
      addw(Ri, Ri, 1);
      cmpw(Ri, Rlen);
      br(Assembler::LT, loop);
      bind(end);
      block_comment("} // i");
    }

    block_comment("for (int i = len; i < 2*len; i++) {");
    mov(Ri, Rlen); {
      Label loop, end;
      cmpw(Ri, Rlen, Assembler::LSL, 1);
      br(Assembler::GE, end);

      bind(loop);
      pre2(Ri, Rlen);

      block_comment(" for (j = len*2-i-1; j; j--) {"); {
        lslw(Rj, Rlen, 1);
        subw(Rj, Rj, Ri);
        subw(Rj, Rj, 1);
        unroll_2(Rj, &MontgomeryMultiplyGenerator::step);
      } block_comment(" } // j");

      post2(Ri, Rlen);
      addw(Ri, Ri, 1);
      cmpw(Ri, Rlen, Assembler::LSL, 1);
      br(Assembler::LT, loop);
      bind(end);
    }
    block_comment("} // i");

    normalize(Rlen);

    mov(Ra, Pm_base);  // Save Pm_base in Ra
    restore_regs();    // Restore caller's Pm_base

    // Copy our result into caller's Pm_base
    reverse(Pm_base, Ra, Rlen, t0, t1);

    leave();
    bind(nothing);
    ret(lr);

    return entry;
  }
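  // A note on the size check in generate_multiply() above: the stub
  // carves Rlen * 4 * sizeof(jint) bytes of scratch space from the
  // stack (hence the shift by exact_log2(4 * sizeof(jint)) == 4), so
  // capping Rlen at 512 ints bounds the allocation at
  // 512 * 16 == 8192 bytes, matching the message at "argh".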
  // In C, approximately:

  // void
  // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[],
  //                     unsigned long Pn_base[], unsigned long Pm_base[],
  //                     unsigned long inv, int len) {
  //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  //   unsigned long *Pa, *Pb, *Pn, *Pm;
  //   unsigned long Ra, Rb, Rn, Rm;

  //   int i;

  //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");

  //   for (i = 0; i < len; i++) {
  //     int j;

  //     Pa = Pa_base;
  //     Pb = Pb_base + i;
  //     Pm = Pm_base;
  //     Pn = Pn_base + i;

  //     Ra = *Pa;
  //     Rb = *Pb;
  //     Rm = *Pm;
  //     Rn = *Pn;

  //     int iters = i;
  //     for (j = 0; iters--; j++) {
  //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
  //       MACC(Ra, Rb, t0, t1, t2);
  //       Ra = *++Pa;
  //       Rb = *--Pb;
  //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
  //       MACC(Rm, Rn, t0, t1, t2);
  //       Rm = *++Pm;
  //       Rn = *--Pn;
  //     }

  //     assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be");
  //     MACC(Ra, Rb, t0, t1, t2);
  //     *Pm = Rm = t0 * inv;
  //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
  //     MACC(Rm, Rn, t0, t1, t2);

  //     assert(t0 == 0, "broken Montgomery multiply");

  //     t0 = t1; t1 = t2; t2 = 0;
  //   }

  //   for (i = len; i < 2*len; i++) {
  //     int j;

  //     Pa = Pa_base + i-len;
  //     Pb = Pb_base + len;
  //     Pm = Pm_base + i-len;
  //     Pn = Pn_base + len;

  //     Ra = *++Pa;
  //     Rb = *--Pb;
  //     Rm = *++Pm;
  //     Rn = *--Pn;

  //     int iters = len*2-i-1;
  //     for (j = i-len+1; iters--; j++) {
  //       assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be");
  //       MACC(Ra, Rb, t0, t1, t2);
  //       Ra = *++Pa;
  //       Rb = *--Pb;
  //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
  //       MACC(Rm, Rn, t0, t1, t2);
  //       Rm = *++Pm;
  //       Rn = *--Pn;
  //     }

  //     Pm_base[i-len] = t0;
  //     t0 = t1; t1 = t2; t2 = 0;
  //   }

  //   while (t0)
  //     t0 = sub(Pm_base, Pn_base, t0, len);
  // }
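  // The role of inv in the sketch above: inv is the negative inverse
  // of n[0] modulo b == 2^64 (the stub asserts
  // inv * Pn_base[0] == -1UL), so choosing m = t0 * inv makes the low
  // digit of the accumulator vanish and lets each outer iteration
  // shift the accumulator down by one word. Worked through:
  //
  //   t0 + (t0 * inv) * n[0]
  //     == t0 + t0 * (inv * n[0])   (mod b)
  //     == t0 + t0 * -1             (mod b)
  //     == 0                        (mod b)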
  /**
   * Fast Montgomery squaring. This uses asymptotically 25% fewer
   * multiplies than Montgomery multiplication so it should be up to
   * 25% faster. However, its loop control is more complex and it
   * may actually run slower on some machines.
   *
   * Arguments:
   *
   * Inputs:
   *   c_rarg0   - int array elements a
   *   c_rarg1   - int array elements n (the modulus)
   *   c_rarg2   - int length
   *   c_rarg3   - int inv
   *   c_rarg4   - int array elements m (the result)
   *
   */
  address generate_square() {
    Label argh;
    bind(argh);
    stop("MontgomeryMultiply total_allocation must be <= 8192");

    align(CodeEntryAlignment);
    address entry = pc();

    enter();

    // Make room.
    cmpw(Rlen, 512);
    br(Assembler::HI, argh);
    sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint)));
    andr(sp, Ra, -2 * wordSize);

    lsrw(Rlen, Rlen, 1); // length in longwords = len/2

    {
      // Copy input args, reversing as we go. We use Ra as a
      // temporary variable.
      reverse(Ra, Pa_base, Rlen, t0, t1);
      reverse(Ra, Pn_base, Rlen, t0, t1);
    }

    // Push all callee-saved registers and also Pm_base which we'll need
    // at the end.
    save_regs();

    mov(Pm_base, Ra);

    mov(t0, zr);
    mov(t1, zr);
    mov(t2, zr);

    block_comment("for (int i = 0; i < len; i++) {");
    mov(Ri, zr); {
      Label loop, end;
      bind(loop);
      cmp(Ri, Rlen);
      br(Assembler::GE, end);

      pre1(Ri);

      block_comment("for (j = (i+1)/2; j; j--) {"); {
        add(Rj, Ri, 1);
        lsr(Rj, Rj, 1);
        unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
      } block_comment(" } // j");

      last_squaring(Ri);

      block_comment(" for (j = i/2; j; j--) {"); {
        lsr(Rj, Ri, 1);
        unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
      } block_comment(" } // j");

      post1_squaring();
      add(Ri, Ri, 1);
      cmp(Ri, Rlen);
      br(Assembler::LT, loop);

      bind(end);
      block_comment("} // i");
    }

    block_comment("for (int i = len; i < 2*len; i++) {");
    mov(Ri, Rlen); {
      Label loop, end;
      bind(loop);
      cmp(Ri, Rlen, Assembler::LSL, 1);
      br(Assembler::GE, end);

      pre2(Ri, Rlen);

      block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); {
        lsl(Rj, Rlen, 1);
        sub(Rj, Rj, Ri);
        sub(Rj, Rj, 1);
        lsr(Rj, Rj, 1);
        unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring);
      } block_comment(" } // j");

      last_squaring(Ri);

      block_comment(" for (j = (2*len-i)/2; j; j--) {"); {
        lsl(Rj, Rlen, 1);
        sub(Rj, Rj, Ri);
        lsr(Rj, Rj, 1);
        unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring);
      } block_comment(" } // j");

      post2(Ri, Rlen);
      add(Ri, Ri, 1);
      cmp(Ri, Rlen, Assembler::LSL, 1);

      br(Assembler::LT, loop);
      bind(end);
      block_comment("} // i");
    }

    normalize(Rlen);

    mov(Ra, Pm_base);  // Save Pm_base in Ra
    restore_regs();    // Restore caller's Pm_base

    // Copy our result into caller's Pm_base
    reverse(Pm_base, Ra, Rlen, t0, t1);

    leave();
    ret(lr);

    return entry;
  }
  // In C, approximately:

  // void
  // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
  //                   unsigned long Pm_base[], unsigned long inv, int len) {
  //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  //   unsigned long *Pa, *Pb, *Pn, *Pm;
  //   unsigned long Ra, Rb, Rn, Rm;

  //   int i;

  //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");

  //   for (i = 0; i < len; i++) {
  //     int j;

  //     Pa = Pa_base;
  //     Pb = Pa_base + i;
  //     Pm = Pm_base;
  //     Pn = Pn_base + i;

  //     Ra = *Pa;
  //     Rb = *Pb;
  //     Rm = *Pm;
  //     Rn = *Pn;

  //     int iters = (i+1)/2;
  //     for (j = 0; iters--; j++) {
  //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
  //       MACC2(Ra, Rb, t0, t1, t2);
  //       Ra = *++Pa;
  //       Rb = *--Pb;
  //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
  //       MACC(Rm, Rn, t0, t1, t2);
  //       Rm = *++Pm;
  //       Rn = *--Pn;
  //     }
  //     if ((i & 1) == 0) {
  //       assert(Ra == Pa_base[j], "must be");
  //       MACC(Ra, Ra, t0, t1, t2);
  //     }
  //     iters = i/2;
  //     assert(iters == i-j, "must be");
  //     for (; iters--; j++) {
  //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
  //       MACC(Rm, Rn, t0, t1, t2);
  //       Rm = *++Pm;
  //       Rn = *--Pn;
  //     }

  //     *Pm = Rm = t0 * inv;
  //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
  //     MACC(Rm, Rn, t0, t1, t2);

  //     assert(t0 == 0, "broken Montgomery multiply");

  //     t0 = t1; t1 = t2; t2 = 0;
  //   }

  //   for (i = len; i < 2*len; i++) {
  //     int start = i-len+1;
  //     int end = start + (len - start)/2;
  //     int j;

  //     Pa = Pa_base + i-len;
  //     Pb = Pa_base + len;
  //     Pm = Pm_base + i-len;
  //     Pn = Pn_base + len;

  //     Ra = *++Pa;
  //     Rb = *--Pb;
  //     Rm = *++Pm;
  //     Rn = *--Pn;

  //     int iters = (2*len-i-1)/2;
  //     assert(iters == end-start, "must be");
  //     for (j = start; iters--; j++) {
  //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
  //       MACC2(Ra, Rb, t0, t1, t2);
  //       Ra = *++Pa;
  //       Rb = *--Pb;
  //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
  //       MACC(Rm, Rn, t0, t1, t2);
  //       Rm = *++Pm;
  //       Rn = *--Pn;
  //     }
  //     if ((i & 1) == 0) {
  //       assert(Ra == Pa_base[j], "must be");
  //       MACC(Ra, Ra, t0, t1, t2);
  //     }
  //     iters = (2*len-i)/2;
  //     assert(iters == len-j, "must be");
  //     for (; iters--; j++) {
  //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
  //       MACC(Rm, Rn, t0, t1, t2);
  //       Rm = *++Pm;
  //       Rn = *--Pn;
  //     }
  //     Pm_base[i-len] = t0;
  //     t0 = t1; t1 = t2; t2 = 0;
  //   }

  //   while (t0)
  //     t0 = sub(Pm_base, Pn_base, t0, len);
  // }
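  // Where the saved multiplies come from: when squaring, the product
  // a[j] * a[i-j] contributes to digit i twice, once for j and once
  // for i-j, so the sketch above visits only j < i-j and uses MACC2
  // (add the product twice) in place of two separate MACCs; the
  // middle term a[i/2] * a[i/2] on even i is added once. That is the
  // source of the "asymptotically 25% fewer multiplies" claim in the
  // comment before generate_square().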
  };


  // Initialization
  void generate_initial() {
    // Generates the initial stubs and initializes the entry points

    // Entry points that exist on all platforms. Note: This is code
    // that could be shared among different platforms - however the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure. See also comment in
    // stubRoutines.hpp.

    StubRoutines::_forward_exception_entry = generate_forward_exception();

    StubRoutines::_call_stub_entry =
      generate_call_stub(StubRoutines::_call_stub_return_address);

    // is referenced by megamorphic call
    StubRoutines::_catch_exception_entry = generate_catch_exception();

    // Build this early so it's available for the interpreter.
    StubRoutines::_throw_StackOverflowError_entry =
      generate_throw_exception("StackOverflowError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::throw_StackOverflowError));
    StubRoutines::_throw_delayed_StackOverflowError_entry =
      generate_throw_exception("delayed StackOverflowError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::throw_delayed_StackOverflowError));
    if (UseCRC32Intrinsics) {
      // set table address before stub generation, which uses it
      StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
    }

    if (UseCRC32CIntrinsics) {
      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
    }

    // Disabled until JDK-8210858 is fixed
    // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
    //   StubRoutines::_dlog = generate_dlog();
    // }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
      StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
      StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
    }
  }

  void generate_all() {
    // support for verify_oop (must happen after universe_init)
    StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
    StubRoutines::_throw_AbstractMethodError_entry =
      generate_throw_exception("AbstractMethodError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_AbstractMethodError));

    StubRoutines::_throw_IncompatibleClassChangeError_entry =
      generate_throw_exception("IncompatibleClassChangeError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_IncompatibleClassChangeError));

    StubRoutines::_throw_NullPointerException_at_call_entry =
      generate_throw_exception("NullPointerException at call throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::
                                                throw_NullPointerException_at_call));

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    // has negatives stub for large arrays.
    StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);

    // array equals stub for large arrays.
    if (!UseSimpleArrayEquals) {
      StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
    }

    generate_compare_long_strings();

    generate_string_indexof_stubs();
    // byte_array_inflate stub for large arrays.
    StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate();

#ifdef COMPILER2
    if (UseMultiplyToLenIntrinsic) {
      StubRoutines::_multiplyToLen = generate_multiplyToLen();
    }

    if (UseSquareToLenIntrinsic) {
      StubRoutines::_squareToLen = generate_squareToLen();
    }

    if (UseMulAddIntrinsic) {
      StubRoutines::_mulAdd = generate_mulAdd();
    }

    if (UseMontgomeryMultiplyIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/false);
      StubRoutines::_montgomeryMultiply = g.generate_multiply();
    }

    if (UseMontgomerySquareIntrinsic) {
      StubCodeMark mark(this, "StubRoutines", "montgomerySquare");
      MontgomeryMultiplyGenerator g(_masm, /*squaring*/true);
      // We use generate_multiply() rather than generate_square()
      // because it's faster for the sizes of modulus we care about.
      StubRoutines::_montgomerySquare = g.generate_multiply();
    }
#endif // COMPILER2

#ifndef BUILTIN_SIM
    // generate GHASH intrinsics code
    if (UseGHASHIntrinsics) {
      StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks();
    }

    if (UseAESIntrinsics) {
      StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock();
      StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock();
      StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt();
      StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt();
    }

    if (UseSHA1Intrinsics) {
      StubRoutines::_sha1_implCompress   = generate_sha1_implCompress(false, "sha1_implCompress");
      StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true,  "sha1_implCompressMB");
    }
    if (UseSHA256Intrinsics) {
      StubRoutines::_sha256_implCompress   = generate_sha256_implCompress(false, "sha256_implCompress");
      StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true,  "sha256_implCompressMB");
    }

    // generate Adler32 intrinsics code
    if (UseAdler32Intrinsics) {
      StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32();
    }

    // Safefetch stubs.
    generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry,
                       &StubRoutines::_safefetch32_fault_pc,
                       &StubRoutines::_safefetch32_continuation_pc);
    generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry,
                       &StubRoutines::_safefetchN_fault_pc,
                       &StubRoutines::_safefetchN_continuation_pc);
#endif
    StubRoutines::aarch64::set_completed();
  }

 public:
  StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) {
    if (all) {
      generate_all();
    } else {
      generate_initial();
    }
  }
}; // end class declaration

void StubGenerator_generate(CodeBuffer* code, bool all) {
  StubGenerator g(code, all);
}