/*
 * Copyright (c) 2003, 2019, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2019, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/universe.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "utilities/align.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_ZGC
#include "gc/z/zThreadLocalData.hpp"
#endif

#ifdef BUILTIN_SIM
#include "../../../../../../simulator/simulator.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->
#define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))
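// Note: TIMES_OOP scales an index register by the size of an in-heap oop
// reference: 4 bytes when compressed oops are in use, 8 bytes otherwise.
// It is used below to form scaled addresses for oop array elements.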

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(int& counter) {
    __ lea(rscratch2, ExternalAddress((address)&counter));
    __ ldrw(rscratch1, Address(rscratch2));
    __ addw(rscratch1, rscratch1, 1);
    __ strw(rscratch1, Address(rscratch2));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    c_rarg6:   parameter size (in words)              int
  //    c_rarg7:   thread                                 Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the c arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -27 [ argument word 1      ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper    (r0) ]
  //  -7 [ result          (r1) ]
  //  -6 [ result type     (r2) ]
  //  -5 [ method          (r3) ]
  //  -4 [ entry point     (r4) ]
  //  -3 [ parameters      (r5) ]
  //  -2 [ parameter size  (r6) ]
  //  -1 [ thread          (r7) ]
  //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  //   1 [ saved lr       (r30) ]

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off  = -26,

    d15_off            = -26,
    d13_off            = -24,
    d11_off            = -22,
    d9_off             = -20,

    r28_off            = -18,
    r26_off            = -16,
    r24_off            = -14,
    r22_off            = -12,
    r20_off            = -10,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0,
    retaddr_off        =   1,
  };

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    const Address sp_after_call(rfp, sp_after_call_off * wordSize);

    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);

    // stub code

    // we need a C prolog to bootstrap the x86 caller into the sim
    __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);

    address aarch64_entry = __ pc();

#ifdef BUILTIN_SIM
    // Save sender's SP for stack traces.
    __ mov(rscratch1, sp);
    __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
#endif
    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5,  entry_point);
    __ stp(c_rarg2, c_rarg3,  result_type);
    __ stp(c_rarg0, c_rarg1,  call_wrapper);

    __ stp(r20, r19,   r20_save);
    __ stp(r22, r21,   r22_save);
    __ stp(r24, r23,   r24_save);
    __ stp(r26, r25,   r26_save);
    __ stp(r28, r27,   r28_save);

    __ stpd(v9,  v8,   d9_save);
    __ stpd(v11, v10,  d11_save);
    __ stpd(v13, v12,  d13_save);
    __ stpd(v15, v14,  d15_save);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (u1)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing methodOop, and current sp
    //      rmethod: Method*
    //      r13: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r13, sp);
    __ blr(c_rarg4);

    // tell the simulator we have returned to the stub

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    if (NotifySimulator) {
      __ notify(Assembler::method_reentry);
    }
    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    __ ldr(j_rarg2, result);
    Label is_long, is_float, is_double, exit;
    __ ldr(j_rarg1, result_type);
    __ cmp(j_rarg1, (u1)T_OBJECT);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, (u1)T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, (u1)T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(j_rarg1, (u1)T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    // restore callee-save registers
    __ ldpd(v15, v14,   d15_save);
    __ ldpd(v13, v12,   d13_save);
    __ ldpd(v11, v10,   d11_save);
    __ ldpd(v9,  v8,    d9_save);

    __ ldp(r28, r27,   r28_save);
    __ ldp(r26, r25,   r26_save);
    __ ldp(r24, r23,   r24_save);
    __ ldp(r22, r21,   r22_save);
    __ ldp(r20, r19,   r20_save);

    __ ldp(c_rarg0, c_rarg1,  call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3,  method);
    __ ldp(c_rarg4, c_rarg5,  entry_point);
    __ ldp(c_rarg6, c_rarg7,  parameter_size);

#ifndef PRODUCT
    // tell the simulator we are about to end Java execution
    if (NotifySimulator) {
      __ notify(Assembler::method_exit);
    }
#endif
    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT

    __ BIND(is_long);
    __ str(r0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // rsp.
  //
  // r0: exception oop

  // NOTE: this is used as a target from the signal handler so it
  // needs an x86 prolog which returns into the current simulator
  // executing the generated catch_exception code. so the prolog
  // needs to install rax in a sim register and adjust the sim's
  // restart pc to enter the generated code at the start position
  // then return from native to simulated execution.
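
  // Control flow note: the stub below records the exception oop in
  // Thread::_pending_exception and then branches to
  // StubRoutines::_call_stub_return_address, so the call stub's normal
  // epilogue performs the unwind back into the VM.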

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread        (rfp, thread_off         * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then call into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);
    __ mov(r19, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {

    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is NULL it is OK

#if INCLUDE_ZGC
    if (UseZGC) {
      // Check if mask is good.
      // verifies that ZAddressBadMask & r0 == 0
      __ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset()));
      __ andr(c_rarg2, r0, c_rarg3);
      __ cbnz(c_rarg2, error);
    }
#endif

    // Check if the oop is in the right area of memory
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
    __ andr(c_rarg2, r0, c_rarg3);
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());

    // Compare c_rarg2 and c_rarg3.  We don't use a compare
    // instruction here because the flags register is live.
    __ eor(c_rarg2, c_rarg2, c_rarg3);
    __ cbnz(c_rarg2, error);

    // make sure klass is 'reasonable', which is not zero.
    __ load_klass(r0, r0); // get klass
    __ cbz(r0, error);     // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blrt(rscratch1, 3, 0, 1);

    return start;
  }

  void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }

  // The inner part of zero_words().  This is the bulk operation,
  // zeroing words in blocks, possibly using DC ZVA to do it.  The
  // caller is responsible for zeroing the last few words.
  //
  // Inputs:
  // r10: the HeapWord-aligned base address of an array to zero.
  // r11: the count in HeapWords, r11 > 0.
  //
  // Returns r10 and r11, adjusted for the caller to clear.
  // r10: the base address of the tail of words left to clear.
  // r11: the number of words in the tail.
  //      r11 < MacroAssembler::zero_words_block_size.

  address generate_zero_blocks() {
    Label done;
    Label base_aligned;

    Register base = r10, cnt = r11;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "zero_blocks");
    address start = __ pc();

    if (UseBlockZeroing) {
      int zva_length = VM_Version::zva_length();

      // Ensure ZVA length can be divided by 16. This is required by
      // the subsequent operations.
      assert (zva_length % 16 == 0, "Unexpected ZVA Length");

      __ tbz(base, 3, base_aligned);
      __ str(zr, Address(__ post(base, 8)));
      __ sub(cnt, cnt, 1);
      __ bind(base_aligned);

      // Ensure count >= zva_length * 2 so that it still deserves a zva after
      // alignment.
      Label small;
      int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
      __ subs(rscratch1, cnt, low_limit >> 3);
      __ br(Assembler::LT, small);
      __ zero_dcache_blocks(base, cnt);
      __ bind(small);
    }

    {
      // Number of stp instructions we'll unroll
      const int unroll =
        MacroAssembler::zero_words_block_size / 2;
      // Clear the remaining blocks.
      Label loop;
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::LT, done);
      __ bind(loop);
      for (int i = 0; i < unroll; i++)
        __ stp(zr, zr, __ post(base, 16));
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::GE, loop);
      __ bind(done);
      __ add(cnt, cnt, unroll * 2);
    }

    __ ret(lr);

    return start;
  }


  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(Label &start, Register s, Register d, Register count,
                           copy_direction direction) {
    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4:2) * wordSize;

    int offset;
    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r10, t6 = r11, t7 = r12;
    const Register stride = r13;

    assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1);

    Label again, drain;
    const char *stub_name;
    if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
    else
      stub_name = "backward_copy_longs";

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", stub_name);

    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, (u1)8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
        __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
      } else {
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        __ stp(t0, t1, Address(d, 2 * unit));
        __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.
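      //
      // Concretely, for a forwards copy: after the -2/-1 long-word bias
      // applied just below, the loads read long words 2..9 of the biased
      // source block and the stores write long words 1, 2-3, 4-5, 6-7
      // and 8 of the biased destination, which corresponds to the
      // unbiased offsets {0, 2, 4, 6} and {0, 1, 3, 5, 7} noted above.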

      if (direction == copy_forwards) {
        __ sub(s, s, 16);
        __ sub(d, d, 8);
      }

      // Fill 8 registers
      //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 bit block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
      //
      // t0 at offset 0,  t1 at offset 8
      // t2 at offset 16, t3 at offset 24
      // t4 at offset 32, t5 at offset 40
      // t6 at offset 48, t7 at offset 56

      // for backwards copy s was not offset so the register contents
      // are at these offsets into the preceding 64 byte block
      // relative to that original input and so on for each successive
      // preceding 64 byte block when s is updated. this explains the
      // slightly counter-intuitive looking pattern of register usage
      // in the stp instructions for backwards copy.
      //
      // t0 at offset -16, t1 at offset -8
      // t2 at offset -32, t3 at offset -24
      // t4 at offset -48, t5 at offset -40
      // t6 at offset -64, t7 at offset -56

      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));

      __ subs(count, count, 16);
      __ br(Assembler::LO, drain);

      int prefetch = PrefetchCopyIntervalInBytes;
      bool use_stride = false;
      if (direction == copy_backwards) {
        use_stride = prefetch > 256;
        prefetch = -prefetch;
        if (use_stride) __ mov(stride, prefetch);
      }

      __ bind(again);

      if (PrefetchCopyIntervalInBytes > 0)
        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
        // allowing for the offset of -8 the store instructions place
        // registers into the target 64 bit block at the following
        // offsets
        //
        // t0 at offset 0
        // t1 at offset 8,  t2 at offset 16
        // t3 at offset 24, t4 at offset 32
        // t5 at offset 40, t6 at offset 48
        // t7 at offset 56

        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
        // d was not offset when we started so the registers are
        // written into the 64 bit block preceding d with the following
        // offsets
        //
        // t1 at offset -8
        // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
        // t7 at offset -56, t4 at offset -48
        // t6 at offset -64
        //
        // note that this matches the offsets previously noted for the
        // loads

        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
      } else {
        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
      }
      // now we need to copy any remaining part block which may
      // include a 4 word block subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ stp(t1, t2, Address(d, 2 * unit));
          __ str(t3, Address(__ pre(d, 4 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ stp(t3, t0, Address(d, 3 * unit));
          __ str(t2, Address(__ pre(d, 4 * unit)));
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ str(t1, Address(__ pre(d, 2 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ str(t0, Address(__ pre(d, 2 * unit)));
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;
    int unit = wordSize * direction;

    Label Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.
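    //
    // Note: the tbz tests below treat the residual count as a bit mask:
    // bit k (counting in units of the copy granularity) says that a
    // 2^k-unit chunk remains, so we test from the 8-byte level down and
    // copy one progressively smaller chunk per set bit.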

    __ tbz(count, 3 - exact_log2(granularity), Lword);
    __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
    __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }

  Label copy_f, copy_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //

  void copy_memory(bool is_aligned, Register s, Register d,
                   Register count, Register tmp, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    // <= 96 bytes do inline. Direction doesn't matter because we always
    // load all the data before writing anything
    Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
    const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
    const Register send = r17, dend = r18;

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
    __ cmp(count, u1((UseSIMDForMemoryOps ? 96:80)/granularity));
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16/granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64/granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32/granularity));
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 0));
      __ ldpq(v2, v3, Address(send, -32));
      __ stpq(v0, v1, Address(d, 0));
      __ stpq(v2, v3, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(send, -32));
      __ ldp(t6, t7, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(dend, -32));
      __ stp(t6, t7, Address(dend, -16));
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    __ ldp(t0, t1, Address(s, 0));
    __ ldp(t2, t3, Address(send, -16));
    __ stp(t0, t1, Address(d, 0));
    __ stp(t2, t3, Address(dend, -16));
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
      __ ldpq(v4, v5, Address(send, -32));
      __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
      __ stpq(v4, v5, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(s, 32));
      __ ldp(t6, t7, Address(s, 48));
      __ ldp(t8, t9, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(d, 32));
      __ stp(t6, t7, Address(d, 48));
      __ stp(t8, t9, Address(dend, -16));
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, u1(8/granularity));
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    __ ldr(t0, Address(s, 0));
    __ ldr(t1, Address(send, -8));
    __ str(t0, Address(d, 0));
    __ str(t1, Address(dend, -8));
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean in the 1 byte case we load/store the same
          // byte 3 times.
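          // Concretely: count == 1 copies s[0] onto d[0] three times;
          // count == 2 copies s[0], then s[1] as the last byte and s[1]
          // again as base + count/2; count == 3 copies s[0], s[2] and
          // s[1], so every byte is covered without branching on the
          // exact count.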
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }

    __ bind(copy_big);
    if (is_backwards) {
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now we've got the small case out of the way we can align the
    // source address on a 2-word boundary.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
      __ tbz(s, exact_log2(wordSize), aligned);
      __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
      __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
      __ sub(count, count, wordSize/granularity);
    } else {
      if (is_backwards) {
        __ andr(rscratch2, s, 2 * wordSize - 1);
      } else {
        __ neg(rscratch2, s);
        __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
      }
      // rscratch2 is the byte adjustment needed to align s.
      __ cbz(rscratch2, aligned);
      int shift = exact_log2(granularity);
      if (shift)  __ lsr(rscratch2, rscratch2, shift);
      __ sub(count, count, rscratch2);

#if 0
      // ?? This code is only correct for a disjoint copy.  It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, rscratch2);
        __ sub(d, d, rscratch2);
      } else {
        __ add(s, s, rscratch2);
        __ add(d, d, rscratch2);
      }
#else
      copy_memory_small(s, d, rscratch2, rscratch1, step);
#endif
    }

    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes.  Adjust the
    // count and do a bulk copy of words.
    __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
    if (direction == copy_forwards)
      __ bl(copy_f);
    else
      __ bl(copy_b);

    // And the tail.
    copy_memory_small(s, d, count, tmp, step);

    if (granularity >= 8) __ bind(copy8);
    if (granularity >= 4) __ bind(copy4);
    __ bind(finish);
  }


  void clobber_registers() {
#ifdef ASSERT
    __ mov(rscratch1, (uint64_t)0xdeadbeef);
    __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
    for (Register r = r3; r <= r18; r++)
      if (r != rscratch1) __ mov(r, rscratch1);
#endif
  }

  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers rscratch1 and rscratch2.
  void verify_oop_array (size_t size, Register a, Register count, Register temp) {
    Label loop, end;
    __ mov(rscratch1, a);
    __ mov(rscratch2, zr);
    __ bind(loop);
    __ cmp(rscratch2, count);
    __ br(Assembler::HS, end);
    if (size == (size_t)wordSize) {
      __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ verify_oop(temp);
    } else {
      __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ decode_heap_oop(temp); // calls verify_oop
    }
    __ add(rscratch2, rscratch2, size);
    __ b(loop);
    __ bind(end);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
                                 const char *name, bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_reg = RegSet::of(s, d, count);
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    copy_memory(aligned, s, d, count, rscratch1, size);

    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
    }

    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());

    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
                                 address *entry, const char *name,
                                 bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_regs = RegSet::of(s, d, count);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // use fwd copy when (d-s) above_equal (count*size)
    __ sub(rscratch1, d, s);
    __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
    __ br(Assembler::HS, nooverlap_target);

    DecoratorSet decorators = IN_HEAP | IS_ARRAY;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    copy_memory(aligned, s, d, count, rscratch1, -size);
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
    }
    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
  //
  address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                      address* entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_short_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_short_copy().
  //
  address generate_disjoint_short_copy(bool aligned,
                                       address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
                                       address *entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);

  }
  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_int_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
                                     address *entry, const char *name,
                                     bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
  }


  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_long_copy(bool aligned, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_long_copy(bool aligned,
                                      address nooverlap_target, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_oop_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_oop_copy(bool aligned,
                                     address nooverlap_target, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry,
                                  name, dest_uninitialized);
  }


  // Helper for generating a dynamic type check.
  // Smashes rscratch1, rscratch2.
  void generate_type_check(Register sub_klass,
                           Register super_check_offset,
                           Register super_klass,
                           Label& L_success) {
    assert_different_registers(sub_klass, super_check_offset, super_klass);

    BLOCK_COMMENT("type_check:");

    Label L_miss;

    __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg,  &L_success, &L_miss, NULL,
                                     super_check_offset);
    __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL);

    // Fall through on failure!
    __ BIND(L_miss);
  }

  //
  //  Generate checkcasting array copy stub
  //
  //  Input:
  //    c_rarg0   - source array address
  //    c_rarg1   - destination array address
  //    c_rarg2   - element count, treated as ssize_t, can be zero
  //    c_rarg3   - size_t ckoff (super_check_offset)
  //    c_rarg4   - oop ckval (super_klass)
  //
  //  Output:
  //    r0 ==  0  -  success
  //    r0 == -1^K - failure, where K is partial transfer count
  //
  address generate_checkcast_copy(const char *name, address *entry,
                                  bool dest_uninitialized = false) {

    Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop;

    // Input registers (after setup_arg_regs)
    const Register from        = c_rarg0;   // source array address
    const Register to          = c_rarg1;   // destination array address
    const Register count       = c_rarg2;   // elements count
    const Register ckoff       = c_rarg3;   // super_check_offset
    const Register ckval       = c_rarg4;   // super_klass

    RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4);
    RegSet wb_post_saved_regs = RegSet::of(count);

    // Registers used as temps (r18, r19, r20 are save-on-entry)
    const Register count_save  = r21;       // orig elements count
    const Register start_to    = r20;       // destination array start address
    const Register copied_oop  = r18;       // actual oop copied
    const Register r19_klass   = r19;       // oop._klass

    //---------------------------------------------------------------
    // Assembler stub will be used for this call to arraycopy
    // if the two arrays are subtypes of Object[] but the
    // destination array type is not equal to or a supertype
    // of the source type.  Each element must be separately
    // checked.
1760 1761 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1762 copied_oop, r19_klass, count_save); 1763 1764 __ align(CodeEntryAlignment); 1765 StubCodeMark mark(this, "StubRoutines", name); 1766 address start = __ pc(); 1767 1768 __ enter(); // required for proper stackwalking of RuntimeStub frame 1769 1770 #ifdef ASSERT 1771 // caller guarantees that the arrays really are different 1772 // otherwise, we would have to make conjoint checks 1773 { Label L; 1774 array_overlap_test(L, TIMES_OOP); 1775 __ stop("checkcast_copy within a single array"); 1776 __ bind(L); 1777 } 1778 #endif //ASSERT 1779 1780 // Caller of this entry point must set up the argument registers. 1781 if (entry != NULL) { 1782 *entry = __ pc(); 1783 BLOCK_COMMENT("Entry:"); 1784 } 1785 1786 // Empty array: Nothing to do. 1787 __ cbz(count, L_done); 1788 1789 __ push(RegSet::of(r18, r19, r20, r21), sp); 1790 1791 #ifdef ASSERT 1792 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1793 // The ckoff and ckval must be mutually consistent, 1794 // even though caller generates both. 1795 { Label L; 1796 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1797 __ ldrw(start_to, Address(ckval, sco_offset)); 1798 __ cmpw(ckoff, start_to); 1799 __ br(Assembler::EQ, L); 1800 __ stop("super_check_offset inconsistent"); 1801 __ bind(L); 1802 } 1803 #endif //ASSERT 1804 1805 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 1806 bool is_oop = true; 1807 if (dest_uninitialized) { 1808 decorators |= IS_DEST_UNINITIALIZED; 1809 } 1810 1811 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1812 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs); 1813 1814 // save the original count 1815 __ mov(count_save, count); 1816 1817 // Copy from low to high addresses 1818 __ mov(start_to, to); // Save destination array start address 1819 __ b(L_load_element); 1820 1821 // ======== begin loop ======== 1822 // (Loop is rotated; its entry is L_load_element.) 1823 // Loop control: 1824 // for (; count != 0; count--) { 1825 // copied_oop = load_heap_oop(from++); 1826 // ... generate_type_check ...; 1827 // store_heap_oop(to++, copied_oop); 1828 // } 1829 __ align(OptoLoopAlignment); 1830 1831 __ BIND(L_store_element); 1832 __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, AS_RAW); // store the oop 1833 __ sub(count, count, 1); 1834 __ cbz(count, L_do_card_marks); 1835 1836 // ======== loop entry is here ======== 1837 __ BIND(L_load_element); 1838 __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop 1839 __ cbz(copied_oop, L_store_element); 1840 1841 __ load_klass(r19_klass, copied_oop);// query the object klass 1842 generate_type_check(r19_klass, ckoff, ckval, L_store_element); 1843 // ======== end loop ======== 1844 1845 // It was a real error; we must depend on the caller to finish the job. 1846 // Register count = remaining oops, count_orig = total oops. 1847 // Emit GC store barriers for the oops we have copied and report 1848 // their number to the caller. 
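  // Worked example of the failure encoding computed below (numbers are
  // illustrative): with count_save == 10 total oops and the type check
  // failing on the 4th element, 'count' still holds the 7 oops left to copy:
  //   subs: count = 10 - 7 = 3           // K, the partial transfer count
  //   eon:  count = 3 ^ ~0  = ~3 = -4    // -1^K, the value reported in r0
  // The caller recovers K as ~r0. If K == 0 (nothing was copied) the EQ
  // branch below skips the card-marking epilogue.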
1849 1850 __ subs(count, count_save, count); // K = partially copied oop count 1851 __ eon(count, count, zr); // report (-1^K) to caller 1852 __ br(Assembler::EQ, L_done_pop); 1853 1854 __ BIND(L_do_card_marks); 1855 bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs); 1856 1857 __ bind(L_done_pop); 1858 __ pop(RegSet::of(r18, r19, r20, r21), sp); 1859 inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr); 1860 1861 __ bind(L_done); 1862 __ mov(r0, count); 1863 __ leave(); 1864 __ ret(lr); 1865 1866 return start; 1867 } 1868 1869 // Perform range checks on the proposed arraycopy. 1870 // Kills temp, but nothing else. 1871 // Also, clean the sign bits of src_pos and dst_pos. 1872 void arraycopy_range_checks(Register src, // source array oop (c_rarg0) 1873 Register src_pos, // source position (c_rarg1) 1874 Register dst, // destination array oo (c_rarg2) 1875 Register dst_pos, // destination position (c_rarg3) 1876 Register length, 1877 Register temp, 1878 Label& L_failed) { 1879 BLOCK_COMMENT("arraycopy_range_checks:"); 1880 1881 assert_different_registers(rscratch1, temp); 1882 1883 // if (src_pos + length > arrayOop(src)->length()) FAIL; 1884 __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes())); 1885 __ addw(temp, length, src_pos); 1886 __ cmpw(temp, rscratch1); 1887 __ br(Assembler::HI, L_failed); 1888 1889 // if (dst_pos + length > arrayOop(dst)->length()) FAIL; 1890 __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes())); 1891 __ addw(temp, length, dst_pos); 1892 __ cmpw(temp, rscratch1); 1893 __ br(Assembler::HI, L_failed); 1894 1895 // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'. 1896 __ movw(src_pos, src_pos); 1897 __ movw(dst_pos, dst_pos); 1898 1899 BLOCK_COMMENT("arraycopy_range_checks done"); 1900 } 1901 1902 // These stubs get called from some dumb test routine. 1903 // I'll write them properly when they're called from 1904 // something that's actually doing something. 1905 static void fake_arraycopy_stub(address src, address dst, int count) { 1906 assert(count == 0, "huh?"); 1907 } 1908 1909 1910 // 1911 // Generate 'unsafe' array copy stub 1912 // Though just as safe as the other stubs, it takes an unscaled 1913 // size_t argument instead of an element count. 1914 // 1915 // Input: 1916 // c_rarg0 - source array address 1917 // c_rarg1 - destination array address 1918 // c_rarg2 - byte count, treated as ssize_t, can be zero 1919 // 1920 // Examines the alignment of the operands and dispatches 1921 // to a long, int, short, or byte copy loop. 
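  // The dispatch below is equivalent to the following scalar test on the OR
  // of both addresses and the byte count (illustrative sketch, not compiled
  // here):
  //
  //   uintptr_t bits = (uintptr_t)s | (uintptr_t)d | (uintptr_t)count;
  //   if ((bits & (BytesPerLong - 1)) == 0) return long_copy (count >> 3);
  //   if ((bits & (BytesPerInt  - 1)) == 0) return int_copy  (count >> 2);
  //   if ((bits & 1) == 0)                  return short_copy(count >> 1);
  //   return byte_copy(count);              // no usable alignment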
1922 // 1923 address generate_unsafe_copy(const char *name, 1924 address byte_copy_entry, 1925 address short_copy_entry, 1926 address int_copy_entry, 1927 address long_copy_entry) { 1928 Label L_long_aligned, L_int_aligned, L_short_aligned; 1929 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1930 1931 __ align(CodeEntryAlignment); 1932 StubCodeMark mark(this, "StubRoutines", name); 1933 address start = __ pc(); 1934 __ enter(); // required for proper stackwalking of RuntimeStub frame 1935 1936 // bump this on entry, not on exit: 1937 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 1938 1939 __ orr(rscratch1, s, d); 1940 __ orr(rscratch1, rscratch1, count); 1941 1942 __ andr(rscratch1, rscratch1, BytesPerLong-1); 1943 __ cbz(rscratch1, L_long_aligned); 1944 __ andr(rscratch1, rscratch1, BytesPerInt-1); 1945 __ cbz(rscratch1, L_int_aligned); 1946 __ tbz(rscratch1, 0, L_short_aligned); 1947 __ b(RuntimeAddress(byte_copy_entry)); 1948 1949 __ BIND(L_short_aligned); 1950 __ lsr(count, count, LogBytesPerShort); // size => short_count 1951 __ b(RuntimeAddress(short_copy_entry)); 1952 __ BIND(L_int_aligned); 1953 __ lsr(count, count, LogBytesPerInt); // size => int_count 1954 __ b(RuntimeAddress(int_copy_entry)); 1955 __ BIND(L_long_aligned); 1956 __ lsr(count, count, LogBytesPerLong); // size => long_count 1957 __ b(RuntimeAddress(long_copy_entry)); 1958 1959 return start; 1960 } 1961 1962 // 1963 // Generate generic array copy stubs 1964 // 1965 // Input: 1966 // c_rarg0 - src oop 1967 // c_rarg1 - src_pos (32-bits) 1968 // c_rarg2 - dst oop 1969 // c_rarg3 - dst_pos (32-bits) 1970 // c_rarg4 - element count (32-bits) 1971 // 1972 // Output: 1973 // r0 == 0 - success 1974 // r0 == -1^K - failure, where K is partial transfer count 1975 // 1976 address generate_generic_copy(const char *name, 1977 address byte_copy_entry, address short_copy_entry, 1978 address int_copy_entry, address oop_copy_entry, 1979 address long_copy_entry, address checkcast_copy_entry) { 1980 1981 Label L_failed, L_objArray; 1982 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 1983 1984 // Input registers 1985 const Register src = c_rarg0; // source array oop 1986 const Register src_pos = c_rarg1; // source position 1987 const Register dst = c_rarg2; // destination array oop 1988 const Register dst_pos = c_rarg3; // destination position 1989 const Register length = c_rarg4; 1990 1991 1992 // Registers used as temps 1993 const Register dst_klass = c_rarg5; 1994 1995 __ align(CodeEntryAlignment); 1996 1997 StubCodeMark mark(this, "StubRoutines", name); 1998 1999 address start = __ pc(); 2000 2001 __ enter(); // required for proper stackwalking of RuntimeStub frame 2002 2003 // bump this on entry, not on exit: 2004 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 2005 2006 //----------------------------------------------------------------------- 2007 // Assembler stub will be used for this call to arraycopy 2008 // if the following conditions are met: 2009 // 2010 // (1) src and dst must not be null. 2011 // (2) src_pos must not be negative. 2012 // (3) dst_pos must not be negative. 2013 // (4) length must not be negative. 2014 // (5) src klass and dst klass should be the same and not NULL. 2015 // (6) src and dst should be arrays. 2016 // (7) src_pos + length must not exceed length of src. 2017 // (8) dst_pos + length must not exceed length of dst. 
2018 // 2019 2020 // if (src == NULL) return -1; 2021 __ cbz(src, L_failed); 2022 2023 // if (src_pos < 0) return -1; 2024 __ tbnz(src_pos, 31, L_failed); // i.e. sign bit set 2025 2026 // if (dst == NULL) return -1; 2027 __ cbz(dst, L_failed); 2028 2029 // if (dst_pos < 0) return -1; 2030 __ tbnz(dst_pos, 31, L_failed); // i.e. sign bit set 2031 2032 // registers used as temp 2033 const Register scratch_length = r16; // elements count to copy 2034 const Register scratch_src_klass = r17; // array klass 2035 const Register lh = r18; // layout helper 2036 2037 // if (length < 0) return -1; 2038 __ movw(scratch_length, length); // length (elements count, 32-bits value) 2039 __ tbnz(scratch_length, 31, L_failed); // i.e. sign bit set 2040 2041 __ load_klass(scratch_src_klass, src); 2042 #ifdef ASSERT 2043 // assert(src->klass() != NULL); 2044 { 2045 BLOCK_COMMENT("assert klasses not null {"); 2046 Label L1, L2; 2047 __ cbnz(scratch_src_klass, L2); // it is broken if klass is NULL 2048 __ bind(L1); 2049 __ stop("broken null klass"); 2050 __ bind(L2); 2051 __ load_klass(rscratch1, dst); 2052 __ cbz(rscratch1, L1); // this would be broken also 2053 BLOCK_COMMENT("} assert klasses not null done"); 2054 } 2055 #endif 2056 2057 // Load layout helper (32-bits) 2058 // 2059 // |array_tag| | header_size | element_type | |log2_element_size| 2060 // 32 30 24 16 8 2 0 2061 // 2062 // array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0 2063 // 2064 2065 const int lh_offset = in_bytes(Klass::layout_helper_offset()); 2066 2067 // Handle objArrays completely differently... 2068 const jint objArray_lh = Klass::array_layout_helper(T_OBJECT); 2069 __ ldrw(lh, Address(scratch_src_klass, lh_offset)); 2070 __ movw(rscratch1, objArray_lh); 2071 __ eorw(rscratch2, lh, rscratch1); 2072 __ cbzw(rscratch2, L_objArray); 2073 2074 // if (src->klass() != dst->klass()) return -1; 2075 __ load_klass(rscratch2, dst); 2076 __ eor(rscratch2, rscratch2, scratch_src_klass); 2077 __ cbnz(rscratch2, L_failed); 2078 2079 // if (!src->is_Array()) return -1; 2080 __ tbz(lh, 31, L_failed); // i.e. (lh >= 0) 2081 2082 // At this point, it is known to be a typeArray (array_tag 0x3). 
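  // For reference, the layout-helper fields used below can be decoded in
  // scalar code like this (illustrative sketch following the diagram above,
  // not part of the generated stub):
  //
  //   jint  lh    = k->layout_helper();
  //   juint tag   = ((juint)lh) >> Klass::_lh_array_tag_shift;      // 0x3 here
  //   int   hsize = (lh >> Klass::_lh_header_size_shift)
  //                 & Klass::_lh_header_size_mask;                  // header bytes
  //   int   l2esz = lh & Klass::_lh_log2_element_size_mask;         // 0..3
  //   // element address = array_base + hsize + (index << l2esz)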
2083 #ifdef ASSERT 2084 { 2085 BLOCK_COMMENT("assert primitive array {"); 2086 Label L; 2087 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2088 __ cmpw(lh, rscratch2); 2089 __ br(Assembler::GE, L); 2090 __ stop("must be a primitive array"); 2091 __ bind(L); 2092 BLOCK_COMMENT("} assert primitive array done"); 2093 } 2094 #endif 2095 2096 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2097 rscratch2, L_failed); 2098 2099 // TypeArrayKlass 2100 // 2101 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2102 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2103 // 2104 2105 const Register rscratch1_offset = rscratch1; // array offset 2106 const Register r18_elsize = lh; // element size 2107 2108 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2109 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2110 __ add(src, src, rscratch1_offset); // src array offset 2111 __ add(dst, dst, rscratch1_offset); // dst array offset 2112 BLOCK_COMMENT("choose copy loop based on element size"); 2113 2114 // next registers should be set before the jump to corresponding stub 2115 const Register from = c_rarg0; // source array address 2116 const Register to = c_rarg1; // destination array address 2117 const Register count = c_rarg2; // elements count 2118 2119 // 'from', 'to', 'count' registers should be set in such order 2120 // since they are the same as 'src', 'src_pos', 'dst'. 2121 2122 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2123 2124 // The possible values of elsize are 0-3, i.e. exact_log2(element 2125 // size in bytes). We do a simple bitwise binary search. 2126 __ BIND(L_copy_bytes); 2127 __ tbnz(r18_elsize, 1, L_copy_ints); 2128 __ tbnz(r18_elsize, 0, L_copy_shorts); 2129 __ lea(from, Address(src, src_pos));// src_addr 2130 __ lea(to, Address(dst, dst_pos));// dst_addr 2131 __ movw(count, scratch_length); // length 2132 __ b(RuntimeAddress(byte_copy_entry)); 2133 2134 __ BIND(L_copy_shorts); 2135 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2136 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2137 __ movw(count, scratch_length); // length 2138 __ b(RuntimeAddress(short_copy_entry)); 2139 2140 __ BIND(L_copy_ints); 2141 __ tbnz(r18_elsize, 0, L_copy_longs); 2142 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2143 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2144 __ movw(count, scratch_length); // length 2145 __ b(RuntimeAddress(int_copy_entry)); 2146 2147 __ BIND(L_copy_longs); 2148 #ifdef ASSERT 2149 { 2150 BLOCK_COMMENT("assert long copy {"); 2151 Label L; 2152 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize 2153 __ cmpw(r18_elsize, LogBytesPerLong); 2154 __ br(Assembler::EQ, L); 2155 __ stop("must be long copy, but elsize is wrong"); 2156 __ bind(L); 2157 BLOCK_COMMENT("} assert long copy done"); 2158 } 2159 #endif 2160 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2161 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2162 __ movw(count, scratch_length); // length 2163 __ b(RuntimeAddress(long_copy_entry)); 2164 2165 // ObjArrayKlass 2166 __ BIND(L_objArray); 2167 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2168 2169 Label L_plain_copy, L_checkcast_copy; 2170 // test array classes for subtyping 2171 __ load_klass(r18, dst); 2172 __ cmp(scratch_src_klass, r18); // usual case is exact 
equality 2173 __ br(Assembler::NE, L_checkcast_copy); 2174 2175 // Identically typed arrays can be copied without element-wise checks. 2176 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2177 rscratch2, L_failed); 2178 2179 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2180 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2181 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2182 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2183 __ movw(count, scratch_length); // length 2184 __ BIND(L_plain_copy); 2185 __ b(RuntimeAddress(oop_copy_entry)); 2186 2187 __ BIND(L_checkcast_copy); 2188 // live at this point: scratch_src_klass, scratch_length, r18 (dst_klass) 2189 { 2190 // Before looking at dst.length, make sure dst is also an objArray. 2191 __ ldrw(rscratch1, Address(r18, lh_offset)); 2192 __ movw(rscratch2, objArray_lh); 2193 __ eorw(rscratch1, rscratch1, rscratch2); 2194 __ cbnzw(rscratch1, L_failed); 2195 2196 // It is safe to examine both src.length and dst.length. 2197 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2198 r18, L_failed); 2199 2200 __ load_klass(dst_klass, dst); // reload 2201 2202 // Marshal the base address arguments now, freeing registers. 2203 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2204 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2205 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2206 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2207 __ movw(count, length); // length (reloaded) 2208 Register sco_temp = c_rarg3; // this register is free now 2209 assert_different_registers(from, to, count, sco_temp, 2210 dst_klass, scratch_src_klass); 2211 // assert_clean_int(count, sco_temp); 2212 2213 // Generate the type check. 2214 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2215 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2216 2217 // Smashes rscratch1, rscratch2 2218 generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy); 2219 2220 // Fetch destination element klass from the ObjArrayKlass header. 2221 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2222 __ ldr(dst_klass, Address(dst_klass, ek_offset)); 2223 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2224 2225 // the checkcast_copy loop needs two extra arguments: 2226 assert(c_rarg3 == sco_temp, "#3 already in place"); 2227 // Set up arguments for checkcast_copy_entry. 2228 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass 2229 __ b(RuntimeAddress(checkcast_copy_entry)); 2230 } 2231 2232 __ BIND(L_failed); 2233 __ mov(r0, -1); 2234 __ leave(); // required for proper stackwalking of RuntimeStub frame 2235 __ ret(lr); 2236 2237 return start; 2238 } 2239 2240 // 2241 // Generate stub for array fill. If "aligned" is true, the 2242 // "to" address is assumed to be heapword aligned. 
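  // Conceptually the stub is a typed memset (illustrative sketch, not the
  // generated code):
  //
  //   void fill(jint* to, jint value, int count) {
  //     for (int i = 0; i < count; i++) to[i] = value;
  //   }
  //
  // with 'value' replicated into a 64-bit pattern so that the bulk of the
  // work can be done with 8-byte (and block-zeroing) stores.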
2243 // 2244 // Arguments for generated stub: 2245 // to: c_rarg0 2246 // value: c_rarg1 2247 // count: c_rarg2 treated as signed 2248 // 2249 address generate_fill(BasicType t, bool aligned, const char *name) { 2250 __ align(CodeEntryAlignment); 2251 StubCodeMark mark(this, "StubRoutines", name); 2252 address start = __ pc(); 2253 2254 BLOCK_COMMENT("Entry:"); 2255 2256 const Register to = c_rarg0; // source array address 2257 const Register value = c_rarg1; // value 2258 const Register count = c_rarg2; // elements count 2259 2260 const Register bz_base = r10; // base for block_zero routine 2261 const Register cnt_words = r11; // temp register 2262 2263 __ enter(); 2264 2265 Label L_fill_elements, L_exit1; 2266 2267 int shift = -1; 2268 switch (t) { 2269 case T_BYTE: 2270 shift = 0; 2271 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2272 __ bfi(value, value, 8, 8); // 8 bit -> 16 bit 2273 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2274 __ br(Assembler::LO, L_fill_elements); 2275 break; 2276 case T_SHORT: 2277 shift = 1; 2278 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2279 __ bfi(value, value, 16, 16); // 16 bit -> 32 bit 2280 __ br(Assembler::LO, L_fill_elements); 2281 break; 2282 case T_INT: 2283 shift = 2; 2284 __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element 2285 __ br(Assembler::LO, L_fill_elements); 2286 break; 2287 default: ShouldNotReachHere(); 2288 } 2289 2290 // Align source address at 8 bytes address boundary. 2291 Label L_skip_align1, L_skip_align2, L_skip_align4; 2292 if (!aligned) { 2293 switch (t) { 2294 case T_BYTE: 2295 // One byte misalignment happens only for byte arrays. 2296 __ tbz(to, 0, L_skip_align1); 2297 __ strb(value, Address(__ post(to, 1))); 2298 __ subw(count, count, 1); 2299 __ bind(L_skip_align1); 2300 // Fallthrough 2301 case T_SHORT: 2302 // Two bytes misalignment happens only for byte and short (char) arrays. 2303 __ tbz(to, 1, L_skip_align2); 2304 __ strh(value, Address(__ post(to, 2))); 2305 __ subw(count, count, 2 >> shift); 2306 __ bind(L_skip_align2); 2307 // Fallthrough 2308 case T_INT: 2309 // Align to 8 bytes, we know we are 4 byte aligned to start. 2310 __ tbz(to, 2, L_skip_align4); 2311 __ strw(value, Address(__ post(to, 4))); 2312 __ subw(count, count, 4 >> shift); 2313 __ bind(L_skip_align4); 2314 break; 2315 default: ShouldNotReachHere(); 2316 } 2317 } 2318 2319 // 2320 // Fill large chunks 2321 // 2322 __ lsrw(cnt_words, count, 3 - shift); // number of words 2323 __ bfi(value, value, 32, 32); // 32 bit -> 64 bit 2324 __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift); 2325 if (UseBlockZeroing) { 2326 Label non_block_zeroing, rest; 2327 // If the fill value is zero we can use the fast zero_words(). 2328 __ cbnz(value, non_block_zeroing); 2329 __ mov(bz_base, to); 2330 __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord); 2331 __ zero_words(bz_base, cnt_words); 2332 __ b(rest); 2333 __ bind(non_block_zeroing); 2334 __ fill_words(to, cnt_words, value); 2335 __ bind(rest); 2336 } else { 2337 __ fill_words(to, cnt_words, value); 2338 } 2339 2340 // Remaining count is less than 8 bytes. Fill it by a single store. 2341 // Note that the total length is no less than 8 bytes. 
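  // Worked example of the single tail store below (numbers illustrative,
  // assuming the destination was already 8-byte aligned): for T_SHORT with
  // 13 elements (26 bytes) and shift == 1, the word loop above filled
  // cnt_words = 3 words (24 bytes) and left count == 1 element (2 bytes).
  // 'to' is advanced to the very end of the destination and one 8-byte store
  // at [to - 8] rewrites the last 6 already-filled bytes plus the 2 remaining
  // ones; this is safe because the value is replicated into 64 bits and the
  // total length is at least 8 bytes, so [to - 8, to) stays inside the array.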
2342 if (t == T_BYTE || t == T_SHORT) { 2343 Label L_exit1; 2344 __ cbzw(count, L_exit1); 2345 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2346 __ str(value, Address(to, -8)); // overwrite some elements 2347 __ bind(L_exit1); 2348 __ leave(); 2349 __ ret(lr); 2350 } 2351 2352 // Handle copies less than 8 bytes. 2353 Label L_fill_2, L_fill_4, L_exit2; 2354 __ bind(L_fill_elements); 2355 switch (t) { 2356 case T_BYTE: 2357 __ tbz(count, 0, L_fill_2); 2358 __ strb(value, Address(__ post(to, 1))); 2359 __ bind(L_fill_2); 2360 __ tbz(count, 1, L_fill_4); 2361 __ strh(value, Address(__ post(to, 2))); 2362 __ bind(L_fill_4); 2363 __ tbz(count, 2, L_exit2); 2364 __ strw(value, Address(to)); 2365 break; 2366 case T_SHORT: 2367 __ tbz(count, 0, L_fill_4); 2368 __ strh(value, Address(__ post(to, 2))); 2369 __ bind(L_fill_4); 2370 __ tbz(count, 1, L_exit2); 2371 __ strw(value, Address(to)); 2372 break; 2373 case T_INT: 2374 __ cbzw(count, L_exit2); 2375 __ strw(value, Address(to)); 2376 break; 2377 default: ShouldNotReachHere(); 2378 } 2379 __ bind(L_exit2); 2380 __ leave(); 2381 __ ret(lr); 2382 return start; 2383 } 2384 2385 void generate_arraycopy_stubs() { 2386 address entry; 2387 address entry_jbyte_arraycopy; 2388 address entry_jshort_arraycopy; 2389 address entry_jint_arraycopy; 2390 address entry_oop_arraycopy; 2391 address entry_jlong_arraycopy; 2392 address entry_checkcast_arraycopy; 2393 2394 generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards); 2395 generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards); 2396 2397 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2398 2399 //*** jbyte 2400 // Always need aligned and unaligned versions 2401 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2402 "jbyte_disjoint_arraycopy"); 2403 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2404 &entry_jbyte_arraycopy, 2405 "jbyte_arraycopy"); 2406 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2407 "arrayof_jbyte_disjoint_arraycopy"); 2408 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL, 2409 "arrayof_jbyte_arraycopy"); 2410 2411 //*** jshort 2412 // Always need aligned and unaligned versions 2413 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2414 "jshort_disjoint_arraycopy"); 2415 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2416 &entry_jshort_arraycopy, 2417 "jshort_arraycopy"); 2418 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, 2419 "arrayof_jshort_disjoint_arraycopy"); 2420 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL, 2421 "arrayof_jshort_arraycopy"); 2422 2423 //*** jint 2424 // Aligned versions 2425 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2426 "arrayof_jint_disjoint_arraycopy"); 2427 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2428 "arrayof_jint_arraycopy"); 2429 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 
2430 // entry_jint_arraycopy always points to the unaligned version 2431 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2432 "jint_disjoint_arraycopy"); 2433 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2434 &entry_jint_arraycopy, 2435 "jint_arraycopy"); 2436 2437 //*** jlong 2438 // It is always aligned 2439 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2440 "arrayof_jlong_disjoint_arraycopy"); 2441 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2442 "arrayof_jlong_arraycopy"); 2443 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2444 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2445 2446 //*** oops 2447 { 2448 // With compressed oops we need unaligned versions; notice that 2449 // we overwrite entry_oop_arraycopy. 2450 bool aligned = !UseCompressedOops; 2451 2452 StubRoutines::_arrayof_oop_disjoint_arraycopy 2453 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2454 /*dest_uninitialized*/false); 2455 StubRoutines::_arrayof_oop_arraycopy 2456 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2457 /*dest_uninitialized*/false); 2458 // Aligned versions without pre-barriers 2459 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2460 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2461 /*dest_uninitialized*/true); 2462 StubRoutines::_arrayof_oop_arraycopy_uninit 2463 = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit", 2464 /*dest_uninitialized*/true); 2465 } 2466 2467 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2468 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2469 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2470 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2471 2472 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2473 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, 2474 /*dest_uninitialized*/true); 2475 2476 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2477 entry_jbyte_arraycopy, 2478 entry_jshort_arraycopy, 2479 entry_jint_arraycopy, 2480 entry_jlong_arraycopy); 2481 2482 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2483 entry_jbyte_arraycopy, 2484 entry_jshort_arraycopy, 2485 entry_jint_arraycopy, 2486 entry_oop_arraycopy, 2487 entry_jlong_arraycopy, 2488 entry_checkcast_arraycopy); 2489 2490 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 2491 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2492 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2493 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2494 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 2495 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2496 } 2497 2498 void generate_math_stubs() { Unimplemented(); } 2499 2500 // Arguments: 2501 // 2502 // Inputs: 2503 // c_rarg0 - source byte array address 2504 // c_rarg1 - destination 
byte array address 2505 // c_rarg2 - K (key) in little endian int array 2506 // 2507 address generate_aescrypt_encryptBlock() { 2508 __ align(CodeEntryAlignment); 2509 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 2510 2511 Label L_doLast; 2512 2513 const Register from = c_rarg0; // source array address 2514 const Register to = c_rarg1; // destination array address 2515 const Register key = c_rarg2; // key array address 2516 const Register keylen = rscratch1; 2517 2518 address start = __ pc(); 2519 __ enter(); 2520 2521 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2522 2523 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2524 2525 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2526 __ rev32(v1, __ T16B, v1); 2527 __ rev32(v2, __ T16B, v2); 2528 __ rev32(v3, __ T16B, v3); 2529 __ rev32(v4, __ T16B, v4); 2530 __ aese(v0, v1); 2531 __ aesmc(v0, v0); 2532 __ aese(v0, v2); 2533 __ aesmc(v0, v0); 2534 __ aese(v0, v3); 2535 __ aesmc(v0, v0); 2536 __ aese(v0, v4); 2537 __ aesmc(v0, v0); 2538 2539 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2540 __ rev32(v1, __ T16B, v1); 2541 __ rev32(v2, __ T16B, v2); 2542 __ rev32(v3, __ T16B, v3); 2543 __ rev32(v4, __ T16B, v4); 2544 __ aese(v0, v1); 2545 __ aesmc(v0, v0); 2546 __ aese(v0, v2); 2547 __ aesmc(v0, v0); 2548 __ aese(v0, v3); 2549 __ aesmc(v0, v0); 2550 __ aese(v0, v4); 2551 __ aesmc(v0, v0); 2552 2553 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2554 __ rev32(v1, __ T16B, v1); 2555 __ rev32(v2, __ T16B, v2); 2556 2557 __ cmpw(keylen, 44); 2558 __ br(Assembler::EQ, L_doLast); 2559 2560 __ aese(v0, v1); 2561 __ aesmc(v0, v0); 2562 __ aese(v0, v2); 2563 __ aesmc(v0, v0); 2564 2565 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2566 __ rev32(v1, __ T16B, v1); 2567 __ rev32(v2, __ T16B, v2); 2568 2569 __ cmpw(keylen, 52); 2570 __ br(Assembler::EQ, L_doLast); 2571 2572 __ aese(v0, v1); 2573 __ aesmc(v0, v0); 2574 __ aese(v0, v2); 2575 __ aesmc(v0, v0); 2576 2577 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2578 __ rev32(v1, __ T16B, v1); 2579 __ rev32(v2, __ T16B, v2); 2580 2581 __ BIND(L_doLast); 2582 2583 __ aese(v0, v1); 2584 __ aesmc(v0, v0); 2585 __ aese(v0, v2); 2586 2587 __ ld1(v1, __ T16B, key); 2588 __ rev32(v1, __ T16B, v1); 2589 __ eor(v0, __ T16B, v0, v1); 2590 2591 __ st1(v0, __ T16B, to); 2592 2593 __ mov(r0, 0); 2594 2595 __ leave(); 2596 __ ret(lr); 2597 2598 return start; 2599 } 2600 2601 // Arguments: 2602 // 2603 // Inputs: 2604 // c_rarg0 - source byte array address 2605 // c_rarg1 - destination byte array address 2606 // c_rarg2 - K (key) in little endian int array 2607 // 2608 address generate_aescrypt_decryptBlock() { 2609 assert(UseAES, "need AES instructions and misaligned SSE support"); 2610 __ align(CodeEntryAlignment); 2611 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 2612 Label L_doLast; 2613 2614 const Register from = c_rarg0; // source array address 2615 const Register to = c_rarg1; // destination array address 2616 const Register key = c_rarg2; // key array address 2617 const Register keylen = rscratch1; 2618 2619 address start = __ pc(); 2620 __ enter(); // required for proper stackwalking of RuntimeStub frame 2621 2622 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2623 2624 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2625 2626 __ ld1(v5, __ T16B, __ post(key, 16)); 2627 __ rev32(v5, __ T16B, v5); 2628 2629 __ ld1(v1, v2, v3, v4, 
__ T16B, __ post(key, 64)); 2630 __ rev32(v1, __ T16B, v1); 2631 __ rev32(v2, __ T16B, v2); 2632 __ rev32(v3, __ T16B, v3); 2633 __ rev32(v4, __ T16B, v4); 2634 __ aesd(v0, v1); 2635 __ aesimc(v0, v0); 2636 __ aesd(v0, v2); 2637 __ aesimc(v0, v0); 2638 __ aesd(v0, v3); 2639 __ aesimc(v0, v0); 2640 __ aesd(v0, v4); 2641 __ aesimc(v0, v0); 2642 2643 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2644 __ rev32(v1, __ T16B, v1); 2645 __ rev32(v2, __ T16B, v2); 2646 __ rev32(v3, __ T16B, v3); 2647 __ rev32(v4, __ T16B, v4); 2648 __ aesd(v0, v1); 2649 __ aesimc(v0, v0); 2650 __ aesd(v0, v2); 2651 __ aesimc(v0, v0); 2652 __ aesd(v0, v3); 2653 __ aesimc(v0, v0); 2654 __ aesd(v0, v4); 2655 __ aesimc(v0, v0); 2656 2657 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2658 __ rev32(v1, __ T16B, v1); 2659 __ rev32(v2, __ T16B, v2); 2660 2661 __ cmpw(keylen, 44); 2662 __ br(Assembler::EQ, L_doLast); 2663 2664 __ aesd(v0, v1); 2665 __ aesimc(v0, v0); 2666 __ aesd(v0, v2); 2667 __ aesimc(v0, v0); 2668 2669 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2670 __ rev32(v1, __ T16B, v1); 2671 __ rev32(v2, __ T16B, v2); 2672 2673 __ cmpw(keylen, 52); 2674 __ br(Assembler::EQ, L_doLast); 2675 2676 __ aesd(v0, v1); 2677 __ aesimc(v0, v0); 2678 __ aesd(v0, v2); 2679 __ aesimc(v0, v0); 2680 2681 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2682 __ rev32(v1, __ T16B, v1); 2683 __ rev32(v2, __ T16B, v2); 2684 2685 __ BIND(L_doLast); 2686 2687 __ aesd(v0, v1); 2688 __ aesimc(v0, v0); 2689 __ aesd(v0, v2); 2690 2691 __ eor(v0, __ T16B, v0, v5); 2692 2693 __ st1(v0, __ T16B, to); 2694 2695 __ mov(r0, 0); 2696 2697 __ leave(); 2698 __ ret(lr); 2699 2700 return start; 2701 } 2702 2703 // Arguments: 2704 // 2705 // Inputs: 2706 // c_rarg0 - source byte array address 2707 // c_rarg1 - destination byte array address 2708 // c_rarg2 - K (key) in little endian int array 2709 // c_rarg3 - r vector byte array address 2710 // c_rarg4 - input length 2711 // 2712 // Output: 2713 // x0 - input length 2714 // 2715 address generate_cipherBlockChaining_encryptAESCrypt() { 2716 assert(UseAES, "need AES instructions and misaligned SSE support"); 2717 __ align(CodeEntryAlignment); 2718 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 2719 2720 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2721 2722 const Register from = c_rarg0; // source array address 2723 const Register to = c_rarg1; // destination array address 2724 const Register key = c_rarg2; // key array address 2725 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2726 // and left with the results of the last encryption block 2727 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2728 const Register keylen = rscratch1; 2729 2730 address start = __ pc(); 2731 2732 __ enter(); 2733 2734 __ movw(rscratch2, len_reg); 2735 2736 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2737 2738 __ ld1(v0, __ T16B, rvec); 2739 2740 __ cmpw(keylen, 52); 2741 __ br(Assembler::CC, L_loadkeys_44); 2742 __ br(Assembler::EQ, L_loadkeys_52); 2743 2744 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2745 __ rev32(v17, __ T16B, v17); 2746 __ rev32(v18, __ T16B, v18); 2747 __ BIND(L_loadkeys_52); 2748 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2749 __ rev32(v19, __ T16B, v19); 2750 __ rev32(v20, __ T16B, v20); 2751 __ BIND(L_loadkeys_44); 2752 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2753 __ rev32(v21, __ 
T16B, v21); 2754 __ rev32(v22, __ T16B, v22); 2755 __ rev32(v23, __ T16B, v23); 2756 __ rev32(v24, __ T16B, v24); 2757 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2758 __ rev32(v25, __ T16B, v25); 2759 __ rev32(v26, __ T16B, v26); 2760 __ rev32(v27, __ T16B, v27); 2761 __ rev32(v28, __ T16B, v28); 2762 __ ld1(v29, v30, v31, __ T16B, key); 2763 __ rev32(v29, __ T16B, v29); 2764 __ rev32(v30, __ T16B, v30); 2765 __ rev32(v31, __ T16B, v31); 2766 2767 __ BIND(L_aes_loop); 2768 __ ld1(v1, __ T16B, __ post(from, 16)); 2769 __ eor(v0, __ T16B, v0, v1); 2770 2771 __ br(Assembler::CC, L_rounds_44); 2772 __ br(Assembler::EQ, L_rounds_52); 2773 2774 __ aese(v0, v17); __ aesmc(v0, v0); 2775 __ aese(v0, v18); __ aesmc(v0, v0); 2776 __ BIND(L_rounds_52); 2777 __ aese(v0, v19); __ aesmc(v0, v0); 2778 __ aese(v0, v20); __ aesmc(v0, v0); 2779 __ BIND(L_rounds_44); 2780 __ aese(v0, v21); __ aesmc(v0, v0); 2781 __ aese(v0, v22); __ aesmc(v0, v0); 2782 __ aese(v0, v23); __ aesmc(v0, v0); 2783 __ aese(v0, v24); __ aesmc(v0, v0); 2784 __ aese(v0, v25); __ aesmc(v0, v0); 2785 __ aese(v0, v26); __ aesmc(v0, v0); 2786 __ aese(v0, v27); __ aesmc(v0, v0); 2787 __ aese(v0, v28); __ aesmc(v0, v0); 2788 __ aese(v0, v29); __ aesmc(v0, v0); 2789 __ aese(v0, v30); 2790 __ eor(v0, __ T16B, v0, v31); 2791 2792 __ st1(v0, __ T16B, __ post(to, 16)); 2793 2794 __ subw(len_reg, len_reg, 16); 2795 __ cbnzw(len_reg, L_aes_loop); 2796 2797 __ st1(v0, __ T16B, rvec); 2798 2799 __ mov(r0, rscratch2); 2800 2801 __ leave(); 2802 __ ret(lr); 2803 2804 return start; 2805 } 2806 2807 // Arguments: 2808 // 2809 // Inputs: 2810 // c_rarg0 - source byte array address 2811 // c_rarg1 - destination byte array address 2812 // c_rarg2 - K (key) in little endian int array 2813 // c_rarg3 - r vector byte array address 2814 // c_rarg4 - input length 2815 // 2816 // Output: 2817 // r0 - input length 2818 // 2819 address generate_cipherBlockChaining_decryptAESCrypt() { 2820 assert(UseAES, "need AES instructions and misaligned SSE support"); 2821 __ align(CodeEntryAlignment); 2822 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 2823 2824 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2825 2826 const Register from = c_rarg0; // source array address 2827 const Register to = c_rarg1; // destination array address 2828 const Register key = c_rarg2; // key array address 2829 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2830 // and left with the results of the last encryption block 2831 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2832 const Register keylen = rscratch1; 2833 2834 address start = __ pc(); 2835 2836 __ enter(); 2837 2838 __ movw(rscratch2, len_reg); 2839 2840 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2841 2842 __ ld1(v2, __ T16B, rvec); 2843 2844 __ ld1(v31, __ T16B, __ post(key, 16)); 2845 __ rev32(v31, __ T16B, v31); 2846 2847 __ cmpw(keylen, 52); 2848 __ br(Assembler::CC, L_loadkeys_44); 2849 __ br(Assembler::EQ, L_loadkeys_52); 2850 2851 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2852 __ rev32(v17, __ T16B, v17); 2853 __ rev32(v18, __ T16B, v18); 2854 __ BIND(L_loadkeys_52); 2855 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2856 __ rev32(v19, __ T16B, v19); 2857 __ rev32(v20, __ T16B, v20); 2858 __ BIND(L_loadkeys_44); 2859 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2860 __ rev32(v21, __ T16B, v21); 2861 
__ rev32(v22, __ T16B, v22); 2862 __ rev32(v23, __ T16B, v23); 2863 __ rev32(v24, __ T16B, v24); 2864 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2865 __ rev32(v25, __ T16B, v25); 2866 __ rev32(v26, __ T16B, v26); 2867 __ rev32(v27, __ T16B, v27); 2868 __ rev32(v28, __ T16B, v28); 2869 __ ld1(v29, v30, __ T16B, key); 2870 __ rev32(v29, __ T16B, v29); 2871 __ rev32(v30, __ T16B, v30); 2872 2873 __ BIND(L_aes_loop); 2874 __ ld1(v0, __ T16B, __ post(from, 16)); 2875 __ orr(v1, __ T16B, v0, v0); 2876 2877 __ br(Assembler::CC, L_rounds_44); 2878 __ br(Assembler::EQ, L_rounds_52); 2879 2880 __ aesd(v0, v17); __ aesimc(v0, v0); 2881 __ aesd(v0, v18); __ aesimc(v0, v0); 2882 __ BIND(L_rounds_52); 2883 __ aesd(v0, v19); __ aesimc(v0, v0); 2884 __ aesd(v0, v20); __ aesimc(v0, v0); 2885 __ BIND(L_rounds_44); 2886 __ aesd(v0, v21); __ aesimc(v0, v0); 2887 __ aesd(v0, v22); __ aesimc(v0, v0); 2888 __ aesd(v0, v23); __ aesimc(v0, v0); 2889 __ aesd(v0, v24); __ aesimc(v0, v0); 2890 __ aesd(v0, v25); __ aesimc(v0, v0); 2891 __ aesd(v0, v26); __ aesimc(v0, v0); 2892 __ aesd(v0, v27); __ aesimc(v0, v0); 2893 __ aesd(v0, v28); __ aesimc(v0, v0); 2894 __ aesd(v0, v29); __ aesimc(v0, v0); 2895 __ aesd(v0, v30); 2896 __ eor(v0, __ T16B, v0, v31); 2897 __ eor(v0, __ T16B, v0, v2); 2898 2899 __ st1(v0, __ T16B, __ post(to, 16)); 2900 __ orr(v2, __ T16B, v1, v1); 2901 2902 __ subw(len_reg, len_reg, 16); 2903 __ cbnzw(len_reg, L_aes_loop); 2904 2905 __ st1(v2, __ T16B, rvec); 2906 2907 __ mov(r0, rscratch2); 2908 2909 __ leave(); 2910 __ ret(lr); 2911 2912 return start; 2913 } 2914 2915 // Arguments: 2916 // 2917 // Inputs: 2918 // c_rarg0 - byte[] source+offset 2919 // c_rarg1 - int[] SHA.state 2920 // c_rarg2 - int offset 2921 // c_rarg3 - int limit 2922 // 2923 address generate_sha1_implCompress(bool multi_block, const char *name) { 2924 __ align(CodeEntryAlignment); 2925 StubCodeMark mark(this, "StubRoutines", name); 2926 address start = __ pc(); 2927 2928 Register buf = c_rarg0; 2929 Register state = c_rarg1; 2930 Register ofs = c_rarg2; 2931 Register limit = c_rarg3; 2932 2933 Label keys; 2934 Label sha1_loop; 2935 2936 // load the keys into v0..v3 2937 __ adr(rscratch1, keys); 2938 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 2939 // load 5 words state into v6, v7 2940 __ ldrq(v6, Address(state, 0)); 2941 __ ldrs(v7, Address(state, 16)); 2942 2943 2944 __ BIND(sha1_loop); 2945 // load 64 bytes of data into v16..v19 2946 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 2947 __ rev32(v16, __ T16B, v16); 2948 __ rev32(v17, __ T16B, v17); 2949 __ rev32(v18, __ T16B, v18); 2950 __ rev32(v19, __ T16B, v19); 2951 2952 // do the sha1 2953 __ addv(v4, __ T4S, v16, v0); 2954 __ orr(v20, __ T16B, v6, v6); 2955 2956 FloatRegister d0 = v16; 2957 FloatRegister d1 = v17; 2958 FloatRegister d2 = v18; 2959 FloatRegister d3 = v19; 2960 2961 for (int round = 0; round < 20; round++) { 2962 FloatRegister tmp1 = (round & 1) ? v4 : v5; 2963 FloatRegister tmp2 = (round & 1) ? v21 : v22; 2964 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 2965 FloatRegister tmp4 = (round & 1) ? v5 : v4; 2966 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? 
v2 : v3)); 2967 2968 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 2969 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 2970 __ sha1h(tmp2, __ T4S, v20); 2971 if (round < 5) 2972 __ sha1c(v20, __ T4S, tmp3, tmp4); 2973 else if (round < 10 || round >= 15) 2974 __ sha1p(v20, __ T4S, tmp3, tmp4); 2975 else 2976 __ sha1m(v20, __ T4S, tmp3, tmp4); 2977 if (round < 16) __ sha1su1(d0, __ T4S, d3); 2978 2979 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 2980 } 2981 2982 __ addv(v7, __ T2S, v7, v21); 2983 __ addv(v6, __ T4S, v6, v20); 2984 2985 if (multi_block) { 2986 __ add(ofs, ofs, 64); 2987 __ cmp(ofs, limit); 2988 __ br(Assembler::LE, sha1_loop); 2989 __ mov(c_rarg0, ofs); // return ofs 2990 } 2991 2992 __ strq(v6, Address(state, 0)); 2993 __ strs(v7, Address(state, 16)); 2994 2995 __ ret(lr); 2996 2997 __ bind(keys); 2998 __ emit_int32(0x5a827999); 2999 __ emit_int32(0x6ed9eba1); 3000 __ emit_int32(0x8f1bbcdc); 3001 __ emit_int32(0xca62c1d6); 3002 3003 return start; 3004 } 3005 3006 3007 // Arguments: 3008 // 3009 // Inputs: 3010 // c_rarg0 - byte[] source+offset 3011 // c_rarg1 - int[] SHA.state 3012 // c_rarg2 - int offset 3013 // c_rarg3 - int limit 3014 // 3015 address generate_sha256_implCompress(bool multi_block, const char *name) { 3016 static const uint32_t round_consts[64] = { 3017 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3018 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3019 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3020 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3021 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3022 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3023 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3024 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3025 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3026 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3027 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3028 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3029 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3030 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3031 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3032 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3033 }; 3034 __ align(CodeEntryAlignment); 3035 StubCodeMark mark(this, "StubRoutines", name); 3036 address start = __ pc(); 3037 3038 Register buf = c_rarg0; 3039 Register state = c_rarg1; 3040 Register ofs = c_rarg2; 3041 Register limit = c_rarg3; 3042 3043 Label sha1_loop; 3044 3045 __ stpd(v8, v9, __ pre(sp, -32)); 3046 __ stpd(v10, v11, Address(sp, 16)); 3047 3048 // dga == v0 3049 // dgb == v1 3050 // dg0 == v2 3051 // dg1 == v3 3052 // dg2 == v4 3053 // t0 == v6 3054 // t1 == v7 3055 3056 // load 16 keys to v16..v31 3057 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3058 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3059 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3060 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3061 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3062 3063 // load 8 words (256 bits) state 3064 __ ldpq(v0, v1, state); 3065 3066 __ BIND(sha1_loop); 3067 // load 64 bytes of data into v8..v11 3068 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? 
__ post(buf, 64) : buf); 3069 __ rev32(v8, __ T16B, v8); 3070 __ rev32(v9, __ T16B, v9); 3071 __ rev32(v10, __ T16B, v10); 3072 __ rev32(v11, __ T16B, v11); 3073 3074 __ addv(v6, __ T4S, v8, v16); 3075 __ orr(v2, __ T16B, v0, v0); 3076 __ orr(v3, __ T16B, v1, v1); 3077 3078 FloatRegister d0 = v8; 3079 FloatRegister d1 = v9; 3080 FloatRegister d2 = v10; 3081 FloatRegister d3 = v11; 3082 3083 3084 for (int round = 0; round < 16; round++) { 3085 FloatRegister tmp1 = (round & 1) ? v6 : v7; 3086 FloatRegister tmp2 = (round & 1) ? v7 : v6; 3087 FloatRegister tmp3 = (round & 1) ? v2 : v4; 3088 FloatRegister tmp4 = (round & 1) ? v4 : v2; 3089 3090 if (round < 12) __ sha256su0(d0, __ T4S, d1); 3091 __ orr(v4, __ T16B, v2, v2); 3092 if (round < 15) 3093 __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17)); 3094 __ sha256h(v2, __ T4S, v3, tmp2); 3095 __ sha256h2(v3, __ T4S, v4, tmp2); 3096 if (round < 12) __ sha256su1(d0, __ T4S, d2, d3); 3097 3098 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 3099 } 3100 3101 __ addv(v0, __ T4S, v0, v2); 3102 __ addv(v1, __ T4S, v1, v3); 3103 3104 if (multi_block) { 3105 __ add(ofs, ofs, 64); 3106 __ cmp(ofs, limit); 3107 __ br(Assembler::LE, sha1_loop); 3108 __ mov(c_rarg0, ofs); // return ofs 3109 } 3110 3111 __ ldpd(v10, v11, Address(sp, 16)); 3112 __ ldpd(v8, v9, __ post(sp, 32)); 3113 3114 __ stpq(v0, v1, state); 3115 3116 __ ret(lr); 3117 3118 return start; 3119 } 3120 3121 #ifndef BUILTIN_SIM 3122 // Safefetch stubs. 3123 void generate_safefetch(const char* name, int size, address* entry, 3124 address* fault_pc, address* continuation_pc) { 3125 // safefetch signatures: 3126 // int SafeFetch32(int* adr, int errValue); 3127 // intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue); 3128 // 3129 // arguments: 3130 // c_rarg0 = adr 3131 // c_rarg1 = errValue 3132 // 3133 // result: 3134 // PPC_RET = *adr or errValue 3135 3136 StubCodeMark mark(this, "StubRoutines", name); 3137 3138 // Entry point, pc or function descriptor. 3139 *entry = __ pc(); 3140 3141 // Load *adr into c_rarg1, may fault. 
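 // If the load does fault, the signal handler recognizes the faulting pc as
 // *fault_pc and resumes execution at *continuation_pc, so the caller simply
 // observes errValue. Hedged usage sketch (SafeFetch32 is the shared wrapper
 // around this stub):
 //
 //   int v = SafeFetch32((int*)maybe_unmapped, -1);
 //   // v == -1 if the address could not be read (or genuinely holds -1)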
3142 *fault_pc = __ pc(); 3143 switch (size) { 3144 case 4: 3145 // int32_t 3146 __ ldrw(c_rarg1, Address(c_rarg0, 0)); 3147 break; 3148 case 8: 3149 // int64_t 3150 __ ldr(c_rarg1, Address(c_rarg0, 0)); 3151 break; 3152 default: 3153 ShouldNotReachHere(); 3154 } 3155 3156 // return errValue or *adr 3157 *continuation_pc = __ pc(); 3158 __ mov(r0, c_rarg1); 3159 __ ret(lr); 3160 } 3161 #endif 3162 3163 /** 3164 * Arguments: 3165 * 3166 * Inputs: 3167 * c_rarg0 - int crc 3168 * c_rarg1 - byte* buf 3169 * c_rarg2 - int length 3170 * 3171 * Ouput: 3172 * rax - int crc result 3173 */ 3174 address generate_updateBytesCRC32() { 3175 assert(UseCRC32Intrinsics, "what are we doing here?"); 3176 3177 __ align(CodeEntryAlignment); 3178 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32"); 3179 3180 address start = __ pc(); 3181 3182 const Register crc = c_rarg0; // crc 3183 const Register buf = c_rarg1; // source java byte array address 3184 const Register len = c_rarg2; // length 3185 const Register table0 = c_rarg3; // crc_table address 3186 const Register table1 = c_rarg4; 3187 const Register table2 = c_rarg5; 3188 const Register table3 = c_rarg6; 3189 const Register tmp3 = c_rarg7; 3190 3191 BLOCK_COMMENT("Entry:"); 3192 __ enter(); // required for proper stackwalking of RuntimeStub frame 3193 3194 __ kernel_crc32(crc, buf, len, 3195 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 3196 3197 __ leave(); // required for proper stackwalking of RuntimeStub frame 3198 __ ret(lr); 3199 3200 return start; 3201 } 3202 3203 /** 3204 * Arguments: 3205 * 3206 * Inputs: 3207 * c_rarg0 - int crc 3208 * c_rarg1 - byte* buf 3209 * c_rarg2 - int length 3210 * c_rarg3 - int* table 3211 * 3212 * Ouput: 3213 * r0 - int crc result 3214 */ 3215 address generate_updateBytesCRC32C() { 3216 assert(UseCRC32CIntrinsics, "what are we doing here?"); 3217 3218 __ align(CodeEntryAlignment); 3219 StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C"); 3220 3221 address start = __ pc(); 3222 3223 const Register crc = c_rarg0; // crc 3224 const Register buf = c_rarg1; // source java byte array address 3225 const Register len = c_rarg2; // length 3226 const Register table0 = c_rarg3; // crc_table address 3227 const Register table1 = c_rarg4; 3228 const Register table2 = c_rarg5; 3229 const Register table3 = c_rarg6; 3230 const Register tmp3 = c_rarg7; 3231 3232 BLOCK_COMMENT("Entry:"); 3233 __ enter(); // required for proper stackwalking of RuntimeStub frame 3234 3235 __ kernel_crc32c(crc, buf, len, 3236 table0, table1, table2, table3, rscratch1, rscratch2, tmp3); 3237 3238 __ leave(); // required for proper stackwalking of RuntimeStub frame 3239 __ ret(lr); 3240 3241 return start; 3242 } 3243 3244 /*** 3245 * Arguments: 3246 * 3247 * Inputs: 3248 * c_rarg0 - int adler 3249 * c_rarg1 - byte* buff 3250 * c_rarg2 - int len 3251 * 3252 * Output: 3253 * c_rarg0 - int adler result 3254 */ 3255 address generate_updateBytesAdler32() { 3256 __ align(CodeEntryAlignment); 3257 StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32"); 3258 address start = __ pc(); 3259 3260 Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1; 3261 3262 // Aliases 3263 Register adler = c_rarg0; 3264 Register s1 = c_rarg0; 3265 Register s2 = c_rarg3; 3266 Register buff = c_rarg1; 3267 Register len = c_rarg2; 3268 Register nmax = r4; 3269 Register base = r5; 3270 Register count = r6; 3271 Register temp0 = rscratch1; 3272 Register temp1 = rscratch2; 3273 FloatRegister vbytes = 
v0; 3274 FloatRegister vs1acc = v1; 3275 FloatRegister vs2acc = v2; 3276 FloatRegister vtable = v3; 3277 3278 // Max number of bytes we can process before having to take the mod 3279 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 3280 unsigned long BASE = 0xfff1; 3281 unsigned long NMAX = 0x15B0; 3282 3283 __ mov(base, BASE); 3284 __ mov(nmax, NMAX); 3285 3286 // Load accumulation coefficients for the upper 16 bits 3287 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table)); 3288 __ ld1(vtable, __ T16B, Address(temp0)); 3289 3290 // s1 is initialized to the lower 16 bits of adler 3291 // s2 is initialized to the upper 16 bits of adler 3292 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 3293 __ uxth(s1, adler); // s1 = (adler & 0xffff) 3294 3295 // The pipelined loop needs at least 16 elements for 1 iteration 3296 // It does check this, but it is more effective to skip to the cleanup loop 3297 __ cmp(len, (u1)16); 3298 __ br(Assembler::HS, L_nmax); 3299 __ cbz(len, L_combine); 3300 3301 __ bind(L_simple_by1_loop); 3302 __ ldrb(temp0, Address(__ post(buff, 1))); 3303 __ add(s1, s1, temp0); 3304 __ add(s2, s2, s1); 3305 __ subs(len, len, 1); 3306 __ br(Assembler::HI, L_simple_by1_loop); 3307 3308 // s1 = s1 % BASE 3309 __ subs(temp0, s1, base); 3310 __ csel(s1, temp0, s1, Assembler::HS); 3311 3312 // s2 = s2 % BASE 3313 __ lsr(temp0, s2, 16); 3314 __ lsl(temp1, temp0, 4); 3315 __ sub(temp1, temp1, temp0); 3316 __ add(s2, temp1, s2, ext::uxth); 3317 3318 __ subs(temp0, s2, base); 3319 __ csel(s2, temp0, s2, Assembler::HS); 3320 3321 __ b(L_combine); 3322 3323 __ bind(L_nmax); 3324 __ subs(len, len, nmax); 3325 __ sub(count, nmax, 16); 3326 __ br(Assembler::LO, L_by16); 3327 3328 __ bind(L_nmax_loop); 3329 3330 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 3331 vbytes, vs1acc, vs2acc, vtable); 3332 3333 __ subs(count, count, 16); 3334 __ br(Assembler::HS, L_nmax_loop); 3335 3336 // s1 = s1 % BASE 3337 __ lsr(temp0, s1, 16); 3338 __ lsl(temp1, temp0, 4); 3339 __ sub(temp1, temp1, temp0); 3340 __ add(temp1, temp1, s1, ext::uxth); 3341 3342 __ lsr(temp0, temp1, 16); 3343 __ lsl(s1, temp0, 4); 3344 __ sub(s1, s1, temp0); 3345 __ add(s1, s1, temp1, ext:: uxth); 3346 3347 __ subs(temp0, s1, base); 3348 __ csel(s1, temp0, s1, Assembler::HS); 3349 3350 // s2 = s2 % BASE 3351 __ lsr(temp0, s2, 16); 3352 __ lsl(temp1, temp0, 4); 3353 __ sub(temp1, temp1, temp0); 3354 __ add(temp1, temp1, s2, ext::uxth); 3355 3356 __ lsr(temp0, temp1, 16); 3357 __ lsl(s2, temp0, 4); 3358 __ sub(s2, s2, temp0); 3359 __ add(s2, s2, temp1, ext:: uxth); 3360 3361 __ subs(temp0, s2, base); 3362 __ csel(s2, temp0, s2, Assembler::HS); 3363 3364 __ subs(len, len, nmax); 3365 __ sub(count, nmax, 16); 3366 __ br(Assembler::HS, L_nmax_loop); 3367 3368 __ bind(L_by16); 3369 __ adds(len, len, count); 3370 __ br(Assembler::LO, L_by1); 3371 3372 __ bind(L_by16_loop); 3373 3374 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 3375 vbytes, vs1acc, vs2acc, vtable); 3376 3377 __ subs(len, len, 16); 3378 __ br(Assembler::HS, L_by16_loop); 3379 3380 __ bind(L_by1); 3381 __ adds(len, len, 15); 3382 __ br(Assembler::LO, L_do_mod); 3383 3384 __ bind(L_by1_loop); 3385 __ ldrb(temp0, Address(__ post(buff, 1))); 3386 __ add(s1, temp0, s1); 3387 __ add(s2, s2, s1); 3388 __ subs(len, len, 1); 3389 __ br(Assembler::HS, L_by1_loop); 3390 3391 __ bind(L_do_mod); 3392 // s1 = s1 % BASE 3393 __ lsr(temp0, s1, 16); 3394 __ lsl(temp1, temp0, 4); 
3395 __ sub(temp1, temp1, temp0); 3396 __ add(temp1, temp1, s1, ext::uxth); 3397 3398 __ lsr(temp0, temp1, 16); 3399 __ lsl(s1, temp0, 4); 3400 __ sub(s1, s1, temp0); 3401 __ add(s1, s1, temp1, ext::uxth); 3402 3403 __ subs(temp0, s1, base); 3404 __ csel(s1, temp0, s1, Assembler::HS); 3405 3406 // s2 = s2 % BASE 3407 __ lsr(temp0, s2, 16); 3408 __ lsl(temp1, temp0, 4); 3409 __ sub(temp1, temp1, temp0); 3410 __ add(temp1, temp1, s2, ext::uxth); 3411 3412 __ lsr(temp0, temp1, 16); 3413 __ lsl(s2, temp0, 4); 3414 __ sub(s2, s2, temp0); 3415 __ add(s2, s2, temp1, ext::uxth); 3416 3417 __ subs(temp0, s2, base); 3418 __ csel(s2, temp0, s2, Assembler::HS); 3419 3420 // Combine lower bits and higher bits 3421 __ bind(L_combine); 3422 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16) 3423 3424 __ ret(lr); 3425 3426 return start; 3427 } 3428 3429 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff, 3430 Register temp0, Register temp1, FloatRegister vbytes, 3431 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) { 3432 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes. 3433 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration. 3434 // In non-vectorized code, we update s1 and s2 as: 3435 // s1 <- s1 + b1 3436 // s2 <- s2 + s1 3437 // s1 <- s1 + b2 3438 // s2 <- s2 + s1 3439 // ... 3440 // s1 <- s1 + b16 3441 // s2 <- s2 + s1 3442 // Putting above assignments together, we have: 3443 // s1_new = s1 + b1 + b2 + ... + b16 3444 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16) 3445 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1) 3446 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1) 3447 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16))); 3448 3449 // s2 = s2 + s1 * 16 3450 __ add(s2, s2, s1, Assembler::LSL, 4); 3451 3452 // vs1acc = b1 + b2 + b3 + ... + b16 3453 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ...
+ (b16 * 1) 3454 __ umullv(vs2acc, __ T8B, vtable, vbytes); 3455 __ umlalv(vs2acc, __ T16B, vtable, vbytes); 3456 __ uaddlv(vs1acc, __ T16B, vbytes); 3457 __ uaddlv(vs2acc, __ T8H, vs2acc); 3458 3459 // s1 = s1 + vs1acc, s2 = s2 + vs2acc 3460 __ fmovd(temp0, vs1acc); 3461 __ fmovd(temp1, vs2acc); 3462 __ add(s1, s1, temp0); 3463 __ add(s2, s2, temp1); 3464 } 3465 3466 /** 3467 * Arguments: 3468 * 3469 * Input: 3470 * c_rarg0 - x address 3471 * c_rarg1 - x length 3472 * c_rarg2 - y address 3473 * c_rarg3 - y lenth 3474 * c_rarg4 - z address 3475 * c_rarg5 - z length 3476 */ 3477 address generate_multiplyToLen() { 3478 __ align(CodeEntryAlignment); 3479 StubCodeMark mark(this, "StubRoutines", "multiplyToLen"); 3480 3481 address start = __ pc(); 3482 const Register x = r0; 3483 const Register xlen = r1; 3484 const Register y = r2; 3485 const Register ylen = r3; 3486 const Register z = r4; 3487 const Register zlen = r5; 3488 3489 const Register tmp1 = r10; 3490 const Register tmp2 = r11; 3491 const Register tmp3 = r12; 3492 const Register tmp4 = r13; 3493 const Register tmp5 = r14; 3494 const Register tmp6 = r15; 3495 const Register tmp7 = r16; 3496 3497 BLOCK_COMMENT("Entry:"); 3498 __ enter(); // required for proper stackwalking of RuntimeStub frame 3499 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 3500 __ leave(); // required for proper stackwalking of RuntimeStub frame 3501 __ ret(lr); 3502 3503 return start; 3504 } 3505 3506 address generate_squareToLen() { 3507 // squareToLen algorithm for sizes 1..127 described in java code works 3508 // faster than multiply_to_len on some CPUs and slower on others, but 3509 // multiply_to_len shows a bit better overall results 3510 __ align(CodeEntryAlignment); 3511 StubCodeMark mark(this, "StubRoutines", "squareToLen"); 3512 address start = __ pc(); 3513 3514 const Register x = r0; 3515 const Register xlen = r1; 3516 const Register z = r2; 3517 const Register zlen = r3; 3518 const Register y = r4; // == x 3519 const Register ylen = r5; // == xlen 3520 3521 const Register tmp1 = r10; 3522 const Register tmp2 = r11; 3523 const Register tmp3 = r12; 3524 const Register tmp4 = r13; 3525 const Register tmp5 = r14; 3526 const Register tmp6 = r15; 3527 const Register tmp7 = r16; 3528 3529 RegSet spilled_regs = RegSet::of(y, ylen); 3530 BLOCK_COMMENT("Entry:"); 3531 __ enter(); 3532 __ push(spilled_regs, sp); 3533 __ mov(y, x); 3534 __ mov(ylen, xlen); 3535 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 3536 __ pop(spilled_regs, sp); 3537 __ leave(); 3538 __ ret(lr); 3539 return start; 3540 } 3541 3542 address generate_mulAdd() { 3543 __ align(CodeEntryAlignment); 3544 StubCodeMark mark(this, "StubRoutines", "mulAdd"); 3545 3546 address start = __ pc(); 3547 3548 const Register out = r0; 3549 const Register in = r1; 3550 const Register offset = r2; 3551 const Register len = r3; 3552 const Register k = r4; 3553 3554 BLOCK_COMMENT("Entry:"); 3555 __ enter(); 3556 __ mul_add(out, in, offset, len, k); 3557 __ leave(); 3558 __ ret(lr); 3559 3560 return start; 3561 } 3562 3563 void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi, 3564 FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0, 3565 FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) { 3566 // Karatsuba multiplication performs a 128*128 -> 256-bit 3567 // multiplication in three 128-bit multiplications and a few 3568 // additions. 
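  // In this carry-less (GF(2)[x]) setting the "additions" are XORs. A scalar
  // reference of the same three-multiplication split (illustrative sketch;
  // clmul64() stands for a 64x64 -> 128-bit carry-less multiply and is not a
  // real helper in this file):
  //
  //   u128 C = clmul64(A1, B1), D = clmul64(A0, B0);
  //   u128 E = clmul64(A0 ^ A1, B0 ^ B1);
  //   u128 mid = C ^ D ^ E;              // the cross terms A1*B0 ^ A0*B1
  //   r0 = lo64(D);
  //   r1 = hi64(D) ^ lo64(mid);
  //   r2 = lo64(C) ^ hi64(mid);
  //   r3 = hi64(C);                      // 256-bit product r3:r2:r1:r0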
3569 // 3570 // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1) 3571 // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0 3572 // 3573 // Inputs: 3574 // 3575 // A0 in a.d[0] (subkey) 3576 // A1 in a.d[1] 3577 // (A1+A0) in a1_xor_a0.d[0] 3578 // 3579 // B0 in b.d[0] (state) 3580 // B1 in b.d[1] 3581 3582 __ ext(tmp1, __ T16B, b, b, 0x08); 3583 __ pmull2(result_hi, __ T1Q, b, a, __ T2D); // A1*B1 3584 __ eor(tmp1, __ T16B, tmp1, b); // (B1+B0) 3585 __ pmull(result_lo, __ T1Q, b, a, __ T1D); // A0*B0 3586 __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0) 3587 3588 __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08); 3589 __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0 3590 __ eor(tmp2, __ T16B, tmp2, tmp4); 3591 __ eor(tmp2, __ T16B, tmp2, tmp3); 3592 3593 // Register pair <result_hi:result_lo> holds the result of carry-less multiplication 3594 __ ins(result_hi, __ D, tmp2, 0, 1); 3595 __ ins(result_lo, __ D, tmp2, 1, 0); 3596 } 3597 3598 void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi, 3599 FloatRegister p, FloatRegister z, FloatRegister t1) { 3600 const FloatRegister t0 = result; 3601 3602 // The GCM field polynomial f is z^128 + p(z), where p = 3603 // z^7+z^2+z+1. 3604 // 3605 // z^128 === -p(z) (mod (z^128 + p(z))) 3606 // 3607 // so, given that the product we're reducing is 3608 // a == lo + hi * z^128 3609 // substituting, 3610 // === lo - hi * p(z) (mod (z^128 + p(z))) 3611 // 3612 // we reduce by multiplying hi by p(z) and subtracting the result 3613 // from (i.e. XORing it with) lo. Because p has no nonzero high 3614 // bits we can do this with two 64-bit multiplications, lo*p and 3615 // hi*p. 3616 3617 __ pmull2(t0, __ T1Q, hi, p, __ T2D); 3618 __ ext(t1, __ T16B, t0, z, 8); 3619 __ eor(hi, __ T16B, hi, t1); 3620 __ ext(t1, __ T16B, z, t0, 8); 3621 __ eor(lo, __ T16B, lo, t1); 3622 __ pmull(t0, __ T1Q, hi, p, __ T1D); 3623 __ eor(result, __ T16B, lo, t0); 3624 } 3625 3626 address generate_has_negatives(address &has_negatives_long) { 3627 const u1 large_loop_size = 64; 3628 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 3629 int dcache_line = VM_Version::dcache_line_size(); 3630 3631 Register ary1 = r1, len = r2, result = r0; 3632 3633 __ align(CodeEntryAlignment); 3634 3635 StubCodeMark mark(this, "StubRoutines", "has_negatives"); 3636 3637 address entry = __ pc(); 3638 3639 __ enter(); 3640 3641 Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE, 3642 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 3643 3644 __ cmp(len, (u1)15); 3645 __ br(Assembler::GT, LEN_OVER_15); 3646 // The only case when execution falls into this code is when pointer is near 3647 // the end of memory page and we have to avoid reading next page 3648 __ add(ary1, ary1, len); 3649 __ subs(len, len, 8); 3650 __ br(Assembler::GT, LEN_OVER_8); 3651 __ ldr(rscratch2, Address(ary1, -8)); 3652 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 
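// Note on the tail handling just set up: at this point len holds (length - 8) <= 0,
// so rscratch1 = -(len << 3) equals 64 - 8*length, and the shift below discards the
// bytes that precede the array. In C, roughly (illustrative sketch only; "start" and
// "length" stand for the original argument values, not the already-adjusted registers):
//
//   uint64_t w = *(uint64_t *)(start + length - 8); // page-safe read ending at the last byte
//   w >>= 64 - 8 * length;                          // discard the bytes before 'start'
//   return (w & UPPER_BIT_MASK) != 0;               // any byte with its top bit set?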
3653 __ lsrv(rscratch2, rscratch2, rscratch1);
3654 __ tst(rscratch2, UPPER_BIT_MASK);
3655 __ cset(result, Assembler::NE);
3656 __ leave();
3657 __ ret(lr);
3658 __ bind(LEN_OVER_8);
3659 __ ldp(rscratch1, rscratch2, Address(ary1, -16));
3660 __ sub(len, len, 8); // no data dep., then sub can be executed while loading
3661 __ tst(rscratch2, UPPER_BIT_MASK);
3662 __ br(Assembler::NE, RET_TRUE_NO_POP);
3663 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes
3664 __ lsrv(rscratch1, rscratch1, rscratch2);
3665 __ tst(rscratch1, UPPER_BIT_MASK);
3666 __ cset(result, Assembler::NE);
3667 __ leave();
3668 __ ret(lr);
3669
3670 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10;
3671 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6;
3672
3673 has_negatives_long = __ pc(); // 2nd entry point
3674
3675 __ enter();
3676
3677 __ bind(LEN_OVER_15);
3678 __ push(spilled_regs, sp);
3679 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment
3680 __ cbz(rscratch2, ALIGNED);
3681 __ ldp(tmp6, tmp1, Address(ary1));
3682 __ mov(tmp5, 16);
3683 __ sub(rscratch1, tmp5, rscratch2); // number of bytes until the aligned address
3684 __ add(ary1, ary1, rscratch1);
3685 __ sub(len, len, rscratch1);
3686 __ orr(tmp6, tmp6, tmp1);
3687 __ tst(tmp6, UPPER_BIT_MASK);
3688 __ br(Assembler::NE, RET_TRUE);
3689
3690 __ bind(ALIGNED);
3691 __ cmp(len, large_loop_size);
3692 __ br(Assembler::LT, CHECK_16);
3693 // Perform a 16-byte load in this pre-loop so we can return early when an
3694 // initially aligned large array has negative values in its starting bytes;
3695 // otherwise LARGE_LOOP would do 4 reads instead of 1 (in the worst case),
3696 // which is slower. Cases with negative bytes further ahead are barely
3697 // affected; in fact they get faster thanks to the early loads, fewer
3698 // instructions and fewer branches in LARGE_LOOP.
3699 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16)));
3700 __ sub(len, len, 16);
3701 __ orr(tmp6, tmp6, tmp1);
3702 __ tst(tmp6, UPPER_BIT_MASK);
3703 __ br(Assembler::NE, RET_TRUE);
3704 __ cmp(len, large_loop_size);
3705 __ br(Assembler::LT, CHECK_16);
3706
3707 if (SoftwarePrefetchHintDistance >= 0
3708 && SoftwarePrefetchHintDistance >= dcache_line) {
3709 // initial prefetch
3710 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line));
3711 }
3712 __ bind(LARGE_LOOP);
3713 if (SoftwarePrefetchHintDistance >= 0) {
3714 __ prfm(Address(ary1, SoftwarePrefetchHintDistance));
3715 }
3716 // Issue the load instructions first, since that can save a few CPU/memory
3717 // cycles. Also, instead of 4 triples of "orr(...); andr(...); cbnz(...)"
3718 // (one per ldp) it is better to generate 7 * orr(...) + 1 andr(...) +
3719 // 1 cbnz(...), which saves 3 instructions and has fewer branches; the
3720 // downside is that early return is disabled, so all 64 bytes are loaded and checked every time.
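// In C, one LARGE_LOOP iteration is approximately (illustrative sketch only,
// not the generated code; "p" is a hypothetical pointer to the current chunk):
//
//   uint64_t acc = 0;
//   for (int i = 0; i < 8; i++)         // eight 8-byte words = 64 bytes
//     acc |= p[i];                      // OR-reduce without per-word branches
//   if (acc & UPPER_BIT_MASK)           // did any byte have its sign bit set?
//     return true;                      // i.e. a negative byte was found
//   p += 8; remaining -= 64;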
3721 __ ldp(tmp2, tmp3, Address(ary1)); 3722 __ ldp(tmp4, tmp5, Address(ary1, 16)); 3723 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 3724 __ ldp(tmp6, tmp1, Address(ary1, 48)); 3725 __ add(ary1, ary1, large_loop_size); 3726 __ sub(len, len, large_loop_size); 3727 __ orr(tmp2, tmp2, tmp3); 3728 __ orr(tmp4, tmp4, tmp5); 3729 __ orr(rscratch1, rscratch1, rscratch2); 3730 __ orr(tmp6, tmp6, tmp1); 3731 __ orr(tmp2, tmp2, tmp4); 3732 __ orr(rscratch1, rscratch1, tmp6); 3733 __ orr(tmp2, tmp2, rscratch1); 3734 __ tst(tmp2, UPPER_BIT_MASK); 3735 __ br(Assembler::NE, RET_TRUE); 3736 __ cmp(len, large_loop_size); 3737 __ br(Assembler::GE, LARGE_LOOP); 3738 3739 __ bind(CHECK_16); // small 16-byte load pre-loop 3740 __ cmp(len, (u1)16); 3741 __ br(Assembler::LT, POST_LOOP16); 3742 3743 __ bind(LOOP16); // small 16-byte load loop 3744 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 3745 __ sub(len, len, 16); 3746 __ orr(tmp2, tmp2, tmp3); 3747 __ tst(tmp2, UPPER_BIT_MASK); 3748 __ br(Assembler::NE, RET_TRUE); 3749 __ cmp(len, (u1)16); 3750 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 3751 3752 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 3753 __ cmp(len, (u1)8); 3754 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 3755 __ ldr(tmp3, Address(__ post(ary1, 8))); 3756 __ sub(len, len, 8); 3757 __ tst(tmp3, UPPER_BIT_MASK); 3758 __ br(Assembler::NE, RET_TRUE); 3759 3760 __ bind(POST_LOOP16_LOAD_TAIL); 3761 __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0 3762 __ ldr(tmp1, Address(ary1)); 3763 __ mov(tmp2, 64); 3764 __ sub(tmp4, tmp2, len, __ LSL, 3); 3765 __ lslv(tmp1, tmp1, tmp4); 3766 __ tst(tmp1, UPPER_BIT_MASK); 3767 __ br(Assembler::NE, RET_TRUE); 3768 // Fallthrough 3769 3770 __ bind(RET_FALSE); 3771 __ pop(spilled_regs, sp); 3772 __ leave(); 3773 __ mov(result, zr); 3774 __ ret(lr); 3775 3776 __ bind(RET_TRUE); 3777 __ pop(spilled_regs, sp); 3778 __ bind(RET_TRUE_NO_POP); 3779 __ leave(); 3780 __ mov(result, 1); 3781 __ ret(lr); 3782 3783 __ bind(DONE); 3784 __ pop(spilled_regs, sp); 3785 __ leave(); 3786 __ ret(lr); 3787 return entry; 3788 } 3789 3790 void generate_large_array_equals_loop_nonsimd(int loopThreshold, 3791 bool usePrefetch, Label &NOT_EQUAL) { 3792 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 3793 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 3794 tmp7 = r12, tmp8 = r13; 3795 Label LOOP; 3796 3797 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3798 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3799 __ bind(LOOP); 3800 if (usePrefetch) { 3801 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 3802 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 3803 } 3804 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 3805 __ eor(tmp1, tmp1, tmp2); 3806 __ eor(tmp3, tmp3, tmp4); 3807 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 3808 __ orr(tmp1, tmp1, tmp3); 3809 __ cbnz(tmp1, NOT_EQUAL); 3810 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3811 __ eor(tmp5, tmp5, tmp6); 3812 __ eor(tmp7, tmp7, tmp8); 3813 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3814 __ orr(tmp5, tmp5, tmp7); 3815 __ cbnz(tmp5, NOT_EQUAL); 3816 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 3817 __ eor(tmp1, tmp1, tmp2); 3818 __ eor(tmp3, tmp3, tmp4); 3819 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 3820 __ orr(tmp1, tmp1, tmp3); 3821 __ cbnz(tmp1, NOT_EQUAL); 3822 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3823 __ eor(tmp5, tmp5, tmp6); 
3824 __ sub(cnt1, cnt1, 8 * wordSize);
3825 __ eor(tmp7, tmp7, tmp8);
3826 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3827 // tmp6 is not used. MacroAssembler::subs is used here (rather than
3828 // cmp) because subs allows an unlimited range of immediate operands.
3829 __ subs(tmp6, cnt1, loopThreshold);
3830 __ orr(tmp5, tmp5, tmp7);
3831 __ cbnz(tmp5, NOT_EQUAL);
3832 __ br(__ GE, LOOP);
3833 // post-loop
3834 __ eor(tmp1, tmp1, tmp2);
3835 __ eor(tmp3, tmp3, tmp4);
3836 __ orr(tmp1, tmp1, tmp3);
3837 __ sub(cnt1, cnt1, 2 * wordSize);
3838 __ cbnz(tmp1, NOT_EQUAL);
3839 }
3840
3841 void generate_large_array_equals_loop_simd(int loopThreshold,
3842 bool usePrefetch, Label &NOT_EQUAL) {
3843 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3844 tmp2 = rscratch2;
3845 Label LOOP;
3846
3847 __ bind(LOOP);
3848 if (usePrefetch) {
3849 __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3850 __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3851 }
3852 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
3853 __ sub(cnt1, cnt1, 8 * wordSize);
3854 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
3855 __ subs(tmp1, cnt1, loopThreshold);
3856 __ eor(v0, __ T16B, v0, v4);
3857 __ eor(v1, __ T16B, v1, v5);
3858 __ eor(v2, __ T16B, v2, v6);
3859 __ eor(v3, __ T16B, v3, v7);
3860 __ orr(v0, __ T16B, v0, v1);
3861 __ orr(v1, __ T16B, v2, v3);
3862 __ orr(v0, __ T16B, v0, v1);
3863 __ umov(tmp1, v0, __ D, 0);
3864 __ umov(tmp2, v0, __ D, 1);
3865 __ orr(tmp1, tmp1, tmp2);
3866 __ cbnz(tmp1, NOT_EQUAL);
3867 __ br(__ GE, LOOP);
3868 }
3869
3870 // a1 = r1 - array1 address
3871 // a2 = r2 - array2 address
3872 // result = r0 - return value. Already contains "false"
3873 // cnt1 = r10 - number of elements left to check, reduced by wordSize
3874 // r3-r5 are reserved temporary registers
3875 address generate_large_array_equals() {
3876 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3877 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3878 tmp7 = r12, tmp8 = r13;
3879 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
3880 SMALL_LOOP, POST_LOOP;
3881 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
3882 // only use the prefetch loop if at least 32 of the prefetched bytes will be used
3883 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
3884 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
3885 RegSet spilled_regs = RegSet::range(tmp6, tmp8);
3886 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
3887 tmp5, tmp6, tmp7, tmp8);
3888
3889 __ align(CodeEntryAlignment);
3890
3891 StubCodeMark mark(this, "StubRoutines", "large_array_equals");
3892
3893 address entry = __ pc();
3894 __ enter();
3895 __ sub(cnt1, cnt1, wordSize);  // first 8 bytes were loaded outside of stub
3896 // also advance pointers to use post-increment instead of pre-increment
3897 __ add(a1, a1, wordSize);
3898 __ add(a2, a2, wordSize);
3899 if (AvoidUnalignedAccesses) {
3900 // Both implementations (SIMD/non-SIMD) use relatively large load
3901 // instructions (ld1/ldp), which carry a big penalty (up to 2x execution
3902 // time) on some CPUs when the address is not at least 16-byte aligned.
3903 // Arrays are currently 8-byte aligned, so, if needed, do one additional
3904 // 8-byte load for the 1st address to make it 16-byte aligned.
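// In C, the fix-up below is approximately (sketch only; assumes, as stated
// above, that a1 and a2 are already 8-byte aligned):
//
//   if ((uintptr_t)a1 & 8) {            // 8- but not 16-byte aligned
//     if (*a1++ != *a2++)               // compare one extra word now
//       return false;
//     cnt1 -= wordSize;                 // one word already consumed
//   }
//   // the main loops now see a 16-byte aligned a1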
3905 Label ALIGNED16; 3906 __ tbz(a1, 3, ALIGNED16); 3907 __ ldr(tmp1, Address(__ post(a1, wordSize))); 3908 __ ldr(tmp2, Address(__ post(a2, wordSize))); 3909 __ sub(cnt1, cnt1, wordSize); 3910 __ eor(tmp1, tmp1, tmp2); 3911 __ cbnz(tmp1, NOT_EQUAL_NO_POP); 3912 __ bind(ALIGNED16); 3913 } 3914 if (UseSIMDForArrayEquals) { 3915 if (SoftwarePrefetchHintDistance >= 0) { 3916 __ subs(tmp1, cnt1, prefetchLoopThreshold); 3917 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 3918 generate_large_array_equals_loop_simd(prefetchLoopThreshold, 3919 /* prfm = */ true, NOT_EQUAL); 3920 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 3921 __ br(__ LT, TAIL); 3922 } 3923 __ bind(NO_PREFETCH_LARGE_LOOP); 3924 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold, 3925 /* prfm = */ false, NOT_EQUAL); 3926 } else { 3927 __ push(spilled_regs, sp); 3928 if (SoftwarePrefetchHintDistance >= 0) { 3929 __ subs(tmp1, cnt1, prefetchLoopThreshold); 3930 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 3931 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold, 3932 /* prfm = */ true, NOT_EQUAL); 3933 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 3934 __ br(__ LT, TAIL); 3935 } 3936 __ bind(NO_PREFETCH_LARGE_LOOP); 3937 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold, 3938 /* prfm = */ false, NOT_EQUAL); 3939 } 3940 __ bind(TAIL); 3941 __ cbz(cnt1, EQUAL); 3942 __ subs(cnt1, cnt1, wordSize); 3943 __ br(__ LE, POST_LOOP); 3944 __ bind(SMALL_LOOP); 3945 __ ldr(tmp1, Address(__ post(a1, wordSize))); 3946 __ ldr(tmp2, Address(__ post(a2, wordSize))); 3947 __ subs(cnt1, cnt1, wordSize); 3948 __ eor(tmp1, tmp1, tmp2); 3949 __ cbnz(tmp1, NOT_EQUAL); 3950 __ br(__ GT, SMALL_LOOP); 3951 __ bind(POST_LOOP); 3952 __ ldr(tmp1, Address(a1, cnt1)); 3953 __ ldr(tmp2, Address(a2, cnt1)); 3954 __ eor(tmp1, tmp1, tmp2); 3955 __ cbnz(tmp1, NOT_EQUAL); 3956 __ bind(EQUAL); 3957 __ mov(result, true); 3958 __ bind(NOT_EQUAL); 3959 if (!UseSIMDForArrayEquals) { 3960 __ pop(spilled_regs, sp); 3961 } 3962 __ bind(NOT_EQUAL_NO_POP); 3963 __ leave(); 3964 __ ret(lr); 3965 return entry; 3966 } 3967 3968 address generate_dsin_dcos(bool isCos) { 3969 __ align(CodeEntryAlignment); 3970 StubCodeMark mark(this, "StubRoutines", isCos ? 
"libmDcos" : "libmDsin"); 3971 address start = __ pc(); 3972 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw, 3973 (address)StubRoutines::aarch64::_two_over_pi, 3974 (address)StubRoutines::aarch64::_pio2, 3975 (address)StubRoutines::aarch64::_dsin_coef, 3976 (address)StubRoutines::aarch64::_dcos_coef); 3977 return start; 3978 } 3979 3980 address generate_dlog() { 3981 __ align(CodeEntryAlignment); 3982 StubCodeMark mark(this, "StubRoutines", "dlog"); 3983 address entry = __ pc(); 3984 FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4, 3985 vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19; 3986 Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4; 3987 __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3, 3988 tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5); 3989 return entry; 3990 } 3991 3992 // code for comparing 16 bytes of strings with same encoding 3993 void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) { 3994 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11; 3995 __ ldr(rscratch1, Address(__ post(str1, 8))); 3996 __ eor(rscratch2, tmp1, tmp2); 3997 __ ldr(cnt1, Address(__ post(str2, 8))); 3998 __ cbnz(rscratch2, DIFF1); 3999 __ ldr(tmp1, Address(__ post(str1, 8))); 4000 __ eor(rscratch2, rscratch1, cnt1); 4001 __ ldr(tmp2, Address(__ post(str2, 8))); 4002 __ cbnz(rscratch2, DIFF2); 4003 } 4004 4005 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding 4006 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 4007 Label &DIFF2) { 4008 Register cnt1 = r2, tmp1 = r10, tmp2 = r11, tmp3 = r12; 4009 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; 4010 4011 __ ldrq(vtmp, Address(__ post(tmp2, 16))); 4012 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4013 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 4014 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 4015 4016 __ fmovd(tmpL, vtmp3); 4017 __ eor(rscratch2, tmp3, tmpL); 4018 __ cbnz(rscratch2, DIFF2); 4019 4020 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4021 __ umov(tmpL, vtmp3, __ D, 1); 4022 __ eor(rscratch2, tmpU, tmpL); 4023 __ cbnz(rscratch2, DIFF1); 4024 4025 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 4026 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4027 __ fmovd(tmpL, vtmp); 4028 __ eor(rscratch2, tmp3, tmpL); 4029 __ cbnz(rscratch2, DIFF2); 4030 4031 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4032 __ umov(tmpL, vtmp, __ D, 1); 4033 __ eor(rscratch2, tmpU, tmpL); 4034 __ cbnz(rscratch2, DIFF1); 4035 } 4036 4037 // r0 = result 4038 // r1 = str1 4039 // r2 = cnt1 4040 // r3 = str2 4041 // r4 = cnt2 4042 // r10 = tmp1 4043 // r11 = tmp2 4044 address generate_compare_long_string_different_encoding(bool isLU) { 4045 __ align(CodeEntryAlignment); 4046 StubCodeMark mark(this, "StubRoutines", isLU 4047 ? 
"compare_long_string_different_encoding LU" 4048 : "compare_long_string_different_encoding UL"); 4049 address entry = __ pc(); 4050 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, 4051 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH, 4052 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; 4053 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 4054 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; 4055 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; 4056 RegSet spilled_regs = RegSet::of(tmp3, tmp4); 4057 4058 int prefetchLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance/2); 4059 4060 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); 4061 // cnt2 == amount of characters left to compare 4062 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL)) 4063 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 4064 __ add(str1, str1, isLU ? wordSize/2 : wordSize); 4065 __ add(str2, str2, isLU ? wordSize : wordSize/2); 4066 __ fmovd(isLU ? tmp1 : tmp2, vtmp); 4067 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case. 4068 __ add(str1, str1, cnt2, __ LSL, isLU ? 0 : 1); 4069 __ eor(rscratch2, tmp1, tmp2); 4070 __ add(str2, str2, cnt2, __ LSL, isLU ? 1 : 0); 4071 __ mov(rscratch1, tmp2); 4072 __ cbnz(rscratch2, CALCULATE_DIFFERENCE); 4073 Register strU = isLU ? str2 : str1, 4074 strL = isLU ? str1 : str2, 4075 tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison 4076 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison 4077 __ push(spilled_regs, sp); 4078 __ sub(tmp2, strL, cnt2); // strL pointer to load from 4079 __ sub(cnt1, strU, cnt2, __ LSL, 1); // strU pointer to load from 4080 4081 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4082 4083 if (SoftwarePrefetchHintDistance >= 0) { 4084 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 4085 __ br(__ LT, NO_PREFETCH); 4086 __ bind(LARGE_LOOP_PREFETCH); 4087 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance)); 4088 __ mov(tmp4, 2); 4089 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 4090 __ bind(LARGE_LOOP_PREFETCH_REPEAT1); 4091 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4092 __ subs(tmp4, tmp4, 1); 4093 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1); 4094 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 4095 __ mov(tmp4, 2); 4096 __ bind(LARGE_LOOP_PREFETCH_REPEAT2); 4097 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4098 __ subs(tmp4, tmp4, 1); 4099 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2); 4100 __ sub(cnt2, cnt2, 64); 4101 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 4102 __ br(__ GE, LARGE_LOOP_PREFETCH); 4103 } 4104 __ cbz(cnt2, LOAD_LAST); // no characters left except last load 4105 __ bind(NO_PREFETCH); 4106 __ subs(cnt2, cnt2, 16); 4107 __ br(__ LT, TAIL); 4108 __ bind(SMALL_LOOP); // smaller loop 4109 __ subs(cnt2, cnt2, 16); 4110 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4111 __ br(__ GE, SMALL_LOOP); 4112 __ cmn(cnt2, (u1)16); 4113 __ br(__ EQ, LOAD_LAST); 4114 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters) 4115 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 8 bytes before last 4 characters in UTF-16 string 4116 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string 4117 __ ldr(tmp3, Address(cnt1, -8)); 4118 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load 4119 __ b(LOAD_LAST); 4120 __ bind(DIFF2); 4121 __ mov(tmpU, tmp3); 4122 __ bind(DIFF1); 4123 __ pop(spilled_regs, sp); 4124 __ b(CALCULATE_DIFFERENCE); 4125 
__ bind(LOAD_LAST); 4126 // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU. 4127 // No need to load it again 4128 __ mov(tmpU, tmp3); 4129 __ pop(spilled_regs, sp); 4130 4131 __ ldrs(vtmp, Address(strL)); 4132 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 4133 __ fmovd(tmpL, vtmp); 4134 4135 __ eor(rscratch2, tmpU, tmpL); 4136 __ cbz(rscratch2, DONE); 4137 4138 // Find the first different characters in the longwords and 4139 // compute their difference. 4140 __ bind(CALCULATE_DIFFERENCE); 4141 __ rev(rscratch2, rscratch2); 4142 __ clz(rscratch2, rscratch2); 4143 __ andr(rscratch2, rscratch2, -16); 4144 __ lsrv(tmp1, tmp1, rscratch2); 4145 __ uxthw(tmp1, tmp1); 4146 __ lsrv(rscratch1, rscratch1, rscratch2); 4147 __ uxthw(rscratch1, rscratch1); 4148 __ subw(result, tmp1, rscratch1); 4149 __ bind(DONE); 4150 __ ret(lr); 4151 return entry; 4152 } 4153 4154 // r0 = result 4155 // r1 = str1 4156 // r2 = cnt1 4157 // r3 = str2 4158 // r4 = cnt2 4159 // r10 = tmp1 4160 // r11 = tmp2 4161 address generate_compare_long_string_same_encoding(bool isLL) { 4162 __ align(CodeEntryAlignment); 4163 StubCodeMark mark(this, "StubRoutines", isLL 4164 ? "compare_long_string_same_encoding LL" 4165 : "compare_long_string_same_encoding UU"); 4166 address entry = __ pc(); 4167 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 4168 tmp1 = r10, tmp2 = r11; 4169 Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL, 4170 LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF, 4171 DIFF_LAST_POSITION, DIFF_LAST_POSITION2; 4172 // exit from large loop when less than 64 bytes left to read or we're about 4173 // to prefetch memory behind array border 4174 int largeLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); 4175 // cnt1/cnt2 contains amount of characters to compare. cnt1 can be re-used 4176 // update cnt2 counter with already loaded 8 bytes 4177 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2)); 4178 // update pointers, because of previous read 4179 __ add(str1, str1, wordSize); 4180 __ add(str2, str2, wordSize); 4181 if (SoftwarePrefetchHintDistance >= 0) { 4182 __ bind(LARGE_LOOP_PREFETCH); 4183 __ prfm(Address(str1, SoftwarePrefetchHintDistance)); 4184 __ prfm(Address(str2, SoftwarePrefetchHintDistance)); 4185 compare_string_16_bytes_same(DIFF, DIFF2); 4186 compare_string_16_bytes_same(DIFF, DIFF2); 4187 __ sub(cnt2, cnt2, isLL ? 64 : 32); 4188 compare_string_16_bytes_same(DIFF, DIFF2); 4189 __ subs(rscratch2, cnt2, largeLoopExitCondition); 4190 compare_string_16_bytes_same(DIFF, DIFF2); 4191 __ br(__ GT, LARGE_LOOP_PREFETCH); 4192 __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left? 4193 } 4194 // less than 16 bytes left? 4195 __ subs(cnt2, cnt2, isLL ? 16 : 8); 4196 __ br(__ LT, TAIL); 4197 __ bind(SMALL_LOOP); 4198 compare_string_16_bytes_same(DIFF, DIFF2); 4199 __ subs(cnt2, cnt2, isLL ? 16 : 8); 4200 __ br(__ GE, SMALL_LOOP); 4201 __ bind(TAIL); 4202 __ adds(cnt2, cnt2, isLL ? 16 : 8); 4203 __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF); 4204 __ subs(cnt2, cnt2, isLL ? 8 : 4); 4205 __ br(__ LE, CHECK_LAST); 4206 __ eor(rscratch2, tmp1, tmp2); 4207 __ cbnz(rscratch2, DIFF); 4208 __ ldr(tmp1, Address(__ post(str1, 8))); 4209 __ ldr(tmp2, Address(__ post(str2, 8))); 4210 __ sub(cnt2, cnt2, isLL ? 
8 : 4);
4211 __ bind(CHECK_LAST);
4212 if (!isLL) {
4213 __ add(cnt2, cnt2, cnt2); // now in bytes
4214 }
4215 __ eor(rscratch2, tmp1, tmp2);
4216 __ cbnz(rscratch2, DIFF);
4217 __ ldr(rscratch1, Address(str1, cnt2));
4218 __ ldr(cnt1, Address(str2, cnt2));
4219 __ eor(rscratch2, rscratch1, cnt1);
4220 __ cbz(rscratch2, LENGTH_DIFF);
4221 // Find the first different characters in the longwords and
4222 // compute their difference.
4223 __ bind(DIFF2);
4224 __ rev(rscratch2, rscratch2);
4225 __ clz(rscratch2, rscratch2);
4226 __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4227 __ lsrv(rscratch1, rscratch1, rscratch2);
4228 if (isLL) {
4229 __ lsrv(cnt1, cnt1, rscratch2);
4230 __ uxtbw(rscratch1, rscratch1);
4231 __ uxtbw(cnt1, cnt1);
4232 } else {
4233 __ lsrv(cnt1, cnt1, rscratch2);
4234 __ uxthw(rscratch1, rscratch1);
4235 __ uxthw(cnt1, cnt1);
4236 }
4237 __ subw(result, rscratch1, cnt1);
4238 __ b(LENGTH_DIFF);
4239 __ bind(DIFF);
4240 __ rev(rscratch2, rscratch2);
4241 __ clz(rscratch2, rscratch2);
4242 __ andr(rscratch2, rscratch2, isLL ? -8 : -16);
4243 __ lsrv(tmp1, tmp1, rscratch2);
4244 if (isLL) {
4245 __ lsrv(tmp2, tmp2, rscratch2);
4246 __ uxtbw(tmp1, tmp1);
4247 __ uxtbw(tmp2, tmp2);
4248 } else {
4249 __ lsrv(tmp2, tmp2, rscratch2);
4250 __ uxthw(tmp1, tmp1);
4251 __ uxthw(tmp2, tmp2);
4252 }
4253 __ subw(result, tmp1, tmp2);
4254 __ b(LENGTH_DIFF);
4255 __ bind(LAST_CHECK_AND_LENGTH_DIFF);
4256 __ eor(rscratch2, tmp1, tmp2);
4257 __ cbnz(rscratch2, DIFF);
4258 __ bind(LENGTH_DIFF);
4259 __ ret(lr);
4260 return entry;
4261 }
4262
4263 void generate_compare_long_strings() {
4264 StubRoutines::aarch64::_compare_long_string_LL
4265 = generate_compare_long_string_same_encoding(true);
4266 StubRoutines::aarch64::_compare_long_string_UU
4267 = generate_compare_long_string_same_encoding(false);
4268 StubRoutines::aarch64::_compare_long_string_LU
4269 = generate_compare_long_string_different_encoding(true);
4270 StubRoutines::aarch64::_compare_long_string_UL
4271 = generate_compare_long_string_different_encoding(false);
4272 }
4273
4274 // R0 = result
4275 // R1 = str2
4276 // R2 = cnt1
4277 // R3 = str1
4278 // R4 = cnt2
4279 // This generic linear code uses a few additional ideas which make it faster:
4280 // 1) we can safely keep at least the 1st register of the pattern (since its
4281 //    length >= 8), skipping the initial load (helps on systems with one load pipeline)
4282 // 2) we can use a "fast" algorithm for finding the first character, with fewer
4283 //    branches (1 branch per loaded register instead of one per symbol); this is
4284 //    where constants like 0x0101...01, 0x00010001...0001, 0x7f7f...7f,
4285 //    0x7fff7fff...7fff come from
4286 // 3) after loading and analyzing the 1st register of the source string, it can
4287 //    be used to search for every occurrence of the 1st character, saving a few
4288 //    loads compared with a simpler-but-slower implementation
4289 // 4) to avoid lots of push/pop operations, the code below heavily re-uses,
4290 //    re-initializes and compresses register values, which makes the code larger
4291 //    and a bit less readable; however, most of the extra operations are
4292 //    issued during loads or branches, so the penalty is minimal
4293 address generate_string_indexof_linear(bool str1_isL, bool str2_isL) {
4294 const char* stubName = str1_isL
4295 ?
"indexof_linear_ll" : "indexof_linear_ul") 4296 : "indexof_linear_uu"; 4297 __ align(CodeEntryAlignment); 4298 StubCodeMark mark(this, "StubRoutines", stubName); 4299 address entry = __ pc(); 4300 4301 int str1_chr_size = str1_isL ? 1 : 2; 4302 int str2_chr_size = str2_isL ? 1 : 2; 4303 int str1_chr_shift = str1_isL ? 0 : 1; 4304 int str2_chr_shift = str2_isL ? 0 : 1; 4305 bool isL = str1_isL && str2_isL; 4306 // parameters 4307 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4; 4308 // temporary registers 4309 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23; 4310 RegSet spilled_regs = RegSet::range(tmp1, tmp4); 4311 // redefinitions 4312 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3; 4313 4314 __ push(spilled_regs, sp); 4315 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, 4316 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 4317 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 4318 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 4319 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 4320 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 4321 // Read whole register from str1. It is safe, because length >=8 here 4322 __ ldr(ch1, Address(str1)); 4323 // Read whole register from str2. It is safe, because length >=8 here 4324 __ ldr(ch2, Address(str2)); 4325 __ sub(cnt2, cnt2, cnt1); 4326 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF); 4327 if (str1_isL != str2_isL) { 4328 __ eor(v0, __ T16B, v0, v0); 4329 } 4330 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 4331 __ mul(first, first, tmp1); 4332 // check if we have less than 1 register to check 4333 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1); 4334 if (str1_isL != str2_isL) { 4335 __ fmovd(v1, ch1); 4336 } 4337 __ br(__ LE, L_SMALL); 4338 __ eor(ch2, first, ch2); 4339 if (str1_isL != str2_isL) { 4340 __ zip1(v1, __ T16B, v1, v0); 4341 } 4342 __ sub(tmp2, ch2, tmp1); 4343 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4344 __ bics(tmp2, tmp2, ch2); 4345 if (str1_isL != str2_isL) { 4346 __ fmovd(ch1, v1); 4347 } 4348 __ br(__ NE, L_HAS_ZERO); 4349 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 4350 __ add(result, result, wordSize/str2_chr_size); 4351 __ add(str2, str2, wordSize); 4352 __ br(__ LT, L_POST_LOOP); 4353 __ BIND(L_LOOP); 4354 __ ldr(ch2, Address(str2)); 4355 __ eor(ch2, first, ch2); 4356 __ sub(tmp2, ch2, tmp1); 4357 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4358 __ bics(tmp2, tmp2, ch2); 4359 __ br(__ NE, L_HAS_ZERO); 4360 __ BIND(L_LOOP_PROCEED); 4361 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 4362 __ add(str2, str2, wordSize); 4363 __ add(result, result, wordSize/str2_chr_size); 4364 __ br(__ GE, L_LOOP); 4365 __ BIND(L_POST_LOOP); 4366 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check 4367 __ br(__ LE, NOMATCH); 4368 __ ldr(ch2, Address(str2)); 4369 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 4370 __ eor(ch2, first, ch2); 4371 __ sub(tmp2, ch2, tmp1); 4372 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4373 __ mov(tmp4, -1); // all bits set 4374 __ b(L_SMALL_PROCEED); 4375 __ align(OptoLoopAlignment); 4376 __ BIND(L_SMALL); 4377 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 4378 __ eor(ch2, first, ch2); 4379 if (str1_isL != str2_isL) { 4380 __ zip1(v1, __ T16B, v1, v0); 4381 } 4382 __ sub(tmp2, ch2, tmp1); 4383 __ mov(tmp4, -1); // all bits set 4384 __ orr(ch2, ch2, str2_isL ? 
0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4385 if (str1_isL != str2_isL) { 4386 __ fmovd(ch1, v1); // move converted 4 symbols 4387 } 4388 __ BIND(L_SMALL_PROCEED); 4389 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits. 4390 __ bic(tmp2, tmp2, ch2); 4391 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check 4392 __ rbit(tmp2, tmp2); 4393 __ br(__ EQ, NOMATCH); 4394 __ BIND(L_SMALL_HAS_ZERO_LOOP); 4395 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's 4396 __ cmp(cnt1, u1(wordSize/str2_chr_size)); 4397 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2); 4398 if (str2_isL) { // LL 4399 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 4400 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 4401 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 4402 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 4403 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4404 } else { 4405 __ mov(ch2, 0xE); // all bits in byte set except last one 4406 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4407 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4408 __ lslv(tmp2, tmp2, tmp4); 4409 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4410 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4411 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4412 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4413 } 4414 __ cmp(ch1, ch2); 4415 __ mov(tmp4, wordSize/str2_chr_size); 4416 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4417 __ BIND(L_SMALL_CMP_LOOP); 4418 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 4419 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 4420 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 4421 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 4422 __ add(tmp4, tmp4, 1); 4423 __ cmp(tmp4, cnt1); 4424 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP); 4425 __ cmp(first, ch2); 4426 __ br(__ EQ, L_SMALL_CMP_LOOP); 4427 __ BIND(L_SMALL_CMP_LOOP_NOMATCH); 4428 __ cbz(tmp2, NOMATCH); // no more matches. exit 4429 __ clz(tmp4, tmp2); 4430 __ add(result, result, 1); // advance index 4431 __ add(str2, str2, str2_chr_size); // advance pointer 4432 __ b(L_SMALL_HAS_ZERO_LOOP); 4433 __ align(OptoLoopAlignment); 4434 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP); 4435 __ cmp(first, ch2); 4436 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4437 __ b(DONE); 4438 __ align(OptoLoopAlignment); 4439 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2); 4440 if (str2_isL) { // LL 4441 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 4442 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 4443 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 4444 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 4445 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4446 } else { 4447 __ mov(ch2, 0xE); // all bits in byte set except last one 4448 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4449 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
4450 __ lslv(tmp2, tmp2, tmp4); 4451 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4452 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4453 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4454 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4455 } 4456 __ cmp(ch1, ch2); 4457 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4458 __ b(DONE); 4459 __ align(OptoLoopAlignment); 4460 __ BIND(L_HAS_ZERO); 4461 __ rbit(tmp2, tmp2); 4462 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's 4463 // Now, perform compression of counters(cnt2 and cnt1) into one register. 4464 // It's fine because both counters are 32bit and are not changed in this 4465 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop. 4466 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2); 4467 __ sub(result, result, 1); 4468 __ BIND(L_HAS_ZERO_LOOP); 4469 __ mov(cnt1, wordSize/str2_chr_size); 4470 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2); 4471 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare 4472 if (str2_isL) { 4473 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 4474 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4475 __ lslv(tmp2, tmp2, tmp4); 4476 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4477 __ add(tmp4, tmp4, 1); 4478 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4479 __ lsl(tmp2, tmp2, 1); 4480 __ mov(tmp4, wordSize/str2_chr_size); 4481 } else { 4482 __ mov(ch2, 0xE); 4483 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4484 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4485 __ lslv(tmp2, tmp2, tmp4); 4486 __ add(tmp4, tmp4, 1); 4487 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4488 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 4489 __ lsl(tmp2, tmp2, 1); 4490 __ mov(tmp4, wordSize/str2_chr_size); 4491 __ sub(str2, str2, str2_chr_size); 4492 } 4493 __ cmp(ch1, ch2); 4494 __ mov(tmp4, wordSize/str2_chr_size); 4495 __ br(__ NE, L_CMP_LOOP_NOMATCH); 4496 __ BIND(L_CMP_LOOP); 4497 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 4498 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 4499 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 4500 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 4501 __ add(tmp4, tmp4, 1); 4502 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2); 4503 __ br(__ GE, L_CMP_LOOP_LAST_CMP); 4504 __ cmp(cnt1, ch2); 4505 __ br(__ EQ, L_CMP_LOOP); 4506 __ BIND(L_CMP_LOOP_NOMATCH); 4507 // here we're not matched 4508 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop 4509 __ clz(tmp4, tmp2); 4510 __ add(str2, str2, str2_chr_size); // advance pointer 4511 __ b(L_HAS_ZERO_LOOP); 4512 __ align(OptoLoopAlignment); 4513 __ BIND(L_CMP_LOOP_LAST_CMP); 4514 __ cmp(cnt1, ch2); 4515 __ br(__ NE, L_CMP_LOOP_NOMATCH); 4516 __ b(DONE); 4517 __ align(OptoLoopAlignment); 4518 __ BIND(L_CMP_LOOP_LAST_CMP2); 4519 if (str2_isL) { 4520 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 4521 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
4522 __ lslv(tmp2, tmp2, tmp4); 4523 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4524 __ add(tmp4, tmp4, 1); 4525 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4526 __ lsl(tmp2, tmp2, 1); 4527 } else { 4528 __ mov(ch2, 0xE); 4529 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4530 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4531 __ lslv(tmp2, tmp2, tmp4); 4532 __ add(tmp4, tmp4, 1); 4533 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4534 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 4535 __ lsl(tmp2, tmp2, 1); 4536 __ sub(str2, str2, str2_chr_size); 4537 } 4538 __ cmp(ch1, ch2); 4539 __ br(__ NE, L_CMP_LOOP_NOMATCH); 4540 __ b(DONE); 4541 __ align(OptoLoopAlignment); 4542 __ BIND(L_HAS_ZERO_LOOP_NOMATCH); 4543 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until 4544 // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP, 4545 // so, result was increased at max by wordSize/str2_chr_size - 1, so, 4546 // respective high bit wasn't changed. L_LOOP_PROCEED will increase 4547 // result by analyzed characters value, so, we can just reset lower bits 4548 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL 4549 // 2) restore cnt1 and cnt2 values from "compressed" cnt2 4550 // 3) advance str2 value to represent next str2 octet. result & 7/3 is 4551 // index of last analyzed substring inside current octet. So, str2 in at 4552 // respective start address. We need to advance it to next octet 4553 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed 4554 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2); 4555 __ bfm(result, zr, 0, 2 - str2_chr_shift); 4556 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2 4557 __ movw(cnt2, cnt2); 4558 __ b(L_LOOP_PROCEED); 4559 __ align(OptoLoopAlignment); 4560 __ BIND(NOMATCH); 4561 __ mov(result, -1); 4562 __ BIND(DONE); 4563 __ pop(spilled_regs, sp); 4564 __ ret(lr); 4565 return entry; 4566 } 4567 4568 void generate_string_indexof_stubs() { 4569 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true); 4570 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false); 4571 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); 4572 } 4573 4574 void inflate_and_store_2_fp_registers(bool generatePrfm, 4575 FloatRegister src1, FloatRegister src2) { 4576 Register dst = r1; 4577 __ zip1(v1, __ T16B, src1, v0); 4578 __ zip2(v2, __ T16B, src1, v0); 4579 if (generatePrfm) { 4580 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM); 4581 } 4582 __ zip1(v3, __ T16B, src2, v0); 4583 __ zip2(v4, __ T16B, src2, v0); 4584 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64))); 4585 } 4586 4587 // R0 = src 4588 // R1 = dst 4589 // R2 = len 4590 // R3 = len >> 3 4591 // V0 = 0 4592 // v1 = loaded 8 bytes 4593 address generate_large_byte_array_inflate() { 4594 __ align(CodeEntryAlignment); 4595 StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate"); 4596 address entry = __ pc(); 4597 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE; 4598 Register src = r0, dst = r1, len = r2, octetCounter = r3; 4599 const int large_loop_threshold = MAX(64, SoftwarePrefetchHintDistance)/8 + 4; 4600 4601 // do one more 8-byte read to have address 16-byte aligned in most cases 4602 // also use single store instruction 4603 __ ldrd(v2, __ post(src, 8)); 4604 
__ sub(octetCounter, octetCounter, 2); 4605 __ zip1(v1, __ T16B, v1, v0); 4606 __ zip1(v2, __ T16B, v2, v0); 4607 __ st1(v1, v2, __ T16B, __ post(dst, 32)); 4608 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 4609 __ subs(rscratch1, octetCounter, large_loop_threshold); 4610 __ br(__ LE, LOOP_START); 4611 __ b(LOOP_PRFM_START); 4612 __ bind(LOOP_PRFM); 4613 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 4614 __ bind(LOOP_PRFM_START); 4615 __ prfm(Address(src, SoftwarePrefetchHintDistance)); 4616 __ sub(octetCounter, octetCounter, 8); 4617 __ subs(rscratch1, octetCounter, large_loop_threshold); 4618 inflate_and_store_2_fp_registers(true, v3, v4); 4619 inflate_and_store_2_fp_registers(true, v5, v6); 4620 __ br(__ GT, LOOP_PRFM); 4621 __ cmp(octetCounter, (u1)8); 4622 __ br(__ LT, DONE); 4623 __ bind(LOOP); 4624 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64))); 4625 __ bind(LOOP_START); 4626 __ sub(octetCounter, octetCounter, 8); 4627 __ cmp(octetCounter, (u1)8); 4628 inflate_and_store_2_fp_registers(false, v3, v4); 4629 inflate_and_store_2_fp_registers(false, v5, v6); 4630 __ br(__ GE, LOOP); 4631 __ bind(DONE); 4632 __ ret(lr); 4633 return entry; 4634 } 4635 4636 /** 4637 * Arguments: 4638 * 4639 * Input: 4640 * c_rarg0 - current state address 4641 * c_rarg1 - H key address 4642 * c_rarg2 - data address 4643 * c_rarg3 - number of blocks 4644 * 4645 * Output: 4646 * Updated state at c_rarg0 4647 */ 4648 address generate_ghash_processBlocks() { 4649 // Bafflingly, GCM uses little-endian for the byte order, but 4650 // big-endian for the bit order. For example, the polynomial 1 is 4651 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00. 4652 // 4653 // So, we must either reverse the bytes in each word and do 4654 // everything big-endian or reverse the bits in each byte and do 4655 // it little-endian. On AArch64 it's more idiomatic to reverse 4656 // the bits in each byte (we have an instruction, RBIT, to do 4657 // that) and keep the data in little-endian bit order throught the 4658 // calculation, bit-reversing the inputs and outputs. 4659 4660 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks"); 4661 __ align(wordSize * 2); 4662 address p = __ pc(); 4663 __ emit_int64(0x87); // The low-order bits of the field 4664 // polynomial (i.e. 
p = z^7+z^2+z+1) 4665 // repeated in the low and high parts of a 4666 // 128-bit vector 4667 __ emit_int64(0x87); 4668 4669 __ align(CodeEntryAlignment); 4670 address start = __ pc(); 4671 4672 Register state = c_rarg0; 4673 Register subkeyH = c_rarg1; 4674 Register data = c_rarg2; 4675 Register blocks = c_rarg3; 4676 4677 FloatRegister vzr = v30; 4678 __ eor(vzr, __ T16B, vzr, vzr); // zero register 4679 4680 __ ldrq(v0, Address(state)); 4681 __ ldrq(v1, Address(subkeyH)); 4682 4683 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 4684 __ rbit(v0, __ T16B, v0); 4685 __ rev64(v1, __ T16B, v1); 4686 __ rbit(v1, __ T16B, v1); 4687 4688 __ ldrq(v26, p); 4689 4690 __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 4691 __ eor(v16, __ T16B, v16, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 4692 4693 { 4694 Label L_ghash_loop; 4695 __ bind(L_ghash_loop); 4696 4697 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 4698 // reversing each byte 4699 __ rbit(v2, __ T16B, v2); 4700 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 4701 4702 // Multiply state in v2 by subkey in v1 4703 ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 4704 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16, 4705 /*temps*/v6, v20, v18, v21); 4706 // Reduce v7:v5 by the field polynomial 4707 ghash_reduce(v0, v5, v7, v26, vzr, v20); 4708 4709 __ sub(blocks, blocks, 1); 4710 __ cbnz(blocks, L_ghash_loop); 4711 } 4712 4713 // The bit-reversed result is at this point in v0 4714 __ rev64(v1, __ T16B, v0); 4715 __ rbit(v1, __ T16B, v1); 4716 4717 __ st1(v1, __ T16B, state); 4718 __ ret(lr); 4719 4720 return start; 4721 } 4722 4723 // Continuation point for throwing of implicit exceptions that are 4724 // not handled in the current activation. Fabricates an exception 4725 // oop and initiates normal exception dispatching in this 4726 // frame. Since we need to preserve callee-saved values (currently 4727 // only for C2, but done for C1 as well) we need a callee-saved oop 4728 // map and therefore have to make these stubs into RuntimeStubs 4729 // rather than BufferBlobs. If the compiler needs all registers to 4730 // be preserved between the fault point and the exception handler 4731 // then it must assume responsibility for that in 4732 // AbstractCompiler::continuation_for_implicit_null_exception or 4733 // continuation_for_implicit_division_by_zero_exception. All other 4734 // implicit exceptions (e.g., NullPointerException or 4735 // AbstractMethodError on entry) are either at call sites or 4736 // otherwise assume that stack unwinding will be initiated, so 4737 // caller saved registers were assumed volatile in the compiler. 4738 4739 #undef __ 4740 #define __ masm-> 4741 4742 address generate_throw_exception(const char* name, 4743 address runtime_entry, 4744 Register arg1 = noreg, 4745 Register arg2 = noreg) { 4746 // Information about frame layout at time of blocking runtime call. 4747 // Note that we only have to preserve callee-saved registers since 4748 // the compilers are responsible for supplying a continuation point 4749 // if they expect all registers to be preserved. 4750 // n.b. 
aarch64 asserts that frame::arg_reg_save_area_bytes == 0 4751 enum layout { 4752 rfp_off = 0, 4753 rfp_off2, 4754 return_off, 4755 return_off2, 4756 framesize // inclusive of return address 4757 }; 4758 4759 int insts_size = 512; 4760 int locs_size = 64; 4761 4762 CodeBuffer code(name, insts_size, locs_size); 4763 OopMapSet* oop_maps = new OopMapSet(); 4764 MacroAssembler* masm = new MacroAssembler(&code); 4765 4766 address start = __ pc(); 4767 4768 // This is an inlined and slightly modified version of call_VM 4769 // which has the ability to fetch the return PC out of 4770 // thread-local storage and also sets up last_Java_sp slightly 4771 // differently than the real call_VM 4772 4773 __ enter(); // Save FP and LR before call 4774 4775 assert(is_even(framesize/2), "sp not 16-byte aligned"); 4776 4777 // lr and fp are already in place 4778 __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog 4779 4780 int frame_complete = __ pc() - start; 4781 4782 // Set up last_Java_sp and last_Java_fp 4783 address the_pc = __ pc(); 4784 __ set_last_Java_frame(sp, rfp, the_pc, rscratch1); 4785 4786 // Call runtime 4787 if (arg1 != noreg) { 4788 assert(arg2 != c_rarg1, "clobbered"); 4789 __ mov(c_rarg1, arg1); 4790 } 4791 if (arg2 != noreg) { 4792 __ mov(c_rarg2, arg2); 4793 } 4794 __ mov(c_rarg0, rthread); 4795 BLOCK_COMMENT("call runtime_entry"); 4796 __ mov(rscratch1, runtime_entry); 4797 __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1); 4798 4799 // Generate oop map 4800 OopMap* map = new OopMap(framesize, 0); 4801 4802 oop_maps->add_gc_map(the_pc - start, map); 4803 4804 __ reset_last_Java_frame(true); 4805 __ maybe_isb(); 4806 4807 __ leave(); 4808 4809 // check for pending exceptions 4810 #ifdef ASSERT 4811 Label L; 4812 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 4813 __ cbnz(rscratch1, L); 4814 __ should_not_reach_here(); 4815 __ bind(L); 4816 #endif // ASSERT 4817 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 4818 4819 4820 // codeBlob framesize is in words (not VMRegImpl::slot_size) 4821 RuntimeStub* stub = 4822 RuntimeStub::new_runtime_stub(name, 4823 &code, 4824 frame_complete, 4825 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 4826 oop_maps, false); 4827 return stub->entry_point(); 4828 } 4829 4830 class MontgomeryMultiplyGenerator : public MacroAssembler { 4831 4832 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 4833 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 4834 4835 RegSet _toSave; 4836 bool _squaring; 4837 4838 public: 4839 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 4840 : MacroAssembler(as->code()), _squaring(squaring) { 4841 4842 // Register allocation 4843 4844 Register reg = c_rarg0; 4845 Pa_base = reg; // Argument registers 4846 if (squaring) 4847 Pb_base = Pa_base; 4848 else 4849 Pb_base = ++reg; 4850 Pn_base = ++reg; 4851 Rlen= ++reg; 4852 inv = ++reg; 4853 Pm_base = ++reg; 4854 4855 // Working registers: 4856 Ra = ++reg; // The current digit of a, b, n, and m. 4857 Rb = ++reg; 4858 Rm = ++reg; 4859 Rn = ++reg; 4860 4861 Pa = ++reg; // Pointers to the current/next digit of a, b, n, and m. 4862 Pb = ++reg; 4863 Pm = ++reg; 4864 Pn = ++reg; 4865 4866 t0 = ++reg; // Three registers which form a 4867 t1 = ++reg; // triple-precision accumuator. 4868 t2 = ++reg; 4869 4870 Ri = ++reg; // Inner and outer loop indexes. 4871 Rj = ++reg; 4872 4873 Rhi_ab = ++reg; // Product registers: low and high parts 4874 Rlo_ab = ++reg; // of a*b and m*n. 
4875 Rhi_mn = ++reg; 4876 Rlo_mn = ++reg; 4877 4878 // r19 and up are callee-saved. 4879 _toSave = RegSet::range(r19, reg) + Pm_base; 4880 } 4881 4882 private: 4883 void save_regs() { 4884 push(_toSave, sp); 4885 } 4886 4887 void restore_regs() { 4888 pop(_toSave, sp); 4889 } 4890 4891 template <typename T> 4892 void unroll_2(Register count, T block) { 4893 Label loop, end, odd; 4894 tbnz(count, 0, odd); 4895 cbz(count, end); 4896 align(16); 4897 bind(loop); 4898 (this->*block)(); 4899 bind(odd); 4900 (this->*block)(); 4901 subs(count, count, 2); 4902 br(Assembler::GT, loop); 4903 bind(end); 4904 } 4905 4906 template <typename T> 4907 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 4908 Label loop, end, odd; 4909 tbnz(count, 0, odd); 4910 cbz(count, end); 4911 align(16); 4912 bind(loop); 4913 (this->*block)(d, s, tmp); 4914 bind(odd); 4915 (this->*block)(d, s, tmp); 4916 subs(count, count, 2); 4917 br(Assembler::GT, loop); 4918 bind(end); 4919 } 4920 4921 void pre1(RegisterOrConstant i) { 4922 block_comment("pre1"); 4923 // Pa = Pa_base; 4924 // Pb = Pb_base + i; 4925 // Pm = Pm_base; 4926 // Pn = Pn_base + i; 4927 // Ra = *Pa; 4928 // Rb = *Pb; 4929 // Rm = *Pm; 4930 // Rn = *Pn; 4931 ldr(Ra, Address(Pa_base)); 4932 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 4933 ldr(Rm, Address(Pm_base)); 4934 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4935 lea(Pa, Address(Pa_base)); 4936 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 4937 lea(Pm, Address(Pm_base)); 4938 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4939 4940 // Zero the m*n result. 4941 mov(Rhi_mn, zr); 4942 mov(Rlo_mn, zr); 4943 } 4944 4945 // The core multiply-accumulate step of a Montgomery 4946 // multiplication. The idea is to schedule operations as a 4947 // pipeline so that instructions with long latencies (loads and 4948 // multiplies) have time to complete before their results are 4949 // used. This most benefits in-order implementations of the 4950 // architecture but out-of-order ones also benefit. 4951 void step() { 4952 block_comment("step"); 4953 // MACC(Ra, Rb, t0, t1, t2); 4954 // Ra = *++Pa; 4955 // Rb = *--Pb; 4956 umulh(Rhi_ab, Ra, Rb); 4957 mul(Rlo_ab, Ra, Rb); 4958 ldr(Ra, pre(Pa, wordSize)); 4959 ldr(Rb, pre(Pb, -wordSize)); 4960 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 4961 // previous iteration. 
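// For reference, the MACC(x, y, t0, t1, t2) pseudo-op used in these comments
// is a triple-precision multiply-accumulate; in C it is approximately
// (sketch only, not part of the generated code):
//
//   unsigned __int128 p = (unsigned __int128) x * y;       // umulh/mul pair
//   unsigned __int128 s = (unsigned __int128) t0 + (unsigned long) p;
//   t0 = (unsigned long) s;                                 // adds
//   s = (s >> 64) + t1 + (unsigned long) (p >> 64);
//   t1 = (unsigned long) s;                                 // adcs
//   t2 += (unsigned long) (s >> 64);                        // adc
//
// which is what the umulh/mul pair followed by acc() implements.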
4962 // MACC(Rm, Rn, t0, t1, t2); 4963 // Rm = *++Pm; 4964 // Rn = *--Pn; 4965 umulh(Rhi_mn, Rm, Rn); 4966 mul(Rlo_mn, Rm, Rn); 4967 ldr(Rm, pre(Pm, wordSize)); 4968 ldr(Rn, pre(Pn, -wordSize)); 4969 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4970 } 4971 4972 void post1() { 4973 block_comment("post1"); 4974 4975 // MACC(Ra, Rb, t0, t1, t2); 4976 // Ra = *++Pa; 4977 // Rb = *--Pb; 4978 umulh(Rhi_ab, Ra, Rb); 4979 mul(Rlo_ab, Ra, Rb); 4980 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 4981 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4982 4983 // *Pm = Rm = t0 * inv; 4984 mul(Rm, t0, inv); 4985 str(Rm, Address(Pm)); 4986 4987 // MACC(Rm, Rn, t0, t1, t2); 4988 // t0 = t1; t1 = t2; t2 = 0; 4989 umulh(Rhi_mn, Rm, Rn); 4990 4991 #ifndef PRODUCT 4992 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 4993 { 4994 mul(Rlo_mn, Rm, Rn); 4995 add(Rlo_mn, t0, Rlo_mn); 4996 Label ok; 4997 cbz(Rlo_mn, ok); { 4998 stop("broken Montgomery multiply"); 4999 } bind(ok); 5000 } 5001 #endif 5002 // We have very carefully set things up so that 5003 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 5004 // the lower half of Rm * Rn because we know the result already: 5005 // it must be -t0. t0 + (-t0) must generate a carry iff 5006 // t0 != 0. So, rather than do a mul and an adds we just set 5007 // the carry flag iff t0 is nonzero. 5008 // 5009 // mul(Rlo_mn, Rm, Rn); 5010 // adds(zr, t0, Rlo_mn); 5011 subs(zr, t0, 1); // Set carry iff t0 is nonzero 5012 adcs(t0, t1, Rhi_mn); 5013 adc(t1, t2, zr); 5014 mov(t2, zr); 5015 } 5016 5017 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 5018 block_comment("pre2"); 5019 // Pa = Pa_base + i-len; 5020 // Pb = Pb_base + len; 5021 // Pm = Pm_base + i-len; 5022 // Pn = Pn_base + len; 5023 5024 if (i.is_register()) { 5025 sub(Rj, i.as_register(), len); 5026 } else { 5027 mov(Rj, i.as_constant()); 5028 sub(Rj, Rj, len); 5029 } 5030 // Rj == i-len 5031 5032 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 5033 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 5034 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 5035 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 5036 5037 // Ra = *++Pa; 5038 // Rb = *--Pb; 5039 // Rm = *++Pm; 5040 // Rn = *--Pn; 5041 ldr(Ra, pre(Pa, wordSize)); 5042 ldr(Rb, pre(Pb, -wordSize)); 5043 ldr(Rm, pre(Pm, wordSize)); 5044 ldr(Rn, pre(Pn, -wordSize)); 5045 5046 mov(Rhi_mn, zr); 5047 mov(Rlo_mn, zr); 5048 } 5049 5050 void post2(RegisterOrConstant i, RegisterOrConstant len) { 5051 block_comment("post2"); 5052 if (i.is_constant()) { 5053 mov(Rj, i.as_constant()-len.as_constant()); 5054 } else { 5055 sub(Rj, i.as_register(), len); 5056 } 5057 5058 adds(t0, t0, Rlo_mn); // The pending m*n, low part 5059 5060 // As soon as we know the least significant digit of our result, 5061 // store it. 5062 // Pm_base[i-len] = t0; 5063 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 5064 5065 // t0 = t1; t1 = t2; t2 = 0; 5066 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 5067 adc(t1, t2, zr); 5068 mov(t2, zr); 5069 } 5070 5071 // A carry in t0 after Montgomery multiplication means that we 5072 // should subtract multiples of n from our result in m. We'll 5073 // keep doing that until there is no carry. 
5074 void normalize(RegisterOrConstant len) { 5075 block_comment("normalize"); 5076 // while (t0) 5077 // t0 = sub(Pm_base, Pn_base, t0, len); 5078 Label loop, post, again; 5079 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 5080 cbz(t0, post); { 5081 bind(again); { 5082 mov(i, zr); 5083 mov(cnt, len); 5084 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 5085 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 5086 subs(zr, zr, zr); // set carry flag, i.e. no borrow 5087 align(16); 5088 bind(loop); { 5089 sbcs(Rm, Rm, Rn); 5090 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 5091 add(i, i, 1); 5092 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 5093 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 5094 sub(cnt, cnt, 1); 5095 } cbnz(cnt, loop); 5096 sbc(t0, t0, zr); 5097 } cbnz(t0, again); 5098 } bind(post); 5099 } 5100 5101 // Move memory at s to d, reversing words. 5102 // Increments d to end of copied memory 5103 // Destroys tmp1, tmp2 5104 // Preserves len 5105 // Leaves s pointing to the address which was in d at start 5106 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 5107 assert(tmp1 < r19 && tmp2 < r19, "register corruption"); 5108 5109 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 5110 mov(tmp1, len); 5111 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 5112 sub(s, d, len, ext::uxtw, LogBytesPerWord); 5113 } 5114 // where 5115 void reverse1(Register d, Register s, Register tmp) { 5116 ldr(tmp, pre(s, -wordSize)); 5117 ror(tmp, tmp, 32); 5118 str(tmp, post(d, wordSize)); 5119 } 5120 5121 void step_squaring() { 5122 // An extra ACC 5123 step(); 5124 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 5125 } 5126 5127 void last_squaring(RegisterOrConstant i) { 5128 Label dont; 5129 // if ((i & 1) == 0) { 5130 tbnz(i.as_register(), 0, dont); { 5131 // MACC(Ra, Rb, t0, t1, t2); 5132 // Ra = *++Pa; 5133 // Rb = *--Pb; 5134 umulh(Rhi_ab, Ra, Rb); 5135 mul(Rlo_ab, Ra, Rb); 5136 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 5137 } bind(dont); 5138 } 5139 5140 void extra_step_squaring() { 5141 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 5142 5143 // MACC(Rm, Rn, t0, t1, t2); 5144 // Rm = *++Pm; 5145 // Rn = *--Pn; 5146 umulh(Rhi_mn, Rm, Rn); 5147 mul(Rlo_mn, Rm, Rn); 5148 ldr(Rm, pre(Pm, wordSize)); 5149 ldr(Rn, pre(Pn, -wordSize)); 5150 } 5151 5152 void post1_squaring() { 5153 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 5154 5155 // *Pm = Rm = t0 * inv; 5156 mul(Rm, t0, inv); 5157 str(Rm, Address(Pm)); 5158 5159 // MACC(Rm, Rn, t0, t1, t2); 5160 // t0 = t1; t1 = t2; t2 = 0; 5161 umulh(Rhi_mn, Rm, Rn); 5162 5163 #ifndef PRODUCT 5164 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 5165 { 5166 mul(Rlo_mn, Rm, Rn); 5167 add(Rlo_mn, t0, Rlo_mn); 5168 Label ok; 5169 cbz(Rlo_mn, ok); { 5170 stop("broken Montgomery multiply"); 5171 } bind(ok); 5172 } 5173 #endif 5174 // We have very carefully set things up so that 5175 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 5176 // the lower half of Rm * Rn because we know the result already: 5177 // it must be -t0. t0 + (-t0) must generate a carry iff 5178 // t0 != 0. So, rather than do a mul and an adds we just set 5179 // the carry flag iff t0 is nonzero. 
5180 // 5181 // mul(Rlo_mn, Rm, Rn); 5182 // adds(zr, t0, Rlo_mn); 5183 subs(zr, t0, 1); // Set carry iff t0 is nonzero 5184 adcs(t0, t1, Rhi_mn); 5185 adc(t1, t2, zr); 5186 mov(t2, zr); 5187 } 5188 5189 void acc(Register Rhi, Register Rlo, 5190 Register t0, Register t1, Register t2) { 5191 adds(t0, t0, Rlo); 5192 adcs(t1, t1, Rhi); 5193 adc(t2, t2, zr); 5194 } 5195 5196 public: 5197 /** 5198 * Fast Montgomery multiplication. The derivation of the 5199 * algorithm is in A Cryptographic Library for the Motorola 5200 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 5201 * 5202 * Arguments: 5203 * 5204 * Inputs for multiplication: 5205 * c_rarg0 - int array elements a 5206 * c_rarg1 - int array elements b 5207 * c_rarg2 - int array elements n (the modulus) 5208 * c_rarg3 - int length 5209 * c_rarg4 - int inv 5210 * c_rarg5 - int array elements m (the result) 5211 * 5212 * Inputs for squaring: 5213 * c_rarg0 - int array elements a 5214 * c_rarg1 - int array elements n (the modulus) 5215 * c_rarg2 - int length 5216 * c_rarg3 - int inv 5217 * c_rarg4 - int array elements m (the result) 5218 * 5219 */ 5220 address generate_multiply() { 5221 Label argh, nothing; 5222 bind(argh); 5223 stop("MontgomeryMultiply total_allocation must be <= 8192"); 5224 5225 align(CodeEntryAlignment); 5226 address entry = pc(); 5227 5228 cbzw(Rlen, nothing); 5229 5230 enter(); 5231 5232 // Make room. 5233 cmpw(Rlen, 512); 5234 br(Assembler::HI, argh); 5235 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 5236 andr(sp, Ra, -2 * wordSize); 5237 5238 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 5239 5240 { 5241 // Copy input args, reversing as we go. We use Ra as a 5242 // temporary variable. 5243 reverse(Ra, Pa_base, Rlen, t0, t1); 5244 if (!_squaring) 5245 reverse(Ra, Pb_base, Rlen, t0, t1); 5246 reverse(Ra, Pn_base, Rlen, t0, t1); 5247 } 5248 5249 // Push all call-saved registers and also Pm_base which we'll need 5250 // at the end. 
5251 save_regs(); 5252 5253 #ifndef PRODUCT 5254 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 5255 { 5256 ldr(Rn, Address(Pn_base, 0)); 5257 mul(Rlo_mn, Rn, inv); 5258 subs(zr, Rlo_mn, -1); 5259 Label ok; 5260 br(EQ, ok); { 5261 stop("broken inverse in Montgomery multiply"); 5262 } bind(ok); 5263 } 5264 #endif 5265 5266 mov(Pm_base, Ra); 5267 5268 mov(t0, zr); 5269 mov(t1, zr); 5270 mov(t2, zr); 5271 5272 block_comment("for (int i = 0; i < len; i++) {"); 5273 mov(Ri, zr); { 5274 Label loop, end; 5275 cmpw(Ri, Rlen); 5276 br(Assembler::GE, end); 5277 5278 bind(loop); 5279 pre1(Ri); 5280 5281 block_comment(" for (j = i; j; j--) {"); { 5282 movw(Rj, Ri); 5283 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 5284 } block_comment(" } // j"); 5285 5286 post1(); 5287 addw(Ri, Ri, 1); 5288 cmpw(Ri, Rlen); 5289 br(Assembler::LT, loop); 5290 bind(end); 5291 block_comment("} // i"); 5292 } 5293 5294 block_comment("for (int i = len; i < 2*len; i++) {"); 5295 mov(Ri, Rlen); { 5296 Label loop, end; 5297 cmpw(Ri, Rlen, Assembler::LSL, 1); 5298 br(Assembler::GE, end); 5299 5300 bind(loop); 5301 pre2(Ri, Rlen); 5302 5303 block_comment(" for (j = len*2-i-1; j; j--) {"); { 5304 lslw(Rj, Rlen, 1); 5305 subw(Rj, Rj, Ri); 5306 subw(Rj, Rj, 1); 5307 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 5308 } block_comment(" } // j"); 5309 5310 post2(Ri, Rlen); 5311 addw(Ri, Ri, 1); 5312 cmpw(Ri, Rlen, Assembler::LSL, 1); 5313 br(Assembler::LT, loop); 5314 bind(end); 5315 } 5316 block_comment("} // i"); 5317 5318 normalize(Rlen); 5319 5320 mov(Ra, Pm_base); // Save Pm_base in Ra 5321 restore_regs(); // Restore caller's Pm_base 5322 5323 // Copy our result into caller's Pm_base 5324 reverse(Pm_base, Ra, Rlen, t0, t1); 5325 5326 leave(); 5327 bind(nothing); 5328 ret(lr); 5329 5330 return entry; 5331 } 5332 // In C, approximately: 5333 5334 // void 5335 // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[], 5336 // unsigned long Pn_base[], unsigned long Pm_base[], 5337 // unsigned long inv, int len) { 5338 // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 5339 // unsigned long *Pa, *Pb, *Pn, *Pm; 5340 // unsigned long Ra, Rb, Rn, Rm; 5341 5342 // int i; 5343 5344 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 5345 5346 // for (i = 0; i < len; i++) { 5347 // int j; 5348 5349 // Pa = Pa_base; 5350 // Pb = Pb_base + i; 5351 // Pm = Pm_base; 5352 // Pn = Pn_base + i; 5353 5354 // Ra = *Pa; 5355 // Rb = *Pb; 5356 // Rm = *Pm; 5357 // Rn = *Pn; 5358 5359 // int iters = i; 5360 // for (j = 0; iters--; j++) { 5361 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 5362 // MACC(Ra, Rb, t0, t1, t2); 5363 // Ra = *++Pa; 5364 // Rb = *--Pb; 5365 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5366 // MACC(Rm, Rn, t0, t1, t2); 5367 // Rm = *++Pm; 5368 // Rn = *--Pn; 5369 // } 5370 5371 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 5372 // MACC(Ra, Rb, t0, t1, t2); 5373 // *Pm = Rm = t0 * inv; 5374 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 5375 // MACC(Rm, Rn, t0, t1, t2); 5376 5377 // assert(t0 == 0, "broken Montgomery multiply"); 5378 5379 // t0 = t1; t1 = t2; t2 = 0; 5380 // } 5381 5382 // for (i = len; i < 2*len; i++) { 5383 // int j; 5384 5385 // Pa = Pa_base + i-len; 5386 // Pb = Pb_base + len; 5387 // Pm = Pm_base + i-len; 5388 // Pn = Pn_base + len; 5389 5390 // Ra = *++Pa; 5391 // Rb = *--Pb; 5392 // Rm = *++Pm; 5393 // Rn = *--Pn; 5394 5395 // int iters = len*2-i-1; 
5396 // for (j = i-len+1; iters--; j++) { 5397 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 5398 // MACC(Ra, Rb, t0, t1, t2); 5399 // Ra = *++Pa; 5400 // Rb = *--Pb; 5401 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5402 // MACC(Rm, Rn, t0, t1, t2); 5403 // Rm = *++Pm; 5404 // Rn = *--Pn; 5405 // } 5406 5407 // Pm_base[i-len] = t0; 5408 // t0 = t1; t1 = t2; t2 = 0; 5409 // } 5410 5411 // while (t0) 5412 // t0 = sub(Pm_base, Pn_base, t0, len); 5413 // } 5414 5415 /** 5416 * Fast Montgomery squaring. This uses asymptotically 25% fewer 5417 * multiplies than Montgomery multiplication so it should be up to 5418 * 25% faster. However, its loop control is more complex and it 5419 * may actually run slower on some machines. 5420 * 5421 * Arguments: 5422 * 5423 * Inputs: 5424 * c_rarg0 - int array elements a 5425 * c_rarg1 - int array elements n (the modulus) 5426 * c_rarg2 - int length 5427 * c_rarg3 - int inv 5428 * c_rarg4 - int array elements m (the result) 5429 * 5430 */ 5431 address generate_square() { 5432 Label argh; 5433 bind(argh); 5434 stop("MontgomeryMultiply total_allocation must be <= 8192"); 5435 5436 align(CodeEntryAlignment); 5437 address entry = pc(); 5438 5439 enter(); 5440 5441 // Make room. 5442 cmpw(Rlen, 512); 5443 br(Assembler::HI, argh); 5444 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 5445 andr(sp, Ra, -2 * wordSize); 5446 5447 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 5448 5449 { 5450 // Copy input args, reversing as we go. We use Ra as a 5451 // temporary variable. 5452 reverse(Ra, Pa_base, Rlen, t0, t1); 5453 reverse(Ra, Pn_base, Rlen, t0, t1); 5454 } 5455 5456 // Push all call-saved registers and also Pm_base which we'll need 5457 // at the end. 5458 save_regs(); 5459 5460 mov(Pm_base, Ra); 5461 5462 mov(t0, zr); 5463 mov(t1, zr); 5464 mov(t2, zr); 5465 5466 block_comment("for (int i = 0; i < len; i++) {"); 5467 mov(Ri, zr); { 5468 Label loop, end; 5469 bind(loop); 5470 cmp(Ri, Rlen); 5471 br(Assembler::GE, end); 5472 5473 pre1(Ri); 5474 5475 block_comment("for (j = (i+1)/2; j; j--) {"); { 5476 add(Rj, Ri, 1); 5477 lsr(Rj, Rj, 1); 5478 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 5479 } block_comment(" } // j"); 5480 5481 last_squaring(Ri); 5482 5483 block_comment(" for (j = i/2; j; j--) {"); { 5484 lsr(Rj, Ri, 1); 5485 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 5486 } block_comment(" } // j"); 5487 5488 post1_squaring(); 5489 add(Ri, Ri, 1); 5490 cmp(Ri, Rlen); 5491 br(Assembler::LT, loop); 5492 5493 bind(end); 5494 block_comment("} // i"); 5495 } 5496 5497 block_comment("for (int i = len; i < 2*len; i++) {"); 5498 mov(Ri, Rlen); { 5499 Label loop, end; 5500 bind(loop); 5501 cmp(Ri, Rlen, Assembler::LSL, 1); 5502 br(Assembler::GE, end); 5503 5504 pre2(Ri, Rlen); 5505 5506 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 5507 lsl(Rj, Rlen, 1); 5508 sub(Rj, Rj, Ri); 5509 sub(Rj, Rj, 1); 5510 lsr(Rj, Rj, 1); 5511 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 5512 } block_comment(" } // j"); 5513 5514 last_squaring(Ri); 5515 5516 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 5517 lsl(Rj, Rlen, 1); 5518 sub(Rj, Rj, Ri); 5519 lsr(Rj, Rj, 1); 5520 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 5521 } block_comment(" } // j"); 5522 5523 post2(Ri, Rlen); 5524 add(Ri, Ri, 1); 5525 cmp(Ri, Rlen, Assembler::LSL, 1); 5526 5527 br(Assembler::LT, loop); 5528 bind(end); 5529 block_comment("} // i"); 5530 } 5531 5532 normalize(Rlen); 5533 5534 
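    // At this point the carry has been normalized away and the result,
    // still in the reversed longword form used internally, sits in the
    // on-stack scratch area addressed by Pm_base.  The epilogue below
    // saves that pointer, restores the caller's Pm_base, and copies the
    // result back out through reverse(), undoing the word reversal
    // performed on entry.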
    mov(Ra, Pm_base);  // Save Pm_base in Ra
    restore_regs();    // Restore caller's Pm_base

    // Copy our result into caller's Pm_base
    reverse(Pm_base, Ra, Rlen, t0, t1);

    leave();
    ret(lr);

    return entry;
  }
  // In C, approximately:

  // void
  // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[],
  //                   unsigned long Pm_base[], unsigned long inv, int len) {
  //   unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator
  //   unsigned long *Pa, *Pb, *Pn, *Pm;
  //   unsigned long Ra, Rb, Rn, Rm;

  //   int i;

  //   assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply");

  //   for (i = 0; i < len; i++) {
  //     int j;

  //     Pa = Pa_base;
  //     Pb = Pa_base + i;
  //     Pm = Pm_base;
  //     Pn = Pn_base + i;

  //     Ra = *Pa;
  //     Rb = *Pb;
  //     Rm = *Pm;
  //     Rn = *Pn;

  //     int iters = (i+1)/2;
  //     for (j = 0; iters--; j++) {
  //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
  //       MACC2(Ra, Rb, t0, t1, t2);
  //       Ra = *++Pa;
  //       Rb = *--Pb;
  //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
  //       MACC(Rm, Rn, t0, t1, t2);
  //       Rm = *++Pm;
  //       Rn = *--Pn;
  //     }
  //     if ((i & 1) == 0) {
  //       assert(Ra == Pa_base[j], "must be");
  //       MACC(Ra, Ra, t0, t1, t2);
  //     }
  //     iters = i/2;
  //     assert(iters == i-j, "must be");
  //     for (; iters--; j++) {
  //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
  //       MACC(Rm, Rn, t0, t1, t2);
  //       Rm = *++Pm;
  //       Rn = *--Pn;
  //     }

  //     *Pm = Rm = t0 * inv;
  //     assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be");
  //     MACC(Rm, Rn, t0, t1, t2);

  //     assert(t0 == 0, "broken Montgomery multiply");

  //     t0 = t1; t1 = t2; t2 = 0;
  //   }

  //   for (i = len; i < 2*len; i++) {
  //     int start = i-len+1;
  //     int end = start + (len - start)/2;
  //     int j;

  //     Pa = Pa_base + i-len;
  //     Pb = Pa_base + len;
  //     Pm = Pm_base + i-len;
  //     Pn = Pn_base + len;

  //     Ra = *++Pa;
  //     Rb = *--Pb;
  //     Rm = *++Pm;
  //     Rn = *--Pn;

  //     int iters = (2*len-i-1)/2;
  //     assert(iters == end-start, "must be");
  //     for (j = start; iters--; j++) {
  //       assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be");
  //       MACC2(Ra, Rb, t0, t1, t2);
  //       Ra = *++Pa;
  //       Rb = *--Pb;
  //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
  //       MACC(Rm, Rn, t0, t1, t2);
  //       Rm = *++Pm;
  //       Rn = *--Pn;
  //     }
  //     if ((i & 1) == 0) {
  //       assert(Ra == Pa_base[j], "must be");
  //       MACC(Ra, Ra, t0, t1, t2);
  //     }
  //     iters = (2*len-i)/2;
  //     assert(iters == len-j, "must be");
  //     for (; iters--; j++) {
  //       assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be");
  //       MACC(Rm, Rn, t0, t1, t2);
  //       Rm = *++Pm;
  //       Rn = *--Pn;
  //     }
  //     Pm_base[i-len] = t0;
  //     t0 = t1; t1 = t2; t2 = 0;
  //   }

  //   while (t0)
  //     t0 = sub(Pm_base, Pn_base, t0, len);
  // }
  };
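  // The pseudocode in MontgomeryMultiplyGenerator uses two helpers, MACC and
  // MACC2, which accumulate a 64x64->128 bit product (respectively twice that
  // product) into the triple-precision accumulator t2:t1:t0.  As an
  // illustrative sketch only (assuming a compiler with unsigned __int128,
  // and not part of the generated stubs) they could be written as:
  //
  //   static void MACC(unsigned long a, unsigned long b,
  //                    unsigned long &t0, unsigned long &t1, unsigned long &t2) {
  //     unsigned __int128 p = (unsigned __int128)a * b + t0;
  //     t0 = (unsigned long)p;
  //     p = (p >> 64) + t1;
  //     t1 = (unsigned long)p;
  //     t2 += (unsigned long)(p >> 64);
  //   }
  //
  //   static void MACC2(unsigned long a, unsigned long b,
  //                     unsigned long &t0, unsigned long &t1, unsigned long &t2) {
  //     MACC(a, b, t0, t1, t2);   // 2*a*b may not fit in 128 bits,
  //     MACC(a, b, t0, t1, t2);   // so simply accumulate the product twice
  //   }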
  // Initialization
  void generate_initial() {
    // Generate initial stubs and initialize the entry points

    // Entry points that exist in all platforms.  Note: This is code
    // that could be shared among different platforms; however, the
    // benefit seems to be smaller than the disadvantage of having a
    // much more complicated generator structure.  See also the comment
    // in stubRoutines.hpp.

    StubRoutines::_forward_exception_entry = generate_forward_exception();

    StubRoutines::_call_stub_entry =
      generate_call_stub(StubRoutines::_call_stub_return_address);

    // is referenced by a megamorphic call
    StubRoutines::_catch_exception_entry = generate_catch_exception();

    // Build this early so it's available for the interpreter.
    StubRoutines::_throw_StackOverflowError_entry =
      generate_throw_exception("StackOverflowError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::throw_StackOverflowError));
    StubRoutines::_throw_delayed_StackOverflowError_entry =
      generate_throw_exception("delayed StackOverflowError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::throw_delayed_StackOverflowError));
    if (UseCRC32Intrinsics) {
      // set table address before stub generation which uses it
      StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table;
      StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32();
    }

    if (UseCRC32CIntrinsics) {
      StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C();
    }

    // Disabled until JDK-8210858 is fixed
    // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) {
    //   StubRoutines::_dlog = generate_dlog();
    // }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) {
      StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false);
    }

    if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) {
      StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true);
    }
  }

  void generate_all() {
    // support for verify_oop (must happen after universe_init)
    StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop();
    StubRoutines::_throw_AbstractMethodError_entry =
      generate_throw_exception("AbstractMethodError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::throw_AbstractMethodError));

    StubRoutines::_throw_IncompatibleClassChangeError_entry =
      generate_throw_exception("IncompatibleClassChangeError throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::throw_IncompatibleClassChangeError));

    StubRoutines::_throw_NullPointerException_at_call_entry =
      generate_throw_exception("NullPointerException at call throw_exception",
                               CAST_FROM_FN_PTR(address,
                                                SharedRuntime::throw_NullPointerException_at_call));

    // arraycopy stubs used by compilers
    generate_arraycopy_stubs();

    // has negatives stub for large arrays.
    StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long);

    // array equals stub for large arrays.
    if (!UseSimpleArrayEquals) {
      StubRoutines::aarch64::_large_array_equals = generate_large_array_equals();
    }

    generate_compare_long_strings();

    generate_string_indexof_stubs();

    // byte_array_inflate stub for large arrays.
5741 StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate(); 5742 5743 #ifdef COMPILER2 5744 if (UseMultiplyToLenIntrinsic) { 5745 StubRoutines::_multiplyToLen = generate_multiplyToLen(); 5746 } 5747 5748 if (UseSquareToLenIntrinsic) { 5749 StubRoutines::_squareToLen = generate_squareToLen(); 5750 } 5751 5752 if (UseMulAddIntrinsic) { 5753 StubRoutines::_mulAdd = generate_mulAdd(); 5754 } 5755 5756 if (UseMontgomeryMultiplyIntrinsic) { 5757 StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply"); 5758 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false); 5759 StubRoutines::_montgomeryMultiply = g.generate_multiply(); 5760 } 5761 5762 if (UseMontgomerySquareIntrinsic) { 5763 StubCodeMark mark(this, "StubRoutines", "montgomerySquare"); 5764 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true); 5765 // We use generate_multiply() rather than generate_square() 5766 // because it's faster for the sizes of modulus we care about. 5767 StubRoutines::_montgomerySquare = g.generate_multiply(); 5768 } 5769 #endif // COMPILER2 5770 5771 #ifndef BUILTIN_SIM 5772 // generate GHASH intrinsics code 5773 if (UseGHASHIntrinsics) { 5774 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); 5775 } 5776 5777 if (UseAESIntrinsics) { 5778 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 5779 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 5780 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 5781 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 5782 } 5783 5784 if (UseSHA1Intrinsics) { 5785 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress"); 5786 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB"); 5787 } 5788 if (UseSHA256Intrinsics) { 5789 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress"); 5790 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB"); 5791 } 5792 5793 // generate Adler32 intrinsics code 5794 if (UseAdler32Intrinsics) { 5795 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); 5796 } 5797 5798 // Safefetch stubs. 5799 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry, 5800 &StubRoutines::_safefetch32_fault_pc, 5801 &StubRoutines::_safefetch32_continuation_pc); 5802 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry, 5803 &StubRoutines::_safefetchN_fault_pc, 5804 &StubRoutines::_safefetchN_continuation_pc); 5805 #endif 5806 StubRoutines::aarch64::set_completed(); 5807 } 5808 5809 public: 5810 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) { 5811 if (all) { 5812 generate_all(); 5813 } else { 5814 generate_initial(); 5815 } 5816 } 5817 }; // end class declaration 5818 5819 void StubGenerator_generate(CodeBuffer* code, bool all) { 5820 StubGenerator g(code, all); 5821 }
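
// StubGenerator_generate is called twice during VM startup: once early on
// with all == false, so that the call stub and the StackOverflowError
// throwers exist before the interpreter is generated, and once after
// universe initialization with all == true for everything else.  A rough
// sketch of the callers (see runtime/stubRoutines.cpp for the real code):
//
//   void stubRoutines_init1() { StubRoutines::initialize1(); }  // -> StubGenerator_generate(code, false)
//   void stubRoutines_init2() { StubRoutines::initialize2(); }  // -> StubGenerator_generate(code, true)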