/*
 * Copyright (c) 2003, 2019, Oracle and/or its affiliates. All rights reserved.
 * Copyright (c) 2014, 2019, Red Hat Inc. All rights reserved.
 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
 *
 * This code is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 only, as
 * published by the Free Software Foundation.
 *
 * This code is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
 * version 2 for more details (a copy is included in the LICENSE file that
 * accompanied this code).
 *
 * You should have received a copy of the GNU General Public License version
 * 2 along with this work; if not, write to the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA.
 *
 * Please contact Oracle, 500 Oracle Parkway, Redwood Shores, CA 94065 USA
 * or visit www.oracle.com if you need additional information or have any
 * questions.
 *
 */

#include "precompiled.hpp"
#include "asm/macroAssembler.hpp"
#include "asm/macroAssembler.inline.hpp"
#include "gc/shared/barrierSet.hpp"
#include "gc/shared/barrierSetAssembler.hpp"
#include "interpreter/interpreter.hpp"
#include "memory/universe.hpp"
#include "nativeInst_aarch64.hpp"
#include "oops/instanceOop.hpp"
#include "oops/method.hpp"
#include "oops/objArrayKlass.hpp"
#include "oops/oop.inline.hpp"
#include "prims/methodHandles.hpp"
#include "runtime/frame.inline.hpp"
#include "runtime/handles.inline.hpp"
#include "runtime/sharedRuntime.hpp"
#include "runtime/stubCodeGenerator.hpp"
#include "runtime/stubRoutines.hpp"
#include "runtime/thread.inline.hpp"
#include "utilities/align.hpp"
#ifdef COMPILER2
#include "opto/runtime.hpp"
#endif
#if INCLUDE_ZGC
#include "gc/z/zThreadLocalData.hpp"
#endif

#ifdef BUILTIN_SIM
#include "../../../../../../simulator/simulator.hpp"
#endif

// Declaration and definition of StubGenerator (no .hpp file).
// For a more detailed description of the stub routine structure
// see the comment in stubRoutines.hpp

#undef __
#define __ _masm->
#define TIMES_OOP Address::sxtw(exact_log2(UseCompressedOops ? 4 : 8))

#ifdef PRODUCT
#define BLOCK_COMMENT(str) /* nothing */
#else
#define BLOCK_COMMENT(str) __ block_comment(str)
#endif

#define BIND(label) bind(label); BLOCK_COMMENT(#label ":")

// Stub Code definitions

class StubGenerator: public StubCodeGenerator {
 private:

#ifdef PRODUCT
#define inc_counter_np(counter) ((void)0)
#else
  void inc_counter_np_(int& counter) {
    __ lea(rscratch2, ExternalAddress((address)&counter));
    __ ldrw(rscratch1, Address(rscratch2));
    __ addw(rscratch1, rscratch1, 1);
    __ strw(rscratch1, Address(rscratch2));
  }
#define inc_counter_np(counter) \
  BLOCK_COMMENT("inc_counter " #counter); \
  inc_counter_np_(counter);
#endif

  // Call stubs are used to call Java from C
  //
  // Arguments:
  //    c_rarg0:   call wrapper address                   address
  //    c_rarg1:   result                                 address
  //    c_rarg2:   result type                            BasicType
  //    c_rarg3:   method                                 Method*
  //    c_rarg4:   (interpreter) entry point              address
  //    c_rarg5:   parameters                             intptr_t*
  //    c_rarg6:   parameter size (in words)              int
  //    c_rarg7:   thread                                 Thread*
  //
  // There is no return from the stub itself as any Java result
  // is written to result
  //
  // we save r30 (lr) as the return PC at the base of the frame and
  // link r29 (fp) below it as the frame pointer installing sp (r31)
  // into fp.
  //
  // we save r0-r7, which accounts for all the c arguments.
  //
  // TODO: strictly do we need to save them all? they are treated as
  // volatile by C so could we omit saving the ones we are going to
  // place in global registers (thread? method?) or those we only use
  // during setup of the Java call?
  //
  // we don't need to save r8 which C uses as an indirect result location
  // return register.
  //
  // we don't need to save r9-r15 which both C and Java treat as
  // volatile
  //
  // we don't need to save r16-r18 because Java does not use them
  //
  // we save r19-r28 which Java uses as scratch registers and C
  // expects to be callee-save
  //
  // we save the bottom 64 bits of each value stored in v8-v15; it is
  // the responsibility of the caller to preserve larger values.
  //
  // so the stub frame looks like this when we enter Java code
  //
  //     [ return_from_Java     ] <--- sp
  //     [ argument word n      ]
  //      ...
  // -27 [ argument word 1      ]
  // -26 [ saved v15            ] <--- sp_after_call
  // -25 [ saved v14            ]
  // -24 [ saved v13            ]
  // -23 [ saved v12            ]
  // -22 [ saved v11            ]
  // -21 [ saved v10            ]
  // -20 [ saved v9             ]
  // -19 [ saved v8             ]
  // -18 [ saved r28            ]
  // -17 [ saved r27            ]
  // -16 [ saved r26            ]
  // -15 [ saved r25            ]
  // -14 [ saved r24            ]
  // -13 [ saved r23            ]
  // -12 [ saved r22            ]
  // -11 [ saved r21            ]
  // -10 [ saved r20            ]
  //  -9 [ saved r19            ]
  //  -8 [ call wrapper    (r0) ]
  //  -7 [ result          (r1) ]
  //  -6 [ result type     (r2) ]
  //  -5 [ method          (r3) ]
  //  -4 [ entry point     (r4) ]
  //  -3 [ parameters      (r5) ]
  //  -2 [ parameter size  (r6) ]
  //  -1 [ thread          (r7) ]
  //   0 [ saved fp       (r29) ] <--- fp == saved sp (r31)
  //   1 [ saved lr       (r30) ]

  // Call stub stack layout word offsets from fp
  enum call_stub_layout {
    sp_after_call_off  = -26,

    d15_off            = -26,
    d13_off            = -24,
    d11_off            = -22,
    d9_off             = -20,

    r28_off            = -18,
    r26_off            = -16,
    r24_off            = -14,
    r22_off            = -12,
    r20_off            = -10,
    call_wrapper_off   =  -8,
    result_off         =  -7,
    result_type_off    =  -6,
    method_off         =  -5,
    entry_point_off    =  -4,
    parameter_size_off =  -2,
    thread_off         =  -1,
    fp_f               =   0,
    retaddr_off        =   1,
  };

  address generate_call_stub(address& return_address) {
    assert((int)frame::entry_frame_after_call_words == -(int)sp_after_call_off + 1 &&
           (int)frame::entry_frame_call_wrapper_offset == (int)call_wrapper_off,
           "adjust this code");

    StubCodeMark mark(this, "StubRoutines", "call_stub");
    address start = __ pc();

    const Address sp_after_call(rfp, sp_after_call_off * wordSize);

    const Address call_wrapper  (rfp, call_wrapper_off   * wordSize);
    const Address result        (rfp, result_off         * wordSize);
    const Address result_type   (rfp, result_type_off    * wordSize);
    const Address method        (rfp, method_off         * wordSize);
    const Address entry_point   (rfp, entry_point_off    * wordSize);
    const Address parameter_size(rfp, parameter_size_off * wordSize);

    const Address thread        (rfp, thread_off         * wordSize);

    const Address d15_save      (rfp, d15_off * wordSize);
    const Address d13_save      (rfp, d13_off * wordSize);
    const Address d11_save      (rfp, d11_off * wordSize);
    const Address d9_save       (rfp, d9_off  * wordSize);

    const Address r28_save      (rfp, r28_off * wordSize);
    const Address r26_save      (rfp, r26_off * wordSize);
    const Address r24_save      (rfp, r24_off * wordSize);
    const Address r22_save      (rfp, r22_off * wordSize);
    const Address r20_save      (rfp, r20_off * wordSize);

    // stub code

    // we need a C prolog to bootstrap the x86 caller into the sim
    __ c_stub_prolog(8, 0, MacroAssembler::ret_type_void);

    address aarch64_entry = __ pc();

#ifdef BUILTIN_SIM
    // Save sender's SP for stack traces.
    __ mov(rscratch1, sp);
    __ str(rscratch1, Address(__ pre(sp, -2 * wordSize)));
#endif
    // set up frame and move sp to end of save area
    __ enter();
    __ sub(sp, rfp, -sp_after_call_off * wordSize);

    // save register parameters and Java scratch/global registers
    // n.b. we save thread even though it gets installed in
    // rthread because we want to sanity check rthread later
    __ str(c_rarg7,  thread);
    __ strw(c_rarg6, parameter_size);
    __ stp(c_rarg4, c_rarg5,  entry_point);
    __ stp(c_rarg2, c_rarg3,  result_type);
    __ stp(c_rarg0, c_rarg1,  call_wrapper);

    __ stp(r20, r19,   r20_save);
    __ stp(r22, r21,   r22_save);
    __ stp(r24, r23,   r24_save);
    __ stp(r26, r25,   r26_save);
    __ stp(r28, r27,   r28_save);

    __ stpd(v9,  v8,   d9_save);
    __ stpd(v11, v10,  d11_save);
    __ stpd(v13, v12,  d13_save);
    __ stpd(v15, v14,  d15_save);

    // install Java thread in global register now we have saved
    // whatever value it held
    __ mov(rthread, c_rarg7);
    // And method
    __ mov(rmethod, c_rarg3);

    // set up the heapbase register
    __ reinit_heapbase();

#ifdef ASSERT
    // make sure we have no pending exceptions
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset())));
      __ cmp(rscratch1, (u1)NULL_WORD);
      __ br(Assembler::EQ, L);
      __ stop("StubRoutines::call_stub: entered with pending exception");
      __ BIND(L);
    }
#endif
    // pass parameters if any
    __ mov(esp, sp);
    __ sub(rscratch1, sp, c_rarg6, ext::uxtw, LogBytesPerWord); // Move SP out of the way
    __ andr(sp, rscratch1, -2 * wordSize);

    BLOCK_COMMENT("pass parameters if any");
    Label parameters_done;
    // parameter count is still in c_rarg6
    // and parameter pointer identifying param 1 is in c_rarg5
    __ cbzw(c_rarg6, parameters_done);

    address loop = __ pc();
    __ ldr(rscratch1, Address(__ post(c_rarg5, wordSize)));
    __ subsw(c_rarg6, c_rarg6, 1);
    __ push(rscratch1);
    __ br(Assembler::GT, loop);

    __ BIND(parameters_done);

    // call Java entry -- passing methodOop, and current sp
    //      rmethod: Method*
    //      r13: sender sp
    BLOCK_COMMENT("call Java function");
    __ mov(r13, sp);
    __ blr(c_rarg4);

    // tell the simulator we have returned to the stub

    // we do this here because the notify will already have been done
    // if we get to the next instruction via an exception
    //
    // n.b. adding this instruction here affects the calculation of
    // whether or not a routine returns to the call stub (used when
    // doing stack walks) since the normal test is to check the return
    // pc against the address saved below. so we may need to allow for
    // this extra instruction in the check.

    if (NotifySimulator) {
      __ notify(Assembler::method_reentry);
    }
    // save current address for use by exception handling code

    return_address = __ pc();

    // store result depending on type (everything that is not
    // T_OBJECT, T_VALUETYPE, T_LONG, T_FLOAT or T_DOUBLE is treated as T_INT)
    // n.b. this assumes Java returns an integral result in r0
    // and a floating result in j_farg0
    __ ldr(j_rarg2, result);
    Label is_long, is_float, is_double, is_value, exit;
    __ ldr(j_rarg1, result_type);
    __ cmp(j_rarg1, (u1)T_OBJECT);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, (u1)T_VALUETYPE);
    __ br(Assembler::EQ, is_value);
    __ cmp(j_rarg1, (u1)T_LONG);
    __ br(Assembler::EQ, is_long);
    __ cmp(j_rarg1, (u1)T_FLOAT);
    __ br(Assembler::EQ, is_float);
    __ cmp(j_rarg1, (u1)T_DOUBLE);
    __ br(Assembler::EQ, is_double);

    // handle T_INT case
    __ strw(r0, Address(j_rarg2));

    __ BIND(exit);

    // pop parameters
    __ sub(esp, rfp, -sp_after_call_off * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ BIND(S);
      __ stop("StubRoutines::call_stub: threads must correspond");
      __ BIND(L);
    }
#endif

    // restore callee-save registers
    __ ldpd(v15, v14,   d15_save);
    __ ldpd(v13, v12,   d13_save);
    __ ldpd(v11, v10,   d11_save);
    __ ldpd(v9,  v8,    d9_save);

    __ ldp(r28, r27,   r28_save);
    __ ldp(r26, r25,   r26_save);
    __ ldp(r24, r23,   r24_save);
    __ ldp(r22, r21,   r22_save);
    __ ldp(r20, r19,   r20_save);

    __ ldp(c_rarg0, c_rarg1,  call_wrapper);
    __ ldrw(c_rarg2, result_type);
    __ ldr(c_rarg3,  method);
    __ ldp(c_rarg4, c_rarg5,  entry_point);
    __ ldp(c_rarg6, c_rarg7,  parameter_size);

#ifndef PRODUCT
    // tell the simulator we are about to end Java execution
    if (NotifySimulator) {
      __ notify(Assembler::method_exit);
    }
#endif
    // leave frame and return to caller
    __ leave();
    __ ret(lr);

    // handle return types different from T_INT
    __ BIND(is_value);
    if (ValueTypeReturnedAsFields) {
      // Check for flattened return value
      __ cbz(r0, is_long);
      // Initialize pre-allocated buffer
      __ mov(r1, r0);
      __ andr(r1, r1, -2);
      __ ldr(r1, Address(r1, InstanceKlass::adr_valueklass_fixed_block_offset()));
      __ ldr(r1, Address(r1, ValueKlass::pack_handler_offset()));
      __ ldr(r0, Address(j_rarg2, 0));
      __ blr(r1);
      __ b(exit);
    }

    __ BIND(is_long);
    __ str(r0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_float);
    __ strs(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    __ BIND(is_double);
    __ strd(j_farg0, Address(j_rarg2, 0));
    __ br(Assembler::AL, exit);

    return start;
  }

  // Return point for a Java call if there's an exception thrown in
  // Java code.  The exception is caught and transformed into a
  // pending exception stored in JavaThread that can be tested from
  // within the VM.
  //
  // Note: Usually the parameters are removed by the callee. In case
  // of an exception crossing an activation frame boundary, that is
  // not the case if the callee is compiled code => need to setup the
  // rsp.
  //
  // r0: exception oop

  // NOTE: this is used as a target from the signal handler so it
  // needs an x86 prolog which returns into the current simulator
  // executing the generated catch_exception code. so the prolog
  // needs to install rax in a sim register and adjust the sim's
  // restart pc to enter the generated code at the start position
  // then return from native to simulated execution.

  address generate_catch_exception() {
    StubCodeMark mark(this, "StubRoutines", "catch_exception");
    address start = __ pc();

    // same as in generate_call_stub():
    const Address sp_after_call(rfp, sp_after_call_off * wordSize);
    const Address thread       (rfp, thread_off        * wordSize);

#ifdef ASSERT
    // verify that threads correspond
    {
      Label L, S;
      __ ldr(rscratch1, thread);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::NE, S);
      __ get_thread(rscratch1);
      __ cmp(rthread, rscratch1);
      __ br(Assembler::EQ, L);
      __ bind(S);
      __ stop("StubRoutines::catch_exception: threads must correspond");
      __ bind(L);
    }
#endif

    // set pending exception
    __ verify_oop(r0);

    __ str(r0, Address(rthread, Thread::pending_exception_offset()));
    __ mov(rscratch1, (address)__FILE__);
    __ str(rscratch1, Address(rthread, Thread::exception_file_offset()));
    __ movw(rscratch1, (int)__LINE__);
    __ strw(rscratch1, Address(rthread, Thread::exception_line_offset()));

    // complete return to VM
    assert(StubRoutines::_call_stub_return_address != NULL,
           "_call_stub_return_address must have been generated before");
    __ b(StubRoutines::_call_stub_return_address);

    return start;
  }

  // Continuation point for runtime calls returning with a pending
  // exception.  The pending exception check happened in the runtime
  // or native call stub.  The pending exception in Thread is
  // converted into a Java-level exception.
  //
  // Contract with Java-level exception handlers:
  // r0: exception
  // r3: throwing pc
  //
  // NOTE: At entry of this stub, exception-pc must be in LR !!

  // NOTE: this is always used as a jump target within generated code
  // so it just needs to be generated code with no x86 prolog

  address generate_forward_exception() {
    StubCodeMark mark(this, "StubRoutines", "forward exception");
    address start = __ pc();

    // Upon entry, LR points to the return address returning into
    // Java (interpreted or compiled) code; i.e., the return address
    // becomes the throwing pc.
    //
    // Arguments pushed before the runtime call are still on the stack
    // but the exception handler will reset the stack pointer ->
    // ignore them.  A potential result in registers can be ignored as
    // well.

#ifdef ASSERT
    // make sure this code is only executed if there is a pending exception
    {
      Label L;
      __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset()));
      __ cbnz(rscratch1, L);
      __ stop("StubRoutines::forward exception: no pending exception (1)");
      __ bind(L);
    }
#endif

    // compute exception handler into r19

    // call the VM to find the handler address associated with the
    // caller address. pass thread in r0 and caller pc (ret address)
    // in r1. n.b. the caller pc is in lr, unlike x86 where it is on
    // the stack.
    __ mov(c_rarg1, lr);
    // lr will be trashed by the VM call so we move it to R19
    // (callee-saved) because we also need to pass it to the handler
    // returned by this call.
    __ mov(r19, lr);
    BLOCK_COMMENT("call exception_handler_for_return_address");
    __ call_VM_leaf(CAST_FROM_FN_PTR(address,
                         SharedRuntime::exception_handler_for_return_address),
                    rthread, c_rarg1);
    // we should not really care that lr is no longer the callee
    // address. we saved the value the handler needs in r19 so we can
    // just copy it to r3. however, the C2 handler will push its own
    // frame and then calls into the VM and the VM code asserts that
    // the PC for the frame above the handler belongs to a compiled
    // Java method. So, we restore lr here to satisfy that assert.
    __ mov(lr, r19);
    // setup r0 & r3 & clear pending exception
    __ mov(r3, r19);
    __ mov(r19, r0);
    __ ldr(r0, Address(rthread, Thread::pending_exception_offset()));
    __ str(zr, Address(rthread, Thread::pending_exception_offset()));

#ifdef ASSERT
    // make sure exception is set
    {
      Label L;
      __ cbnz(r0, L);
      __ stop("StubRoutines::forward exception: no pending exception (2)");
      __ bind(L);
    }
#endif

    // continue at exception handler
    // r0: exception
    // r3: throwing pc
    // r19: exception handler
    __ verify_oop(r0);
    __ br(r19);

    return start;
  }

  // Non-destructive plausibility checks for oops
  //
  // Arguments:
  //    r0: oop to verify
  //    rscratch1: error message
  //
  // Stack after saving c_rarg3:
  //    [tos + 0]: saved c_rarg3
  //    [tos + 1]: saved c_rarg2
  //    [tos + 2]: saved lr
  //    [tos + 3]: saved rscratch2
  //    [tos + 4]: saved r0
  //    [tos + 5]: saved rscratch1
  address generate_verify_oop() {

    StubCodeMark mark(this, "StubRoutines", "verify_oop");
    address start = __ pc();

    Label exit, error;

    // save c_rarg2 and c_rarg3
    __ stp(c_rarg3, c_rarg2, Address(__ pre(sp, -16)));

    // __ incrementl(ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ lea(c_rarg2, ExternalAddress((address) StubRoutines::verify_oop_count_addr()));
    __ ldr(c_rarg3, Address(c_rarg2));
    __ add(c_rarg3, c_rarg3, 1);
    __ str(c_rarg3, Address(c_rarg2));

    // object is in r0
    // make sure object is 'reasonable'
    __ cbz(r0, exit); // if obj is NULL it is OK

#if INCLUDE_ZGC
    if (UseZGC) {
      // Check if mask is good.
      // verifies that ZAddressBadMask & r0 == 0
      __ ldr(c_rarg3, Address(rthread, ZThreadLocalData::address_bad_mask_offset()));
      __ andr(c_rarg2, r0, c_rarg3);
      __ cbnz(c_rarg2, error);
    }
#endif

    // Check if the oop is in the right area of memory
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_mask());
    __ andr(c_rarg2, r0, c_rarg3);
    __ mov(c_rarg3, (intptr_t) Universe::verify_oop_bits());

    // Compare c_rarg2 and c_rarg3.  We don't use a compare
    // instruction here because the flags register is live.
    __ eor(c_rarg2, c_rarg2, c_rarg3);
    __ cbnz(c_rarg2, error);

    // make sure klass is 'reasonable', which is not zero.
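    // (Taken together, and leaving aside the optional ZGC mask check, the
    // tests in this stub amount to roughly:
    //    obj == NULL ||
    //    ((obj & Universe::verify_oop_mask()) == Universe::verify_oop_bits()
    //     && obj->klass() != NULL)
    // -- a plausibility filter only, not a full type or liveness check.)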
    __ load_klass(r0, r0); // get klass
    __ cbz(r0, error);     // if klass is NULL it is broken

    // return if everything seems ok
    __ bind(exit);

    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));
    __ ret(lr);

    // handle errors
    __ bind(error);
    __ ldp(c_rarg3, c_rarg2, Address(__ post(sp, 16)));

    __ push(RegSet::range(r0, r29), sp);
    // debug(char* msg, int64_t pc, int64_t regs[])
    __ mov(c_rarg0, rscratch1);      // pass address of error message
    __ mov(c_rarg1, lr);             // pass return address
    __ mov(c_rarg2, sp);             // pass address of regs on stack
#ifndef PRODUCT
    assert(frame::arg_reg_save_area_bytes == 0, "not expecting frame reg save area");
#endif
    BLOCK_COMMENT("call MacroAssembler::debug");
    __ mov(rscratch1, CAST_FROM_FN_PTR(address, MacroAssembler::debug64));
    __ blrt(rscratch1, 3, 0, 1);

    return start;
  }

  void array_overlap_test(Label& L_no_overlap, Address::sxtw sf) { __ b(L_no_overlap); }

  // The inner part of zero_words().  This is the bulk operation,
  // zeroing words in blocks, possibly using DC ZVA to do it.  The
  // caller is responsible for zeroing the last few words.
  //
  // Inputs:
  // r10: the HeapWord-aligned base address of an array to zero.
  // r11: the count in HeapWords, r11 > 0.
  //
  // Returns r10 and r11, adjusted for the caller to clear.
  // r10: the base address of the tail of words left to clear.
  // r11: the number of words in the tail.
  //      r11 < MacroAssembler::zero_words_block_size.

  address generate_zero_blocks() {
    Label done;
    Label base_aligned;

    Register base = r10, cnt = r11;

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "zero_blocks");
    address start = __ pc();

    if (UseBlockZeroing) {
      int zva_length = VM_Version::zva_length();

      // Ensure ZVA length can be divided by 16. This is required by
      // the subsequent operations.
      assert (zva_length % 16 == 0, "Unexpected ZVA Length");

      __ tbz(base, 3, base_aligned);
      __ str(zr, Address(__ post(base, 8)));
      __ sub(cnt, cnt, 1);
      __ bind(base_aligned);

      // Ensure count >= zva_length * 2 so that it still deserves a zva after
      // alignment.
      Label small;
      int low_limit = MAX2(zva_length * 2, (int)BlockZeroingLowLimit);
      __ subs(rscratch1, cnt, low_limit >> 3);
      __ br(Assembler::LT, small);
      __ zero_dcache_blocks(base, cnt);
      __ bind(small);
    }

    {
      // Number of stp instructions we'll unroll
      const int unroll =
        MacroAssembler::zero_words_block_size / 2;
      // Clear the remaining blocks.
      Label loop;
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::LT, done);
      __ bind(loop);
      for (int i = 0; i < unroll; i++)
        __ stp(zr, zr, __ post(base, 16));
      __ subs(cnt, cnt, unroll * 2);
      __ br(Assembler::GE, loop);
      __ bind(done);
      __ add(cnt, cnt, unroll * 2);
    }

    __ ret(lr);

    return start;
  }


  typedef enum {
    copy_forwards = 1,
    copy_backwards = -1
  } copy_direction;

  // Bulk copy of blocks of 8 words.
  //
  // count is a count of words.
  //
  // Precondition: count >= 8
  //
  // Postconditions:
  //
  // The least significant bit of count contains the remaining count
  // of words to copy.  The rest of count is trash.
  //
  // s and d are adjusted to point to the remaining words to copy
  //
  void generate_copy_longs(Label &start, Register s, Register d, Register count,
                           copy_direction direction) {
    int unit = wordSize * direction;
    int bias = (UseSIMDForMemoryOps ? 4 : 2) * wordSize;

    int offset;
    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6,
      t4 = r7, t5 = r10, t6 = r11, t7 = r12;
    const Register stride = r13;

    assert_different_registers(rscratch1, t0, t1, t2, t3, t4, t5, t6, t7);
    assert_different_registers(s, d, count, rscratch1);

    Label again, drain;
    const char *stub_name;
    if (direction == copy_forwards)
      stub_name = "forward_copy_longs";
    else
      stub_name = "backward_copy_longs";

    __ align(CodeEntryAlignment);

    StubCodeMark mark(this, "StubRoutines", stub_name);

    __ bind(start);

    Label unaligned_copy_long;
    if (AvoidUnalignedAccesses) {
      __ tbnz(d, 3, unaligned_copy_long);
    }

    if (direction == copy_forwards) {
      __ sub(s, s, bias);
      __ sub(d, d, bias);
    }

#ifdef ASSERT
    // Make sure we are never given < 8 words
    {
      Label L;
      __ cmp(count, (u1)8);
      __ br(Assembler::GE, L);
      __ stop("generate_copy_longs called with < 8 words");
      __ bind(L);
    }
#endif

    // Fill 8 registers
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 16);
    __ br(Assembler::LO, drain);

    int prefetch = PrefetchCopyIntervalInBytes;
    bool use_stride = false;
    if (direction == copy_backwards) {
      use_stride = prefetch > 256;
      prefetch = -prefetch;
      if (use_stride) __ mov(stride, prefetch);
    }

    __ bind(again);

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ ldpq(v0, v1, Address(s, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
      __ ldpq(v2, v3, Address(__ pre(s, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ ldp(t0, t1, Address(s, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
    }

    __ subs(count, count, 8);
    __ br(Assembler::HS, again);

    // Drain
    __ bind(drain);
    if (UseSIMDForMemoryOps) {
      __ stpq(v0, v1, Address(d, 4 * unit));
      __ stpq(v2, v3, Address(__ pre(d, 8 * unit)));
    } else {
      __ stp(t0, t1, Address(d, 2 * unit));
      __ stp(t2, t3, Address(d, 4 * unit));
      __ stp(t4, t5, Address(d, 6 * unit));
      __ stp(t6, t7, Address(__ pre(d, 8 * unit)));
    }

    {
      Label L1, L2;
      __ tbz(count, exact_log2(4), L1);
      if (UseSIMDForMemoryOps) {
        __ ldpq(v0, v1, Address(__ pre(s, 4 * unit)));
        __ stpq(v0, v1, Address(__ pre(d, 4 * unit)));
      } else {
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        __ stp(t0, t1, Address(d, 2 * unit));
        __ stp(t2, t3, Address(__ pre(d, 4 * unit)));
      }
      __ bind(L1);

      if (direction == copy_forwards) {
        __ add(s, s, bias);
        __ add(d, d, bias);
      }

      __ tbz(count, 1, L2);
      __ ldp(t0, t1, Address(__ adjust(s, 2 * unit, direction == copy_backwards)));
      __ stp(t0, t1, Address(__ adjust(d, 2 * unit, direction == copy_backwards)));
      __ bind(L2);
    }

    __ ret(lr);

    if (AvoidUnalignedAccesses) {
      Label drain, again;
      // Register order for storing. Order is different for backward copy.

      __ bind(unaligned_copy_long);

      // source address is even aligned, target odd aligned
      //
      // when forward copying word pairs we read long pairs at offsets
      // {0, 2, 4, 6} (in long words). when backwards copying we read
      // long pairs at offsets {-2, -4, -6, -8}. We adjust the source
      // address by -2 in the forwards case so we can compute the
      // source offsets for both as {2, 4, 6, 8} * unit where unit = 1
      // or -1.
      //
      // when forward copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {0, 1, 3, 5, 7}. Rather than use a
      // zero offset we adjust the destination by -1 which means we
      // have to use offsets { 1, 2, 4, 6, 8} * unit for the stores.
      //
      // When backwards copying we need to store 1 word, 3 pairs and
      // then 1 word at offsets {-1, -3, -5, -7, -8} i.e. we use
      // offsets {1, 3, 5, 7, 8} * unit.
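      //
      // A rough sketch of the software-pipelined structure used below
      // (as in the aligned path above): the temp registers always hold
      // the 64-byte block loaded on the previous iteration, so each
      // iteration stores that block while loading the next one:
      //
      //   load block[0]
      //   while (enough words remain) { store previous block; load next; }
      //   store the last block loaded          // the "drain" step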

      if (direction == copy_forwards) {
        __ sub(s, s, 16);
        __ sub(d, d, 8);
      }

      // Fill 8 registers
      //
      // for forwards copy s was offset by -16 from the original input
      // value of s so the register contents are at these offsets
      // relative to the 64 bit block addressed by that original input
      // and so on for each successive 64 byte block when s is updated
      //
      // t0 at offset 0,  t1 at offset 8
      // t2 at offset 16, t3 at offset 24
      // t4 at offset 32, t5 at offset 40
      // t6 at offset 48, t7 at offset 56

      // for backwards copy s was not offset so the register contents
      // are at these offsets into the preceding 64 byte block
      // relative to that original input and so on for each successive
      // preceding 64 byte block when s is updated. this explains the
      // slightly counter-intuitive looking pattern of register usage
      // in the stp instructions for backwards copy.
      //
      // t0 at offset -16, t1 at offset -8
      // t2 at offset -32, t3 at offset -24
      // t4 at offset -48, t5 at offset -40
      // t6 at offset -64, t7 at offset -56

      __ ldp(t0, t1, Address(s, 2 * unit));
      __ ldp(t2, t3, Address(s, 4 * unit));
      __ ldp(t4, t5, Address(s, 6 * unit));
      __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));

      __ subs(count, count, 16);
      __ br(Assembler::LO, drain);

      int prefetch = PrefetchCopyIntervalInBytes;
      bool use_stride = false;
      if (direction == copy_backwards) {
        use_stride = prefetch > 256;
        prefetch = -prefetch;
        if (use_stride) __ mov(stride, prefetch);
      }

      __ bind(again);

      if (PrefetchCopyIntervalInBytes > 0)
        __ prfm(use_stride ? Address(s, stride) : Address(s, prefetch), PLDL1KEEP);

      if (direction == copy_forwards) {
        // allowing for the offset of -8 the store instructions place
        // registers into the target 64 bit block at the following
        // offsets
        //
        // t0 at offset 0
        // t1 at offset 8,  t2 at offset 16
        // t3 at offset 24, t4 at offset 32
        // t5 at offset 40, t6 at offset 48
        // t7 at offset 56

        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      } else {
        // d was not offset when we started so the registers are
        // written into the 64 bit block preceding d with the following
        // offsets
        //
        // t1 at offset -8
        // t3 at offset -24, t0 at offset -16
        // t5 at offset -40, t2 at offset -32
        // t7 at offset -56, t4 at offset -48
        // t6 at offset -64
        //
        // note that this matches the offsets previously noted for the
        // loads

        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ ldp(t2, t3, Address(s, 4 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ ldp(t4, t5, Address(s, 6 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
        __ ldp(t6, t7, Address(__ pre(s, 8 * unit)));
      }

      __ subs(count, count, 8);
      __ br(Assembler::HS, again);

      // Drain
      //
      // this uses the same pattern of offsets and register arguments
      // as above
      __ bind(drain);
      if (direction == copy_forwards) {
        __ str(t0, Address(d, 1 * unit));
        __ stp(t1, t2, Address(d, 2 * unit));
        __ stp(t3, t4, Address(d, 4 * unit));
        __ stp(t5, t6, Address(d, 6 * unit));
        __ str(t7, Address(__ pre(d, 8 * unit)));
      } else {
        __ str(t1, Address(d, 1 * unit));
        __ stp(t3, t0, Address(d, 3 * unit));
        __ stp(t5, t2, Address(d, 5 * unit));
        __ stp(t7, t4, Address(d, 7 * unit));
        __ str(t6, Address(__ pre(d, 8 * unit)));
      }
      // now we need to copy any remaining part block which may
      // include a 4 word block subblock and/or a 2 word subblock.
      // bits 2 and 1 in the count are the tell-tale for whether we
      // have each such subblock
      {
        Label L1, L2;
        __ tbz(count, exact_log2(4), L1);
        // this is the same as above but copying only 4 longs hence
        // with only one intervening stp between the str instructions
        // but note that the offsets and registers still follow the
        // same pattern
        __ ldp(t0, t1, Address(s, 2 * unit));
        __ ldp(t2, t3, Address(__ pre(s, 4 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ stp(t1, t2, Address(d, 2 * unit));
          __ str(t3, Address(__ pre(d, 4 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ stp(t3, t0, Address(d, 3 * unit));
          __ str(t2, Address(__ pre(d, 4 * unit)));
        }
        __ bind(L1);

        __ tbz(count, 1, L2);
        // this is the same as above but copying only 2 longs hence
        // there is no intervening stp between the str instructions
        // but note that the offset and register patterns are still
        // the same
        __ ldp(t0, t1, Address(__ pre(s, 2 * unit)));
        if (direction == copy_forwards) {
          __ str(t0, Address(d, 1 * unit));
          __ str(t1, Address(__ pre(d, 2 * unit)));
        } else {
          __ str(t1, Address(d, 1 * unit));
          __ str(t0, Address(__ pre(d, 2 * unit)));
        }
        __ bind(L2);

        // for forwards copy we need to re-adjust the offsets we
        // applied so that s and d follow the last words written

        if (direction == copy_forwards) {
          __ add(s, s, 16);
          __ add(d, d, 8);
        }

      }

      __ ret(lr);
    }
  }

  // Small copy: less than 16 bytes.
  //
  // NB: Ignores all of the bits of count which represent more than 15
  // bytes, so a caller doesn't have to mask them.

  void copy_memory_small(Register s, Register d, Register count, Register tmp, int step) {
    bool is_backwards = step < 0;
    size_t granularity = uabs(step);
    int direction = is_backwards ? -1 : 1;
    int unit = wordSize * direction;

    Label Lword, Lint, Lshort, Lbyte;

    assert(granularity
           && granularity <= sizeof (jlong), "Impossible granularity in copy_memory_small");

    const Register t0 = r3, t1 = r4, t2 = r5, t3 = r6;

    // ??? I don't know if this bit-test-and-branch is the right thing
    // to do.  It does a lot of jumping, resulting in several
    // mispredicted branches.  It might make more sense to do this
    // with something like Duff's device with a single computed branch.
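    //
    // (For example, a byte copy (granularity == 1) with count == 13, i.e.
    // 0b1101, copies 8 bytes for bit 3, then 4 bytes for bit 2, skips the
    // 2-byte step because bit 1 is clear, and finishes with 1 byte for
    // bit 0.)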

    __ tbz(count, 3 - exact_log2(granularity), Lword);
    __ ldr(tmp, Address(__ adjust(s, unit, is_backwards)));
    __ str(tmp, Address(__ adjust(d, unit, is_backwards)));
    __ bind(Lword);

    if (granularity <= sizeof (jint)) {
      __ tbz(count, 2 - exact_log2(granularity), Lint);
      __ ldrw(tmp, Address(__ adjust(s, sizeof (jint) * direction, is_backwards)));
      __ strw(tmp, Address(__ adjust(d, sizeof (jint) * direction, is_backwards)));
      __ bind(Lint);
    }

    if (granularity <= sizeof (jshort)) {
      __ tbz(count, 1 - exact_log2(granularity), Lshort);
      __ ldrh(tmp, Address(__ adjust(s, sizeof (jshort) * direction, is_backwards)));
      __ strh(tmp, Address(__ adjust(d, sizeof (jshort) * direction, is_backwards)));
      __ bind(Lshort);
    }

    if (granularity <= sizeof (jbyte)) {
      __ tbz(count, 0, Lbyte);
      __ ldrb(tmp, Address(__ adjust(s, sizeof (jbyte) * direction, is_backwards)));
      __ strb(tmp, Address(__ adjust(d, sizeof (jbyte) * direction, is_backwards)));
      __ bind(Lbyte);
    }
  }

  Label copy_f, copy_b;

  // All-singing all-dancing memory copy.
  //
  // Copy count units of memory from s to d.  The size of a unit is
  // step, which can be positive or negative depending on the direction
  // of copy.  If is_aligned is false, we align the source address.
  //

  void copy_memory(bool is_aligned, Register s, Register d,
                   Register count, Register tmp, int step) {
    copy_direction direction = step < 0 ? copy_backwards : copy_forwards;
    bool is_backwards = step < 0;
    int granularity = uabs(step);
    const Register t0 = r3, t1 = r4;

    // <= 96 bytes do inline. Direction doesn't matter because we always
    // load all the data before writing anything
    Label copy4, copy8, copy16, copy32, copy80, copy_big, finish;
    const Register t2 = r5, t3 = r6, t4 = r7, t5 = r8;
    const Register t6 = r9, t7 = r10, t8 = r11, t9 = r12;
    const Register send = r17, dend = r18;

    if (PrefetchCopyIntervalInBytes > 0)
      __ prfm(Address(s, 0), PLDL1KEEP);
    __ cmp(count, u1((UseSIMDForMemoryOps ? 96 : 80) / granularity));
    __ br(Assembler::HI, copy_big);

    __ lea(send, Address(s, count, Address::lsl(exact_log2(granularity))));
    __ lea(dend, Address(d, count, Address::lsl(exact_log2(granularity))));

    __ cmp(count, u1(16/granularity));
    __ br(Assembler::LS, copy16);

    __ cmp(count, u1(64/granularity));
    __ br(Assembler::HI, copy80);

    __ cmp(count, u1(32/granularity));
    __ br(Assembler::LS, copy32);

    // 33..64 bytes
    if (UseSIMDForMemoryOps) {
      __ ldpq(v0, v1, Address(s, 0));
      __ ldpq(v2, v3, Address(send, -32));
      __ stpq(v0, v1, Address(d, 0));
      __ stpq(v2, v3, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(send, -32));
      __ ldp(t6, t7, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(dend, -32));
      __ stp(t6, t7, Address(dend, -16));
    }
    __ b(finish);

    // 17..32 bytes
    __ bind(copy32);
    __ ldp(t0, t1, Address(s, 0));
    __ ldp(t2, t3, Address(send, -16));
    __ stp(t0, t1, Address(d, 0));
    __ stp(t2, t3, Address(dend, -16));
    __ b(finish);

    // 65..80/96 bytes
    // (96 bytes if SIMD because we do 32 bytes per instruction)
    __ bind(copy80);
    if (UseSIMDForMemoryOps) {
      __ ld4(v0, v1, v2, v3, __ T16B, Address(s, 0));
      __ ldpq(v4, v5, Address(send, -32));
      __ st4(v0, v1, v2, v3, __ T16B, Address(d, 0));
      __ stpq(v4, v5, Address(dend, -32));
    } else {
      __ ldp(t0, t1, Address(s, 0));
      __ ldp(t2, t3, Address(s, 16));
      __ ldp(t4, t5, Address(s, 32));
      __ ldp(t6, t7, Address(s, 48));
      __ ldp(t8, t9, Address(send, -16));

      __ stp(t0, t1, Address(d, 0));
      __ stp(t2, t3, Address(d, 16));
      __ stp(t4, t5, Address(d, 32));
      __ stp(t6, t7, Address(d, 48));
      __ stp(t8, t9, Address(dend, -16));
    }
    __ b(finish);

    // 0..16 bytes
    __ bind(copy16);
    __ cmp(count, u1(8/granularity));
    __ br(Assembler::LO, copy8);

    // 8..16 bytes
    __ ldr(t0, Address(s, 0));
    __ ldr(t1, Address(send, -8));
    __ str(t0, Address(d, 0));
    __ str(t1, Address(dend, -8));
    __ b(finish);

    if (granularity < 8) {
      // 4..7 bytes
      __ bind(copy8);
      __ tbz(count, 2 - exact_log2(granularity), copy4);
      __ ldrw(t0, Address(s, 0));
      __ ldrw(t1, Address(send, -4));
      __ strw(t0, Address(d, 0));
      __ strw(t1, Address(dend, -4));
      __ b(finish);
      if (granularity < 4) {
        // 0..3 bytes
        __ bind(copy4);
        __ cbz(count, finish); // get rid of 0 case
        if (granularity == 2) {
          __ ldrh(t0, Address(s, 0));
          __ strh(t0, Address(d, 0));
        } else { // granularity == 1
          // Now 1..3 bytes. Handle the 1 and 2 byte case by copying
          // the first and last byte.
          // Handle the 3 byte case by loading and storing base + count/2
          // (count == 1 (s+0)->(d+0), count == 2,3 (s+1) -> (d+1))
          // This does mean in the 1 byte case we load/store the same
          // byte 3 times.
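          // (Worked example: with count == 3 the loads below read s+0,
          // s+2 (send - 1) and s+1 (s + count/2), so offsets 0..2 are
          // each copied once; with count == 2 they read s+0, s+1, s+1.)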
          __ lsr(count, count, 1);
          __ ldrb(t0, Address(s, 0));
          __ ldrb(t1, Address(send, -1));
          __ ldrb(t2, Address(s, count));
          __ strb(t0, Address(d, 0));
          __ strb(t1, Address(dend, -1));
          __ strb(t2, Address(d, count));
        }
        __ b(finish);
      }
    }

    __ bind(copy_big);
    if (is_backwards) {
      __ lea(s, Address(s, count, Address::lsl(exact_log2(-step))));
      __ lea(d, Address(d, count, Address::lsl(exact_log2(-step))));
    }

    // Now we've got the small case out of the way we can align the
    // source address on a 2-word boundary.

    Label aligned;

    if (is_aligned) {
      // We may have to adjust by 1 word to get s 2-word-aligned.
      __ tbz(s, exact_log2(wordSize), aligned);
      __ ldr(tmp, Address(__ adjust(s, direction * wordSize, is_backwards)));
      __ str(tmp, Address(__ adjust(d, direction * wordSize, is_backwards)));
      __ sub(count, count, wordSize/granularity);
    } else {
      if (is_backwards) {
        __ andr(rscratch2, s, 2 * wordSize - 1);
      } else {
        __ neg(rscratch2, s);
        __ andr(rscratch2, rscratch2, 2 * wordSize - 1);
      }
      // rscratch2 is the byte adjustment needed to align s.
      __ cbz(rscratch2, aligned);
      int shift = exact_log2(granularity);
      if (shift)  __ lsr(rscratch2, rscratch2, shift);
      __ sub(count, count, rscratch2);

#if 0
      // ?? This code is only correct for a disjoint copy.  It may or
      // may not make sense to use it in that case.

      // Copy the first pair; s and d may not be aligned.
      __ ldp(t0, t1, Address(s, is_backwards ? -2 * wordSize : 0));
      __ stp(t0, t1, Address(d, is_backwards ? -2 * wordSize : 0));

      // Align s and d, adjust count
      if (is_backwards) {
        __ sub(s, s, rscratch2);
        __ sub(d, d, rscratch2);
      } else {
        __ add(s, s, rscratch2);
        __ add(d, d, rscratch2);
      }
#else
      copy_memory_small(s, d, rscratch2, rscratch1, step);
#endif
    }

    __ bind(aligned);

    // s is now 2-word-aligned.

    // We have a count of units and some trailing bytes.  Adjust the
    // count and do a bulk copy of words.
    __ lsr(rscratch2, count, exact_log2(wordSize/granularity));
    if (direction == copy_forwards)
      __ bl(copy_f);
    else
      __ bl(copy_b);

    // And the tail.
    copy_memory_small(s, d, count, tmp, step);

    if (granularity >= 8) __ bind(copy8);
    if (granularity >= 4) __ bind(copy4);
    __ bind(finish);
  }


  void clobber_registers() {
#ifdef ASSERT
    __ mov(rscratch1, (uint64_t)0xdeadbeef);
    __ orr(rscratch1, rscratch1, rscratch1, Assembler::LSL, 32);
    for (Register r = r3; r <= r18; r++)
      if (r != rscratch1) __ mov(r, rscratch1);
#endif
  }

  // Scan over array at a for count oops, verifying each one.
  // Preserves a and count, clobbers rscratch1 and rscratch2.
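  // (In intent: verify each oop in the array; an element is size bytes
  // wide -- 8 for uncompressed oops, 4 for narrow oops, which are decoded
  // before being verified.)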
  void verify_oop_array (size_t size, Register a, Register count, Register temp) {
    Label loop, end;
    __ mov(rscratch1, a);
    __ mov(rscratch2, zr);
    __ bind(loop);
    __ cmp(rscratch2, count);
    __ br(Assembler::HS, end);
    if (size == (size_t)wordSize) {
      __ ldr(temp, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ verify_oop(temp);
    } else {
      __ ldrw(r16, Address(a, rscratch2, Address::lsl(exact_log2(size))));
      __ decode_heap_oop(temp); // calls verify_oop
    }
    __ add(rscratch2, rscratch2, size);
    __ b(loop);
    __ bind(end);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_copy(size_t size, bool aligned, bool is_oop, address *entry,
                                 const char *name, bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_reg = RegSet::of(s, d, count);
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_DISJOINT;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_reg);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    copy_memory(aligned, s, d, count, rscratch1, size);

    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
    }

    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());

    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   is_oop  - true => oop array, so generate store check code
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_copy(size_t size, bool aligned, bool is_oop, address nooverlap_target,
                                 address *entry, const char *name,
                                 bool dest_uninitialized = false) {
    Register s = c_rarg0, d = c_rarg1, count = c_rarg2;
    RegSet saved_regs = RegSet::of(s, d, count);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();
    __ enter();

    if (entry != NULL) {
      *entry = __ pc();
      // caller can pass a 64-bit byte count here (from Unsafe.copyMemory)
      BLOCK_COMMENT("Entry:");
    }

    // use fwd copy when (d-s) above_equal (count*size)
    __ sub(rscratch1, d, s);
    __ cmp(rscratch1, count, Assembler::LSL, exact_log2(size));
    __ br(Assembler::HS, nooverlap_target);

    DecoratorSet decorators = IN_HEAP | IS_ARRAY;
    if (dest_uninitialized) {
      decorators |= IS_DEST_UNINITIALIZED;
    }
    if (aligned) {
      decorators |= ARRAYCOPY_ALIGNED;
    }

    BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler();
    bs->arraycopy_prologue(_masm, decorators, is_oop, s, d, count, saved_regs);

    if (is_oop) {
      // save regs before copy_memory
      __ push(RegSet::of(d, count), sp);
    }
    copy_memory(aligned, s, d, count, rscratch1, -size);
    if (is_oop) {
      __ pop(RegSet::of(d, count), sp);
      if (VerifyOops)
        verify_oop_array(size, d, count, r16);
    }
    bs->arraycopy_epilogue(_masm, decorators, is_oop, d, count, rscratch1, RegSet());
    __ leave();
    __ mov(r0, zr); // return 0
    __ ret(lr);
#ifdef BUILTIN_SIM
    {
      AArch64Simulator *sim = AArch64Simulator::get_current(UseSimulatorCache, DisableBCCheck);
      sim->notifyCompile(const_cast<char*>(name), start);
    }
#endif
    return start;
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_byte_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_byte_copy().
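  //
  // (All of the element-size variants below simply delegate to
  // generate_disjoint_copy / generate_conjoint_copy with the element size
  // in bytes, e.g. sizeof (jbyte) == 1 for the byte copies.)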
  //
  address generate_disjoint_byte_copy(bool aligned, address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jbyte), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-, 2-, or 1-byte boundaries,
  // we let the hardware handle it.  The one to eight bytes within words,
  // dwords or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_byte_copy(bool aligned, address nooverlap_target,
                                      address* entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jbyte), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  // Side Effects:
  //   disjoint_short_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_short_copy().
  //
  address generate_disjoint_short_copy(bool aligned,
                                       address* entry, const char *name) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jshort), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4- or 2-byte boundaries, we
  // let the hardware handle it.  The two or four words within dwords
  // or qwords that span cache line boundaries will still be loaded
  // and stored atomically.
  //
  address generate_conjoint_short_copy(bool aligned, address nooverlap_target,
                                       address *entry, const char *name) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jshort), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  // Side Effects:
  //   disjoint_int_copy_entry is set to the no-overlap entry point
  //   used by generate_conjoint_int_oop_copy().
  //
  address generate_disjoint_int_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jint), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord == 8-byte boundary
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as ssize_t, can be zero
  //
  // If 'from' and/or 'to' are aligned on 4-byte boundaries, we let
  // the hardware handle it.  The two dwords within qwords that span
  // cache line boundaries will still be loaded and stored atomically.
  //
  address generate_conjoint_int_copy(bool aligned, address nooverlap_target,
                                     address *entry, const char *name,
                                     bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jint), aligned, not_oop, nooverlap_target, entry, name);
  }


  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_long_copy(bool aligned, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_disjoint_copy(sizeof (jlong), aligned, not_oop, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  address generate_conjoint_long_copy(bool aligned,
                                      address nooverlap_target, address *entry,
                                      const char *name, bool dest_uninitialized = false) {
    const bool not_oop = false;
    return generate_conjoint_copy(sizeof (jlong), aligned, not_oop, nooverlap_target, entry, name);
  }

  // Arguments:
  //   aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes
  //             ignored
  //   name    - stub name string
  //
  // Inputs:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - element count, treated as size_t, can be zero
  //
  // Side Effects:
  //   disjoint_oop_copy_entry or disjoint_long_copy_entry is set to the
  //   no-overlap entry point used by generate_conjoint_long_oop_copy().
  //
  address generate_disjoint_oop_copy(bool aligned, address *entry,
                                     const char *name, bool dest_uninitialized) {
    const bool is_oop = true;
    const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong);
    return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized);
  }

sizeof (jint) : sizeof (jlong); 1691 return generate_disjoint_copy(size, aligned, is_oop, entry, name, dest_uninitialized); 1692 } 1693 1694 // Arguments: 1695 // aligned - true => Input and output aligned on a HeapWord boundary == 8 bytes 1696 // ignored 1697 // name - stub name string 1698 // 1699 // Inputs: 1700 // c_rarg0 - source array address 1701 // c_rarg1 - destination array address 1702 // c_rarg2 - element count, treated as size_t, can be zero 1703 // 1704 address generate_conjoint_oop_copy(bool aligned, 1705 address nooverlap_target, address *entry, 1706 const char *name, bool dest_uninitialized) { 1707 const bool is_oop = true; 1708 const size_t size = UseCompressedOops ? sizeof (jint) : sizeof (jlong); 1709 return generate_conjoint_copy(size, aligned, is_oop, nooverlap_target, entry, 1710 name, dest_uninitialized); 1711 } 1712 1713 1714 // Helper for generating a dynamic type check. 1715 // Smashes rscratch1, rscratch2. 1716 void generate_type_check(Register sub_klass, 1717 Register super_check_offset, 1718 Register super_klass, 1719 Label& L_success) { 1720 assert_different_registers(sub_klass, super_check_offset, super_klass); 1721 1722 BLOCK_COMMENT("type_check:"); 1723 1724 Label L_miss; 1725 1726 __ check_klass_subtype_fast_path(sub_klass, super_klass, noreg, &L_success, &L_miss, NULL, 1727 super_check_offset); 1728 __ check_klass_subtype_slow_path(sub_klass, super_klass, noreg, noreg, &L_success, NULL); 1729 1730 // Fall through on failure! 1731 __ BIND(L_miss); 1732 } 1733 1734 // 1735 // Generate checkcasting array copy stub 1736 // 1737 // Input: 1738 // c_rarg0 - source array address 1739 // c_rarg1 - destination array address 1740 // c_rarg2 - element count, treated as ssize_t, can be zero 1741 // c_rarg3 - size_t ckoff (super_check_offset) 1742 // c_rarg4 - oop ckval (super_klass) 1743 // 1744 // Output: 1745 // r0 == 0 - success 1746 // r0 == -1^K - failure, where K is partial transfer count 1747 // 1748 address generate_checkcast_copy(const char *name, address *entry, 1749 bool dest_uninitialized = false) { 1750 1751 Label L_load_element, L_store_element, L_do_card_marks, L_done, L_done_pop; 1752 1753 // Input registers (after setup_arg_regs) 1754 const Register from = c_rarg0; // source array address 1755 const Register to = c_rarg1; // destination array address 1756 const Register count = c_rarg2; // elementscount 1757 const Register ckoff = c_rarg3; // super_check_offset 1758 const Register ckval = c_rarg4; // super_klass 1759 1760 RegSet wb_pre_saved_regs = RegSet::range(c_rarg0, c_rarg4); 1761 RegSet wb_post_saved_regs = RegSet::of(count); 1762 1763 // Registers used as temps (r18, r19, r20 are save-on-entry) 1764 const Register count_save = r21; // orig elementscount 1765 const Register start_to = r20; // destination array start address 1766 const Register copied_oop = r18; // actual oop copied 1767 const Register r19_klass = r19; // oop._klass 1768 1769 //--------------------------------------------------------------- 1770 // Assembler stub will be used for this call to arraycopy 1771 // if the two arrays are subtypes of Object[] but the 1772 // destination array type is not equal to or a supertype 1773 // of the source type. Each element must be separately 1774 // checked. 
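    //
    // Added for clarity: an illustrative, never-called C++ sketch of the
    // element-wise protocol this stub implements.  The element_fits()
    // predicate is an abstraction of the ckoff/ckval subtype test done by
    // generate_type_check(); compressed oops and GC barriers are ignored here.
    //
    //   static int checkcast_copy_reference(oop* from, oop* to, size_t count,
    //                                       bool (*element_fits)(oop)) {
    //     for (size_t i = 0; i < count; i++) {
    //       oop o = from[i];
    //       if (o != NULL && !element_fits(o)) {
    //         return -1 ^ (int)i;      // i elements were copied before the failure
    //       }
    //       to[i] = o;                 // NULL elements are stored unchecked
    //     }
    //     return 0;                    // success
    //   }
    //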
1775 1776 assert_different_registers(from, to, count, ckoff, ckval, start_to, 1777 copied_oop, r19_klass, count_save); 1778 1779 __ align(CodeEntryAlignment); 1780 StubCodeMark mark(this, "StubRoutines", name); 1781 address start = __ pc(); 1782 1783 __ enter(); // required for proper stackwalking of RuntimeStub frame 1784 1785 #ifdef ASSERT 1786 // caller guarantees that the arrays really are different 1787 // otherwise, we would have to make conjoint checks 1788 { Label L; 1789 array_overlap_test(L, TIMES_OOP); 1790 __ stop("checkcast_copy within a single array"); 1791 __ bind(L); 1792 } 1793 #endif //ASSERT 1794 1795 // Caller of this entry point must set up the argument registers. 1796 if (entry != NULL) { 1797 *entry = __ pc(); 1798 BLOCK_COMMENT("Entry:"); 1799 } 1800 1801 // Empty array: Nothing to do. 1802 __ cbz(count, L_done); 1803 1804 __ push(RegSet::of(r18, r19, r20, r21), sp); 1805 1806 #ifdef ASSERT 1807 BLOCK_COMMENT("assert consistent ckoff/ckval"); 1808 // The ckoff and ckval must be mutually consistent, 1809 // even though caller generates both. 1810 { Label L; 1811 int sco_offset = in_bytes(Klass::super_check_offset_offset()); 1812 __ ldrw(start_to, Address(ckval, sco_offset)); 1813 __ cmpw(ckoff, start_to); 1814 __ br(Assembler::EQ, L); 1815 __ stop("super_check_offset inconsistent"); 1816 __ bind(L); 1817 } 1818 #endif //ASSERT 1819 1820 DecoratorSet decorators = IN_HEAP | IS_ARRAY | ARRAYCOPY_CHECKCAST | ARRAYCOPY_DISJOINT; 1821 bool is_oop = true; 1822 if (dest_uninitialized) { 1823 decorators |= IS_DEST_UNINITIALIZED; 1824 } 1825 1826 BarrierSetAssembler *bs = BarrierSet::barrier_set()->barrier_set_assembler(); 1827 bs->arraycopy_prologue(_masm, decorators, is_oop, from, to, count, wb_pre_saved_regs); 1828 1829 // save the original count 1830 __ mov(count_save, count); 1831 1832 // Copy from low to high addresses 1833 __ mov(start_to, to); // Save destination array start address 1834 __ b(L_load_element); 1835 1836 // ======== begin loop ======== 1837 // (Loop is rotated; its entry is L_load_element.) 1838 // Loop control: 1839 // for (; count != 0; count--) { 1840 // copied_oop = load_heap_oop(from++); 1841 // ... generate_type_check ...; 1842 // store_heap_oop(to++, copied_oop); 1843 // } 1844 __ align(OptoLoopAlignment); 1845 1846 __ BIND(L_store_element); 1847 __ store_heap_oop(__ post(to, UseCompressedOops ? 4 : 8), copied_oop, noreg, noreg, noreg, AS_RAW); // store the oop 1848 __ sub(count, count, 1); 1849 __ cbz(count, L_do_card_marks); 1850 1851 // ======== loop entry is here ======== 1852 __ BIND(L_load_element); 1853 __ load_heap_oop(copied_oop, __ post(from, UseCompressedOops ? 4 : 8), noreg, noreg, AS_RAW); // load the oop 1854 __ cbz(copied_oop, L_store_element); 1855 1856 __ load_klass(r19_klass, copied_oop);// query the object klass 1857 generate_type_check(r19_klass, ckoff, ckval, L_store_element); 1858 // ======== end loop ======== 1859 1860 // It was a real error; we must depend on the caller to finish the job. 1861 // Register count = remaining oops, count_orig = total oops. 1862 // Emit GC store barriers for the oops we have copied and report 1863 // their number to the caller. 
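    //
    // Worked example of the encoding computed below (added for clarity):
    // with count_save == 10 oops in total and count == 3 oops still
    // uncopied at the failure point, K = 10 - 3 = 7 oops were transferred
    // and the stub returns -1 ^ 7 == ~7 == -8; the caller recovers K as ~r0.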

    __ subs(count, count_save, count);     // K = partially copied oop count
    __ eon(count, count, zr);              // report (-1^K) to caller
    __ br(Assembler::EQ, L_done_pop);

    __ BIND(L_do_card_marks);
    bs->arraycopy_epilogue(_masm, decorators, is_oop, start_to, count_save, rscratch1, wb_post_saved_regs);

    __ bind(L_done_pop);
    __ pop(RegSet::of(r18, r19, r20, r21), sp);
    inc_counter_np(SharedRuntime::_checkcast_array_copy_ctr);

    __ bind(L_done);
    __ mov(r0, count);
    __ leave();
    __ ret(lr);

    return start;
  }

  // Perform range checks on the proposed arraycopy.
  // Kills temp, but nothing else.
  // Also, clean the sign bits of src_pos and dst_pos.
  void arraycopy_range_checks(Register src,     // source array oop (c_rarg0)
                              Register src_pos, // source position (c_rarg1)
                              Register dst,     // destination array oop (c_rarg2)
                              Register dst_pos, // destination position (c_rarg3)
                              Register length,
                              Register temp,
                              Label& L_failed) {
    BLOCK_COMMENT("arraycopy_range_checks:");

    assert_different_registers(rscratch1, temp);

    // if (src_pos + length > arrayOop(src)->length())  FAIL;
    __ ldrw(rscratch1, Address(src, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, src_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    // if (dst_pos + length > arrayOop(dst)->length())  FAIL;
    __ ldrw(rscratch1, Address(dst, arrayOopDesc::length_offset_in_bytes()));
    __ addw(temp, length, dst_pos);
    __ cmpw(temp, rscratch1);
    __ br(Assembler::HI, L_failed);

    // Have to clean up high 32 bits of 'src_pos' and 'dst_pos'.
    __ movw(src_pos, src_pos);
    __ movw(dst_pos, dst_pos);

    BLOCK_COMMENT("arraycopy_range_checks done");
  }

  // These stubs get called from some dumb test routine.
  // I'll write them properly when they're called from
  // something that's actually doing something.
  static void fake_arraycopy_stub(address src, address dst, int count) {
    assert(count == 0, "huh?");
  }

  //
  // Generate 'unsafe' array copy stub
  // Though just as safe as the other stubs, it takes an unscaled
  // size_t argument instead of an element count.
  //
  // Input:
  //   c_rarg0   - source array address
  //   c_rarg1   - destination array address
  //   c_rarg2   - byte count, treated as ssize_t, can be zero
  //
  // Examines the alignment of the operands and dispatches
  // to a long, int, short, or byte copy loop.
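  //
  // Illustrative sketch (added; not used by the stub): the element-size
  // selection performed below, expressed as plain C++ over the same
  // HotSpot constants.
  //
  //   static int unsafe_copy_log2_element_size(address s, address d, size_t count) {
  //     size_t bits = (size_t)s | (size_t)d | count;
  //     if ((bits & (BytesPerLong - 1)) == 0) return LogBytesPerLong;  // 8-byte copies
  //     if ((bits & (BytesPerInt  - 1)) == 0) return LogBytesPerInt;   // 4-byte copies
  //     if ((bits & 1) == 0)                  return LogBytesPerShort; // 2-byte copies
  //     return 0;                                                      // byte copies
  //   }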
1937 // 1938 address generate_unsafe_copy(const char *name, 1939 address byte_copy_entry, 1940 address short_copy_entry, 1941 address int_copy_entry, 1942 address long_copy_entry) { 1943 Label L_long_aligned, L_int_aligned, L_short_aligned; 1944 Register s = c_rarg0, d = c_rarg1, count = c_rarg2; 1945 1946 __ align(CodeEntryAlignment); 1947 StubCodeMark mark(this, "StubRoutines", name); 1948 address start = __ pc(); 1949 __ enter(); // required for proper stackwalking of RuntimeStub frame 1950 1951 // bump this on entry, not on exit: 1952 inc_counter_np(SharedRuntime::_unsafe_array_copy_ctr); 1953 1954 __ orr(rscratch1, s, d); 1955 __ orr(rscratch1, rscratch1, count); 1956 1957 __ andr(rscratch1, rscratch1, BytesPerLong-1); 1958 __ cbz(rscratch1, L_long_aligned); 1959 __ andr(rscratch1, rscratch1, BytesPerInt-1); 1960 __ cbz(rscratch1, L_int_aligned); 1961 __ tbz(rscratch1, 0, L_short_aligned); 1962 __ b(RuntimeAddress(byte_copy_entry)); 1963 1964 __ BIND(L_short_aligned); 1965 __ lsr(count, count, LogBytesPerShort); // size => short_count 1966 __ b(RuntimeAddress(short_copy_entry)); 1967 __ BIND(L_int_aligned); 1968 __ lsr(count, count, LogBytesPerInt); // size => int_count 1969 __ b(RuntimeAddress(int_copy_entry)); 1970 __ BIND(L_long_aligned); 1971 __ lsr(count, count, LogBytesPerLong); // size => long_count 1972 __ b(RuntimeAddress(long_copy_entry)); 1973 1974 return start; 1975 } 1976 1977 // 1978 // Generate generic array copy stubs 1979 // 1980 // Input: 1981 // c_rarg0 - src oop 1982 // c_rarg1 - src_pos (32-bits) 1983 // c_rarg2 - dst oop 1984 // c_rarg3 - dst_pos (32-bits) 1985 // c_rarg4 - element count (32-bits) 1986 // 1987 // Output: 1988 // r0 == 0 - success 1989 // r0 == -1^K - failure, where K is partial transfer count 1990 // 1991 address generate_generic_copy(const char *name, 1992 address byte_copy_entry, address short_copy_entry, 1993 address int_copy_entry, address oop_copy_entry, 1994 address long_copy_entry, address checkcast_copy_entry) { 1995 1996 Label L_failed, L_objArray; 1997 Label L_copy_bytes, L_copy_shorts, L_copy_ints, L_copy_longs; 1998 1999 // Input registers 2000 const Register src = c_rarg0; // source array oop 2001 const Register src_pos = c_rarg1; // source position 2002 const Register dst = c_rarg2; // destination array oop 2003 const Register dst_pos = c_rarg3; // destination position 2004 const Register length = c_rarg4; 2005 2006 2007 // Registers used as temps 2008 const Register dst_klass = c_rarg5; 2009 2010 __ align(CodeEntryAlignment); 2011 2012 StubCodeMark mark(this, "StubRoutines", name); 2013 2014 address start = __ pc(); 2015 2016 __ enter(); // required for proper stackwalking of RuntimeStub frame 2017 2018 // bump this on entry, not on exit: 2019 inc_counter_np(SharedRuntime::_generic_array_copy_ctr); 2020 2021 //----------------------------------------------------------------------- 2022 // Assembler stub will be used for this call to arraycopy 2023 // if the following conditions are met: 2024 // 2025 // (1) src and dst must not be null. 2026 // (2) src_pos must not be negative. 2027 // (3) dst_pos must not be negative. 2028 // (4) length must not be negative. 2029 // (5) src klass and dst klass should be the same and not NULL. 2030 // (6) src and dst should be arrays. 2031 // (7) src_pos + length must not exceed length of src. 2032 // (8) dst_pos + length must not exceed length of dst. 
  //

    // if (src == NULL) return -1;
    __ cbz(src, L_failed);

    // if (src_pos < 0) return -1;
    __ tbnz(src_pos, 31, L_failed);  // i.e. sign bit set

    // if (dst == NULL) return -1;
    __ cbz(dst, L_failed);

    // if (dst_pos < 0) return -1;
    __ tbnz(dst_pos, 31, L_failed);  // i.e. sign bit set

    // registers used as temp
    const Register scratch_length    = r16; // elements count to copy
    const Register scratch_src_klass = r17; // array klass
    const Register lh                = r18; // layout helper

    // if (length < 0) return -1;
    __ movw(scratch_length, length);        // length (elements count, 32-bits value)
    __ tbnz(scratch_length, 31, L_failed);  // i.e. sign bit set

    __ load_klass(scratch_src_klass, src);
#ifdef ASSERT
    // assert(src->klass() != NULL);
    {
      BLOCK_COMMENT("assert klasses not null {");
      Label L1, L2;
      __ cbnz(scratch_src_klass, L2);   // it is broken if klass is NULL
      __ bind(L1);
      __ stop("broken null klass");
      __ bind(L2);
      __ load_klass(rscratch1, dst);
      __ cbz(rscratch1, L1);     // this would be broken also
      BLOCK_COMMENT("} assert klasses not null done");
    }
#endif

    // Load layout helper (32-bits)
    //
    //  |array_tag|     | header_size | element_type |     |log2_element_size|
    // 32        30    24            16              8     2                 0
    //
    //   array_tag: typeArray = 0x3, objArray = 0x2, non-array = 0x0
    //

    const int lh_offset = in_bytes(Klass::layout_helper_offset());

    // Handle objArrays completely differently...
    const jint objArray_lh = Klass::array_layout_helper(T_OBJECT);
    __ ldrw(lh, Address(scratch_src_klass, lh_offset));
    __ movw(rscratch1, objArray_lh);
    __ eorw(rscratch2, lh, rscratch1);
    __ cbzw(rscratch2, L_objArray);

    // if (src->klass() != dst->klass()) return -1;
    __ load_klass(rscratch2, dst);
    __ eor(rscratch2, rscratch2, scratch_src_klass);
    __ cbnz(rscratch2, L_failed);

    // if (!src->is_Array()) return -1;
    __ tbz(lh, 31, L_failed);  // i.e. (lh >= 0)

    // At this point, it is known to be a typeArray (array_tag 0x3).
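    //
    // Illustrative sketch (added; not used here): the layout helper fields
    // that the ubfx / andw sequences below extract, written as plain C++
    // over the same Klass constants.
    //
    //   static void decode_layout_helper(jint lh, int& header_size_in_bytes,
    //                                    int& log2_element_size) {
    //     header_size_in_bytes = (lh >> Klass::_lh_header_size_shift)
    //                            & Klass::_lh_header_size_mask;
    //     log2_element_size    = (lh >> Klass::_lh_log2_element_size_shift)
    //                            & Klass::_lh_log2_element_size_mask;
    //   }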
2098 #ifdef ASSERT 2099 { 2100 BLOCK_COMMENT("assert primitive array {"); 2101 Label L; 2102 __ movw(rscratch2, Klass::_lh_array_tag_type_value << Klass::_lh_array_tag_shift); 2103 __ cmpw(lh, rscratch2); 2104 __ br(Assembler::GE, L); 2105 __ stop("must be a primitive array"); 2106 __ bind(L); 2107 BLOCK_COMMENT("} assert primitive array done"); 2108 } 2109 #endif 2110 2111 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2112 rscratch2, L_failed); 2113 2114 // TypeArrayKlass 2115 // 2116 // src_addr = (src + array_header_in_bytes()) + (src_pos << log2elemsize); 2117 // dst_addr = (dst + array_header_in_bytes()) + (dst_pos << log2elemsize); 2118 // 2119 2120 const Register rscratch1_offset = rscratch1; // array offset 2121 const Register r18_elsize = lh; // element size 2122 2123 __ ubfx(rscratch1_offset, lh, Klass::_lh_header_size_shift, 2124 exact_log2(Klass::_lh_header_size_mask+1)); // array_offset 2125 __ add(src, src, rscratch1_offset); // src array offset 2126 __ add(dst, dst, rscratch1_offset); // dst array offset 2127 BLOCK_COMMENT("choose copy loop based on element size"); 2128 2129 // next registers should be set before the jump to corresponding stub 2130 const Register from = c_rarg0; // source array address 2131 const Register to = c_rarg1; // destination array address 2132 const Register count = c_rarg2; // elements count 2133 2134 // 'from', 'to', 'count' registers should be set in such order 2135 // since they are the same as 'src', 'src_pos', 'dst'. 2136 2137 assert(Klass::_lh_log2_element_size_shift == 0, "fix this code"); 2138 2139 // The possible values of elsize are 0-3, i.e. exact_log2(element 2140 // size in bytes). We do a simple bitwise binary search. 2141 __ BIND(L_copy_bytes); 2142 __ tbnz(r18_elsize, 1, L_copy_ints); 2143 __ tbnz(r18_elsize, 0, L_copy_shorts); 2144 __ lea(from, Address(src, src_pos));// src_addr 2145 __ lea(to, Address(dst, dst_pos));// dst_addr 2146 __ movw(count, scratch_length); // length 2147 __ b(RuntimeAddress(byte_copy_entry)); 2148 2149 __ BIND(L_copy_shorts); 2150 __ lea(from, Address(src, src_pos, Address::lsl(1)));// src_addr 2151 __ lea(to, Address(dst, dst_pos, Address::lsl(1)));// dst_addr 2152 __ movw(count, scratch_length); // length 2153 __ b(RuntimeAddress(short_copy_entry)); 2154 2155 __ BIND(L_copy_ints); 2156 __ tbnz(r18_elsize, 0, L_copy_longs); 2157 __ lea(from, Address(src, src_pos, Address::lsl(2)));// src_addr 2158 __ lea(to, Address(dst, dst_pos, Address::lsl(2)));// dst_addr 2159 __ movw(count, scratch_length); // length 2160 __ b(RuntimeAddress(int_copy_entry)); 2161 2162 __ BIND(L_copy_longs); 2163 #ifdef ASSERT 2164 { 2165 BLOCK_COMMENT("assert long copy {"); 2166 Label L; 2167 __ andw(lh, lh, Klass::_lh_log2_element_size_mask); // lh -> r18_elsize 2168 __ cmpw(r18_elsize, LogBytesPerLong); 2169 __ br(Assembler::EQ, L); 2170 __ stop("must be long copy, but elsize is wrong"); 2171 __ bind(L); 2172 BLOCK_COMMENT("} assert long copy done"); 2173 } 2174 #endif 2175 __ lea(from, Address(src, src_pos, Address::lsl(3)));// src_addr 2176 __ lea(to, Address(dst, dst_pos, Address::lsl(3)));// dst_addr 2177 __ movw(count, scratch_length); // length 2178 __ b(RuntimeAddress(long_copy_entry)); 2179 2180 // ObjArrayKlass 2181 __ BIND(L_objArray); 2182 // live at this point: scratch_src_klass, scratch_length, src[_pos], dst[_pos] 2183 2184 Label L_plain_copy, L_checkcast_copy; 2185 // test array classes for subtyping 2186 __ load_klass(r18, dst); 2187 __ cmp(scratch_src_klass, r18); // usual case is exact 
equality 2188 __ br(Assembler::NE, L_checkcast_copy); 2189 2190 // Identically typed arrays can be copied without element-wise checks. 2191 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2192 rscratch2, L_failed); 2193 2194 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2195 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2196 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2197 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2198 __ movw(count, scratch_length); // length 2199 __ BIND(L_plain_copy); 2200 __ b(RuntimeAddress(oop_copy_entry)); 2201 2202 __ BIND(L_checkcast_copy); 2203 // live at this point: scratch_src_klass, scratch_length, r18 (dst_klass) 2204 { 2205 // Before looking at dst.length, make sure dst is also an objArray. 2206 __ ldrw(rscratch1, Address(r18, lh_offset)); 2207 __ movw(rscratch2, objArray_lh); 2208 __ eorw(rscratch1, rscratch1, rscratch2); 2209 __ cbnzw(rscratch1, L_failed); 2210 2211 // It is safe to examine both src.length and dst.length. 2212 arraycopy_range_checks(src, src_pos, dst, dst_pos, scratch_length, 2213 r18, L_failed); 2214 2215 __ load_klass(dst_klass, dst); // reload 2216 2217 // Marshal the base address arguments now, freeing registers. 2218 __ lea(from, Address(src, src_pos, Address::lsl(LogBytesPerHeapOop))); 2219 __ add(from, from, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2220 __ lea(to, Address(dst, dst_pos, Address::lsl(LogBytesPerHeapOop))); 2221 __ add(to, to, arrayOopDesc::base_offset_in_bytes(T_OBJECT)); 2222 __ movw(count, length); // length (reloaded) 2223 Register sco_temp = c_rarg3; // this register is free now 2224 assert_different_registers(from, to, count, sco_temp, 2225 dst_klass, scratch_src_klass); 2226 // assert_clean_int(count, sco_temp); 2227 2228 // Generate the type check. 2229 const int sco_offset = in_bytes(Klass::super_check_offset_offset()); 2230 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2231 2232 // Smashes rscratch1, rscratch2 2233 generate_type_check(scratch_src_klass, sco_temp, dst_klass, L_plain_copy); 2234 2235 // Fetch destination element klass from the ObjArrayKlass header. 2236 int ek_offset = in_bytes(ObjArrayKlass::element_klass_offset()); 2237 __ ldr(dst_klass, Address(dst_klass, ek_offset)); 2238 __ ldrw(sco_temp, Address(dst_klass, sco_offset)); 2239 2240 // the checkcast_copy loop needs two extra arguments: 2241 assert(c_rarg3 == sco_temp, "#3 already in place"); 2242 // Set up arguments for checkcast_copy_entry. 2243 __ mov(c_rarg4, dst_klass); // dst.klass.element_klass 2244 __ b(RuntimeAddress(checkcast_copy_entry)); 2245 } 2246 2247 __ BIND(L_failed); 2248 __ mov(r0, -1); 2249 __ leave(); // required for proper stackwalking of RuntimeStub frame 2250 __ ret(lr); 2251 2252 return start; 2253 } 2254 2255 // 2256 // Generate stub for array fill. If "aligned" is true, the 2257 // "to" address is assumed to be heapword aligned. 
  //
  //  Arguments for generated stub:
  //    to:    c_rarg0
  //    value: c_rarg1
  //    count: c_rarg2 treated as signed
  //
  address generate_fill(BasicType t, bool aligned, const char *name) {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", name);
    address start = __ pc();

    BLOCK_COMMENT("Entry:");

    const Register to        = c_rarg0;  // destination array address
    const Register value     = c_rarg1;  // fill value
    const Register count     = c_rarg2;  // elements count

    const Register bz_base   = r10;      // base for block_zero routine
    const Register cnt_words = r11;      // temp register

    __ enter();

    Label L_fill_elements, L_exit1;

    int shift = -1;
    switch (t) {
      case T_BYTE:
        shift = 0;
        __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
        __ bfi(value, value, 8, 8);   // 8 bit -> 16 bit
        __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
        __ br(Assembler::LO, L_fill_elements);
        break;
      case T_SHORT:
        shift = 1;
        __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
        __ bfi(value, value, 16, 16); // 16 bit -> 32 bit
        __ br(Assembler::LO, L_fill_elements);
        break;
      case T_INT:
        shift = 2;
        __ cmpw(count, 8 >> shift); // Short arrays (< 8 bytes) fill by element
        __ br(Assembler::LO, L_fill_elements);
        break;
      default: ShouldNotReachHere();
    }

    // Align the 'to' address to an 8-byte boundary.
    Label L_skip_align1, L_skip_align2, L_skip_align4;
    if (!aligned) {
      switch (t) {
        case T_BYTE:
          // One byte misalignment happens only for byte arrays.
          __ tbz(to, 0, L_skip_align1);
          __ strb(value, Address(__ post(to, 1)));
          __ subw(count, count, 1);
          __ bind(L_skip_align1);
          // Fallthrough
        case T_SHORT:
          // Two bytes misalignment happens only for byte and short (char) arrays.
          __ tbz(to, 1, L_skip_align2);
          __ strh(value, Address(__ post(to, 2)));
          __ subw(count, count, 2 >> shift);
          __ bind(L_skip_align2);
          // Fallthrough
        case T_INT:
          // Align to 8 bytes, we know we are 4 byte aligned to start.
          __ tbz(to, 2, L_skip_align4);
          __ strw(value, Address(__ post(to, 4)));
          __ subw(count, count, 4 >> shift);
          __ bind(L_skip_align4);
          break;
        default: ShouldNotReachHere();
      }
    }

    //
    //  Fill large chunks
    //
    __ lsrw(cnt_words, count, 3 - shift); // number of words
    __ bfi(value, value, 32, 32);         // 32 bit -> 64 bit
    __ subw(count, count, cnt_words, Assembler::LSL, 3 - shift);
    if (UseBlockZeroing) {
      Label non_block_zeroing, rest;
      // If the fill value is zero we can use the fast zero_words().
      __ cbnz(value, non_block_zeroing);
      __ mov(bz_base, to);
      __ add(to, to, cnt_words, Assembler::LSL, LogBytesPerWord);
      __ zero_words(bz_base, cnt_words);
      __ b(rest);
      __ bind(non_block_zeroing);
      __ fill_words(to, cnt_words, value);
      __ bind(rest);
    } else {
      __ fill_words(to, cnt_words, value);
    }

    // Remaining count is less than 8 bytes. Fill it by a single store.
    // Note that the total length is no less than 8 bytes.
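    //
    // Worked example (added for clarity, T_BYTE fill of 13 bytes from an
    // aligned base): the word loop fills bytes [0, 8) and leaves count == 5;
    // 'to' is then advanced to the end of the array (offset 13) and the
    // single 8-byte store below writes bytes [5, 13), harmlessly re-writing
    // bytes [5, 8) with the same fill value.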
2357 if (t == T_BYTE || t == T_SHORT) { 2358 Label L_exit1; 2359 __ cbzw(count, L_exit1); 2360 __ add(to, to, count, Assembler::LSL, shift); // points to the end 2361 __ str(value, Address(to, -8)); // overwrite some elements 2362 __ bind(L_exit1); 2363 __ leave(); 2364 __ ret(lr); 2365 } 2366 2367 // Handle copies less than 8 bytes. 2368 Label L_fill_2, L_fill_4, L_exit2; 2369 __ bind(L_fill_elements); 2370 switch (t) { 2371 case T_BYTE: 2372 __ tbz(count, 0, L_fill_2); 2373 __ strb(value, Address(__ post(to, 1))); 2374 __ bind(L_fill_2); 2375 __ tbz(count, 1, L_fill_4); 2376 __ strh(value, Address(__ post(to, 2))); 2377 __ bind(L_fill_4); 2378 __ tbz(count, 2, L_exit2); 2379 __ strw(value, Address(to)); 2380 break; 2381 case T_SHORT: 2382 __ tbz(count, 0, L_fill_4); 2383 __ strh(value, Address(__ post(to, 2))); 2384 __ bind(L_fill_4); 2385 __ tbz(count, 1, L_exit2); 2386 __ strw(value, Address(to)); 2387 break; 2388 case T_INT: 2389 __ cbzw(count, L_exit2); 2390 __ strw(value, Address(to)); 2391 break; 2392 default: ShouldNotReachHere(); 2393 } 2394 __ bind(L_exit2); 2395 __ leave(); 2396 __ ret(lr); 2397 return start; 2398 } 2399 2400 void generate_arraycopy_stubs() { 2401 address entry; 2402 address entry_jbyte_arraycopy; 2403 address entry_jshort_arraycopy; 2404 address entry_jint_arraycopy; 2405 address entry_oop_arraycopy; 2406 address entry_jlong_arraycopy; 2407 address entry_checkcast_arraycopy; 2408 2409 generate_copy_longs(copy_f, r0, r1, rscratch2, copy_forwards); 2410 generate_copy_longs(copy_b, r0, r1, rscratch2, copy_backwards); 2411 2412 StubRoutines::aarch64::_zero_blocks = generate_zero_blocks(); 2413 2414 //*** jbyte 2415 // Always need aligned and unaligned versions 2416 StubRoutines::_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(false, &entry, 2417 "jbyte_disjoint_arraycopy"); 2418 StubRoutines::_jbyte_arraycopy = generate_conjoint_byte_copy(false, entry, 2419 &entry_jbyte_arraycopy, 2420 "jbyte_arraycopy"); 2421 StubRoutines::_arrayof_jbyte_disjoint_arraycopy = generate_disjoint_byte_copy(true, &entry, 2422 "arrayof_jbyte_disjoint_arraycopy"); 2423 StubRoutines::_arrayof_jbyte_arraycopy = generate_conjoint_byte_copy(true, entry, NULL, 2424 "arrayof_jbyte_arraycopy"); 2425 2426 //*** jshort 2427 // Always need aligned and unaligned versions 2428 StubRoutines::_jshort_disjoint_arraycopy = generate_disjoint_short_copy(false, &entry, 2429 "jshort_disjoint_arraycopy"); 2430 StubRoutines::_jshort_arraycopy = generate_conjoint_short_copy(false, entry, 2431 &entry_jshort_arraycopy, 2432 "jshort_arraycopy"); 2433 StubRoutines::_arrayof_jshort_disjoint_arraycopy = generate_disjoint_short_copy(true, &entry, 2434 "arrayof_jshort_disjoint_arraycopy"); 2435 StubRoutines::_arrayof_jshort_arraycopy = generate_conjoint_short_copy(true, entry, NULL, 2436 "arrayof_jshort_arraycopy"); 2437 2438 //*** jint 2439 // Aligned versions 2440 StubRoutines::_arrayof_jint_disjoint_arraycopy = generate_disjoint_int_copy(true, &entry, 2441 "arrayof_jint_disjoint_arraycopy"); 2442 StubRoutines::_arrayof_jint_arraycopy = generate_conjoint_int_copy(true, entry, &entry_jint_arraycopy, 2443 "arrayof_jint_arraycopy"); 2444 // In 64 bit we need both aligned and unaligned versions of jint arraycopy. 
2445 // entry_jint_arraycopy always points to the unaligned version 2446 StubRoutines::_jint_disjoint_arraycopy = generate_disjoint_int_copy(false, &entry, 2447 "jint_disjoint_arraycopy"); 2448 StubRoutines::_jint_arraycopy = generate_conjoint_int_copy(false, entry, 2449 &entry_jint_arraycopy, 2450 "jint_arraycopy"); 2451 2452 //*** jlong 2453 // It is always aligned 2454 StubRoutines::_arrayof_jlong_disjoint_arraycopy = generate_disjoint_long_copy(true, &entry, 2455 "arrayof_jlong_disjoint_arraycopy"); 2456 StubRoutines::_arrayof_jlong_arraycopy = generate_conjoint_long_copy(true, entry, &entry_jlong_arraycopy, 2457 "arrayof_jlong_arraycopy"); 2458 StubRoutines::_jlong_disjoint_arraycopy = StubRoutines::_arrayof_jlong_disjoint_arraycopy; 2459 StubRoutines::_jlong_arraycopy = StubRoutines::_arrayof_jlong_arraycopy; 2460 2461 //*** oops 2462 { 2463 // With compressed oops we need unaligned versions; notice that 2464 // we overwrite entry_oop_arraycopy. 2465 bool aligned = !UseCompressedOops; 2466 2467 StubRoutines::_arrayof_oop_disjoint_arraycopy 2468 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy", 2469 /*dest_uninitialized*/false); 2470 StubRoutines::_arrayof_oop_arraycopy 2471 = generate_conjoint_oop_copy(aligned, entry, &entry_oop_arraycopy, "arrayof_oop_arraycopy", 2472 /*dest_uninitialized*/false); 2473 // Aligned versions without pre-barriers 2474 StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit 2475 = generate_disjoint_oop_copy(aligned, &entry, "arrayof_oop_disjoint_arraycopy_uninit", 2476 /*dest_uninitialized*/true); 2477 StubRoutines::_arrayof_oop_arraycopy_uninit 2478 = generate_conjoint_oop_copy(aligned, entry, NULL, "arrayof_oop_arraycopy_uninit", 2479 /*dest_uninitialized*/true); 2480 } 2481 2482 StubRoutines::_oop_disjoint_arraycopy = StubRoutines::_arrayof_oop_disjoint_arraycopy; 2483 StubRoutines::_oop_arraycopy = StubRoutines::_arrayof_oop_arraycopy; 2484 StubRoutines::_oop_disjoint_arraycopy_uninit = StubRoutines::_arrayof_oop_disjoint_arraycopy_uninit; 2485 StubRoutines::_oop_arraycopy_uninit = StubRoutines::_arrayof_oop_arraycopy_uninit; 2486 2487 StubRoutines::_checkcast_arraycopy = generate_checkcast_copy("checkcast_arraycopy", &entry_checkcast_arraycopy); 2488 StubRoutines::_checkcast_arraycopy_uninit = generate_checkcast_copy("checkcast_arraycopy_uninit", NULL, 2489 /*dest_uninitialized*/true); 2490 2491 StubRoutines::_unsafe_arraycopy = generate_unsafe_copy("unsafe_arraycopy", 2492 entry_jbyte_arraycopy, 2493 entry_jshort_arraycopy, 2494 entry_jint_arraycopy, 2495 entry_jlong_arraycopy); 2496 2497 StubRoutines::_generic_arraycopy = generate_generic_copy("generic_arraycopy", 2498 entry_jbyte_arraycopy, 2499 entry_jshort_arraycopy, 2500 entry_jint_arraycopy, 2501 entry_oop_arraycopy, 2502 entry_jlong_arraycopy, 2503 entry_checkcast_arraycopy); 2504 2505 StubRoutines::_jbyte_fill = generate_fill(T_BYTE, false, "jbyte_fill"); 2506 StubRoutines::_jshort_fill = generate_fill(T_SHORT, false, "jshort_fill"); 2507 StubRoutines::_jint_fill = generate_fill(T_INT, false, "jint_fill"); 2508 StubRoutines::_arrayof_jbyte_fill = generate_fill(T_BYTE, true, "arrayof_jbyte_fill"); 2509 StubRoutines::_arrayof_jshort_fill = generate_fill(T_SHORT, true, "arrayof_jshort_fill"); 2510 StubRoutines::_arrayof_jint_fill = generate_fill(T_INT, true, "arrayof_jint_fill"); 2511 } 2512 2513 void generate_math_stubs() { Unimplemented(); } 2514 2515 // Arguments: 2516 // 2517 // Inputs: 2518 // c_rarg0 - source byte array address 2519 // c_rarg1 - destination 
byte array address 2520 // c_rarg2 - K (key) in little endian int array 2521 // 2522 address generate_aescrypt_encryptBlock() { 2523 __ align(CodeEntryAlignment); 2524 StubCodeMark mark(this, "StubRoutines", "aescrypt_encryptBlock"); 2525 2526 Label L_doLast; 2527 2528 const Register from = c_rarg0; // source array address 2529 const Register to = c_rarg1; // destination array address 2530 const Register key = c_rarg2; // key array address 2531 const Register keylen = rscratch1; 2532 2533 address start = __ pc(); 2534 __ enter(); 2535 2536 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2537 2538 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2539 2540 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2541 __ rev32(v1, __ T16B, v1); 2542 __ rev32(v2, __ T16B, v2); 2543 __ rev32(v3, __ T16B, v3); 2544 __ rev32(v4, __ T16B, v4); 2545 __ aese(v0, v1); 2546 __ aesmc(v0, v0); 2547 __ aese(v0, v2); 2548 __ aesmc(v0, v0); 2549 __ aese(v0, v3); 2550 __ aesmc(v0, v0); 2551 __ aese(v0, v4); 2552 __ aesmc(v0, v0); 2553 2554 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2555 __ rev32(v1, __ T16B, v1); 2556 __ rev32(v2, __ T16B, v2); 2557 __ rev32(v3, __ T16B, v3); 2558 __ rev32(v4, __ T16B, v4); 2559 __ aese(v0, v1); 2560 __ aesmc(v0, v0); 2561 __ aese(v0, v2); 2562 __ aesmc(v0, v0); 2563 __ aese(v0, v3); 2564 __ aesmc(v0, v0); 2565 __ aese(v0, v4); 2566 __ aesmc(v0, v0); 2567 2568 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2569 __ rev32(v1, __ T16B, v1); 2570 __ rev32(v2, __ T16B, v2); 2571 2572 __ cmpw(keylen, 44); 2573 __ br(Assembler::EQ, L_doLast); 2574 2575 __ aese(v0, v1); 2576 __ aesmc(v0, v0); 2577 __ aese(v0, v2); 2578 __ aesmc(v0, v0); 2579 2580 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2581 __ rev32(v1, __ T16B, v1); 2582 __ rev32(v2, __ T16B, v2); 2583 2584 __ cmpw(keylen, 52); 2585 __ br(Assembler::EQ, L_doLast); 2586 2587 __ aese(v0, v1); 2588 __ aesmc(v0, v0); 2589 __ aese(v0, v2); 2590 __ aesmc(v0, v0); 2591 2592 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2593 __ rev32(v1, __ T16B, v1); 2594 __ rev32(v2, __ T16B, v2); 2595 2596 __ BIND(L_doLast); 2597 2598 __ aese(v0, v1); 2599 __ aesmc(v0, v0); 2600 __ aese(v0, v2); 2601 2602 __ ld1(v1, __ T16B, key); 2603 __ rev32(v1, __ T16B, v1); 2604 __ eor(v0, __ T16B, v0, v1); 2605 2606 __ st1(v0, __ T16B, to); 2607 2608 __ mov(r0, 0); 2609 2610 __ leave(); 2611 __ ret(lr); 2612 2613 return start; 2614 } 2615 2616 // Arguments: 2617 // 2618 // Inputs: 2619 // c_rarg0 - source byte array address 2620 // c_rarg1 - destination byte array address 2621 // c_rarg2 - K (key) in little endian int array 2622 // 2623 address generate_aescrypt_decryptBlock() { 2624 assert(UseAES, "need AES instructions and misaligned SSE support"); 2625 __ align(CodeEntryAlignment); 2626 StubCodeMark mark(this, "StubRoutines", "aescrypt_decryptBlock"); 2627 Label L_doLast; 2628 2629 const Register from = c_rarg0; // source array address 2630 const Register to = c_rarg1; // destination array address 2631 const Register key = c_rarg2; // key array address 2632 const Register keylen = rscratch1; 2633 2634 address start = __ pc(); 2635 __ enter(); // required for proper stackwalking of RuntimeStub frame 2636 2637 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2638 2639 __ ld1(v0, __ T16B, from); // get 16 bytes of input 2640 2641 __ ld1(v5, __ T16B, __ post(key, 16)); 2642 __ rev32(v5, __ T16B, v5); 2643 2644 __ ld1(v1, v2, v3, v4, 
__ T16B, __ post(key, 64)); 2645 __ rev32(v1, __ T16B, v1); 2646 __ rev32(v2, __ T16B, v2); 2647 __ rev32(v3, __ T16B, v3); 2648 __ rev32(v4, __ T16B, v4); 2649 __ aesd(v0, v1); 2650 __ aesimc(v0, v0); 2651 __ aesd(v0, v2); 2652 __ aesimc(v0, v0); 2653 __ aesd(v0, v3); 2654 __ aesimc(v0, v0); 2655 __ aesd(v0, v4); 2656 __ aesimc(v0, v0); 2657 2658 __ ld1(v1, v2, v3, v4, __ T16B, __ post(key, 64)); 2659 __ rev32(v1, __ T16B, v1); 2660 __ rev32(v2, __ T16B, v2); 2661 __ rev32(v3, __ T16B, v3); 2662 __ rev32(v4, __ T16B, v4); 2663 __ aesd(v0, v1); 2664 __ aesimc(v0, v0); 2665 __ aesd(v0, v2); 2666 __ aesimc(v0, v0); 2667 __ aesd(v0, v3); 2668 __ aesimc(v0, v0); 2669 __ aesd(v0, v4); 2670 __ aesimc(v0, v0); 2671 2672 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2673 __ rev32(v1, __ T16B, v1); 2674 __ rev32(v2, __ T16B, v2); 2675 2676 __ cmpw(keylen, 44); 2677 __ br(Assembler::EQ, L_doLast); 2678 2679 __ aesd(v0, v1); 2680 __ aesimc(v0, v0); 2681 __ aesd(v0, v2); 2682 __ aesimc(v0, v0); 2683 2684 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2685 __ rev32(v1, __ T16B, v1); 2686 __ rev32(v2, __ T16B, v2); 2687 2688 __ cmpw(keylen, 52); 2689 __ br(Assembler::EQ, L_doLast); 2690 2691 __ aesd(v0, v1); 2692 __ aesimc(v0, v0); 2693 __ aesd(v0, v2); 2694 __ aesimc(v0, v0); 2695 2696 __ ld1(v1, v2, __ T16B, __ post(key, 32)); 2697 __ rev32(v1, __ T16B, v1); 2698 __ rev32(v2, __ T16B, v2); 2699 2700 __ BIND(L_doLast); 2701 2702 __ aesd(v0, v1); 2703 __ aesimc(v0, v0); 2704 __ aesd(v0, v2); 2705 2706 __ eor(v0, __ T16B, v0, v5); 2707 2708 __ st1(v0, __ T16B, to); 2709 2710 __ mov(r0, 0); 2711 2712 __ leave(); 2713 __ ret(lr); 2714 2715 return start; 2716 } 2717 2718 // Arguments: 2719 // 2720 // Inputs: 2721 // c_rarg0 - source byte array address 2722 // c_rarg1 - destination byte array address 2723 // c_rarg2 - K (key) in little endian int array 2724 // c_rarg3 - r vector byte array address 2725 // c_rarg4 - input length 2726 // 2727 // Output: 2728 // x0 - input length 2729 // 2730 address generate_cipherBlockChaining_encryptAESCrypt() { 2731 assert(UseAES, "need AES instructions and misaligned SSE support"); 2732 __ align(CodeEntryAlignment); 2733 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_encryptAESCrypt"); 2734 2735 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2736 2737 const Register from = c_rarg0; // source array address 2738 const Register to = c_rarg1; // destination array address 2739 const Register key = c_rarg2; // key array address 2740 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2741 // and left with the results of the last encryption block 2742 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2743 const Register keylen = rscratch1; 2744 2745 address start = __ pc(); 2746 2747 __ enter(); 2748 2749 __ movw(rscratch2, len_reg); 2750 2751 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2752 2753 __ ld1(v0, __ T16B, rvec); 2754 2755 __ cmpw(keylen, 52); 2756 __ br(Assembler::CC, L_loadkeys_44); 2757 __ br(Assembler::EQ, L_loadkeys_52); 2758 2759 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2760 __ rev32(v17, __ T16B, v17); 2761 __ rev32(v18, __ T16B, v18); 2762 __ BIND(L_loadkeys_52); 2763 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2764 __ rev32(v19, __ T16B, v19); 2765 __ rev32(v20, __ T16B, v20); 2766 __ BIND(L_loadkeys_44); 2767 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2768 __ rev32(v21, __ 
T16B, v21); 2769 __ rev32(v22, __ T16B, v22); 2770 __ rev32(v23, __ T16B, v23); 2771 __ rev32(v24, __ T16B, v24); 2772 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2773 __ rev32(v25, __ T16B, v25); 2774 __ rev32(v26, __ T16B, v26); 2775 __ rev32(v27, __ T16B, v27); 2776 __ rev32(v28, __ T16B, v28); 2777 __ ld1(v29, v30, v31, __ T16B, key); 2778 __ rev32(v29, __ T16B, v29); 2779 __ rev32(v30, __ T16B, v30); 2780 __ rev32(v31, __ T16B, v31); 2781 2782 __ BIND(L_aes_loop); 2783 __ ld1(v1, __ T16B, __ post(from, 16)); 2784 __ eor(v0, __ T16B, v0, v1); 2785 2786 __ br(Assembler::CC, L_rounds_44); 2787 __ br(Assembler::EQ, L_rounds_52); 2788 2789 __ aese(v0, v17); __ aesmc(v0, v0); 2790 __ aese(v0, v18); __ aesmc(v0, v0); 2791 __ BIND(L_rounds_52); 2792 __ aese(v0, v19); __ aesmc(v0, v0); 2793 __ aese(v0, v20); __ aesmc(v0, v0); 2794 __ BIND(L_rounds_44); 2795 __ aese(v0, v21); __ aesmc(v0, v0); 2796 __ aese(v0, v22); __ aesmc(v0, v0); 2797 __ aese(v0, v23); __ aesmc(v0, v0); 2798 __ aese(v0, v24); __ aesmc(v0, v0); 2799 __ aese(v0, v25); __ aesmc(v0, v0); 2800 __ aese(v0, v26); __ aesmc(v0, v0); 2801 __ aese(v0, v27); __ aesmc(v0, v0); 2802 __ aese(v0, v28); __ aesmc(v0, v0); 2803 __ aese(v0, v29); __ aesmc(v0, v0); 2804 __ aese(v0, v30); 2805 __ eor(v0, __ T16B, v0, v31); 2806 2807 __ st1(v0, __ T16B, __ post(to, 16)); 2808 2809 __ subw(len_reg, len_reg, 16); 2810 __ cbnzw(len_reg, L_aes_loop); 2811 2812 __ st1(v0, __ T16B, rvec); 2813 2814 __ mov(r0, rscratch2); 2815 2816 __ leave(); 2817 __ ret(lr); 2818 2819 return start; 2820 } 2821 2822 // Arguments: 2823 // 2824 // Inputs: 2825 // c_rarg0 - source byte array address 2826 // c_rarg1 - destination byte array address 2827 // c_rarg2 - K (key) in little endian int array 2828 // c_rarg3 - r vector byte array address 2829 // c_rarg4 - input length 2830 // 2831 // Output: 2832 // r0 - input length 2833 // 2834 address generate_cipherBlockChaining_decryptAESCrypt() { 2835 assert(UseAES, "need AES instructions and misaligned SSE support"); 2836 __ align(CodeEntryAlignment); 2837 StubCodeMark mark(this, "StubRoutines", "cipherBlockChaining_decryptAESCrypt"); 2838 2839 Label L_loadkeys_44, L_loadkeys_52, L_aes_loop, L_rounds_44, L_rounds_52; 2840 2841 const Register from = c_rarg0; // source array address 2842 const Register to = c_rarg1; // destination array address 2843 const Register key = c_rarg2; // key array address 2844 const Register rvec = c_rarg3; // r byte array initialized from initvector array address 2845 // and left with the results of the last encryption block 2846 const Register len_reg = c_rarg4; // src len (must be multiple of blocksize 16) 2847 const Register keylen = rscratch1; 2848 2849 address start = __ pc(); 2850 2851 __ enter(); 2852 2853 __ movw(rscratch2, len_reg); 2854 2855 __ ldrw(keylen, Address(key, arrayOopDesc::length_offset_in_bytes() - arrayOopDesc::base_offset_in_bytes(T_INT))); 2856 2857 __ ld1(v2, __ T16B, rvec); 2858 2859 __ ld1(v31, __ T16B, __ post(key, 16)); 2860 __ rev32(v31, __ T16B, v31); 2861 2862 __ cmpw(keylen, 52); 2863 __ br(Assembler::CC, L_loadkeys_44); 2864 __ br(Assembler::EQ, L_loadkeys_52); 2865 2866 __ ld1(v17, v18, __ T16B, __ post(key, 32)); 2867 __ rev32(v17, __ T16B, v17); 2868 __ rev32(v18, __ T16B, v18); 2869 __ BIND(L_loadkeys_52); 2870 __ ld1(v19, v20, __ T16B, __ post(key, 32)); 2871 __ rev32(v19, __ T16B, v19); 2872 __ rev32(v20, __ T16B, v20); 2873 __ BIND(L_loadkeys_44); 2874 __ ld1(v21, v22, v23, v24, __ T16B, __ post(key, 64)); 2875 __ rev32(v21, __ T16B, v21); 2876 
__ rev32(v22, __ T16B, v22); 2877 __ rev32(v23, __ T16B, v23); 2878 __ rev32(v24, __ T16B, v24); 2879 __ ld1(v25, v26, v27, v28, __ T16B, __ post(key, 64)); 2880 __ rev32(v25, __ T16B, v25); 2881 __ rev32(v26, __ T16B, v26); 2882 __ rev32(v27, __ T16B, v27); 2883 __ rev32(v28, __ T16B, v28); 2884 __ ld1(v29, v30, __ T16B, key); 2885 __ rev32(v29, __ T16B, v29); 2886 __ rev32(v30, __ T16B, v30); 2887 2888 __ BIND(L_aes_loop); 2889 __ ld1(v0, __ T16B, __ post(from, 16)); 2890 __ orr(v1, __ T16B, v0, v0); 2891 2892 __ br(Assembler::CC, L_rounds_44); 2893 __ br(Assembler::EQ, L_rounds_52); 2894 2895 __ aesd(v0, v17); __ aesimc(v0, v0); 2896 __ aesd(v0, v18); __ aesimc(v0, v0); 2897 __ BIND(L_rounds_52); 2898 __ aesd(v0, v19); __ aesimc(v0, v0); 2899 __ aesd(v0, v20); __ aesimc(v0, v0); 2900 __ BIND(L_rounds_44); 2901 __ aesd(v0, v21); __ aesimc(v0, v0); 2902 __ aesd(v0, v22); __ aesimc(v0, v0); 2903 __ aesd(v0, v23); __ aesimc(v0, v0); 2904 __ aesd(v0, v24); __ aesimc(v0, v0); 2905 __ aesd(v0, v25); __ aesimc(v0, v0); 2906 __ aesd(v0, v26); __ aesimc(v0, v0); 2907 __ aesd(v0, v27); __ aesimc(v0, v0); 2908 __ aesd(v0, v28); __ aesimc(v0, v0); 2909 __ aesd(v0, v29); __ aesimc(v0, v0); 2910 __ aesd(v0, v30); 2911 __ eor(v0, __ T16B, v0, v31); 2912 __ eor(v0, __ T16B, v0, v2); 2913 2914 __ st1(v0, __ T16B, __ post(to, 16)); 2915 __ orr(v2, __ T16B, v1, v1); 2916 2917 __ subw(len_reg, len_reg, 16); 2918 __ cbnzw(len_reg, L_aes_loop); 2919 2920 __ st1(v2, __ T16B, rvec); 2921 2922 __ mov(r0, rscratch2); 2923 2924 __ leave(); 2925 __ ret(lr); 2926 2927 return start; 2928 } 2929 2930 // Arguments: 2931 // 2932 // Inputs: 2933 // c_rarg0 - byte[] source+offset 2934 // c_rarg1 - int[] SHA.state 2935 // c_rarg2 - int offset 2936 // c_rarg3 - int limit 2937 // 2938 address generate_sha1_implCompress(bool multi_block, const char *name) { 2939 __ align(CodeEntryAlignment); 2940 StubCodeMark mark(this, "StubRoutines", name); 2941 address start = __ pc(); 2942 2943 Register buf = c_rarg0; 2944 Register state = c_rarg1; 2945 Register ofs = c_rarg2; 2946 Register limit = c_rarg3; 2947 2948 Label keys; 2949 Label sha1_loop; 2950 2951 // load the keys into v0..v3 2952 __ adr(rscratch1, keys); 2953 __ ld4r(v0, v1, v2, v3, __ T4S, Address(rscratch1)); 2954 // load 5 words state into v6, v7 2955 __ ldrq(v6, Address(state, 0)); 2956 __ ldrs(v7, Address(state, 16)); 2957 2958 2959 __ BIND(sha1_loop); 2960 // load 64 bytes of data into v16..v19 2961 __ ld1(v16, v17, v18, v19, __ T4S, multi_block ? __ post(buf, 64) : buf); 2962 __ rev32(v16, __ T16B, v16); 2963 __ rev32(v17, __ T16B, v17); 2964 __ rev32(v18, __ T16B, v18); 2965 __ rev32(v19, __ T16B, v19); 2966 2967 // do the sha1 2968 __ addv(v4, __ T4S, v16, v0); 2969 __ orr(v20, __ T16B, v6, v6); 2970 2971 FloatRegister d0 = v16; 2972 FloatRegister d1 = v17; 2973 FloatRegister d2 = v18; 2974 FloatRegister d3 = v19; 2975 2976 for (int round = 0; round < 20; round++) { 2977 FloatRegister tmp1 = (round & 1) ? v4 : v5; 2978 FloatRegister tmp2 = (round & 1) ? v21 : v22; 2979 FloatRegister tmp3 = round ? ((round & 1) ? v22 : v21) : v7; 2980 FloatRegister tmp4 = (round & 1) ? v5 : v4; 2981 FloatRegister key = (round < 4) ? v0 : ((round < 9) ? v1 : ((round < 14) ? 
v2 : v3)); 2982 2983 if (round < 16) __ sha1su0(d0, __ T4S, d1, d2); 2984 if (round < 19) __ addv(tmp1, __ T4S, d1, key); 2985 __ sha1h(tmp2, __ T4S, v20); 2986 if (round < 5) 2987 __ sha1c(v20, __ T4S, tmp3, tmp4); 2988 else if (round < 10 || round >= 15) 2989 __ sha1p(v20, __ T4S, tmp3, tmp4); 2990 else 2991 __ sha1m(v20, __ T4S, tmp3, tmp4); 2992 if (round < 16) __ sha1su1(d0, __ T4S, d3); 2993 2994 tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1; 2995 } 2996 2997 __ addv(v7, __ T2S, v7, v21); 2998 __ addv(v6, __ T4S, v6, v20); 2999 3000 if (multi_block) { 3001 __ add(ofs, ofs, 64); 3002 __ cmp(ofs, limit); 3003 __ br(Assembler::LE, sha1_loop); 3004 __ mov(c_rarg0, ofs); // return ofs 3005 } 3006 3007 __ strq(v6, Address(state, 0)); 3008 __ strs(v7, Address(state, 16)); 3009 3010 __ ret(lr); 3011 3012 __ bind(keys); 3013 __ emit_int32(0x5a827999); 3014 __ emit_int32(0x6ed9eba1); 3015 __ emit_int32(0x8f1bbcdc); 3016 __ emit_int32(0xca62c1d6); 3017 3018 return start; 3019 } 3020 3021 3022 // Arguments: 3023 // 3024 // Inputs: 3025 // c_rarg0 - byte[] source+offset 3026 // c_rarg1 - int[] SHA.state 3027 // c_rarg2 - int offset 3028 // c_rarg3 - int limit 3029 // 3030 address generate_sha256_implCompress(bool multi_block, const char *name) { 3031 static const uint32_t round_consts[64] = { 3032 0x428a2f98, 0x71374491, 0xb5c0fbcf, 0xe9b5dba5, 3033 0x3956c25b, 0x59f111f1, 0x923f82a4, 0xab1c5ed5, 3034 0xd807aa98, 0x12835b01, 0x243185be, 0x550c7dc3, 3035 0x72be5d74, 0x80deb1fe, 0x9bdc06a7, 0xc19bf174, 3036 0xe49b69c1, 0xefbe4786, 0x0fc19dc6, 0x240ca1cc, 3037 0x2de92c6f, 0x4a7484aa, 0x5cb0a9dc, 0x76f988da, 3038 0x983e5152, 0xa831c66d, 0xb00327c8, 0xbf597fc7, 3039 0xc6e00bf3, 0xd5a79147, 0x06ca6351, 0x14292967, 3040 0x27b70a85, 0x2e1b2138, 0x4d2c6dfc, 0x53380d13, 3041 0x650a7354, 0x766a0abb, 0x81c2c92e, 0x92722c85, 3042 0xa2bfe8a1, 0xa81a664b, 0xc24b8b70, 0xc76c51a3, 3043 0xd192e819, 0xd6990624, 0xf40e3585, 0x106aa070, 3044 0x19a4c116, 0x1e376c08, 0x2748774c, 0x34b0bcb5, 3045 0x391c0cb3, 0x4ed8aa4a, 0x5b9cca4f, 0x682e6ff3, 3046 0x748f82ee, 0x78a5636f, 0x84c87814, 0x8cc70208, 3047 0x90befffa, 0xa4506ceb, 0xbef9a3f7, 0xc67178f2, 3048 }; 3049 __ align(CodeEntryAlignment); 3050 StubCodeMark mark(this, "StubRoutines", name); 3051 address start = __ pc(); 3052 3053 Register buf = c_rarg0; 3054 Register state = c_rarg1; 3055 Register ofs = c_rarg2; 3056 Register limit = c_rarg3; 3057 3058 Label sha1_loop; 3059 3060 __ stpd(v8, v9, __ pre(sp, -32)); 3061 __ stpd(v10, v11, Address(sp, 16)); 3062 3063 // dga == v0 3064 // dgb == v1 3065 // dg0 == v2 3066 // dg1 == v3 3067 // dg2 == v4 3068 // t0 == v6 3069 // t1 == v7 3070 3071 // load 16 keys to v16..v31 3072 __ lea(rscratch1, ExternalAddress((address)round_consts)); 3073 __ ld1(v16, v17, v18, v19, __ T4S, __ post(rscratch1, 64)); 3074 __ ld1(v20, v21, v22, v23, __ T4S, __ post(rscratch1, 64)); 3075 __ ld1(v24, v25, v26, v27, __ T4S, __ post(rscratch1, 64)); 3076 __ ld1(v28, v29, v30, v31, __ T4S, rscratch1); 3077 3078 // load 8 words (256 bits) state 3079 __ ldpq(v0, v1, state); 3080 3081 __ BIND(sha1_loop); 3082 // load 64 bytes of data into v8..v11 3083 __ ld1(v8, v9, v10, v11, __ T4S, multi_block ? 
__ post(buf, 64) : buf);
    __ rev32(v8,  __ T16B, v8);
    __ rev32(v9,  __ T16B, v9);
    __ rev32(v10, __ T16B, v10);
    __ rev32(v11, __ T16B, v11);

    __ addv(v6, __ T4S, v8, v16);
    __ orr(v2, __ T16B, v0, v0);
    __ orr(v3, __ T16B, v1, v1);

    FloatRegister d0 = v8;
    FloatRegister d1 = v9;
    FloatRegister d2 = v10;
    FloatRegister d3 = v11;

    for (int round = 0; round < 16; round++) {
      FloatRegister tmp1 = (round & 1) ? v6 : v7;
      FloatRegister tmp2 = (round & 1) ? v7 : v6;
      FloatRegister tmp3 = (round & 1) ? v2 : v4;
      FloatRegister tmp4 = (round & 1) ? v4 : v2;

      if (round < 12) __ sha256su0(d0, __ T4S, d1);
      __ orr(v4, __ T16B, v2, v2);
      if (round < 15)
        __ addv(tmp1, __ T4S, d1, as_FloatRegister(round + 17));
      __ sha256h(v2, __ T4S, v3, tmp2);
      __ sha256h2(v3, __ T4S, v4, tmp2);
      if (round < 12) __ sha256su1(d0, __ T4S, d2, d3);

      tmp1 = d0; d0 = d1; d1 = d2; d2 = d3; d3 = tmp1;
    }

    __ addv(v0, __ T4S, v0, v2);
    __ addv(v1, __ T4S, v1, v3);

    if (multi_block) {
      __ add(ofs, ofs, 64);
      __ cmp(ofs, limit);
      __ br(Assembler::LE, sha1_loop);
      __ mov(c_rarg0, ofs); // return ofs
    }

    __ ldpd(v10, v11, Address(sp, 16));
    __ ldpd(v8, v9, __ post(sp, 32));

    __ stpq(v0, v1, state);

    __ ret(lr);

    return start;
  }

#ifndef BUILTIN_SIM
  // Safefetch stubs.
  void generate_safefetch(const char* name, int size, address* entry,
                          address* fault_pc, address* continuation_pc) {
    // safefetch signatures:
    //   int      SafeFetch32(int*      adr, int      errValue);
    //   intptr_t SafeFetchN (intptr_t* adr, intptr_t errValue);
    //
    // arguments:
    //   c_rarg0 = adr
    //   c_rarg1 = errValue
    //
    // result:
    //   r0 = *adr or errValue

    StubCodeMark mark(this, "StubRoutines", name);

    // Entry point, pc or function descriptor.
    *entry = __ pc();

    // Load *adr into c_rarg1, may fault.
    *fault_pc = __ pc();
    switch (size) {
      case 4:
        // int32_t
        __ ldrw(c_rarg1, Address(c_rarg0, 0));
        break;
      case 8:
        // int64_t
        __ ldr(c_rarg1, Address(c_rarg0, 0));
        break;
      default:
        ShouldNotReachHere();
    }

    // return errValue or *adr
    *continuation_pc = __ pc();
    __ mov(r0, c_rarg1);
    __ ret(lr);
  }
#endif

  /**
   *  Arguments:
   *
   * Inputs:
   *   c_rarg0   - int crc
   *   c_rarg1   - byte* buf
   *   c_rarg2   - int length
   *
   * Output:
   *       r0    - int crc result
   */
  address generate_updateBytesCRC32() {
    assert(UseCRC32Intrinsics, "what are we doing here?");

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32");

    address start = __ pc();

    const Register crc    = c_rarg0;  // crc
    const Register buf    = c_rarg1;  // source java byte array address
    const Register len    = c_rarg2;  // length
    const Register table0 = c_rarg3;  // crc_table address
    const Register table1 = c_rarg4;
    const Register table2 = c_rarg5;
    const Register table3 = c_rarg6;
    const Register tmp3   = c_rarg7;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    __ kernel_crc32(crc, buf, len,
                    table0, table1, table2, table3, rscratch1, rscratch2, tmp3);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }

  /**
   *  Arguments:
   *
   * Inputs:
   *   c_rarg0   - int crc
   *   c_rarg1   - byte* buf
   *   c_rarg2   - int length
   *   c_rarg3   - int* table
   *
   * Output:
   *       r0    - int crc result
   */
  address generate_updateBytesCRC32C() {
    assert(UseCRC32CIntrinsics, "what are we doing here?");

    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "updateBytesCRC32C");

    address start = __ pc();

    const Register crc    = c_rarg0;  // crc
    const Register buf    = c_rarg1;  // source java byte array address
    const Register len    = c_rarg2;  // length
    const Register table0 = c_rarg3;  // crc_table address
    const Register table1 = c_rarg4;
    const Register table2 = c_rarg5;
    const Register table3 = c_rarg6;
    const Register tmp3   = c_rarg7;

    BLOCK_COMMENT("Entry:");
    __ enter(); // required for proper stackwalking of RuntimeStub frame

    __ kernel_crc32c(crc, buf, len,
                     table0, table1, table2, table3, rscratch1, rscratch2, tmp3);

    __ leave(); // required for proper stackwalking of RuntimeStub frame
    __ ret(lr);

    return start;
  }

  /**
   *  Arguments:
   *
   * Inputs:
   *   c_rarg0   - int adler
   *   c_rarg1   - byte* buff
   *   c_rarg2   - int len
   *
   * Output:
   *   c_rarg0   - int adler result
   */
  address generate_updateBytesAdler32() {
    __ align(CodeEntryAlignment);
    StubCodeMark mark(this, "StubRoutines", "updateBytesAdler32");
    address start = __ pc();

    Label L_simple_by1_loop, L_nmax, L_nmax_loop, L_by16, L_by16_loop, L_by1_loop, L_do_mod, L_combine, L_by1;

    // Aliases
    Register adler  = c_rarg0;
    Register s1     = c_rarg0;
    Register s2     = c_rarg3;
    Register buff   = c_rarg1;
    Register len    = c_rarg2;
    Register nmax   = r4;
    Register base   = r5;
    Register count  = r6;
    Register temp0  = rscratch1;
    Register temp1  = rscratch2;
    FloatRegister vbytes =
v0; 3289 FloatRegister vs1acc = v1; 3290 FloatRegister vs2acc = v2; 3291 FloatRegister vtable = v3; 3292 3293 // Max number of bytes we can process before having to take the mod 3294 // 0x15B0 is 5552 in decimal, the largest n such that 255n(n+1)/2 + (n+1)(BASE-1) <= 2^32-1 3295 unsigned long BASE = 0xfff1; 3296 unsigned long NMAX = 0x15B0; 3297 3298 __ mov(base, BASE); 3299 __ mov(nmax, NMAX); 3300 3301 // Load accumulation coefficients for the upper 16 bits 3302 __ lea(temp0, ExternalAddress((address) StubRoutines::aarch64::_adler_table)); 3303 __ ld1(vtable, __ T16B, Address(temp0)); 3304 3305 // s1 is initialized to the lower 16 bits of adler 3306 // s2 is initialized to the upper 16 bits of adler 3307 __ ubfx(s2, adler, 16, 16); // s2 = ((adler >> 16) & 0xffff) 3308 __ uxth(s1, adler); // s1 = (adler & 0xffff) 3309 3310 // The pipelined loop needs at least 16 elements for 1 iteration 3311 // It does check this, but it is more effective to skip to the cleanup loop 3312 __ cmp(len, (u1)16); 3313 __ br(Assembler::HS, L_nmax); 3314 __ cbz(len, L_combine); 3315 3316 __ bind(L_simple_by1_loop); 3317 __ ldrb(temp0, Address(__ post(buff, 1))); 3318 __ add(s1, s1, temp0); 3319 __ add(s2, s2, s1); 3320 __ subs(len, len, 1); 3321 __ br(Assembler::HI, L_simple_by1_loop); 3322 3323 // s1 = s1 % BASE 3324 __ subs(temp0, s1, base); 3325 __ csel(s1, temp0, s1, Assembler::HS); 3326 3327 // s2 = s2 % BASE 3328 __ lsr(temp0, s2, 16); 3329 __ lsl(temp1, temp0, 4); 3330 __ sub(temp1, temp1, temp0); 3331 __ add(s2, temp1, s2, ext::uxth); 3332 3333 __ subs(temp0, s2, base); 3334 __ csel(s2, temp0, s2, Assembler::HS); 3335 3336 __ b(L_combine); 3337 3338 __ bind(L_nmax); 3339 __ subs(len, len, nmax); 3340 __ sub(count, nmax, 16); 3341 __ br(Assembler::LO, L_by16); 3342 3343 __ bind(L_nmax_loop); 3344 3345 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 3346 vbytes, vs1acc, vs2acc, vtable); 3347 3348 __ subs(count, count, 16); 3349 __ br(Assembler::HS, L_nmax_loop); 3350 3351 // s1 = s1 % BASE 3352 __ lsr(temp0, s1, 16); 3353 __ lsl(temp1, temp0, 4); 3354 __ sub(temp1, temp1, temp0); 3355 __ add(temp1, temp1, s1, ext::uxth); 3356 3357 __ lsr(temp0, temp1, 16); 3358 __ lsl(s1, temp0, 4); 3359 __ sub(s1, s1, temp0); 3360 __ add(s1, s1, temp1, ext:: uxth); 3361 3362 __ subs(temp0, s1, base); 3363 __ csel(s1, temp0, s1, Assembler::HS); 3364 3365 // s2 = s2 % BASE 3366 __ lsr(temp0, s2, 16); 3367 __ lsl(temp1, temp0, 4); 3368 __ sub(temp1, temp1, temp0); 3369 __ add(temp1, temp1, s2, ext::uxth); 3370 3371 __ lsr(temp0, temp1, 16); 3372 __ lsl(s2, temp0, 4); 3373 __ sub(s2, s2, temp0); 3374 __ add(s2, s2, temp1, ext:: uxth); 3375 3376 __ subs(temp0, s2, base); 3377 __ csel(s2, temp0, s2, Assembler::HS); 3378 3379 __ subs(len, len, nmax); 3380 __ sub(count, nmax, 16); 3381 __ br(Assembler::HS, L_nmax_loop); 3382 3383 __ bind(L_by16); 3384 __ adds(len, len, count); 3385 __ br(Assembler::LO, L_by1); 3386 3387 __ bind(L_by16_loop); 3388 3389 generate_updateBytesAdler32_accum(s1, s2, buff, temp0, temp1, 3390 vbytes, vs1acc, vs2acc, vtable); 3391 3392 __ subs(len, len, 16); 3393 __ br(Assembler::HS, L_by16_loop); 3394 3395 __ bind(L_by1); 3396 __ adds(len, len, 15); 3397 __ br(Assembler::LO, L_do_mod); 3398 3399 __ bind(L_by1_loop); 3400 __ ldrb(temp0, Address(__ post(buff, 1))); 3401 __ add(s1, temp0, s1); 3402 __ add(s2, s2, s1); 3403 __ subs(len, len, 1); 3404 __ br(Assembler::HS, L_by1_loop); 3405 3406 __ bind(L_do_mod); 3407 // s1 = s1 % BASE 3408 __ lsr(temp0, s1, 16); 3409 __ lsl(temp1, temp0, 4); 
3410 __ sub(temp1, temp1, temp0);
3411 __ add(temp1, temp1, s1, ext::uxth);
3412
3413 __ lsr(temp0, temp1, 16);
3414 __ lsl(s1, temp0, 4);
3415 __ sub(s1, s1, temp0);
3416 __ add(s1, s1, temp1, ext::uxth);
3417
3418 __ subs(temp0, s1, base);
3419 __ csel(s1, temp0, s1, Assembler::HS);
3420
3421 // s2 = s2 % BASE
3422 __ lsr(temp0, s2, 16);
3423 __ lsl(temp1, temp0, 4);
3424 __ sub(temp1, temp1, temp0);
3425 __ add(temp1, temp1, s2, ext::uxth);
3426
3427 __ lsr(temp0, temp1, 16);
3428 __ lsl(s2, temp0, 4);
3429 __ sub(s2, s2, temp0);
3430 __ add(s2, s2, temp1, ext::uxth);
3431
3432 __ subs(temp0, s2, base);
3433 __ csel(s2, temp0, s2, Assembler::HS);
3434
3435 // Combine lower bits and higher bits
3436 __ bind(L_combine);
3437 __ orr(s1, s1, s2, Assembler::LSL, 16); // adler = s1 | (s2 << 16)
3438
3439 __ ret(lr);
3440
3441 return start;
3442 }
3443
3444 void generate_updateBytesAdler32_accum(Register s1, Register s2, Register buff,
3445 Register temp0, Register temp1, FloatRegister vbytes,
3446 FloatRegister vs1acc, FloatRegister vs2acc, FloatRegister vtable) {
3447 // Below is a vectorized implementation of updating s1 and s2 for 16 bytes.
3448 // We use b1, b2, ..., b16 to denote the 16 bytes loaded in each iteration.
3449 // In non-vectorized code, we update s1 and s2 as:
3450 // s1 <- s1 + b1
3451 // s2 <- s2 + s1
3452 // s1 <- s1 + b2
3453 // s2 <- s2 + s1
3454 // ...
3455 // s1 <- s1 + b16
3456 // s2 <- s2 + s1
3457 // Putting the above assignments together, we have:
3458 // s1_new = s1 + b1 + b2 + ... + b16
3459 // s2_new = s2 + (s1 + b1) + (s1 + b1 + b2) + ... + (s1 + b1 + b2 + ... + b16)
3460 // = s2 + s1 * 16 + (b1 * 16 + b2 * 15 + ... + b16 * 1)
3461 // = s2 + s1 * 16 + (b1, b2, ... b16) dot (16, 15, ... 1)
3462 __ ld1(vbytes, __ T16B, Address(__ post(buff, 16)));
3463
3464 // s2 = s2 + s1 * 16
3465 __ add(s2, s2, s1, Assembler::LSL, 4);
3466
3467 // vs1acc = b1 + b2 + b3 + ... + b16
3468 // vs2acc = (b1 * 16) + (b2 * 15) + (b3 * 14) + ...
+ (b16 * 1) 3469 __ umullv(vs2acc, __ T8B, vtable, vbytes); 3470 __ umlalv(vs2acc, __ T16B, vtable, vbytes); 3471 __ uaddlv(vs1acc, __ T16B, vbytes); 3472 __ uaddlv(vs2acc, __ T8H, vs2acc); 3473 3474 // s1 = s1 + vs1acc, s2 = s2 + vs2acc 3475 __ fmovd(temp0, vs1acc); 3476 __ fmovd(temp1, vs2acc); 3477 __ add(s1, s1, temp0); 3478 __ add(s2, s2, temp1); 3479 } 3480 3481 /** 3482 * Arguments: 3483 * 3484 * Input: 3485 * c_rarg0 - x address 3486 * c_rarg1 - x length 3487 * c_rarg2 - y address 3488 * c_rarg3 - y lenth 3489 * c_rarg4 - z address 3490 * c_rarg5 - z length 3491 */ 3492 address generate_multiplyToLen() { 3493 __ align(CodeEntryAlignment); 3494 StubCodeMark mark(this, "StubRoutines", "multiplyToLen"); 3495 3496 address start = __ pc(); 3497 const Register x = r0; 3498 const Register xlen = r1; 3499 const Register y = r2; 3500 const Register ylen = r3; 3501 const Register z = r4; 3502 const Register zlen = r5; 3503 3504 const Register tmp1 = r10; 3505 const Register tmp2 = r11; 3506 const Register tmp3 = r12; 3507 const Register tmp4 = r13; 3508 const Register tmp5 = r14; 3509 const Register tmp6 = r15; 3510 const Register tmp7 = r16; 3511 3512 BLOCK_COMMENT("Entry:"); 3513 __ enter(); // required for proper stackwalking of RuntimeStub frame 3514 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 3515 __ leave(); // required for proper stackwalking of RuntimeStub frame 3516 __ ret(lr); 3517 3518 return start; 3519 } 3520 3521 address generate_squareToLen() { 3522 // squareToLen algorithm for sizes 1..127 described in java code works 3523 // faster than multiply_to_len on some CPUs and slower on others, but 3524 // multiply_to_len shows a bit better overall results 3525 __ align(CodeEntryAlignment); 3526 StubCodeMark mark(this, "StubRoutines", "squareToLen"); 3527 address start = __ pc(); 3528 3529 const Register x = r0; 3530 const Register xlen = r1; 3531 const Register z = r2; 3532 const Register zlen = r3; 3533 const Register y = r4; // == x 3534 const Register ylen = r5; // == xlen 3535 3536 const Register tmp1 = r10; 3537 const Register tmp2 = r11; 3538 const Register tmp3 = r12; 3539 const Register tmp4 = r13; 3540 const Register tmp5 = r14; 3541 const Register tmp6 = r15; 3542 const Register tmp7 = r16; 3543 3544 RegSet spilled_regs = RegSet::of(y, ylen); 3545 BLOCK_COMMENT("Entry:"); 3546 __ enter(); 3547 __ push(spilled_regs, sp); 3548 __ mov(y, x); 3549 __ mov(ylen, xlen); 3550 __ multiply_to_len(x, xlen, y, ylen, z, zlen, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7); 3551 __ pop(spilled_regs, sp); 3552 __ leave(); 3553 __ ret(lr); 3554 return start; 3555 } 3556 3557 address generate_mulAdd() { 3558 __ align(CodeEntryAlignment); 3559 StubCodeMark mark(this, "StubRoutines", "mulAdd"); 3560 3561 address start = __ pc(); 3562 3563 const Register out = r0; 3564 const Register in = r1; 3565 const Register offset = r2; 3566 const Register len = r3; 3567 const Register k = r4; 3568 3569 BLOCK_COMMENT("Entry:"); 3570 __ enter(); 3571 __ mul_add(out, in, offset, len, k); 3572 __ leave(); 3573 __ ret(lr); 3574 3575 return start; 3576 } 3577 3578 void ghash_multiply(FloatRegister result_lo, FloatRegister result_hi, 3579 FloatRegister a, FloatRegister b, FloatRegister a1_xor_a0, 3580 FloatRegister tmp1, FloatRegister tmp2, FloatRegister tmp3, FloatRegister tmp4) { 3581 // Karatsuba multiplication performs a 128*128 -> 256-bit 3582 // multiplication in three 128-bit multiplications and a few 3583 // additions. 
3584 // 3585 // (C1:C0) = A1*B1, (D1:D0) = A0*B0, (E1:E0) = (A0+A1)(B0+B1) 3586 // (A1:A0)(B1:B0) = C1:(C0+C1+D1+E1):(D1+C0+D0+E0):D0 3587 // 3588 // Inputs: 3589 // 3590 // A0 in a.d[0] (subkey) 3591 // A1 in a.d[1] 3592 // (A1+A0) in a1_xor_a0.d[0] 3593 // 3594 // B0 in b.d[0] (state) 3595 // B1 in b.d[1] 3596 3597 __ ext(tmp1, __ T16B, b, b, 0x08); 3598 __ pmull2(result_hi, __ T1Q, b, a, __ T2D); // A1*B1 3599 __ eor(tmp1, __ T16B, tmp1, b); // (B1+B0) 3600 __ pmull(result_lo, __ T1Q, b, a, __ T1D); // A0*B0 3601 __ pmull(tmp2, __ T1Q, tmp1, a1_xor_a0, __ T1D); // (A1+A0)(B1+B0) 3602 3603 __ ext(tmp4, __ T16B, result_lo, result_hi, 0x08); 3604 __ eor(tmp3, __ T16B, result_hi, result_lo); // A1*B1+A0*B0 3605 __ eor(tmp2, __ T16B, tmp2, tmp4); 3606 __ eor(tmp2, __ T16B, tmp2, tmp3); 3607 3608 // Register pair <result_hi:result_lo> holds the result of carry-less multiplication 3609 __ ins(result_hi, __ D, tmp2, 0, 1); 3610 __ ins(result_lo, __ D, tmp2, 1, 0); 3611 } 3612 3613 void ghash_reduce(FloatRegister result, FloatRegister lo, FloatRegister hi, 3614 FloatRegister p, FloatRegister z, FloatRegister t1) { 3615 const FloatRegister t0 = result; 3616 3617 // The GCM field polynomial f is z^128 + p(z), where p = 3618 // z^7+z^2+z+1. 3619 // 3620 // z^128 === -p(z) (mod (z^128 + p(z))) 3621 // 3622 // so, given that the product we're reducing is 3623 // a == lo + hi * z^128 3624 // substituting, 3625 // === lo - hi * p(z) (mod (z^128 + p(z))) 3626 // 3627 // we reduce by multiplying hi by p(z) and subtracting the result 3628 // from (i.e. XORing it with) lo. Because p has no nonzero high 3629 // bits we can do this with two 64-bit multiplications, lo*p and 3630 // hi*p. 3631 3632 __ pmull2(t0, __ T1Q, hi, p, __ T2D); 3633 __ ext(t1, __ T16B, t0, z, 8); 3634 __ eor(hi, __ T16B, hi, t1); 3635 __ ext(t1, __ T16B, z, t0, 8); 3636 __ eor(lo, __ T16B, lo, t1); 3637 __ pmull(t0, __ T1Q, hi, p, __ T1D); 3638 __ eor(result, __ T16B, lo, t0); 3639 } 3640 3641 address generate_has_negatives(address &has_negatives_long) { 3642 const u1 large_loop_size = 64; 3643 const uint64_t UPPER_BIT_MASK=0x8080808080808080; 3644 int dcache_line = VM_Version::dcache_line_size(); 3645 3646 Register ary1 = r1, len = r2, result = r0; 3647 3648 __ align(CodeEntryAlignment); 3649 3650 StubCodeMark mark(this, "StubRoutines", "has_negatives"); 3651 3652 address entry = __ pc(); 3653 3654 __ enter(); 3655 3656 Label RET_TRUE, RET_TRUE_NO_POP, RET_FALSE, ALIGNED, LOOP16, CHECK_16, DONE, 3657 LARGE_LOOP, POST_LOOP16, LEN_OVER_15, LEN_OVER_8, POST_LOOP16_LOAD_TAIL; 3658 3659 __ cmp(len, (u1)15); 3660 __ br(Assembler::GT, LEN_OVER_15); 3661 // The only case when execution falls into this code is when pointer is near 3662 // the end of memory page and we have to avoid reading next page 3663 __ add(ary1, ary1, len); 3664 __ subs(len, len, 8); 3665 __ br(Assembler::GT, LEN_OVER_8); 3666 __ ldr(rscratch2, Address(ary1, -8)); 3667 __ sub(rscratch1, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes. 
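// Worked example (hypothetical values, illustration only): if the original len
// was 3, then at this point ary1 points one past the last array byte, len == -5
// and rscratch1 == 40. The ldr above fetched the 8 bytes ending at the last
// array byte; on this little-endian load the 5 bytes that precede the array
// occupy the low-order bits, so the lsrv below shifts them out, roughly
//   uint64_t w = *(const uint64_t*)(end - 8) >> (8 * (8 - orig_len));  // C sketch
// leaving only the valid array bytes to be tested against UPPER_BIT_MASK.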
3668 __ lsrv(rscratch2, rscratch2, rscratch1); 3669 __ tst(rscratch2, UPPER_BIT_MASK); 3670 __ cset(result, Assembler::NE); 3671 __ leave(); 3672 __ ret(lr); 3673 __ bind(LEN_OVER_8); 3674 __ ldp(rscratch1, rscratch2, Address(ary1, -16)); 3675 __ sub(len, len, 8); // no data dep., then sub can be executed while loading 3676 __ tst(rscratch2, UPPER_BIT_MASK); 3677 __ br(Assembler::NE, RET_TRUE_NO_POP); 3678 __ sub(rscratch2, zr, len, __ LSL, 3); // LSL 3 is to get bits from bytes 3679 __ lsrv(rscratch1, rscratch1, rscratch2); 3680 __ tst(rscratch1, UPPER_BIT_MASK); 3681 __ cset(result, Assembler::NE); 3682 __ leave(); 3683 __ ret(lr); 3684 3685 Register tmp1 = r3, tmp2 = r4, tmp3 = r5, tmp4 = r6, tmp5 = r7, tmp6 = r10; 3686 const RegSet spilled_regs = RegSet::range(tmp1, tmp5) + tmp6; 3687 3688 has_negatives_long = __ pc(); // 2nd entry point 3689 3690 __ enter(); 3691 3692 __ bind(LEN_OVER_15); 3693 __ push(spilled_regs, sp); 3694 __ andr(rscratch2, ary1, 15); // check pointer for 16-byte alignment 3695 __ cbz(rscratch2, ALIGNED); 3696 __ ldp(tmp6, tmp1, Address(ary1)); 3697 __ mov(tmp5, 16); 3698 __ sub(rscratch1, tmp5, rscratch2); // amount of bytes until aligned address 3699 __ add(ary1, ary1, rscratch1); 3700 __ sub(len, len, rscratch1); 3701 __ orr(tmp6, tmp6, tmp1); 3702 __ tst(tmp6, UPPER_BIT_MASK); 3703 __ br(Assembler::NE, RET_TRUE); 3704 3705 __ bind(ALIGNED); 3706 __ cmp(len, large_loop_size); 3707 __ br(Assembler::LT, CHECK_16); 3708 // Perform 16-byte load as early return in pre-loop to handle situation 3709 // when initially aligned large array has negative values at starting bytes, 3710 // so LARGE_LOOP would do 4 reads instead of 1 (in worst case), which is 3711 // slower. Cases with negative bytes further ahead won't be affected that 3712 // much. In fact, it'll be faster due to early loads, less instructions and 3713 // less branches in LARGE_LOOP. 3714 __ ldp(tmp6, tmp1, Address(__ post(ary1, 16))); 3715 __ sub(len, len, 16); 3716 __ orr(tmp6, tmp6, tmp1); 3717 __ tst(tmp6, UPPER_BIT_MASK); 3718 __ br(Assembler::NE, RET_TRUE); 3719 __ cmp(len, large_loop_size); 3720 __ br(Assembler::LT, CHECK_16); 3721 3722 if (SoftwarePrefetchHintDistance >= 0 3723 && SoftwarePrefetchHintDistance >= dcache_line) { 3724 // initial prefetch 3725 __ prfm(Address(ary1, SoftwarePrefetchHintDistance - dcache_line)); 3726 } 3727 __ bind(LARGE_LOOP); 3728 if (SoftwarePrefetchHintDistance >= 0) { 3729 __ prfm(Address(ary1, SoftwarePrefetchHintDistance)); 3730 } 3731 // Issue load instructions first, since it can save few CPU/MEM cycles, also 3732 // instead of 4 triples of "orr(...), addr(...);cbnz(...);" (for each ldp) 3733 // better generate 7 * orr(...) + 1 andr(...) + 1 cbnz(...) which saves 3 3734 // instructions per cycle and have less branches, but this approach disables 3735 // early return, thus, all 64 bytes are loaded and checked every time. 
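// Sketch of one LARGE_LOOP iteration (illustration only; w0..w7 stand for the
// eight 8-byte words loaded by the four ldp instructions below):
//   uint64_t acc = (w0 | w1) | (w2 | w3) | (w4 | w5) | (w6 | w7);
//   if (acc & UPPER_BIT_MASK) return true;   // some byte has its top bit set
// i.e. a single test of the OR of all 64 bytes replaces four separate checks.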
3736 __ ldp(tmp2, tmp3, Address(ary1)); 3737 __ ldp(tmp4, tmp5, Address(ary1, 16)); 3738 __ ldp(rscratch1, rscratch2, Address(ary1, 32)); 3739 __ ldp(tmp6, tmp1, Address(ary1, 48)); 3740 __ add(ary1, ary1, large_loop_size); 3741 __ sub(len, len, large_loop_size); 3742 __ orr(tmp2, tmp2, tmp3); 3743 __ orr(tmp4, tmp4, tmp5); 3744 __ orr(rscratch1, rscratch1, rscratch2); 3745 __ orr(tmp6, tmp6, tmp1); 3746 __ orr(tmp2, tmp2, tmp4); 3747 __ orr(rscratch1, rscratch1, tmp6); 3748 __ orr(tmp2, tmp2, rscratch1); 3749 __ tst(tmp2, UPPER_BIT_MASK); 3750 __ br(Assembler::NE, RET_TRUE); 3751 __ cmp(len, large_loop_size); 3752 __ br(Assembler::GE, LARGE_LOOP); 3753 3754 __ bind(CHECK_16); // small 16-byte load pre-loop 3755 __ cmp(len, (u1)16); 3756 __ br(Assembler::LT, POST_LOOP16); 3757 3758 __ bind(LOOP16); // small 16-byte load loop 3759 __ ldp(tmp2, tmp3, Address(__ post(ary1, 16))); 3760 __ sub(len, len, 16); 3761 __ orr(tmp2, tmp2, tmp3); 3762 __ tst(tmp2, UPPER_BIT_MASK); 3763 __ br(Assembler::NE, RET_TRUE); 3764 __ cmp(len, (u1)16); 3765 __ br(Assembler::GE, LOOP16); // 16-byte load loop end 3766 3767 __ bind(POST_LOOP16); // 16-byte aligned, so we can read unconditionally 3768 __ cmp(len, (u1)8); 3769 __ br(Assembler::LE, POST_LOOP16_LOAD_TAIL); 3770 __ ldr(tmp3, Address(__ post(ary1, 8))); 3771 __ sub(len, len, 8); 3772 __ tst(tmp3, UPPER_BIT_MASK); 3773 __ br(Assembler::NE, RET_TRUE); 3774 3775 __ bind(POST_LOOP16_LOAD_TAIL); 3776 __ cbz(len, RET_FALSE); // Can't shift left by 64 when len==0 3777 __ ldr(tmp1, Address(ary1)); 3778 __ mov(tmp2, 64); 3779 __ sub(tmp4, tmp2, len, __ LSL, 3); 3780 __ lslv(tmp1, tmp1, tmp4); 3781 __ tst(tmp1, UPPER_BIT_MASK); 3782 __ br(Assembler::NE, RET_TRUE); 3783 // Fallthrough 3784 3785 __ bind(RET_FALSE); 3786 __ pop(spilled_regs, sp); 3787 __ leave(); 3788 __ mov(result, zr); 3789 __ ret(lr); 3790 3791 __ bind(RET_TRUE); 3792 __ pop(spilled_regs, sp); 3793 __ bind(RET_TRUE_NO_POP); 3794 __ leave(); 3795 __ mov(result, 1); 3796 __ ret(lr); 3797 3798 __ bind(DONE); 3799 __ pop(spilled_regs, sp); 3800 __ leave(); 3801 __ ret(lr); 3802 return entry; 3803 } 3804 3805 void generate_large_array_equals_loop_nonsimd(int loopThreshold, 3806 bool usePrefetch, Label &NOT_EQUAL) { 3807 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1, 3808 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11, 3809 tmp7 = r12, tmp8 = r13; 3810 Label LOOP; 3811 3812 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3813 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3814 __ bind(LOOP); 3815 if (usePrefetch) { 3816 __ prfm(Address(a1, SoftwarePrefetchHintDistance)); 3817 __ prfm(Address(a2, SoftwarePrefetchHintDistance)); 3818 } 3819 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 3820 __ eor(tmp1, tmp1, tmp2); 3821 __ eor(tmp3, tmp3, tmp4); 3822 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 3823 __ orr(tmp1, tmp1, tmp3); 3824 __ cbnz(tmp1, NOT_EQUAL); 3825 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3826 __ eor(tmp5, tmp5, tmp6); 3827 __ eor(tmp7, tmp7, tmp8); 3828 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize))); 3829 __ orr(tmp5, tmp5, tmp7); 3830 __ cbnz(tmp5, NOT_EQUAL); 3831 __ ldp(tmp5, tmp7, Address(__ post(a1, 2 * wordSize))); 3832 __ eor(tmp1, tmp1, tmp2); 3833 __ eor(tmp3, tmp3, tmp4); 3834 __ ldp(tmp6, tmp8, Address(__ post(a2, 2 * wordSize))); 3835 __ orr(tmp1, tmp1, tmp3); 3836 __ cbnz(tmp1, NOT_EQUAL); 3837 __ ldp(tmp1, tmp3, Address(__ post(a1, 2 * wordSize))); 3838 __ eor(tmp5, tmp5, tmp6); 
3839 __ sub(cnt1, cnt1, 8 * wordSize);
3840 __ eor(tmp7, tmp7, tmp8);
3841 __ ldp(tmp2, tmp4, Address(__ post(a2, 2 * wordSize)));
3842 // tmp6 is not used. MacroAssembler::subs is used here (rather than
3843 // cmp) because subs allows an unlimited range of immediate operands.
3844 __ subs(tmp6, cnt1, loopThreshold);
3845 __ orr(tmp5, tmp5, tmp7);
3846 __ cbnz(tmp5, NOT_EQUAL);
3847 __ br(__ GE, LOOP);
3848 // post-loop
3849 __ eor(tmp1, tmp1, tmp2);
3850 __ eor(tmp3, tmp3, tmp4);
3851 __ orr(tmp1, tmp1, tmp3);
3852 __ sub(cnt1, cnt1, 2 * wordSize);
3853 __ cbnz(tmp1, NOT_EQUAL);
3854 }
3855
3856 void generate_large_array_equals_loop_simd(int loopThreshold,
3857 bool usePrefetch, Label &NOT_EQUAL) {
3858 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3859 tmp2 = rscratch2;
3860 Label LOOP;
3861
3862 __ bind(LOOP);
3863 if (usePrefetch) {
3864 __ prfm(Address(a1, SoftwarePrefetchHintDistance));
3865 __ prfm(Address(a2, SoftwarePrefetchHintDistance));
3866 }
3867 __ ld1(v0, v1, v2, v3, __ T2D, Address(__ post(a1, 4 * 2 * wordSize)));
3868 __ sub(cnt1, cnt1, 8 * wordSize);
3869 __ ld1(v4, v5, v6, v7, __ T2D, Address(__ post(a2, 4 * 2 * wordSize)));
3870 __ subs(tmp1, cnt1, loopThreshold);
3871 __ eor(v0, __ T16B, v0, v4);
3872 __ eor(v1, __ T16B, v1, v5);
3873 __ eor(v2, __ T16B, v2, v6);
3874 __ eor(v3, __ T16B, v3, v7);
3875 __ orr(v0, __ T16B, v0, v1);
3876 __ orr(v1, __ T16B, v2, v3);
3877 __ orr(v0, __ T16B, v0, v1);
3878 __ umov(tmp1, v0, __ D, 0);
3879 __ umov(tmp2, v0, __ D, 1);
3880 __ orr(tmp1, tmp1, tmp2);
3881 __ cbnz(tmp1, NOT_EQUAL);
3882 __ br(__ GE, LOOP);
3883 }
3884
3885 // a1 = r1 - array1 address
3886 // a2 = r2 - array2 address
3887 // result = r0 - return value. Already contains "false"
3888 // cnt1 = r10 - number of elements left to check, reduced by wordSize
3889 // r3-r5 are reserved temporary registers
3890 address generate_large_array_equals() {
3891 Register a1 = r1, a2 = r2, result = r0, cnt1 = r10, tmp1 = rscratch1,
3892 tmp2 = rscratch2, tmp3 = r3, tmp4 = r4, tmp5 = r5, tmp6 = r11,
3893 tmp7 = r12, tmp8 = r13;
3894 Label TAIL, NOT_EQUAL, EQUAL, NOT_EQUAL_NO_POP, NO_PREFETCH_LARGE_LOOP,
3895 SMALL_LOOP, POST_LOOP;
3896 const int PRE_LOOP_SIZE = UseSIMDForArrayEquals ? 0 : 16;
3897 // only use the prefetching loop if at least 32 of the prefetched bytes will be used
3898 int prefetchLoopThreshold = SoftwarePrefetchHintDistance + 32;
3899 int nonPrefetchLoopThreshold = (64 + PRE_LOOP_SIZE);
3900 RegSet spilled_regs = RegSet::range(tmp6, tmp8);
3901 assert_different_registers(a1, a2, result, cnt1, tmp1, tmp2, tmp3, tmp4,
3902 tmp5, tmp6, tmp7, tmp8);
3903
3904 __ align(CodeEntryAlignment);
3905
3906 StubCodeMark mark(this, "StubRoutines", "large_array_equals");
3907
3908 address entry = __ pc();
3909 __ enter();
3910 __ sub(cnt1, cnt1, wordSize); // first 8 bytes were loaded outside of stub
3911 // also advance pointers to use post-increment instead of pre-increment
3912 __ add(a1, a1, wordSize);
3913 __ add(a2, a2, wordSize);
3914 if (AvoidUnalignedAccesses) {
3915 // Both implementations (SIMD/non-SIMD) use relatively large load
3916 // instructions (ld1/ldp), which carry a huge penalty (up to 2x execution
3917 // time) on some CPUs when the address is not at least 16-byte aligned.
3918 // Arrays are currently 8-byte aligned, so, if needed, compare one extra
3919 // 8-byte word first so that at least the first array's address becomes 16-byte aligned.
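// Sketch of the alignment peel that follows (illustration only; a1/a2 treated
// as uint64_t* here):
//   if ((uintptr_t)a1 & 8) {            // 8-byte but not 16-byte aligned
//     if (*a1++ != *a2++) goto not_equal;
//     cnt -= 8;                         // one fewer word left to compare
//   }
// After this, a1 is 16-byte aligned for the ld1/ldp loops below.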
3920 Label ALIGNED16; 3921 __ tbz(a1, 3, ALIGNED16); 3922 __ ldr(tmp1, Address(__ post(a1, wordSize))); 3923 __ ldr(tmp2, Address(__ post(a2, wordSize))); 3924 __ sub(cnt1, cnt1, wordSize); 3925 __ eor(tmp1, tmp1, tmp2); 3926 __ cbnz(tmp1, NOT_EQUAL_NO_POP); 3927 __ bind(ALIGNED16); 3928 } 3929 if (UseSIMDForArrayEquals) { 3930 if (SoftwarePrefetchHintDistance >= 0) { 3931 __ subs(tmp1, cnt1, prefetchLoopThreshold); 3932 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 3933 generate_large_array_equals_loop_simd(prefetchLoopThreshold, 3934 /* prfm = */ true, NOT_EQUAL); 3935 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 3936 __ br(__ LT, TAIL); 3937 } 3938 __ bind(NO_PREFETCH_LARGE_LOOP); 3939 generate_large_array_equals_loop_simd(nonPrefetchLoopThreshold, 3940 /* prfm = */ false, NOT_EQUAL); 3941 } else { 3942 __ push(spilled_regs, sp); 3943 if (SoftwarePrefetchHintDistance >= 0) { 3944 __ subs(tmp1, cnt1, prefetchLoopThreshold); 3945 __ br(__ LE, NO_PREFETCH_LARGE_LOOP); 3946 generate_large_array_equals_loop_nonsimd(prefetchLoopThreshold, 3947 /* prfm = */ true, NOT_EQUAL); 3948 __ subs(zr, cnt1, nonPrefetchLoopThreshold); 3949 __ br(__ LT, TAIL); 3950 } 3951 __ bind(NO_PREFETCH_LARGE_LOOP); 3952 generate_large_array_equals_loop_nonsimd(nonPrefetchLoopThreshold, 3953 /* prfm = */ false, NOT_EQUAL); 3954 } 3955 __ bind(TAIL); 3956 __ cbz(cnt1, EQUAL); 3957 __ subs(cnt1, cnt1, wordSize); 3958 __ br(__ LE, POST_LOOP); 3959 __ bind(SMALL_LOOP); 3960 __ ldr(tmp1, Address(__ post(a1, wordSize))); 3961 __ ldr(tmp2, Address(__ post(a2, wordSize))); 3962 __ subs(cnt1, cnt1, wordSize); 3963 __ eor(tmp1, tmp1, tmp2); 3964 __ cbnz(tmp1, NOT_EQUAL); 3965 __ br(__ GT, SMALL_LOOP); 3966 __ bind(POST_LOOP); 3967 __ ldr(tmp1, Address(a1, cnt1)); 3968 __ ldr(tmp2, Address(a2, cnt1)); 3969 __ eor(tmp1, tmp1, tmp2); 3970 __ cbnz(tmp1, NOT_EQUAL); 3971 __ bind(EQUAL); 3972 __ mov(result, true); 3973 __ bind(NOT_EQUAL); 3974 if (!UseSIMDForArrayEquals) { 3975 __ pop(spilled_regs, sp); 3976 } 3977 __ bind(NOT_EQUAL_NO_POP); 3978 __ leave(); 3979 __ ret(lr); 3980 return entry; 3981 } 3982 3983 address generate_dsin_dcos(bool isCos) { 3984 __ align(CodeEntryAlignment); 3985 StubCodeMark mark(this, "StubRoutines", isCos ? 
"libmDcos" : "libmDsin"); 3986 address start = __ pc(); 3987 __ generate_dsin_dcos(isCos, (address)StubRoutines::aarch64::_npio2_hw, 3988 (address)StubRoutines::aarch64::_two_over_pi, 3989 (address)StubRoutines::aarch64::_pio2, 3990 (address)StubRoutines::aarch64::_dsin_coef, 3991 (address)StubRoutines::aarch64::_dcos_coef); 3992 return start; 3993 } 3994 3995 address generate_dlog() { 3996 __ align(CodeEntryAlignment); 3997 StubCodeMark mark(this, "StubRoutines", "dlog"); 3998 address entry = __ pc(); 3999 FloatRegister vtmp0 = v0, vtmp1 = v1, vtmp2 = v2, vtmp3 = v3, vtmp4 = v4, 4000 vtmp5 = v5, tmpC1 = v16, tmpC2 = v17, tmpC3 = v18, tmpC4 = v19; 4001 Register tmp1 = r0, tmp2 = r1, tmp3 = r2, tmp4 = r3, tmp5 = r4; 4002 __ fast_log(vtmp0, vtmp1, vtmp2, vtmp3, vtmp4, vtmp5, tmpC1, tmpC2, tmpC3, 4003 tmpC4, tmp1, tmp2, tmp3, tmp4, tmp5); 4004 return entry; 4005 } 4006 4007 // code for comparing 16 bytes of strings with same encoding 4008 void compare_string_16_bytes_same(Label &DIFF1, Label &DIFF2) { 4009 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, tmp1 = r10, tmp2 = r11; 4010 __ ldr(rscratch1, Address(__ post(str1, 8))); 4011 __ eor(rscratch2, tmp1, tmp2); 4012 __ ldr(cnt1, Address(__ post(str2, 8))); 4013 __ cbnz(rscratch2, DIFF1); 4014 __ ldr(tmp1, Address(__ post(str1, 8))); 4015 __ eor(rscratch2, rscratch1, cnt1); 4016 __ ldr(tmp2, Address(__ post(str2, 8))); 4017 __ cbnz(rscratch2, DIFF2); 4018 } 4019 4020 // code for comparing 16 characters of strings with Latin1 and Utf16 encoding 4021 void compare_string_16_x_LU(Register tmpL, Register tmpU, Label &DIFF1, 4022 Label &DIFF2) { 4023 Register cnt1 = r2, tmp1 = r10, tmp2 = r11, tmp3 = r12; 4024 FloatRegister vtmp = v1, vtmpZ = v0, vtmp3 = v2; 4025 4026 __ ldrq(vtmp, Address(__ post(tmp2, 16))); 4027 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4028 __ zip1(vtmp3, __ T16B, vtmp, vtmpZ); 4029 // now we have 32 bytes of characters (converted to U) in vtmp:vtmp3 4030 4031 __ fmovd(tmpL, vtmp3); 4032 __ eor(rscratch2, tmp3, tmpL); 4033 __ cbnz(rscratch2, DIFF2); 4034 4035 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4036 __ umov(tmpL, vtmp3, __ D, 1); 4037 __ eor(rscratch2, tmpU, tmpL); 4038 __ cbnz(rscratch2, DIFF1); 4039 4040 __ zip2(vtmp, __ T16B, vtmp, vtmpZ); 4041 __ ldr(tmpU, Address(__ post(cnt1, 8))); 4042 __ fmovd(tmpL, vtmp); 4043 __ eor(rscratch2, tmp3, tmpL); 4044 __ cbnz(rscratch2, DIFF2); 4045 4046 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4047 __ umov(tmpL, vtmp, __ D, 1); 4048 __ eor(rscratch2, tmpU, tmpL); 4049 __ cbnz(rscratch2, DIFF1); 4050 } 4051 4052 // r0 = result 4053 // r1 = str1 4054 // r2 = cnt1 4055 // r3 = str2 4056 // r4 = cnt2 4057 // r10 = tmp1 4058 // r11 = tmp2 4059 address generate_compare_long_string_different_encoding(bool isLU) { 4060 __ align(CodeEntryAlignment); 4061 StubCodeMark mark(this, "StubRoutines", isLU 4062 ? 
"compare_long_string_different_encoding LU" 4063 : "compare_long_string_different_encoding UL"); 4064 address entry = __ pc(); 4065 Label SMALL_LOOP, TAIL, TAIL_LOAD_16, LOAD_LAST, DIFF1, DIFF2, 4066 DONE, CALCULATE_DIFFERENCE, LARGE_LOOP_PREFETCH, NO_PREFETCH, 4067 LARGE_LOOP_PREFETCH_REPEAT1, LARGE_LOOP_PREFETCH_REPEAT2; 4068 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 4069 tmp1 = r10, tmp2 = r11, tmp3 = r12, tmp4 = r14; 4070 FloatRegister vtmpZ = v0, vtmp = v1, vtmp3 = v2; 4071 RegSet spilled_regs = RegSet::of(tmp3, tmp4); 4072 4073 int prefetchLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance/2); 4074 4075 __ eor(vtmpZ, __ T16B, vtmpZ, vtmpZ); 4076 // cnt2 == amount of characters left to compare 4077 // Check already loaded first 4 symbols(vtmp and tmp2(LU)/tmp1(UL)) 4078 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 4079 __ add(str1, str1, isLU ? wordSize/2 : wordSize); 4080 __ add(str2, str2, isLU ? wordSize : wordSize/2); 4081 __ fmovd(isLU ? tmp1 : tmp2, vtmp); 4082 __ subw(cnt2, cnt2, 8); // Already loaded 4 symbols. Last 4 is special case. 4083 __ add(str1, str1, cnt2, __ LSL, isLU ? 0 : 1); 4084 __ eor(rscratch2, tmp1, tmp2); 4085 __ add(str2, str2, cnt2, __ LSL, isLU ? 1 : 0); 4086 __ mov(rscratch1, tmp2); 4087 __ cbnz(rscratch2, CALCULATE_DIFFERENCE); 4088 Register strU = isLU ? str2 : str1, 4089 strL = isLU ? str1 : str2, 4090 tmpU = isLU ? rscratch1 : tmp1, // where to keep U for comparison 4091 tmpL = isLU ? tmp1 : rscratch1; // where to keep L for comparison 4092 __ push(spilled_regs, sp); 4093 __ sub(tmp2, strL, cnt2); // strL pointer to load from 4094 __ sub(cnt1, strU, cnt2, __ LSL, 1); // strU pointer to load from 4095 4096 __ ldr(tmp3, Address(__ post(cnt1, 8))); 4097 4098 if (SoftwarePrefetchHintDistance >= 0) { 4099 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 4100 __ br(__ LT, NO_PREFETCH); 4101 __ bind(LARGE_LOOP_PREFETCH); 4102 __ prfm(Address(tmp2, SoftwarePrefetchHintDistance)); 4103 __ mov(tmp4, 2); 4104 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 4105 __ bind(LARGE_LOOP_PREFETCH_REPEAT1); 4106 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4107 __ subs(tmp4, tmp4, 1); 4108 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT1); 4109 __ prfm(Address(cnt1, SoftwarePrefetchHintDistance)); 4110 __ mov(tmp4, 2); 4111 __ bind(LARGE_LOOP_PREFETCH_REPEAT2); 4112 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4113 __ subs(tmp4, tmp4, 1); 4114 __ br(__ GT, LARGE_LOOP_PREFETCH_REPEAT2); 4115 __ sub(cnt2, cnt2, 64); 4116 __ subs(rscratch2, cnt2, prefetchLoopExitCondition); 4117 __ br(__ GE, LARGE_LOOP_PREFETCH); 4118 } 4119 __ cbz(cnt2, LOAD_LAST); // no characters left except last load 4120 __ bind(NO_PREFETCH); 4121 __ subs(cnt2, cnt2, 16); 4122 __ br(__ LT, TAIL); 4123 __ bind(SMALL_LOOP); // smaller loop 4124 __ subs(cnt2, cnt2, 16); 4125 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); 4126 __ br(__ GE, SMALL_LOOP); 4127 __ cmn(cnt2, (u1)16); 4128 __ br(__ EQ, LOAD_LAST); 4129 __ bind(TAIL); // 1..15 characters left until last load (last 4 characters) 4130 __ add(cnt1, cnt1, cnt2, __ LSL, 1); // Address of 8 bytes before last 4 characters in UTF-16 string 4131 __ add(tmp2, tmp2, cnt2); // Address of 16 bytes before last 4 characters in Latin1 string 4132 __ ldr(tmp3, Address(cnt1, -8)); 4133 compare_string_16_x_LU(tmpL, tmpU, DIFF1, DIFF2); // last 16 characters before last load 4134 __ b(LOAD_LAST); 4135 __ bind(DIFF2); 4136 __ mov(tmpU, tmp3); 4137 __ bind(DIFF1); 4138 __ pop(spilled_regs, sp); 4139 __ b(CALCULATE_DIFFERENCE); 4140 
__ bind(LOAD_LAST); 4141 // Last 4 UTF-16 characters are already pre-loaded into tmp3 by compare_string_16_x_LU. 4142 // No need to load it again 4143 __ mov(tmpU, tmp3); 4144 __ pop(spilled_regs, sp); 4145 4146 __ ldrs(vtmp, Address(strL)); 4147 __ zip1(vtmp, __ T8B, vtmp, vtmpZ); 4148 __ fmovd(tmpL, vtmp); 4149 4150 __ eor(rscratch2, tmpU, tmpL); 4151 __ cbz(rscratch2, DONE); 4152 4153 // Find the first different characters in the longwords and 4154 // compute their difference. 4155 __ bind(CALCULATE_DIFFERENCE); 4156 __ rev(rscratch2, rscratch2); 4157 __ clz(rscratch2, rscratch2); 4158 __ andr(rscratch2, rscratch2, -16); 4159 __ lsrv(tmp1, tmp1, rscratch2); 4160 __ uxthw(tmp1, tmp1); 4161 __ lsrv(rscratch1, rscratch1, rscratch2); 4162 __ uxthw(rscratch1, rscratch1); 4163 __ subw(result, tmp1, rscratch1); 4164 __ bind(DONE); 4165 __ ret(lr); 4166 return entry; 4167 } 4168 4169 // r0 = result 4170 // r1 = str1 4171 // r2 = cnt1 4172 // r3 = str2 4173 // r4 = cnt2 4174 // r10 = tmp1 4175 // r11 = tmp2 4176 address generate_compare_long_string_same_encoding(bool isLL) { 4177 __ align(CodeEntryAlignment); 4178 StubCodeMark mark(this, "StubRoutines", isLL 4179 ? "compare_long_string_same_encoding LL" 4180 : "compare_long_string_same_encoding UU"); 4181 address entry = __ pc(); 4182 Register result = r0, str1 = r1, cnt1 = r2, str2 = r3, cnt2 = r4, 4183 tmp1 = r10, tmp2 = r11; 4184 Label SMALL_LOOP, LARGE_LOOP_PREFETCH, CHECK_LAST, DIFF2, TAIL, 4185 LENGTH_DIFF, DIFF, LAST_CHECK_AND_LENGTH_DIFF, 4186 DIFF_LAST_POSITION, DIFF_LAST_POSITION2; 4187 // exit from large loop when less than 64 bytes left to read or we're about 4188 // to prefetch memory behind array border 4189 int largeLoopExitCondition = MAX(64, SoftwarePrefetchHintDistance)/(isLL ? 1 : 2); 4190 // cnt1/cnt2 contains amount of characters to compare. cnt1 can be re-used 4191 // update cnt2 counter with already loaded 8 bytes 4192 __ sub(cnt2, cnt2, wordSize/(isLL ? 1 : 2)); 4193 // update pointers, because of previous read 4194 __ add(str1, str1, wordSize); 4195 __ add(str2, str2, wordSize); 4196 if (SoftwarePrefetchHintDistance >= 0) { 4197 __ bind(LARGE_LOOP_PREFETCH); 4198 __ prfm(Address(str1, SoftwarePrefetchHintDistance)); 4199 __ prfm(Address(str2, SoftwarePrefetchHintDistance)); 4200 compare_string_16_bytes_same(DIFF, DIFF2); 4201 compare_string_16_bytes_same(DIFF, DIFF2); 4202 __ sub(cnt2, cnt2, isLL ? 64 : 32); 4203 compare_string_16_bytes_same(DIFF, DIFF2); 4204 __ subs(rscratch2, cnt2, largeLoopExitCondition); 4205 compare_string_16_bytes_same(DIFF, DIFF2); 4206 __ br(__ GT, LARGE_LOOP_PREFETCH); 4207 __ cbz(cnt2, LAST_CHECK_AND_LENGTH_DIFF); // no more chars left? 4208 } 4209 // less than 16 bytes left? 4210 __ subs(cnt2, cnt2, isLL ? 16 : 8); 4211 __ br(__ LT, TAIL); 4212 __ bind(SMALL_LOOP); 4213 compare_string_16_bytes_same(DIFF, DIFF2); 4214 __ subs(cnt2, cnt2, isLL ? 16 : 8); 4215 __ br(__ GE, SMALL_LOOP); 4216 __ bind(TAIL); 4217 __ adds(cnt2, cnt2, isLL ? 16 : 8); 4218 __ br(__ EQ, LAST_CHECK_AND_LENGTH_DIFF); 4219 __ subs(cnt2, cnt2, isLL ? 8 : 4); 4220 __ br(__ LE, CHECK_LAST); 4221 __ eor(rscratch2, tmp1, tmp2); 4222 __ cbnz(rscratch2, DIFF); 4223 __ ldr(tmp1, Address(__ post(str1, 8))); 4224 __ ldr(tmp2, Address(__ post(str2, 8))); 4225 __ sub(cnt2, cnt2, isLL ? 
8 : 4); 4226 __ bind(CHECK_LAST); 4227 if (!isLL) { 4228 __ add(cnt2, cnt2, cnt2); // now in bytes 4229 } 4230 __ eor(rscratch2, tmp1, tmp2); 4231 __ cbnz(rscratch2, DIFF); 4232 __ ldr(rscratch1, Address(str1, cnt2)); 4233 __ ldr(cnt1, Address(str2, cnt2)); 4234 __ eor(rscratch2, rscratch1, cnt1); 4235 __ cbz(rscratch2, LENGTH_DIFF); 4236 // Find the first different characters in the longwords and 4237 // compute their difference. 4238 __ bind(DIFF2); 4239 __ rev(rscratch2, rscratch2); 4240 __ clz(rscratch2, rscratch2); 4241 __ andr(rscratch2, rscratch2, isLL ? -8 : -16); 4242 __ lsrv(rscratch1, rscratch1, rscratch2); 4243 if (isLL) { 4244 __ lsrv(cnt1, cnt1, rscratch2); 4245 __ uxtbw(rscratch1, rscratch1); 4246 __ uxtbw(cnt1, cnt1); 4247 } else { 4248 __ lsrv(cnt1, cnt1, rscratch2); 4249 __ uxthw(rscratch1, rscratch1); 4250 __ uxthw(cnt1, cnt1); 4251 } 4252 __ subw(result, rscratch1, cnt1); 4253 __ b(LENGTH_DIFF); 4254 __ bind(DIFF); 4255 __ rev(rscratch2, rscratch2); 4256 __ clz(rscratch2, rscratch2); 4257 __ andr(rscratch2, rscratch2, isLL ? -8 : -16); 4258 __ lsrv(tmp1, tmp1, rscratch2); 4259 if (isLL) { 4260 __ lsrv(tmp2, tmp2, rscratch2); 4261 __ uxtbw(tmp1, tmp1); 4262 __ uxtbw(tmp2, tmp2); 4263 } else { 4264 __ lsrv(tmp2, tmp2, rscratch2); 4265 __ uxthw(tmp1, tmp1); 4266 __ uxthw(tmp2, tmp2); 4267 } 4268 __ subw(result, tmp1, tmp2); 4269 __ b(LENGTH_DIFF); 4270 __ bind(LAST_CHECK_AND_LENGTH_DIFF); 4271 __ eor(rscratch2, tmp1, tmp2); 4272 __ cbnz(rscratch2, DIFF); 4273 __ bind(LENGTH_DIFF); 4274 __ ret(lr); 4275 return entry; 4276 } 4277 4278 void generate_compare_long_strings() { 4279 StubRoutines::aarch64::_compare_long_string_LL 4280 = generate_compare_long_string_same_encoding(true); 4281 StubRoutines::aarch64::_compare_long_string_UU 4282 = generate_compare_long_string_same_encoding(false); 4283 StubRoutines::aarch64::_compare_long_string_LU 4284 = generate_compare_long_string_different_encoding(true); 4285 StubRoutines::aarch64::_compare_long_string_UL 4286 = generate_compare_long_string_different_encoding(false); 4287 } 4288 4289 // R0 = result 4290 // R1 = str2 4291 // R2 = cnt1 4292 // R3 = str1 4293 // R4 = cnt2 4294 // This generic linear code use few additional ideas, which makes it faster: 4295 // 1) we can safely keep at least 1st register of pattern(since length >= 8) 4296 // in order to skip initial loading(help in systems with 1 ld pipeline) 4297 // 2) we can use "fast" algorithm of finding single character to search for 4298 // first symbol with less branches(1 branch per each loaded register instead 4299 // of branch for each symbol), so, this is where constants like 4300 // 0x0101...01, 0x00010001...0001, 0x7f7f...7f, 0x7fff7fff...7fff comes from 4301 // 3) after loading and analyzing 1st register of source string, it can be 4302 // used to search for every 1st character entry, saving few loads in 4303 // comparison with "simplier-but-slower" implementation 4304 // 4) in order to avoid lots of push/pop operations, code below is heavily 4305 // re-using/re-initializing/compressing register values, which makes code 4306 // larger and a bit less readable, however, most of extra operations are 4307 // issued during loads or branches, so, penalty is minimal 4308 address generate_string_indexof_linear(bool str1_isL, bool str2_isL) { 4309 const char* stubName = str1_isL 4310 ? (str2_isL ? 
"indexof_linear_ll" : "indexof_linear_ul") 4311 : "indexof_linear_uu"; 4312 __ align(CodeEntryAlignment); 4313 StubCodeMark mark(this, "StubRoutines", stubName); 4314 address entry = __ pc(); 4315 4316 int str1_chr_size = str1_isL ? 1 : 2; 4317 int str2_chr_size = str2_isL ? 1 : 2; 4318 int str1_chr_shift = str1_isL ? 0 : 1; 4319 int str2_chr_shift = str2_isL ? 0 : 1; 4320 bool isL = str1_isL && str2_isL; 4321 // parameters 4322 Register result = r0, str2 = r1, cnt1 = r2, str1 = r3, cnt2 = r4; 4323 // temporary registers 4324 Register tmp1 = r20, tmp2 = r21, tmp3 = r22, tmp4 = r23; 4325 RegSet spilled_regs = RegSet::range(tmp1, tmp4); 4326 // redefinitions 4327 Register ch1 = rscratch1, ch2 = rscratch2, first = tmp3; 4328 4329 __ push(spilled_regs, sp); 4330 Label L_LOOP, L_LOOP_PROCEED, L_SMALL, L_HAS_ZERO, 4331 L_HAS_ZERO_LOOP, L_CMP_LOOP, L_CMP_LOOP_NOMATCH, L_SMALL_PROCEED, 4332 L_SMALL_HAS_ZERO_LOOP, L_SMALL_CMP_LOOP_NOMATCH, L_SMALL_CMP_LOOP, 4333 L_POST_LOOP, L_CMP_LOOP_LAST_CMP, L_HAS_ZERO_LOOP_NOMATCH, 4334 L_SMALL_CMP_LOOP_LAST_CMP, L_SMALL_CMP_LOOP_LAST_CMP2, 4335 L_CMP_LOOP_LAST_CMP2, DONE, NOMATCH; 4336 // Read whole register from str1. It is safe, because length >=8 here 4337 __ ldr(ch1, Address(str1)); 4338 // Read whole register from str2. It is safe, because length >=8 here 4339 __ ldr(ch2, Address(str2)); 4340 __ sub(cnt2, cnt2, cnt1); 4341 __ andr(first, ch1, str1_isL ? 0xFF : 0xFFFF); 4342 if (str1_isL != str2_isL) { 4343 __ eor(v0, __ T16B, v0, v0); 4344 } 4345 __ mov(tmp1, str2_isL ? 0x0101010101010101 : 0x0001000100010001); 4346 __ mul(first, first, tmp1); 4347 // check if we have less than 1 register to check 4348 __ subs(cnt2, cnt2, wordSize/str2_chr_size - 1); 4349 if (str1_isL != str2_isL) { 4350 __ fmovd(v1, ch1); 4351 } 4352 __ br(__ LE, L_SMALL); 4353 __ eor(ch2, first, ch2); 4354 if (str1_isL != str2_isL) { 4355 __ zip1(v1, __ T16B, v1, v0); 4356 } 4357 __ sub(tmp2, ch2, tmp1); 4358 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4359 __ bics(tmp2, tmp2, ch2); 4360 if (str1_isL != str2_isL) { 4361 __ fmovd(ch1, v1); 4362 } 4363 __ br(__ NE, L_HAS_ZERO); 4364 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 4365 __ add(result, result, wordSize/str2_chr_size); 4366 __ add(str2, str2, wordSize); 4367 __ br(__ LT, L_POST_LOOP); 4368 __ BIND(L_LOOP); 4369 __ ldr(ch2, Address(str2)); 4370 __ eor(ch2, first, ch2); 4371 __ sub(tmp2, ch2, tmp1); 4372 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4373 __ bics(tmp2, tmp2, ch2); 4374 __ br(__ NE, L_HAS_ZERO); 4375 __ BIND(L_LOOP_PROCEED); 4376 __ subs(cnt2, cnt2, wordSize/str2_chr_size); 4377 __ add(str2, str2, wordSize); 4378 __ add(result, result, wordSize/str2_chr_size); 4379 __ br(__ GE, L_LOOP); 4380 __ BIND(L_POST_LOOP); 4381 __ subs(zr, cnt2, -wordSize/str2_chr_size); // no extra characters to check 4382 __ br(__ LE, NOMATCH); 4383 __ ldr(ch2, Address(str2)); 4384 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 4385 __ eor(ch2, first, ch2); 4386 __ sub(tmp2, ch2, tmp1); 4387 __ orr(ch2, ch2, str2_isL ? 0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4388 __ mov(tmp4, -1); // all bits set 4389 __ b(L_SMALL_PROCEED); 4390 __ align(OptoLoopAlignment); 4391 __ BIND(L_SMALL); 4392 __ sub(cnt2, zr, cnt2, __ LSL, LogBitsPerByte + str2_chr_shift); 4393 __ eor(ch2, first, ch2); 4394 if (str1_isL != str2_isL) { 4395 __ zip1(v1, __ T16B, v1, v0); 4396 } 4397 __ sub(tmp2, ch2, tmp1); 4398 __ mov(tmp4, -1); // all bits set 4399 __ orr(ch2, ch2, str2_isL ? 
0x7f7f7f7f7f7f7f7f : 0x7fff7fff7fff7fff); 4400 if (str1_isL != str2_isL) { 4401 __ fmovd(ch1, v1); // move converted 4 symbols 4402 } 4403 __ BIND(L_SMALL_PROCEED); 4404 __ lsrv(tmp4, tmp4, cnt2); // mask. zeroes on useless bits. 4405 __ bic(tmp2, tmp2, ch2); 4406 __ ands(tmp2, tmp2, tmp4); // clear useless bits and check 4407 __ rbit(tmp2, tmp2); 4408 __ br(__ EQ, NOMATCH); 4409 __ BIND(L_SMALL_HAS_ZERO_LOOP); 4410 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some cpu's 4411 __ cmp(cnt1, u1(wordSize/str2_chr_size)); 4412 __ br(__ LE, L_SMALL_CMP_LOOP_LAST_CMP2); 4413 if (str2_isL) { // LL 4414 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 4415 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 4416 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 4417 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 4418 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4419 } else { 4420 __ mov(ch2, 0xE); // all bits in byte set except last one 4421 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4422 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4423 __ lslv(tmp2, tmp2, tmp4); 4424 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4425 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4426 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4427 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4428 } 4429 __ cmp(ch1, ch2); 4430 __ mov(tmp4, wordSize/str2_chr_size); 4431 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4432 __ BIND(L_SMALL_CMP_LOOP); 4433 str1_isL ? __ ldrb(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 4434 : __ ldrh(first, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 4435 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 4436 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 4437 __ add(tmp4, tmp4, 1); 4438 __ cmp(tmp4, cnt1); 4439 __ br(__ GE, L_SMALL_CMP_LOOP_LAST_CMP); 4440 __ cmp(first, ch2); 4441 __ br(__ EQ, L_SMALL_CMP_LOOP); 4442 __ BIND(L_SMALL_CMP_LOOP_NOMATCH); 4443 __ cbz(tmp2, NOMATCH); // no more matches. exit 4444 __ clz(tmp4, tmp2); 4445 __ add(result, result, 1); // advance index 4446 __ add(str2, str2, str2_chr_size); // advance pointer 4447 __ b(L_SMALL_HAS_ZERO_LOOP); 4448 __ align(OptoLoopAlignment); 4449 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP); 4450 __ cmp(first, ch2); 4451 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4452 __ b(DONE); 4453 __ align(OptoLoopAlignment); 4454 __ BIND(L_SMALL_CMP_LOOP_LAST_CMP2); 4455 if (str2_isL) { // LL 4456 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); // address of "index" 4457 __ ldr(ch2, Address(str2)); // read whole register of str2. Safe. 4458 __ lslv(tmp2, tmp2, tmp4); // shift off leading zeroes from match info 4459 __ add(result, result, tmp4, __ LSR, LogBitsPerByte); 4460 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4461 } else { 4462 __ mov(ch2, 0xE); // all bits in byte set except last one 4463 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4464 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
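// Note on the 0xE mask above (illustration only): tmp4 holds the bit position
// of the first candidate match, so tmp4 >> LogBitsPerByte is its byte offset
// within the loaded word; and-ing with 0xE rounds that offset down to an even
// value, i.e. to a whole UTF-16 character, before str2 is re-read from there.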
4465 __ lslv(tmp2, tmp2, tmp4); 4466 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4467 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4468 __ lsl(tmp2, tmp2, 1); // shift off leading "1" from match info 4469 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4470 } 4471 __ cmp(ch1, ch2); 4472 __ br(__ NE, L_SMALL_CMP_LOOP_NOMATCH); 4473 __ b(DONE); 4474 __ align(OptoLoopAlignment); 4475 __ BIND(L_HAS_ZERO); 4476 __ rbit(tmp2, tmp2); 4477 __ clz(tmp4, tmp2); // potentially long. Up to 4 cycles on some CPU's 4478 // Now, perform compression of counters(cnt2 and cnt1) into one register. 4479 // It's fine because both counters are 32bit and are not changed in this 4480 // loop. Just restore it on exit. So, cnt1 can be re-used in this loop. 4481 __ orr(cnt2, cnt2, cnt1, __ LSL, BitsPerByte * wordSize / 2); 4482 __ sub(result, result, 1); 4483 __ BIND(L_HAS_ZERO_LOOP); 4484 __ mov(cnt1, wordSize/str2_chr_size); 4485 __ cmp(cnt1, cnt2, __ LSR, BitsPerByte * wordSize / 2); 4486 __ br(__ GE, L_CMP_LOOP_LAST_CMP2); // case of 8 bytes only to compare 4487 if (str2_isL) { 4488 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 4489 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4490 __ lslv(tmp2, tmp2, tmp4); 4491 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4492 __ add(tmp4, tmp4, 1); 4493 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4494 __ lsl(tmp2, tmp2, 1); 4495 __ mov(tmp4, wordSize/str2_chr_size); 4496 } else { 4497 __ mov(ch2, 0xE); 4498 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4499 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4500 __ lslv(tmp2, tmp2, tmp4); 4501 __ add(tmp4, tmp4, 1); 4502 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4503 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 4504 __ lsl(tmp2, tmp2, 1); 4505 __ mov(tmp4, wordSize/str2_chr_size); 4506 __ sub(str2, str2, str2_chr_size); 4507 } 4508 __ cmp(ch1, ch2); 4509 __ mov(tmp4, wordSize/str2_chr_size); 4510 __ br(__ NE, L_CMP_LOOP_NOMATCH); 4511 __ BIND(L_CMP_LOOP); 4512 str1_isL ? __ ldrb(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))) 4513 : __ ldrh(cnt1, Address(str1, tmp4, Address::lsl(str1_chr_shift))); 4514 str2_isL ? __ ldrb(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))) 4515 : __ ldrh(ch2, Address(str2, tmp4, Address::lsl(str2_chr_shift))); 4516 __ add(tmp4, tmp4, 1); 4517 __ cmp(tmp4, cnt2, __ LSR, BitsPerByte * wordSize / 2); 4518 __ br(__ GE, L_CMP_LOOP_LAST_CMP); 4519 __ cmp(cnt1, ch2); 4520 __ br(__ EQ, L_CMP_LOOP); 4521 __ BIND(L_CMP_LOOP_NOMATCH); 4522 // here we're not matched 4523 __ cbz(tmp2, L_HAS_ZERO_LOOP_NOMATCH); // no more matches. Proceed to main loop 4524 __ clz(tmp4, tmp2); 4525 __ add(str2, str2, str2_chr_size); // advance pointer 4526 __ b(L_HAS_ZERO_LOOP); 4527 __ align(OptoLoopAlignment); 4528 __ BIND(L_CMP_LOOP_LAST_CMP); 4529 __ cmp(cnt1, ch2); 4530 __ br(__ NE, L_CMP_LOOP_NOMATCH); 4531 __ b(DONE); 4532 __ align(OptoLoopAlignment); 4533 __ BIND(L_CMP_LOOP_LAST_CMP2); 4534 if (str2_isL) { 4535 __ lsr(ch2, tmp4, LogBitsPerByte + str2_chr_shift); // char index 4536 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 
4537 __ lslv(tmp2, tmp2, tmp4); 4538 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4539 __ add(tmp4, tmp4, 1); 4540 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4541 __ lsl(tmp2, tmp2, 1); 4542 } else { 4543 __ mov(ch2, 0xE); 4544 __ andr(ch2, ch2, tmp4, __ LSR, LogBitsPerByte); // byte shift amount 4545 __ ldr(ch2, Address(str2, ch2)); // read whole register of str2. Safe. 4546 __ lslv(tmp2, tmp2, tmp4); 4547 __ add(tmp4, tmp4, 1); 4548 __ add(result, result, tmp4, __ LSR, LogBitsPerByte + str2_chr_shift); 4549 __ add(str2, str2, tmp4, __ LSR, LogBitsPerByte); 4550 __ lsl(tmp2, tmp2, 1); 4551 __ sub(str2, str2, str2_chr_size); 4552 } 4553 __ cmp(ch1, ch2); 4554 __ br(__ NE, L_CMP_LOOP_NOMATCH); 4555 __ b(DONE); 4556 __ align(OptoLoopAlignment); 4557 __ BIND(L_HAS_ZERO_LOOP_NOMATCH); 4558 // 1) Restore "result" index. Index was wordSize/str2_chr_size * N until 4559 // L_HAS_ZERO block. Byte octet was analyzed in L_HAS_ZERO_LOOP, 4560 // so, result was increased at max by wordSize/str2_chr_size - 1, so, 4561 // respective high bit wasn't changed. L_LOOP_PROCEED will increase 4562 // result by analyzed characters value, so, we can just reset lower bits 4563 // in result here. Clear 2 lower bits for UU/UL and 3 bits for LL 4564 // 2) restore cnt1 and cnt2 values from "compressed" cnt2 4565 // 3) advance str2 value to represent next str2 octet. result & 7/3 is 4566 // index of last analyzed substring inside current octet. So, str2 in at 4567 // respective start address. We need to advance it to next octet 4568 __ andr(tmp2, result, wordSize/str2_chr_size - 1); // symbols analyzed 4569 __ lsr(cnt1, cnt2, BitsPerByte * wordSize / 2); 4570 __ bfm(result, zr, 0, 2 - str2_chr_shift); 4571 __ sub(str2, str2, tmp2, __ LSL, str2_chr_shift); // restore str2 4572 __ movw(cnt2, cnt2); 4573 __ b(L_LOOP_PROCEED); 4574 __ align(OptoLoopAlignment); 4575 __ BIND(NOMATCH); 4576 __ mov(result, -1); 4577 __ BIND(DONE); 4578 __ pop(spilled_regs, sp); 4579 __ ret(lr); 4580 return entry; 4581 } 4582 4583 void generate_string_indexof_stubs() { 4584 StubRoutines::aarch64::_string_indexof_linear_ll = generate_string_indexof_linear(true, true); 4585 StubRoutines::aarch64::_string_indexof_linear_uu = generate_string_indexof_linear(false, false); 4586 StubRoutines::aarch64::_string_indexof_linear_ul = generate_string_indexof_linear(true, false); 4587 } 4588 4589 void inflate_and_store_2_fp_registers(bool generatePrfm, 4590 FloatRegister src1, FloatRegister src2) { 4591 Register dst = r1; 4592 __ zip1(v1, __ T16B, src1, v0); 4593 __ zip2(v2, __ T16B, src1, v0); 4594 if (generatePrfm) { 4595 __ prfm(Address(dst, SoftwarePrefetchHintDistance), PSTL1STRM); 4596 } 4597 __ zip1(v3, __ T16B, src2, v0); 4598 __ zip2(v4, __ T16B, src2, v0); 4599 __ st1(v1, v2, v3, v4, __ T16B, Address(__ post(dst, 64))); 4600 } 4601 4602 // R0 = src 4603 // R1 = dst 4604 // R2 = len 4605 // R3 = len >> 3 4606 // V0 = 0 4607 // v1 = loaded 8 bytes 4608 address generate_large_byte_array_inflate() { 4609 __ align(CodeEntryAlignment); 4610 StubCodeMark mark(this, "StubRoutines", "large_byte_array_inflate"); 4611 address entry = __ pc(); 4612 Label LOOP, LOOP_START, LOOP_PRFM, LOOP_PRFM_START, DONE; 4613 Register src = r0, dst = r1, len = r2, octetCounter = r3; 4614 const int large_loop_threshold = MAX(64, SoftwarePrefetchHintDistance)/8 + 4; 4615 4616 // do one more 8-byte read to have address 16-byte aligned in most cases 4617 // also use single store instruction 4618 __ ldrd(v2, __ post(src, 8)); 4619 
__ sub(octetCounter, octetCounter, 2);
4620 __ zip1(v1, __ T16B, v1, v0);
4621 __ zip1(v2, __ T16B, v2, v0);
4622 __ st1(v1, v2, __ T16B, __ post(dst, 32));
4623 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4624 __ subs(rscratch1, octetCounter, large_loop_threshold);
4625 __ br(__ LE, LOOP_START);
4626 __ b(LOOP_PRFM_START);
4627 __ bind(LOOP_PRFM);
4628 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4629 __ bind(LOOP_PRFM_START);
4630 __ prfm(Address(src, SoftwarePrefetchHintDistance));
4631 __ sub(octetCounter, octetCounter, 8);
4632 __ subs(rscratch1, octetCounter, large_loop_threshold);
4633 inflate_and_store_2_fp_registers(true, v3, v4);
4634 inflate_and_store_2_fp_registers(true, v5, v6);
4635 __ br(__ GT, LOOP_PRFM);
4636 __ cmp(octetCounter, (u1)8);
4637 __ br(__ LT, DONE);
4638 __ bind(LOOP);
4639 __ ld1(v3, v4, v5, v6, __ T16B, Address(__ post(src, 64)));
4640 __ bind(LOOP_START);
4641 __ sub(octetCounter, octetCounter, 8);
4642 __ cmp(octetCounter, (u1)8);
4643 inflate_and_store_2_fp_registers(false, v3, v4);
4644 inflate_and_store_2_fp_registers(false, v5, v6);
4645 __ br(__ GE, LOOP);
4646 __ bind(DONE);
4647 __ ret(lr);
4648 return entry;
4649 }
4650
4651 /**
4652 * Arguments:
4653 *
4654 * Input:
4655 * c_rarg0 - current state address
4656 * c_rarg1 - H key address
4657 * c_rarg2 - data address
4658 * c_rarg3 - number of blocks
4659 *
4660 * Output:
4661 * Updated state at c_rarg0
4662 */
4663 address generate_ghash_processBlocks() {
4664 // Bafflingly, GCM uses little-endian for the byte order, but
4665 // big-endian for the bit order. For example, the polynomial 1 is
4666 // represented as the 16-byte string 80 00 00 00 | 12 bytes of 00.
4667 //
4668 // So, we must either reverse the bytes in each word and do
4669 // everything big-endian or reverse the bits in each byte and do
4670 // it little-endian. On AArch64 it's more idiomatic to reverse
4671 // the bits in each byte (we have an instruction, RBIT, to do
4672 // that) and keep the data in little-endian bit order throughout the
4673 // calculation, bit-reversing the inputs and outputs.
4674
4675 StubCodeMark mark(this, "StubRoutines", "ghash_processBlocks");
4676 __ align(wordSize * 2);
4677 address p = __ pc();
4678 __ emit_int64(0x87); // The low-order bits of the field
4679 // polynomial (i.e.
p = z^7+z^2+z+1) 4680 // repeated in the low and high parts of a 4681 // 128-bit vector 4682 __ emit_int64(0x87); 4683 4684 __ align(CodeEntryAlignment); 4685 address start = __ pc(); 4686 4687 Register state = c_rarg0; 4688 Register subkeyH = c_rarg1; 4689 Register data = c_rarg2; 4690 Register blocks = c_rarg3; 4691 4692 FloatRegister vzr = v30; 4693 __ eor(vzr, __ T16B, vzr, vzr); // zero register 4694 4695 __ ldrq(v0, Address(state)); 4696 __ ldrq(v1, Address(subkeyH)); 4697 4698 __ rev64(v0, __ T16B, v0); // Bit-reverse words in state and subkeyH 4699 __ rbit(v0, __ T16B, v0); 4700 __ rev64(v1, __ T16B, v1); 4701 __ rbit(v1, __ T16B, v1); 4702 4703 __ ldrq(v26, p); 4704 4705 __ ext(v16, __ T16B, v1, v1, 0x08); // long-swap subkeyH into v1 4706 __ eor(v16, __ T16B, v16, v1); // xor subkeyH into subkeyL (Karatsuba: (A1+A0)) 4707 4708 { 4709 Label L_ghash_loop; 4710 __ bind(L_ghash_loop); 4711 4712 __ ldrq(v2, Address(__ post(data, 0x10))); // Load the data, bit 4713 // reversing each byte 4714 __ rbit(v2, __ T16B, v2); 4715 __ eor(v2, __ T16B, v0, v2); // bit-swapped data ^ bit-swapped state 4716 4717 // Multiply state in v2 by subkey in v1 4718 ghash_multiply(/*result_lo*/v5, /*result_hi*/v7, 4719 /*a*/v1, /*b*/v2, /*a1_xor_a0*/v16, 4720 /*temps*/v6, v20, v18, v21); 4721 // Reduce v7:v5 by the field polynomial 4722 ghash_reduce(v0, v5, v7, v26, vzr, v20); 4723 4724 __ sub(blocks, blocks, 1); 4725 __ cbnz(blocks, L_ghash_loop); 4726 } 4727 4728 // The bit-reversed result is at this point in v0 4729 __ rev64(v1, __ T16B, v0); 4730 __ rbit(v1, __ T16B, v1); 4731 4732 __ st1(v1, __ T16B, state); 4733 __ ret(lr); 4734 4735 return start; 4736 } 4737 4738 // Continuation point for throwing of implicit exceptions that are 4739 // not handled in the current activation. Fabricates an exception 4740 // oop and initiates normal exception dispatching in this 4741 // frame. Since we need to preserve callee-saved values (currently 4742 // only for C2, but done for C1 as well) we need a callee-saved oop 4743 // map and therefore have to make these stubs into RuntimeStubs 4744 // rather than BufferBlobs. If the compiler needs all registers to 4745 // be preserved between the fault point and the exception handler 4746 // then it must assume responsibility for that in 4747 // AbstractCompiler::continuation_for_implicit_null_exception or 4748 // continuation_for_implicit_division_by_zero_exception. All other 4749 // implicit exceptions (e.g., NullPointerException or 4750 // AbstractMethodError on entry) are either at call sites or 4751 // otherwise assume that stack unwinding will be initiated, so 4752 // caller saved registers were assumed volatile in the compiler. 4753 4754 #undef __ 4755 #define __ masm-> 4756 4757 address generate_throw_exception(const char* name, 4758 address runtime_entry, 4759 Register arg1 = noreg, 4760 Register arg2 = noreg) { 4761 // Information about frame layout at time of blocking runtime call. 4762 // Note that we only have to preserve callee-saved registers since 4763 // the compilers are responsible for supplying a continuation point 4764 // if they expect all registers to be preserved. 4765 // n.b. 
aarch64 asserts that frame::arg_reg_save_area_bytes == 0 4766 enum layout { 4767 rfp_off = 0, 4768 rfp_off2, 4769 return_off, 4770 return_off2, 4771 framesize // inclusive of return address 4772 }; 4773 4774 int insts_size = 512; 4775 int locs_size = 64; 4776 4777 CodeBuffer code(name, insts_size, locs_size); 4778 OopMapSet* oop_maps = new OopMapSet(); 4779 MacroAssembler* masm = new MacroAssembler(&code); 4780 4781 address start = __ pc(); 4782 4783 // This is an inlined and slightly modified version of call_VM 4784 // which has the ability to fetch the return PC out of 4785 // thread-local storage and also sets up last_Java_sp slightly 4786 // differently than the real call_VM 4787 4788 __ enter(); // Save FP and LR before call 4789 4790 assert(is_even(framesize/2), "sp not 16-byte aligned"); 4791 4792 // lr and fp are already in place 4793 __ sub(sp, rfp, ((unsigned)framesize-4) << LogBytesPerInt); // prolog 4794 4795 int frame_complete = __ pc() - start; 4796 4797 // Set up last_Java_sp and last_Java_fp 4798 address the_pc = __ pc(); 4799 __ set_last_Java_frame(sp, rfp, the_pc, rscratch1); 4800 4801 // Call runtime 4802 if (arg1 != noreg) { 4803 assert(arg2 != c_rarg1, "clobbered"); 4804 __ mov(c_rarg1, arg1); 4805 } 4806 if (arg2 != noreg) { 4807 __ mov(c_rarg2, arg2); 4808 } 4809 __ mov(c_rarg0, rthread); 4810 BLOCK_COMMENT("call runtime_entry"); 4811 __ mov(rscratch1, runtime_entry); 4812 __ blrt(rscratch1, 3 /* number_of_arguments */, 0, 1); 4813 4814 // Generate oop map 4815 OopMap* map = new OopMap(framesize, 0); 4816 4817 oop_maps->add_gc_map(the_pc - start, map); 4818 4819 __ reset_last_Java_frame(true); 4820 __ maybe_isb(); 4821 4822 __ leave(); 4823 4824 // check for pending exceptions 4825 #ifdef ASSERT 4826 Label L; 4827 __ ldr(rscratch1, Address(rthread, Thread::pending_exception_offset())); 4828 __ cbnz(rscratch1, L); 4829 __ should_not_reach_here(); 4830 __ bind(L); 4831 #endif // ASSERT 4832 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 4833 4834 4835 // codeBlob framesize is in words (not VMRegImpl::slot_size) 4836 RuntimeStub* stub = 4837 RuntimeStub::new_runtime_stub(name, 4838 &code, 4839 frame_complete, 4840 (framesize >> (LogBytesPerWord - LogBytesPerInt)), 4841 oop_maps, false); 4842 return stub->entry_point(); 4843 } 4844 4845 class MontgomeryMultiplyGenerator : public MacroAssembler { 4846 4847 Register Pa_base, Pb_base, Pn_base, Pm_base, inv, Rlen, Ra, Rb, Rm, Rn, 4848 Pa, Pb, Pn, Pm, Rhi_ab, Rlo_ab, Rhi_mn, Rlo_mn, t0, t1, t2, Ri, Rj; 4849 4850 RegSet _toSave; 4851 bool _squaring; 4852 4853 public: 4854 MontgomeryMultiplyGenerator (Assembler *as, bool squaring) 4855 : MacroAssembler(as->code()), _squaring(squaring) { 4856 4857 // Register allocation 4858 4859 Register reg = c_rarg0; 4860 Pa_base = reg; // Argument registers 4861 if (squaring) 4862 Pb_base = Pa_base; 4863 else 4864 Pb_base = ++reg; 4865 Pn_base = ++reg; 4866 Rlen= ++reg; 4867 inv = ++reg; 4868 Pm_base = ++reg; 4869 4870 // Working registers: 4871 Ra = ++reg; // The current digit of a, b, n, and m. 4872 Rb = ++reg; 4873 Rm = ++reg; 4874 Rn = ++reg; 4875 4876 Pa = ++reg; // Pointers to the current/next digit of a, b, n, and m. 4877 Pb = ++reg; 4878 Pm = ++reg; 4879 Pn = ++reg; 4880 4881 t0 = ++reg; // Three registers which form a 4882 t1 = ++reg; // triple-precision accumuator. 4883 t2 = ++reg; 4884 4885 Ri = ++reg; // Inner and outer loop indexes. 4886 Rj = ++reg; 4887 4888 Rhi_ab = ++reg; // Product registers: low and high parts 4889 Rlo_ab = ++reg; // of a*b and m*n. 
4890 Rhi_mn = ++reg; 4891 Rlo_mn = ++reg; 4892 4893 // r19 and up are callee-saved. 4894 _toSave = RegSet::range(r19, reg) + Pm_base; 4895 } 4896 4897 private: 4898 void save_regs() { 4899 push(_toSave, sp); 4900 } 4901 4902 void restore_regs() { 4903 pop(_toSave, sp); 4904 } 4905 4906 template <typename T> 4907 void unroll_2(Register count, T block) { 4908 Label loop, end, odd; 4909 tbnz(count, 0, odd); 4910 cbz(count, end); 4911 align(16); 4912 bind(loop); 4913 (this->*block)(); 4914 bind(odd); 4915 (this->*block)(); 4916 subs(count, count, 2); 4917 br(Assembler::GT, loop); 4918 bind(end); 4919 } 4920 4921 template <typename T> 4922 void unroll_2(Register count, T block, Register d, Register s, Register tmp) { 4923 Label loop, end, odd; 4924 tbnz(count, 0, odd); 4925 cbz(count, end); 4926 align(16); 4927 bind(loop); 4928 (this->*block)(d, s, tmp); 4929 bind(odd); 4930 (this->*block)(d, s, tmp); 4931 subs(count, count, 2); 4932 br(Assembler::GT, loop); 4933 bind(end); 4934 } 4935 4936 void pre1(RegisterOrConstant i) { 4937 block_comment("pre1"); 4938 // Pa = Pa_base; 4939 // Pb = Pb_base + i; 4940 // Pm = Pm_base; 4941 // Pn = Pn_base + i; 4942 // Ra = *Pa; 4943 // Rb = *Pb; 4944 // Rm = *Pm; 4945 // Rn = *Pn; 4946 ldr(Ra, Address(Pa_base)); 4947 ldr(Rb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 4948 ldr(Rm, Address(Pm_base)); 4949 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4950 lea(Pa, Address(Pa_base)); 4951 lea(Pb, Address(Pb_base, i, Address::uxtw(LogBytesPerWord))); 4952 lea(Pm, Address(Pm_base)); 4953 lea(Pn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 4954 4955 // Zero the m*n result. 4956 mov(Rhi_mn, zr); 4957 mov(Rlo_mn, zr); 4958 } 4959 4960 // The core multiply-accumulate step of a Montgomery 4961 // multiplication. The idea is to schedule operations as a 4962 // pipeline so that instructions with long latencies (loads and 4963 // multiplies) have time to complete before their results are 4964 // used. This most benefits in-order implementations of the 4965 // architecture but out-of-order ones also benefit. 4966 void step() { 4967 block_comment("step"); 4968 // MACC(Ra, Rb, t0, t1, t2); 4969 // Ra = *++Pa; 4970 // Rb = *--Pb; 4971 umulh(Rhi_ab, Ra, Rb); 4972 mul(Rlo_ab, Ra, Rb); 4973 ldr(Ra, pre(Pa, wordSize)); 4974 ldr(Rb, pre(Pb, -wordSize)); 4975 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n from the 4976 // previous iteration. 
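// The acc() helper used here folds a 128-bit product hi:lo into the
// triple-precision accumulator t2:t1:t0; in effect (sketch, not a literal
// listing of the helper):
//   adds(t0, t0, lo); adcs(t1, t1, hi); adc(t2, t2, zr);
// so each step() retires the multiply started on the previous iteration while
// the next loads and multiplies are already in flight.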
4977 // MACC(Rm, Rn, t0, t1, t2); 4978 // Rm = *++Pm; 4979 // Rn = *--Pn; 4980 umulh(Rhi_mn, Rm, Rn); 4981 mul(Rlo_mn, Rm, Rn); 4982 ldr(Rm, pre(Pm, wordSize)); 4983 ldr(Rn, pre(Pn, -wordSize)); 4984 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4985 } 4986 4987 void post1() { 4988 block_comment("post1"); 4989 4990 // MACC(Ra, Rb, t0, t1, t2); 4991 // Ra = *++Pa; 4992 // Rb = *--Pb; 4993 umulh(Rhi_ab, Ra, Rb); 4994 mul(Rlo_ab, Ra, Rb); 4995 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 4996 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 4997 4998 // *Pm = Rm = t0 * inv; 4999 mul(Rm, t0, inv); 5000 str(Rm, Address(Pm)); 5001 5002 // MACC(Rm, Rn, t0, t1, t2); 5003 // t0 = t1; t1 = t2; t2 = 0; 5004 umulh(Rhi_mn, Rm, Rn); 5005 5006 #ifndef PRODUCT 5007 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 5008 { 5009 mul(Rlo_mn, Rm, Rn); 5010 add(Rlo_mn, t0, Rlo_mn); 5011 Label ok; 5012 cbz(Rlo_mn, ok); { 5013 stop("broken Montgomery multiply"); 5014 } bind(ok); 5015 } 5016 #endif 5017 // We have very carefully set things up so that 5018 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 5019 // the lower half of Rm * Rn because we know the result already: 5020 // it must be -t0. t0 + (-t0) must generate a carry iff 5021 // t0 != 0. So, rather than do a mul and an adds we just set 5022 // the carry flag iff t0 is nonzero. 5023 // 5024 // mul(Rlo_mn, Rm, Rn); 5025 // adds(zr, t0, Rlo_mn); 5026 subs(zr, t0, 1); // Set carry iff t0 is nonzero 5027 adcs(t0, t1, Rhi_mn); 5028 adc(t1, t2, zr); 5029 mov(t2, zr); 5030 } 5031 5032 void pre2(RegisterOrConstant i, RegisterOrConstant len) { 5033 block_comment("pre2"); 5034 // Pa = Pa_base + i-len; 5035 // Pb = Pb_base + len; 5036 // Pm = Pm_base + i-len; 5037 // Pn = Pn_base + len; 5038 5039 if (i.is_register()) { 5040 sub(Rj, i.as_register(), len); 5041 } else { 5042 mov(Rj, i.as_constant()); 5043 sub(Rj, Rj, len); 5044 } 5045 // Rj == i-len 5046 5047 lea(Pa, Address(Pa_base, Rj, Address::uxtw(LogBytesPerWord))); 5048 lea(Pb, Address(Pb_base, len, Address::uxtw(LogBytesPerWord))); 5049 lea(Pm, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 5050 lea(Pn, Address(Pn_base, len, Address::uxtw(LogBytesPerWord))); 5051 5052 // Ra = *++Pa; 5053 // Rb = *--Pb; 5054 // Rm = *++Pm; 5055 // Rn = *--Pn; 5056 ldr(Ra, pre(Pa, wordSize)); 5057 ldr(Rb, pre(Pb, -wordSize)); 5058 ldr(Rm, pre(Pm, wordSize)); 5059 ldr(Rn, pre(Pn, -wordSize)); 5060 5061 mov(Rhi_mn, zr); 5062 mov(Rlo_mn, zr); 5063 } 5064 5065 void post2(RegisterOrConstant i, RegisterOrConstant len) { 5066 block_comment("post2"); 5067 if (i.is_constant()) { 5068 mov(Rj, i.as_constant()-len.as_constant()); 5069 } else { 5070 sub(Rj, i.as_register(), len); 5071 } 5072 5073 adds(t0, t0, Rlo_mn); // The pending m*n, low part 5074 5075 // As soon as we know the least significant digit of our result, 5076 // store it. 5077 // Pm_base[i-len] = t0; 5078 str(t0, Address(Pm_base, Rj, Address::uxtw(LogBytesPerWord))); 5079 5080 // t0 = t1; t1 = t2; t2 = 0; 5081 adcs(t0, t1, Rhi_mn); // The pending m*n, high part 5082 adc(t1, t2, zr); 5083 mov(t2, zr); 5084 } 5085 5086 // A carry in t0 after Montgomery multiplication means that we 5087 // should subtract multiples of n from our result in m. We'll 5088 // keep doing that until there is no carry. 
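  // In C, approximately -- a sketch only, expanding the `sub' helper referred to in
  // the pseudocode below (the generated code rides the carry flag through sbcs/sbc
  // rather than keeping an explicit borrow variable):
  //
  //   while (t0 != 0) {
  //     unsigned long borrow = 0;
  //     for (int i = 0; i < len; i++) {
  //       unsigned long s = Pn_base[i] + borrow;
  //       borrow = (s < borrow);            // carry out of Pn_base[i] + borrow
  //       borrow |= (Pm_base[i] < s);       // borrow out of the word subtract
  //       Pm_base[i] = Pm_base[i] - s;
  //     }
  //     t0 = t0 - borrow;
  //   }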
5089 void normalize(RegisterOrConstant len) { 5090 block_comment("normalize"); 5091 // while (t0) 5092 // t0 = sub(Pm_base, Pn_base, t0, len); 5093 Label loop, post, again; 5094 Register cnt = t1, i = t2; // Re-use registers; we're done with them now 5095 cbz(t0, post); { 5096 bind(again); { 5097 mov(i, zr); 5098 mov(cnt, len); 5099 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 5100 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 5101 subs(zr, zr, zr); // set carry flag, i.e. no borrow 5102 align(16); 5103 bind(loop); { 5104 sbcs(Rm, Rm, Rn); 5105 str(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 5106 add(i, i, 1); 5107 ldr(Rm, Address(Pm_base, i, Address::uxtw(LogBytesPerWord))); 5108 ldr(Rn, Address(Pn_base, i, Address::uxtw(LogBytesPerWord))); 5109 sub(cnt, cnt, 1); 5110 } cbnz(cnt, loop); 5111 sbc(t0, t0, zr); 5112 } cbnz(t0, again); 5113 } bind(post); 5114 } 5115 5116 // Move memory at s to d, reversing words. 5117 // Increments d to end of copied memory 5118 // Destroys tmp1, tmp2 5119 // Preserves len 5120 // Leaves s pointing to the address which was in d at start 5121 void reverse(Register d, Register s, Register len, Register tmp1, Register tmp2) { 5122 assert(tmp1 < r19 && tmp2 < r19, "register corruption"); 5123 5124 lea(s, Address(s, len, Address::uxtw(LogBytesPerWord))); 5125 mov(tmp1, len); 5126 unroll_2(tmp1, &MontgomeryMultiplyGenerator::reverse1, d, s, tmp2); 5127 sub(s, d, len, ext::uxtw, LogBytesPerWord); 5128 } 5129 // where 5130 void reverse1(Register d, Register s, Register tmp) { 5131 ldr(tmp, pre(s, -wordSize)); 5132 ror(tmp, tmp, 32); 5133 str(tmp, post(d, wordSize)); 5134 } 5135 5136 void step_squaring() { 5137 // An extra ACC 5138 step(); 5139 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 5140 } 5141 5142 void last_squaring(RegisterOrConstant i) { 5143 Label dont; 5144 // if ((i & 1) == 0) { 5145 tbnz(i.as_register(), 0, dont); { 5146 // MACC(Ra, Rb, t0, t1, t2); 5147 // Ra = *++Pa; 5148 // Rb = *--Pb; 5149 umulh(Rhi_ab, Ra, Rb); 5150 mul(Rlo_ab, Ra, Rb); 5151 acc(Rhi_ab, Rlo_ab, t0, t1, t2); 5152 } bind(dont); 5153 } 5154 5155 void extra_step_squaring() { 5156 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 5157 5158 // MACC(Rm, Rn, t0, t1, t2); 5159 // Rm = *++Pm; 5160 // Rn = *--Pn; 5161 umulh(Rhi_mn, Rm, Rn); 5162 mul(Rlo_mn, Rm, Rn); 5163 ldr(Rm, pre(Pm, wordSize)); 5164 ldr(Rn, pre(Pn, -wordSize)); 5165 } 5166 5167 void post1_squaring() { 5168 acc(Rhi_mn, Rlo_mn, t0, t1, t2); // The pending m*n 5169 5170 // *Pm = Rm = t0 * inv; 5171 mul(Rm, t0, inv); 5172 str(Rm, Address(Pm)); 5173 5174 // MACC(Rm, Rn, t0, t1, t2); 5175 // t0 = t1; t1 = t2; t2 = 0; 5176 umulh(Rhi_mn, Rm, Rn); 5177 5178 #ifndef PRODUCT 5179 // assert(m[i] * n[0] + t0 == 0, "broken Montgomery multiply"); 5180 { 5181 mul(Rlo_mn, Rm, Rn); 5182 add(Rlo_mn, t0, Rlo_mn); 5183 Label ok; 5184 cbz(Rlo_mn, ok); { 5185 stop("broken Montgomery multiply"); 5186 } bind(ok); 5187 } 5188 #endif 5189 // We have very carefully set things up so that 5190 // m[i]*n[0] + t0 == 0 (mod b), so we don't have to calculate 5191 // the lower half of Rm * Rn because we know the result already: 5192 // it must be -t0. t0 + (-t0) must generate a carry iff 5193 // t0 != 0. So, rather than do a mul and an adds we just set 5194 // the carry flag iff t0 is nonzero. 
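    // (Concretely, with 64-bit words: when t0 != 0 the discarded low product is
    //  -t0 mod 2^64 == 2^64 - t0, so t0 + (-t0) == 2^64 -- a zero result with a
    //  carry out; when t0 == 0 the sum is zero with no carry.  subs(zr, t0, 1)
    //  computes t0 - 1, and AArch64 sets the carry flag on a subtract when there
    //  is no borrow, i.e. exactly when t0 >= 1.)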
5195 // 5196 // mul(Rlo_mn, Rm, Rn); 5197 // adds(zr, t0, Rlo_mn); 5198 subs(zr, t0, 1); // Set carry iff t0 is nonzero 5199 adcs(t0, t1, Rhi_mn); 5200 adc(t1, t2, zr); 5201 mov(t2, zr); 5202 } 5203 5204 void acc(Register Rhi, Register Rlo, 5205 Register t0, Register t1, Register t2) { 5206 adds(t0, t0, Rlo); 5207 adcs(t1, t1, Rhi); 5208 adc(t2, t2, zr); 5209 } 5210 5211 public: 5212 /** 5213 * Fast Montgomery multiplication. The derivation of the 5214 * algorithm is in A Cryptographic Library for the Motorola 5215 * DSP56000, Dusse and Kaliski, Proc. EUROCRYPT 90, pp. 230-237. 5216 * 5217 * Arguments: 5218 * 5219 * Inputs for multiplication: 5220 * c_rarg0 - int array elements a 5221 * c_rarg1 - int array elements b 5222 * c_rarg2 - int array elements n (the modulus) 5223 * c_rarg3 - int length 5224 * c_rarg4 - int inv 5225 * c_rarg5 - int array elements m (the result) 5226 * 5227 * Inputs for squaring: 5228 * c_rarg0 - int array elements a 5229 * c_rarg1 - int array elements n (the modulus) 5230 * c_rarg2 - int length 5231 * c_rarg3 - int inv 5232 * c_rarg4 - int array elements m (the result) 5233 * 5234 */ 5235 address generate_multiply() { 5236 Label argh, nothing; 5237 bind(argh); 5238 stop("MontgomeryMultiply total_allocation must be <= 8192"); 5239 5240 align(CodeEntryAlignment); 5241 address entry = pc(); 5242 5243 cbzw(Rlen, nothing); 5244 5245 enter(); 5246 5247 // Make room. 5248 cmpw(Rlen, 512); 5249 br(Assembler::HI, argh); 5250 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 5251 andr(sp, Ra, -2 * wordSize); 5252 5253 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 5254 5255 { 5256 // Copy input args, reversing as we go. We use Ra as a 5257 // temporary variable. 5258 reverse(Ra, Pa_base, Rlen, t0, t1); 5259 if (!_squaring) 5260 reverse(Ra, Pb_base, Rlen, t0, t1); 5261 reverse(Ra, Pn_base, Rlen, t0, t1); 5262 } 5263 5264 // Push all call-saved registers and also Pm_base which we'll need 5265 // at the end. 
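    // (save_regs()/restore_regs() push and pop _toSave, which the constructor set to
    //  r19 up to the highest register we allocated, plus Pm_base: the former because
    //  they are callee-saved and we clobber them, the latter so the caller's Pm_base
    //  can be recovered for the final copy-out of the result.)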
5266 save_regs(); 5267 5268 #ifndef PRODUCT 5269 // assert(inv * n[0] == -1UL, "broken inverse in Montgomery multiply"); 5270 { 5271 ldr(Rn, Address(Pn_base, 0)); 5272 mul(Rlo_mn, Rn, inv); 5273 subs(zr, Rlo_mn, -1); 5274 Label ok; 5275 br(EQ, ok); { 5276 stop("broken inverse in Montgomery multiply"); 5277 } bind(ok); 5278 } 5279 #endif 5280 5281 mov(Pm_base, Ra); 5282 5283 mov(t0, zr); 5284 mov(t1, zr); 5285 mov(t2, zr); 5286 5287 block_comment("for (int i = 0; i < len; i++) {"); 5288 mov(Ri, zr); { 5289 Label loop, end; 5290 cmpw(Ri, Rlen); 5291 br(Assembler::GE, end); 5292 5293 bind(loop); 5294 pre1(Ri); 5295 5296 block_comment(" for (j = i; j; j--) {"); { 5297 movw(Rj, Ri); 5298 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 5299 } block_comment(" } // j"); 5300 5301 post1(); 5302 addw(Ri, Ri, 1); 5303 cmpw(Ri, Rlen); 5304 br(Assembler::LT, loop); 5305 bind(end); 5306 block_comment("} // i"); 5307 } 5308 5309 block_comment("for (int i = len; i < 2*len; i++) {"); 5310 mov(Ri, Rlen); { 5311 Label loop, end; 5312 cmpw(Ri, Rlen, Assembler::LSL, 1); 5313 br(Assembler::GE, end); 5314 5315 bind(loop); 5316 pre2(Ri, Rlen); 5317 5318 block_comment(" for (j = len*2-i-1; j; j--) {"); { 5319 lslw(Rj, Rlen, 1); 5320 subw(Rj, Rj, Ri); 5321 subw(Rj, Rj, 1); 5322 unroll_2(Rj, &MontgomeryMultiplyGenerator::step); 5323 } block_comment(" } // j"); 5324 5325 post2(Ri, Rlen); 5326 addw(Ri, Ri, 1); 5327 cmpw(Ri, Rlen, Assembler::LSL, 1); 5328 br(Assembler::LT, loop); 5329 bind(end); 5330 } 5331 block_comment("} // i"); 5332 5333 normalize(Rlen); 5334 5335 mov(Ra, Pm_base); // Save Pm_base in Ra 5336 restore_regs(); // Restore caller's Pm_base 5337 5338 // Copy our result into caller's Pm_base 5339 reverse(Pm_base, Ra, Rlen, t0, t1); 5340 5341 leave(); 5342 bind(nothing); 5343 ret(lr); 5344 5345 return entry; 5346 } 5347 // In C, approximately: 5348 5349 // void 5350 // montgomery_multiply(unsigned long Pa_base[], unsigned long Pb_base[], 5351 // unsigned long Pn_base[], unsigned long Pm_base[], 5352 // unsigned long inv, int len) { 5353 // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 5354 // unsigned long *Pa, *Pb, *Pn, *Pm; 5355 // unsigned long Ra, Rb, Rn, Rm; 5356 5357 // int i; 5358 5359 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 5360 5361 // for (i = 0; i < len; i++) { 5362 // int j; 5363 5364 // Pa = Pa_base; 5365 // Pb = Pb_base + i; 5366 // Pm = Pm_base; 5367 // Pn = Pn_base + i; 5368 5369 // Ra = *Pa; 5370 // Rb = *Pb; 5371 // Rm = *Pm; 5372 // Rn = *Pn; 5373 5374 // int iters = i; 5375 // for (j = 0; iters--; j++) { 5376 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 5377 // MACC(Ra, Rb, t0, t1, t2); 5378 // Ra = *++Pa; 5379 // Rb = *--Pb; 5380 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5381 // MACC(Rm, Rn, t0, t1, t2); 5382 // Rm = *++Pm; 5383 // Rn = *--Pn; 5384 // } 5385 5386 // assert(Ra == Pa_base[i] && Rb == Pb_base[0], "must be"); 5387 // MACC(Ra, Rb, t0, t1, t2); 5388 // *Pm = Rm = t0 * inv; 5389 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 5390 // MACC(Rm, Rn, t0, t1, t2); 5391 5392 // assert(t0 == 0, "broken Montgomery multiply"); 5393 5394 // t0 = t1; t1 = t2; t2 = 0; 5395 // } 5396 5397 // for (i = len; i < 2*len; i++) { 5398 // int j; 5399 5400 // Pa = Pa_base + i-len; 5401 // Pb = Pb_base + len; 5402 // Pm = Pm_base + i-len; 5403 // Pn = Pn_base + len; 5404 5405 // Ra = *++Pa; 5406 // Rb = *--Pb; 5407 // Rm = *++Pm; 5408 // Rn = *--Pn; 5409 5410 // int iters = len*2-i-1; 
5411 // for (j = i-len+1; iters--; j++) { 5412 // assert(Ra == Pa_base[j] && Rb == Pb_base[i-j], "must be"); 5413 // MACC(Ra, Rb, t0, t1, t2); 5414 // Ra = *++Pa; 5415 // Rb = *--Pb; 5416 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5417 // MACC(Rm, Rn, t0, t1, t2); 5418 // Rm = *++Pm; 5419 // Rn = *--Pn; 5420 // } 5421 5422 // Pm_base[i-len] = t0; 5423 // t0 = t1; t1 = t2; t2 = 0; 5424 // } 5425 5426 // while (t0) 5427 // t0 = sub(Pm_base, Pn_base, t0, len); 5428 // } 5429 5430 /** 5431 * Fast Montgomery squaring. This uses asymptotically 25% fewer 5432 * multiplies than Montgomery multiplication so it should be up to 5433 * 25% faster. However, its loop control is more complex and it 5434 * may actually run slower on some machines. 5435 * 5436 * Arguments: 5437 * 5438 * Inputs: 5439 * c_rarg0 - int array elements a 5440 * c_rarg1 - int array elements n (the modulus) 5441 * c_rarg2 - int length 5442 * c_rarg3 - int inv 5443 * c_rarg4 - int array elements m (the result) 5444 * 5445 */ 5446 address generate_square() { 5447 Label argh; 5448 bind(argh); 5449 stop("MontgomeryMultiply total_allocation must be <= 8192"); 5450 5451 align(CodeEntryAlignment); 5452 address entry = pc(); 5453 5454 enter(); 5455 5456 // Make room. 5457 cmpw(Rlen, 512); 5458 br(Assembler::HI, argh); 5459 sub(Ra, sp, Rlen, ext::uxtw, exact_log2(4 * sizeof (jint))); 5460 andr(sp, Ra, -2 * wordSize); 5461 5462 lsrw(Rlen, Rlen, 1); // length in longwords = len/2 5463 5464 { 5465 // Copy input args, reversing as we go. We use Ra as a 5466 // temporary variable. 5467 reverse(Ra, Pa_base, Rlen, t0, t1); 5468 reverse(Ra, Pn_base, Rlen, t0, t1); 5469 } 5470 5471 // Push all call-saved registers and also Pm_base which we'll need 5472 // at the end. 5473 save_regs(); 5474 5475 mov(Pm_base, Ra); 5476 5477 mov(t0, zr); 5478 mov(t1, zr); 5479 mov(t2, zr); 5480 5481 block_comment("for (int i = 0; i < len; i++) {"); 5482 mov(Ri, zr); { 5483 Label loop, end; 5484 bind(loop); 5485 cmp(Ri, Rlen); 5486 br(Assembler::GE, end); 5487 5488 pre1(Ri); 5489 5490 block_comment("for (j = (i+1)/2; j; j--) {"); { 5491 add(Rj, Ri, 1); 5492 lsr(Rj, Rj, 1); 5493 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 5494 } block_comment(" } // j"); 5495 5496 last_squaring(Ri); 5497 5498 block_comment(" for (j = i/2; j; j--) {"); { 5499 lsr(Rj, Ri, 1); 5500 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 5501 } block_comment(" } // j"); 5502 5503 post1_squaring(); 5504 add(Ri, Ri, 1); 5505 cmp(Ri, Rlen); 5506 br(Assembler::LT, loop); 5507 5508 bind(end); 5509 block_comment("} // i"); 5510 } 5511 5512 block_comment("for (int i = len; i < 2*len; i++) {"); 5513 mov(Ri, Rlen); { 5514 Label loop, end; 5515 bind(loop); 5516 cmp(Ri, Rlen, Assembler::LSL, 1); 5517 br(Assembler::GE, end); 5518 5519 pre2(Ri, Rlen); 5520 5521 block_comment(" for (j = (2*len-i-1)/2; j; j--) {"); { 5522 lsl(Rj, Rlen, 1); 5523 sub(Rj, Rj, Ri); 5524 sub(Rj, Rj, 1); 5525 lsr(Rj, Rj, 1); 5526 unroll_2(Rj, &MontgomeryMultiplyGenerator::step_squaring); 5527 } block_comment(" } // j"); 5528 5529 last_squaring(Ri); 5530 5531 block_comment(" for (j = (2*len-i)/2; j; j--) {"); { 5532 lsl(Rj, Rlen, 1); 5533 sub(Rj, Rj, Ri); 5534 lsr(Rj, Rj, 1); 5535 unroll_2(Rj, &MontgomeryMultiplyGenerator::extra_step_squaring); 5536 } block_comment(" } // j"); 5537 5538 post2(Ri, Rlen); 5539 add(Ri, Ri, 1); 5540 cmp(Ri, Rlen, Assembler::LSL, 1); 5541 5542 br(Assembler::LT, loop); 5543 bind(end); 5544 block_comment("} // i"); 5545 } 5546 5547 normalize(Rlen); 5548 5549 
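      // The (still word-reversed) result now lives in our on-stack copy addressed by
      // Pm_base; the epilogue below saves that pointer, restores the caller's
      // Pm_base, and copies the result back out, re-reversing it into the caller's
      // original word order.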
mov(Ra, Pm_base); // Save Pm_base in Ra 5550 restore_regs(); // Restore caller's Pm_base 5551 5552 // Copy our result into caller's Pm_base 5553 reverse(Pm_base, Ra, Rlen, t0, t1); 5554 5555 leave(); 5556 ret(lr); 5557 5558 return entry; 5559 } 5560 // In C, approximately: 5561 5562 // void 5563 // montgomery_square(unsigned long Pa_base[], unsigned long Pn_base[], 5564 // unsigned long Pm_base[], unsigned long inv, int len) { 5565 // unsigned long t0 = 0, t1 = 0, t2 = 0; // Triple-precision accumulator 5566 // unsigned long *Pa, *Pb, *Pn, *Pm; 5567 // unsigned long Ra, Rb, Rn, Rm; 5568 5569 // int i; 5570 5571 // assert(inv * Pn_base[0] == -1UL, "broken inverse in Montgomery multiply"); 5572 5573 // for (i = 0; i < len; i++) { 5574 // int j; 5575 5576 // Pa = Pa_base; 5577 // Pb = Pa_base + i; 5578 // Pm = Pm_base; 5579 // Pn = Pn_base + i; 5580 5581 // Ra = *Pa; 5582 // Rb = *Pb; 5583 // Rm = *Pm; 5584 // Rn = *Pn; 5585 5586 // int iters = (i+1)/2; 5587 // for (j = 0; iters--; j++) { 5588 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 5589 // MACC2(Ra, Rb, t0, t1, t2); 5590 // Ra = *++Pa; 5591 // Rb = *--Pb; 5592 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5593 // MACC(Rm, Rn, t0, t1, t2); 5594 // Rm = *++Pm; 5595 // Rn = *--Pn; 5596 // } 5597 // if ((i & 1) == 0) { 5598 // assert(Ra == Pa_base[j], "must be"); 5599 // MACC(Ra, Ra, t0, t1, t2); 5600 // } 5601 // iters = i/2; 5602 // assert(iters == i-j, "must be"); 5603 // for (; iters--; j++) { 5604 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5605 // MACC(Rm, Rn, t0, t1, t2); 5606 // Rm = *++Pm; 5607 // Rn = *--Pn; 5608 // } 5609 5610 // *Pm = Rm = t0 * inv; 5611 // assert(Rm == Pm_base[i] && Rn == Pn_base[0], "must be"); 5612 // MACC(Rm, Rn, t0, t1, t2); 5613 5614 // assert(t0 == 0, "broken Montgomery multiply"); 5615 5616 // t0 = t1; t1 = t2; t2 = 0; 5617 // } 5618 5619 // for (i = len; i < 2*len; i++) { 5620 // int start = i-len+1; 5621 // int end = start + (len - start)/2; 5622 // int j; 5623 5624 // Pa = Pa_base + i-len; 5625 // Pb = Pa_base + len; 5626 // Pm = Pm_base + i-len; 5627 // Pn = Pn_base + len; 5628 5629 // Ra = *++Pa; 5630 // Rb = *--Pb; 5631 // Rm = *++Pm; 5632 // Rn = *--Pn; 5633 5634 // int iters = (2*len-i-1)/2; 5635 // assert(iters == end-start, "must be"); 5636 // for (j = start; iters--; j++) { 5637 // assert(Ra == Pa_base[j] && Rb == Pa_base[i-j], "must be"); 5638 // MACC2(Ra, Rb, t0, t1, t2); 5639 // Ra = *++Pa; 5640 // Rb = *--Pb; 5641 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5642 // MACC(Rm, Rn, t0, t1, t2); 5643 // Rm = *++Pm; 5644 // Rn = *--Pn; 5645 // } 5646 // if ((i & 1) == 0) { 5647 // assert(Ra == Pa_base[j], "must be"); 5648 // MACC(Ra, Ra, t0, t1, t2); 5649 // } 5650 // iters = (2*len-i)/2; 5651 // assert(iters == len-j, "must be"); 5652 // for (; iters--; j++) { 5653 // assert(Rm == Pm_base[j] && Rn == Pn_base[i-j], "must be"); 5654 // MACC(Rm, Rn, t0, t1, t2); 5655 // Rm = *++Pm; 5656 // Rn = *--Pn; 5657 // } 5658 // Pm_base[i-len] = t0; 5659 // t0 = t1; t1 = t2; t2 = 0; 5660 // } 5661 5662 // while (t0) 5663 // t0 = sub(Pm_base, Pn_base, t0, len); 5664 // } 5665 }; 5666 5667 5668 // Call here from the interpreter or compiled code to either load 5669 // multiple returned values from the value type instance being 5670 // returned to registers or to store returned values to a newly 5671 // allocated value type instance. 
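  //
  // The runtime entries wrapped here are SharedRuntime::load_value_type_fields_in_regs
  // and SharedRuntime::store_value_type_fields_to_buf (installed in generate_initial()
  // below).  Their authoritative declarations live in sharedRuntime.hpp; judging only
  // from the calling sequence in this stub they are assumed to take roughly
  // (JavaThread* thread, <value passed in r0>): the stub hands rthread over in c_rarg0
  // and the incoming r0 in c_rarg1, and when has_res is true it collects the resulting
  // oop afterwards with get_vm_result().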
5672 address generate_return_value_stub(address destination, const char* name, bool has_res) { 5673 5674 // Information about frame layout at time of blocking runtime call. 5675 // Note that we only have to preserve callee-saved registers since 5676 // the compilers are responsible for supplying a continuation point 5677 // if they expect all registers to be preserved. 5678 // n.b. aarch64 asserts that frame::arg_reg_save_area_bytes == 0 5679 enum layout { 5680 rfp_off = 0, rfp_off2, 5681 5682 j_rarg7_off, j_rarg7_2, 5683 j_rarg6_off, j_rarg6_2, 5684 j_rarg5_off, j_rarg5_2, 5685 j_rarg4_off, j_rarg4_2, 5686 j_rarg3_off, j_rarg3_2, 5687 j_rarg2_off, j_rarg2_2, 5688 j_rarg1_off, j_rarg1_2, 5689 j_rarg0_off, j_rarg0_2, 5690 5691 j_farg0_off, j_farg0_2, 5692 j_farg1_off, j_farg1_2, 5693 j_farg2_off, j_farg2_2, 5694 j_farg3_off, j_farg3_2, 5695 j_farg4_off, j_farg4_2, 5696 j_farg5_off, j_farg5_2, 5697 j_farg6_off, j_farg6_2, 5698 j_farg7_off, j_farg7_2, 5699 5700 return_off, return_off2, 5701 framesize // inclusive of return address 5702 }; 5703 5704 int insts_size = 512; 5705 int locs_size = 64; 5706 5707 CodeBuffer code(name, insts_size, locs_size); 5708 OopMapSet* oop_maps = new OopMapSet(); 5709 MacroAssembler* masm = new MacroAssembler(&code); 5710 5711 address start = __ pc(); 5712 5713 const Address f7_save (rfp, j_farg7_off * wordSize); 5714 const Address f6_save (rfp, j_farg6_off * wordSize); 5715 const Address f5_save (rfp, j_farg5_off * wordSize); 5716 const Address f4_save (rfp, j_farg4_off * wordSize); 5717 const Address f3_save (rfp, j_farg3_off * wordSize); 5718 const Address f2_save (rfp, j_farg2_off * wordSize); 5719 const Address f1_save (rfp, j_farg1_off * wordSize); 5720 const Address f0_save (rfp, j_farg0_off * wordSize); 5721 5722 const Address r0_save (rfp, j_rarg0_off * wordSize); 5723 const Address r1_save (rfp, j_rarg1_off * wordSize); 5724 const Address r2_save (rfp, j_rarg2_off * wordSize); 5725 const Address r3_save (rfp, j_rarg3_off * wordSize); 5726 const Address r4_save (rfp, j_rarg4_off * wordSize); 5727 const Address r5_save (rfp, j_rarg5_off * wordSize); 5728 const Address r6_save (rfp, j_rarg6_off * wordSize); 5729 const Address r7_save (rfp, j_rarg7_off * wordSize); 5730 5731 // Generate oop map 5732 OopMap* map = new OopMap(framesize, 0); 5733 5734 map->set_callee_saved(VMRegImpl::stack2reg(rfp_off), rfp->as_VMReg()); 5735 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg7_off), j_rarg7->as_VMReg()); 5736 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg6_off), j_rarg6->as_VMReg()); 5737 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg5_off), j_rarg5->as_VMReg()); 5738 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg4_off), j_rarg4->as_VMReg()); 5739 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg3_off), j_rarg3->as_VMReg()); 5740 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg2_off), j_rarg2->as_VMReg()); 5741 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg1_off), j_rarg1->as_VMReg()); 5742 map->set_callee_saved(VMRegImpl::stack2reg(j_rarg0_off), j_rarg0->as_VMReg()); 5743 5744 map->set_callee_saved(VMRegImpl::stack2reg(j_farg0_off), j_farg0->as_VMReg()); 5745 map->set_callee_saved(VMRegImpl::stack2reg(j_farg1_off), j_farg1->as_VMReg()); 5746 map->set_callee_saved(VMRegImpl::stack2reg(j_farg2_off), j_farg2->as_VMReg()); 5747 map->set_callee_saved(VMRegImpl::stack2reg(j_farg3_off), j_farg3->as_VMReg()); 5748 map->set_callee_saved(VMRegImpl::stack2reg(j_farg4_off), j_farg4->as_VMReg()); 5749 map->set_callee_saved(VMRegImpl::stack2reg(j_farg5_off), 
j_farg5->as_VMReg()); 5750 map->set_callee_saved(VMRegImpl::stack2reg(j_farg6_off), j_farg6->as_VMReg()); 5751 map->set_callee_saved(VMRegImpl::stack2reg(j_farg7_off), j_farg7->as_VMReg()); 5752 5753 // This is an inlined and slightly modified version of call_VM 5754 // which has the ability to fetch the return PC out of 5755 // thread-local storage and also sets up last_Java_sp slightly 5756 // differently than the real call_VM 5757 5758 __ enter(); // Save FP and LR before call 5759 5760 assert(is_even(framesize/2), "sp not 16-byte aligned"); 5761 5762 // lr and fp are already in place 5763 __ sub(sp, rfp, ((unsigned)framesize - 4) << LogBytesPerInt); // prolog 5764 5765 __ strd(j_farg7, f7_save); 5766 __ strd(j_farg6, f6_save); 5767 __ strd(j_farg5, f5_save); 5768 __ strd(j_farg4, f4_save); 5769 __ strd(j_farg3, f3_save); 5770 __ strd(j_farg2, f2_save); 5771 __ strd(j_farg1, f1_save); 5772 __ strd(j_farg0, f0_save); 5773 5774 __ str(j_rarg0, r0_save); 5775 __ str(j_rarg1, r1_save); 5776 __ str(j_rarg2, r2_save); 5777 __ str(j_rarg3, r3_save); 5778 __ str(j_rarg4, r4_save); 5779 __ str(j_rarg5, r5_save); 5780 __ str(j_rarg6, r6_save); 5781 __ str(j_rarg7, r7_save); 5782 5783 int frame_complete = __ pc() - start; 5784 5785 // Set up last_Java_sp and last_Java_fp 5786 address the_pc = __ pc(); 5787 __ set_last_Java_frame(sp, rfp, the_pc, rscratch1); 5788 5789 // Call runtime 5790 __ mov(c_rarg0, rthread); 5791 __ mov(c_rarg1, r0); 5792 5793 BLOCK_COMMENT("call runtime_entry"); 5794 __ mov(rscratch1, destination); 5795 __ blrt(rscratch1, 2 /* number_of_arguments */, 0, 1); 5796 5797 oop_maps->add_gc_map(the_pc - start, map); 5798 5799 __ reset_last_Java_frame(false); 5800 __ maybe_isb(); 5801 5802 __ ldrd(j_farg7, f7_save); 5803 __ ldrd(j_farg6, f6_save); 5804 __ ldrd(j_farg5, f5_save); 5805 __ ldrd(j_farg4, f4_save); 5806 __ ldrd(j_farg3, f3_save); 5807 __ ldrd(j_farg2, f2_save); 5808 __ ldrd(j_farg1, f1_save); 5809 __ ldrd(j_farg0, f0_save); 5810 5811 __ ldr(j_rarg0, r0_save); 5812 __ ldr(j_rarg1, r1_save); 5813 __ ldr(j_rarg2, r2_save); 5814 __ ldr(j_rarg3, r3_save); 5815 __ ldr(j_rarg4, r4_save); 5816 __ ldr(j_rarg5, r5_save); 5817 __ ldr(j_rarg6, r6_save); 5818 __ ldr(j_rarg7, r7_save); 5819 5820 __ leave(); 5821 5822 // check for pending exceptions 5823 Label pending; 5824 __ ldr(rscratch1, Address(rthread, in_bytes(Thread::pending_exception_offset()))); 5825 __ cmp(rscratch1, (u1)NULL_WORD); 5826 __ br(Assembler::NE, pending); 5827 5828 if (has_res) { 5829 __ get_vm_result(r0, rthread); 5830 } 5831 __ ret(lr); 5832 5833 __ bind(pending); 5834 __ ldr(r0, Address(rthread, in_bytes(Thread::pending_exception_offset()))); 5835 __ far_jump(RuntimeAddress(StubRoutines::forward_exception_entry())); 5836 5837 5838 // codeBlob framesize is in words (not VMRegImpl::slot_size) 5839 int frame_size_in_words = (framesize >> (LogBytesPerWord - LogBytesPerInt)); 5840 RuntimeStub* stub = 5841 RuntimeStub::new_runtime_stub(name, &code, frame_complete, frame_size_in_words, oop_maps, false); 5842 5843 return stub->entry_point(); 5844 } 5845 5846 // Initialization 5847 void generate_initial() { 5848 // Generate initial stubs and initialize the entry points 5849 5850 // entry points that exist in all platforms. Note: This is code 5851 // that could be shared among different platforms - however the 5852 // benefit seems to be smaller than the disadvantage of having a 5853 // much more complicated generator structure. See also comment in 5854 // stubRoutines.hpp.
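    // (Phase note: generate_initial() is assumed to run early in VM start-up, before
    //  universe initialization, so only stubs needed that early are emitted here;
    //  everything that can wait -- including verify_oop, which the comment in
    //  generate_all() says must follow universe_init -- is emitted by generate_all().)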
5855 5856 StubRoutines::_forward_exception_entry = generate_forward_exception(); 5857 5858 StubRoutines::_call_stub_entry = 5859 generate_call_stub(StubRoutines::_call_stub_return_address); 5860 5861 // is referenced by megamorphic call 5862 StubRoutines::_catch_exception_entry = generate_catch_exception(); 5863 5864 // Build this early so it's available for the interpreter. 5865 StubRoutines::_throw_StackOverflowError_entry = 5866 generate_throw_exception("StackOverflowError throw_exception", 5867 CAST_FROM_FN_PTR(address, 5868 SharedRuntime::throw_StackOverflowError)); 5869 StubRoutines::_throw_delayed_StackOverflowError_entry = 5870 generate_throw_exception("delayed StackOverflowError throw_exception", 5871 CAST_FROM_FN_PTR(address, 5872 SharedRuntime::throw_delayed_StackOverflowError)); 5873 if (UseCRC32Intrinsics) { 5874 // set table address before stub generation which uses it 5875 StubRoutines::_crc_table_adr = (address)StubRoutines::aarch64::_crc_table; 5876 StubRoutines::_updateBytesCRC32 = generate_updateBytesCRC32(); 5877 } 5878 5879 if (UseCRC32CIntrinsics) { 5880 StubRoutines::_updateBytesCRC32C = generate_updateBytesCRC32C(); 5881 } 5882 5883 // Disabled until JDK-8210858 is fixed 5884 // if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dlog)) { 5885 // StubRoutines::_dlog = generate_dlog(); 5886 // } 5887 5888 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dsin)) { 5889 StubRoutines::_dsin = generate_dsin_dcos(/* isCos = */ false); 5890 } 5891 5892 if (vmIntrinsics::is_intrinsic_available(vmIntrinsics::_dcos)) { 5893 StubRoutines::_dcos = generate_dsin_dcos(/* isCos = */ true); 5894 } 5895 5896 5897 StubRoutines::_load_value_type_fields_in_regs = 5898 generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::load_value_type_fields_in_regs), "load_value_type_fields_in_regs", false); 5899 StubRoutines::_store_value_type_fields_to_buf = 5900 generate_return_value_stub(CAST_FROM_FN_PTR(address, SharedRuntime::store_value_type_fields_to_buf), "store_value_type_fields_to_buf", true); 5901 } 5902 5903 void generate_all() { 5904 // support for verify_oop (must happen after universe_init) 5905 StubRoutines::_verify_oop_subroutine_entry = generate_verify_oop(); 5906 StubRoutines::_throw_AbstractMethodError_entry = 5907 generate_throw_exception("AbstractMethodError throw_exception", 5908 CAST_FROM_FN_PTR(address, 5909 SharedRuntime:: 5910 throw_AbstractMethodError)); 5911 5912 StubRoutines::_throw_IncompatibleClassChangeError_entry = 5913 generate_throw_exception("IncompatibleClassChangeError throw_exception", 5914 CAST_FROM_FN_PTR(address, 5915 SharedRuntime:: 5916 throw_IncompatibleClassChangeError)); 5917 5918 StubRoutines::_throw_NullPointerException_at_call_entry = 5919 generate_throw_exception("NullPointerException at call throw_exception", 5920 CAST_FROM_FN_PTR(address, 5921 SharedRuntime:: 5922 throw_NullPointerException_at_call)); 5923 5924 // arraycopy stubs used by compilers 5925 generate_arraycopy_stubs(); 5926 5927 // has negatives stub for large arrays. 5928 StubRoutines::aarch64::_has_negatives = generate_has_negatives(StubRoutines::aarch64::_has_negatives_long); 5929 5930 // array equals stub for large arrays. 5931 if (!UseSimpleArrayEquals) { 5932 StubRoutines::aarch64::_large_array_equals = generate_large_array_equals(); 5933 } 5934 5935 generate_compare_long_strings(); 5936 5937 generate_string_indexof_stubs(); 5938 5939 // byte_array_inflate stub for large arrays.
5940 StubRoutines::aarch64::_large_byte_array_inflate = generate_large_byte_array_inflate(); 5941 5942 #ifdef COMPILER2 5943 if (UseMultiplyToLenIntrinsic) { 5944 StubRoutines::_multiplyToLen = generate_multiplyToLen(); 5945 } 5946 5947 if (UseSquareToLenIntrinsic) { 5948 StubRoutines::_squareToLen = generate_squareToLen(); 5949 } 5950 5951 if (UseMulAddIntrinsic) { 5952 StubRoutines::_mulAdd = generate_mulAdd(); 5953 } 5954 5955 if (UseMontgomeryMultiplyIntrinsic) { 5956 StubCodeMark mark(this, "StubRoutines", "montgomeryMultiply"); 5957 MontgomeryMultiplyGenerator g(_masm, /*squaring*/false); 5958 StubRoutines::_montgomeryMultiply = g.generate_multiply(); 5959 } 5960 5961 if (UseMontgomerySquareIntrinsic) { 5962 StubCodeMark mark(this, "StubRoutines", "montgomerySquare"); 5963 MontgomeryMultiplyGenerator g(_masm, /*squaring*/true); 5964 // We use generate_multiply() rather than generate_square() 5965 // because it's faster for the sizes of modulus we care about. 5966 StubRoutines::_montgomerySquare = g.generate_multiply(); 5967 } 5968 #endif // COMPILER2 5969 5970 #ifndef BUILTIN_SIM 5971 // generate GHASH intrinsics code 5972 if (UseGHASHIntrinsics) { 5973 StubRoutines::_ghash_processBlocks = generate_ghash_processBlocks(); 5974 } 5975 5976 if (UseAESIntrinsics) { 5977 StubRoutines::_aescrypt_encryptBlock = generate_aescrypt_encryptBlock(); 5978 StubRoutines::_aescrypt_decryptBlock = generate_aescrypt_decryptBlock(); 5979 StubRoutines::_cipherBlockChaining_encryptAESCrypt = generate_cipherBlockChaining_encryptAESCrypt(); 5980 StubRoutines::_cipherBlockChaining_decryptAESCrypt = generate_cipherBlockChaining_decryptAESCrypt(); 5981 } 5982 5983 if (UseSHA1Intrinsics) { 5984 StubRoutines::_sha1_implCompress = generate_sha1_implCompress(false, "sha1_implCompress"); 5985 StubRoutines::_sha1_implCompressMB = generate_sha1_implCompress(true, "sha1_implCompressMB"); 5986 } 5987 if (UseSHA256Intrinsics) { 5988 StubRoutines::_sha256_implCompress = generate_sha256_implCompress(false, "sha256_implCompress"); 5989 StubRoutines::_sha256_implCompressMB = generate_sha256_implCompress(true, "sha256_implCompressMB"); 5990 } 5991 5992 // generate Adler32 intrinsics code 5993 if (UseAdler32Intrinsics) { 5994 StubRoutines::_updateBytesAdler32 = generate_updateBytesAdler32(); 5995 } 5996 5997 // Safefetch stubs. 5998 generate_safefetch("SafeFetch32", sizeof(int), &StubRoutines::_safefetch32_entry, 5999 &StubRoutines::_safefetch32_fault_pc, 6000 &StubRoutines::_safefetch32_continuation_pc); 6001 generate_safefetch("SafeFetchN", sizeof(intptr_t), &StubRoutines::_safefetchN_entry, 6002 &StubRoutines::_safefetchN_fault_pc, 6003 &StubRoutines::_safefetchN_continuation_pc); 6004 #endif 6005 StubRoutines::aarch64::set_completed(); 6006 } 6007 6008 public: 6009 StubGenerator(CodeBuffer* code, bool all) : StubCodeGenerator(code) { 6010 if (all) { 6011 generate_all(); 6012 } else { 6013 generate_initial(); 6014 } 6015 } 6016 }; // end class declaration 6017 6018 void StubGenerator_generate(CodeBuffer* code, bool all) { 6019 StubGenerator g(code, all); 6020 }
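// (How these entry points are driven, as a sketch under the usual HotSpot layering:
//  the shared stub-routine initialization calls StubGenerator_generate(&buffer, false)
//  early in start-up, which runs generate_initial() above, and calls it again later
//  with all == true to run generate_all() once the rest of the VM is far enough along.)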